[Xen-changelog] Delete resurrected files (from ia64 merge).
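For context, a changeset like this re-deletes files that a merge wrongly brought back. A minimal sketch of one plausible way such a commit is produced with Mercurial (the file list is abbreviated to the Makefiles shown in the diff below; the actual changeset may have been created differently):

    # Schedule the resurrected files for removal so they show up as
    # whole-file deletions (+++ /dev/null) in the resulting diff.
    # (List abbreviated; the full set matches the files in the patch below.)
    hg rm linux-2.4.30-xen-sparse/Makefile \
          linux-2.4.30-xen-sparse/arch/xen/Makefile \
          linux-2.4.30-xen-sparse/arch/xen/boot/Makefile
    hg commit -m "Delete resurrected files (from ia64 merge)."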
# HG changeset patch # User kaf24@xxxxxxxxxxxxxxxxxxxx # Node ID 43e28a2f6037266ff44500dd57cb4f841495372e # Parent d75a502b45eba186f2c51ea8fc3dc5aa8f0e3a4a Delete resurrected files (from ia64 merge). diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/Makefile --- a/linux-2.4.30-xen-sparse/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,592 +0,0 @@ -VERSION = 2 -PATCHLEVEL = 4 -SUBLEVEL = 30 -EXTRAVERSION = - -KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) - -# SUBARCH always tells us the underlying machine architecture. -# Unless overridden, by default ARCH is equivalent to SUBARCH. -# This will be overridden for Xen and UML builds. -SUBARCH := $(shell uname -m | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ -e s/arm.*/arm/ -e s/sa110/arm/) -ARCH ?= $(SUBARCH) - -## XXX The following hack can be discarded after users have adjusted to the -## architectural name change 'xeno' -> 'xen'. -ifeq ($(ARCH),xeno) - ARCH := xen -endif - -KERNELPATH=kernel-$(shell echo $(KERNELRELEASE) | sed -e "s/-//g") - -CONFIG_SHELL := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \ - else if [ -x /bin/bash ]; then echo /bin/bash; \ - else echo sh; fi ; fi) -TOPDIR := $(shell /bin/pwd) - -HPATH = $(TOPDIR)/include -FINDHPATH = $(HPATH)/asm $(HPATH)/linux $(HPATH)/scsi $(HPATH)/net $(HPATH)/math-emu - -HOSTCC = gcc -HOSTCFLAGS = -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer - -CROSS_COMPILE = - -# -# Include the make variables (CC, etc...) -# - -AS = $(CROSS_COMPILE)as -LD = $(CROSS_COMPILE)ld -CC = $(CROSS_COMPILE)gcc -CPP = $(CC) -E -AR = $(CROSS_COMPILE)ar -NM = $(CROSS_COMPILE)nm -STRIP = $(CROSS_COMPILE)strip -OBJCOPY = $(CROSS_COMPILE)objcopy -OBJDUMP = $(CROSS_COMPILE)objdump -MAKEFILES = $(TOPDIR)/.config -GENKSYMS = /sbin/genksyms -DEPMOD = /sbin/depmod -MODFLAGS = -DMODULE -CFLAGS_KERNEL = -PERL = perl -AWK = awk -RPM := $(shell if [ -x "/usr/bin/rpmbuild" ]; then echo rpmbuild; \ - else echo rpm; fi) - -export VERSION PATCHLEVEL SUBLEVEL EXTRAVERSION KERNELRELEASE ARCH \ - CONFIG_SHELL TOPDIR HPATH HOSTCC HOSTCFLAGS CROSS_COMPILE AS LD CC \ - CPP AR NM STRIP OBJCOPY OBJDUMP MAKE MAKEFILES GENKSYMS MODFLAGS PERL AWK - -all: do-it-all - -# -# Make "config" the default target if there is no configuration file or -# "depend" the target if there is no top-level dependency information. -# - -ifeq (.config,$(wildcard .config)) -include .config -ifeq (.depend,$(wildcard .depend)) -include .depend -do-it-all: Version vmlinux -else -CONFIGURATION = depend -do-it-all: depend -endif -else -CONFIGURATION = config -do-it-all: config -endif - -# -# INSTALL_PATH specifies where to place the updated kernel and system map -# images. Uncomment if you want to place them anywhere other than root. -# - -#export INSTALL_PATH=/boot - -# -# INSTALL_MOD_PATH specifies a prefix to MODLIB for module directory -# relocations required by build roots. This is not defined in the -# makefile but the argument can be passed to make if needed. -# - -MODLIB = $(INSTALL_MOD_PATH)/lib/modules/$(KERNELRELEASE) -export MODLIB - -# -# standard CFLAGS -# - -CPPFLAGS := -D__KERNEL__ -I$(HPATH) - -CFLAGS := $(CPPFLAGS) -Wall -Wstrict-prototypes -Wno-trigraphs -O2 \ - -fno-strict-aliasing -fno-common -ifndef CONFIG_FRAME_POINTER -CFLAGS += -fomit-frame-pointer -endif -AFLAGS := -D__ASSEMBLY__ $(CPPFLAGS) - -# -# ROOT_DEV specifies the default root-device when making the image.
-# This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case -# the default of FLOPPY is used by 'build'. -# This is i386 specific. -# - -export ROOT_DEV = CURRENT - -# -# If you want to preset the SVGA mode, uncomment the next line and -# set SVGA_MODE to whatever number you want. -# Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode. -# The number is the same as you would ordinarily press at bootup. -# This is i386 specific. -# - -export SVGA_MODE = -DSVGA_MODE=NORMAL_VGA - -# -# If you want the RAM disk device, define this to be the size in blocks. -# This is i386 specific. -# - -#export RAMDISK = -DRAMDISK=512 - -CORE_FILES =kernel/kernel.o mm/mm.o fs/fs.o ipc/ipc.o -NETWORKS =net/network.o - -LIBS =$(TOPDIR)/lib/lib.a -SUBDIRS =kernel drivers mm fs net ipc lib crypto - -DRIVERS-n := -DRIVERS-y := -DRIVERS-m := -DRIVERS- := - -DRIVERS-$(CONFIG_ACPI_BOOT) += drivers/acpi/acpi.o -DRIVERS-$(CONFIG_PARPORT) += drivers/parport/driver.o -DRIVERS-y += drivers/char/char.o \ - drivers/block/block.o \ - drivers/misc/misc.o \ - drivers/net/net.o -DRIVERS-$(CONFIG_AGP) += drivers/char/agp/agp.o -DRIVERS-$(CONFIG_DRM_NEW) += drivers/char/drm/drm.o -DRIVERS-$(CONFIG_DRM_OLD) += drivers/char/drm-4.0/drm.o -DRIVERS-$(CONFIG_NUBUS) += drivers/nubus/nubus.a -DRIVERS-$(CONFIG_NET_FC) += drivers/net/fc/fc.o -DRIVERS-$(CONFIG_DEV_APPLETALK) += drivers/net/appletalk/appletalk.o -DRIVERS-$(CONFIG_TR) += drivers/net/tokenring/tr.o -DRIVERS-$(CONFIG_WAN) += drivers/net/wan/wan.o -DRIVERS-$(CONFIG_ARCNET) += drivers/net/arcnet/arcnetdrv.o -DRIVERS-$(CONFIG_ATM) += drivers/atm/atm.o -DRIVERS-$(CONFIG_IDE) += drivers/ide/idedriver.o -DRIVERS-$(CONFIG_FC4) += drivers/fc4/fc4.a -DRIVERS-$(CONFIG_SCSI) += drivers/scsi/scsidrv.o -DRIVERS-$(CONFIG_FUSION_BOOT) += drivers/message/fusion/fusion.o -DRIVERS-$(CONFIG_IEEE1394) += drivers/ieee1394/ieee1394drv.o - -ifneq ($(CONFIG_CD_NO_IDESCSI)$(CONFIG_BLK_DEV_IDECD)$(CONFIG_BLK_DEV_SR)$(CONFIG_PARIDE_PCD),) -DRIVERS-y += drivers/cdrom/driver.o -endif - -DRIVERS-$(CONFIG_SOUND) += drivers/sound/sounddrivers.o -DRIVERS-$(CONFIG_PCI) += drivers/pci/driver.o -DRIVERS-$(CONFIG_MTD) += drivers/mtd/mtdlink.o -DRIVERS-$(CONFIG_PCMCIA) += drivers/pcmcia/pcmcia.o -DRIVERS-$(CONFIG_NET_PCMCIA) += drivers/net/pcmcia/pcmcia_net.o -DRIVERS-$(CONFIG_NET_WIRELESS) += drivers/net/wireless/wireless_net.o -DRIVERS-$(CONFIG_PCMCIA_CHRDEV) += drivers/char/pcmcia/pcmcia_char.o -DRIVERS-$(CONFIG_DIO) += drivers/dio/dio.a -DRIVERS-$(CONFIG_SBUS) += drivers/sbus/sbus_all.o -DRIVERS-$(CONFIG_ZORRO) += drivers/zorro/driver.o -DRIVERS-$(CONFIG_FC4) += drivers/fc4/fc4.a -DRIVERS-$(CONFIG_PPC32) += drivers/macintosh/macintosh.o -DRIVERS-$(CONFIG_MAC) += drivers/macintosh/macintosh.o -DRIVERS-$(CONFIG_ISAPNP) += drivers/pnp/pnp.o -DRIVERS-$(CONFIG_I2C) += drivers/i2c/i2c.o -DRIVERS-$(CONFIG_VT) += drivers/video/video.o -DRIVERS-$(CONFIG_PARIDE) += drivers/block/paride/paride.a -DRIVERS-$(CONFIG_HAMRADIO) += drivers/net/hamradio/hamradio.o -DRIVERS-$(CONFIG_TC) += drivers/tc/tc.a -DRIVERS-$(CONFIG_USB) += drivers/usb/usbdrv.o -DRIVERS-$(CONFIG_USB_GADGET) += drivers/usb/gadget/built-in.o -DRIVERS-y +=drivers/media/media.o -DRIVERS-$(CONFIG_INPUT) += drivers/input/inputdrv.o -DRIVERS-$(CONFIG_HIL) += drivers/hil/hil.o -DRIVERS-$(CONFIG_I2O) += drivers/message/i2o/i2o.o -DRIVERS-$(CONFIG_IRDA) += drivers/net/irda/irda.o -DRIVERS-$(CONFIG_PHONE) += drivers/telephony/telephony.o -DRIVERS-$(CONFIG_MD) += drivers/md/mddev.o -DRIVERS-$(CONFIG_GSC) += drivers/gsc/gscbus.o 
-DRIVERS-$(CONFIG_BLUEZ) += drivers/bluetooth/bluetooth.o -DRIVERS-$(CONFIG_HOTPLUG_PCI) += drivers/hotplug/vmlinux-obj.o -DRIVERS-$(CONFIG_ISDN_BOOL) += drivers/isdn/vmlinux-obj.o -DRIVERS-$(CONFIG_CRYPTO) += crypto/crypto.o - -DRIVERS := $(DRIVERS-y) - - -# files removed with 'make clean' -CLEAN_FILES = \ - kernel/ksyms.lst include/linux/compile.h \ - vmlinux System.map \ - .tmp* \ - drivers/char/consolemap_deftbl.c drivers/video/promcon_tbl.c \ - drivers/char/conmakehash \ - drivers/char/drm/*-mod.c \ - drivers/pci/devlist.h drivers/pci/classlist.h drivers/pci/gen-devlist \ - drivers/zorro/devlist.h drivers/zorro/gen-devlist \ - drivers/sound/bin2hex drivers/sound/hex2hex \ - drivers/atm/fore200e_mkfirm drivers/atm/{pca,sba}*{.bin,.bin1,.bin2} \ - drivers/scsi/aic7xxx/aicasm/aicasm \ - drivers/scsi/aic7xxx/aicasm/aicasm_gram.c \ - drivers/scsi/aic7xxx/aicasm/aicasm_gram.h \ - drivers/scsi/aic7xxx/aicasm/aicasm_macro_gram.c \ - drivers/scsi/aic7xxx/aicasm/aicasm_macro_gram.h \ - drivers/scsi/aic7xxx/aicasm/aicasm_macro_scan.c \ - drivers/scsi/aic7xxx/aicasm/aicasm_scan.c \ - drivers/scsi/aic7xxx/aicasm/aicdb.h \ - drivers/scsi/aic7xxx/aicasm/y.tab.h \ - drivers/scsi/53c700_d.h \ - drivers/tc/lk201-map.c \ - net/khttpd/make_times_h \ - net/khttpd/times.h \ - submenu* \ - drivers/ieee1394/oui.c -# directories removed with 'make clean' -CLEAN_DIRS = \ - modules - -# files removed with 'make mrproper' -MRPROPER_FILES = \ - include/linux/autoconf.h include/linux/version.h \ - lib/crc32table.h lib/gen_crc32table \ - drivers/net/hamradio/soundmodem/sm_tbl_{afsk1200,afsk2666,fsk9600}.h \ - drivers/net/hamradio/soundmodem/sm_tbl_{hapn4800,psk4800}.h \ - drivers/net/hamradio/soundmodem/sm_tbl_{afsk2400_7,afsk2400_8}.h \ - drivers/net/hamradio/soundmodem/gentbl \ - drivers/sound/*_boot.h drivers/sound/.*.boot \ - drivers/sound/msndinit.c \ - drivers/sound/msndperm.c \ - drivers/sound/pndsperm.c \ - drivers/sound/pndspini.c \ - drivers/atm/fore200e_*_fw.c drivers/atm/.fore200e_*.fw \ - .version .config* config.in config.old \ - scripts/tkparse scripts/kconfig.tk scripts/kconfig.tmp \ - scripts/lxdialog/*.o scripts/lxdialog/lxdialog \ - .menuconfig.log \ - include/asm \ - .hdepend scripts/mkdep scripts/split-include scripts/docproc \ - $(TOPDIR)/include/linux/modversions.h \ - kernel.spec - -# directories removed with 'make mrproper' -MRPROPER_DIRS = \ - include/config \ - $(TOPDIR)/include/linux/modules - - -include arch/$(ARCH)/Makefile - -# Extra cflags for kbuild 2.4. The default is to forbid includes by kernel code -# from user space headers. Some UML code requires user space headers, in the -# UML Makefiles add 'kbuild_2_4_nostdinc :=' before include Rules.make. No -# other kernel code should include user space headers, if you need -# 'kbuild_2_4_nostdinc :=' or -I/usr/include for kernel code and you are not UML -# then your code is broken! KAO. 
- -kbuild_2_4_nostdinc := -nostdinc -iwithprefix include -export kbuild_2_4_nostdinc - -export CPPFLAGS CFLAGS CFLAGS_KERNEL AFLAGS AFLAGS_KERNEL - -export NETWORKS DRIVERS LIBS HEAD LDFLAGS LINKFLAGS MAKEBOOT ASFLAGS - -.S.s: - $(CPP) $(AFLAGS) $(AFLAGS_KERNEL) -traditional -o $*.s $< -.S.o: - $(CC) $(AFLAGS) $(AFLAGS_KERNEL) -traditional -c -o $*.o $< - -Version: dummy - @rm -f include/linux/compile.h - -boot: vmlinux - @$(MAKE) CFLAGS="$(CFLAGS) $(CFLAGS_KERNEL)" -C arch/$(ARCH)/boot - -vmlinux: include/linux/version.h $(CONFIGURATION) init/main.o init/version.o init/do_mounts.o linuxsubdirs - $(LD) $(LINKFLAGS) $(HEAD) init/main.o init/version.o init/do_mounts.o \ - --start-group \ - $(CORE_FILES) \ - $(DRIVERS) \ - $(NETWORKS) \ - $(LIBS) \ - --end-group \ - -o vmlinux - $(NM) vmlinux | grep -v '\(compiled\)\|\(\.o$$\)\|\( [aUw] \)\|\(\.\.ng$$\)\|\(LASH[RL]DI\)' | sort > System.map - -symlinks: - rm -f include/asm - ( cd include ; ln -sf asm-$(ARCH) asm) - @if [ ! -d include/linux/modules ]; then \ - mkdir include/linux/modules; \ - fi - -oldconfig: symlinks - $(CONFIG_SHELL) scripts/Configure -d arch/$(ARCH)/config.in - -xconfig: symlinks - $(MAKE) -C scripts kconfig.tk - wish -f scripts/kconfig.tk - -menuconfig: include/linux/version.h symlinks - $(MAKE) -C scripts/lxdialog all - $(CONFIG_SHELL) scripts/Menuconfig arch/$(ARCH)/config.in - -config: symlinks - $(CONFIG_SHELL) scripts/Configure arch/$(ARCH)/config.in - -include/config/MARKER: scripts/split-include include/linux/autoconf.h - scripts/split-include include/linux/autoconf.h include/config - @ touch include/config/MARKER - -linuxsubdirs: $(patsubst %, _dir_%, $(SUBDIRS)) - -$(patsubst %, _dir_%, $(SUBDIRS)) : dummy include/linux/version.h include/config/MARKER - $(MAKE) CFLAGS="$(CFLAGS) $(CFLAGS_KERNEL)" -C $(patsubst _dir_%, %, $@) - -$(TOPDIR)/include/linux/version.h: include/linux/version.h -$(TOPDIR)/include/linux/compile.h: include/linux/compile.h - -newversion: - . 
scripts/mkversion > .tmpversion - @mv -f .tmpversion .version - -uts_len := 64 -uts_truncate := sed -e 's/\(.\{1,$(uts_len)\}\).*/\1/' - -include/linux/compile.h: $(CONFIGURATION) include/linux/version.h newversion - @echo -n \#`cat .version` > .ver1 - @if [ -n "$(CONFIG_SMP)" ] ; then echo -n " SMP" >> .ver1; fi - @if [ -f .name ]; then echo -n \-`cat .name` >> .ver1; fi - @LANG=C echo ' '`date` >> .ver1 - @echo \#define UTS_VERSION \"`cat .ver1 | $(uts_truncate)`\" > .ver - @LANG=C echo \#define LINUX_COMPILE_TIME \"`date +%T`\" >> .ver - @echo \#define LINUX_COMPILE_BY \"`whoami`\" >> .ver - @echo \#define LINUX_COMPILE_HOST \"`hostname | $(uts_truncate)`\" >> .ver - @([ -x /bin/dnsdomainname ] && /bin/dnsdomainname > .ver1) || \ - ([ -x /bin/domainname ] && /bin/domainname > .ver1) || \ - echo > .ver1 - @echo \#define LINUX_COMPILE_DOMAIN \"`cat .ver1 | $(uts_truncate)`\" >> .ver - @echo \#define LINUX_COMPILER \"`$(CC) $(CFLAGS) -v 2>&1 | tail -n 1`\" >> .ver - @mv -f .ver $@ - @rm -f .ver1 - -include/linux/version.h: ./Makefile - @expr length "$(KERNELRELEASE)" \<= $(uts_len) > /dev/null || \ - (echo KERNELRELEASE \"$(KERNELRELEASE)\" exceeds $(uts_len) characters >&2; false) - @echo \#define UTS_RELEASE \"$(KERNELRELEASE)\" > .ver - @echo \#define LINUX_VERSION_CODE `expr $(VERSION) \\* 65536 + $(PATCHLEVEL) \\* 256 + $(SUBLEVEL)` >> .ver - @echo '#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c))' >>.ver - @mv -f .ver $@ - -comma := , - -init/version.o: init/version.c include/linux/compile.h include/config/MARKER - $(CC) $(CFLAGS) $(CFLAGS_KERNEL) -DUTS_MACHINE='"$(SUBARCH)"' -DKBUILD_BASENAME=$(subst $(comma),_,$(subst -,_,$(*F))) -c -o init/version.o init/version.c - -init/main.o: init/main.c include/config/MARKER - $(CC) $(CFLAGS) $(CFLAGS_KERNEL) $(PROFILING) -DKBUILD_BASENAME=$(subst $(comma),_,$(subst -,_,$(*F))) -c -o $@ $< - -init/do_mounts.o: init/do_mounts.c include/config/MARKER - $(CC) $(CFLAGS) $(CFLAGS_KERNEL) $(PROFILING) -DKBUILD_BASENAME=$(subst $(comma),_,$(subst -,_,$(*F))) -c -o $@ $< - -fs lib mm ipc kernel drivers net: dummy - $(MAKE) CFLAGS="$(CFLAGS) $(CFLAGS_KERNEL)" $(subst $@, _dir_$@, $@) - -TAGS: dummy - { find include/asm-${ARCH} -name '*.h' -print ; \ - find include -type d \( -name "asm-*" -o -name config \) -prune -o -name '*.h' -print ; \ - find $(SUBDIRS) init arch/${ARCH} -name '*.[chS]' ; } | grep -v SCCS | grep -v '\.svn' | etags - - -# Exuberant ctags works better with -I -tags: dummy - CTAGSF=`ctags --version | grep -i exuberant >/dev/null && echo "-I __initdata,__exitdata,EXPORT_SYMBOL,EXPORT_SYMBOL_NOVERS"`; \ - ctags $$CTAGSF `find include/asm-$(ARCH) -name '*.h'` && \ - find include -type d \( -name "asm-*" -o -name config \) -prune -o -name '*.h' -print | xargs ctags $$CTAGSF -a && \ - find $(SUBDIRS) init -name '*.[ch]' | xargs ctags $$CTAGSF -a - -ifdef CONFIG_MODULES -ifdef CONFIG_MODVERSIONS -MODFLAGS += -DMODVERSIONS -include $(HPATH)/linux/modversions.h -endif - -.PHONY: modules -modules: $(patsubst %, _mod_%, $(SUBDIRS)) - -.PHONY: $(patsubst %, _mod_%, $(SUBDIRS)) -$(patsubst %, _mod_%, $(SUBDIRS)) : include/linux/version.h include/config/MARKER - $(MAKE) -C $(patsubst _mod_%, %, $@) CFLAGS="$(CFLAGS) $(MODFLAGS)" MAKING_MODULES=1 modules - -.PHONY: modules_install -modules_install: _modinst_ $(patsubst %, _modinst_%, $(SUBDIRS)) _modinst_post - -.PHONY: _modinst_ -_modinst_: - @rm -rf $(MODLIB)/kernel - @rm -f $(MODLIB)/build - @mkdir -p $(MODLIB)/kernel - @ln -s $(TOPDIR) $(MODLIB)/build - -# If System.map exists, 
run depmod. This deliberately does not have a -# dependency on System.map since that would run the dependency tree on -# vmlinux. This depmod is only for convenience to give the initial -# boot a modules.dep even before / is mounted read-write. However the -# boot script depmod is the master version. -ifeq "$(strip $(INSTALL_MOD_PATH))" "" -depmod_opts := -else -depmod_opts := -b $(INSTALL_MOD_PATH) -r -endif -.PHONY: _modinst_post -_modinst_post: _modinst_post_pcmcia - if [ -r System.map ]; then $(DEPMOD) -ae -F System.map $(depmod_opts) $(KERNELRELEASE); fi - -# Backwards compatibility symlinks for people still using old versions -# of pcmcia-cs with hard coded pathnames on insmod. Remove -# _modinst_post_pcmcia for kernel 2.4.1. -.PHONY: _modinst_post_pcmcia -_modinst_post_pcmcia: - cd $(MODLIB); \ - mkdir -p pcmcia; \ - find kernel -path '*/pcmcia/*' -name '*.o' | xargs -i -r ln -sf ../{} pcmcia - -.PHONY: $(patsubst %, _modinst_%, $(SUBDIRS)) -$(patsubst %, _modinst_%, $(SUBDIRS)) : - $(MAKE) -C $(patsubst _modinst_%, %, $@) modules_install - -# modules disabled.... - -else -modules modules_install: dummy - @echo - @echo "The present kernel configuration has modules disabled." - @echo "Type 'make config' and enable loadable module support." - @echo "Then build a kernel with module support enabled." - @echo - @exit 1 -endif - -clean: archclean - find . \( -name '*.[oas]' -o -name core -o -name '.*.flags' \) -type f -print \ - | grep -v lxdialog/ | xargs rm -f - rm -f $(CLEAN_FILES) - rm -rf $(CLEAN_DIRS) - $(MAKE) -C Documentation/DocBook clean - -mrproper: clean archmrproper - find . \( -size 0 -o -name .depend \) -type f -print | xargs rm -f - rm -f $(MRPROPER_FILES) - rm -rf $(MRPROPER_DIRS) - $(MAKE) -C Documentation/DocBook mrproper - -distclean: mrproper - rm -f core `find . \( -not -type d \) -and \ - \( -name '*.orig' -o -name '*.rej' -o -name '*~' \ - -o -name '*.bak' -o -name '#*#' -o -name '.*.orig' \ - -o -name '.*.rej' -o -name '.SUMS' -o -size 0 \) -type f -print` TAGS tags - -backup: mrproper - cd .. && tar cf - linux/ | gzip -9 > backup.gz - sync - -sgmldocs: - chmod 755 $(TOPDIR)/scripts/docgen - chmod 755 $(TOPDIR)/scripts/gen-all-syms - chmod 755 $(TOPDIR)/scripts/kernel-doc - $(MAKE) -C $(TOPDIR)/Documentation/DocBook books - -psdocs: sgmldocs - $(MAKE) -C Documentation/DocBook ps - -pdfdocs: sgmldocs - $(MAKE) -C Documentation/DocBook pdf - -htmldocs: sgmldocs - $(MAKE) -C Documentation/DocBook html - -mandocs: - chmod 755 $(TOPDIR)/scripts/kernel-doc - chmod 755 $(TOPDIR)/scripts/split-man - $(MAKE) -C Documentation/DocBook man - -sums: - find . -type f -print | sort | xargs sum > .SUMS - -dep-files: scripts/mkdep archdep include/linux/version.h - rm -f .depend .hdepend - $(MAKE) $(patsubst %,_sfdep_%,$(SUBDIRS)) _FASTDEP_ALL_SUB_DIRS="$(SUBDIRS)" -ifdef CONFIG_MODVERSIONS - $(MAKE) update-modverfile -endif - scripts/mkdep -- `find $(FINDHPATH) \( -name SCCS -o -name .svn \) -prune -o -follow -name \*.h !
-name modversions.h -print` > .hdepend - scripts/mkdep -- init/*.c > .depend - -ifdef CONFIG_MODVERSIONS -MODVERFILE := $(TOPDIR)/include/linux/modversions.h -else -MODVERFILE := -endif -export MODVERFILE - -depend dep: dep-files - -checkconfig: - find * -name '*.[hcS]' -type f -print | sort | xargs $(PERL) -w scripts/checkconfig.pl - -checkhelp: - find * -name [cC]onfig.in -print | sort | xargs $(PERL) -w scripts/checkhelp.pl - -checkincludes: - find * -name '*.[hcS]' -type f -print | sort | xargs $(PERL) -w scripts/checkincludes.pl - -ifdef CONFIGURATION -..$(CONFIGURATION): - @echo - @echo "You have a bad or nonexistent" .$(CONFIGURATION) ": running 'make" $(CONFIGURATION)"'" - @echo - $(MAKE) $(CONFIGURATION) - @echo - @echo "Successful. Try re-making (ignore the error that follows)" - @echo - exit 1 - -#dummy: ..$(CONFIGURATION) -dummy: - -else - -dummy: - -endif - -include Rules.make - -# -# This generates dependencies for the .h files. -# - -scripts/mkdep: scripts/mkdep.c - $(HOSTCC) $(HOSTCFLAGS) -o scripts/mkdep scripts/mkdep.c - -scripts/split-include: scripts/split-include.c - $(HOSTCC) $(HOSTCFLAGS) -o scripts/split-include scripts/split-include.c - -# -# RPM target -# -# If you do a make spec before packing the tarball you can rpm -ta it -# -spec: - . scripts/mkspec >kernel.spec - -# -# Build a tar ball, generate an rpm from it and pack the result -# There are two bits of magic here -# 1) The use of /. to avoid tar packing just the symlink -# 2) Removing the .dep files as they have source paths in them that -# will become invalid -# -rpm: clean spec - find . \( -size 0 -o -name .depend -o -name .hdepend \) -type f -print | xargs rm -f - set -e; \ - cd $(TOPDIR)/.. ; \ - ln -sf $(TOPDIR) $(KERNELPATH) ; \ - tar -cvz --exclude CVS -f $(KERNELPATH).tar.gz $(KERNELPATH)/. ; \ - rm $(KERNELPATH) ; \ - cd $(TOPDIR) ; \ - . scripts/mkversion > .version ; \ - $(RPM) -ta $(TOPDIR)/../$(KERNELPATH).tar.gz ; \ - rm $(TOPDIR)/../$(KERNELPATH).tar.gz diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/Makefile --- a/linux-2.4.30-xen-sparse/arch/xen/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,139 +0,0 @@ -# -# xen/Makefile -# -# This file is included by the global makefile so that you can add your own -# architecture-specific flags and dependencies. Remember to have actions -# for "archclean" and "archdep" for cleaning up and making dependencies for -# this architecture -# -# This file is subject to the terms and conditions of the GNU General Public -# License. See the file "COPYING" in the main directory of this archive -# for more details.
-# -# Copyright (C) 1994 by Linus Torvalds -# -# 19990713 Artur Skawina <skawina@xxxxxxxxxxxxx> -# Added '-march' and '-mpreferred-stack-boundary' support -# - -# If no .config file exists then use the appropriate defconfig-* file -ifneq (.config,$(wildcard .config)) -DUMMYX:=$(shell cp $(TOPDIR)/arch/xen/defconfig$(EXTRAVERSION) $(TOPDIR)/.config) --include $(TOPDIR)/.config -endif - -LD=$(CROSS_COMPILE)ld -m elf_i386 -OBJCOPY=$(CROSS_COMPILE)objcopy -R .note -R .comment -S -LDFLAGS=-e stext -LINKFLAGS =-T $(TOPDIR)/arch/xen/vmlinux.lds $(LDFLAGS) - -CFLAGS += -pipe - -check_gcc = $(shell if $(CC) $(1) -S -o /dev/null -xc /dev/null > /dev/null 2>&1; then echo "$(1)"; else echo "$(2)"; fi) - -# prevent gcc from keeping the stack 16 byte aligned -CFLAGS += $(call check_gcc,-mpreferred-stack-boundary=2,) - -ifdef CONFIG_M686 -CFLAGS += -march=i686 -endif - -ifdef CONFIG_MPENTIUMIII -CFLAGS += -march=i686 -endif - -ifdef CONFIG_MPENTIUM4 -CFLAGS += -march=i686 -endif - -ifdef CONFIG_MK7 -CFLAGS += $(call check_gcc,-march=athlon,-march=i686 -malign-functions=4) -endif - -# Disable unit-at-a-time mode, it makes gcc use a lot more stack -# due to the lack of sharing of stacklots. -CFLAGS += $(call check_gcc,-fno-unit-at-a-time,) - -HEAD := arch/xen/kernel/head.o arch/xen/kernel/init_task.o - -SUBDIRS += arch/xen/kernel arch/xen/mm arch/xen/lib -SUBDIRS += arch/xen/drivers/console -SUBDIRS += arch/xen/drivers/evtchn -SUBDIRS += arch/xen/drivers/blkif -SUBDIRS += arch/xen/drivers/netif -SUBDIRS += arch/xen/drivers/usbif -SUBDIRS += arch/xen/drivers/balloon -ifdef CONFIG_XEN_PRIVILEGED_GUEST -SUBDIRS += arch/xen/drivers/dom0 -endif - -CORE_FILES += arch/xen/kernel/kernel.o arch/xen/mm/mm.o -CORE_FILES += arch/xen/drivers/evtchn/drv.o -CORE_FILES += arch/xen/drivers/console/drv.o -DRIVERS += arch/xen/drivers/blkif/drv.o -DRIVERS += arch/xen/drivers/netif/drv.o -DRIVERS += arch/xen/drivers/usbif/drv.o -ifdef CONFIG_XEN_PRIVILEGED_GUEST -CORE_FILES += arch/xen/drivers/dom0/drv.o -endif -CORE_FILES += arch/xen/drivers/balloon/drv.o -LIBS := $(TOPDIR)/arch/xen/lib/lib.a $(LIBS) $(TOPDIR)/arch/xen/lib/lib.a - -arch/xen/kernel: dummy - $(MAKE) linuxsubdirs SUBDIRS=arch/xen/kernel - -arch/xen/mm: dummy - $(MAKE) linuxsubdirs SUBDIRS=arch/xen/mm - -arch/xen/drivers/console: dummy - $(MAKE) linuxsubdirs SUBDIRS=arch/xen/drivers/console - -arch/xen/drivers/network: dummy - $(MAKE) linuxsubdirs SUBDIRS=arch/xen/drivers/network - -arch/xen/drivers/block: dummy - $(MAKE) linuxsubdirs SUBDIRS=arch/xen/drivers/block - -arch/xen/drivers/dom0: dummy - $(MAKE) linuxsubdirs SUBDIRS=arch/xen/drivers/dom0 - -arch/xen/drivers/balloon: dummy - $(MAKE) linuxsubdirs SUBDIRS=arch/xen/drivers/balloon - -MAKEBOOT = $(MAKE) -C arch/$(ARCH)/boot - -vmlinux: arch/xen/vmlinux.lds - -FORCE: ; - -.PHONY: bzImage compressed clean archclean archmrproper archdep - - -bzImage: vmlinux - @$(MAKEBOOT) bzImage - -INSTALL_NAME ?= $(KERNELRELEASE) -install: bzImage - mkdir -p $(INSTALL_PATH)/boot - ln -f -s vmlinuz-$(INSTALL_NAME)$(INSTALL_SUFFIX) $(INSTALL_PATH)/boot/vmlinuz-$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(XENGUEST)$(INSTALL_SUFFIX) - rm -f $(INSTALL_PATH)/boot/vmlinuz-$(INSTALL_NAME)$(INSTALL_SUFFIX) - install -m0644 arch/$(ARCH)/boot/bzImage $(INSTALL_PATH)/boot/vmlinuz-$(INSTALL_NAME)$(INSTALL_SUFFIX) - install -m0644 vmlinux $(INSTALL_PATH)/boot/vmlinux-syms-$(INSTALL_NAME)$(INSTALL_SUFFIX) - install -m0664 .config $(INSTALL_PATH)/boot/config-$(INSTALL_NAME)$(INSTALL_SUFFIX) - install -m0664 System.map 
$(INSTALL_PATH)/boot/System.map-$(INSTALL_NAME)$(INSTALL_SUFFIX) - ln -f -s vmlinuz-$(INSTALL_NAME)$(INSTALL_SUFFIX) $(INSTALL_PATH)/boot/vmlinuz-$(VERSION).$(PATCHLEVEL)$(XENGUEST)$(INSTALL_SUFFIX) - -%_config: arch/xen/defconfig-% - rm -f .config arch/xen/defconfig - cp -f arch/xen/defconfig-$(@:_config=) arch/xen/defconfig - cp -f arch/xen/defconfig-$(@:_config=) .config - - -archclean: - @$(MAKEBOOT) clean - -archmrproper: - rm -f include/asm-xen/xen-public/arch - -archdep: - @$(MAKEBOOT) dep diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/boot/Makefile --- a/linux-2.4.30-xen-sparse/arch/xen/boot/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,13 +0,0 @@ -# -# arch/xen/boot/Makefile -# - -bzImage: $(TOPDIR)/vmlinux - $(OBJCOPY) $< Image - gzip -f -9 < Image > $@ - rm -f Image - -dep: - -clean: - rm -f bzImage Image diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/config.in --- a/linux-2.4.30-xen-sparse/arch/xen/config.in Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,337 +0,0 @@ -# -# For a description of the syntax of this configuration file, -# see Documentation/kbuild/config-language.txt. -# -mainmenu_name "Linux Kernel Configuration" - -define_bool CONFIG_XEN y - -define_bool CONFIG_X86 y -define_bool CONFIG_ISA y -define_bool CONFIG_SBUS n - -define_bool CONFIG_UID16 y - -mainmenu_option next_comment -comment 'Xen' -bool 'Support for privileged operations (domain 0)' CONFIG_XEN_PRIVILEGED_GUEST -bool 'Device-driver domain (physical device access)' CONFIG_XEN_PHYSDEV_ACCESS -if [ "$CONFIG_XEN_PHYSDEV_ACCESS" = "y" ]; then - bool 'USB-device backend driver' CONFIG_XEN_USB_BACKEND -fi -bool 'Scrub memory before freeing it to Xen' CONFIG_XEN_SCRUB_PAGES -bool 'Network-device frontend driver' CONFIG_XEN_NETDEV_FRONTEND -bool 'Block-device frontend driver' CONFIG_XEN_BLKDEV_FRONTEND -bool 'Block-device uses grant tables' CONFIG_XEN_BLKDEV_GRANT -bool 'USB-device frontend driver' CONFIG_XEN_USB_FRONTEND -endmenu -# The IBM S/390 patch needs this. 
-define_bool CONFIG_NO_IDLE_HZ y - -if [ "$CONFIG_XEN_PHYSDEV_ACCESS" = "y" ]; then - define_bool CONFIG_FOREIGN_PAGES y -else - define_bool CONFIG_FOREIGN_PAGES n - define_bool CONFIG_NETDEVICES y - define_bool CONFIG_VT n -fi - -mainmenu_option next_comment -comment 'Code maturity level options' -bool 'Prompt for development and/or incomplete code/drivers' CONFIG_EXPERIMENTAL -endmenu - -mainmenu_option next_comment -comment 'Loadable module support' -bool 'Enable loadable module support' CONFIG_MODULES -if [ "$CONFIG_MODULES" = "y" ]; then - bool ' Set version information on all module symbols' CONFIG_MODVERSIONS - bool ' Kernel module loader' CONFIG_KMOD -fi -endmenu - -mainmenu_option next_comment -comment 'Processor type and features' -choice 'Processor family' \ - "Pentium-Pro/Celeron/Pentium-II CONFIG_M686 \ - Pentium-III/Celeron(Coppermine) CONFIG_MPENTIUMIII \ - Pentium-4 CONFIG_MPENTIUM4 \ - Athlon/Duron/K7 CONFIG_MK7 \ - Opteron/Athlon64/Hammer/K8 CONFIG_MK8 \ - VIA-C3-2 CONFIG_MVIAC3_2" Pentium-Pro - - define_bool CONFIG_X86_WP_WORKS_OK y - define_bool CONFIG_X86_INVLPG y - define_bool CONFIG_X86_CMPXCHG y - define_bool CONFIG_X86_XADD y - define_bool CONFIG_X86_BSWAP y - define_bool CONFIG_X86_POPAD_OK y - define_bool CONFIG_RWSEM_GENERIC_SPINLOCK n - define_bool CONFIG_RWSEM_XCHGADD_ALGORITHM y - - define_bool CONFIG_X86_GOOD_APIC y - define_bool CONFIG_X86_PGE y - define_bool CONFIG_X86_USE_PPRO_CHECKSUM y - define_bool CONFIG_X86_TSC y - -if [ "$CONFIG_M686" = "y" ]; then - define_int CONFIG_X86_L1_CACHE_SHIFT 5 -fi -if [ "$CONFIG_MPENTIUMIII" = "y" ]; then - define_int CONFIG_X86_L1_CACHE_SHIFT 5 -fi -if [ "$CONFIG_MPENTIUM4" = "y" ]; then - define_int CONFIG_X86_L1_CACHE_SHIFT 7 -fi -if [ "$CONFIG_MK8" = "y" ]; then - define_bool CONFIG_MK7 y -fi -if [ "$CONFIG_MK7" = "y" ]; then - define_int CONFIG_X86_L1_CACHE_SHIFT 6 - define_bool CONFIG_X86_USE_3DNOW y -fi -if [ "$CONFIG_MVIAC3_2" = "y" ]; then - define_int CONFIG_X86_L1_CACHE_SHIFT 5 -fi - -#if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then -# tristate 'BIOS Enhanced Disk Drive calls determine boot disk (EXPERIMENTAL)' CONFIG_EDD -#fi - -choice 'High Memory Support' \ - "off CONFIG_NOHIGHMEM \ - 4GB CONFIG_HIGHMEM4G" off -# 64GB CONFIG_HIGHMEM64G" off -if [ "$CONFIG_HIGHMEM4G" = "y" ]; then - define_bool CONFIG_HIGHMEM y -fi -if [ "$CONFIG_HIGHMEM64G" = "y" ]; then - define_bool CONFIG_HIGHMEM y - define_bool CONFIG_X86_PAE y -fi - -if [ "$CONFIG_HIGHMEM" = "y" ]; then - bool 'HIGHMEM I/O support' CONFIG_HIGHIO -fi - -define_int CONFIG_FORCE_MAX_ZONEORDER 11 - -#bool 'Symmetric multi-processing support' CONFIG_SMP -#if [ "$CONFIG_SMP" = "y" -a "$CONFIG_X86_CMPXCHG" = "y" ]; then -# define_bool CONFIG_HAVE_DEC_LOCK y -#fi -endmenu - -mainmenu_option next_comment -comment 'General setup' - -bool 'Networking support' CONFIG_NET - -if [ "$CONFIG_XEN_PHYSDEV_ACCESS" = "y" ]; then - bool 'PCI support' CONFIG_PCI - source drivers/pci/Config.in - - bool 'Support for hot-pluggable devices' CONFIG_HOTPLUG - - if [ "$CONFIG_HOTPLUG" = "y" ] ; then - source drivers/pcmcia/Config.in - source drivers/hotplug/Config.in - else - define_bool CONFIG_PCMCIA n - define_bool CONFIG_HOTPLUG_PCI n - fi -fi - -bool 'System V IPC' CONFIG_SYSVIPC -bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT -bool 'Sysctl support' CONFIG_SYSCTL -if [ "$CONFIG_PROC_FS" = "y" ]; then - choice 'Kernel core (/proc/kcore) format' \ - "ELF CONFIG_KCORE_ELF \ - A.OUT CONFIG_KCORE_AOUT" ELF -fi -tristate 'Kernel support for a.out binaries' CONFIG_BINFMT_AOUT -bool 
'Kernel support for ELF binaries' CONFIG_BINFMT_ELF -tristate 'Kernel support for MISC binaries' CONFIG_BINFMT_MISC -bool 'Select task to kill on out of memory condition' CONFIG_OOM_KILLER - -endmenu - -if [ "$CONFIG_XEN_PHYSDEV_ACCESS" = "y" ]; then - source drivers/mtd/Config.in - - source drivers/parport/Config.in - - source drivers/pnp/Config.in - - source drivers/block/Config.in - - source drivers/md/Config.in -fi - -if [ "$CONFIG_NET" = "y" ]; then - source net/Config.in -fi - -if [ "$CONFIG_XEN_PHYSDEV_ACCESS" = "y" ]; then - mainmenu_option next_comment - comment 'ATA/IDE/MFM/RLL support' - - tristate 'ATA/IDE/MFM/RLL support' CONFIG_IDE - - if [ "$CONFIG_IDE" != "n" ]; then - source drivers/ide/Config.in - else - define_bool CONFIG_BLK_DEV_HD n - fi - endmenu -fi - -mainmenu_option next_comment -comment 'SCSI support' - -tristate 'SCSI support' CONFIG_SCSI - -if [ "$CONFIG_SCSI" != "n" ]; then - source drivers/scsi/Config.in -fi -endmenu - -if [ "$CONFIG_XEN_PHYSDEV_ACCESS" = "y" ]; then - source drivers/message/fusion/Config.in - - source drivers/ieee1394/Config.in - - source drivers/message/i2o/Config.in - - if [ "$CONFIG_NET" = "y" ]; then - mainmenu_option next_comment - comment 'Network device support' - - bool 'Network device support' CONFIG_NETDEVICES - if [ "$CONFIG_NETDEVICES" = "y" ]; then - source drivers/net/Config.in - if [ "$CONFIG_ATM" = "y" -o "$CONFIG_ATM" = "m" ]; then - source drivers/atm/Config.in - fi - fi - endmenu - fi - - source net/ax25/Config.in - - source net/irda/Config.in - - mainmenu_option next_comment - comment 'ISDN subsystem' - if [ "$CONFIG_NET" != "n" ]; then - tristate 'ISDN support' CONFIG_ISDN - if [ "$CONFIG_ISDN" != "n" ]; then - source drivers/isdn/Config.in - fi - fi - endmenu - - if [ "$CONFIG_ISA" = "y" ]; then - mainmenu_option next_comment - comment 'Old CD-ROM drivers (not SCSI, not IDE)' - - bool 'Support non-SCSI/IDE/ATAPI CDROM drives' CONFIG_CD_NO_IDESCSI - if [ "$CONFIG_CD_NO_IDESCSI" != "n" ]; then - source drivers/cdrom/Config.in - fi - endmenu - fi - - # - # input before char - char/joystick depends on it. As does USB. 
- # - source drivers/input/Config.in -else - # - # Block device driver configuration - # - mainmenu_option next_comment - comment 'Block devices' - tristate 'Loopback device support' CONFIG_BLK_DEV_LOOP - dep_tristate 'Network block device support' CONFIG_BLK_DEV_NBD $CONFIG_NET - tristate 'RAM disk support' CONFIG_BLK_DEV_RAM - if [ "$CONFIG_BLK_DEV_RAM" = "y" -o "$CONFIG_BLK_DEV_RAM" = "m" ]; then - int ' Default RAM disk size' CONFIG_BLK_DEV_RAM_SIZE 4096 - fi - dep_bool ' Initial RAM disk (initrd) support' CONFIG_BLK_DEV_INITRD $CONFIG_BLK_DEV_RAM - bool 'Per partition statistics in /proc/partitions' CONFIG_BLK_STATS - define_bool CONFIG_BLK_DEV_HD n - endmenu -fi - -source drivers/char/Config.in - -if [ "$CONFIG_XEN_PHYSDEV_ACCESS" = "y" -o "$CONFIG_XEN_USB_FRONTEND" = "y" ]; then - source drivers/media/Config.in -fi - -source fs/Config.in - -mainmenu_option next_comment -comment 'Console drivers' - -define_bool CONFIG_XEN_CONSOLE y - -if [ "$CONFIG_VT" = "y" ]; then - bool 'VGA text console' CONFIG_VGA_CONSOLE - bool 'Dummy console' CONFIG_DUMMY_CONSOLE - if [ "$CONFIG_XEN_PHYSDEV_ACCESS" = "y" ]; then - bool 'Video mode selection support' CONFIG_VIDEO_SELECT - if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then - tristate 'MDA text console (dual-headed) (EXPERIMENTAL)' CONFIG_MDA_CONSOLE - source drivers/video/Config.in - fi - fi -fi -endmenu - -if [ "$CONFIG_XEN_PHYSDEV_ACCESS" = "y" ]; then - mainmenu_option next_comment - comment 'Sound' - - tristate 'Sound card support' CONFIG_SOUND - if [ "$CONFIG_SOUND" != "n" ]; then - source drivers/sound/Config.in - fi - endmenu -fi - -if [ "$CONFIG_XEN_PHYSDEV_ACCESS" = "y" -o "$CONFIG_XEN_USB_FRONTEND" = "y" ]; then - if [ "$CONFIG_XEN_USB_FRONTEND" = "y" -o "$CONFIG_XEN_USB_BACKEND" = "y" ]; then - define_bool CONFIG_USB y - fi - source drivers/usb/Config.in -fi - -if [ "$CONFIG_XEN_PHYSDEV_ACCESS" = "y" ]; then - source net/bluetooth/Config.in -fi - -mainmenu_option next_comment -comment 'Kernel hacking' - -bool 'Kernel debugging' CONFIG_DEBUG_KERNEL -if [ "$CONFIG_DEBUG_KERNEL" != "n" ]; then - bool ' Check for stack overflows' CONFIG_DEBUG_STACKOVERFLOW - bool ' Debug high memory support' CONFIG_DEBUG_HIGHMEM - bool ' Debug memory allocations' CONFIG_DEBUG_SLAB - bool ' Memory mapped I/O debugging' CONFIG_DEBUG_IOVIRT - bool ' Magic SysRq key' CONFIG_MAGIC_SYSRQ - bool ' Spinlock debugging' CONFIG_DEBUG_SPINLOCK - bool ' Verbose BUG() reporting (adds 70K)' CONFIG_DEBUG_BUGVERBOSE - bool ' Load all symbols for debugging' CONFIG_KALLSYMS - bool ' Compile the kernel with frame pointers' CONFIG_FRAME_POINTER -fi - -int 'Kernel messages buffer length shift (0 = default)' CONFIG_LOG_BUF_SHIFT 0 - -endmenu - -source crypto/Config.in -source lib/Config.in diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/defconfig-xen0 --- a/linux-2.4.30-xen-sparse/arch/xen/defconfig-xen0 Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,927 +0,0 @@ -# -# Automatically generated by make menuconfig: don't edit -# -CONFIG_XEN=y -CONFIG_X86=y -CONFIG_ISA=y -# CONFIG_SBUS is not set -CONFIG_UID16=y - -# -# Xen -# -CONFIG_XEN_PRIVILEGED_GUEST=y -CONFIG_XEN_PHYSDEV_ACCESS=y -# CONFIG_XEN_USB_BACKEND is not set -CONFIG_XEN_SCRUB_PAGES=y -CONFIG_XEN_NETDEV_FRONTEND=y -CONFIG_XEN_BLKDEV_FRONTEND=y -CONFIG_XEN_BLKDEV_GRANT=y -# CONFIG_XEN_USB_FRONTEND is not set -CONFIG_NO_IDLE_HZ=y -CONFIG_FOREIGN_PAGES=y - -# -# Code maturity level options -# -CONFIG_EXPERIMENTAL=y - -# -# Loadable module support -# -CONFIG_MODULES=y 
-CONFIG_MODVERSIONS=y -CONFIG_KMOD=y - -# -# Processor type and features -# -CONFIG_M686=y -# CONFIG_MPENTIUMIII is not set -# CONFIG_MPENTIUM4 is not set -# CONFIG_MK7 is not set -# CONFIG_MK8 is not set -# CONFIG_MVIAC3_2 is not set -CONFIG_X86_WP_WORKS_OK=y -CONFIG_X86_INVLPG=y -CONFIG_X86_CMPXCHG=y -CONFIG_X86_XADD=y -CONFIG_X86_BSWAP=y -CONFIG_X86_POPAD_OK=y -# CONFIG_RWSEM_GENERIC_SPINLOCK is not set -CONFIG_RWSEM_XCHGADD_ALGORITHM=y -CONFIG_X86_GOOD_APIC=y -CONFIG_X86_PGE=y -CONFIG_X86_USE_PPRO_CHECKSUM=y -CONFIG_X86_TSC=y -CONFIG_X86_L1_CACHE_SHIFT=5 -CONFIG_NOHIGHMEM=y -# CONFIG_HIGHMEM4G is not set -CONFIG_FORCE_MAX_ZONEORDER=11 - -# -# General setup -# -CONFIG_NET=y -CONFIG_PCI=y -CONFIG_PCI_NAMES=y -CONFIG_HOTPLUG=y - -# -# PCMCIA/CardBus support -# -# CONFIG_PCMCIA is not set - -# -# PCI Hotplug Support -# -# CONFIG_HOTPLUG_PCI is not set -# CONFIG_HOTPLUG_PCI_COMPAQ is not set -# CONFIG_HOTPLUG_PCI_COMPAQ_NVRAM is not set -# CONFIG_HOTPLUG_PCI_SHPC is not set -# CONFIG_HOTPLUG_PCI_SHPC_POLL_EVENT_MODE is not set -# CONFIG_HOTPLUG_PCI_PCIE is not set -# CONFIG_HOTPLUG_PCI_PCIE_POLL_EVENT_MODE is not set -CONFIG_SYSVIPC=y -# CONFIG_BSD_PROCESS_ACCT is not set -CONFIG_SYSCTL=y -CONFIG_KCORE_ELF=y -# CONFIG_KCORE_AOUT is not set -CONFIG_BINFMT_AOUT=y -CONFIG_BINFMT_ELF=y -# CONFIG_BINFMT_MISC is not set -# CONFIG_OOM_KILLER is not set - -# -# Memory Technology Devices (MTD) -# -# CONFIG_MTD is not set - -# -# Parallel port support -# -# CONFIG_PARPORT is not set - -# -# Plug and Play configuration -# -CONFIG_PNP=y -# CONFIG_ISAPNP is not set - -# -# Block devices -# -# CONFIG_BLK_DEV_FD is not set -# CONFIG_BLK_DEV_XD is not set -# CONFIG_PARIDE is not set -# CONFIG_BLK_CPQ_DA is not set -# CONFIG_BLK_CPQ_CISS_DA is not set -# CONFIG_CISS_SCSI_TAPE is not set -# CONFIG_CISS_MONITOR_THREAD is not set -# CONFIG_BLK_DEV_DAC960 is not set -# CONFIG_BLK_DEV_UMEM is not set -# CONFIG_BLK_DEV_SX8 is not set -CONFIG_BLK_DEV_LOOP=y -CONFIG_BLK_DEV_NBD=y -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_SIZE=4096 -CONFIG_BLK_DEV_INITRD=y -# CONFIG_BLK_STATS is not set - -# -# Multi-device support (RAID and LVM) -# -CONFIG_MD=y -CONFIG_BLK_DEV_MD=y -CONFIG_MD_LINEAR=y -# CONFIG_MD_RAID0 is not set -CONFIG_MD_RAID1=y -# CONFIG_MD_RAID5 is not set -# CONFIG_MD_MULTIPATH is not set -CONFIG_BLK_DEV_LVM=y - -# -# Networking options -# -CONFIG_PACKET=y -CONFIG_PACKET_MMAP=y -# CONFIG_NETLINK_DEV is not set -CONFIG_NETFILTER=y -# CONFIG_NETFILTER_DEBUG is not set -CONFIG_FILTER=y -CONFIG_UNIX=y -CONFIG_INET=y -# CONFIG_IP_MULTICAST is not set -# CONFIG_IP_ADVANCED_ROUTER is not set -CONFIG_IP_PNP=y -CONFIG_IP_PNP_DHCP=y -# CONFIG_IP_PNP_BOOTP is not set -# CONFIG_IP_PNP_RARP is not set -# CONFIG_NET_IPIP is not set -# CONFIG_NET_IPGRE is not set -# CONFIG_ARPD is not set -# CONFIG_INET_ECN is not set -# CONFIG_SYN_COOKIES is not set - -# -# IP: Netfilter Configuration -# -CONFIG_IP_NF_CONNTRACK=m -CONFIG_IP_NF_FTP=m -# CONFIG_IP_NF_AMANDA is not set -CONFIG_IP_NF_TFTP=m -CONFIG_IP_NF_IRC=m -# CONFIG_IP_NF_QUEUE is not set -CONFIG_IP_NF_IPTABLES=y -# CONFIG_IP_NF_MATCH_LIMIT is not set -# CONFIG_IP_NF_MATCH_MAC is not set -# CONFIG_IP_NF_MATCH_PKTTYPE is not set -# CONFIG_IP_NF_MATCH_MARK is not set -# CONFIG_IP_NF_MATCH_MULTIPORT is not set -# CONFIG_IP_NF_MATCH_TOS is not set -# CONFIG_IP_NF_MATCH_RECENT is not set -# CONFIG_IP_NF_MATCH_ECN is not set -# CONFIG_IP_NF_MATCH_DSCP is not set -# CONFIG_IP_NF_MATCH_AH_ESP is not set -# CONFIG_IP_NF_MATCH_LENGTH is not set -# CONFIG_IP_NF_MATCH_TTL is not set 
-# CONFIG_IP_NF_MATCH_TCPMSS is not set -# CONFIG_IP_NF_MATCH_HELPER is not set -CONFIG_IP_NF_MATCH_STATE=m -CONFIG_IP_NF_MATCH_CONNTRACK=m -# CONFIG_IP_NF_MATCH_UNCLEAN is not set -# CONFIG_IP_NF_MATCH_OWNER is not set -CONFIG_IP_NF_MATCH_PHYSDEV=y -CONFIG_IP_NF_FILTER=y -CONFIG_IP_NF_TARGET_REJECT=y -# CONFIG_IP_NF_TARGET_MIRROR is not set -CONFIG_IP_NF_NAT=m -CONFIG_IP_NF_NAT_NEEDED=y -CONFIG_IP_NF_TARGET_MASQUERADE=m -CONFIG_IP_NF_TARGET_REDIRECT=m -# CONFIG_IP_NF_NAT_SNMP_BASIC is not set -CONFIG_IP_NF_NAT_IRC=m -CONFIG_IP_NF_NAT_FTP=m -CONFIG_IP_NF_NAT_TFTP=m -# CONFIG_IP_NF_MANGLE is not set -CONFIG_IP_NF_TARGET_LOG=y -CONFIG_IP_NF_TARGET_ULOG=y -# CONFIG_IP_NF_TARGET_TCPMSS is not set -# CONFIG_IP_NF_ARPTABLES is not set - -# -# IP: Virtual Server Configuration -# -# CONFIG_IP_VS is not set -# CONFIG_IPV6 is not set -# CONFIG_KHTTPD is not set - -# -# SCTP Configuration (EXPERIMENTAL) -# -# CONFIG_IP_SCTP is not set -# CONFIG_ATM is not set -CONFIG_VLAN_8021Q=y -# CONFIG_IPX is not set -# CONFIG_ATALK is not set -# CONFIG_DECNET is not set -CONFIG_BRIDGE=y -CONFIG_BRIDGE_NF_EBTABLES=m -CONFIG_BRIDGE_EBT_T_FILTER=m -CONFIG_BRIDGE_EBT_T_NAT=m -CONFIG_BRIDGE_EBT_BROUTE=m -CONFIG_BRIDGE_EBT_LOG=m -CONFIG_BRIDGE_EBT_IPF=m -CONFIG_BRIDGE_EBT_ARPF=m -CONFIG_BRIDGE_EBT_AMONG=m -CONFIG_BRIDGE_EBT_LIMIT=m -CONFIG_BRIDGE_EBT_VLANF=m -CONFIG_BRIDGE_EBT_802_3=m -CONFIG_BRIDGE_EBT_PKTTYPE=m -CONFIG_BRIDGE_EBT_STP=m -CONFIG_BRIDGE_EBT_MARKF=m -CONFIG_BRIDGE_EBT_ARPREPLY=m -CONFIG_BRIDGE_EBT_SNAT=m -CONFIG_BRIDGE_EBT_DNAT=m -CONFIG_BRIDGE_EBT_REDIRECT=m -CONFIG_BRIDGE_EBT_MARK_T=m -# CONFIG_X25 is not set -# CONFIG_LAPB is not set -# CONFIG_LLC is not set -# CONFIG_NET_DIVERT is not set -# CONFIG_ECONET is not set -# CONFIG_WAN_ROUTER is not set -# CONFIG_NET_FASTROUTE is not set -# CONFIG_NET_HW_FLOWCONTROL is not set - -# -# QoS and/or fair queueing -# -# CONFIG_NET_SCHED is not set - -# -# Network testing -# -# CONFIG_NET_PKTGEN is not set - -# -# ATA/IDE/MFM/RLL support -# -CONFIG_IDE=y - -# -# IDE, ATA and ATAPI Block devices -# -CONFIG_BLK_DEV_IDE=y -# CONFIG_BLK_DEV_HD_IDE is not set -# CONFIG_BLK_DEV_HD is not set -# CONFIG_BLK_DEV_IDE_SATA is not set -CONFIG_BLK_DEV_IDEDISK=y -CONFIG_IDEDISK_MULTI_MODE=y -CONFIG_IDEDISK_STROKE=y -# CONFIG_BLK_DEV_IDECS is not set -# CONFIG_BLK_DEV_DELKIN is not set -CONFIG_BLK_DEV_IDECD=y -CONFIG_BLK_DEV_IDETAPE=y -CONFIG_BLK_DEV_IDEFLOPPY=y -CONFIG_BLK_DEV_IDESCSI=y -CONFIG_IDE_TASK_IOCTL=y -CONFIG_BLK_DEV_CMD640=y -CONFIG_BLK_DEV_CMD640_ENHANCED=y -# CONFIG_BLK_DEV_ISAPNP is not set -CONFIG_BLK_DEV_IDEPCI=y -CONFIG_BLK_DEV_GENERIC=y -CONFIG_IDEPCI_SHARE_IRQ=y -CONFIG_BLK_DEV_IDEDMA_PCI=y -CONFIG_BLK_DEV_OFFBOARD=y -# CONFIG_BLK_DEV_IDEDMA_FORCED is not set -CONFIG_IDEDMA_PCI_AUTO=y -# CONFIG_IDEDMA_ONLYDISK is not set -CONFIG_BLK_DEV_IDEDMA=y -# CONFIG_IDEDMA_PCI_WIP is not set -CONFIG_BLK_DEV_ADMA100=y -CONFIG_BLK_DEV_AEC62XX=y -CONFIG_BLK_DEV_ALI15X3=y -CONFIG_WDC_ALI15X3=y -CONFIG_BLK_DEV_AMD74XX=y -CONFIG_AMD74XX_OVERRIDE=y -# CONFIG_BLK_DEV_ATIIXP is not set -CONFIG_BLK_DEV_CMD64X=y -CONFIG_BLK_DEV_TRIFLEX=y -CONFIG_BLK_DEV_CY82C693=y -CONFIG_BLK_DEV_CS5530=y -CONFIG_BLK_DEV_HPT34X=y -# CONFIG_HPT34X_AUTODMA is not set -CONFIG_BLK_DEV_HPT366=y -CONFIG_BLK_DEV_PIIX=y -CONFIG_BLK_DEV_NS87415=y -# CONFIG_BLK_DEV_OPTI621 is not set -CONFIG_BLK_DEV_PDC202XX_OLD=y -CONFIG_PDC202XX_BURST=y -CONFIG_BLK_DEV_PDC202XX_NEW=y -CONFIG_PDC202XX_FORCE=y -CONFIG_BLK_DEV_RZ1000=y -CONFIG_BLK_DEV_SC1200=y -CONFIG_BLK_DEV_SVWKS=y -CONFIG_BLK_DEV_SIIMAGE=y 
-CONFIG_BLK_DEV_SIS5513=y -CONFIG_BLK_DEV_SLC90E66=y -CONFIG_BLK_DEV_TRM290=y -CONFIG_BLK_DEV_VIA82CXXX=y -CONFIG_IDE_CHIPSETS=y -# CONFIG_BLK_DEV_4DRIVES is not set -# CONFIG_BLK_DEV_ALI14XX is not set -# CONFIG_BLK_DEV_DTC2278 is not set -# CONFIG_BLK_DEV_HT6560B is not set -# CONFIG_BLK_DEV_PDC4030 is not set -# CONFIG_BLK_DEV_QD65XX is not set -# CONFIG_BLK_DEV_UMC8672 is not set -CONFIG_IDEDMA_AUTO=y -# CONFIG_IDEDMA_IVB is not set -# CONFIG_DMA_NONPCI is not set -CONFIG_BLK_DEV_PDC202XX=y -# CONFIG_BLK_DEV_ATARAID is not set -# CONFIG_BLK_DEV_ATARAID_PDC is not set -# CONFIG_BLK_DEV_ATARAID_HPT is not set -# CONFIG_BLK_DEV_ATARAID_MEDLEY is not set -# CONFIG_BLK_DEV_ATARAID_SII is not set - -# -# SCSI support -# -CONFIG_SCSI=y -CONFIG_BLK_DEV_SD=y -CONFIG_SD_EXTRA_DEVS=40 -# CONFIG_CHR_DEV_ST is not set -# CONFIG_CHR_DEV_OSST is not set -# CONFIG_BLK_DEV_SR is not set -CONFIG_CHR_DEV_SG=y -# CONFIG_SCSI_DEBUG_QUEUES is not set -# CONFIG_SCSI_MULTI_LUN is not set -# CONFIG_SCSI_CONSTANTS is not set -# CONFIG_SCSI_LOGGING is not set - -# -# SCSI low-level drivers -# -# CONFIG_BLK_DEV_3W_XXXX_RAID is not set -# CONFIG_SCSI_7000FASST is not set -# CONFIG_SCSI_ACARD is not set -# CONFIG_SCSI_AHA152X is not set -# CONFIG_SCSI_AHA1542 is not set -# CONFIG_SCSI_AHA1740 is not set -CONFIG_SCSI_AACRAID=y -CONFIG_SCSI_AIC7XXX=y -CONFIG_AIC7XXX_CMDS_PER_DEVICE=32 -CONFIG_AIC7XXX_RESET_DELAY_MS=15000 -# CONFIG_AIC7XXX_PROBE_EISA_VL is not set -# CONFIG_AIC7XXX_BUILD_FIRMWARE is not set -# CONFIG_AIC7XXX_DEBUG_ENABLE is not set -CONFIG_AIC7XXX_DEBUG_MASK=0 -# CONFIG_AIC7XXX_REG_PRETTY_PRINT is not set -CONFIG_SCSI_AIC79XX=y -CONFIG_AIC79XX_CMDS_PER_DEVICE=32 -CONFIG_AIC79XX_RESET_DELAY_MS=15000 -# CONFIG_AIC79XX_BUILD_FIRMWARE is not set -# CONFIG_AIC79XX_ENABLE_RD_STRM is not set -# CONFIG_AIC79XX_DEBUG_ENABLE is not set -CONFIG_AIC79XX_DEBUG_MASK=0 -# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set -# CONFIG_SCSI_DPT_I2O is not set -# CONFIG_SCSI_ADVANSYS is not set -# CONFIG_SCSI_IN2000 is not set -# CONFIG_SCSI_AM53C974 is not set -CONFIG_SCSI_MEGARAID=y -# CONFIG_SCSI_MEGARAID2 is not set -CONFIG_SCSI_SATA=y -# CONFIG_SCSI_SATA_AHCI is not set -# CONFIG_SCSI_SATA_SVW is not set -CONFIG_SCSI_ATA_PIIX=y -# CONFIG_SCSI_SATA_NV is not set -# CONFIG_SCSI_SATA_QSTOR is not set -CONFIG_SCSI_SATA_PROMISE=y -CONFIG_SCSI_SATA_SX4=y -CONFIG_SCSI_SATA_SIL=y -CONFIG_SCSI_SATA_SIS=y -# CONFIG_SCSI_SATA_ULI is not set -CONFIG_SCSI_SATA_VIA=y -CONFIG_SCSI_SATA_VITESSE=y -CONFIG_SCSI_BUSLOGIC=y -# CONFIG_SCSI_OMIT_FLASHPOINT is not set -# CONFIG_SCSI_CPQFCTS is not set -# CONFIG_SCSI_DMX3191D is not set -# CONFIG_SCSI_DTC3280 is not set -# CONFIG_SCSI_EATA is not set -# CONFIG_SCSI_EATA_DMA is not set -# CONFIG_SCSI_EATA_PIO is not set -# CONFIG_SCSI_FUTURE_DOMAIN is not set -# CONFIG_SCSI_GDTH is not set -# CONFIG_SCSI_GENERIC_NCR5380 is not set -# CONFIG_SCSI_IPS is not set -# CONFIG_SCSI_INITIO is not set -# CONFIG_SCSI_INIA100 is not set -# CONFIG_SCSI_NCR53C406A is not set -# CONFIG_SCSI_NCR53C7xx is not set -CONFIG_SCSI_SYM53C8XX_2=y -CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1 -CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16 -CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64 -# CONFIG_SCSI_SYM53C8XX_IOMAPPED is not set -# CONFIG_SCSI_PAS16 is not set -# CONFIG_SCSI_PCI2000 is not set -# CONFIG_SCSI_PCI2220I is not set -# CONFIG_SCSI_PSI240I is not set -# CONFIG_SCSI_QLOGIC_FAS is not set -# CONFIG_SCSI_QLOGIC_ISP is not set -# CONFIG_SCSI_QLOGIC_FC is not set -# CONFIG_SCSI_QLOGIC_1280 is not set -# CONFIG_SCSI_SEAGATE is not set -# 
CONFIG_SCSI_SIM710 is not set -# CONFIG_SCSI_SYM53C416 is not set -# CONFIG_SCSI_DC390T is not set -# CONFIG_SCSI_T128 is not set -# CONFIG_SCSI_U14_34F is not set -# CONFIG_SCSI_ULTRASTOR is not set -# CONFIG_SCSI_NSP32 is not set -# CONFIG_SCSI_DEBUG is not set - -# -# Fusion MPT device support -# -# CONFIG_FUSION is not set -# CONFIG_FUSION_BOOT is not set -# CONFIG_FUSION_ISENSE is not set -# CONFIG_FUSION_CTL is not set -# CONFIG_FUSION_LAN is not set - -# -# IEEE 1394 (FireWire) support (EXPERIMENTAL) -# -# CONFIG_IEEE1394 is not set - -# -# I2O device support -# -# CONFIG_I2O is not set -# CONFIG_I2O_PCI is not set -# CONFIG_I2O_BLOCK is not set -# CONFIG_I2O_LAN is not set -# CONFIG_I2O_SCSI is not set -# CONFIG_I2O_PROC is not set - -# -# Network device support -# -CONFIG_NETDEVICES=y - -# -# ARCnet devices -# -# CONFIG_ARCNET is not set -# CONFIG_DUMMY is not set -# CONFIG_BONDING is not set -# CONFIG_EQUALIZER is not set -# CONFIG_TUN is not set -# CONFIG_ETHERTAP is not set - -# -# Ethernet (10 or 100Mbit) -# -CONFIG_NET_ETHERNET=y -# CONFIG_SUNLANCE is not set -# CONFIG_HAPPYMEAL is not set -# CONFIG_SUNBMAC is not set -# CONFIG_SUNQE is not set -# CONFIG_SUNGEM is not set -CONFIG_NET_VENDOR_3COM=y -# CONFIG_EL1 is not set -# CONFIG_EL2 is not set -# CONFIG_ELPLUS is not set -# CONFIG_EL16 is not set -# CONFIG_EL3 is not set -# CONFIG_3C515 is not set -# CONFIG_ELMC is not set -# CONFIG_ELMC_II is not set -CONFIG_VORTEX=y -# CONFIG_TYPHOON is not set -# CONFIG_LANCE is not set -# CONFIG_NET_VENDOR_SMC is not set -# CONFIG_NET_VENDOR_RACAL is not set -# CONFIG_AT1700 is not set -# CONFIG_DEPCA is not set -# CONFIG_HP100 is not set -# CONFIG_NET_ISA is not set -CONFIG_NET_PCI=y -CONFIG_PCNET32=y -# CONFIG_AMD8111_ETH is not set -# CONFIG_ADAPTEC_STARFIRE is not set -# CONFIG_AC3200 is not set -# CONFIG_APRICOT is not set -# CONFIG_B44 is not set -# CONFIG_CS89x0 is not set -# CONFIG_TULIP is not set -# CONFIG_DE4X5 is not set -# CONFIG_DGRS is not set -# CONFIG_DM9102 is not set -# CONFIG_EEPRO100 is not set -# CONFIG_EEPRO100_PIO is not set -CONFIG_E100=y -# CONFIG_LNE390 is not set -# CONFIG_FEALNX is not set -# CONFIG_NATSEMI is not set -CONFIG_NE2K_PCI=y -# CONFIG_FORCEDETH is not set -# CONFIG_NE3210 is not set -# CONFIG_ES3210 is not set -# CONFIG_8139CP is not set -# CONFIG_8139TOO is not set -# CONFIG_8139TOO_PIO is not set -# CONFIG_8139TOO_TUNE_TWISTER is not set -# CONFIG_8139TOO_8129 is not set -# CONFIG_8139_OLD_RX_RESET is not set -# CONFIG_SIS900 is not set -# CONFIG_EPIC100 is not set -# CONFIG_SUNDANCE is not set -# CONFIG_SUNDANCE_MMIO is not set -# CONFIG_TLAN is not set -# CONFIG_VIA_RHINE is not set -# CONFIG_VIA_RHINE_MMIO is not set -# CONFIG_WINBOND_840 is not set -# CONFIG_NET_POCKET is not set - -# -# Ethernet (1000 Mbit) -# -# CONFIG_ACENIC is not set -# CONFIG_DL2K is not set -CONFIG_E1000=y -# CONFIG_E1000_NAPI is not set -# CONFIG_MYRI_SBUS is not set -# CONFIG_NS83820 is not set -# CONFIG_HAMACHI is not set -# CONFIG_YELLOWFIN is not set -# CONFIG_R8169 is not set -# CONFIG_SK98LIN is not set -CONFIG_TIGON3=y -# CONFIG_FDDI is not set -# CONFIG_HIPPI is not set -# CONFIG_PLIP is not set -# CONFIG_PPP is not set -# CONFIG_SLIP is not set - -# -# Wireless LAN (non-hamradio) -# -# CONFIG_NET_RADIO is not set - -# -# Token Ring devices -# -# CONFIG_TR is not set -# CONFIG_NET_FC is not set -# CONFIG_RCPCI is not set -# CONFIG_SHAPER is not set - -# -# Wan interfaces -# -# CONFIG_WAN is not set - -# -# Amateur Radio support -# -# CONFIG_HAMRADIO is not 
set - -# -# IrDA (infrared) support -# -# CONFIG_IRDA is not set - -# -# ISDN subsystem -# -# CONFIG_ISDN is not set - -# -# Old CD-ROM drivers (not SCSI, not IDE) -# -# CONFIG_CD_NO_IDESCSI is not set - -# -# Input core support -# -# CONFIG_INPUT is not set -# CONFIG_INPUT_KEYBDEV is not set -# CONFIG_INPUT_MOUSEDEV is not set -# CONFIG_INPUT_JOYDEV is not set -# CONFIG_INPUT_EVDEV is not set -# CONFIG_INPUT_UINPUT is not set - -# -# Character devices -# -CONFIG_VT=y -CONFIG_VT_CONSOLE=y -# CONFIG_SERIAL is not set -# CONFIG_SERIAL_EXTENDED is not set -# CONFIG_SERIAL_NONSTANDARD is not set -CONFIG_UNIX98_PTYS=y -CONFIG_UNIX98_PTY_COUNT=256 - -# -# I2C support -# -# CONFIG_I2C is not set - -# -# Mice -# -# CONFIG_BUSMOUSE is not set -CONFIG_MOUSE=y -CONFIG_PSMOUSE=y -# CONFIG_82C710_MOUSE is not set -# CONFIG_PC110_PAD is not set -# CONFIG_MK712_MOUSE is not set - -# -# Joysticks -# -# CONFIG_INPUT_GAMEPORT is not set -# CONFIG_QIC02_TAPE is not set -# CONFIG_IPMI_HANDLER is not set -# CONFIG_IPMI_PANIC_EVENT is not set -# CONFIG_IPMI_DEVICE_INTERFACE is not set -# CONFIG_IPMI_KCS is not set -# CONFIG_IPMI_WATCHDOG is not set - -# -# Watchdog Cards -# -# CONFIG_WATCHDOG is not set -# CONFIG_SCx200 is not set -# CONFIG_SCx200_GPIO is not set -# CONFIG_AMD_RNG is not set -# CONFIG_INTEL_RNG is not set -# CONFIG_HW_RANDOM is not set -# CONFIG_AMD_PM768 is not set -# CONFIG_NVRAM is not set -# CONFIG_RTC is not set -# CONFIG_DTLK is not set -# CONFIG_R3964 is not set -# CONFIG_APPLICOM is not set -# CONFIG_SONYPI is not set - -# -# Ftape, the floppy tape device driver -# -# CONFIG_FTAPE is not set -# CONFIG_AGP is not set - -# -# Direct Rendering Manager (XFree86 DRI support) -# -# CONFIG_DRM is not set -# CONFIG_MWAVE is not set -# CONFIG_OBMOUSE is not set - -# -# Multimedia devices -# -# CONFIG_VIDEO_DEV is not set - -# -# File systems -# -# CONFIG_QUOTA is not set -# CONFIG_QFMT_V2 is not set -CONFIG_AUTOFS_FS=y -CONFIG_AUTOFS4_FS=y -# CONFIG_REISERFS_FS is not set -# CONFIG_REISERFS_CHECK is not set -# CONFIG_REISERFS_PROC_INFO is not set -# CONFIG_ADFS_FS is not set -# CONFIG_ADFS_FS_RW is not set -# CONFIG_AFFS_FS is not set -# CONFIG_HFS_FS is not set -# CONFIG_HFSPLUS_FS is not set -# CONFIG_BEFS_FS is not set -# CONFIG_BEFS_DEBUG is not set -# CONFIG_BFS_FS is not set -CONFIG_EXT3_FS=y -CONFIG_JBD=y -# CONFIG_JBD_DEBUG is not set -CONFIG_FAT_FS=y -CONFIG_MSDOS_FS=y -CONFIG_UMSDOS_FS=y -CONFIG_VFAT_FS=y -# CONFIG_EFS_FS is not set -# CONFIG_JFFS_FS is not set -# CONFIG_JFFS2_FS is not set -# CONFIG_CRAMFS is not set -CONFIG_TMPFS=y -CONFIG_RAMFS=y -CONFIG_ISO9660_FS=y -CONFIG_JOLIET=y -CONFIG_ZISOFS=y -# CONFIG_JFS_FS is not set -# CONFIG_JFS_DEBUG is not set -# CONFIG_JFS_STATISTICS is not set -# CONFIG_MINIX_FS is not set -# CONFIG_VXFS_FS is not set -# CONFIG_NTFS_FS is not set -# CONFIG_NTFS_RW is not set -# CONFIG_HPFS_FS is not set -CONFIG_PROC_FS=y -# CONFIG_DEVFS_FS is not set -# CONFIG_DEVFS_MOUNT is not set -# CONFIG_DEVFS_DEBUG is not set -CONFIG_DEVPTS_FS=y -# CONFIG_QNX4FS_FS is not set -# CONFIG_QNX4FS_RW is not set -# CONFIG_ROMFS_FS is not set -CONFIG_EXT2_FS=y -# CONFIG_SYSV_FS is not set -# CONFIG_UDF_FS is not set -# CONFIG_UDF_RW is not set -# CONFIG_UFS_FS is not set -# CONFIG_UFS_FS_WRITE is not set -# CONFIG_XFS_FS is not set -# CONFIG_XFS_QUOTA is not set -# CONFIG_XFS_RT is not set -# CONFIG_XFS_TRACE is not set -# CONFIG_XFS_DEBUG is not set - -# -# Network File Systems -# -# CONFIG_CODA_FS is not set -# CONFIG_INTERMEZZO_FS is not set -CONFIG_NFS_FS=y 
-CONFIG_NFS_V3=y -# CONFIG_NFS_DIRECTIO is not set -CONFIG_ROOT_NFS=y -CONFIG_NFSD=y -CONFIG_NFSD_V3=y -# CONFIG_NFSD_TCP is not set -CONFIG_SUNRPC=y -CONFIG_LOCKD=y -CONFIG_LOCKD_V4=y -# CONFIG_SMB_FS is not set -# CONFIG_NCP_FS is not set -# CONFIG_NCPFS_PACKET_SIGNING is not set -# CONFIG_NCPFS_IOCTL_LOCKING is not set -# CONFIG_NCPFS_STRONG is not set -# CONFIG_NCPFS_NFS_NS is not set -# CONFIG_NCPFS_OS2_NS is not set -# CONFIG_NCPFS_SMALLDOS is not set -# CONFIG_NCPFS_NLS is not set -# CONFIG_NCPFS_EXTRAS is not set -CONFIG_ZISOFS_FS=y - -# -# Partition Types -# -CONFIG_PARTITION_ADVANCED=y -# CONFIG_ACORN_PARTITION is not set -# CONFIG_OSF_PARTITION is not set -# CONFIG_AMIGA_PARTITION is not set -# CONFIG_ATARI_PARTITION is not set -# CONFIG_MAC_PARTITION is not set -CONFIG_MSDOS_PARTITION=y -# CONFIG_BSD_DISKLABEL is not set -# CONFIG_MINIX_SUBPARTITION is not set -# CONFIG_SOLARIS_X86_PARTITION is not set -# CONFIG_UNIXWARE_DISKLABEL is not set -# CONFIG_LDM_PARTITION is not set -# CONFIG_SGI_PARTITION is not set -# CONFIG_ULTRIX_PARTITION is not set -# CONFIG_SUN_PARTITION is not set -# CONFIG_EFI_PARTITION is not set -# CONFIG_SMB_NLS is not set -CONFIG_NLS=y - -# -# Native Language Support -# -CONFIG_NLS_DEFAULT="iso8559-1" -# CONFIG_NLS_CODEPAGE_437 is not set -# CONFIG_NLS_CODEPAGE_737 is not set -# CONFIG_NLS_CODEPAGE_775 is not set -# CONFIG_NLS_CODEPAGE_850 is not set -# CONFIG_NLS_CODEPAGE_852 is not set -# CONFIG_NLS_CODEPAGE_855 is not set -# CONFIG_NLS_CODEPAGE_857 is not set -# CONFIG_NLS_CODEPAGE_860 is not set -# CONFIG_NLS_CODEPAGE_861 is not set -# CONFIG_NLS_CODEPAGE_862 is not set -# CONFIG_NLS_CODEPAGE_863 is not set -# CONFIG_NLS_CODEPAGE_864 is not set -# CONFIG_NLS_CODEPAGE_865 is not set -# CONFIG_NLS_CODEPAGE_866 is not set -# CONFIG_NLS_CODEPAGE_869 is not set -# CONFIG_NLS_CODEPAGE_936 is not set -# CONFIG_NLS_CODEPAGE_950 is not set -# CONFIG_NLS_CODEPAGE_932 is not set -# CONFIG_NLS_CODEPAGE_949 is not set -# CONFIG_NLS_CODEPAGE_874 is not set -# CONFIG_NLS_ISO8859_8 is not set -# CONFIG_NLS_CODEPAGE_1250 is not set -# CONFIG_NLS_CODEPAGE_1251 is not set -CONFIG_NLS_ISO8859_1=y -# CONFIG_NLS_ISO8859_2 is not set -# CONFIG_NLS_ISO8859_3 is not set -# CONFIG_NLS_ISO8859_4 is not set -# CONFIG_NLS_ISO8859_5 is not set -# CONFIG_NLS_ISO8859_6 is not set -# CONFIG_NLS_ISO8859_7 is not set -# CONFIG_NLS_ISO8859_9 is not set -# CONFIG_NLS_ISO8859_13 is not set -# CONFIG_NLS_ISO8859_14 is not set -# CONFIG_NLS_ISO8859_15 is not set -# CONFIG_NLS_KOI8_R is not set -# CONFIG_NLS_KOI8_U is not set -# CONFIG_NLS_UTF8 is not set - -# -# Console drivers -# -CONFIG_XEN_CONSOLE=y -CONFIG_VGA_CONSOLE=y -CONFIG_DUMMY_CONSOLE=y -# CONFIG_VIDEO_SELECT is not set -# CONFIG_MDA_CONSOLE is not set - -# -# Frame-buffer support -# -# CONFIG_FB is not set - -# -# Sound -# -# CONFIG_SOUND is not set - -# -# USB support -# -# CONFIG_USB is not set - -# -# Support for USB gadgets -# -# CONFIG_USB_GADGET is not set - -# -# Bluetooth support -# -# CONFIG_BLUEZ is not set - -# -# Kernel hacking -# -CONFIG_DEBUG_KERNEL=y -# CONFIG_DEBUG_STACKOVERFLOW is not set -# CONFIG_DEBUG_HIGHMEM is not set -# CONFIG_DEBUG_SLAB is not set -# CONFIG_DEBUG_IOVIRT is not set -# CONFIG_MAGIC_SYSRQ is not set -# CONFIG_DEBUG_SPINLOCK is not set -# CONFIG_DEBUG_BUGVERBOSE is not set -CONFIG_KALLSYMS=y -# CONFIG_FRAME_POINTER is not set -CONFIG_LOG_BUF_SHIFT=0 - -# -# Cryptographic options -# -CONFIG_CRYPTO=y -CONFIG_CRYPTO_HMAC=y -CONFIG_CRYPTO_NULL=m -CONFIG_CRYPTO_MD4=m -CONFIG_CRYPTO_MD5=m 
-CONFIG_CRYPTO_SHA1=m -CONFIG_CRYPTO_SHA256=m -CONFIG_CRYPTO_SHA512=m -# CONFIG_CRYPTO_WP512 is not set -CONFIG_CRYPTO_DES=m -CONFIG_CRYPTO_BLOWFISH=m -CONFIG_CRYPTO_TWOFISH=m -CONFIG_CRYPTO_SERPENT=m -CONFIG_CRYPTO_AES=m -CONFIG_CRYPTO_CAST5=m -CONFIG_CRYPTO_CAST6=m -# CONFIG_CRYPTO_TEA is not set -# CONFIG_CRYPTO_KHAZAD is not set -# CONFIG_CRYPTO_ANUBIS is not set -CONFIG_CRYPTO_ARC4=m -CONFIG_CRYPTO_DEFLATE=m -# CONFIG_CRYPTO_MICHAEL_MIC is not set -# CONFIG_CRYPTO_TEST is not set - -# -# Library routines -# -# CONFIG_CRC32 is not set -CONFIG_ZLIB_INFLATE=y -CONFIG_ZLIB_DEFLATE=m -# CONFIG_FW_LOADER is not set diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/defconfig-xenU --- a/linux-2.4.30-xen-sparse/arch/xen/defconfig-xenU Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,562 +0,0 @@ -# -# Automatically generated make config: don't edit -# -CONFIG_XEN=y -CONFIG_X86=y -CONFIG_ISA=y -# CONFIG_SBUS is not set -CONFIG_UID16=y - -# -# Xen -# -# CONFIG_XEN_PRIVILEGED_GUEST is not set -# CONFIG_XEN_PHYSDEV_ACCESS is not set -CONFIG_XEN_SCRUB_PAGES=y -CONFIG_XEN_NETDEV_FRONTEND=y -CONFIG_XEN_BLKDEV_FRONTEND=y -CONFIG_XEN_BLKDEV_GRANT=y -# CONFIG_XEN_USB_FRONTEND is not set -CONFIG_NO_IDLE_HZ=y -# CONFIG_FOREIGN_PAGES is not set -CONFIG_NETDEVICES=y -# CONFIG_VT is not set - -# -# Code maturity level options -# -CONFIG_EXPERIMENTAL=y - -# -# Loadable module support -# -CONFIG_MODULES=y -CONFIG_MODVERSIONS=y -CONFIG_KMOD=y - -# -# Processor type and features -# -CONFIG_M686=y -# CONFIG_MPENTIUMIII is not set -# CONFIG_MPENTIUM4 is not set -# CONFIG_MK7 is not set -# CONFIG_MK8 is not set -# CONFIG_MVIAC3_2 is not set -CONFIG_X86_WP_WORKS_OK=y -CONFIG_X86_INVLPG=y -CONFIG_X86_CMPXCHG=y -CONFIG_X86_XADD=y -CONFIG_X86_BSWAP=y -CONFIG_X86_POPAD_OK=y -# CONFIG_RWSEM_GENERIC_SPINLOCK is not set -CONFIG_RWSEM_XCHGADD_ALGORITHM=y -CONFIG_X86_GOOD_APIC=y -CONFIG_X86_PGE=y -CONFIG_X86_USE_PPRO_CHECKSUM=y -CONFIG_X86_TSC=y -CONFIG_X86_L1_CACHE_SHIFT=5 -CONFIG_NOHIGHMEM=y -# CONFIG_HIGHMEM4G is not set -CONFIG_FORCE_MAX_ZONEORDER=11 - -# -# General setup -# -CONFIG_NET=y -CONFIG_SYSVIPC=y -# CONFIG_BSD_PROCESS_ACCT is not set -CONFIG_SYSCTL=y -CONFIG_KCORE_ELF=y -# CONFIG_KCORE_AOUT is not set -CONFIG_BINFMT_AOUT=y -CONFIG_BINFMT_ELF=y -# CONFIG_BINFMT_MISC is not set -# CONFIG_OOM_KILLER is not set - -# -# Networking options -# -CONFIG_PACKET=y -CONFIG_PACKET_MMAP=y -# CONFIG_NETLINK_DEV is not set -CONFIG_NETFILTER=y -# CONFIG_NETFILTER_DEBUG is not set -CONFIG_FILTER=y -CONFIG_UNIX=y -CONFIG_INET=y -# CONFIG_IP_MULTICAST is not set -# CONFIG_IP_ADVANCED_ROUTER is not set -CONFIG_IP_PNP=y -CONFIG_IP_PNP_DHCP=y -# CONFIG_IP_PNP_BOOTP is not set -# CONFIG_IP_PNP_RARP is not set -# CONFIG_NET_IPIP is not set -# CONFIG_NET_IPGRE is not set -# CONFIG_ARPD is not set -# CONFIG_INET_ECN is not set -# CONFIG_SYN_COOKIES is not set - -# -# IP: Netfilter Configuration -# -CONFIG_IP_NF_CONNTRACK=y -CONFIG_IP_NF_FTP=y -# CONFIG_IP_NF_AMANDA is not set -CONFIG_IP_NF_TFTP=y -CONFIG_IP_NF_IRC=y -# CONFIG_IP_NF_QUEUE is not set -CONFIG_IP_NF_IPTABLES=y -# CONFIG_IP_NF_MATCH_LIMIT is not set -# CONFIG_IP_NF_MATCH_MAC is not set -# CONFIG_IP_NF_MATCH_PKTTYPE is not set -# CONFIG_IP_NF_MATCH_MARK is not set -# CONFIG_IP_NF_MATCH_MULTIPORT is not set -# CONFIG_IP_NF_MATCH_TOS is not set -# CONFIG_IP_NF_MATCH_RECENT is not set -# CONFIG_IP_NF_MATCH_ECN is not set -# CONFIG_IP_NF_MATCH_DSCP is not set -# CONFIG_IP_NF_MATCH_AH_ESP is not set -# CONFIG_IP_NF_MATCH_LENGTH is not set -# 
CONFIG_IP_NF_MATCH_TTL is not set -# CONFIG_IP_NF_MATCH_TCPMSS is not set -# CONFIG_IP_NF_MATCH_HELPER is not set -CONFIG_IP_NF_MATCH_STATE=y -CONFIG_IP_NF_MATCH_CONNTRACK=y -# CONFIG_IP_NF_MATCH_UNCLEAN is not set -# CONFIG_IP_NF_MATCH_OWNER is not set -CONFIG_IP_NF_FILTER=y -CONFIG_IP_NF_TARGET_REJECT=y -# CONFIG_IP_NF_TARGET_MIRROR is not set -CONFIG_IP_NF_NAT=y -CONFIG_IP_NF_NAT_NEEDED=y -CONFIG_IP_NF_TARGET_MASQUERADE=y -CONFIG_IP_NF_TARGET_REDIRECT=y -# CONFIG_IP_NF_NAT_SNMP_BASIC is not set -CONFIG_IP_NF_NAT_IRC=y -CONFIG_IP_NF_NAT_FTP=y -CONFIG_IP_NF_NAT_TFTP=y -# CONFIG_IP_NF_MANGLE is not set -CONFIG_IP_NF_TARGET_LOG=y -CONFIG_IP_NF_TARGET_ULOG=y -# CONFIG_IP_NF_TARGET_TCPMSS is not set -# CONFIG_IP_NF_ARPTABLES is not set - -# -# IP: Virtual Server Configuration -# -# CONFIG_IP_VS is not set -# CONFIG_IPV6 is not set -# CONFIG_KHTTPD is not set - -# -# SCTP Configuration (EXPERIMENTAL) -# -# CONFIG_IP_SCTP is not set -# CONFIG_ATM is not set -CONFIG_VLAN_8021Q=y - -# -# -# -# CONFIG_IPX is not set -# CONFIG_ATALK is not set -# CONFIG_DECNET is not set -# CONFIG_BRIDGE is not set -# CONFIG_X25 is not set -# CONFIG_LAPB is not set -# CONFIG_LLC is not set -# CONFIG_NET_DIVERT is not set -# CONFIG_ECONET is not set -# CONFIG_WAN_ROUTER is not set -# CONFIG_NET_FASTROUTE is not set -# CONFIG_NET_HW_FLOWCONTROL is not set - -# -# QoS and/or fair queueing -# -# CONFIG_NET_SCHED is not set - -# -# Network testing -# -# CONFIG_NET_PKTGEN is not set - -# -# SCSI support -# -CONFIG_SCSI=y - -# -# SCSI support type (disk, tape, CD-ROM) -# -CONFIG_BLK_DEV_SD=y -CONFIG_SD_EXTRA_DEVS=40 -# CONFIG_CHR_DEV_ST is not set -# CONFIG_CHR_DEV_OSST is not set -# CONFIG_BLK_DEV_SR is not set -CONFIG_CHR_DEV_SG=y - -# -# Some SCSI devices (e.g. CD jukebox) support multiple LUNs -# -# CONFIG_SCSI_DEBUG_QUEUES is not set -# CONFIG_SCSI_MULTI_LUN is not set -# CONFIG_SCSI_CONSTANTS is not set -# CONFIG_SCSI_LOGGING is not set - -# -# SCSI low-level drivers -# -# CONFIG_SCSI_7000FASST is not set -# CONFIG_SCSI_ACARD is not set -# CONFIG_SCSI_AHA152X is not set -# CONFIG_SCSI_AHA1542 is not set -# CONFIG_SCSI_AHA1740 is not set -# CONFIG_SCSI_AACRAID is not set -# CONFIG_SCSI_AIC7XXX is not set -# CONFIG_SCSI_AIC79XX is not set -# CONFIG_SCSI_AIC7XXX_OLD is not set -# CONFIG_SCSI_DPT_I2O is not set -# CONFIG_SCSI_ADVANSYS is not set -# CONFIG_SCSI_IN2000 is not set -# CONFIG_SCSI_AM53C974 is not set -# CONFIG_SCSI_MEGARAID is not set -# CONFIG_SCSI_MEGARAID2 is not set -# CONFIG_SCSI_SATA is not set -# CONFIG_SCSI_SATA_AHCI is not set -# CONFIG_SCSI_SATA_SVW is not set -# CONFIG_SCSI_ATA_PIIX is not set -# CONFIG_SCSI_SATA_NV is not set -# CONFIG_SCSI_SATA_QSTOR is not set -# CONFIG_SCSI_SATA_PROMISE is not set -# CONFIG_SCSI_SATA_SX4 is not set -# CONFIG_SCSI_SATA_SIL is not set -# CONFIG_SCSI_SATA_SIS is not set -# CONFIG_SCSI_SATA_ULI is not set -# CONFIG_SCSI_SATA_VIA is not set -# CONFIG_SCSI_SATA_VITESSE is not set -# CONFIG_SCSI_BUSLOGIC is not set -# CONFIG_SCSI_DMX3191D is not set -# CONFIG_SCSI_DTC3280 is not set -# CONFIG_SCSI_EATA is not set -# CONFIG_SCSI_EATA_DMA is not set -# CONFIG_SCSI_EATA_PIO is not set -# CONFIG_SCSI_FUTURE_DOMAIN is not set -# CONFIG_SCSI_GDTH is not set -# CONFIG_SCSI_GENERIC_NCR5380 is not set -# CONFIG_SCSI_IPS is not set -# CONFIG_SCSI_INITIO is not set -# CONFIG_SCSI_INIA100 is not set -# CONFIG_SCSI_PPA is not set -# CONFIG_SCSI_IMM is not set -# CONFIG_SCSI_NCR53C406A is not set -# CONFIG_SCSI_NCR53C7xx is not set -# CONFIG_SCSI_PAS16 is not set -# 
CONFIG_SCSI_PCI2000 is not set -# CONFIG_SCSI_PCI2220I is not set -# CONFIG_SCSI_PSI240I is not set -# CONFIG_SCSI_QLOGIC_FAS is not set -# CONFIG_SCSI_SEAGATE is not set -# CONFIG_SCSI_SIM710 is not set -# CONFIG_SCSI_SYM53C416 is not set -# CONFIG_SCSI_T128 is not set -# CONFIG_SCSI_U14_34F is not set -# CONFIG_SCSI_ULTRASTOR is not set -# CONFIG_SCSI_NSP32 is not set -# CONFIG_SCSI_DEBUG is not set - -# -# Block devices -# -CONFIG_BLK_DEV_LOOP=y -CONFIG_BLK_DEV_NBD=y -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_SIZE=4096 -CONFIG_BLK_DEV_INITRD=y -# CONFIG_BLK_STATS is not set -# CONFIG_BLK_DEV_HD is not set - -# -# Character devices -# -# CONFIG_VT is not set -# CONFIG_SERIAL is not set -# CONFIG_SERIAL_EXTENDED is not set -# CONFIG_SERIAL_NONSTANDARD is not set -CONFIG_UNIX98_PTYS=y -CONFIG_UNIX98_PTY_COUNT=256 -# CONFIG_PRINTER is not set -# CONFIG_PPDEV is not set -# CONFIG_TIPAR is not set - -# -# I2C support -# -# CONFIG_I2C is not set - -# -# Mice -# -# CONFIG_BUSMOUSE is not set -CONFIG_MOUSE=y -CONFIG_PSMOUSE=y -# CONFIG_82C710_MOUSE is not set -# CONFIG_PC110_PAD is not set -# CONFIG_MK712_MOUSE is not set - -# -# Joysticks -# -# CONFIG_INPUT_GAMEPORT is not set -# CONFIG_INPUT_NS558 is not set -# CONFIG_INPUT_LIGHTNING is not set -# CONFIG_INPUT_PCIGAME is not set -# CONFIG_INPUT_CS461X is not set -# CONFIG_INPUT_EMU10K1 is not set -# CONFIG_INPUT_SERIO is not set -# CONFIG_INPUT_SERPORT is not set - -# -# Joysticks -# -# CONFIG_INPUT_ANALOG is not set -# CONFIG_INPUT_A3D is not set -# CONFIG_INPUT_ADI is not set -# CONFIG_INPUT_COBRA is not set -# CONFIG_INPUT_GF2K is not set -# CONFIG_INPUT_GRIP is not set -# CONFIG_INPUT_INTERACT is not set -# CONFIG_INPUT_TMDC is not set -# CONFIG_INPUT_SIDEWINDER is not set -# CONFIG_INPUT_IFORCE_USB is not set -# CONFIG_INPUT_IFORCE_232 is not set -# CONFIG_INPUT_WARRIOR is not set -# CONFIG_INPUT_MAGELLAN is not set -# CONFIG_INPUT_SPACEORB is not set -# CONFIG_INPUT_SPACEBALL is not set -# CONFIG_INPUT_STINGER is not set -# CONFIG_INPUT_DB9 is not set -# CONFIG_INPUT_GAMECON is not set -# CONFIG_INPUT_TURBOGRAFX is not set -# CONFIG_QIC02_TAPE is not set -# CONFIG_IPMI_HANDLER is not set -# CONFIG_IPMI_PANIC_EVENT is not set -# CONFIG_IPMI_DEVICE_INTERFACE is not set -# CONFIG_IPMI_KCS is not set -# CONFIG_IPMI_WATCHDOG is not set - -# -# Watchdog Cards -# -# CONFIG_WATCHDOG is not set -# CONFIG_SCx200 is not set -# CONFIG_SCx200_GPIO is not set -# CONFIG_AMD_RNG is not set -# CONFIG_INTEL_RNG is not set -# CONFIG_HW_RANDOM is not set -# CONFIG_AMD_PM768 is not set -# CONFIG_NVRAM is not set -# CONFIG_RTC is not set -# CONFIG_DTLK is not set -# CONFIG_R3964 is not set -# CONFIG_APPLICOM is not set -# CONFIG_SONYPI is not set - -# -# Ftape, the floppy tape device driver -# -# CONFIG_FTAPE is not set -# CONFIG_AGP is not set - -# -# Direct Rendering Manager (XFree86 DRI support) -# -# CONFIG_DRM is not set -# CONFIG_MWAVE is not set -# CONFIG_OBMOUSE is not set - -# -# File systems -# -# CONFIG_QUOTA is not set -# CONFIG_QFMT_V2 is not set -CONFIG_AUTOFS_FS=y -CONFIG_AUTOFS4_FS=y -# CONFIG_REISERFS_FS is not set -# CONFIG_REISERFS_CHECK is not set -# CONFIG_REISERFS_PROC_INFO is not set -# CONFIG_ADFS_FS is not set -# CONFIG_ADFS_FS_RW is not set -# CONFIG_AFFS_FS is not set -# CONFIG_HFS_FS is not set -# CONFIG_HFSPLUS_FS is not set -# CONFIG_BEFS_FS is not set -# CONFIG_BEFS_DEBUG is not set -# CONFIG_BFS_FS is not set -CONFIG_EXT3_FS=y -CONFIG_JBD=y -# CONFIG_JBD_DEBUG is not set -CONFIG_FAT_FS=y -CONFIG_MSDOS_FS=y -CONFIG_UMSDOS_FS=y 
-CONFIG_VFAT_FS=y -# CONFIG_EFS_FS is not set -# CONFIG_JFFS_FS is not set -# CONFIG_JFFS2_FS is not set -# CONFIG_CRAMFS is not set -CONFIG_TMPFS=y -CONFIG_RAMFS=y -CONFIG_ISO9660_FS=y -CONFIG_JOLIET=y -CONFIG_ZISOFS=y -# CONFIG_JFS_FS is not set -# CONFIG_JFS_DEBUG is not set -# CONFIG_JFS_STATISTICS is not set -# CONFIG_MINIX_FS is not set -# CONFIG_VXFS_FS is not set -# CONFIG_NTFS_FS is not set -# CONFIG_NTFS_RW is not set -# CONFIG_HPFS_FS is not set -CONFIG_PROC_FS=y -# CONFIG_DEVFS_FS is not set -# CONFIG_DEVFS_MOUNT is not set -# CONFIG_DEVFS_DEBUG is not set -CONFIG_DEVPTS_FS=y -# CONFIG_QNX4FS_FS is not set -# CONFIG_QNX4FS_RW is not set -# CONFIG_ROMFS_FS is not set -CONFIG_EXT2_FS=y -# CONFIG_SYSV_FS is not set -# CONFIG_UDF_FS is not set -# CONFIG_UDF_RW is not set -# CONFIG_UFS_FS is not set -# CONFIG_UFS_FS_WRITE is not set -# CONFIG_XFS_FS is not set -# CONFIG_XFS_QUOTA is not set -# CONFIG_XFS_RT is not set -# CONFIG_XFS_TRACE is not set -# CONFIG_XFS_DEBUG is not set - -# -# Network File Systems -# -# CONFIG_CODA_FS is not set -# CONFIG_INTERMEZZO_FS is not set -CONFIG_NFS_FS=y -CONFIG_NFS_V3=y -# CONFIG_NFS_DIRECTIO is not set -CONFIG_ROOT_NFS=y -CONFIG_NFSD=y -CONFIG_NFSD_V3=y -# CONFIG_NFSD_TCP is not set -CONFIG_SUNRPC=y -CONFIG_LOCKD=y -CONFIG_LOCKD_V4=y -# CONFIG_SMB_FS is not set -# CONFIG_NCP_FS is not set -# CONFIG_NCPFS_PACKET_SIGNING is not set -# CONFIG_NCPFS_IOCTL_LOCKING is not set -# CONFIG_NCPFS_STRONG is not set -# CONFIG_NCPFS_NFS_NS is not set -# CONFIG_NCPFS_OS2_NS is not set -# CONFIG_NCPFS_SMALLDOS is not set -# CONFIG_NCPFS_NLS is not set -# CONFIG_NCPFS_EXTRAS is not set -CONFIG_ZISOFS_FS=y - -# -# Partition Types -# -CONFIG_PARTITION_ADVANCED=y -# CONFIG_ACORN_PARTITION is not set -# CONFIG_OSF_PARTITION is not set -# CONFIG_AMIGA_PARTITION is not set -# CONFIG_ATARI_PARTITION is not set -# CONFIG_MAC_PARTITION is not set -CONFIG_MSDOS_PARTITION=y -# CONFIG_BSD_DISKLABEL is not set -# CONFIG_MINIX_SUBPARTITION is not set -# CONFIG_SOLARIS_X86_PARTITION is not set -# CONFIG_UNIXWARE_DISKLABEL is not set -# CONFIG_LDM_PARTITION is not set -# CONFIG_SGI_PARTITION is not set -# CONFIG_ULTRIX_PARTITION is not set -# CONFIG_SUN_PARTITION is not set -# CONFIG_EFI_PARTITION is not set -# CONFIG_SMB_NLS is not set -CONFIG_NLS=y - -# -# Native Language Support -# -CONFIG_NLS_DEFAULT="iso8559-1" -# CONFIG_NLS_CODEPAGE_437 is not set -# CONFIG_NLS_CODEPAGE_737 is not set -# CONFIG_NLS_CODEPAGE_775 is not set -# CONFIG_NLS_CODEPAGE_850 is not set -# CONFIG_NLS_CODEPAGE_852 is not set -# CONFIG_NLS_CODEPAGE_855 is not set -# CONFIG_NLS_CODEPAGE_857 is not set -# CONFIG_NLS_CODEPAGE_860 is not set -# CONFIG_NLS_CODEPAGE_861 is not set -# CONFIG_NLS_CODEPAGE_862 is not set -# CONFIG_NLS_CODEPAGE_863 is not set -# CONFIG_NLS_CODEPAGE_864 is not set -# CONFIG_NLS_CODEPAGE_865 is not set -# CONFIG_NLS_CODEPAGE_866 is not set -# CONFIG_NLS_CODEPAGE_869 is not set -# CONFIG_NLS_CODEPAGE_936 is not set -# CONFIG_NLS_CODEPAGE_950 is not set -# CONFIG_NLS_CODEPAGE_932 is not set -# CONFIG_NLS_CODEPAGE_949 is not set -# CONFIG_NLS_CODEPAGE_874 is not set -# CONFIG_NLS_ISO8859_8 is not set -# CONFIG_NLS_CODEPAGE_1250 is not set -# CONFIG_NLS_CODEPAGE_1251 is not set -CONFIG_NLS_ISO8859_1=y -# CONFIG_NLS_ISO8859_2 is not set -# CONFIG_NLS_ISO8859_3 is not set -# CONFIG_NLS_ISO8859_4 is not set -# CONFIG_NLS_ISO8859_5 is not set -# CONFIG_NLS_ISO8859_6 is not set -# CONFIG_NLS_ISO8859_7 is not set -# CONFIG_NLS_ISO8859_9 is not set -# CONFIG_NLS_ISO8859_13 is not set -# 
CONFIG_NLS_ISO8859_14 is not set -# CONFIG_NLS_ISO8859_15 is not set -# CONFIG_NLS_KOI8_R is not set -# CONFIG_NLS_KOI8_U is not set -# CONFIG_NLS_UTF8 is not set - -# -# Console drivers -# -CONFIG_XEN_CONSOLE=y - -# -# Kernel hacking -# -CONFIG_DEBUG_KERNEL=y -# CONFIG_DEBUG_STACKOVERFLOW is not set -# CONFIG_DEBUG_HIGHMEM is not set -# CONFIG_DEBUG_SLAB is not set -# CONFIG_DEBUG_IOVIRT is not set -# CONFIG_MAGIC_SYSRQ is not set -# CONFIG_DEBUG_SPINLOCK is not set -# CONFIG_DEBUG_BUGVERBOSE is not set -CONFIG_KALLSYMS=y -# CONFIG_FRAME_POINTER is not set -CONFIG_LOG_BUF_SHIFT=0 - -# -# Cryptographic options -# -# CONFIG_CRYPTO is not set - -# -# Library routines -# -# CONFIG_CRC32 is not set -CONFIG_ZLIB_INFLATE=y -# CONFIG_ZLIB_DEFLATE is not set diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/drivers/balloon/Makefile --- a/linux-2.4.30-xen-sparse/arch/xen/drivers/balloon/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,4 +0,0 @@ -O_TARGET := drv.o -export-objs := balloon.o -obj-y := balloon.o -include $(TOPDIR)/Rules.make diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/drivers/blkif/Makefile --- a/linux-2.4.30-xen-sparse/arch/xen/drivers/blkif/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,10 +0,0 @@ - -O_TARGET := drv.o - -subdir-$(CONFIG_XEN_BLKDEV_FRONTEND) += frontend -obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += frontend/drv.o - -subdir-$(CONFIG_XEN_PHYSDEV_ACCESS) += backend -obj-$(CONFIG_XEN_PHYSDEV_ACCESS) += backend/drv.o - -include $(TOPDIR)/Rules.make diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/drivers/blkif/backend/Makefile --- a/linux-2.4.30-xen-sparse/arch/xen/drivers/blkif/backend/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,3 +0,0 @@ -O_TARGET := drv.o -obj-y := main.o control.o interface.o vbd.o -include $(TOPDIR)/Rules.make diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/drivers/blkif/frontend/Makefile --- a/linux-2.4.30-xen-sparse/arch/xen/drivers/blkif/frontend/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,3 +0,0 @@ -O_TARGET := drv.o -obj-y := blkfront.o vbd.o -include $(TOPDIR)/Rules.make diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/drivers/blkif/frontend/common.h --- a/linux-2.4.30-xen-sparse/arch/xen/drivers/blkif/frontend/common.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,93 +0,0 @@ -/****************************************************************************** - * arch/xen/drivers/blkif/frontend/common.h - * - * Shared definitions between all levels of XenoLinux Virtual block devices. - */ - -#ifndef __XEN_DRIVERS_COMMON_H__ -#define __XEN_DRIVERS_COMMON_H__ - -#include <linux/config.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/hdreg.h> -#include <linux/blkdev.h> -#include <linux/major.h> -#include <asm-xen/xen-public/xen.h> -#include <asm/io.h> -#include <asm/atomic.h> -#include <asm/uaccess.h> -#include <asm-xen/xen-public/io/blkif.h> - -#if 1 -#define IPRINTK(fmt, args...) \ - printk(KERN_INFO "xen_blk: " fmt, ##args) -#else -#define IPRINTK(fmt, args...) ((void)0) -#endif - -#if 1 -#define WPRINTK(fmt, args...) \ - printk(KERN_WARNING "xen_blk: " fmt, ##args) -#else -#define WPRINTK(fmt, args...) 
((void)0) -#endif - -#if 0 -#define DPRINTK(_f, _a...) printk ( KERN_ALERT _f , ## _a ) -#else -#define DPRINTK(_f, _a...) ((void)0) -#endif - -#if 0 -#define DPRINTK_IOCTL(_f, _a...) printk ( KERN_ALERT _f , ## _a ) -#else -#define DPRINTK_IOCTL(_f, _a...) ((void)0) -#endif - -/* Private gendisk->flags[] values. */ -#define GENHD_FL_XEN 2 /* Is unit a Xen block device? */ -#define GENHD_FL_VIRT_PARTNS 4 /* Are unit partitions virtual? */ - -/* - * We have one of these per vbd, whether ide, scsi or 'other'. - * They hang in an array off the gendisk structure. We may end up putting - * all kinds of interesting stuff here :-) - */ -typedef struct xl_disk { - int usage; -} xl_disk_t; - -extern int blkif_open(struct inode *inode, struct file *filep); -extern int blkif_release(struct inode *inode, struct file *filep); -extern int blkif_ioctl(struct inode *inode, struct file *filep, - unsigned command, unsigned long argument); -extern int blkif_check(kdev_t dev); -extern int blkif_revalidate(kdev_t dev); -extern void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp); -extern void do_blkif_request (request_queue_t *rq); - -extern void xlvbd_update_vbds(void); - -static inline xl_disk_t *xldev_to_xldisk(kdev_t xldev) -{ - struct gendisk *gd = get_gendisk(xldev); - - if ( gd == NULL ) - return NULL; - - return (xl_disk_t *)gd->real_devices + - (MINOR(xldev) >> gd->minor_shift); -} - - -/* Virtual block-device subsystem. */ -extern int xlvbd_init(void); -extern void xlvbd_cleanup(void); - -#endif /* __XEN_DRIVERS_COMMON_H__ */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/drivers/blkif/frontend/vbd.c --- a/linux-2.4.30-xen-sparse/arch/xen/drivers/blkif/frontend/vbd.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,540 +0,0 @@ -/****************************************************************************** - * arch/xen/drivers/blkif/frontend/vbd.c - * - * Xenolinux virtual block-device driver. - * - * Copyright (c) 2003-2004, Keir Fraser & Steve Hand - * Modifications by Mark A. Williamson are (c) Intel Research Cambridge - */ - -#include "common.h" -#include <linux/blk.h> - -/* - * For convenience we distinguish between ide, scsi and 'other' (i.e. - * potentially combinations of the two) in the naming scheme and in a few - * other places (like default readahead, etc). - */ -#define XLIDE_MAJOR_NAME "hd" -#define XLSCSI_MAJOR_NAME "sd" -#define XLVBD_MAJOR_NAME "xvd" - -#define XLIDE_DEVS_PER_MAJOR 2 -#define XLSCSI_DEVS_PER_MAJOR 16 -#define XLVBD_DEVS_PER_MAJOR 16 - -#define XLIDE_PARTN_SHIFT 6 /* amount to shift minor to get 'real' minor */ -#define XLIDE_MAX_PART (1 << XLIDE_PARTN_SHIFT) /* minors per ide vbd */ - -#define XLSCSI_PARTN_SHIFT 4 /* amount to shift minor to get 'real' minor */ -#define XLSCSI_MAX_PART (1 << XLSCSI_PARTN_SHIFT) /* minors per scsi vbd */ - -#define XLVBD_PARTN_SHIFT 4 /* amount to shift minor to get 'real' minor */ -#define XLVBD_MAX_PART (1 << XLVBD_PARTN_SHIFT) /* minors per 'other' vbd */ - -/* The below are for the generic drivers/block/ll_rw_block.c code. */ -static int xlide_blksize_size[256]; -static int xlide_hardsect_size[256]; -static int xlide_max_sectors[256]; -static int xlscsi_blksize_size[256]; -static int xlscsi_hardsect_size[256]; -static int xlscsi_max_sectors[256]; -static int xlvbd_blksize_size[256]; -static int xlvbd_hardsect_size[256]; -static int xlvbd_max_sectors[256]; - -/* Information about our VBDs. 
*/ -#define MAX_VBDS 64 -static int nr_vbds; -static vdisk_t *vbd_info; - -static struct block_device_operations xlvbd_block_fops = -{ - open: blkif_open, - release: blkif_release, - ioctl: blkif_ioctl, - check_media_change: blkif_check, - revalidate: blkif_revalidate, -}; - -static int xlvbd_get_vbd_info(vdisk_t *disk_info) -{ - vdisk_t *buf = (vdisk_t *)__get_free_page(GFP_KERNEL); - blkif_request_t req; - blkif_response_t rsp; - int nr; - - memset(&req, 0, sizeof(req)); - req.operation = BLKIF_OP_PROBE; - req.nr_segments = 1; -#ifdef CONFIG_XEN_BLKDEV_GRANT - blkif_control_probe_send(&req, &rsp, - (unsigned long)(virt_to_machine(buf))); -#else - req.frame_and_sects[0] = virt_to_machine(buf) | 7; - - blkif_control_send(&req, &rsp); -#endif - - if ( rsp.status <= 0 ) - { - printk(KERN_ALERT "Could not probe disks (%d)\n", rsp.status); - return -1; - } - - if ( (nr = rsp.status) > MAX_VBDS ) - nr = MAX_VBDS; - memcpy(disk_info, buf, nr * sizeof(vdisk_t)); - - return nr; -} - -/* - * xlvbd_init_device - initialise a VBD device - * @disk: a vdisk_t describing the VBD - * - * Takes a vdisk_t * that describes a VBD the domain has access to. - * Performs appropriate initialisation and registration of the device. - * - * Care needs to be taken when making re-entrant calls to ensure that - * corruption does not occur. Also, devices that are in use should not have - * their details updated. This is the caller's responsibility. - */ -static int xlvbd_init_device(vdisk_t *xd) -{ - int device = xd->device; - int major = MAJOR(device); - int minor = MINOR(device); - int is_ide = IDE_DISK_MAJOR(major); /* is this an ide device? */ - int is_scsi= SCSI_BLK_MAJOR(major); /* is this a scsi device? */ - char *major_name; - struct gendisk *gd; - struct block_device *bd; - xl_disk_t *disk; - int i, rc = 0, max_part, partno; - unsigned long capacity; - - unsigned char buf[64]; - - if ( (bd = bdget(device)) == NULL ) - return -1; - - if ( ((disk = xldev_to_xldisk(device)) != NULL) && (disk->usage != 0) ) - { - printk(KERN_ALERT "VBD update failed - in use [dev=%x]\n", device); - rc = -1; - goto out; - } - - if ( is_ide ) { - - major_name = XLIDE_MAJOR_NAME; - max_part = XLIDE_MAX_PART; - - } else if ( is_scsi ) { - - major_name = XLSCSI_MAJOR_NAME; - max_part = XLSCSI_MAX_PART; - - } else { - - /* SMH: hmm - probably a CCISS driver or sim; assume CCISS for now */ - printk(KERN_ALERT "Assuming device %02x:%02x is CCISS/SCSI\n", - major, minor); - is_scsi = 1; - major_name = "cciss"; - max_part = XLSCSI_MAX_PART; - - } - - partno = minor & (max_part - 1); - - if ( (gd = get_gendisk(device)) == NULL ) - { - rc = register_blkdev(major, major_name, &xlvbd_block_fops); - if ( rc < 0 ) - { - printk(KERN_ALERT "XL VBD: can't get major %d\n", major); - goto out; - } - - if ( is_ide ) - { - blksize_size[major] = xlide_blksize_size; - hardsect_size[major] = xlide_hardsect_size; - max_sectors[major] = xlide_max_sectors; - read_ahead[major] = 8; - } - else if ( is_scsi ) - { - blksize_size[major] = xlscsi_blksize_size; - hardsect_size[major] = xlscsi_hardsect_size; - max_sectors[major] = xlscsi_max_sectors; - read_ahead[major] = 8; - } - else - { - blksize_size[major] = xlvbd_blksize_size; - hardsect_size[major] = xlvbd_hardsect_size; - max_sectors[major] = xlvbd_max_sectors; - read_ahead[major] = 8; - } - - blk_init_queue(BLK_DEFAULT_QUEUE(major), do_blkif_request); - - /* - * Turn off barking 'headactive' mode. We dequeue buffer heads as - * soon as we pass them to the back-end driver. 
- */ - blk_queue_headactive(BLK_DEFAULT_QUEUE(major), 0); - - /* Construct an appropriate gendisk structure. */ - gd = kmalloc(sizeof(struct gendisk), GFP_KERNEL); - gd->major = major; - gd->major_name = major_name; - - gd->max_p = max_part; - if ( is_ide ) - { - gd->minor_shift = XLIDE_PARTN_SHIFT; - gd->nr_real = XLIDE_DEVS_PER_MAJOR; - } - else if ( is_scsi ) - { - gd->minor_shift = XLSCSI_PARTN_SHIFT; - gd->nr_real = XLSCSI_DEVS_PER_MAJOR; - } - else - { - gd->minor_shift = XLVBD_PARTN_SHIFT; - gd->nr_real = XLVBD_DEVS_PER_MAJOR; - } - - /* - ** The sizes[] and part[] arrays hold the sizes and other - ** information about every partition with this 'major' (i.e. - ** every disk sharing the 8 bit prefix * max partns per disk) - */ - gd->sizes = kmalloc(max_part*gd->nr_real*sizeof(int), GFP_KERNEL); - gd->part = kmalloc(max_part*gd->nr_real*sizeof(struct hd_struct), - GFP_KERNEL); - memset(gd->sizes, 0, max_part * gd->nr_real * sizeof(int)); - memset(gd->part, 0, max_part * gd->nr_real - * sizeof(struct hd_struct)); - - - gd->real_devices = kmalloc(gd->nr_real * sizeof(xl_disk_t), - GFP_KERNEL); - memset(gd->real_devices, 0, gd->nr_real * sizeof(xl_disk_t)); - - gd->next = NULL; - gd->fops = &xlvbd_block_fops; - - gd->de_arr = kmalloc(gd->nr_real * sizeof(*gd->de_arr), - GFP_KERNEL); - gd->flags = kmalloc(gd->nr_real * sizeof(*gd->flags), GFP_KERNEL); - - memset(gd->de_arr, 0, gd->nr_real * sizeof(*gd->de_arr)); - memset(gd->flags, 0, gd->nr_real * sizeof(*gd->flags)); - - add_gendisk(gd); - - blk_size[major] = gd->sizes; - } - - if ( xd->info & VDISK_READONLY ) - set_device_ro(device, 1); - - gd->flags[minor >> gd->minor_shift] |= GENHD_FL_XEN; - - /* NB. Linux 2.4 only handles 32-bit sector offsets and capacities. */ - capacity = (unsigned long)xd->capacity; - - if ( partno != 0 ) - { - /* - * If this was previously set up as a real disc we will have set - * up partition-table information. Virtual partitions override - * 'real' partitions, and the two cannot coexist on a device. - */ - if ( !(gd->flags[minor >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) && - (gd->sizes[minor & ~(max_part-1)] != 0) ) - { - /* - * Any non-zero sub-partition entries must be cleaned out before - * installing 'virtual' partition entries. The two types cannot - * coexist, and virtual partitions are favoured. - */ - kdev_t dev = device & ~(max_part-1); - for ( i = max_part - 1; i > 0; i-- ) - { - invalidate_device(dev+i, 1); - gd->part[MINOR(dev+i)].start_sect = 0; - gd->part[MINOR(dev+i)].nr_sects = 0; - gd->sizes[MINOR(dev+i)] = 0; - } - printk(KERN_ALERT - "Virtual partitions found for /dev/%s - ignoring any " - "real partition information we may have found.\n", - disk_name(gd, MINOR(device), buf)); - } - - /* Need to skankily setup 'partition' information */ - gd->part[minor].start_sect = 0; - gd->part[minor].nr_sects = capacity; - gd->sizes[minor] = capacity >>(BLOCK_SIZE_BITS-9); - - gd->flags[minor >> gd->minor_shift] |= GENHD_FL_VIRT_PARTNS; - } - else - { - gd->part[minor].nr_sects = capacity; - gd->sizes[minor] = capacity>>(BLOCK_SIZE_BITS-9); - - /* Some final fix-ups depending on the device type */ - if ( xd->info & VDISK_REMOVABLE ) - { - gd->flags[minor >> gd->minor_shift] |= GENHD_FL_REMOVABLE; - printk(KERN_ALERT - "Skipping partition check on %s /dev/%s\n", - (xd->info & VDISK_CDROM) ? "cdrom" : "removable", - disk_name(gd, MINOR(device), buf)); - } - else - { - /* Only check partitions on real discs (not virtual!). 
*/ - if ( gd->flags[minor>>gd->minor_shift] & GENHD_FL_VIRT_PARTNS ) - { - printk(KERN_ALERT - "Skipping partition check on virtual /dev/%s\n", - disk_name(gd, MINOR(device), buf)); - break; - } - register_disk(gd, device, gd->max_p, &xlvbd_block_fops, capacity); - } - } - - out: - bdput(bd); - return rc; -} - - -/* - * xlvbd_remove_device - remove a device node if possible - * @device: numeric device ID - * - * Updates the gendisk structure and invalidates devices. - * - * This is OK for now but in future, should perhaps consider where this should - * deallocate gendisks / unregister devices. - */ -static int xlvbd_remove_device(int device) -{ - int i, rc = 0, minor = MINOR(device); - struct gendisk *gd; - struct block_device *bd; - xl_disk_t *disk = NULL; - - if ( (bd = bdget(device)) == NULL ) - return -1; - - if ( ((gd = get_gendisk(device)) == NULL) || - ((disk = xldev_to_xldisk(device)) == NULL) ) - BUG(); - - if ( disk->usage != 0 ) - { - printk(KERN_ALERT "VBD removal failed - in use [dev=%x]\n", device); - rc = -1; - goto out; - } - - if ( (minor & (gd->max_p-1)) != 0 ) - { - /* 1: The VBD is mapped to a partition rather than a whole unit. */ - invalidate_device(device, 1); - gd->part[minor].start_sect = 0; - gd->part[minor].nr_sects = 0; - gd->sizes[minor] = 0; - - /* Clear the consists-of-virtual-partitions flag if possible. */ - gd->flags[minor >> gd->minor_shift] &= ~GENHD_FL_VIRT_PARTNS; - for ( i = 1; i < gd->max_p; i++ ) - if ( gd->sizes[(minor & ~(gd->max_p-1)) + i] != 0 ) - gd->flags[minor >> gd->minor_shift] |= GENHD_FL_VIRT_PARTNS; - - /* - * If all virtual partitions are now gone, and a 'whole unit' VBD is - * present, then we can try to grok the unit's real partition table. - */ - if ( !(gd->flags[minor >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) && - (gd->sizes[minor & ~(gd->max_p-1)] != 0) && - !(gd->flags[minor >> gd->minor_shift] & GENHD_FL_REMOVABLE) ) - { - register_disk(gd, - device&~(gd->max_p-1), - gd->max_p, - &xlvbd_block_fops, - gd->part[minor&~(gd->max_p-1)].nr_sects); - } - } - else - { - /* - * 2: The VBD is mapped to an entire 'unit'. Clear all partitions. - * NB. The partition entries are only cleared if there are no VBDs - * mapped to individual partitions on this unit. - */ - i = gd->max_p - 1; /* Default: clear subpartitions as well. */ - if ( gd->flags[minor >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS ) - i = 0; /* 'Virtual' mode: only clear the 'whole unit' entry. */ - while ( i >= 0 ) - { - invalidate_device(device+i, 1); - gd->part[minor+i].start_sect = 0; - gd->part[minor+i].nr_sects = 0; - gd->sizes[minor+i] = 0; - i--; - } - } - - out: - bdput(bd); - return rc; -} - -/* - * xlvbd_update_vbds - reprobes the VBD status and performs updates driver - * state. The VBDs need to be updated in this way when the domain is - * initialised and also each time we receive an XLBLK_UPDATE event. - */ -void xlvbd_update_vbds(void) -{ - int i, j, k, old_nr, new_nr; - vdisk_t *old_info, *new_info, *merged_info; - - old_info = vbd_info; - old_nr = nr_vbds; - - new_info = kmalloc(MAX_VBDS * sizeof(vdisk_t), GFP_KERNEL); - if (!new_info) - return; - - if ( unlikely(new_nr = xlvbd_get_vbd_info(new_info)) < 0 ) - goto out; - - /* - * Final list maximum size is old list + new list. This occurs only when - * old list and new list do not overlap at all, and we cannot yet destroy - * VBDs in the old list because the usage counts are busy. 
- */ - merged_info = kmalloc((old_nr + new_nr) * sizeof(vdisk_t), GFP_KERNEL); - if (!merged_info) - goto out; - - /* @i tracks old list; @j tracks new list; @k tracks merged list. */ - i = j = k = 0; - - while ( (i < old_nr) && (j < new_nr) ) - { - if ( old_info[i].device < new_info[j].device ) - { - if ( xlvbd_remove_device(old_info[i].device) != 0 ) - memcpy(&merged_info[k++], &old_info[i], sizeof(vdisk_t)); - i++; - } - else if ( old_info[i].device > new_info[j].device ) - { - if ( xlvbd_init_device(&new_info[j]) == 0 ) - memcpy(&merged_info[k++], &new_info[j], sizeof(vdisk_t)); - j++; - } - else - { - if ( ((old_info[i].capacity == new_info[j].capacity) && - (old_info[i].info == new_info[j].info)) || - (xlvbd_remove_device(old_info[i].device) != 0) ) - memcpy(&merged_info[k++], &old_info[i], sizeof(vdisk_t)); - else if ( xlvbd_init_device(&new_info[j]) == 0 ) - memcpy(&merged_info[k++], &new_info[j], sizeof(vdisk_t)); - i++; j++; - } - } - - for ( ; i < old_nr; i++ ) - { - if ( xlvbd_remove_device(old_info[i].device) != 0 ) - memcpy(&merged_info[k++], &old_info[i], sizeof(vdisk_t)); - } - - for ( ; j < new_nr; j++ ) - { - if ( xlvbd_init_device(&new_info[j]) == 0 ) - memcpy(&merged_info[k++], &new_info[j], sizeof(vdisk_t)); - } - - vbd_info = merged_info; - nr_vbds = k; - - kfree(old_info); -out: - kfree(new_info); -} - - -/* - * Set up all the linux device goop for the virtual block devices (vbd's) that - * we know about. Note that although from the backend driver's p.o.v. VBDs are - * addressed simply an opaque 16-bit device number, the domain creation tools - * conventionally allocate these numbers to correspond to those used by 'real' - * linux -- this is just for convenience as it means e.g. that the same - * /etc/fstab can be used when booting with or without Xen. - */ -int xlvbd_init(void) -{ - int i; - - /* - * If compiled as a module, we don't support unloading yet. We therefore - * permanently increment the reference count to disallow it. - */ - SET_MODULE_OWNER(&xlvbd_block_fops); - MOD_INC_USE_COUNT; - - /* Initialize the global arrays. 
*/ - for ( i = 0; i < 256; i++ ) - { - xlide_blksize_size[i] = 1024; - xlide_hardsect_size[i] = 512; - xlide_max_sectors[i] = 512; - - xlscsi_blksize_size[i] = 1024; - xlscsi_hardsect_size[i] = 512; - xlscsi_max_sectors[i] = 512; - - xlvbd_blksize_size[i] = 512; - xlvbd_hardsect_size[i] = 512; - xlvbd_max_sectors[i] = 512; - } - - vbd_info = kmalloc(MAX_VBDS * sizeof(vdisk_t), GFP_KERNEL); - if (!vbd_info) - return -ENOMEM; - - nr_vbds = xlvbd_get_vbd_info(vbd_info); - - if ( nr_vbds < 0 ) - { - kfree(vbd_info); - vbd_info = NULL; - nr_vbds = 0; - } - else - { - for ( i = 0; i < nr_vbds; i++ ) - xlvbd_init_device(&vbd_info[i]); - } - - return 0; -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/drivers/console/Makefile --- a/linux-2.4.30-xen-sparse/arch/xen/drivers/console/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,3 +0,0 @@ -O_TARGET := drv.o -obj-$(CONFIG_XEN_CONSOLE) := console.o -include $(TOPDIR)/Rules.make diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/drivers/dom0/Makefile --- a/linux-2.4.30-xen-sparse/arch/xen/drivers/dom0/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,3 +0,0 @@ -O_TARGET := drv.o -obj-y := core.o -include $(TOPDIR)/Rules.make diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/drivers/evtchn/Makefile --- a/linux-2.4.30-xen-sparse/arch/xen/drivers/evtchn/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,3 +0,0 @@ -O_TARGET := drv.o -obj-y := evtchn.o -include $(TOPDIR)/Rules.make diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/drivers/netif/Makefile --- a/linux-2.4.30-xen-sparse/arch/xen/drivers/netif/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,10 +0,0 @@ - -O_TARGET := drv.o - -subdir-$(CONFIG_XEN_NETDEV_FRONTEND) += frontend -obj-$(CONFIG_XEN_NETDEV_FRONTEND) += frontend/drv.o - -subdir-$(CONFIG_XEN_PHYSDEV_ACCESS) += backend -obj-$(CONFIG_XEN_PHYSDEV_ACCESS) += backend/drv.o - -include $(TOPDIR)/Rules.make diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/drivers/netif/backend/Makefile --- a/linux-2.4.30-xen-sparse/arch/xen/drivers/netif/backend/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,4 +0,0 @@ -O_TARGET := drv.o -export-objs := interface.o -obj-y := main.o control.o interface.o -include $(TOPDIR)/Rules.make diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/drivers/netif/frontend/Makefile --- a/linux-2.4.30-xen-sparse/arch/xen/drivers/netif/frontend/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,3 +0,0 @@ -O_TARGET := drv.o -obj-y := main.o -include $(TOPDIR)/Rules.make diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/kernel/Makefile --- a/linux-2.4.30-xen-sparse/arch/xen/kernel/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,20 +0,0 @@ - -.S.o: - $(CC) $(AFLAGS) -traditional -c $< -o $*.o - -all: kernel.o head.o init_task.o - -O_TARGET := kernel.o - -export-objs := i386_ksyms.o gnttab.o skbuff.o ctrl_if.o - -obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \ - ptrace.o ioport.o ldt.o setup.o time.o sys_i386.o \ - i386_ksyms.o i387.o evtchn.o ctrl_if.o pci-dma.o \ - reboot.o fixup.o gnttab.o skbuff.o - -ifdef CONFIG_PCI -obj-y += pci-i386.o pci-pc.o -endif - -include $(TOPDIR)/Rules.make diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/kernel/entry.S --- 
a/linux-2.4.30-xen-sparse/arch/xen/kernel/entry.S Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,779 +0,0 @@ -/* - * linux/arch/i386/entry.S - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -/* - * entry.S contains the system-call and fault low-level handling routines. - * This also contains the timer-interrupt handler, as well as all interrupts - * and faults that can result in a task-switch. - * - * NOTE: This code handles signal-recognition, which happens every time - * after a timer-interrupt and after each system call. - * - * I changed all the .align's to 4 (16 byte alignment), as that's faster - * on a 486. - * - * Stack layout in 'ret_to_user': - * ptrace needs to have all regs on the stack. - * if the order here is changed, it needs to be - * updated in fork.c:copy_process, signal.c:do_signal, - * ptrace.c and ptrace.h - * - * 0(%esp) - %ebx - * 4(%esp) - %ecx - * 8(%esp) - %edx - * C(%esp) - %esi - * 10(%esp) - %edi - * 14(%esp) - %ebp - * 18(%esp) - %eax - * 1C(%esp) - %ds - * 20(%esp) - %es - * 24(%esp) - orig_eax - * 28(%esp) - %eip - * 2C(%esp) - %cs - * 30(%esp) - %eflags - * 34(%esp) - %oldesp - * 38(%esp) - %oldss - * - * "current" is in register %ebx during any slow entries. - */ - -#include <linux/config.h> -#include <linux/sys.h> -#include <linux/linkage.h> -#include <asm/segment.h> -#include <asm/smp.h> - -EBX = 0x00 -ECX = 0x04 -EDX = 0x08 -ESI = 0x0C -EDI = 0x10 -EBP = 0x14 -EAX = 0x18 -DS = 0x1C -ES = 0x20 -ORIG_EAX = 0x24 -EIP = 0x28 -CS = 0x2C -EFLAGS = 0x30 -OLDESP = 0x34 -OLDSS = 0x38 - -CF_MASK = 0x00000001 -TF_MASK = 0x00000100 -IF_MASK = 0x00000200 -DF_MASK = 0x00000400 -NT_MASK = 0x00004000 - -/* Offsets into task_struct. */ -state = 0 -flags = 4 -sigpending = 8 -addr_limit = 12 -exec_domain = 16 -need_resched = 20 -tsk_ptrace = 24 -processor = 52 - -/* Offsets into shared_info_t. */ -#define evtchn_upcall_pending /* 0 */ -#define evtchn_upcall_mask 1 - -ENOSYS = 38 - - -#define SAVE_ALL \ - cld; \ - pushl %es; \ - pushl %ds; \ - pushl %eax; \ - pushl %ebp; \ - pushl %edi; \ - pushl %esi; \ - pushl %edx; \ - pushl %ecx; \ - pushl %ebx; \ - movl $(__KERNEL_DS),%edx; \ - movl %edx,%ds; \ - movl %edx,%es; - -#define RESTORE_ALL \ - popl %ebx; \ - popl %ecx; \ - popl %edx; \ - popl %esi; \ - popl %edi; \ - popl %ebp; \ - popl %eax; \ -1: popl %ds; \ -2: popl %es; \ - addl $4,%esp; \ -3: iret; \ -.section .fixup,"ax"; \ -4: movl $0,(%esp); \ - jmp 1b; \ -5: movl $0,(%esp); \ - jmp 2b; \ -6: pushl %ss; \ - popl %ds; \ - pushl %ss; \ - popl %es; \ - pushl $11; \ - call do_exit; \ -.previous; \ -.section __ex_table,"a";\ - .align 4; \ - .long 1b,4b; \ - .long 2b,5b; \ - .long 3b,6b; \ -.previous - -#define GET_CURRENT(reg) \ - movl $-8192, reg; \ - andl %esp, reg - -ENTRY(lcall7) - pushfl # We get a different stack layout with call - pushl %eax # gates, which has to be cleaned up later.. - SAVE_ALL - movl EIP(%esp),%eax # due to call gates, this is eflags, not eip.. - movl CS(%esp),%edx # this is eip.. - movl EFLAGS(%esp),%ecx # and this is cs.. 
- movl %eax,EFLAGS(%esp) # - andl $~(NT_MASK|TF_MASK|DF_MASK), %eax - pushl %eax - popfl - movl %edx,EIP(%esp) # Now we move them to their "normal" places - movl %ecx,CS(%esp) # - movl %esp,%ebx - pushl %ebx - andl $-8192,%ebx # GET_CURRENT - movl exec_domain(%ebx),%edx # Get the execution domain - movl 4(%edx),%edx # Get the lcall7 handler for the domain - pushl $0x7 - call *%edx - addl $4, %esp - popl %eax - jmp ret_to_user - -ENTRY(lcall27) - pushfl # We get a different stack layout with call - pushl %eax # gates, which has to be cleaned up later.. - SAVE_ALL - movl EIP(%esp),%eax # due to call gates, this is eflags, not eip.. - movl CS(%esp),%edx # this is eip.. - movl EFLAGS(%esp),%ecx # and this is cs.. - movl %eax,EFLAGS(%esp) # - andl $~(NT_MASK|TF_MASK|DF_MASK), %eax - pushl %eax - popfl - movl %edx,EIP(%esp) # Now we move them to their "normal" places - movl %ecx,CS(%esp) # - movl %esp,%ebx - pushl %ebx - andl $-8192,%ebx # GET_CURRENT - movl exec_domain(%ebx),%edx # Get the execution domain - movl 4(%edx),%edx # Get the lcall7 handler for the domain - pushl $0x27 - call *%edx - addl $4, %esp - popl %eax - jmp ret_to_user - -ENTRY(ret_from_fork) - pushl %ebx - call SYMBOL_NAME(schedule_tail) - addl $4, %esp - GET_CURRENT(%ebx) - testb $0x02,tsk_ptrace(%ebx) # PT_TRACESYS - jne tracesys_exit - jmp ret_to_user - -/* - * Return to user mode is not as complex as all this looks, - * but we want the default path for a system call return to - * go as quickly as possible which is why some of this is - * less clear than it otherwise should be. - */ -ENTRY(system_call) - pushl %eax # save orig_eax - SAVE_ALL - GET_CURRENT(%ebx) - testb $0x02,tsk_ptrace(%ebx) # PT_TRACESYS - jne tracesys - cmpl $(NR_syscalls),%eax - jae badsys - call *SYMBOL_NAME(sys_call_table)(,%eax,4) - movl %eax,EAX(%esp) # save the return value -ret_to_user: - movl SYMBOL_NAME(HYPERVISOR_shared_info),%esi - movb $1,evtchn_upcall_mask(%esi) # make tests atomic -ret_to_user_nocli: - cmpl $0,need_resched(%ebx) - jne reschedule - cmpl $0,sigpending(%ebx) - je safesti # ensure need_resched updates are seen -/*signal_return:*/ - movb $0,evtchn_upcall_mask(%esi) # reenable event callbacks - movl %esp,%eax - xorl %edx,%edx - call SYMBOL_NAME(do_signal) - jmp safesti - - ALIGN -restore_all: - RESTORE_ALL - - ALIGN -tracesys: - movl $-ENOSYS,EAX(%esp) - call SYMBOL_NAME(syscall_trace) - movl ORIG_EAX(%esp),%eax - cmpl $(NR_syscalls),%eax - jae tracesys_exit - call *SYMBOL_NAME(sys_call_table)(,%eax,4) - movl %eax,EAX(%esp) # save the return value -tracesys_exit: - call SYMBOL_NAME(syscall_trace) - jmp ret_to_user -badsys: - movl $-ENOSYS,EAX(%esp) - jmp ret_to_user - - ALIGN -ENTRY(ret_from_intr) - GET_CURRENT(%ebx) -ret_from_exception: - movb CS(%esp),%al - testl $2,%eax - jne ret_to_user - jmp restore_all - - ALIGN -reschedule: - movb $0,evtchn_upcall_mask(%esi) # reenable event callbacks - call SYMBOL_NAME(schedule) # test - jmp ret_to_user - -ENTRY(divide_error) - pushl $0 # no error code - pushl $ SYMBOL_NAME(do_divide_error) - ALIGN -error_code: - pushl %ds - pushl %eax - xorl %eax,%eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - decl %eax # eax = -1 - pushl %ecx - pushl %ebx - GET_CURRENT(%ebx) - cld - movl %es,%ecx - movl ORIG_EAX(%esp), %esi # get the error code - movl ES(%esp), %edi # get the function address - movl %eax, ORIG_EAX(%esp) - movl %ecx, ES(%esp) - movl %esp,%edx - pushl %esi # push the error code - pushl %edx # push the pt_regs pointer - movl $(__KERNEL_DS),%edx - movl %edx,%ds - movl %edx,%es - 
call *%edi - addl $8,%esp - jmp ret_from_exception - -# A note on the "critical region" in our callback handler. -# We want to avoid stacking callback handlers due to events occurring -# during handling of the last event. To do this, we keep events disabled -# until we've done all processing. HOWEVER, we must enable events before -# popping the stack frame (can't be done atomically) and so it would still -# be possible to get enough handler activations to overflow the stack. -# Although unlikely, bugs of that kind are hard to track down, so we'd -# like to avoid the possibility. -# So, on entry to the handler we detect whether we interrupted an -# existing activation in its critical region -- if so, we pop the current -# activation and restart the handler using the previous one. -ENTRY(hypervisor_callback) - pushl %eax - SAVE_ALL - GET_CURRENT(%ebx) - movl EIP(%esp),%eax - cmpl $scrit,%eax - jb 11f - cmpl $ecrit,%eax - jb critical_region_fixup -11: push %esp - call evtchn_do_upcall - add $4,%esp - movl SYMBOL_NAME(HYPERVISOR_shared_info),%esi - movb CS(%esp),%cl - test $2,%cl # slow return to ring 2 or 3 - jne ret_to_user_nocli -safesti:movb $0,evtchn_upcall_mask(%esi) # reenable event callbacks -scrit: /**** START OF CRITICAL REGION ****/ - testb $0xFF,evtchn_upcall_pending(%esi) - jnz 14f # process more events if necessary... - RESTORE_ALL -14: movb $1,evtchn_upcall_mask(%esi) - jmp 11b -ecrit: /**** END OF CRITICAL REGION ****/ -# [How we do the fixup]. We want to merge the current stack frame with the -# just-interrupted frame. How we do this depends on where in the critical -# region the interrupted handler was executing, and so how many saved -# registers are in each frame. We do this quickly using the lookup table -# 'critical_fixup_table'. For each byte offset in the critical region, it -# provides the number of bytes which have already been popped from the -# interrupted stack frame. -critical_region_fixup: - addl $critical_fixup_table-scrit,%eax - movzbl (%eax),%eax # %eax contains num bytes popped - mov %esp,%esi - add %eax,%esi # %esi points at end of src region - mov %esp,%edi - add $0x34,%edi # %edi points at end of dst region - mov %eax,%ecx - shr $2,%ecx # convert words to bytes - je 16f # skip loop if nothing to copy -15: subl $4,%esi # pre-decrementing copy loop - subl $4,%edi - movl (%esi),%eax - movl %eax,(%edi) - loop 15b -16: movl %edi,%esp # final %edi is top of merged stack - jmp 11b - -critical_fixup_table: - .byte 0x00,0x00,0x00 # testb $0xFF,(%esi) - .byte 0x00,0x00 # jnz 14f - .byte 0x00 # pop %ebx - .byte 0x04 # pop %ecx - .byte 0x08 # pop %edx - .byte 0x0c # pop %esi - .byte 0x10 # pop %edi - .byte 0x14 # pop %ebp - .byte 0x18 # pop %eax - .byte 0x1c # pop %ds - .byte 0x20 # pop %es - .byte 0x24,0x24,0x24 # add $4,%esp - .byte 0x28 # iret - .byte 0x00,0x00,0x00,0x00 # movb $1,4(%esi) - .byte 0x00,0x00 # jmp 11b - -# Hypervisor uses this for application faults while it executes. 
-ENTRY(failsafe_callback) -1: popl %ds -2: popl %es -3: popl %fs -4: popl %gs -5: iret -.section .fixup,"ax"; \ -6: movl $0,(%esp); \ - jmp 1b; \ -7: movl $0,(%esp); \ - jmp 2b; \ -8: movl $0,(%esp); \ - jmp 3b; \ -9: movl $0,(%esp); \ - jmp 4b; \ -10: pushl %ss; \ - popl %ds; \ - pushl %ss; \ - popl %es; \ - pushl $11; \ - call do_exit; \ -.previous; \ -.section __ex_table,"a";\ - .align 4; \ - .long 1b,6b; \ - .long 2b,7b; \ - .long 3b,8b; \ - .long 4b,9b; \ - .long 5b,10b; \ -.previous - -ENTRY(coprocessor_error) - pushl $0 - pushl $ SYMBOL_NAME(do_coprocessor_error) - jmp error_code - -ENTRY(simd_coprocessor_error) - pushl $0 - pushl $ SYMBOL_NAME(do_simd_coprocessor_error) - jmp error_code - -ENTRY(device_not_available) - pushl $-1 # mark this as an int - SAVE_ALL - GET_CURRENT(%ebx) - call SYMBOL_NAME(math_state_restore) - jmp ret_from_exception - -ENTRY(debug) - pushl $0 - pushl $ SYMBOL_NAME(do_debug) - jmp error_code - -ENTRY(int3) - pushl $0 - pushl $ SYMBOL_NAME(do_int3) - jmp error_code - -ENTRY(overflow) - pushl $0 - pushl $ SYMBOL_NAME(do_overflow) - jmp error_code - -ENTRY(bounds) - pushl $0 - pushl $ SYMBOL_NAME(do_bounds) - jmp error_code - -ENTRY(invalid_op) - pushl $0 - pushl $ SYMBOL_NAME(do_invalid_op) - jmp error_code - -ENTRY(coprocessor_segment_overrun) - pushl $0 - pushl $ SYMBOL_NAME(do_coprocessor_segment_overrun) - jmp error_code - -ENTRY(double_fault) - pushl $ SYMBOL_NAME(do_double_fault) - jmp error_code - -ENTRY(invalid_TSS) - pushl $ SYMBOL_NAME(do_invalid_TSS) - jmp error_code - -ENTRY(segment_not_present) - pushl $ SYMBOL_NAME(do_segment_not_present) - jmp error_code - -ENTRY(stack_segment) - pushl $ SYMBOL_NAME(do_stack_segment) - jmp error_code - -ENTRY(general_protection) - pushl $ SYMBOL_NAME(do_general_protection) - jmp error_code - -ENTRY(alignment_check) - pushl $ SYMBOL_NAME(do_alignment_check) - jmp error_code - -# This handler is special, because it gets an extra value on its stack, -# which is the linear faulting address. 
-#define PAGE_FAULT_STUB(_name1, _name2) \ -ENTRY(_name1) \ - pushl %ds ; \ - pushl %eax ; \ - xorl %eax,%eax ; \ - pushl %ebp ; \ - pushl %edi ; \ - pushl %esi ; \ - pushl %edx ; \ - decl %eax /* eax = -1 */ ; \ - pushl %ecx ; \ - pushl %ebx ; \ - GET_CURRENT(%ebx) ; \ - cld ; \ - movl %es,%ecx ; \ - movl ORIG_EAX(%esp), %esi /* get the error code */ ; \ - movl ES(%esp), %edi /* get the faulting address */ ; \ - movl %eax, ORIG_EAX(%esp) ; \ - movl %ecx, ES(%esp) ; \ - movl %esp,%edx ; \ - pushl %edi /* push the faulting address */ ; \ - pushl %esi /* push the error code */ ; \ - pushl %edx /* push the pt_regs pointer */ ; \ - movl $(__KERNEL_DS),%edx ; \ - movl %edx,%ds ; \ - movl %edx,%es ; \ - call SYMBOL_NAME(_name2) ; \ - addl $12,%esp ; \ - jmp ret_from_exception ; -PAGE_FAULT_STUB(page_fault, do_page_fault) - -ENTRY(machine_check) - pushl $0 - pushl $ SYMBOL_NAME(do_machine_check) - jmp error_code - -ENTRY(fixup_4gb_segment) - pushl $ SYMBOL_NAME(do_fixup_4gb_segment) - jmp error_code - -.data -ENTRY(sys_call_table) - .long SYMBOL_NAME(sys_ni_syscall) /* 0 - old "setup()" system call*/ - .long SYMBOL_NAME(sys_exit) - .long SYMBOL_NAME(sys_fork) - .long SYMBOL_NAME(sys_read) - .long SYMBOL_NAME(sys_write) - .long SYMBOL_NAME(sys_open) /* 5 */ - .long SYMBOL_NAME(sys_close) - .long SYMBOL_NAME(sys_waitpid) - .long SYMBOL_NAME(sys_creat) - .long SYMBOL_NAME(sys_link) - .long SYMBOL_NAME(sys_unlink) /* 10 */ - .long SYMBOL_NAME(sys_execve) - .long SYMBOL_NAME(sys_chdir) - .long SYMBOL_NAME(sys_time) - .long SYMBOL_NAME(sys_mknod) - .long SYMBOL_NAME(sys_chmod) /* 15 */ - .long SYMBOL_NAME(sys_lchown16) - .long SYMBOL_NAME(sys_ni_syscall) /* old break syscall holder */ - .long SYMBOL_NAME(sys_stat) - .long SYMBOL_NAME(sys_lseek) - .long SYMBOL_NAME(sys_getpid) /* 20 */ - .long SYMBOL_NAME(sys_mount) - .long SYMBOL_NAME(sys_oldumount) - .long SYMBOL_NAME(sys_setuid16) - .long SYMBOL_NAME(sys_getuid16) - .long SYMBOL_NAME(sys_stime) /* 25 */ - .long SYMBOL_NAME(sys_ptrace) - .long SYMBOL_NAME(sys_alarm) - .long SYMBOL_NAME(sys_fstat) - .long SYMBOL_NAME(sys_pause) - .long SYMBOL_NAME(sys_utime) /* 30 */ - .long SYMBOL_NAME(sys_ni_syscall) /* old stty syscall holder */ - .long SYMBOL_NAME(sys_ni_syscall) /* old gtty syscall holder */ - .long SYMBOL_NAME(sys_access) - .long SYMBOL_NAME(sys_nice) - .long SYMBOL_NAME(sys_ni_syscall) /* 35 */ /* old ftime syscall holder */ - .long SYMBOL_NAME(sys_sync) - .long SYMBOL_NAME(sys_kill) - .long SYMBOL_NAME(sys_rename) - .long SYMBOL_NAME(sys_mkdir) - .long SYMBOL_NAME(sys_rmdir) /* 40 */ - .long SYMBOL_NAME(sys_dup) - .long SYMBOL_NAME(sys_pipe) - .long SYMBOL_NAME(sys_times) - .long SYMBOL_NAME(sys_ni_syscall) /* old prof syscall holder */ - .long SYMBOL_NAME(sys_brk) /* 45 */ - .long SYMBOL_NAME(sys_setgid16) - .long SYMBOL_NAME(sys_getgid16) - .long SYMBOL_NAME(sys_signal) - .long SYMBOL_NAME(sys_geteuid16) - .long SYMBOL_NAME(sys_getegid16) /* 50 */ - .long SYMBOL_NAME(sys_acct) - .long SYMBOL_NAME(sys_umount) /* recycled never used phys() */ - .long SYMBOL_NAME(sys_ni_syscall) /* old lock syscall holder */ - .long SYMBOL_NAME(sys_ioctl) - .long SYMBOL_NAME(sys_fcntl) /* 55 */ - .long SYMBOL_NAME(sys_ni_syscall) /* old mpx syscall holder */ - .long SYMBOL_NAME(sys_setpgid) - .long SYMBOL_NAME(sys_ni_syscall) /* old ulimit syscall holder */ - .long SYMBOL_NAME(sys_olduname) - .long SYMBOL_NAME(sys_umask) /* 60 */ - .long SYMBOL_NAME(sys_chroot) - .long SYMBOL_NAME(sys_ustat) - .long SYMBOL_NAME(sys_dup2) - .long SYMBOL_NAME(sys_getppid) - .long 
SYMBOL_NAME(sys_getpgrp) /* 65 */ - .long SYMBOL_NAME(sys_setsid) - .long SYMBOL_NAME(sys_sigaction) - .long SYMBOL_NAME(sys_sgetmask) - .long SYMBOL_NAME(sys_ssetmask) - .long SYMBOL_NAME(sys_setreuid16) /* 70 */ - .long SYMBOL_NAME(sys_setregid16) - .long SYMBOL_NAME(sys_sigsuspend) - .long SYMBOL_NAME(sys_sigpending) - .long SYMBOL_NAME(sys_sethostname) - .long SYMBOL_NAME(sys_setrlimit) /* 75 */ - .long SYMBOL_NAME(sys_old_getrlimit) - .long SYMBOL_NAME(sys_getrusage) - .long SYMBOL_NAME(sys_gettimeofday) - .long SYMBOL_NAME(sys_settimeofday) - .long SYMBOL_NAME(sys_getgroups16) /* 80 */ - .long SYMBOL_NAME(sys_setgroups16) - .long SYMBOL_NAME(old_select) - .long SYMBOL_NAME(sys_symlink) - .long SYMBOL_NAME(sys_lstat) - .long SYMBOL_NAME(sys_readlink) /* 85 */ - .long SYMBOL_NAME(sys_uselib) - .long SYMBOL_NAME(sys_swapon) - .long SYMBOL_NAME(sys_reboot) - .long SYMBOL_NAME(old_readdir) - .long SYMBOL_NAME(old_mmap) /* 90 */ - .long SYMBOL_NAME(sys_munmap) - .long SYMBOL_NAME(sys_truncate) - .long SYMBOL_NAME(sys_ftruncate) - .long SYMBOL_NAME(sys_fchmod) - .long SYMBOL_NAME(sys_fchown16) /* 95 */ - .long SYMBOL_NAME(sys_getpriority) - .long SYMBOL_NAME(sys_setpriority) - .long SYMBOL_NAME(sys_ni_syscall) /* old profil syscall holder */ - .long SYMBOL_NAME(sys_statfs) - .long SYMBOL_NAME(sys_fstatfs) /* 100 */ - .long SYMBOL_NAME(sys_ioperm) - .long SYMBOL_NAME(sys_socketcall) - .long SYMBOL_NAME(sys_syslog) - .long SYMBOL_NAME(sys_setitimer) - .long SYMBOL_NAME(sys_getitimer) /* 105 */ - .long SYMBOL_NAME(sys_newstat) - .long SYMBOL_NAME(sys_newlstat) - .long SYMBOL_NAME(sys_newfstat) - .long SYMBOL_NAME(sys_uname) - .long SYMBOL_NAME(sys_iopl) /* 110 */ - .long SYMBOL_NAME(sys_vhangup) - .long SYMBOL_NAME(sys_ni_syscall) /* old "idle" system call */ - .long SYMBOL_NAME(sys_ni_syscall) /* was VM86 */ - .long SYMBOL_NAME(sys_wait4) - .long SYMBOL_NAME(sys_swapoff) /* 115 */ - .long SYMBOL_NAME(sys_sysinfo) - .long SYMBOL_NAME(sys_ipc) - .long SYMBOL_NAME(sys_fsync) - .long SYMBOL_NAME(sys_sigreturn) - .long SYMBOL_NAME(sys_clone) /* 120 */ - .long SYMBOL_NAME(sys_setdomainname) - .long SYMBOL_NAME(sys_newuname) - .long SYMBOL_NAME(sys_modify_ldt) - .long SYMBOL_NAME(sys_adjtimex) - .long SYMBOL_NAME(sys_mprotect) /* 125 */ - .long SYMBOL_NAME(sys_sigprocmask) - .long SYMBOL_NAME(sys_create_module) - .long SYMBOL_NAME(sys_init_module) - .long SYMBOL_NAME(sys_delete_module) - .long SYMBOL_NAME(sys_get_kernel_syms) /* 130 */ - .long SYMBOL_NAME(sys_quotactl) - .long SYMBOL_NAME(sys_getpgid) - .long SYMBOL_NAME(sys_fchdir) - .long SYMBOL_NAME(sys_bdflush) - .long SYMBOL_NAME(sys_sysfs) /* 135 */ - .long SYMBOL_NAME(sys_personality) - .long SYMBOL_NAME(sys_ni_syscall) /* for afs_syscall */ - .long SYMBOL_NAME(sys_setfsuid16) - .long SYMBOL_NAME(sys_setfsgid16) - .long SYMBOL_NAME(sys_llseek) /* 140 */ - .long SYMBOL_NAME(sys_getdents) - .long SYMBOL_NAME(sys_select) - .long SYMBOL_NAME(sys_flock) - .long SYMBOL_NAME(sys_msync) - .long SYMBOL_NAME(sys_readv) /* 145 */ - .long SYMBOL_NAME(sys_writev) - .long SYMBOL_NAME(sys_getsid) - .long SYMBOL_NAME(sys_fdatasync) - .long SYMBOL_NAME(sys_sysctl) - .long SYMBOL_NAME(sys_mlock) /* 150 */ - .long SYMBOL_NAME(sys_munlock) - .long SYMBOL_NAME(sys_mlockall) - .long SYMBOL_NAME(sys_munlockall) - .long SYMBOL_NAME(sys_sched_setparam) - .long SYMBOL_NAME(sys_sched_getparam) /* 155 */ - .long SYMBOL_NAME(sys_sched_setscheduler) - .long SYMBOL_NAME(sys_sched_getscheduler) - .long SYMBOL_NAME(sys_sched_yield) - .long 
SYMBOL_NAME(sys_sched_get_priority_max) - .long SYMBOL_NAME(sys_sched_get_priority_min) /* 160 */ - .long SYMBOL_NAME(sys_sched_rr_get_interval) - .long SYMBOL_NAME(sys_nanosleep) - .long SYMBOL_NAME(sys_mremap) - .long SYMBOL_NAME(sys_setresuid16) - .long SYMBOL_NAME(sys_getresuid16) /* 165 */ - .long SYMBOL_NAME(sys_ni_syscall) /* was VM86 */ - .long SYMBOL_NAME(sys_query_module) - .long SYMBOL_NAME(sys_poll) - .long SYMBOL_NAME(sys_nfsservctl) - .long SYMBOL_NAME(sys_setresgid16) /* 170 */ - .long SYMBOL_NAME(sys_getresgid16) - .long SYMBOL_NAME(sys_prctl) - .long SYMBOL_NAME(sys_rt_sigreturn) - .long SYMBOL_NAME(sys_rt_sigaction) - .long SYMBOL_NAME(sys_rt_sigprocmask) /* 175 */ - .long SYMBOL_NAME(sys_rt_sigpending) - .long SYMBOL_NAME(sys_rt_sigtimedwait) - .long SYMBOL_NAME(sys_rt_sigqueueinfo) - .long SYMBOL_NAME(sys_rt_sigsuspend) - .long SYMBOL_NAME(sys_pread) /* 180 */ - .long SYMBOL_NAME(sys_pwrite) - .long SYMBOL_NAME(sys_chown16) - .long SYMBOL_NAME(sys_getcwd) - .long SYMBOL_NAME(sys_capget) - .long SYMBOL_NAME(sys_capset) /* 185 */ - .long SYMBOL_NAME(sys_sigaltstack) - .long SYMBOL_NAME(sys_sendfile) - .long SYMBOL_NAME(sys_ni_syscall) /* streams1 */ - .long SYMBOL_NAME(sys_ni_syscall) /* streams2 */ - .long SYMBOL_NAME(sys_vfork) /* 190 */ - .long SYMBOL_NAME(sys_getrlimit) - .long SYMBOL_NAME(sys_mmap2) - .long SYMBOL_NAME(sys_truncate64) - .long SYMBOL_NAME(sys_ftruncate64) - .long SYMBOL_NAME(sys_stat64) /* 195 */ - .long SYMBOL_NAME(sys_lstat64) - .long SYMBOL_NAME(sys_fstat64) - .long SYMBOL_NAME(sys_lchown) - .long SYMBOL_NAME(sys_getuid) - .long SYMBOL_NAME(sys_getgid) /* 200 */ - .long SYMBOL_NAME(sys_geteuid) - .long SYMBOL_NAME(sys_getegid) - .long SYMBOL_NAME(sys_setreuid) - .long SYMBOL_NAME(sys_setregid) - .long SYMBOL_NAME(sys_getgroups) /* 205 */ - .long SYMBOL_NAME(sys_setgroups) - .long SYMBOL_NAME(sys_fchown) - .long SYMBOL_NAME(sys_setresuid) - .long SYMBOL_NAME(sys_getresuid) - .long SYMBOL_NAME(sys_setresgid) /* 210 */ - .long SYMBOL_NAME(sys_getresgid) - .long SYMBOL_NAME(sys_chown) - .long SYMBOL_NAME(sys_setuid) - .long SYMBOL_NAME(sys_setgid) - .long SYMBOL_NAME(sys_setfsuid) /* 215 */ - .long SYMBOL_NAME(sys_setfsgid) - .long SYMBOL_NAME(sys_pivot_root) - .long SYMBOL_NAME(sys_mincore) - .long SYMBOL_NAME(sys_madvise) - .long SYMBOL_NAME(sys_getdents64) /* 220 */ - .long SYMBOL_NAME(sys_fcntl64) - .long SYMBOL_NAME(sys_ni_syscall) /* reserved for TUX */ - .long SYMBOL_NAME(sys_ni_syscall) /* Reserved for Security */ - .long SYMBOL_NAME(sys_gettid) - .long SYMBOL_NAME(sys_readahead) /* 225 */ - .long SYMBOL_NAME(sys_setxattr) - .long SYMBOL_NAME(sys_lsetxattr) - .long SYMBOL_NAME(sys_fsetxattr) - .long SYMBOL_NAME(sys_getxattr) - .long SYMBOL_NAME(sys_lgetxattr) /* 230 */ - .long SYMBOL_NAME(sys_fgetxattr) - .long SYMBOL_NAME(sys_listxattr) - .long SYMBOL_NAME(sys_llistxattr) - .long SYMBOL_NAME(sys_flistxattr) - .long SYMBOL_NAME(sys_removexattr) /* 235 */ - .long SYMBOL_NAME(sys_lremovexattr) - .long SYMBOL_NAME(sys_fremovexattr) - .long SYMBOL_NAME(sys_tkill) - .long SYMBOL_NAME(sys_sendfile64) - .long SYMBOL_NAME(sys_ni_syscall) /* 240 reserved for futex */ - .long SYMBOL_NAME(sys_ni_syscall) /* reserved for sched_setaffinity */ - .long SYMBOL_NAME(sys_ni_syscall) /* reserved for sched_getaffinity */ - .long SYMBOL_NAME(sys_ni_syscall) /* sys_set_thread_area */ - .long SYMBOL_NAME(sys_ni_syscall) /* sys_get_thread_area */ - .long SYMBOL_NAME(sys_ni_syscall) /* 245 sys_io_setup */ - .long SYMBOL_NAME(sys_ni_syscall) /* sys_io_destroy */ - .long 
SYMBOL_NAME(sys_ni_syscall) /* sys_io_getevents */ - .long SYMBOL_NAME(sys_ni_syscall) /* sys_io_submit */ - .long SYMBOL_NAME(sys_ni_syscall) /* sys_io_cancel */ - .long SYMBOL_NAME(sys_ni_syscall) /* 250 sys_alloc_hugepages */ - .long SYMBOL_NAME(sys_ni_syscall) /* sys_free_hugepages */ - .long SYMBOL_NAME(sys_ni_syscall) /* sys_exit_group */ - .long SYMBOL_NAME(sys_ni_syscall) /* sys_lookup_dcookie */ - .long SYMBOL_NAME(sys_ni_syscall) /* sys_epoll_create */ - .long SYMBOL_NAME(sys_ni_syscall) /* sys_epoll_ctl 255 */ - .long SYMBOL_NAME(sys_ni_syscall) /* sys_epoll_wait */ - .long SYMBOL_NAME(sys_ni_syscall) /* sys_remap_file_pages */ - .long SYMBOL_NAME(sys_ni_syscall) /* sys_set_tid_address */ - - .rept NR_syscalls-(.-sys_call_table)/4 - .long SYMBOL_NAME(sys_ni_syscall) - .endr diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/kernel/head.S --- a/linux-2.4.30-xen-sparse/arch/xen/kernel/head.S Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,41 +0,0 @@ - -.section __xen_guest - .ascii "GUEST_OS=linux,GUEST_VER=2.4,XEN_VER=3.0,VIRT_BASE=0xC0000000" - .ascii ",LOADER=generic" - .byte 0 - -.text -#include <linux/config.h> -#include <linux/threads.h> -#include <linux/linkage.h> -#include <asm/segment.h> -#include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/desc.h> - -ENTRY(stext) -ENTRY(_stext) - cld - lss stack_start,%esp - /* Copy the necessary stuff from xen_start_info structure. */ - mov $SYMBOL_NAME(xen_start_info_union),%edi - mov $128,%ecx - rep movsl - jmp SYMBOL_NAME(start_kernel) - -ENTRY(stack_start) - .long SYMBOL_NAME(init_task_union)+8192, __KERNEL_DS - -.org 0x1000 -ENTRY(empty_zero_page) - -.org 0x2000 -ENTRY(default_ldt) - -.org 0x3000 -ENTRY(cpu0_pte_quicklist) - -.org 0x3400 -ENTRY(cpu0_pgd_quicklist) - -.org 0x3800 diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/kernel/i386_ksyms.c --- a/linux-2.4.30-xen-sparse/arch/xen/kernel/i386_ksyms.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,180 +0,0 @@ -#include <linux/config.h> -#include <linux/module.h> -#include <linux/smp.h> -#include <linux/user.h> -#include <linux/elfcore.h> -#include <linux/mca.h> -#include <linux/sched.h> -#include <linux/in6.h> -#include <linux/interrupt.h> -#include <linux/smp_lock.h> -#include <linux/pm.h> -#include <linux/pci.h> -#include <linux/apm_bios.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/tty.h> - -#include <asm/semaphore.h> -#include <asm/processor.h> -#include <asm/i387.h> -#include <asm/uaccess.h> -#include <asm/checksum.h> -#include <asm/io.h> -#include <asm/hardirq.h> -#include <asm/delay.h> -#include <asm/irq.h> -#include <asm/mmx.h> -#include <asm/desc.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> - -extern void dump_thread(struct pt_regs *, struct user *); -extern spinlock_t rtc_lock; - -#if defined(CONFIG_APMXXX) || defined(CONFIG_APM_MODULEXXX) -extern void machine_real_restart(unsigned char *, int); -EXPORT_SYMBOL(machine_real_restart); -extern void default_idle(void); -EXPORT_SYMBOL(default_idle); -#endif - -#ifdef CONFIG_SMP -extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); -extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); -#endif - -#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE) -extern struct drive_info_struct drive_info; -EXPORT_SYMBOL(drive_info); -#endif - -// XXX extern unsigned long get_cmos_time(void); - -/* platform dependent 
support */ -EXPORT_SYMBOL(boot_cpu_data); -EXPORT_SYMBOL(dump_thread); -EXPORT_SYMBOL(dump_fpu); -EXPORT_SYMBOL(dump_extended_fpu); -EXPORT_SYMBOL(__ioremap); -EXPORT_SYMBOL(iounmap); -EXPORT_SYMBOL(enable_irq); -EXPORT_SYMBOL(disable_irq); -EXPORT_SYMBOL(disable_irq_nosync); -EXPORT_SYMBOL(probe_irq_mask); -EXPORT_SYMBOL(kernel_thread); -EXPORT_SYMBOL(pm_idle); -EXPORT_SYMBOL(pm_power_off); -EXPORT_SYMBOL(apm_info); -//EXPORT_SYMBOL(gdt); -EXPORT_SYMBOL(empty_zero_page); -EXPORT_SYMBOL(phys_to_machine_mapping); - - -#ifdef CONFIG_DEBUG_IOVIRT -EXPORT_SYMBOL(__io_virt_debug); -#endif - -EXPORT_SYMBOL_NOVERS(__down_failed); -EXPORT_SYMBOL_NOVERS(__down_failed_interruptible); -EXPORT_SYMBOL_NOVERS(__down_failed_trylock); -EXPORT_SYMBOL_NOVERS(__up_wakeup); -/* Networking helper routines. */ -EXPORT_SYMBOL(csum_partial_copy_generic); -/* Delay loops */ -EXPORT_SYMBOL(__ndelay); -EXPORT_SYMBOL(__udelay); -EXPORT_SYMBOL(__delay); -EXPORT_SYMBOL(__const_udelay); - -EXPORT_SYMBOL_NOVERS(__get_user_1); -EXPORT_SYMBOL_NOVERS(__get_user_2); -EXPORT_SYMBOL_NOVERS(__get_user_4); - -EXPORT_SYMBOL(strtok); -EXPORT_SYMBOL(strpbrk); -EXPORT_SYMBOL(strstr); - -EXPORT_SYMBOL(strncpy_from_user); -EXPORT_SYMBOL(__strncpy_from_user); -EXPORT_SYMBOL(clear_user); -EXPORT_SYMBOL(__clear_user); -EXPORT_SYMBOL(__generic_copy_from_user); -EXPORT_SYMBOL(__generic_copy_to_user); -EXPORT_SYMBOL(strnlen_user); - - -EXPORT_SYMBOL(pci_alloc_consistent); -EXPORT_SYMBOL(pci_free_consistent); - -#ifdef CONFIG_PCI -EXPORT_SYMBOL(pcibios_penalize_isa_irq); -EXPORT_SYMBOL(pci_mem_start); -#endif - - -#ifdef CONFIG_X86_USE_3DNOW -EXPORT_SYMBOL(_mmx_memcpy); -EXPORT_SYMBOL(mmx_clear_page); -EXPORT_SYMBOL(mmx_copy_page); -#endif - -#ifdef CONFIG_SMP -EXPORT_SYMBOL(cpu_data); -EXPORT_SYMBOL(kernel_flag_cacheline); -EXPORT_SYMBOL(smp_num_cpus); -EXPORT_SYMBOL(cpu_online_map); -EXPORT_SYMBOL_NOVERS(__write_lock_failed); -EXPORT_SYMBOL_NOVERS(__read_lock_failed); - -/* Global SMP irq stuff */ -EXPORT_SYMBOL(synchronize_irq); -EXPORT_SYMBOL(global_irq_holder); -EXPORT_SYMBOL(__global_cli); -EXPORT_SYMBOL(__global_sti); -EXPORT_SYMBOL(__global_save_flags); -EXPORT_SYMBOL(__global_restore_flags); -EXPORT_SYMBOL(smp_call_function); - -/* TLB flushing */ -EXPORT_SYMBOL(flush_tlb_page); - -/* HT support */ -EXPORT_SYMBOL(smp_num_siblings); -EXPORT_SYMBOL(cpu_sibling_map); -#endif - -#ifdef CONFIG_X86_IO_APIC -EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); -#endif - -#ifdef CONFIG_VT -EXPORT_SYMBOL(screen_info); -#endif - -EXPORT_SYMBOL(get_wchan); - -EXPORT_SYMBOL(rtc_lock); - -#undef memcpy -#undef memset -extern void * memset(void *,int,__kernel_size_t); -extern void * memcpy(void *,const void *,__kernel_size_t); -EXPORT_SYMBOL_NOVERS(memcpy); -EXPORT_SYMBOL_NOVERS(memset); - -#ifdef CONFIG_HAVE_DEC_LOCK -EXPORT_SYMBOL(atomic_dec_and_lock); -#endif - -#ifdef CONFIG_MULTIQUAD -EXPORT_SYMBOL(xquad_portio); -#endif - -#include <asm/xen_proc.h> -EXPORT_SYMBOL(create_xen_proc_entry); -EXPORT_SYMBOL(remove_xen_proc_entry); - -EXPORT_SYMBOL(evtchn_do_upcall); -EXPORT_SYMBOL(force_evtchn_callback); -EXPORT_SYMBOL(HYPERVISOR_shared_info); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/kernel/irq.c --- a/linux-2.4.30-xen-sparse/arch/xen/kernel/irq.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,1242 +0,0 @@ -/* - * linux/arch/i386/kernel/irq.c - * - * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar - * - * This file contains the code used by various IRQ handling routines: - * asking for different 
IRQ's should be done through these routines - * instead of just grabbing them. Thus setups with different IRQ numbers - * shouldn't result in any weird surprises, and installing new handlers - * should be easier. - */ - -/* - * (mostly architecture independent, will move to kernel/irq.c in 2.5.) - * - * IRQs are in fact implemented a bit like signal handlers for the kernel. - * Naturally it's not a 1:1 relation, but there are similarities. - */ - -#include <linux/config.h> -#include <linux/ptrace.h> -#include <linux/errno.h> -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/ioport.h> -#include <linux/interrupt.h> -#include <linux/timex.h> -#include <linux/slab.h> -#include <linux/random.h> -#include <linux/smp_lock.h> -#include <linux/init.h> -#include <linux/kernel_stat.h> -#include <linux/irq.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> - -#include <asm/atomic.h> -#include <asm/io.h> -#include <asm/smp.h> -#include <asm/system.h> -#include <asm/bitops.h> -#include <asm/uaccess.h> -#include <asm/pgalloc.h> -#include <asm/delay.h> -#include <asm/desc.h> -#include <asm/irq.h> - - - -/* - * Linux has a controller-independent x86 interrupt architecture. - * every controller has a 'controller-template', that is used - * by the main code to do the right thing. Each driver-visible - * interrupt source is transparently wired to the appropriate - * controller. Thus drivers need not be aware of the - * interrupt-controller. - * - * Various interrupt controllers we handle: 8259 PIC, SMP IO-APIC, - * PIIX4's internal 8259 PIC and SGI's Visual Workstation Cobalt (IO-)APIC. - * (IO-APICs assumed to be messaging to Pentium local-APICs) - * - * the code is designed to be easily extended with new/different - * interrupt controllers, without having to do assembly magic. - */ - -/* - * Controller mappings for all interrupt sources: - */ -irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = - { [0 ... NR_IRQS-1] = { 0, &no_irq_type, NULL, 0, SPIN_LOCK_UNLOCKED}}; - -static void register_irq_proc (unsigned int irq); - -/* - * Special irq handlers. - */ - -void no_action(int cpl, void *dev_id, struct pt_regs *regs) { } - -/* - * Generic no controller code - */ - -static void enable_none(unsigned int irq) { } -static unsigned int startup_none(unsigned int irq) { return 0; } -static void disable_none(unsigned int irq) { } -static void ack_none(unsigned int irq) -{ -/* - * 'what should we do if we get a hw irq event on an illegal vector'. - * each architecture has to answer this themselves, it doesn't deserve - * a generic callback, I think. - */ -#if CONFIG_X86 - printk("unexpected IRQ trap at vector %02x\n", irq); -#ifdef CONFIG_X86_LOCAL_APIC - /* - * Currently unexpected vectors happen only on SMP and APIC. - * We _must_ ack these because every local APIC has only N - * irq slots per priority level, and a 'hanging, unacked' IRQ - * holds up an irq slot - in excessive cases (when multiple - * unexpected vectors occur) that might lock up the APIC - * completely. 
- */ - ack_APIC_irq(); -#endif -#endif -} - -/* startup is the same as "enable", shutdown is same as "disable" */ -#define shutdown_none disable_none -#define end_none enable_none - -struct hw_interrupt_type no_irq_type = { - "none", - startup_none, - shutdown_none, - enable_none, - disable_none, - ack_none, - end_none -}; - -atomic_t irq_err_count; -#ifdef CONFIG_X86_IO_APIC -#ifdef APIC_MISMATCH_DEBUG -atomic_t irq_mis_count; -#endif -#endif - -/* - * Generic, controller-independent functions: - */ - -int show_interrupts(struct seq_file *p, void *v) -{ - int i, j; - struct irqaction * action; - - seq_printf(p, " "); - for (j=0; j<smp_num_cpus; j++) - seq_printf(p, "CPU%d ",j); - seq_putc(p,'\n'); - - for (i = 0 ; i < NR_IRQS ; i++) { - action = irq_desc[i].action; - if (!action) - continue; - seq_printf(p, "%3d: ",i); -#ifndef CONFIG_SMP - seq_printf(p, "%10u ", kstat_irqs(i)); -#else - for (j = 0; j < smp_num_cpus; j++) - seq_printf(p, "%10u ", - kstat.irqs[cpu_logical_map(j)][i]); -#endif - seq_printf(p, " %14s", irq_desc[i].handler->typename); - seq_printf(p, " %s", action->name); - - for (action=action->next; action; action = action->next) - seq_printf(p, ", %s", action->name); - seq_putc(p,'\n'); - } - seq_printf(p, "NMI: "); - for (j = 0; j < smp_num_cpus; j++) - seq_printf(p, "%10u ", - nmi_count(cpu_logical_map(j))); - seq_printf(p, "\n"); -#if CONFIG_X86_LOCAL_APIC - seq_printf(p, "LOC: "); - for (j = 0; j < smp_num_cpus; j++) - seq_printf(p, "%10u ", - apic_timer_irqs[cpu_logical_map(j)]); - seq_printf(p, "\n"); -#endif - seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); -#ifdef CONFIG_X86_IO_APIC -#ifdef APIC_MISMATCH_DEBUG - seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); -#endif -#endif - - return 0; -} - - -/* - * Global interrupt locks for SMP. Allow interrupts to come in on any - * CPU, yet make cli/sti act globally to protect critical regions.. - */ - -#ifdef CONFIG_SMP -unsigned char global_irq_holder = NO_PROC_ID; -unsigned volatile long global_irq_lock; /* pedantic: long for set_bit --RR */ - -extern void show_stack(unsigned long* esp); - -static void show(char * str) -{ - int i; - int cpu = smp_processor_id(); - - printk("\n%s, CPU %d:\n", str, cpu); - printk("irq: %d [",irqs_running()); - for(i=0;i < smp_num_cpus;i++) - printk(" %d",local_irq_count(i)); - printk(" ]\nbh: %d [",spin_is_locked(&global_bh_lock) ? 1 : 0); - for(i=0;i < smp_num_cpus;i++) - printk(" %d",local_bh_count(i)); - - printk(" ]\nStack dumps:"); - for(i = 0; i < smp_num_cpus; i++) { - unsigned long esp; - if (i == cpu) - continue; - printk("\nCPU %d:",i); - esp = init_tss[i].esp0; - if (!esp) { - /* tss->esp0 is set to NULL in cpu_init(), - * it's initialized when the cpu returns to user - * space. -- manfreds - */ - printk(" <unknown> "); - continue; - } - esp &= ~(THREAD_SIZE-1); - esp += sizeof(struct task_struct); - show_stack((void*)esp); - } - printk("\nCPU %d:",cpu); - show_stack(NULL); - printk("\n"); -} - -#define MAXCOUNT 100000000 - -/* - * I had a lockup scenario where a tight loop doing - * spin_unlock()/spin_lock() on CPU#1 was racing with - * spin_lock() on CPU#0. CPU#0 should have noticed spin_unlock(), but - * apparently the spin_unlock() information did not make it - * through to CPU#0 ... nasty, is this by design, do we have to limit - * 'memory update oscillation frequency' artificially like here? 
- * - * Such 'high frequency update' races can be avoided by careful design, but - * some of our major constructs like spinlocks use similar techniques, - * it would be nice to clarify this issue. Set this define to 0 if you - * want to check whether your system freezes. I suspect the delay done - * by SYNC_OTHER_CORES() is in correlation with 'snooping latency', but - * I thought that such things are guaranteed by design, since we use - * the 'LOCK' prefix. - */ -#define SUSPECTED_CPU_OR_CHIPSET_BUG_WORKAROUND 0 - -#if SUSPECTED_CPU_OR_CHIPSET_BUG_WORKAROUND -# define SYNC_OTHER_CORES(x) udelay(x+1) -#else -/* - * We have to allow irqs to arrive between __sti and __cli - */ -# define SYNC_OTHER_CORES(x) __asm__ __volatile__ ("nop") -#endif - -static inline void wait_on_irq(int cpu) -{ - int count = MAXCOUNT; - - for (;;) { - - /* - * Wait until all interrupts are gone. Wait - * for bottom half handlers unless we're - * already executing in one.. - */ - if (!irqs_running()) - if (local_bh_count(cpu) || !spin_is_locked(&global_bh_lock)) - break; - - /* Duh, we have to loop. Release the lock to avoid deadlocks */ - clear_bit(0,&global_irq_lock); - - for (;;) { - if (!--count) { - show("wait_on_irq"); - count = ~0; - } - __sti(); - SYNC_OTHER_CORES(cpu); - __cli(); - if (irqs_running()) - continue; - if (global_irq_lock) - continue; - if (!local_bh_count(cpu) && spin_is_locked(&global_bh_lock)) - continue; - if (!test_and_set_bit(0,&global_irq_lock)) - break; - } - } -} - -/* - * This is called when we want to synchronize with - * interrupts. We may for example tell a device to - * stop sending interrupts: but to make sure there - * are no interrupts that are executing on another - * CPU we need to call this function. - */ -void synchronize_irq(void) -{ - if (irqs_running()) { - /* Stupid approach */ - cli(); - sti(); - } -} - -static inline void get_irqlock(int cpu) -{ - if (test_and_set_bit(0,&global_irq_lock)) { - /* do we already hold the lock? */ - if ((unsigned char) cpu == global_irq_holder) - return; - /* Uhhuh.. Somebody else got it. Wait.. */ - do { - do { - rep_nop(); - } while (test_bit(0,&global_irq_lock)); - } while (test_and_set_bit(0,&global_irq_lock)); - } - /* - * We also have to make sure that nobody else is running - * in an interrupt context. - */ - wait_on_irq(cpu); - - /* - * Ok, finally.. - */ - global_irq_holder = cpu; -} - -/* - * A global "cli()" while in an interrupt context - * turns into just a local cli(). Interrupts - * should use spinlocks for the (very unlikely) - * case that they ever want to protect against - * each other. - * - * If we already have local interrupts disabled, - * this will not turn a local disable into a - * global one (problems with spinlocks: this makes - * save_flags+cli+sti usable inside a spinlock). 
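For context, the discipline these primitives preserve is the classic 2.4-era driver critical section; a minimal sketch of the usage pattern (the protected data is illustrative):

    unsigned long flags;

    save_flags(flags);    /* records irq state; encoded 0-3 by __global_save_flags() */
    cli();                /* global cli in process context, local cli in irq context */
    /* ... touch data shared with interrupt handlers ... */
    restore_flags(flags); /* dispatched through __global_restore_flags() */
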
- */ -void __global_cli(void) -{ - unsigned int flags; - - __save_flags(flags); - if (!flags) { - int cpu = smp_processor_id(); - __cli(); - if (!local_irq_count(cpu)) - get_irqlock(cpu); - } -} - -void __global_sti(void) -{ - int cpu = smp_processor_id(); - - if (!local_irq_count(cpu)) - release_irqlock(cpu); - __sti(); -} - -/* - * SMP flags value to restore to: - * 0 - global cli - * 1 - global sti - * 2 - local cli - * 3 - local sti - */ -unsigned long __global_save_flags(void) -{ - int retval; - int local_enabled; - unsigned long flags; - int cpu = smp_processor_id(); - - __save_flags(flags); - local_enabled = !flags; - /* default to local */ - retval = 2 + local_enabled; - - /* check for global flags if we're not in an interrupt */ - if (!local_irq_count(cpu)) { - if (local_enabled) - retval = 1; - if (global_irq_holder == cpu) - retval = 0; - } - return retval; -} - -void __global_restore_flags(unsigned long flags) -{ - switch (flags) { - case 0: - __global_cli(); - break; - case 1: - __global_sti(); - break; - case 2: - __cli(); - break; - case 3: - __sti(); - break; - default: - printk("global_restore_flags: %08lx (%08lx)\n", - flags, (&flags)[-1]); - } -} - -#endif - -/* - * This should really return information about whether - * we should do bottom half handling etc. Right now we - * end up _always_ checking the bottom half, which is a - * waste of time and is not what some drivers would - * prefer. - */ -int handle_IRQ_event(unsigned int irq, struct pt_regs * regs, struct irqaction * action) -{ - int status; - int cpu = smp_processor_id(); - - irq_enter(cpu, irq); - - status = 1; /* Force the "do bottom halves" bit */ - - if (!(action->flags & SA_INTERRUPT)) - __sti(); - - do { - status |= action->flags; - action->handler(irq, action->dev_id, regs); - action = action->next; - } while (action); - if (status & SA_SAMPLE_RANDOM) - add_interrupt_randomness(irq); - __cli(); - - irq_exit(cpu, irq); - - return status; -} - -/* - * Generic enable/disable code: this just calls - * down into the PIC-specific version for the actual - * hardware disable after having gotten the irq - * controller lock. - */ - -/** - * disable_irq_nosync - disable an irq without waiting - * @irq: Interrupt to disable - * - * Disable the selected interrupt line. Disables and Enables are - * nested. - * Unlike disable_irq(), this function does not ensure existing - * instances of the IRQ handler have completed before returning. - * - * This function may be called from IRQ context. - */ - -inline void disable_irq_nosync(unsigned int irq) -{ - irq_desc_t *desc = irq_desc + irq; - unsigned long flags; - - spin_lock_irqsave(&desc->lock, flags); - if (!desc->depth++) { - desc->status |= IRQ_DISABLED; - desc->handler->disable(irq); - } - spin_unlock_irqrestore(&desc->lock, flags); -} - -/** - * disable_irq - disable an irq and wait for completion - * @irq: Interrupt to disable - * - * Disable the selected interrupt line. Enables and Disables are - * nested. - * This function waits for any pending IRQ handlers for this interrupt - * to complete before returning. If you use this function while - * holding a resource the IRQ handler may need you will deadlock. - * - * This function may be called - with care - from IRQ context. 
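Since disables nest through desc->depth, calls must balance; an illustrative sequence under the semantics defined here:

    disable_irq(irq);        /* depth 0 -> 1: line masked, waits for running handlers */
    disable_irq_nosync(irq); /* depth 1 -> 2: already masked, returns at once */
    enable_irq(irq);         /* depth 2 -> 1: line stays masked */
    enable_irq(irq);         /* depth 1 -> 0: unmasked; a pending IRQ is replayed */
    enable_irq(irq);         /* depth already 0: logs "enable_irq(n) unbalanced" */
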
- */ - -void disable_irq(unsigned int irq) -{ - disable_irq_nosync(irq); - - if (!local_irq_count(smp_processor_id())) { - do { - barrier(); - cpu_relax(); - } while (irq_desc[irq].status & IRQ_INPROGRESS); - } -} - -/** - * enable_irq - enable handling of an irq - * @irq: Interrupt to enable - * - * Undoes the effect of one call to disable_irq(). If this - * matches the last disable, processing of interrupts on this - * IRQ line is re-enabled. - * - * This function may be called from IRQ context. - */ - -void enable_irq(unsigned int irq) -{ - irq_desc_t *desc = irq_desc + irq; - unsigned long flags; - - spin_lock_irqsave(&desc->lock, flags); - switch (desc->depth) { - case 1: { - unsigned int status = desc->status & ~IRQ_DISABLED; - desc->status = status; - if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { - desc->status = status | IRQ_REPLAY; - hw_resend_irq(desc->handler,irq); - } - desc->handler->enable(irq); - /* fall-through */ - } - default: - desc->depth--; - break; - case 0: - printk("enable_irq(%u) unbalanced from %p\n", irq, - __builtin_return_address(0)); - } - spin_unlock_irqrestore(&desc->lock, flags); -} - -/* - * do_IRQ handles all normal device IRQ's (the special - * SMP cross-CPU interrupts have their own specific - * handlers). - */ -asmlinkage unsigned int do_IRQ(struct pt_regs *regs) -{ - /* - * We ack quickly, we don't want the irq controller - * thinking we're snobs just because some other CPU has - * disabled global interrupts (we have already done the - * INT_ACK cycles, it's too late to try to pretend to the - * controller that we aren't taking the interrupt). - * - * 0 return value means that this irq is already being - * handled by some other CPU. (or is disabled) - */ - int irq = regs->orig_eax & 0xff; /* high bits used in ret_from_ code */ - int cpu = smp_processor_id(); - irq_desc_t *desc = irq_desc + irq; - struct irqaction * action; - unsigned int status; -#ifdef CONFIG_DEBUG_STACKOVERFLOW - long esp; - - /* Debugging check for stack overflow: is there less than 1KB free? */ - __asm__ __volatile__("andl %%esp,%0" : "=r" (esp) : "0" (8191)); - if (unlikely(esp < (sizeof(struct task_struct) + 1024))) { - extern void show_stack(unsigned long *); - - printk("do_IRQ: stack overflow: %ld\n", - esp - sizeof(struct task_struct)); - __asm__ __volatile__("movl %%esp,%0" : "=r" (esp)); - show_stack((void *)esp); - } -#endif - - kstat.irqs[cpu][irq]++; - spin_lock(&desc->lock); - desc->handler->ack(irq); - /* - REPLAY is when Linux resends an IRQ that was dropped earlier - WAITING is used by probe to mark irqs that are being tested - */ - status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING); - status |= IRQ_PENDING; /* we _want_ to handle it */ - - /* - * If the IRQ is disabled for whatever reason, we cannot - * use the action we have. - */ - action = NULL; - if (!(status & (IRQ_DISABLED | IRQ_INPROGRESS))) { - action = desc->action; - status &= ~IRQ_PENDING; /* we commit to handling */ - status |= IRQ_INPROGRESS; /* we are handling it */ - } - desc->status = status; - - /* - * If there is no IRQ handler or it was disabled, exit early. - Since we set PENDING, if another processor is handling - a different instance of this same irq, the other processor - will take care of it. - */ - if (!action) - goto out; - - /* - * Edge triggered interrupts need to remember - * pending events. - * This applies to any hw interrupts that allow a second - * instance of the same irq to arrive while we are in do_IRQ - * or in the handler. 
But the code here only handles the _second_ - * instance of the irq, not the third or fourth. So it is mostly - * useful for irq hardware that does not mask cleanly in an - * SMP environment. - */ - for (;;) { - spin_unlock(&desc->lock); - handle_IRQ_event(irq, regs, action); - spin_lock(&desc->lock); - - if (!(desc->status & IRQ_PENDING)) - break; - desc->status &= ~IRQ_PENDING; - } - desc->status &= ~IRQ_INPROGRESS; -out: - /* - * The ->end() handler has to deal with interrupts which got - * disabled while the handler was running. - */ - desc->handler->end(irq); - spin_unlock(&desc->lock); - - if (softirq_pending(cpu)) - do_softirq(); - return 1; -} - -/** - * request_irq - allocate an interrupt line - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs - * @irqflags: Interrupt type flags - * @devname: An ascii name for the claiming device - * @dev_id: A cookie passed back to the handler function - * - * This call allocates interrupt resources and enables the - * interrupt line and IRQ handling. From the point this - * call is made your handler function may be invoked. Since - * your handler function must clear any interrupt the board - * raises, you must take care both to initialise your hardware - * and to set up the interrupt handler in the right order. - * - * Dev_id must be globally unique. Normally the address of the - * device data structure is used as the cookie. Since the handler - * receives this value it makes sense to use it. - * - * If your interrupt is shared you must pass a non NULL dev_id - * as this is required when freeing the interrupt. - * - * Flags: - * - * SA_SHIRQ Interrupt is shared - * - * SA_INTERRUPT Disable local interrupts while processing - * - * SA_SAMPLE_RANDOM The interrupt can be used for entropy - * - */ - -int request_irq(unsigned int irq, - void (*handler)(int, void *, struct pt_regs *), - unsigned long irqflags, - const char * devname, - void *dev_id) -{ - int retval; - struct irqaction * action; - -#if 1 - /* - * Sanity-check: shared interrupts should REALLY pass in - * a real dev-ID, otherwise we'll have trouble later trying - * to figure out which interrupt is which (messes up the - * interrupt freeing logic etc). - */ - if (irqflags & SA_SHIRQ) { - if (!dev_id) - printk("Bad boy: %s (at 0x%x) called us without a dev_id!\n", devname, (&irq)[-1]); - } -#endif - - if (irq >= NR_IRQS) - return -EINVAL; - if (!handler) - return -EINVAL; - - action = (struct irqaction *) - kmalloc(sizeof(struct irqaction), GFP_KERNEL); - if (!action) - return -ENOMEM; - - action->handler = handler; - action->flags = irqflags; - action->mask = 0; - action->name = devname; - action->next = NULL; - action->dev_id = dev_id; - - retval = setup_irq(irq, action); - if (retval) - kfree(action); - return retval; -} - -/* - * Internal function to unregister an irqaction - typically used to - * deallocate special interrupts that are part of the architecture. 
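Typical driver usage of the request_irq()/free_irq() interface specified above, sketched with hypothetical mydev names:

    static void mydev_interrupt(int irq, void *dev_id, struct pt_regs *regs)
    {
        struct mydev *dev = dev_id;  /* the cookie handed to request_irq() */
        /* acknowledge the board, then process the event */
    }

    if (request_irq(dev->irq, mydev_interrupt, SA_SHIRQ, "mydev", dev))
        return -EBUSY;               /* line unavailable or arguments invalid */
    /* ... later, at teardown ... */
    free_irq(dev->irq, dev);         /* the same dev_id selects this handler */
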
- */ -int teardown_irq(unsigned int irq, struct irqaction * old) -{ - irq_desc_t *desc; - struct irqaction **p; - unsigned long flags; - - if (irq >= NR_IRQS) - return -ENOENT; - - desc = irq_desc + irq; - spin_lock_irqsave(&desc->lock,flags); - p = &desc->action; - for (;;) { - struct irqaction * action = *p; - if (action) { - struct irqaction **pp = p; - p = &action->next; - if (action != old) - continue; - - /* Found it - now remove it from the list of entries */ - *pp = action->next; - if (!desc->action) { - desc->status |= IRQ_DISABLED; - desc->handler->shutdown(irq); - } - spin_unlock_irqrestore(&desc->lock,flags); - -#ifdef CONFIG_SMP - /* Wait to make sure it's not being used on another CPU */ - while (desc->status & IRQ_INPROGRESS) { - barrier(); - cpu_relax(); - } -#endif - return 0; - } - printk("Trying to free free IRQ%d\n",irq); - spin_unlock_irqrestore(&desc->lock,flags); - return -ENOENT; - } -} - -/** - * free_irq - free an interrupt - * @irq: Interrupt line to free - * @dev_id: Device identity to free - * - * Remove an interrupt handler. The handler is removed and if the - * interrupt line is no longer in use by any driver it is disabled. - * On a shared IRQ the caller must ensure the interrupt is disabled - * on the card it drives before calling this function. The function - * does not return until any executing interrupts for this IRQ - * have completed. - * - * This function may be called from interrupt context. - * - * Bugs: Attempting to free an irq in a handler for the same irq hangs - * the machine. - */ - -void free_irq(unsigned int irq, void *dev_id) -{ - irq_desc_t *desc; - struct irqaction *action; - unsigned long flags; - - if (irq >= NR_IRQS) - return; - - desc = irq_desc + irq; - spin_lock_irqsave(&desc->lock,flags); - for (action = desc->action; action != NULL; action = action->next) { - if (action->dev_id != dev_id) - continue; - - spin_unlock_irqrestore(&desc->lock,flags); - - if (teardown_irq(irq, action) == 0) - kfree(action); - return; - } - printk("Trying to free free IRQ%d\n",irq); - spin_unlock_irqrestore(&desc->lock,flags); - return; -} - -/* - * IRQ autodetection code.. - * - * This depends on the fact that any interrupt that - * comes in on to an unassigned handler will get stuck - * with "IRQ_WAITING" cleared and the interrupt - * disabled. - */ - -static DECLARE_MUTEX(probe_sem); - -/** - * probe_irq_on - begin an interrupt autodetect - * - * Commence probing for an interrupt. The interrupts are scanned - * and a mask of potential interrupt lines is returned. - * - */ - -unsigned long probe_irq_on(void) -{ - unsigned int i; - irq_desc_t *desc; - unsigned long val; - unsigned long delay; - - down(&probe_sem); - /* - * something may have generated an irq long ago and we want to - * flush such a longstanding irq before considering it as spurious. - */ - for (i = NR_PIRQS-1; i > 0; i--) { - desc = irq_desc + i; - - spin_lock_irq(&desc->lock); - if (!irq_desc[i].action) - irq_desc[i].handler->startup(i); - spin_unlock_irq(&desc->lock); - } - - /* Wait for longstanding interrupts to trigger. 
*/ - for (delay = jiffies + HZ/50; time_after(delay, jiffies); ) - /* about 20ms delay */ synchronize_irq(); - - /* - * enable any unassigned irqs - * (we must startup again here because if a longstanding irq - * happened in the previous stage, it may have masked itself) - */ - for (i = NR_PIRQS-1; i > 0; i--) { - desc = irq_desc + i; - - spin_lock_irq(&desc->lock); - if (!desc->action) { - desc->status |= IRQ_AUTODETECT | IRQ_WAITING; - if (desc->handler->startup(i)) - desc->status |= IRQ_PENDING; - } - spin_unlock_irq(&desc->lock); - } - - /* - * Wait for spurious interrupts to trigger - */ - for (delay = jiffies + HZ/10; time_after(delay, jiffies); ) - /* about 100ms delay */ synchronize_irq(); - - /* - * Now filter out any obviously spurious interrupts - */ - val = 0; - for (i = 0; i < NR_PIRQS; i++) { - irq_desc_t *desc = irq_desc + i; - unsigned int status; - - spin_lock_irq(&desc->lock); - status = desc->status; - - if (status & IRQ_AUTODETECT) { - /* It triggered already - consider it spurious. */ - if (!(status & IRQ_WAITING)) { - desc->status = status & ~IRQ_AUTODETECT; - desc->handler->shutdown(i); - } else - if (i < 32) - val |= 1 << i; - } - spin_unlock_irq(&desc->lock); - } - - return val; -} - -/* - * Return a mask of triggered interrupts (this - * can handle only legacy ISA interrupts). - */ - -/** - * probe_irq_mask - scan a bitmap of interrupt lines - * @val: mask of interrupts to consider - * - * Scan the ISA bus interrupt lines and return a bitmap of - * active interrupts. The interrupt probe logic state is then - * returned to its previous value. - * - * Note: we need to scan all the irq's even though we will - * only return ISA irq numbers - just so that we reset them - * all to a known state. - */ -unsigned int probe_irq_mask(unsigned long val) -{ - int i; - unsigned int mask; - - mask = 0; - for (i = 0; i < NR_PIRQS; i++) { - irq_desc_t *desc = irq_desc + i; - unsigned int status; - - spin_lock_irq(&desc->lock); - status = desc->status; - - if (status & IRQ_AUTODETECT) { - if (i < 16 && !(status & IRQ_WAITING)) - mask |= 1 << i; - - desc->status = status & ~IRQ_AUTODETECT; - desc->handler->shutdown(i); - } - spin_unlock_irq(&desc->lock); - } - up(&probe_sem); - - return mask & val; -} - -/* - * Return the one interrupt that triggered (this can - * handle any interrupt source). - */ - -/** - * probe_irq_off - end an interrupt autodetect - * @val: mask of potential interrupts (unused) - * - * Scans the unused interrupt lines and returns the line which - * appears to have triggered the interrupt. If no interrupt was - * found then zero is returned. If more than one interrupt is - * found then minus the first candidate is returned to indicate - * there is doubt. - * - * The interrupt probe logic state is returned to its previous - * value. - * - * BUGS: When used in a module (which arguably shouldn't happen) - * nothing prevents two IRQ probe callers from overlapping. The - * results of this are non-optimal. 
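Taken together, the calls above combine into the usual autodetect sequence for a legacy device; a sketch in which the device-side trigger is hypothetical:

    unsigned long mask;
    int irq;

    mask = probe_irq_on();      /* arm autodetect and settle longstanding irqs */
    mydev_force_interrupt();    /* hypothetical: make the card pulse its line */
    udelay(100);                /* give the interrupt time to arrive */
    irq = probe_irq_off(mask);  /* >0: the line; 0: nothing; <0: more than one */
    if (irq <= 0)
        return -ENODEV;
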
- */ - -int probe_irq_off(unsigned long val) -{ - int i, irq_found, nr_irqs; - - nr_irqs = 0; - irq_found = 0; - for (i = 0; i < NR_PIRQS; i++) { - irq_desc_t *desc = irq_desc + i; - unsigned int status; - - spin_lock_irq(&desc->lock); - status = desc->status; - - if (status & IRQ_AUTODETECT) { - if (!(status & IRQ_WAITING)) { - if (!nr_irqs) - irq_found = i; - nr_irqs++; - } - desc->status = status & ~IRQ_AUTODETECT; - desc->handler->shutdown(i); - } - spin_unlock_irq(&desc->lock); - } - up(&probe_sem); - - if (nr_irqs > 1) - irq_found = -irq_found; - return irq_found; -} - -/* this was setup_x86_irq but it seems pretty generic */ -int setup_irq(unsigned int irq, struct irqaction * new) -{ - int shared = 0; - unsigned long flags; - struct irqaction *old, **p; - irq_desc_t *desc = irq_desc + irq; - - /* - * Some drivers like serial.c use request_irq() heavily, - * so we have to be careful not to interfere with a - * running system. - */ - if (new->flags & SA_SAMPLE_RANDOM) { - /* - * This function might sleep, we want to call it first, - * outside of the atomic block. - * Yes, this might clear the entropy pool if the wrong - * driver is attempted to be loaded, without actually - * installing a new handler, but is this really a problem, - * only the sysadmin is able to do this. - */ - rand_initialize_irq(irq); - } - - /* - * The following block of code has to be executed atomically - */ - spin_lock_irqsave(&desc->lock,flags); - p = &desc->action; - if ((old = *p) != NULL) { - /* Can't share interrupts unless both agree to */ - if (!(old->flags & new->flags & SA_SHIRQ)) { - spin_unlock_irqrestore(&desc->lock,flags); - return -EBUSY; - } - - /* add new interrupt at end of irq queue */ - do { - p = &old->next; - old = *p; - } while (old); - shared = 1; - } - - *p = new; - - if (!shared) { - desc->depth = 0; - desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT | IRQ_WAITING | IRQ_INPROGRESS); - desc->handler->startup(irq); - } - spin_unlock_irqrestore(&desc->lock,flags); - - register_irq_proc(irq); - return 0; -} - -static struct proc_dir_entry * root_irq_dir; -static struct proc_dir_entry * irq_dir [NR_IRQS]; - -#define HEX_DIGITS 8 - -static unsigned int parse_hex_value (const char *buffer, - unsigned long count, unsigned long *ret) -{ - unsigned char hexnum [HEX_DIGITS]; - unsigned long value; - int i; - - if (!count) - return -EINVAL; - if (count > HEX_DIGITS) - count = HEX_DIGITS; - if (copy_from_user(hexnum, buffer, count)) - return -EFAULT; - - /* - * Parse the first 8 characters as a hex string, any non-hex char - * is end-of-string. '00e1', 'e1', '00E1', 'E1' are all the same. - */ - value = 0; - - for (i = 0; i < count; i++) { - unsigned int c = hexnum[i]; - - switch (c) { - case '0' ... '9': c -= '0'; break; - case 'a' ... 'f': c -= 'a'-10; break; - case 'A' ... 'F': c -= 'A'-10; break; - default: - goto out; - } - value = (value << 4) | c; - } -out: - *ret = value; - return 0; -} - -#if CONFIG_SMP - -static struct proc_dir_entry * smp_affinity_entry [NR_IRQS]; - -static unsigned long irq_affinity [NR_IRQS] = { [0 ... 
NR_IRQS-1] = ~0UL }; -static int irq_affinity_read_proc (char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - if (count < HEX_DIGITS+1) - return -EINVAL; - return sprintf (page, "%08lx\n", irq_affinity[(long)data]); -} - -static int irq_affinity_write_proc (struct file *file, const char *buffer, - unsigned long count, void *data) -{ - int irq = (long) data, full_count = count, err; - unsigned long new_value; - - if (!irq_desc[irq].handler->set_affinity) - return -EIO; - - err = parse_hex_value(buffer, count, &new_value); - - /* - * Do not allow disabling IRQs completely - it's a too easy - * way to make the system unusable accidentally :-) At least - * one online CPU still has to be targeted. - */ - if (!(new_value & cpu_online_map)) - return -EINVAL; - - irq_affinity[irq] = new_value; - irq_desc[irq].handler->set_affinity(irq, new_value); - - return full_count; -} - -#endif - -static int prof_cpu_mask_read_proc (char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - unsigned long *mask = (unsigned long *) data; - if (count < HEX_DIGITS+1) - return -EINVAL; - return sprintf (page, "%08lx\n", *mask); -} - -static int prof_cpu_mask_write_proc (struct file *file, const char *buffer, - unsigned long count, void *data) -{ - unsigned long *mask = (unsigned long *) data, full_count = count, err; - unsigned long new_value; - - err = parse_hex_value(buffer, count, &new_value); - if (err) - return err; - - *mask = new_value; - return full_count; -} - -#define MAX_NAMELEN 10 - -static void register_irq_proc (unsigned int irq) -{ - char name [MAX_NAMELEN]; - - if (!root_irq_dir || (irq_desc[irq].handler == &no_irq_type) || - irq_dir[irq]) - return; - - memset(name, 0, MAX_NAMELEN); - sprintf(name, "%d", irq); - - /* create /proc/irq/1234 */ - irq_dir[irq] = proc_mkdir(name, root_irq_dir); - -#if CONFIG_SMP - { - struct proc_dir_entry *entry; - - /* create /proc/irq/1234/smp_affinity */ - entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]); - - if (entry) { - entry->nlink = 1; - entry->data = (void *)(long)irq; - entry->read_proc = irq_affinity_read_proc; - entry->write_proc = irq_affinity_write_proc; - } - - smp_affinity_entry[irq] = entry; - } -#endif -} - -unsigned long prof_cpu_mask = -1; - -void init_irq_proc (void) -{ - struct proc_dir_entry *entry; - int i; - - /* create /proc/irq */ - root_irq_dir = proc_mkdir("irq", 0); - - /* create /proc/irq/prof_cpu_mask */ - entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); - - if (!entry) - return; - - entry->nlink = 1; - entry->data = (void *)&prof_cpu_mask; - entry->read_proc = prof_cpu_mask_read_proc; - entry->write_proc = prof_cpu_mask_write_proc; - - /* - * Create entries for all existing IRQs. 
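The /proc/irq/N/smp_affinity files created here can be driven from userspace; a minimal sketch (the IRQ number and mask are illustrative, and the kernel side rejects a mask that covers no online CPU):

    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
        /* route IRQ 7 to CPU 1 only; up to 8 hex digits, per parse_hex_value() */
        int fd = open("/proc/irq/7/smp_affinity", O_WRONLY);
        if (fd < 0)
            return 1;
        write(fd, "00000002", 8);
        close(fd);
        return 0;
    }
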
- */ - for (i = 0; i < NR_IRQS; i++) - register_irq_proc(i); -} - diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/kernel/ldt.c --- a/linux-2.4.30-xen-sparse/arch/xen/kernel/ldt.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,272 +0,0 @@ -/* - * linux/kernel/ldt.c - * - * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds - * Copyright (C) 1999 Ingo Molnar <mingo@xxxxxxxxxx> - */ - -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/smp_lock.h> -#include <linux/vmalloc.h> -#include <linux/slab.h> - -#include <asm/mmu_context.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <asm/ldt.h> -#include <asm/desc.h> - -#ifdef CONFIG_SMP /* avoids "defined but not used" warning */ -static void flush_ldt(void *mm) -{ - if (current->active_mm) - load_LDT(&current->active_mm->context); -} -#endif - -static int alloc_ldt(mm_context_t *pc, int mincount, int reload) -{ - void *oldldt; - void *newldt; - int oldsize; - - if (mincount <= pc->size) - return 0; - oldsize = pc->size; - mincount = (mincount+511)&(~511); - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); - else - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); - - if (!newldt) - return -ENOMEM; - - if (oldsize) - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); - - oldldt = pc->ldt; - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); - wmb(); - pc->ldt = newldt; - pc->size = mincount; - if (reload) { - make_pages_readonly( - pc->ldt, - (pc->size*LDT_ENTRY_SIZE)/PAGE_SIZE); - load_LDT(pc); -#ifdef CONFIG_SMP - if (current->mm->cpu_vm_mask != (1<<smp_processor_id())) - smp_call_function(flush_ldt, 0, 1, 1); -#endif - } - wmb(); - if (oldsize) { - make_pages_writable( - oldldt, (oldsize*LDT_ENTRY_SIZE)/PAGE_SIZE); - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(oldldt); - else - kfree(oldldt); - } - return 0; -} - -static inline int copy_ldt(mm_context_t *new, mm_context_t *old) -{ - int err = alloc_ldt(new, old->size, 0); - if (err < 0) { - printk(KERN_WARNING "ldt allocation failed\n"); - new->size = 0; - return err; - } - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); - make_pages_readonly(new->ldt, (new->size*LDT_ENTRY_SIZE)/PAGE_SIZE); - return 0; -} - -/* - * we do not have to muck with descriptors here, that is - * done in switch_mm() as needed. - */ -int init_new_context(struct task_struct *tsk, struct mm_struct *mm) -{ - struct mm_struct * old_mm; - int retval = 0; - - init_MUTEX(&mm->context.sem); - mm->context.size = 0; - old_mm = current->mm; - if (old_mm && old_mm->context.size > 0) { - down(&old_mm->context.sem); - retval = copy_ldt(&mm->context, &old_mm->context); - up(&old_mm->context.sem); - } - return retval; -} - -/* - * No need to lock the MM as we are the last user - * Do not touch the ldt register, we are already - * in the next thread. 
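A quick worked example of the sizing logic in alloc_ldt() above, assuming i386's 8-byte descriptors (LDT_ENTRY_SIZE == 8) and 4 KB pages:

    mincount = (mincount + 511) & ~511;
    /* ask for 1 entry    ->  512 entries ->  512 * 8 = 4096 B  = PAGE_SIZE -> kmalloc() */
    /* ask for 513 entries -> 1024 entries -> 1024 * 8 = 8192 B  > PAGE_SIZE -> vmalloc() */
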
- */ -void destroy_context(struct mm_struct *mm) -{ - if (mm->context.size) { - make_pages_writable( - mm->context.ldt, - (mm->context.size*LDT_ENTRY_SIZE)/PAGE_SIZE); - if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(mm->context.ldt); - else - kfree(mm->context.ldt); - mm->context.size = 0; - } -} - -static int read_ldt(void * ptr, unsigned long bytecount) -{ - int err; - unsigned long size; - struct mm_struct * mm = current->mm; - - if (!mm->context.size) - return 0; - if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) - bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; - - down(&mm->context.sem); - size = mm->context.size*LDT_ENTRY_SIZE; - if (size > bytecount) - size = bytecount; - - err = 0; - if (copy_to_user(ptr, mm->context.ldt, size)) - err = -EFAULT; - up(&mm->context.sem); - if (err < 0) - return err; - if (size != bytecount) { - /* zero-fill the rest */ - clear_user(ptr+size, bytecount-size); - } - return bytecount; -} - -static int read_default_ldt(void * ptr, unsigned long bytecount) -{ - int err; - unsigned long size; - void *address; - - err = 0; - address = &default_ldt[0]; - size = 5*sizeof(struct desc_struct); - if (size > bytecount) - size = bytecount; - - err = size; - if (copy_to_user(ptr, address, size)) - err = -EFAULT; - - return err; -} - -static int write_ldt(void * ptr, unsigned long bytecount, int oldmode) -{ - struct mm_struct * mm = current->mm; - __u32 entry_1, entry_2, *lp; - unsigned long mach_lp; - int error; - struct modify_ldt_ldt_s ldt_info; - - error = -EINVAL; - if (bytecount != sizeof(ldt_info)) - goto out; - error = -EFAULT; - if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) - goto out; - - error = -EINVAL; - if (ldt_info.entry_number >= LDT_ENTRIES) - goto out; - if (ldt_info.contents == 3) { - if (oldmode) - goto out; - if (ldt_info.seg_not_present == 0) - goto out; - } - - down(&mm->context.sem); - if (ldt_info.entry_number >= mm->context.size) { - error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1); - if (error < 0) - goto out_unlock; - } - - lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); - mach_lp = arbitrary_virt_to_machine(lp); - - /* Allow LDTs to be cleared by the user. */ - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { - if (oldmode || - (ldt_info.contents == 0 && - ldt_info.read_exec_only == 1 && - ldt_info.seg_32bit == 0 && - ldt_info.limit_in_pages == 0 && - ldt_info.seg_not_present == 1 && - ldt_info.useable == 0 )) { - entry_1 = 0; - entry_2 = 0; - goto install; - } - } - - entry_1 = ((ldt_info.base_addr & 0x0000ffff) << 16) | - (ldt_info.limit & 0x0ffff); - entry_2 = (ldt_info.base_addr & 0xff000000) | - ((ldt_info.base_addr & 0x00ff0000) >> 16) | - (ldt_info.limit & 0xf0000) | - ((ldt_info.read_exec_only ^ 1) << 9) | - (ldt_info.contents << 10) | - ((ldt_info.seg_not_present ^ 1) << 15) | - (ldt_info.seg_32bit << 22) | - (ldt_info.limit_in_pages << 23) | - 0x7000; - if (!oldmode) - entry_2 |= (ldt_info.useable << 20); - - /* Install the new entry ... 
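To make the bit-packing above concrete, one worked encoding (the field values are illustrative): base_addr = 0x1000, limit = 0xfff, seg_32bit = 1, everything else 0:

    entry_1 = ((0x1000 & 0x0000ffff) << 16) | (0xfff & 0x0ffff); /* = 0x10000fff */
    entry_2 = ((0 ^ 1) << 9)    /* readable (read_exec_only ^ 1)   = 0x00000200 */
            | ((0 ^ 1) << 15)   /* present  (seg_not_present ^ 1)  = 0x00008000 */
            | (1 << 22)         /* 32-bit segment                  = 0x00400000 */
            | 0x7000;           /* S = 1, DPL = 3                               */
                                /* total: entry_2 = 0x0040f200                  */
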
*/ -install: - error = HYPERVISOR_update_descriptor(mach_lp, entry_1, entry_2); - -out_unlock: - up(&mm->context.sem); -out: - return error; -} - -asmlinkage int sys_modify_ldt(int func, void *ptr, unsigned long bytecount) -{ - int ret = -ENOSYS; - - switch (func) { - case 0: - ret = read_ldt(ptr, bytecount); - break; - case 1: - ret = write_ldt(ptr, bytecount, 1); - break; - case 2: - ret = read_default_ldt(ptr, bytecount); - break; - case 0x11: - ret = write_ldt(ptr, bytecount, 0); - break; - } - return ret; -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/kernel/pci-pc.c --- a/linux-2.4.30-xen-sparse/arch/xen/kernel/pci-pc.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,260 +0,0 @@ -/* - * Low-Level PCI Support for PC - * - * (c) 1999--2000 Martin Mares <mj@xxxxxx> - * - * Adjusted to use Xen's interface by Rolf Neugebauer, Intel Research Cambridge - * Further modifications by Keir Fraser, University of Cambridge - */ - -#include <linux/config.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/pci.h> -#include <linux/init.h> -#include <linux/ioport.h> - -#include <asm/segment.h> -#include <asm/io.h> - -#include <asm-xen/xen-public/xen.h> -#include <asm-xen/xen-public/physdev.h> - -#include "pci-i386.h" - -/* - * NB. The following interface functions are not included here: - * 1. void eisa_set_level_irq(unsigned int irq) - * 2. irq_routing_table * __devinit pcibios_get_irq_routing_table(void) - * 3. int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq) - * All are used by the ACPI driver. This should be ported to Xen if it is - * ever required -- Xen is the ultimate source for IRQ-routing knowledge. - */ - -struct pci_ops *pci_root_ops = NULL; - -int (*pci_config_read)(int seg, int bus, int dev, int fn, - int reg, int len, u32 *value) = NULL; -int (*pci_config_write)(int seg, int bus, int dev, int fn, - int reg, int len, u32 value) = NULL; - -unsigned int pci_probe = PCI_PROBE_BIOS; - -struct pci_fixup pcibios_fixups[] = { { 0 } }; - -static int pci_confx_read(int seg, int bus, int dev, int fn, int reg, - int len, u32 *value) -{ - int ret; - physdev_op_t op; - - if (bus > 255 || dev > 31 || fn > 7 || reg > 255) - return -EINVAL; - - op.cmd = PHYSDEVOP_PCI_CFGREG_READ; - op.u.pci_cfgreg_read.bus = bus; - op.u.pci_cfgreg_read.dev = dev; - op.u.pci_cfgreg_read.func = fn; - op.u.pci_cfgreg_read.reg = reg; - op.u.pci_cfgreg_read.len = len; - - if ( (ret = HYPERVISOR_physdev_op(&op)) != 0 ) - return ret; - - *value = op.u.pci_cfgreg_read.value; - - return 0; -} - -static int pci_confx_write(int seg, int bus, int dev, int fn, int reg, - int len, u32 value) -{ - int ret; - physdev_op_t op; - - if ((bus > 255 || dev > 31 || fn > 7 || reg > 255)) - return -EINVAL; - - op.cmd = PHYSDEVOP_PCI_CFGREG_WRITE; - op.u.pci_cfgreg_write.bus = bus; - op.u.pci_cfgreg_write.dev = dev; - op.u.pci_cfgreg_write.func = fn; - op.u.pci_cfgreg_write.reg = reg; - op.u.pci_cfgreg_write.len = len; - op.u.pci_cfgreg_write.value = value; - - if ( (ret = HYPERVISOR_physdev_op(&op)) != 0 ) - return ret; - return 0; -} - - -static int pci_confx_read_config_byte(struct pci_dev *dev, - int where, u8 *value) -{ - int result; - u32 data; - - result = pci_confx_read(0, dev->bus->number, PCI_SLOT(dev->devfn), - PCI_FUNC(dev->devfn), where, 1, &data); - - *value = (u8)data; - - return result; -} - -static int pci_confx_read_config_word(struct pci_dev *dev, - int where, u16 *value) -{ - int result; - u32 data; - - result = 
pci_confx_read(0, dev->bus->number, PCI_SLOT(dev->devfn), - PCI_FUNC(dev->devfn), where, 2, &data); - - *value = (u16)data; - - return result; -} - -static int pci_confx_read_config_dword(struct pci_dev *dev, - int where, u32 *value) -{ - return pci_confx_read(0, dev->bus->number, PCI_SLOT(dev->devfn), - PCI_FUNC(dev->devfn), where, 4, value); -} - -static int pci_confx_write_config_byte(struct pci_dev *dev, - int where, u8 value) -{ - return pci_confx_write(0, dev->bus->number, PCI_SLOT(dev->devfn), - PCI_FUNC(dev->devfn), where, 1, value); -} - -static int pci_confx_write_config_word(struct pci_dev *dev, - int where, u16 value) -{ - return pci_confx_write(0, dev->bus->number, PCI_SLOT(dev->devfn), - PCI_FUNC(dev->devfn), where, 2, value); -} - -static int pci_confx_write_config_dword(struct pci_dev *dev, - int where, u32 value) -{ - return pci_confx_write(0, dev->bus->number, PCI_SLOT(dev->devfn), - PCI_FUNC(dev->devfn), where, 4, value); -} - -static struct pci_ops pci_conf_xen = { - pci_confx_read_config_byte, - pci_confx_read_config_word, - pci_confx_read_config_dword, - pci_confx_write_config_byte, - pci_confx_write_config_word, - pci_confx_write_config_dword -}; - -void pcibios_penalize_isa_irq(int irq) -{ - /* nothing */ -} - -void __devinit pcibios_fixup_bus(struct pci_bus *b) -{ - pci_read_bridge_bases(b); -} - -struct pci_bus * __devinit pcibios_scan_root(int busnum) -{ - struct list_head *list; - struct pci_bus *bus; - - list_for_each ( list, &pci_root_buses ) - { - bus = pci_bus_b(list); - if ( bus->number == busnum ) - return bus; - } - - printk("PCI: Probing PCI hardware (bus %02x)\n", busnum); - return pci_scan_bus(busnum, pci_root_ops, NULL); -} - -void __init pcibios_init(void) -{ - int bus; - physdev_op_t op; - - if ( !pci_probe ) - return; - - pci_root_ops = &pci_conf_xen; - pci_config_read = pci_confx_read; - pci_config_write = pci_confx_write; - - pcibios_set_cacheline_size(); - - op.cmd = PHYSDEVOP_PCI_PROBE_ROOT_BUSES; - if ( HYPERVISOR_physdev_op(&op) != 0 ) - { - printk(KERN_WARNING "PCI: System does not support PCI\n"); - return; - } - - printk(KERN_INFO "PCI: Probing PCI hardware\n"); - for ( bus = 0; bus < 256; bus++ ) - if ( test_bit(bus, &op.u.pci_probe_root_buses.busmask[0]) ) - (void)pcibios_scan_root(bus); - - pcibios_resource_survey(); -} - -char * __devinit pcibios_setup(char *str) -{ - if ( !strcmp(str, "off") ) - pci_probe = 0; - return NULL; -} - -unsigned int pcibios_assign_all_busses(void) -{ - return 0; -} - -int pcibios_enable_device(struct pci_dev *dev, int mask) -{ - int err; - u8 pin; - physdev_op_t op; - - /* Inform Xen that we are going to use this device. */ - op.cmd = PHYSDEVOP_PCI_INITIALISE_DEVICE; - op.u.pci_initialise_device.bus = dev->bus->number; - op.u.pci_initialise_device.dev = PCI_SLOT(dev->devfn); - op.u.pci_initialise_device.func = PCI_FUNC(dev->devfn); - if ( (err = HYPERVISOR_physdev_op(&op)) != 0 ) - return err; - - /* Now we can bind to the very final IRQ line. */ - pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &pin); - dev->irq = pin; - - /* Turn on device I/O and memory access as necessary. */ - if ( (err = pcibios_enable_resources(dev, mask)) < 0 ) - return err; - - /* Sanity-check that an interrupt-producing device is routed to an IRQ. 
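For orientation, a config-space read issued by a driver travels through the ops table above and ends in a hypercall; a sketch of the path (the offset shown is the standard vendor-ID register):

    u16 vendor;

    pci_read_config_word(dev, PCI_VENDOR_ID, &vendor);
    /* -> pci_confx_read_config_word(dev, 0x00, &vendor)
     * -> pci_confx_read(0, bus, slot, func, 0x00, 2, &data)
     * -> HYPERVISOR_physdev_op(&op) with op.cmd = PHYSDEVOP_PCI_CFGREG_READ */
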
*/ - pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); - if ( pin != 0 ) - { - if ( dev->irq != 0 ) - printk(KERN_INFO "PCI: Obtained IRQ %d for device %s\n", - dev->irq, dev->slot_name); - else - printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of " - "device %s.\n", 'A' + pin - 1, dev->slot_name); - } - - return 0; -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/kernel/process.c --- a/linux-2.4.30-xen-sparse/arch/xen/kernel/process.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,448 +0,0 @@ -/* - * linux/arch/i386/kernel/process.c - * - * Copyright (C) 1995 Linus Torvalds - * - * Pentium III FXSR, SSE support - * Gareth Hughes <gareth@xxxxxxxxxxx>, May 2000 - */ - -/* - * This file handles the architecture-dependent parts of process handling.. - */ - -#define __KERNEL_SYSCALLS__ -#include <stdarg.h> - -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/smp_lock.h> -#include <linux/stddef.h> -#include <linux/unistd.h> -#include <linux/ptrace.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/user.h> -#include <linux/a.out.h> -#include <linux/interrupt.h> -#include <linux/config.h> -#include <linux/delay.h> -#include <linux/reboot.h> -#include <linux/init.h> -#include <linux/mc146818rtc.h> - -#include <asm/uaccess.h> -#include <asm/pgtable.h> -#include <asm/system.h> -#include <asm/io.h> -#include <asm/ldt.h> -#include <asm/processor.h> -#include <asm/i387.h> -#include <asm/desc.h> -#include <asm/mmu_context.h> -#include <asm-xen/xen-public/physdev.h> - -#include <linux/irq.h> - -asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); - -int hlt_counter; - -/* - * Powermanagement idle function, if any.. - */ -void (*pm_idle)(void); - -/* - * Power off function, if any - */ -void (*pm_power_off)(void); - -void disable_hlt(void) -{ - hlt_counter++; -} - -void enable_hlt(void) -{ - hlt_counter--; -} - -/* - * The idle thread. There's no useful work to be - * done, so just try to conserve power and have a - * low exit latency (ie sit in a loop waiting for - * somebody to say that they'd like to reschedule) - */ -void cpu_idle (void) -{ - extern int set_timeout_timer(void); - - /* Endless idle loop with no priority at all. */ - init_idle(); - current->nice = 20; - current->counter = -100; - - for ( ; ; ) - { - while ( !current->need_resched ) - { - __cli(); - if ( current->need_resched ) - { - /* The race-free check for events failed. */ - __sti(); - break; - } - else if ( set_timeout_timer() == 0 ) - { - /* NB. Blocking reenable events in a race-free manner. */ - HYPERVISOR_block(); - } - else - { - /* No race here: yielding will get us the CPU again anyway. 
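The idle loop above is the standard check-then-block idiom for not losing a wakeup; distilled, with the same names as the surrounding code:

    __cli();                        /* close the event-delivery window */
    if ( !current->need_resched &&  /* re-check the condition under cli */
         set_timeout_timer() == 0 )
        HYPERVISOR_block();         /* hypervisor re-enables delivery atomically */
    else
        __sti();                    /* reopen the window, then yield or retry */
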
*/ - __sti(); - HYPERVISOR_yield(); - } - } - schedule(); - check_pgt_cache(); - } -} - -extern void show_trace(unsigned long* esp); - -void show_regs(struct pt_regs * regs) -{ - printk("\n"); - printk("Pid: %d, comm: %20s\n", current->pid, current->comm); - printk("EIP: %04x:[<%08lx>] CPU: %d",0xffff & regs->xcs,regs->eip, smp_processor_id()); - if (regs->xcs & 2) - printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); - printk(" EFLAGS: %08lx %s\n",regs->eflags, print_tainted()); - printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", - regs->eax,regs->ebx,regs->ecx,regs->edx); - printk("ESI: %08lx EDI: %08lx EBP: %08lx", - regs->esi, regs->edi, regs->ebp); - printk(" DS: %04x ES: %04x\n", - 0xffff & regs->xds,0xffff & regs->xes); - - show_trace(&regs->esp); -} - - -/* - * Create a kernel thread - */ -int arch_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) -{ - long retval, d0; - - __asm__ __volatile__( - "movl %%esp,%%esi\n\t" - "int $0x80\n\t" /* Linux/i386 system call */ - "cmpl %%esp,%%esi\n\t" /* child or parent? */ - "je 1f\n\t" /* parent - jump */ - /* Load the argument into eax, and push it. That way, it does - * not matter whether the called function is compiled with - * -mregparm or not. */ - "movl %4,%%eax\n\t" - "pushl %%eax\n\t" - "call *%5\n\t" /* call fn */ - "movl %3,%0\n\t" /* exit */ - "int $0x80\n" - "1:\t" - :"=&a" (retval), "=&S" (d0) - :"0" (__NR_clone), "i" (__NR_exit), - "r" (arg), "r" (fn), - "b" (flags | CLONE_VM) - : "memory"); - - return retval; -} - -/* - * Free current thread data structures etc.. - */ -void exit_thread(void) -{ - /* nothing to do ... */ -} - -void flush_thread(void) -{ - struct task_struct *tsk = current; - - memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); - - /* - * Forget coprocessor state.. - */ - clear_fpu(tsk); - tsk->used_math = 0; -} - -void release_thread(struct task_struct *dead_task) -{ - if (dead_task->mm) { - // temporary debugging check - if (dead_task->mm->context.size) { - printk("WARNING: dead process %8s still has LDT? <%p/%08x>\n", - dead_task->comm, - dead_task->mm->context.ldt, - dead_task->mm->context.size); - BUG(); - } - } - //release_x86_irqs(dead_task); -} - - -/* - * Save a segment. - */ -#define savesegment(seg,value) \ - asm volatile("movl %%" #seg ",%0":"=m" (*(int *)&(value))) - -int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, - unsigned long unused, - struct task_struct * p, struct pt_regs * regs) -{ - struct pt_regs * childregs; - - childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p)) - 1; - struct_cpy(childregs, regs); - childregs->eax = 0; - childregs->esp = esp; - - p->thread.esp = (unsigned long) childregs; - p->thread.esp0 = (unsigned long) (childregs+1); - - p->thread.eip = (unsigned long) ret_from_fork; - - savesegment(fs,p->thread.fs); - savesegment(gs,p->thread.gs); - - unlazy_fpu(current); - struct_cpy(&p->thread.i387, &current->thread.i387); - - p->thread.io_pl = current->thread.io_pl; - - return 0; -} - -/* - * fill in the user structure for a core dump.. - */ -void dump_thread(struct pt_regs * regs, struct user * dump) -{ - int i; - -/* changed the size calculations - should hopefully work better. 
lbt */ - dump->magic = CMAGIC; - dump->start_code = 0; - dump->start_stack = regs->esp & ~(PAGE_SIZE - 1); - dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; - dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; - dump->u_dsize -= dump->u_tsize; - dump->u_ssize = 0; - for (i = 0; i < 8; i++) - dump->u_debugreg[i] = current->thread.debugreg[i]; - - if (dump->start_stack < TASK_SIZE) - dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; - - dump->regs.ebx = regs->ebx; - dump->regs.ecx = regs->ecx; - dump->regs.edx = regs->edx; - dump->regs.esi = regs->esi; - dump->regs.edi = regs->edi; - dump->regs.ebp = regs->ebp; - dump->regs.eax = regs->eax; - dump->regs.ds = regs->xds; - dump->regs.es = regs->xes; - savesegment(fs,dump->regs.fs); - savesegment(gs,dump->regs.gs); - dump->regs.orig_eax = regs->orig_eax; - dump->regs.eip = regs->eip; - dump->regs.cs = regs->xcs; - dump->regs.eflags = regs->eflags; - dump->regs.esp = regs->esp; - dump->regs.ss = regs->xss; - - dump->u_fpvalid = dump_fpu (regs, &dump->i387); -} - -/* - * switch_to(x,yn) should switch tasks from x to y. - * - * We fsave/fwait so that an exception goes off at the right time - * (as a call from the fsave or fwait in effect) rather than to - * the wrong process. Lazy FP saving no longer makes any sense - * with modern CPU's, and this simplifies a lot of things (SMP - * and UP become the same). - * - * NOTE! We used to use the x86 hardware context switching. The - * reason for not using it any more becomes apparent when you - * try to recover gracefully from saved state that is no longer - * valid (stale segment register values in particular). With the - * hardware task-switch, there is no way to fix up bad state in - * a reasonable manner. - * - * The fact that Intel documents the hardware task-switching to - * be slow is a fairly red herring - this code is not noticeably - * faster. However, there _is_ some room for improvement here, - * so the performance issues may eventually be a valid point. - * More important, however, is the fact that this allows us much - * more flexibility. - */ -void fastcall __switch_to(struct task_struct *prev_p, struct task_struct *next_p) -{ - struct thread_struct *next = &next_p->thread; - physdev_op_t op; - multicall_entry_t _mcl[8], *mcl = _mcl; - - /* - * This is basically 'unlazy_fpu', except that we queue a multicall to - * indicate FPU task switch, rather than synchronously trapping to Xen. - */ - if ( prev_p->flags & PF_USEDFPU ) - { - if ( cpu_has_fxsr ) - asm volatile( "fxsave %0 ; fnclex" - : "=m" (prev_p->thread.i387.fxsave) ); - else - asm volatile( "fnsave %0 ; fwait" - : "=m" (prev_p->thread.i387.fsave) ); - prev_p->flags &= ~PF_USEDFPU; - mcl->op = __HYPERVISOR_fpu_taskswitch; - mcl->args[0] = 1; - mcl++; - } - - mcl->op = __HYPERVISOR_stack_switch; - mcl->args[0] = __KERNEL_DS; - mcl->args[1] = next->esp0; - mcl++; - - if ( prev_p->thread.io_pl != next->io_pl ) - { - op.cmd = PHYSDEVOP_SET_IOPL; - op.u.set_iopl.iopl = next->io_pl; - mcl->op = __HYPERVISOR_physdev_op; - mcl->args[0] = (unsigned long)&op; - mcl++; - } - - (void)HYPERVISOR_multicall(_mcl, mcl - _mcl); - - /* - * Restore %fs and %gs. 
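The multicall queue built above is the characteristic Xen batching pattern: accumulate hypercall descriptors, then cross into the hypervisor once. In outline (mirroring the code in __switch_to()):

    multicall_entry_t mcl[3], *m = mcl;

    m->op = __HYPERVISOR_fpu_taskswitch;  /* queued only if the FPU was used */
    m->args[0] = 1;
    m++;
    m->op = __HYPERVISOR_stack_switch;    /* always queued */
    m->args[0] = __KERNEL_DS;
    m->args[1] = next->esp0;
    m++;
    (void)HYPERVISOR_multicall(mcl, m - mcl); /* one trap instead of several */
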
- */ - loadsegment(fs, next->fs); - loadsegment(gs, next->gs); - - /* - * Now maybe reload the debug registers - */ - if ( next->debugreg[7] != 0 ) - { - HYPERVISOR_set_debugreg(0, next->debugreg[0]); - HYPERVISOR_set_debugreg(1, next->debugreg[1]); - HYPERVISOR_set_debugreg(2, next->debugreg[2]); - HYPERVISOR_set_debugreg(3, next->debugreg[3]); - /* no 4 and 5 */ - HYPERVISOR_set_debugreg(6, next->debugreg[6]); - HYPERVISOR_set_debugreg(7, next->debugreg[7]); - } -} - -asmlinkage int sys_fork(struct pt_regs regs) -{ - return do_fork(SIGCHLD, regs.esp, ®s, 0); -} - -asmlinkage int sys_clone(struct pt_regs regs) -{ - unsigned long clone_flags; - unsigned long newsp; - - clone_flags = regs.ebx; - newsp = regs.ecx; - if (!newsp) - newsp = regs.esp; - return do_fork(clone_flags, newsp, ®s, 0); -} - -/* - * This is trivial, and on the face of it looks like it - * could equally well be done in user mode. - * - * Not so, for quite unobvious reasons - register pressure. - * In user mode vfork() cannot have a stack frame, and if - * done by calling the "clone()" system call directly, you - * do not have enough call-clobbered registers to hold all - * the information you need. - */ -asmlinkage int sys_vfork(struct pt_regs regs) -{ - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, ®s, 0); -} - -/* - * sys_execve() executes a new program. - */ -asmlinkage int sys_execve(struct pt_regs regs) -{ - int error; - char * filename; - - filename = getname((char *) regs.ebx); - error = PTR_ERR(filename); - if (IS_ERR(filename)) - goto out; - error = do_execve(filename, (char **) regs.ecx, (char **) regs.edx, ®s); - if (error == 0) - current->ptrace &= ~PT_DTRACE; - putname(filename); - out: - return error; -} - -/* - * These bracket the sleeping functions.. - */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); -#define first_sched ((unsigned long) scheduling_functions_start_here) -#define last_sched ((unsigned long) scheduling_functions_end_here) - -unsigned long get_wchan(struct task_struct *p) -{ - unsigned long ebp, esp, eip; - unsigned long stack_page; - int count = 0; - if (!p || p == current || p->state == TASK_RUNNING) - return 0; - stack_page = (unsigned long)p; - esp = p->thread.esp; - if (!stack_page || esp < stack_page || esp > 8188+stack_page) - return 0; - /* include/asm-i386/system.h:switch_to() pushes ebp last. 
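
get_wchan() below relies on the i386 frame layout: in each frame the saved caller %ebp sits at [ebp] and the return address at [ebp+4], so the kernel hops frame to frame until it finds a return address outside the scheduler. The same chain can be inspected from user space with GCC builtins; a sketch, assuming the build keeps frame pointers (-fno-omit-frame-pointer):

    #include <stdio.h>

    static void __attribute__((noinline)) inner(void)
    {
        /* Frame 0 is inner(); its return address points back into main(). */
        printf("frame: %p  return address: %p\n",
               __builtin_frame_address(0), __builtin_return_address(0));
    }

    int main(void)
    {
        inner();
        return 0;
    }
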
*/ - ebp = *(unsigned long *) esp; - do { - if (ebp < stack_page || ebp > 8184+stack_page) - return 0; - eip = *(unsigned long *) (ebp+4); - if (eip < first_sched || eip >= last_sched) - return eip; - ebp = *(unsigned long *) ebp; - } while (count++ < 16); - return 0; -} -#undef last_sched -#undef first_sched diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/kernel/setup.c --- a/linux-2.4.30-xen-sparse/arch/xen/kernel/setup.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,1213 +0,0 @@ -/* - * linux/arch/i386/kernel/setup.c - * - * Copyright (C) 1995 Linus Torvalds - */ - -/* - * This file handles the architecture-dependent parts of initialization - */ - -#define __KERNEL_SYSCALLS__ -static int errno; -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/stddef.h> -#include <linux/unistd.h> -#include <linux/ptrace.h> -#include <linux/slab.h> -#include <linux/user.h> -#include <linux/a.out.h> -#include <linux/tty.h> -#include <linux/ioport.h> -#include <linux/delay.h> -#include <linux/config.h> -#include <linux/init.h> -#include <linux/apm_bios.h> -#ifdef CONFIG_BLK_DEV_RAM -#include <linux/blk.h> -#endif -#include <linux/highmem.h> -#include <linux/bootmem.h> -#include <linux/seq_file.h> -#include <linux/reboot.h> -#include <asm/processor.h> -#include <linux/console.h> -#include <linux/module.h> -#include <asm/mtrr.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <asm/io.h> -#include <asm/smp.h> -#include <asm/msr.h> -#include <asm/desc.h> -#include <asm/dma.h> -#include <asm/mpspec.h> -#include <asm/mmu_context.h> -#include <asm/ctrl_if.h> -#include <asm/hypervisor.h> -#include <asm-xen/xen-public/physdev.h> -#include <linux/netdevice.h> -#include <linux/rtnetlink.h> -#include <linux/tqueue.h> -#include <net/pkt_sched.h> /* dev_(de)activate */ - -/* - * Point at the empty zero page to start with. We map the real shared_info - * page as soon as fixmap is up and running. - */ -shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page; - -unsigned int *phys_to_machine_mapping, *pfn_to_mfn_frame_list; - -/* - * Machine setup.. - */ - -char ignore_irq13; /* set if exception 16 works */ -struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; - -unsigned long mmu_cr4_features; - -unsigned char * vgacon_mmap; - -/* - * Bus types .. - */ -#ifdef CONFIG_EISA -int EISA_bus; -#endif -int MCA_bus; - -/* for MCA, but anyone else can use it if they want */ -unsigned int machine_id; -unsigned int machine_submodel_id; -unsigned int BIOS_revision; -unsigned int mca_pentium_flag; - -/* For PCI or other memory-mapped resources */ -unsigned long pci_mem_start = 0x10000000; - -/* - * Setup options - */ -struct drive_info_struct { char dummy[32]; } drive_info; -struct screen_info screen_info; -struct apm_info apm_info; -struct sys_desc_table_struct { - unsigned short length; - unsigned char table[0]; -}; - -unsigned char aux_device_present; - -extern int root_mountflags; -extern char _text, _etext, _edata, _end; - -extern int blk_nohighio; - -int enable_acpi_smp_table; - -/* Raw start-of-day parameters from the hypervisor. 
*/ -union xen_start_info_union xen_start_info_union; - -#define COMMAND_LINE_SIZE MAX_GUEST_CMDLINE -static char command_line[COMMAND_LINE_SIZE]; -char saved_command_line[COMMAND_LINE_SIZE]; - -/* parse_mem_cmdline() - * returns the value of the mem= boot param converted to pages or 0 - */ -static int __init parse_mem_cmdline (char ** cmdline_p) -{ - char c = ' ', *to = command_line, *from = saved_command_line; - int len = 0; - unsigned long long bytes; - int mem_param = 0; - - /* Save unparsed command line copy for /proc/cmdline */ - memcpy(saved_command_line, xen_start_info.cmd_line, COMMAND_LINE_SIZE); - saved_command_line[COMMAND_LINE_SIZE-1] = '\0'; - - for (;;) { - /* - * "mem=nopentium" disables the 4MB page tables. - * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM - * to <mem>, overriding the bios size. - * "mem=XXX[KkmM]@XXX[KkmM]" defines a memory region from - * <start> to <start>+<mem>, overriding the bios size. - */ - if (c == ' ' && !memcmp(from, "mem=", 4)) { - if (to != command_line) - to--; - if (!memcmp(from+4, "nopentium", 9)) { - from += 9+4; - } else if (!memcmp(from+4, "exactmap", 8)) { - from += 8+4; - } else { - bytes = memparse(from+4, &from); - mem_param = bytes>>PAGE_SHIFT; - if (*from == '@') - (void)memparse(from+1, &from); - } - } - - c = *(from++); - if (!c) - break; - if (COMMAND_LINE_SIZE <= ++len) - break; - *(to++) = c; - } - *to = '\0'; - *cmdline_p = command_line; - - return mem_param; -} - -/* - * Every exception-fixup table is sorted (i.e., kernel main table, and every - * module table. Some elements may be out of order if they reference text.init, - * for example. - */ -static void sort_exception_table(struct exception_table_entry *start, - struct exception_table_entry *end) -{ - struct exception_table_entry *p, *q, tmp; - - for ( p = start; p < end; p++ ) - { - for ( q = p-1; q > start; q-- ) - if ( p->insn > q->insn ) - break; - if ( ++q != p ) - { - tmp = *p; - memmove(q+1, q, (p-q)*sizeof(*p)); - *q = tmp; - } - } -} - -int xen_module_init(struct module *mod) -{ - sort_exception_table(mod->ex_table_start, mod->ex_table_end); - return 0; -} - -void __init setup_arch(char **cmdline_p) -{ - int i,j; - unsigned long bootmap_size, start_pfn, lmax_low_pfn; - int mem_param; /* user specified memory size in pages */ - int boot_pfn; /* low pages available for bootmem */ - physdev_op_t op; - - extern void hypervisor_callback(void); - extern void failsafe_callback(void); - - extern unsigned long cpu0_pte_quicklist[]; - extern unsigned long cpu0_pgd_quicklist[]; - - extern const struct exception_table_entry __start___ex_table[]; - extern const struct exception_table_entry __stop___ex_table[]; - - extern char _stext; - - /* Force a quick death if the kernel panics. */ - extern int panic_timeout; - if ( panic_timeout == 0 ) - panic_timeout = 1; - - /* Ensure that the kernel exception-fixup table is sorted. */ - sort_exception_table(__start___ex_table, __stop___ex_table); - -#ifndef CONFIG_HIGHIO - blk_nohighio = 1; -#endif - - HYPERVISOR_vm_assist( - VMASST_CMD_enable, VMASST_TYPE_4gb_segments); - HYPERVISOR_vm_assist( - VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); - - HYPERVISOR_set_callbacks( - __KERNEL_CS, (unsigned long)hypervisor_callback, - __KERNEL_CS, (unsigned long)failsafe_callback); - - boot_cpu_data.pgd_quick = cpu0_pgd_quicklist; - boot_cpu_data.pte_quick = cpu0_pte_quicklist; - - /* This must be initialized to UNNAMED_MAJOR for ipconfig to work - properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd. 
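
parse_mem_cmdline() above leans on memparse() to turn a mem=XXX[kKmMgG] string into bytes before shifting down to pages. A rough user-space analogue of that suffix handling — memparse_demo() is an illustrative stand-in, not the kernel routine:

    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SHIFT 12

    /* Rough analogue of the kernel's memparse(): number plus k/m/g suffix. */
    static unsigned long long memparse_demo(const char *s, char **retptr)
    {
        unsigned long long v = strtoull(s, retptr, 0);

        switch (**retptr) {
        case 'g': case 'G': v <<= 10;             /* fall through */
        case 'm': case 'M': v <<= 10;             /* fall through */
        case 'k': case 'K': v <<= 10; (*retptr)++; break;
        }
        return v;
    }

    int main(void)
    {
        char *end;
        unsigned long long bytes = memparse_demo("512M", &end);

        printf("mem=512M -> %llu bytes -> %llu pages\n",
               bytes, bytes >> PAGE_SHIFT);   /* 536870912 bytes, 131072 pages */
        return 0;
    }
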
*/ - ROOT_DEV = MKDEV(UNNAMED_MAJOR,0); - memset(&drive_info, 0, sizeof(drive_info)); - memset(&screen_info, 0, sizeof(screen_info)); - - /* This is drawn from a dump from vgacon:startup in standard Linux. */ - screen_info.orig_video_mode = 3; - screen_info.orig_video_isVGA = 1; - screen_info.orig_video_lines = 25; - screen_info.orig_video_cols = 80; - screen_info.orig_video_ega_bx = 3; - screen_info.orig_video_points = 16; - - memset(&apm_info.bios, 0, sizeof(apm_info.bios)); - aux_device_present = 0; -#ifdef CONFIG_BLK_DEV_RAM - rd_image_start = 0; - rd_prompt = 0; - rd_doload = 0; -#endif - - root_mountflags &= ~MS_RDONLY; - init_mm.start_code = (unsigned long) &_text; - init_mm.end_code = (unsigned long) &_etext; - init_mm.end_data = (unsigned long) &_edata; - init_mm.brk = (unsigned long) &_end; - - /* The mem= kernel command line param overrides the detected amount - * of memory. For xenolinux, if this override is larger than detected - * memory, then boot using only detected memory and make provisions to - * use all of the override value. The hypervisor can give this - * domain more memory later on and it will be added to the free - * lists at that time. See claim_new_pages() in - * arch/xen/drivers/balloon/balloon.c - */ - mem_param = parse_mem_cmdline(cmdline_p); - if (mem_param < xen_start_info.nr_pages) - mem_param = xen_start_info.nr_pages; - -#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) -#define PFN_DOWN(x) ((x) >> PAGE_SHIFT) -#define PFN_PHYS(x) ((x) << PAGE_SHIFT) - -/* - * 128MB for vmalloc(), iomap(), kmap(), and fixaddr mappings. - */ -#define VMALLOC_RESERVE (unsigned long)(128 << 20) -#define MAXMEM (unsigned long)(HYPERVISOR_VIRT_START-PAGE_OFFSET-VMALLOC_RESERVE) -#define MAXMEM_PFN PFN_DOWN(MAXMEM) -#define MAX_NONPAE_PFN (1 << 20) - - /* - * Determine low and high memory ranges: - */ - lmax_low_pfn = max_pfn = mem_param; - if (lmax_low_pfn > MAXMEM_PFN) { - lmax_low_pfn = MAXMEM_PFN; -#ifndef CONFIG_HIGHMEM - /* Maximum memory usable is what is directly addressable */ - printk(KERN_WARNING "Warning only %ldMB will be used.\n", - MAXMEM>>20); - if (max_pfn > MAX_NONPAE_PFN) - printk(KERN_WARNING "Use a PAE enabled kernel.\n"); - else - printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); - max_pfn = lmax_low_pfn; -#else /* !CONFIG_HIGHMEM */ -#ifndef CONFIG_X86_PAE - if (max_pfn > MAX_NONPAE_PFN) { - max_pfn = MAX_NONPAE_PFN; - printk(KERN_WARNING "Warning only 4GB will be used.\n"); - printk(KERN_WARNING "Use a PAE enabled kernel.\n"); - } -#endif /* !CONFIG_X86_PAE */ -#endif /* !CONFIG_HIGHMEM */ - } - -#ifdef CONFIG_HIGHMEM - highstart_pfn = highend_pfn = max_pfn; - if (max_pfn > MAXMEM_PFN) { - highstart_pfn = MAXMEM_PFN; - printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", - pages_to_mb(highend_pfn - highstart_pfn)); - } -#endif - - phys_to_machine_mapping = (unsigned int *)xen_start_info.mfn_list; - cur_pgd = init_mm.pgd = (pgd_t *)xen_start_info.pt_base; - - start_pfn = (__pa(xen_start_info.pt_base) >> PAGE_SHIFT) + - xen_start_info.nr_pt_frames; - - /* - * Initialize the boot-time allocator, and free up all RAM. Then reserve - * space for OS image, initrd, phys->machine table, bootstrap page table, - * and the bootmem bitmap. - * NB. There is definitely enough room for the bootmem bitmap in the - * bootstrap page table. We are guaranteed to get >=512kB unused 'padding' - * for our own use after all bootstrap elements - * (see asm-xen/xen-public/xen.h). 
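
The PFN_UP/PFN_DOWN/PFN_PHYS helpers defined above are round-up and round-down conversions between byte addresses and page frame numbers, and the bootmem arithmetic that follows mixes both units freely. A compilable check of the rounding behaviour:

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)
    #define PFN_UP(x)   (((x) + PAGE_SIZE-1) >> PAGE_SHIFT)
    #define PFN_DOWN(x) ((x) >> PAGE_SHIFT)
    #define PFN_PHYS(x) ((x) << PAGE_SHIFT)

    int main(void)
    {
        unsigned long addr = 0x5001;   /* just past a page boundary */

        printf("PFN_DOWN(0x%lx) = %lu\n", addr, PFN_DOWN(addr));   /* 5 */
        printf("PFN_UP(0x%lx)   = %lu\n", addr, PFN_UP(addr));     /* 6 */
        printf("PFN_PHYS(6)     = 0x%lx\n", PFN_PHYS(6UL));        /* 0x6000 */
        return 0;
    }
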
- */ - boot_pfn = min((int)xen_start_info.nr_pages,lmax_low_pfn); - bootmap_size = init_bootmem(start_pfn,boot_pfn); - free_bootmem(0, PFN_PHYS(boot_pfn)); - reserve_bootmem(__pa(&_stext), - PFN_PHYS(start_pfn) + bootmap_size + PAGE_SIZE-1 - - __pa(&_stext)); - - /* init_bootmem() set the global max_low_pfn to boot_pfn. Now max_low_pfn - * can be set to the override value. - */ - max_low_pfn = lmax_low_pfn; - -#ifdef CONFIG_BLK_DEV_INITRD - if ( xen_start_info.mod_start != 0 ) - { - if ( (__pa(xen_start_info.mod_start) + xen_start_info.mod_len) <= - (max_low_pfn << PAGE_SHIFT) ) - { - initrd_start = xen_start_info.mod_start; - initrd_end = initrd_start + xen_start_info.mod_len; - initrd_below_start_ok = 1; - } - else - { - printk(KERN_ERR "initrd extends beyond end of memory " - "(0x%08lx > 0x%08lx)\ndisabling initrd\n", - __pa(xen_start_info.mod_start) + xen_start_info.mod_len, - max_low_pfn << PAGE_SHIFT); - initrd_start = 0; - } - } -#endif - - paging_init(); - - /* Make sure we have a correctly sized P->M table. */ - if ( max_pfn != xen_start_info.nr_pages ) - { - phys_to_machine_mapping = alloc_bootmem_low_pages( - max_pfn * sizeof(unsigned long)); - if ( max_pfn > xen_start_info.nr_pages ) - { - memset(phys_to_machine_mapping, ~0, - max_pfn * sizeof(unsigned long)); - memcpy(phys_to_machine_mapping, - (unsigned long *)xen_start_info.mfn_list, - xen_start_info.nr_pages * sizeof(unsigned long)); - } - else - { - memcpy(phys_to_machine_mapping, - (unsigned long *)xen_start_info.mfn_list, - max_pfn * sizeof(unsigned long)); - if (HYPERVISOR_dom_mem_op( - MEMOP_decrease_reservation, - (unsigned long *)xen_start_info.mfn_list + max_pfn, - xen_start_info.nr_pages - max_pfn, 0) != - (xen_start_info.nr_pages - max_pfn)) - BUG(); - } - free_bootmem(__pa(xen_start_info.mfn_list), - PFN_PHYS(PFN_UP(xen_start_info.nr_pages * - sizeof(unsigned long)))); - } - - pfn_to_mfn_frame_list = alloc_bootmem_low_pages(PAGE_SIZE); - for ( i=0, j=0; i < max_pfn; i+=(PAGE_SIZE/sizeof(unsigned long)), j++ ) - { - pfn_to_mfn_frame_list[j] = - virt_to_machine(&phys_to_machine_mapping[i]) >> PAGE_SHIFT; - } - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list = - virt_to_machine(pfn_to_mfn_frame_list) >> PAGE_SHIFT; - - op.cmd = PHYSDEVOP_SET_IOPL; - op.u.set_iopl.iopl = current->thread.io_pl = 1; - HYPERVISOR_physdev_op(&op); - - if (xen_start_info.flags & SIF_INITDOMAIN ) - { - if( !(xen_start_info.flags & SIF_PRIVILEGED) ) - panic("Xen granted us console access but not privileged status"); - -#if defined(CONFIG_VT) -#if defined(CONFIG_VGA_CONSOLE) - conswitchp = &vga_con; -#elif defined(CONFIG_DUMMY_CONSOLE) - conswitchp = &dummy_con; -#endif -#endif - } -} - -static int cachesize_override __initdata = -1; -static int __init cachesize_setup(char *str) -{ - get_option (&str, &cachesize_override); - return 1; -} -__setup("cachesize=", cachesize_setup); - -static int __init highio_setup(char *str) -{ - printk("i386: disabling HIGHMEM block I/O\n"); - blk_nohighio = 1; - return 1; -} -__setup("nohighio", highio_setup); - -static int __init get_model_name(struct cpuinfo_x86 *c) -{ - unsigned int *v; - char *p, *q; - - if (cpuid_eax(0x80000000) < 0x80000004) - return 0; - - v = (unsigned int *) c->x86_model_id; - cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); - cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); - cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); - c->x86_model_id[48] = 0; - - /* Intel chips right-justify this string for some dumb reason; - undo that brain damage */ - p = q = &c->x86_model_id[0]; - while ( 
*p == ' ' ) - p++; - if ( p != q ) { - while ( *p ) - *q++ = *p++; - while ( q <= &c->x86_model_id[48] ) - *q++ = '\0'; /* Zero-pad the rest */ - } - - return 1; -} - - -static void __init display_cacheinfo(struct cpuinfo_x86 *c) -{ - unsigned int n, dummy, ecx, edx, l2size; - - n = cpuid_eax(0x80000000); - - if (n >= 0x80000005) { - cpuid(0x80000005, &dummy, &dummy, &ecx, &edx); - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); - c->x86_cache_size=(ecx>>24)+(edx>>24); - } - - if (n < 0x80000006) /* Some chips just has a large L1. */ - return; - - ecx = cpuid_ecx(0x80000006); - l2size = ecx >> 16; - - /* AMD errata T13 (order #21922) */ - if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) { - if (c->x86_model == 3 && c->x86_mask == 0) /* Duron Rev A0 */ - l2size = 64; - if (c->x86_model == 4 && - (c->x86_mask==0 || c->x86_mask==1)) /* Tbird rev A1/A2 */ - l2size = 256; - } - - /* Intel PIII Tualatin. This comes in two flavours. - * One has 256kb of cache, the other 512. We have no way - * to determine which, so we use a boottime override - * for the 512kb model, and assume 256 otherwise. - */ - if ((c->x86_vendor == X86_VENDOR_INTEL) && (c->x86 == 6) && - (c->x86_model == 11) && (l2size == 0)) - l2size = 256; - - if (c->x86_vendor == X86_VENDOR_CENTAUR) { - /* VIA C3 CPUs (670-68F) need further shifting. */ - if ((c->x86 == 6) && - ((c->x86_model == 7) || (c->x86_model == 8))) { - l2size >>= 8; - } - - /* VIA also screwed up Nehemiah stepping 1, and made - it return '65KB' instead of '64KB' - - Note, it seems this may only be in engineering samples. */ - if ((c->x86==6) && (c->x86_model==9) && - (c->x86_mask==1) && (l2size==65)) - l2size -= 1; - } - - /* Allow user to override all this if necessary. */ - if (cachesize_override != -1) - l2size = cachesize_override; - - if ( l2size == 0 ) - return; /* Again, no L2 cache is possible */ - - c->x86_cache_size = l2size; - - printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", - l2size, ecx & 0xFF); -} - -static void __init init_c3(struct cpuinfo_x86 *c) -{ - /* Test for Centaur Extended Feature Flags presence */ - if (cpuid_eax(0xC0000000) >= 0xC0000001) { - /* store Centaur Extended Feature Flags as - * word 5 of the CPU capability bit array - */ - c->x86_capability[5] = cpuid_edx(0xC0000001); - } - - switch (c->x86_model) { - case 9: /* Nehemiah */ - default: - get_model_name(c); - display_cacheinfo(c); - break; - } -} - -static void __init init_centaur(struct cpuinfo_x86 *c) -{ - /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; - 3DNow is IDd by bit 31 in extended CPUID (1*3231) anyway */ - clear_bit(0*32+31, &c->x86_capability); - - switch (c->x86) { - case 6: - init_c3(c); - break; - default: - panic("Unsupported Centaur CPU (%i)\n", c->x86); - } -} - -static int __init init_amd(struct cpuinfo_x86 *c) -{ - int r; - - /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; - 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ - clear_bit(0*32+31, &c->x86_capability); - - r = get_model_name(c); - - switch(c->x86) - { - case 5: /* We don't like AMD K6 */ - panic("Unsupported AMD processor\n"); - case 6: /* An Athlon/Duron. 
We can trust the BIOS probably */ - break; - } - - display_cacheinfo(c); - return r; -} - - -static void __init init_intel(struct cpuinfo_x86 *c) -{ - char *p = NULL; - unsigned int l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */ - - if (c->cpuid_level > 1) { - /* supports eax=2 call */ - int i, j, n; - int regs[4]; - unsigned char *dp = (unsigned char *)regs; - - /* Number of times to iterate */ - n = cpuid_eax(2) & 0xFF; - - for ( i = 0 ; i < n ; i++ ) { - cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); - - /* If bit 31 is set, this is an unknown format */ - for ( j = 0 ; j < 3 ; j++ ) { - if ( regs[j] < 0 ) regs[j] = 0; - } - - /* Byte 0 is level count, not a descriptor */ - for ( j = 1 ; j < 16 ; j++ ) { - unsigned char des = dp[j]; - unsigned char dl, dh; - unsigned int cs; - - dh = des >> 4; - dl = des & 0x0F; - - /* Black magic... */ - - switch ( dh ) - { - case 0: - switch ( dl ) { - case 6: - /* L1 I cache */ - l1i += 8; - break; - case 8: - /* L1 I cache */ - l1i += 16; - break; - case 10: - /* L1 D cache */ - l1d += 8; - break; - case 12: - /* L1 D cache */ - l1d += 16; - break; - default:; - /* TLB, or unknown */ - } - break; - case 2: - if ( dl ) { - /* L3 cache */ - cs = (dl-1) << 9; - l3 += cs; - } - break; - case 4: - if ( c->x86 > 6 && dl ) { - /* P4 family */ - /* L3 cache */ - cs = 128 << (dl-1); - l3 += cs; - break; - } - /* else same as 8 - fall through */ - case 8: - if ( dl ) { - /* L2 cache */ - cs = 128 << (dl-1); - l2 += cs; - } - break; - case 6: - if (dl > 5) { - /* L1 D cache */ - cs = 8<<(dl-6); - l1d += cs; - } - break; - case 7: - if ( dl >= 8 ) - { - /* L2 cache */ - cs = 64<<(dl-8); - l2 += cs; - } else { - /* L0 I cache, count as L1 */ - cs = dl ? (16 << (dl-1)) : 12; - l1i += cs; - } - break; - default: - /* TLB, or something else we don't know about */ - break; - } - } - } - if ( l1i || l1d ) - printk(KERN_INFO "CPU: L1 I cache: %dK, L1 D cache: %dK\n", - l1i, l1d); - if ( l2 ) - printk(KERN_INFO "CPU: L2 cache: %dK\n", l2); - if ( l3 ) - printk(KERN_INFO "CPU: L3 cache: %dK\n", l3); - - /* - * This assumes the L3 cache is shared; it typically lives in - * the northbridge. The L1 caches are included by the L2 - * cache, and so should not be included for the purpose of - * SMP switching weights. - */ - c->x86_cache_size = l2 ? l2 : (l1i+l1d); - } - - /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it */ - if ( c->x86 == 6 && c->x86_model < 3 && c->x86_mask < 3 ) - clear_bit(X86_FEATURE_SEP, &c->x86_capability); - - /* Names for the Pentium II/Celeron processors - detectable only by also checking the cache size. - Dixon is NOT a Celeron. 
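
The descriptor walk in init_intel() above unpacks CPUID leaf 2: each of the four registers carries four one-byte cache/TLB descriptors, a set bit 31 marks a register invalid, and byte 0 of EAX is a repeat count rather than a descriptor. A small sketch of that unpacking, assuming a little-endian host and using made-up register values:

    #include <stdio.h>

    int main(void)
    {
        /* Sample CPUID.2 register contents (illustrative only). */
        int regs[4] = { 0x665b5001, 0x00000000, 0x00000000, 0x007a7000 };
        unsigned char *dp = (unsigned char *)regs;

        for (int j = 0; j < 4; j++)
            if (regs[j] < 0)       /* bit 31 set -> unknown format, ignore */
                regs[j] = 0;

        /* Byte 0 of EAX is the iteration count, not a descriptor. */
        for (int j = 1; j < 16; j++)
            if (dp[j])
                printf("descriptor 0x%02x\n", dp[j]);
        return 0;
    }
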
*/ - if (c->x86 == 6) { - switch (c->x86_model) { - case 5: - if (l2 == 0) - p = "Celeron (Covington)"; - if (l2 == 256) - p = "Mobile Pentium II (Dixon)"; - break; - - case 6: - if (l2 == 128) - p = "Celeron (Mendocino)"; - break; - - case 8: - if (l2 == 128) - p = "Celeron (Coppermine)"; - break; - } - } - - if ( p ) - strcpy(c->x86_model_id, p); -} - -void __init get_cpu_vendor(struct cpuinfo_x86 *c) -{ - char *v = c->x86_vendor_id; - - if (!strcmp(v, "GenuineIntel")) - c->x86_vendor = X86_VENDOR_INTEL; - else if (!strcmp(v, "AuthenticAMD")) - c->x86_vendor = X86_VENDOR_AMD; - else if (!strcmp(v, "CentaurHauls")) - c->x86_vendor = X86_VENDOR_CENTAUR; - else - c->x86_vendor = X86_VENDOR_UNKNOWN; -} - -struct cpu_model_info { - int vendor; - int family; - char *model_names[16]; -}; - -/* Naming convention should be: <Name> [(<Codename>)] */ -/* This table only is used unless init_<vendor>() below doesn't set it; */ -/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */ -static struct cpu_model_info cpu_models[] __initdata = { - { X86_VENDOR_INTEL, 6, - { "Pentium Pro A-step", "Pentium Pro", NULL, "Pentium II (Klamath)", - NULL, "Pentium II (Deschutes)", "Mobile Pentium II", - "Pentium III (Katmai)", "Pentium III (Coppermine)", NULL, - "Pentium III (Cascades)", NULL, NULL, NULL, NULL }}, - { X86_VENDOR_AMD, 6, /* Is this this really necessary?? */ - { "Athlon", "Athlon", - "Athlon", NULL, "Athlon", NULL, - NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL }} -}; - -/* Look up CPU names by table lookup. */ -static char __init *table_lookup_model(struct cpuinfo_x86 *c) -{ - struct cpu_model_info *info = cpu_models; - int i; - - if ( c->x86_model >= 16 ) - return NULL; /* Range check */ - - for ( i = 0 ; i < sizeof(cpu_models)/sizeof(struct cpu_model_info) ; i++ ) { - if ( info->vendor == c->x86_vendor && - info->family == c->x86 ) { - return info->model_names[c->x86_model]; - } - info++; - } - return NULL; /* Not found */ -} - - - -/* Standard macro to see if a specific flag is changeable */ -static inline int flag_is_changeable_p(u32 flag) -{ - u32 f1, f2; - - asm("pushfl\n\t" - "pushfl\n\t" - "popl %0\n\t" - "movl %0,%1\n\t" - "xorl %2,%0\n\t" - "pushl %0\n\t" - "popfl\n\t" - "pushfl\n\t" - "popl %0\n\t" - "popfl\n\t" - : "=&r" (f1), "=&r" (f2) - : "ir" (flag)); - - return ((f1^f2) & flag) != 0; -} - - -/* Probe for the CPUID instruction */ -static int __init have_cpuid_p(void) -{ - return flag_is_changeable_p(X86_EFLAGS_ID); -} - - - -#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) -unsigned char eddnr; -struct edd_info edd[EDDMAXNR]; -unsigned int edd_disk80_sig; -/** - * copy_edd() - Copy the BIOS EDD information - * from empty_zero_page into a safe place. - * - */ -static inline void copy_edd(void) -{ - eddnr = EDD_NR; - memcpy(edd, EDD_BUF, sizeof(edd)); - edd_disk80_sig = DISK80_SIGNATURE_BUFFER; -} -#else -static inline void copy_edd(void) {} -#endif - -/* - * This does the hard work of actually picking apart the CPU stuff... - */ -void __init identify_cpu(struct cpuinfo_x86 *c) -{ - int junk, i; - u32 xlvl, tfms; - - c->loops_per_jiffy = loops_per_jiffy; - c->x86_cache_size = -1; - c->x86_vendor = X86_VENDOR_UNKNOWN; - c->cpuid_level = -1; /* CPUID not detected */ - c->x86_model = c->x86_mask = 0; /* So far unknown... 
*/ - c->x86_vendor_id[0] = '\0'; /* Unset */ - c->x86_model_id[0] = '\0'; /* Unset */ - memset(&c->x86_capability, 0, sizeof c->x86_capability); - c->hard_math = 1; - - if ( !have_cpuid_p() ) { - panic("Processor must support CPUID\n"); - } else { - /* CPU does have CPUID */ - - /* Get vendor name */ - cpuid(0x00000000, &c->cpuid_level, - (int *)&c->x86_vendor_id[0], - (int *)&c->x86_vendor_id[8], - (int *)&c->x86_vendor_id[4]); - - get_cpu_vendor(c); - /* Initialize the standard set of capabilities */ - /* Note that the vendor-specific code below might override */ - - /* Intel-defined flags: level 0x00000001 */ - if ( c->cpuid_level >= 0x00000001 ) { - u32 capability, excap; - cpuid(0x00000001, &tfms, &junk, &excap, &capability); - c->x86_capability[0] = capability; - c->x86_capability[4] = excap; - c->x86 = (tfms >> 8) & 15; - c->x86_model = (tfms >> 4) & 15; - if (c->x86 == 0xf) { - c->x86 += (tfms >> 20) & 0xff; - c->x86_model += ((tfms >> 16) & 0xF) << 4; - } - c->x86_mask = tfms & 15; - } else { - /* Have CPUID level 0 only - unheard of */ - c->x86 = 4; - } - - /* AMD-defined flags: level 0x80000001 */ - xlvl = cpuid_eax(0x80000000); - if ( (xlvl & 0xffff0000) == 0x80000000 ) { - if ( xlvl >= 0x80000001 ) - c->x86_capability[1] = cpuid_edx(0x80000001); - if ( xlvl >= 0x80000004 ) - get_model_name(c); /* Default name */ - } - - /* Transmeta-defined flags: level 0x80860001 */ - xlvl = cpuid_eax(0x80860000); - if ( (xlvl & 0xffff0000) == 0x80860000 ) { - if ( xlvl >= 0x80860001 ) - c->x86_capability[2] = cpuid_edx(0x80860001); - } - } - - printk(KERN_DEBUG "CPU: Before vendor init, caps: %08x %08x %08x, vendor = %d\n", - c->x86_capability[0], - c->x86_capability[1], - c->x86_capability[2], - c->x86_vendor); - - /* - * Vendor-specific initialization. In this section we - * canonicalize the feature flags, meaning if there are - * features a certain CPU supports which CPUID doesn't - * tell us, CPUID claiming incorrect flags, or other bugs, - * we handle them here. - * - * At the end of this section, c->x86_capability better - * indicate the features this CPU genuinely supports! - */ - switch ( c->x86_vendor ) { - case X86_VENDOR_AMD: - init_amd(c); - break; - - case X86_VENDOR_INTEL: - init_intel(c); - break; - - case X86_VENDOR_CENTAUR: - init_centaur(c); - break; - - default: - printk("Unsupported CPU vendor (%d) -- please report!\n", - c->x86_vendor); - } - - printk(KERN_DEBUG "CPU: After vendor init, caps: %08x %08x %08x %08x\n", - c->x86_capability[0], - c->x86_capability[1], - c->x86_capability[2], - c->x86_capability[3]); - - - /* If the model name is still unset, do table lookup. */ - if ( !c->x86_model_id[0] ) { - char *p; - p = table_lookup_model(c); - if ( p ) - strcpy(c->x86_model_id, p); - else - /* Last resort... */ - sprintf(c->x86_model_id, "%02x/%02x", - c->x86_vendor, c->x86_model); - } - - /* Now the feature flags better reflect actual CPU features! */ - - printk(KERN_DEBUG "CPU: After generic, caps: %08x %08x %08x %08x\n", - c->x86_capability[0], - c->x86_capability[1], - c->x86_capability[2], - c->x86_capability[3]); - - /* - * On SMP, boot_cpu_data holds the common feature set between - * all CPUs; so make sure that we indicate which features are - * common between the CPUs. The first time this routine gets - * executed, c == &boot_cpu_data. 
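
identify_cpu() above splits CPUID.1 EAX ("tfms") into family/model/stepping nibbles, folding in the extended family and model fields only when the base family is 0xf. Worked through for a sample value:

    #include <stdio.h>

    int main(void)
    {
        unsigned int tfms   = 0x00000F29;   /* sample CPUID.1 EAX (a Pentium 4) */
        unsigned int family = (tfms >> 8) & 15;
        unsigned int model  = (tfms >> 4) & 15;
        unsigned int step   = tfms & 15;

        if (family == 0xf) {                /* extended fields, as in identify_cpu() */
            family += (tfms >> 20) & 0xff;
            model  += ((tfms >> 16) & 0xf) << 4;
        }
        printf("family %u, model %u, stepping %u\n", family, model, step);
        /* prints: family 15, model 2, stepping 9 */
        return 0;
    }
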
- */ - if ( c != &boot_cpu_data ) { - /* AND the already accumulated flags with these */ - for ( i = 0 ; i < NCAPINTS ; i++ ) - boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; - } - - printk(KERN_DEBUG "CPU: Common caps: %08x %08x %08x %08x\n", - boot_cpu_data.x86_capability[0], - boot_cpu_data.x86_capability[1], - boot_cpu_data.x86_capability[2], - boot_cpu_data.x86_capability[3]); -} - - -/* These need to match <asm/processor.h> */ -static char *cpu_vendor_names[] __initdata = { - "Intel", "Cyrix", "AMD", "UMC", "NexGen", "Centaur", "Rise", "Transmeta" }; - - -void __init print_cpu_info(struct cpuinfo_x86 *c) -{ - char *vendor = NULL; - - if (c->x86_vendor < sizeof(cpu_vendor_names)/sizeof(char *)) - vendor = cpu_vendor_names[c->x86_vendor]; - else if (c->cpuid_level >= 0) - vendor = c->x86_vendor_id; - - if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor))) - printk("%s ", vendor); - - if (!c->x86_model_id[0]) - printk("%d86", c->x86); - else - printk("%s", c->x86_model_id); - - if (c->x86_mask || c->cpuid_level >= 0) - printk(" stepping %02x\n", c->x86_mask); - else - printk("\n"); -} - -/* - * Get CPU information for use by the procfs. - */ -static int show_cpuinfo(struct seq_file *m, void *v) -{ - /* - * These flag bits must match the definitions in <asm/cpufeature.h>. - * NULL means this bit is undefined or reserved; either way it doesn't - * have meaning as far as Linux is concerned. Note that it's important - * to realize there is a difference between this table and CPUID -- if - * applications want to get the raw CPUID data, they should access - * /dev/cpu/<cpu_nr>/cpuid instead. - */ - static char *x86_cap_flags[] = { - /* Intel-defined */ - "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", - "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", - "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", - "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", - - /* AMD-defined */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, "mp", NULL, NULL, "mmxext", NULL, - NULL, NULL, NULL, NULL, NULL, "lm", "3dnowext", "3dnow", - - /* Transmeta-defined */ - "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Other (Linux-defined) */ - "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", - NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Intel-defined (#2) */ - "pni", NULL, NULL, "monitor", "ds_cpl", NULL, NULL, "tm2", - "est", NULL, "cid", NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* VIA/Cyrix/Centaur-defined */ - NULL, NULL, "xstore", NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - }; - struct cpuinfo_x86 *c = v; - int i, n = c - cpu_data; - int fpu_exception; - -#ifdef CONFIG_SMP - if (!(cpu_online_map & (1<<n))) - return 0; -#endif - seq_printf(m, "processor\t: %d\n" - "vendor_id\t: %s\n" - "cpu family\t: %d\n" - "model\t\t: %d\n" - "model name\t: %s\n", - n, - c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", - c->x86, - c->x86_model, - c->x86_model_id[0] ? 
c->x86_model_id : "unknown"); - - if (c->x86_mask || c->cpuid_level >= 0) - seq_printf(m, "stepping\t: %d\n", c->x86_mask); - else - seq_printf(m, "stepping\t: unknown\n"); - - if ( test_bit(X86_FEATURE_TSC, &c->x86_capability) ) { - seq_printf(m, "cpu MHz\t\t: %lu.%03lu\n", - cpu_khz / 1000, (cpu_khz % 1000)); - } - - /* Cache size */ - if (c->x86_cache_size >= 0) - seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); - - /* We use exception 16 if we have hardware math and we've either seen it or the CPU claims it is internal */ - fpu_exception = c->hard_math && (ignore_irq13 || cpu_has_fpu); - seq_printf(m, "fdiv_bug\t: %s\n" - "hlt_bug\t\t: %s\n" - "f00f_bug\t: %s\n" - "coma_bug\t: %s\n" - "fpu\t\t: %s\n" - "fpu_exception\t: %s\n" - "cpuid level\t: %d\n" - "wp\t\t: %s\n" - "flags\t\t:", - c->fdiv_bug ? "yes" : "no", - c->hlt_works_ok ? "no" : "yes", - c->f00f_bug ? "yes" : "no", - c->coma_bug ? "yes" : "no", - c->hard_math ? "yes" : "no", - fpu_exception ? "yes" : "no", - c->cpuid_level, - c->wp_works_ok ? "yes" : "no"); - - for ( i = 0 ; i < 32*NCAPINTS ; i++ ) - if ( test_bit(i, &c->x86_capability) && - x86_cap_flags[i] != NULL ) - seq_printf(m, " %s", x86_cap_flags[i]); - - seq_printf(m, "\nbogomips\t: %lu.%02lu\n\n", - c->loops_per_jiffy/(500000/HZ), - (c->loops_per_jiffy/(5000/HZ)) % 100); - return 0; -} - -static void *c_start(struct seq_file *m, loff_t *pos) -{ - return *pos < NR_CPUS ? cpu_data + *pos : NULL; -} -static void *c_next(struct seq_file *m, void *v, loff_t *pos) -{ - ++*pos; - return c_start(m, pos); -} -static void c_stop(struct seq_file *m, void *v) -{ -} -struct seq_operations cpuinfo_op = { - start: c_start, - next: c_next, - stop: c_stop, - show: show_cpuinfo, -}; - -unsigned long cpu_initialized __initdata = 0; - -/* - * cpu_init() initializes state that is per-CPU. Some data is already - * initialized (naturally) in the bootstrap process, such as the GDT - * and IDT. We reload them nevertheless, this function acts as a - * 'CPU state barrier', nothing should get across. - */ -void __init cpu_init (void) -{ - int nr = smp_processor_id(); - - if (test_and_set_bit(nr, &cpu_initialized)) { - printk(KERN_WARNING "CPU#%d already initialized!\n", nr); - for (;;) __sti(); - } - printk(KERN_INFO "Initializing CPU#%d\n", nr); - - /* - * set up and load the per-CPU TSS and LDT - */ - atomic_inc(&init_mm.mm_count); - current->active_mm = &init_mm; - if(current->mm) - BUG(); - enter_lazy_tlb(&init_mm, current, nr); - - HYPERVISOR_stack_switch(__KERNEL_DS, current->thread.esp0); - - load_LDT(&init_mm.context); - - /* Force FPU initialization. 
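
The bogomips line in show_cpuinfo() above prints loops_per_jiffy*HZ/500000 to two decimal places without touching floating point; two integer divisions do the whole job. For example, with a sample loops_per_jiffy:

    #include <stdio.h>

    #define HZ 100

    int main(void)
    {
        unsigned long loops_per_jiffy = 2494321;   /* sample value */

        /* Same fixed-point trick as show_cpuinfo(). */
        printf("bogomips: %lu.%02lu\n",
               loops_per_jiffy / (500000 / HZ),
               (loops_per_jiffy / (5000 / HZ)) % 100);   /* 498.86 */
        return 0;
    }
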
*/ - current->flags &= ~PF_USEDFPU; - current->used_math = 0; - stts(); -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/kernel/signal.c --- a/linux-2.4.30-xen-sparse/arch/xen/kernel/signal.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,717 +0,0 @@ -/* - * linux/arch/i386/kernel/signal.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson - * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes - */ - -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/smp_lock.h> -#include <linux/kernel.h> -#include <linux/signal.h> -#include <linux/errno.h> -#include <linux/wait.h> -#include <linux/ptrace.h> -#include <linux/unistd.h> -#include <linux/stddef.h> -#include <linux/tty.h> -#include <linux/personality.h> -#include <asm/ucontext.h> -#include <asm/uaccess.h> -#include <asm/i387.h> - -#define DEBUG_SIG 0 - -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - -int FASTCALL(do_signal(struct pt_regs *regs, sigset_t *oldset)); - -int copy_siginfo_to_user(siginfo_t *to, siginfo_t *from) -{ - if (!access_ok (VERIFY_WRITE, to, sizeof(siginfo_t))) - return -EFAULT; - if (from->si_code < 0) - return __copy_to_user(to, from, sizeof(siginfo_t)); - else { - int err; - - /* If you change siginfo_t structure, please be sure - this code is fixed accordingly. - It should never copy any pad contained in the structure - to avoid security leaks, but must copy the generic - 3 ints plus the relevant union member. */ - err = __put_user(from->si_signo, &to->si_signo); - err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user((short)from->si_code, &to->si_code); - /* First 32bits of unions are always present. */ - err |= __put_user(from->si_pid, &to->si_pid); - switch (from->si_code >> 16) { - case __SI_FAULT >> 16: - break; - case __SI_CHLD >> 16: - err |= __put_user(from->si_utime, &to->si_utime); - err |= __put_user(from->si_stime, &to->si_stime); - err |= __put_user(from->si_status, &to->si_status); - default: - err |= __put_user(from->si_uid, &to->si_uid); - break; - /* case __SI_RT: This is not generated by the kernel as of now. */ - } - return err; - } -} - -/* - * Atomically swap in the new signal mask, and wait for a signal. - */ -asmlinkage int -sys_sigsuspend(int history0, int history1, old_sigset_t mask) -{ - struct pt_regs * regs = (struct pt_regs *) &history0; - sigset_t saveset; - - mask &= _BLOCKABLE; - spin_lock_irq(¤t->sigmask_lock); - saveset = current->blocked; - siginitset(¤t->blocked, mask); - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); - - regs->eax = -EINTR; - while (1) { - current->state = TASK_INTERRUPTIBLE; - schedule(); - if (do_signal(regs, &saveset)) - return -EINTR; - } -} - -asmlinkage int -sys_rt_sigsuspend(sigset_t *unewset, size_t sigsetsize) -{ - struct pt_regs * regs = (struct pt_regs *) &unewset; - sigset_t saveset, newset; - - /* XXX: Don't preclude handling different sized sigset_t's. 
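
sys_sigsuspend() above implements the classic atomic swap-mask-and-wait primitive: the new mask is installed and the task goes to sleep in one step, so a signal cannot slip through between the two. The canonical user-space pattern it exists to support:

    #include <signal.h>
    #include <stdio.h>
    #include <sys/types.h>
    #include <unistd.h>

    static volatile sig_atomic_t got_usr1;

    static void handler(int sig) { (void)sig; got_usr1 = 1; }

    int main(void)
    {
        sigset_t block, old;

        signal(SIGUSR1, handler);
        sigemptyset(&block);
        sigaddset(&block, SIGUSR1);
        sigprocmask(SIG_BLOCK, &block, &old);   /* SIGUSR1 now held pending */

        kill(getpid(), SIGUSR1);                /* delivered only once unblocked */
        sigsuspend(&old);                       /* atomic: restore old mask + sleep */

        printf("back from sigsuspend, got_usr1 = %d\n", (int)got_usr1);
        return 0;
    }
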
*/ - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - - if (copy_from_user(&newset, unewset, sizeof(newset))) - return -EFAULT; - sigdelsetmask(&newset, ~_BLOCKABLE); - - spin_lock_irq(¤t->sigmask_lock); - saveset = current->blocked; - current->blocked = newset; - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); - - regs->eax = -EINTR; - while (1) { - current->state = TASK_INTERRUPTIBLE; - schedule(); - if (do_signal(regs, &saveset)) - return -EINTR; - } -} - -asmlinkage int -sys_sigaction(int sig, const struct old_sigaction *act, - struct old_sigaction *oact) -{ - struct k_sigaction new_ka, old_ka; - int ret; - - if (act) { - old_sigset_t mask; - if (verify_area(VERIFY_READ, act, sizeof(*act)) || - __get_user(new_ka.sa.sa_handler, &act->sa_handler) || - __get_user(new_ka.sa.sa_restorer, &act->sa_restorer)) - return -EFAULT; - __get_user(new_ka.sa.sa_flags, &act->sa_flags); - __get_user(mask, &act->sa_mask); - siginitset(&new_ka.sa.sa_mask, mask); - } - - ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); - - if (!ret && oact) { - if (verify_area(VERIFY_WRITE, oact, sizeof(*oact)) || - __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || - __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer)) - return -EFAULT; - __put_user(old_ka.sa.sa_flags, &oact->sa_flags); - __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); - } - - return ret; -} - -asmlinkage int -sys_sigaltstack(const stack_t *uss, stack_t *uoss) -{ - struct pt_regs *regs = (struct pt_regs *) &uss; - return do_sigaltstack(uss, uoss, regs->esp); -} - - -/* - * Do a signal return; undo the signal stack. - */ - -struct sigframe -{ - char *pretcode; - int sig; - struct sigcontext sc; - struct _fpstate fpstate; - unsigned long extramask[_NSIG_WORDS-1]; - char retcode[8]; -}; - -struct rt_sigframe -{ - char *pretcode; - int sig; - struct siginfo *pinfo; - void *puc; - struct siginfo info; - struct ucontext uc; - struct _fpstate fpstate; - char retcode[8]; -}; - -static int -restore_sigcontext(struct pt_regs *regs, struct sigcontext *sc, int *peax) -{ - unsigned int err = 0; - -#define COPY(x) err |= __get_user(regs->x, &sc->x) - -#define COPY_SEG(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - regs->x##seg = tmp; } - -#define COPY_SEG_STRICT(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - regs->x##seg = tmp|3; } - -#define GET_SEG(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - loadsegment(seg,tmp); } - - GET_SEG(gs); - GET_SEG(fs); - COPY_SEG(es); - COPY_SEG(ds); - COPY(edi); - COPY(esi); - COPY(ebp); - COPY(esp); - COPY(ebx); - COPY(edx); - COPY(ecx); - COPY(eip); - COPY_SEG_STRICT(cs); - COPY_SEG_STRICT(ss); - - { - unsigned int tmpflags; - err |= __get_user(tmpflags, &sc->eflags); - regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); - regs->orig_eax = -1; /* disable syscall checks */ - } - - { - struct _fpstate * buf; - err |= __get_user(buf, &sc->fpstate); - if (buf) { - if (verify_area(VERIFY_READ, buf, sizeof(*buf))) - goto badframe; - err |= restore_i387(buf); - } - } - - err |= __get_user(*peax, &sc->eax); - return err; - -badframe: - return 1; -} - -asmlinkage int sys_sigreturn(unsigned long __unused) -{ - struct pt_regs *regs = (struct pt_regs *) &__unused; - struct sigframe *frame = (struct sigframe *)(regs->esp - 8); - sigset_t set; - int eax; - - if (verify_area(VERIFY_READ, frame, sizeof(*frame))) - goto badframe; - if (__get_user(set.sig[0], &frame->sc.oldmask) - || (_NSIG_WORDS > 1 - && 
__copy_from_user(&set.sig[1], &frame->extramask, - sizeof(frame->extramask)))) - goto badframe; - - sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(¤t->sigmask_lock); - current->blocked = set; - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); - - if (restore_sigcontext(regs, &frame->sc, &eax)) - goto badframe; - return eax; - -badframe: - force_sig(SIGSEGV, current); - return 0; -} - -asmlinkage int sys_rt_sigreturn(unsigned long __unused) -{ - struct pt_regs *regs = (struct pt_regs *) &__unused; - struct rt_sigframe *frame = (struct rt_sigframe *)(regs->esp - 4); - sigset_t set; - stack_t st; - int eax; - - if (verify_area(VERIFY_READ, frame, sizeof(*frame))) - goto badframe; - if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) - goto badframe; - - sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(¤t->sigmask_lock); - current->blocked = set; - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); - - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) - goto badframe; - - if (__copy_from_user(&st, &frame->uc.uc_stack, sizeof(st))) - goto badframe; - /* It is more difficult to avoid calling this function than to - call it and ignore errors. */ - do_sigaltstack(&st, NULL, regs->esp); - - return eax; - -badframe: - force_sig(SIGSEGV, current); - return 0; -} - -/* - * Set up a signal frame. - */ - -static int -setup_sigcontext(struct sigcontext *sc, struct _fpstate *fpstate, - struct pt_regs *regs, unsigned long mask) -{ - int tmp, err = 0; - - tmp = 0; - __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp)); - err |= __put_user(tmp, (unsigned int *)&sc->gs); - __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp)); - err |= __put_user(tmp, (unsigned int *)&sc->fs); - - err |= __put_user(regs->xes, (unsigned int *)&sc->es); - err |= __put_user(regs->xds, (unsigned int *)&sc->ds); - err |= __put_user(regs->edi, &sc->edi); - err |= __put_user(regs->esi, &sc->esi); - err |= __put_user(regs->ebp, &sc->ebp); - err |= __put_user(regs->esp, &sc->esp); - err |= __put_user(regs->ebx, &sc->ebx); - err |= __put_user(regs->edx, &sc->edx); - err |= __put_user(regs->ecx, &sc->ecx); - err |= __put_user(regs->eax, &sc->eax); - err |= __put_user(current->thread.trap_no, &sc->trapno); - err |= __put_user(current->thread.error_code, &sc->err); - err |= __put_user(regs->eip, &sc->eip); - err |= __put_user(regs->xcs, (unsigned int *)&sc->cs); - err |= __put_user(regs->eflags, &sc->eflags); - err |= __put_user(regs->esp, &sc->esp_at_signal); - err |= __put_user(regs->xss, (unsigned int *)&sc->ss); - - tmp = save_i387(fpstate); - if (tmp < 0) - err = 1; - else - err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate); - - /* non-iBCS2 extensions.. */ - err |= __put_user(mask, &sc->oldmask); - err |= __put_user(current->thread.cr2, &sc->cr2); - - return err; -} - -/* - * Determine which stack to use.. - */ -static inline void * -get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) -{ - unsigned long esp; - - /* Default to using normal stack */ - esp = regs->esp; - - /* This is the X/Open sanctioned signal stack switching. */ - if (ka->sa.sa_flags & SA_ONSTACK) { - if (sas_ss_flags(esp) == 0) - esp = current->sas_ss_sp + current->sas_ss_size; - } - - /* This is the legacy signal stack switching. 
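
get_sigframe() above picks where the signal frame goes: normally just below the interrupted %esp, but on the sigaltstack-registered stack when the handler was installed with SA_ONSTACK and we are not already on it. Exercising that from user space (printf in a handler is not async-signal-safe in general, but the signal here is raised synchronously):

    #include <signal.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static stack_t ss;

    static void handler(int sig)
    {
        char probe;
        int on_alt = (char *)&probe >= (char *)ss.ss_sp &&
                     (char *)&probe <  (char *)ss.ss_sp + ss.ss_size;
        printf("signal %d on alternate stack: %s\n", sig, on_alt ? "yes" : "no");
    }

    int main(void)
    {
        struct sigaction sa;

        ss.ss_sp = malloc(SIGSTKSZ);
        ss.ss_size = SIGSTKSZ;
        ss.ss_flags = 0;
        sigaltstack(&ss, NULL);

        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = handler;
        sa.sa_flags = SA_ONSTACK;      /* the flag get_sigframe() tests */
        sigaction(SIGUSR1, &sa, NULL);

        raise(SIGUSR1);                /* handler reports where it ran */
        return 0;
    }
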
*/ - else if ((regs->xss & 0xffff) != __USER_DS && - !(ka->sa.sa_flags & SA_RESTORER) && - ka->sa.sa_restorer) { - esp = (unsigned long) ka->sa.sa_restorer; - } - - return (void *)((esp - frame_size) & -8ul); -} - -static void setup_frame(int sig, struct k_sigaction *ka, - sigset_t *set, struct pt_regs * regs) -{ - struct sigframe *frame; - int err = 0; - - frame = get_sigframe(ka, regs, sizeof(*frame)); - - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; - - err |= __put_user((current->exec_domain - && current->exec_domain->signal_invmap - && sig < 32 - ? current->exec_domain->signal_invmap[sig] - : sig), - &frame->sig); - if (err) - goto give_sigsegv; - - err |= setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]); - if (err) - goto give_sigsegv; - - if (_NSIG_WORDS > 1) { - err |= __copy_to_user(frame->extramask, &set->sig[1], - sizeof(frame->extramask)); - } - if (err) - goto give_sigsegv; - - /* Set up to return from userspace. If provided, use a stub - already in userspace. */ - if (ka->sa.sa_flags & SA_RESTORER) { - err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); - } else { - err |= __put_user(frame->retcode, &frame->pretcode); - /* This is popl %eax ; movl $,%eax ; int $0x80 */ - err |= __put_user(0xb858, (short *)(frame->retcode+0)); - err |= __put_user(__NR_sigreturn, (int *)(frame->retcode+2)); - err |= __put_user(0x80cd, (short *)(frame->retcode+6)); - } - - if (err) - goto give_sigsegv; - - /* Set up registers for signal handler */ - regs->esp = (unsigned long) frame; - regs->eip = (unsigned long) ka->sa.sa_handler; - - set_fs(USER_DS); - regs->xds = __USER_DS; - regs->xes = __USER_DS; - regs->xss = __USER_DS; - regs->xcs = __USER_CS; - regs->eflags &= ~TF_MASK; - -#if DEBUG_SIG - printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", - current->comm, current->pid, frame, regs->eip, frame->pretcode); -#endif - - return; - -give_sigsegv: - if (sig == SIGSEGV) - ka->sa.sa_handler = SIG_DFL; - force_sig(SIGSEGV, current); -} - -static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs * regs) -{ - struct rt_sigframe *frame; - int err = 0; - - frame = get_sigframe(ka, regs, sizeof(*frame)); - - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; - - err |= __put_user((current->exec_domain - && current->exec_domain->signal_invmap - && sig < 32 - ? current->exec_domain->signal_invmap[sig] - : sig), - &frame->sig); - err |= __put_user(&frame->info, &frame->pinfo); - err |= __put_user(&frame->uc, &frame->puc); - err |= copy_siginfo_to_user(&frame->info, info); - if (err) - goto give_sigsegv; - - /* Create the ucontext. */ - err |= __put_user(0, &frame->uc.uc_flags); - err |= __put_user(0, &frame->uc.uc_link); - err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(regs->esp), - &frame->uc.uc_stack.ss_flags); - err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, - regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - if (err) - goto give_sigsegv; - - /* Set up to return from userspace. If provided, use a stub - already in userspace. 
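
The magic numbers in setup_frame() above are a hand-assembled return trampoline: on little-endian x86 the short 0xb858 stores as bytes 58 b8, i.e. popl %eax followed by the opcode of movl $imm32,%eax; the following int is the syscall number; and 0x80cd is cd 80, int $0x80. Reconstructing the eight bytes (__NR_sigreturn is 119 on i386):

    #include <stdio.h>
    #include <string.h>

    #define __NR_sigreturn 119

    int main(void)
    {
        unsigned char retcode[8];
        unsigned short w1 = 0xb858, w2 = 0x80cd;
        int nr = __NR_sigreturn;

        /* Same stores as setup_frame(), little-endian layout assumed. */
        memcpy(retcode + 0, &w1, 2);   /* popl %eax ; movl $..,%eax */
        memcpy(retcode + 2, &nr, 4);   /* ..the immediate: syscall number */
        memcpy(retcode + 6, &w2, 2);   /* int $0x80 */

        for (int i = 0; i < 8; i++)
            printf("%02x ", retcode[i]);
        printf("\n");                  /* 58 b8 77 00 00 00 cd 80 */
        return 0;
    }
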
*/ - if (ka->sa.sa_flags & SA_RESTORER) { - err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); - } else { - err |= __put_user(frame->retcode, &frame->pretcode); - /* This is movl $,%eax ; int $0x80 */ - err |= __put_user(0xb8, (char *)(frame->retcode+0)); - err |= __put_user(__NR_rt_sigreturn, (int *)(frame->retcode+1)); - err |= __put_user(0x80cd, (short *)(frame->retcode+5)); - } - - if (err) - goto give_sigsegv; - - /* Set up registers for signal handler */ - regs->esp = (unsigned long) frame; - regs->eip = (unsigned long) ka->sa.sa_handler; - - set_fs(USER_DS); - regs->xds = __USER_DS; - regs->xes = __USER_DS; - regs->xss = __USER_DS; - regs->xcs = __USER_CS; - regs->eflags &= ~TF_MASK; - -#if DEBUG_SIG - printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", - current->comm, current->pid, frame, regs->eip, frame->pretcode); -#endif - - return; - -give_sigsegv: - if (sig == SIGSEGV) - ka->sa.sa_handler = SIG_DFL; - force_sig(SIGSEGV, current); -} - -/* - * OK, we're invoking a handler - */ - -static void -handle_signal(unsigned long sig, struct k_sigaction *ka, - siginfo_t *info, sigset_t *oldset, struct pt_regs * regs) -{ - /* Are we from a system call? */ - if (regs->orig_eax >= 0) { - /* If so, check system call restarting.. */ - switch (regs->eax) { - case -ERESTARTNOHAND: - regs->eax = -EINTR; - break; - - case -ERESTARTSYS: - if (!(ka->sa.sa_flags & SA_RESTART)) { - regs->eax = -EINTR; - break; - } - /* fallthrough */ - case -ERESTARTNOINTR: - regs->eax = regs->orig_eax; - regs->eip -= 2; - } - } - - /* Set up the stack frame */ - if (ka->sa.sa_flags & SA_SIGINFO) - setup_rt_frame(sig, ka, info, oldset, regs); - else - setup_frame(sig, ka, oldset, regs); - - if (ka->sa.sa_flags & SA_ONESHOT) - ka->sa.sa_handler = SIG_DFL; - - if (!(ka->sa.sa_flags & SA_NODEFER)) { - spin_lock_irq(¤t->sigmask_lock); - sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); - sigaddset(¤t->blocked,sig); - recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); - } -} - -/* - * Note that 'init' is a special process: it doesn't get signals it doesn't - * want to handle. Thus you cannot kill init even with a SIGKILL even by - * mistake. - */ -int fastcall do_signal(struct pt_regs *regs, sigset_t *oldset) -{ - siginfo_t info; - struct k_sigaction *ka; - - /* - * We want the common case to go fast, which - * is why we may in certain cases get here from - * kernel mode. Just return without doing anything - * if so. - */ - if ((regs->xcs & 2) != 2) - return 1; - - if (!oldset) - oldset = ¤t->blocked; - - for (;;) { - unsigned long signr; - - spin_lock_irq(¤t->sigmask_lock); - signr = dequeue_signal(¤t->blocked, &info); - spin_unlock_irq(¤t->sigmask_lock); - - if (!signr) - break; - - if ((current->ptrace & PT_PTRACED) && signr != SIGKILL) { - /* Let the debugger run. */ - current->exit_code = signr; - current->state = TASK_STOPPED; - notify_parent(current, SIGCHLD); - schedule(); - - /* We're back. Did the debugger cancel the sig? */ - if (!(signr = current->exit_code)) - continue; - current->exit_code = 0; - - /* The debugger continued. Ignore SIGSTOP. */ - if (signr == SIGSTOP) - continue; - - /* Update the siginfo structure. Is this good? */ - if (signr != info.si_signo) { - info.si_signo = signr; - info.si_errno = 0; - info.si_code = SI_USER; - info.si_pid = current->p_pptr->pid; - info.si_uid = current->p_pptr->uid; - } - - /* If the (new) signal is now blocked, requeue it. 
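
In handle_signal() above, regs->eip -= 2 backs the saved program counter up over the two-byte int $0x80 (cd 80) so the system call re-executes on return; whether -ERESTARTSYS gets that treatment depends on the handler's SA_RESTART flag. The user-visible half of that decision:

    #include <errno.h>
    #include <signal.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    static void handler(int sig) { (void)sig; }

    int main(void)
    {
        struct sigaction sa;
        char buf[1];
        int pfd[2];

        pipe(pfd);                   /* nothing will ever be written */
        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = handler;     /* no SA_RESTART: read() fails with EINTR */
        sigaction(SIGALRM, &sa, NULL);

        alarm(1);
        if (read(pfd[0], buf, 1) < 0 && errno == EINTR)
            printf("read() interrupted: kernel chose -EINTR over restart\n");
        return 0;
    }
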
*/ - if (sigismember(¤t->blocked, signr)) { - send_sig_info(signr, &info, current); - continue; - } - } - - ka = ¤t->sig->action[signr-1]; - if (ka->sa.sa_handler == SIG_IGN) { - if (signr != SIGCHLD) - continue; - /* Check for SIGCHLD: it's special. */ - while (sys_wait4(-1, NULL, WNOHANG, NULL) > 0) - /* nothing */; - continue; - } - - if (ka->sa.sa_handler == SIG_DFL) { - int exit_code = signr; - - /* Init gets no signals it doesn't want. */ - if (current->pid == 1) - continue; - - switch (signr) { - case SIGCONT: case SIGCHLD: case SIGWINCH: case SIGURG: - continue; - - case SIGTSTP: case SIGTTIN: case SIGTTOU: - if (is_orphaned_pgrp(current->pgrp)) - continue; - /* FALLTHRU */ - - case SIGSTOP: { - struct signal_struct *sig; - current->state = TASK_STOPPED; - current->exit_code = signr; - sig = current->p_pptr->sig; - if (sig && !(sig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP)) - notify_parent(current, SIGCHLD); - schedule(); - continue; - } - - case SIGQUIT: case SIGILL: case SIGTRAP: - case SIGABRT: case SIGFPE: case SIGSEGV: - case SIGBUS: case SIGSYS: case SIGXCPU: case SIGXFSZ: - if (do_coredump(signr, regs)) - exit_code |= 0x80; - /* FALLTHRU */ - - default: - sig_exit(signr, exit_code, &info); - /* NOTREACHED */ - } - } - - /* Reenable any watchpoints before delivering the - * signal to user space. The processor register will - * have been cleared if the watchpoint triggered - * inside the kernel. - */ - if ( current->thread.debugreg[7] != 0 ) - HYPERVISOR_set_debugreg(7, current->thread.debugreg[7]); - - /* Whee! Actually deliver the signal. */ - handle_signal(signr, ka, &info, oldset, regs); - return 1; - } - - /* Did we come from a system call? */ - if (regs->orig_eax >= 0) { - /* Restart the system call - no handlers present */ - if (regs->eax == -ERESTARTNOHAND || - regs->eax == -ERESTARTSYS || - regs->eax == -ERESTARTNOINTR) { - regs->eax = regs->orig_eax; - regs->eip -= 2; - } - } - return 0; -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/kernel/time.c --- a/linux-2.4.30-xen-sparse/arch/xen/kernel/time.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,721 +0,0 @@ -/* -*- Mode:C; c-basic-offset:4; tab-width:4 -*- - **************************************************************************** - * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge - * (C) 2002-2003 - Keir Fraser - University of Cambridge - **************************************************************************** - * - * File: arch/xen/kernel/time.c - * Author: Rolf Neugebauer and Keir Fraser - * - * Description: Interface with Xen to get correct notion of time - */ - -/* - * linux/arch/i386/kernel/time.c - * - * Copyright (C) 1991, 1992, 1995 Linus Torvalds - * - * This file contains the PC-specific time handling details: - * reading the RTC at bootup, etc.. - * 1994-07-02 Alan Modra - * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime - * 1995-03-26 Markus Kuhn - * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887 - * precision CMOS clock update - * 1996-05-03 Ingo Molnar - * fixed time warps in do_[slow|fast]_gettimeoffset() - * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 - * "A Kernel Model for Precision Timekeeping" by Dave Mills - * 1998-09-05 (Various) - * More robust do_fast_gettimeoffset() algorithm implemented - * (works with APM, Cyrix 6x86MX and Centaur C6), - * monotonic gettimeofday() with fast_get_timeoffset(), - * drift-proof precision TSC calibration on boot - * (C. 
Scott Ananian <cananian@xxxxxxxxxxxxxxxxxxxx>, Andrew D. - * Balsa <andrebalsa@xxxxxxxxxx>, Philip Gladstone <philip@xxxxxxxxxx>; - * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@xxxxxxxxxxxxx>). - * 1998-12-16 Andrea Arcangeli - * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy - * because was not accounting lost_ticks. - * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli - * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to - * serialize accesses to xtime/lost_ticks). - */ - -#include <asm/smp.h> -#include <asm/irq.h> -#include <asm/msr.h> -#include <asm/delay.h> -#include <asm/mpspec.h> -#include <asm/uaccess.h> -#include <asm/processor.h> - -#include <asm/div64.h> -#include <asm/hypervisor.h> -#include <asm-xen/xen-public/dom0_ops.h> - -#include <linux/mc146818rtc.h> -#include <linux/kernel.h> -#include <linux/interrupt.h> -#include <linux/time.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/irq.h> -#include <linux/sysctl.h> -#include <linux/sysrq.h> - -spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED; -extern rwlock_t xtime_lock; -extern unsigned long wall_jiffies; - -unsigned long cpu_khz; /* get this from Xen, used elsewhere */ - -static unsigned int rdtsc_bitshift; -static u32 st_scale_f; /* convert ticks -> usecs */ -static u32 st_scale_i; /* convert ticks -> usecs */ - -/* These are peridically updated in shared_info, and then copied here. */ -static u32 shadow_tsc_stamp; -static u64 shadow_system_time; -static u32 shadow_time_version; -static struct timeval shadow_tv; - -/* - * We use this to ensure that gettimeofday() is monotonically increasing. We - * only break this guarantee if the wall clock jumps backwards "a long way". - */ -static struct timeval last_seen_tv = {0,0}; - -#ifdef CONFIG_XEN_PRIVILEGED_GUEST -/* Periodically propagate synchronised time base to the RTC and to Xen. */ -static long last_update_to_rtc, last_update_to_xen; -#endif - -/* Periodically take synchronised time base from Xen, if we need it. */ -static long last_update_from_xen; /* UTC seconds when last read Xen clock. */ - -/* Keep track of last time we did processing/updating of jiffies and xtime. */ -static u64 processed_system_time; /* System time (ns) at last processing. */ - -#define NS_PER_TICK (1000000000ULL/HZ) - -#ifndef NSEC_PER_SEC -#define NSEC_PER_SEC (1000000000L) -#endif - -#define HANDLE_USEC_UNDERFLOW(_tv) \ - do { \ - while ( (_tv).tv_usec < 0 ) \ - { \ - (_tv).tv_usec += 1000000; \ - (_tv).tv_sec--; \ - } \ - } while ( 0 ) -#define HANDLE_USEC_OVERFLOW(_tv) \ - do { \ - while ( (_tv).tv_usec >= 1000000 ) \ - { \ - (_tv).tv_usec -= 1000000; \ - (_tv).tv_sec++; \ - } \ - } while ( 0 ) -static inline void __normalize_time(time_t *sec, s64 *nsec) -{ - while (*nsec >= NSEC_PER_SEC) { - (*nsec) -= NSEC_PER_SEC; - (*sec)++; - } - while (*nsec < 0) { - (*nsec) += NSEC_PER_SEC; - (*sec)--; - } -} - -/* Dynamically-mapped IRQs. */ -static int time_irq, debug_irq; - -/* Does this guest OS track Xen time, or set its wall clock independently? 
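
__normalize_time() above (like the HANDLE_USEC_* macros) keeps a (seconds, sub-seconds) pair canonical after arithmetic pushes the sub-second field out of range; the loops simply carry whole seconds across. For instance:

    #include <stdio.h>

    #define NSEC_PER_SEC 1000000000L

    static void normalize(long *sec, long long *nsec)
    {
        while (*nsec >= NSEC_PER_SEC) { *nsec -= NSEC_PER_SEC; (*sec)++; }
        while (*nsec < 0)             { *nsec += NSEC_PER_SEC; (*sec)--; }
    }

    int main(void)
    {
        long sec = 10;
        long long nsec = 2500000000LL;   /* 2.5 s of accumulated nanoseconds */

        normalize(&sec, &nsec);
        printf("%ld s + %lld ns\n", sec, nsec);   /* 12 s + 500000000 ns */
        return 0;
    }
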
*/ -static int independent_wallclock = 0; -static int __init __independent_wallclock(char *str) -{ - independent_wallclock = 1; - return 1; -} -__setup("independent_wallclock", __independent_wallclock); -#define INDEPENDENT_WALLCLOCK() \ - (independent_wallclock || (xen_start_info.flags & SIF_INITDOMAIN)) - -#ifdef CONFIG_XEN_PRIVILEGED_GUEST -/* - * In order to set the CMOS clock precisely, set_rtc_mmss has to be - * called 500 ms after the second nowtime has started, because when - * nowtime is written into the registers of the CMOS clock, it will - * jump to the next second precisely 500 ms later. Check the Motorola - * MC146818A or Dallas DS12887 data sheet for details. - * - * BUG: This routine does not handle hour overflow properly; it just - * sets the minutes. Usually you'll only notice that after reboot! - */ -static int set_rtc_mmss(unsigned long nowtime) -{ - int retval = 0; - int real_seconds, real_minutes, cmos_minutes; - unsigned char save_control, save_freq_select; - - /* gets recalled with irq locally disabled */ - spin_lock(&rtc_lock); - save_control = CMOS_READ(RTC_CONTROL); - CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL); - - save_freq_select = CMOS_READ(RTC_FREQ_SELECT); - CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); - - cmos_minutes = CMOS_READ(RTC_MINUTES); - if ( !(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD ) - BCD_TO_BIN(cmos_minutes); - - /* - * since we're only adjusting minutes and seconds, don't interfere with - * hour overflow. This avoids messing with unknown time zones but requires - * your RTC not to be off by more than 15 minutes - */ - real_seconds = nowtime % 60; - real_minutes = nowtime / 60; - if ( ((abs(real_minutes - cmos_minutes) + 15)/30) & 1 ) - real_minutes += 30; /* correct for half hour time zone */ - real_minutes %= 60; - - if ( abs(real_minutes - cmos_minutes) < 30 ) - { - if ( !(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD ) - { - BIN_TO_BCD(real_seconds); - BIN_TO_BCD(real_minutes); - } - CMOS_WRITE(real_seconds,RTC_SECONDS); - CMOS_WRITE(real_minutes,RTC_MINUTES); - } - else - { - printk(KERN_WARNING - "set_rtc_mmss: can't update from %d to %d\n", - cmos_minutes, real_minutes); - retval = -1; - } - - /* The following flags have to be released exactly in this order, - * otherwise the DS12887 (popular MC146818A clone with integrated - * battery and quartz) will not reset the oscillator and will not - * update precisely 500 ms later. You won't find this mentioned in - * the Dallas Semiconductor data sheets, but who believes data - * sheets anyway ... -- Markus Kuhn - */ - CMOS_WRITE(save_control, RTC_CONTROL); - CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); - spin_unlock(&rtc_lock); - - return retval; -} -#endif - - -/* - * Reads a consistent set of time-base values from Xen, into a shadow data - * area. Must be called with the xtime_lock held for writing. 
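set_rtc_mmss above converts between the RTC's binary-coded-decimal registers and plain binary with the kernel's BCD_TO_BIN/BIN_TO_BCD helpers. A self-contained sketch of that conversion (macro bodies reproduced from their usual i386 definitions; treat the details as illustrative):

    #include <stdio.h>

    /* Usual i386 definitions: each BCD nibble holds one decimal digit. */
    #define BCD_TO_BIN(val) ((val) = ((val) & 15) + ((val) >> 4) * 10)
    #define BIN_TO_BCD(val) ((val) = (((val) / 10) << 4) + (val) % 10)

    int main(void)
    {
        unsigned char v = 0x47;      /* RTC minutes register showing "47" */
        BCD_TO_BIN(v);
        printf("binary: %d\n", v);   /* 47 */
        BIN_TO_BCD(v);
        printf("BCD: 0x%02x\n", v);  /* 0x47 */
        return 0;
    }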
- */ -static void __get_time_values_from_xen(void) -{ - do { - shadow_time_version = HYPERVISOR_shared_info->time_version2; - rmb(); - shadow_tv.tv_sec = HYPERVISOR_shared_info->wc_sec; - shadow_tv.tv_usec = HYPERVISOR_shared_info->wc_usec; - shadow_tsc_stamp = - (u32)(HYPERVISOR_shared_info->tsc_timestamp >> rdtsc_bitshift); - shadow_system_time = HYPERVISOR_shared_info->system_time; - rmb(); - } - while ( shadow_time_version != HYPERVISOR_shared_info->time_version1 ); -} - -#define TIME_VALUES_UP_TO_DATE \ - ({ rmb(); (shadow_time_version == HYPERVISOR_shared_info->time_version2); }) - - -/* - * Returns the system time elapsed, in usecs, since the current shadow_tsc_stamp - * was calculated. Must be called with the xtime_lock held for reading. - */ -static inline unsigned long __get_time_delta_usecs(void) -{ - s32 delta_tsc; - u32 low; - u64 delta, tsc; - - rdtscll(tsc); - low = (u32)(tsc >> rdtsc_bitshift); - delta_tsc = (s32)(low - shadow_tsc_stamp); - if ( unlikely(delta_tsc < 0) ) delta_tsc = 0; - delta = ((u64)delta_tsc * st_scale_f); - delta >>= 32; - delta += ((u64)delta_tsc * st_scale_i); - - return (unsigned long)delta; -} - - -/* - * Returns the current time-of-day in UTC timeval format. - */ -void do_gettimeofday(struct timeval *tv) -{ - unsigned long flags, lost; - struct timeval _tv; - s64 nsec; - - again: - read_lock_irqsave(&xtime_lock, flags); - - _tv.tv_usec = __get_time_delta_usecs(); - if ( (lost = (jiffies - wall_jiffies)) != 0 ) - _tv.tv_usec += lost * (1000000 / HZ); - _tv.tv_sec = xtime.tv_sec; - _tv.tv_usec += xtime.tv_usec; - - nsec = shadow_system_time - processed_system_time; - __normalize_time(&_tv.tv_sec, &nsec); - _tv.tv_usec += (long)nsec / 1000L; - - if ( unlikely(!TIME_VALUES_UP_TO_DATE) ) - { - /* - * We may have blocked for a long time, rendering our calculations - * invalid (e.g. the time delta may have overflowed). Detect that - * and recalculate with fresh values. - */ - read_unlock_irqrestore(&xtime_lock, flags); - write_lock_irqsave(&xtime_lock, flags); - __get_time_values_from_xen(); - write_unlock_irqrestore(&xtime_lock, flags); - goto again; - } - - HANDLE_USEC_OVERFLOW(_tv); - - /* Ensure that time-of-day is monotonically increasing. */ - if ( (_tv.tv_sec < last_seen_tv.tv_sec) || - ((_tv.tv_sec == last_seen_tv.tv_sec) && - (_tv.tv_usec < last_seen_tv.tv_usec)) ) - _tv = last_seen_tv; - last_seen_tv = _tv; - - read_unlock_irqrestore(&xtime_lock, flags); - - *tv = _tv; -} - - -/* - * Sets the current time-of-day based on passed-in UTC timeval parameter. - */ -void do_settimeofday(struct timeval *tv) -{ - struct timeval newtv; - s64 nsec; - suseconds_t usec; - - if ( !INDEPENDENT_WALLCLOCK() ) - return; - - write_lock_irq(&xtime_lock); - - /* - * Ensure we don't get blocked for so long that our time delta - * overflows. If that were to happen then our shadow time values would - * be stale, so we retry with fresh ones.
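__get_time_values_from_xen and TIME_VALUES_UP_TO_DATE implement a seqlock-style consistent snapshot: one version stamp is bumped before the shared fields are rewritten and the other after, so a reader simply retries until the two stamps it observed agree. A hedged, single-threaded sketch of the reader's retry shape (field names invented, and the rmb() barriers of the real code elided):

    #include <stdio.h>

    struct shared_time {                 /* stand-in for shared_info */
        volatile unsigned version1, version2;
        volatile unsigned long long system_time;
        volatile unsigned wc_sec, wc_usec;
    };

    /* Retry until no update raced with our reads (barriers omitted). */
    static void snapshot(struct shared_time *s, unsigned long long *st,
                         unsigned *sec, unsigned *usec)
    {
        unsigned v;
        do {
            v     = s->version2;
            *st   = s->system_time;
            *sec  = s->wc_sec;
            *usec = s->wc_usec;
        } while (v != s->version1);
    }

    int main(void)
    {
        struct shared_time s = { 2, 2, 123456789ULL, 1121522574, 500000 };
        unsigned long long st; unsigned sec, usec;
        snapshot(&s, &st, &sec, &usec);
        printf("st=%llu wc=%u.%06u\n", st, sec, usec);
        return 0;
    }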
- */ - again: - usec = tv->tv_usec - __get_time_delta_usecs(); - - nsec = shadow_system_time - processed_system_time; - __normalize_time(&tv->tv_sec, &nsec); - usec -= (long)nsec / 1000L; - - if ( unlikely(!TIME_VALUES_UP_TO_DATE) ) - { - __get_time_values_from_xen(); - goto again; - } - tv->tv_usec = usec; - - HANDLE_USEC_UNDERFLOW(*tv); - - newtv = *tv; - - tv->tv_usec -= (jiffies - wall_jiffies) * (1000000 / HZ); - HANDLE_USEC_UNDERFLOW(*tv); - - xtime = *tv; - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; - - /* Reset all our running time counts. They make no sense now. */ - last_seen_tv.tv_sec = 0; - last_update_from_xen = 0; - -#ifdef CONFIG_XEN_PRIVILEGED_GUEST - if ( xen_start_info.flags & SIF_INITDOMAIN ) - { - dom0_op_t op; - last_update_to_rtc = last_update_to_xen = 0; - op.cmd = DOM0_SETTIME; - op.u.settime.secs = newtv.tv_sec; - op.u.settime.usecs = newtv.tv_usec; - op.u.settime.system_time = shadow_system_time; - write_unlock_irq(&xtime_lock); - HYPERVISOR_dom0_op(&op); - } - else -#endif - { - write_unlock_irq(&xtime_lock); - } -} - - -asmlinkage long sys_stime(int *tptr) -{ - int value; - struct timeval tv; - - if ( !capable(CAP_SYS_TIME) ) - return -EPERM; - - if ( get_user(value, tptr) ) - return -EFAULT; - - tv.tv_sec = value; - tv.tv_usec = 0; - - do_settimeofday(&tv); - - return 0; -} - - -/* Convert jiffies to system time. Call with xtime_lock held for reading. */ -static inline u64 __jiffies_to_st(unsigned long j) -{ - return processed_system_time + ((j - jiffies) * NS_PER_TICK); -} - - -static inline void do_timer_interrupt(int irq, void *dev_id, - struct pt_regs *regs) -{ - s64 delta; - unsigned long ticks = 0; - long sec_diff; - - do { - __get_time_values_from_xen(); - - delta = (s64)(shadow_system_time + - ((s64)__get_time_delta_usecs() * 1000LL) - - processed_system_time); - } - while ( !TIME_VALUES_UP_TO_DATE ); - - if ( unlikely(delta < 0) ) - { - printk("Timer ISR: Time went backwards: %lld\n", delta); - return; - } - - /* Process elapsed jiffies since last call. */ - while ( delta >= NS_PER_TICK ) - { - ticks++; - delta -= NS_PER_TICK; - processed_system_time += NS_PER_TICK; - } - - if ( ticks != 0 ) - { - do_timer_ticks(ticks); - - if ( user_mode(regs) ) - update_process_times_us(ticks, 0); - else - update_process_times_us(0, ticks); - } - - /* - * Take synchronised time from Xen once a minute if we're not - * synchronised ourselves, and we haven't chosen to keep an independent - * time base. - */ - if ( !INDEPENDENT_WALLCLOCK() && - ((time_status & STA_UNSYNC) != 0) && - (xtime.tv_sec > (last_update_from_xen + 60)) ) - { - /* Adjust shadow timeval for jiffies that haven't updated xtime yet. */ - shadow_tv.tv_usec -= (jiffies - wall_jiffies) * (1000000/HZ); - HANDLE_USEC_UNDERFLOW(shadow_tv); - - /* - * Reset our running time counts if they are invalidated by a warp - * backwards of more than 500ms. - */ - sec_diff = xtime.tv_sec - shadow_tv.tv_sec; - if ( unlikely(abs(sec_diff) > 1) || - unlikely(((sec_diff * 1000000) + - xtime.tv_usec - shadow_tv.tv_usec) > 500000) ) - { -#ifdef CONFIG_XEN_PRIVILEGED_GUEST - last_update_to_rtc = last_update_to_xen = 0; -#endif - last_seen_tv.tv_sec = 0; - } - - /* Update our unsynchronised xtime appropriately. 
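The loop in do_timer_interrupt above converts an elapsed-nanosecond delta into whole ticks by peeling off NS_PER_TICK quanta, advancing processed_system_time as it goes so no time is counted twice. The same bookkeeping in isolation (HZ=100 assumed; not the ISR itself):

    #include <stdio.h>

    #define HZ 100
    #define NS_PER_TICK (1000000000ULL / HZ)

    int main(void)
    {
        unsigned long long processed = 0, delta = 35000000ULL; /* 35 ms */
        unsigned long ticks = 0;

        while (delta >= NS_PER_TICK) {       /* 10 ms per tick at HZ=100 */
            ticks++;
            delta -= NS_PER_TICK;
            processed += NS_PER_TICK;
        }
        printf("ticks=%lu leftover=%llu ns\n", ticks, delta); /* 3, 5000000 */
        return 0;
    }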
*/ - xtime = shadow_tv; - - last_update_from_xen = xtime.tv_sec; - } - -#ifdef CONFIG_XEN_PRIVILEGED_GUEST - if ( (xen_start_info.flags & SIF_INITDOMAIN) && - ((time_status & STA_UNSYNC) == 0) ) - { - /* Send synchronised time to Xen approximately every minute. */ - if ( xtime.tv_sec > (last_update_to_xen + 60) ) - { - dom0_op_t op; - struct timeval tv = xtime; - - tv.tv_usec += (jiffies - wall_jiffies) * (1000000/HZ); - HANDLE_USEC_OVERFLOW(tv); - - op.cmd = DOM0_SETTIME; - op.u.settime.secs = tv.tv_sec; - op.u.settime.usecs = tv.tv_usec; - op.u.settime.system_time = shadow_system_time; - HYPERVISOR_dom0_op(&op); - - last_update_to_xen = xtime.tv_sec; - } - - /* - * If we have an externally synchronized Linux clock, then update CMOS - * clock accordingly every ~11 minutes. Set_rtc_mmss() has to be called - * as close as possible to 500 ms before the new second starts. - */ - if ( (xtime.tv_sec > (last_update_to_rtc + 660)) && - (xtime.tv_usec >= (500000 - ((unsigned) tick) / 2)) && - (xtime.tv_usec <= (500000 + ((unsigned) tick) / 2)) ) - { - if ( set_rtc_mmss(xtime.tv_sec) == 0 ) - last_update_to_rtc = xtime.tv_sec; - else - last_update_to_rtc = xtime.tv_sec - 600; - } - } -#endif -} - - -static void timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) -{ - write_lock(&xtime_lock); - do_timer_interrupt(irq, NULL, regs); - write_unlock(&xtime_lock); -} - -static struct irqaction irq_timer = { - timer_interrupt, - SA_INTERRUPT, - 0, - "timer", - NULL, - NULL -}; - - -/* - * This function works out when the next timer function has to be - * executed (by looking at the timer list) and sets the Xen one-shot - * domain timer to the appropriate value. This is typically called in - * cpu_idle() before the domain blocks. - * - * The function returns a non-0 value on error conditions. - * - * It must be called with interrupts disabled. - */ -extern spinlock_t timerlist_lock; -int set_timeout_timer(void) -{ - struct timer_list *timer; - u64 alarm = 0; - int ret = 0; - - spin_lock(&timerlist_lock); - - /* - * This is safe against long blocking (since calculations are not based on - * TSC deltas). It is also safe against warped system time since - * suspend-resume is cooperative and we would first get locked out. It is - * safe against normal updates of jiffies since interrupts are off. - */ - if ( (timer = next_timer_event()) != NULL ) - alarm = __jiffies_to_st(timer->expires); - - /* Tasks on the timer task queue expect to be executed on the next tick. */ - if ( TQ_ACTIVE(tq_timer) ) - alarm = __jiffies_to_st(jiffies + 1); - - /* Failure is pretty bad, but we'd best soldier on. */ - if ( HYPERVISOR_set_timer_op(alarm) != 0 ) - ret = -1; - - spin_unlock(&timerlist_lock); - - return ret; -} - - -/* Time debugging.
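set_timeout_timer arms Xen's one-shot domain timer with an absolute system time derived from the next jiffies expiry via __jiffies_to_st (defined earlier in this file). The conversion on its own, with made-up numbers (HZ=100 assumed):

    #include <stdio.h>

    #define HZ 100
    #define NS_PER_TICK (1000000000ULL / HZ)

    /* Mirrors __jiffies_to_st: absolute system time of a jiffies deadline. */
    static unsigned long long jiffies_to_st(unsigned long long processed_ns,
                                            unsigned long now,
                                            unsigned long expires)
    {
        return processed_ns + (unsigned long long)(expires - now) * NS_PER_TICK;
    }

    int main(void)
    {
        /* Deadline 7 ticks away, 5 s of system time already processed. */
        printf("alarm=%llu ns\n", jiffies_to_st(5000000000ULL, 1000, 1007));
        return 0;    /* prints 5070000000 */
    }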
*/ -static void dbg_time_int(int irq, void *dev_id, struct pt_regs *ptregs) -{ - unsigned long flags, j; - u64 s_now, j_st; - struct timeval s_tv, tv; - - struct timer_list *timer; - u64 t_st; - - read_lock_irqsave(&xtime_lock, flags); - s_tv.tv_sec = shadow_tv.tv_sec; - s_tv.tv_usec = shadow_tv.tv_usec; - s_now = shadow_system_time; - read_unlock_irqrestore(&xtime_lock, flags); - - do_gettimeofday(&tv); - - j = jiffies; - j_st = __jiffies_to_st(j); - - timer = next_timer_event(); - t_st = __jiffies_to_st(timer->expires); - - printk(KERN_ALERT "time: shadow_st=0x%X:%08X\n", - (u32)(s_now>>32), (u32)s_now); - printk(KERN_ALERT "time: wct=%lds %ldus shadow_wct=%lds %ldus\n", - tv.tv_sec, tv.tv_usec, s_tv.tv_sec, s_tv.tv_usec); - printk(KERN_ALERT "time: jiffies=%lu(0x%X:%08X) timeout=%lu(0x%X:%08X)\n", - jiffies,(u32)(j_st>>32), (u32)j_st, - timer->expires,(u32)(t_st>>32), (u32)t_st); - printk(KERN_ALERT "time: processed_system_time=0x%X:%08X\n", - (u32)(processed_system_time>>32), (u32)processed_system_time); - -#ifdef CONFIG_MAGIC_SYSRQ - handle_sysrq('t',NULL,NULL,NULL); -#endif -} - -static struct irqaction dbg_time = { - dbg_time_int, - SA_SHIRQ, - 0, - "timer_dbg", - &dbg_time_int, - NULL -}; - -void __init time_init(void) -{ - unsigned long long alarm; - u64 __cpu_khz, __cpu_ghz, cpu_freq, scale, scale2; - unsigned int cpu_ghz; - - __cpu_khz = __cpu_ghz = cpu_freq = HYPERVISOR_shared_info->cpu_freq; - do_div(__cpu_khz, 1000UL); - cpu_khz = (u32)__cpu_khz; - do_div(__cpu_ghz, 1000000000UL); - cpu_ghz = (unsigned int)__cpu_ghz; - - printk("Xen reported: %lu.%03lu MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - - xtime.tv_sec = HYPERVISOR_shared_info->wc_sec; - xtime.tv_usec = HYPERVISOR_shared_info->wc_usec; - processed_system_time = shadow_system_time; - - for ( rdtsc_bitshift = 0; cpu_ghz != 0; rdtsc_bitshift++, cpu_ghz >>= 1 ) - continue; - - scale = 1000000LL << (32 + rdtsc_bitshift); - do_div(scale, (u32)cpu_freq); - - if ( (cpu_freq >> 32) != 0 ) - { - scale2 = 1000000LL << rdtsc_bitshift; - do_div(scale2, (u32)(cpu_freq>>32)); - scale += scale2; - } - - st_scale_f = scale & 0xffffffff; - st_scale_i = scale >> 32; - - __get_time_values_from_xen(); - processed_system_time = shadow_system_time; - - time_irq = bind_virq_to_irq(VIRQ_TIMER); - debug_irq = bind_virq_to_irq(VIRQ_DEBUG); - - (void)setup_irq(time_irq, &irq_timer); - (void)setup_irq(debug_irq, &dbg_time); - - rdtscll(alarm); -} - -void time_suspend(void) -{ -} - -void time_resume(void) -{ - unsigned long flags; - write_lock_irqsave(&xtime_lock, flags); - /* Get timebases for new environment. */ - __get_time_values_from_xen(); - /* Reset our own concept of passage of system time. */ - processed_system_time = shadow_system_time; - /* Accept a warp in UTC (wall-clock) time. */ - last_seen_tv.tv_sec = 0; - /* Make sure we resync UTC time with Xen on next timer interrupt. */ - last_update_from_xen = 0; - write_unlock_irqrestore(&xtime_lock, flags); -} - -/* - * /proc/sys/xen: This really belongs in another file. It can stay here for - * now however. 
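time_init above builds a 32.32 fixed-point multiplier for converting (pre-shifted) TSC ticks to microseconds: scale = (10^6 << (32 + rdtsc_bitshift)) / cpu_freq, split into the integer part st_scale_i and the fractional part st_scale_f. A worked application of such a multiplier, the way __get_time_delta_usecs uses it (a 1 GHz TSC, and hence a shift of 1, are assumptions of this sketch):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t cpu_freq = 1000000000ULL;  /* assumed 1 GHz TSC */
        unsigned shift = 1;                 /* rdtsc_bitshift for ~1 GHz */

        /* 32.32 fixed point: shifted-ticks * scale == usecs << 32. */
        uint64_t scale = (1000000ULL << (32 + shift)) / cpu_freq;
        uint32_t st_scale_i = (uint32_t)(scale >> 32);  /* here: 0 */
        uint32_t st_scale_f = (uint32_t)scale;          /* here: 8589934 */

        uint64_t tsc_delta = 2000000000ULL;             /* 2 s of ticks */
        uint32_t low = (uint32_t)(tsc_delta >> shift);
        uint64_t usecs = ((uint64_t)low * st_scale_f) >> 32;
        usecs += (uint64_t)low * st_scale_i;

        printf("%llu us\n", (unsigned long long)usecs); /* ~1999999 */
        return 0;
    }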
- */ -static ctl_table xen_subtable[] = { - {1, "independent_wallclock", &independent_wallclock, - sizeof(independent_wallclock), 0644, NULL, proc_dointvec}, - {0} -}; -static ctl_table xen_table[] = { - {123, "xen", NULL, 0, 0555, xen_subtable}, - {0} -}; -static int __init xen_sysctl_init(void) -{ - (void)register_sysctl_table(xen_table, 0); - return 0; -} -__initcall(xen_sysctl_init); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/kernel/traps.c --- a/linux-2.4.30-xen-sparse/arch/xen/kernel/traps.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,619 +0,0 @@ -/* - * linux/arch/i386/traps.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * Pentium III FXSR, SSE support - * Gareth Hughes <gareth@xxxxxxxxxxx>, May 2000 - */ - -/* - * 'Traps.c' handles hardware traps and faults after we have saved some - * state in 'asm.s'. - */ -#include <linux/config.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/ptrace.h> -#include <linux/timer.h> -#include <linux/mm.h> -#include <linux/init.h> -#include <linux/delay.h> -#include <linux/spinlock.h> -#include <linux/interrupt.h> -#include <linux/highmem.h> - -#include <asm/system.h> -#include <asm/uaccess.h> -#include <asm/io.h> -#include <asm/atomic.h> -#include <asm/debugreg.h> -#include <asm/desc.h> -#include <asm/i387.h> - -#include <asm/smp.h> -#include <asm/pgalloc.h> - -#include <asm/hypervisor.h> - -#include <linux/irq.h> -#include <linux/module.h> - -asmlinkage int system_call(void); -asmlinkage void lcall7(void); -asmlinkage void lcall27(void); - -asmlinkage void divide_error(void); -asmlinkage void debug(void); -asmlinkage void int3(void); -asmlinkage void overflow(void); -asmlinkage void bounds(void); -asmlinkage void invalid_op(void); -asmlinkage void device_not_available(void); -asmlinkage void double_fault(void); -asmlinkage void coprocessor_segment_overrun(void); -asmlinkage void invalid_TSS(void); -asmlinkage void segment_not_present(void); -asmlinkage void stack_segment(void); -asmlinkage void general_protection(void); -asmlinkage void page_fault(void); -asmlinkage void coprocessor_error(void); -asmlinkage void simd_coprocessor_error(void); -asmlinkage void alignment_check(void); -asmlinkage void fixup_4gb_segment(void); -asmlinkage void machine_check(void); - -int kstack_depth_to_print = 24; - - -/* - * If the address is either in the .text section of the - * kernel, or in the vmalloc'ed module regions, it *may* - * be the address of a calling routine - */ - -#ifdef CONFIG_MODULES - -extern struct module *module_list; -extern struct module kernel_module; - -static inline int kernel_text_address(unsigned long addr) -{ - int retval = 0; - struct module *mod; - - if (addr >= (unsigned long) &_stext && - addr <= (unsigned long) &_etext) - return 1; - - for (mod = module_list; mod != &kernel_module; mod = mod->next) { - /* mod_bound tests for addr being inside the vmalloc'ed - * module area. Of course it'd be better to test only - * for the .text subset... 
*/ - if (mod_bound(addr, 0, mod)) { - retval = 1; - break; - } - } - - return retval; -} - -#else - -static inline int kernel_text_address(unsigned long addr) -{ - return (addr >= (unsigned long) &_stext && - addr <= (unsigned long) &_etext); -} - -#endif - -void show_trace(unsigned long * stack) -{ - int i; - unsigned long addr; - - if (!stack) - stack = (unsigned long*)&stack; - - printk("Call Trace: "); - i = 1; - while (((long) stack & (THREAD_SIZE-1)) != 0) { - addr = *stack++; - if (kernel_text_address(addr)) { - if (i && ((i % 6) == 0)) - printk("\n "); - printk("[<%08lx>] ", addr); - i++; - } - } - printk("\n"); -} - -void show_trace_task(struct task_struct *tsk) -{ - unsigned long esp = tsk->thread.esp; - - /* User space on another CPU? */ - if ((esp ^ (unsigned long)tsk) & (PAGE_MASK<<1)) - return; - show_trace((unsigned long *)esp); -} - -void show_stack(unsigned long * esp) -{ - unsigned long *stack; - int i; - - // debugging aid: "show_stack(NULL);" prints the - // back trace for this cpu. - - if(esp==NULL) - esp=(unsigned long*)&esp; - - stack = esp; - for(i=0; i < kstack_depth_to_print; i++) { - if (((long) stack & (THREAD_SIZE-1)) == 0) - break; - if (i && ((i % 8) == 0)) - printk("\n "); - printk("%08lx ", *stack++); - } - printk("\n"); - show_trace(esp); -} - -void show_registers(struct pt_regs *regs) -{ - int in_kernel = 1; - unsigned long esp; - unsigned short ss; - - esp = (unsigned long) (&regs->esp); - ss = __KERNEL_DS; - if (regs->xcs & 2) { - in_kernel = 0; - esp = regs->esp; - ss = regs->xss & 0xffff; - } - printk(KERN_ALERT "CPU: %d\n", smp_processor_id() ); - printk(KERN_ALERT "EIP: %04x:[<%08lx>] %s\n", - 0xffff & regs->xcs, regs->eip, print_tainted()); - printk(KERN_ALERT "EFLAGS: %08lx\n",regs->eflags); - printk(KERN_ALERT "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", - regs->eax, regs->ebx, regs->ecx, regs->edx); - printk(KERN_ALERT "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", - regs->esi, regs->edi, regs->ebp, esp); - printk(KERN_ALERT "ds: %04x es: %04x ss: %04x\n", - regs->xds & 0xffff, regs->xes & 0xffff, ss); - printk(KERN_ALERT "Process %s (pid: %d, stackpage=%08lx)", - current->comm, current->pid, 4096+(unsigned long)current); - /* - * When in-kernel, we also print out the stack and code at the - * time of the fault..
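With no reliable frame pointers, show_trace above scans every word left on the stack and prints those that fall inside kernel text, since such words *may* be return addresses. A userspace caricature of that heuristic (the text range is faked with two sentinel bounds standing in for &_stext/&_etext):

    #include <stdio.h>

    static const unsigned long text_lo = 0x1000, text_hi = 0x9000; /* fake */

    static int looks_like_text(unsigned long addr)
    {
        return addr >= text_lo && addr <= text_hi;
    }

    int main(void)
    {
        /* A fake stack: data words mixed with plausible return addresses. */
        unsigned long stack[] = { 0xdeadbeef, 0x1a40, 0x42, 0x8c10, 7 };
        size_t i;

        printf("Call Trace:");
        for (i = 0; i < sizeof(stack) / sizeof(stack[0]); i++)
            if (looks_like_text(stack[i]))
                printf(" [<%08lx>]", stack[i]);
        printf("\n");    /* only the two in-range words are printed */
        return 0;
    }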
- */ - if (in_kernel) { - - printk(KERN_ALERT "\nStack: "); - show_stack((unsigned long*)esp); - -#if 0 - { - int i; - printk(KERN_ALERT "\nCode: "); - if(regs->eip < PAGE_OFFSET) - goto bad; - - for(i=0;i<20;i++) - { - unsigned char c; - if(__get_user(c, &((unsigned char*)regs->eip)[i])) { -bad: - printk(KERN_ALERT " Bad EIP value."); - break; - } - printk("%02x ", c); - } - } -#endif - } - printk(KERN_ALERT "\n"); -} - -spinlock_t die_lock = SPIN_LOCK_UNLOCKED; - -void die(const char * str, struct pt_regs * regs, long err) -{ - console_verbose(); - spin_lock_irq(&die_lock); - bust_spinlocks(1); - printk("%s: %04lx\n", str, err & 0xffff); - show_registers(regs); - bust_spinlocks(0); - spin_unlock_irq(&die_lock); - do_exit(SIGSEGV); -} - -static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err) -{ - if (!(2 & regs->xcs)) - die(str, regs, err); -} - - -static void inline do_trap(int trapnr, int signr, char *str, - struct pt_regs * regs, long error_code, - siginfo_t *info) -{ - if (!(regs->xcs & 2)) - goto kernel_trap; - - /*trap_signal:*/ { - struct task_struct *tsk = current; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - if (info) - force_sig_info(signr, info, tsk); - else - force_sig(signr, tsk); - return; - } - - kernel_trap: { - unsigned long fixup = search_exception_table(regs->eip); - if (fixup) - regs->eip = fixup; - else - die(str, regs, error_code); - return; - } -} - -#define DO_ERROR(trapnr, signr, str, name) \ -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ -{ \ - do_trap(trapnr, signr, str, regs, error_code, NULL); \ -} - -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ -{ \ - siginfo_t info; \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void *)siaddr; \ - do_trap(trapnr, signr, str, regs, error_code, &info); \ -} - -DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip) -DO_ERROR( 3, SIGTRAP, "int3", int3) -DO_ERROR( 4, SIGSEGV, "overflow", overflow) -DO_ERROR( 5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->eip) -DO_ERROR( 7, SIGSEGV, "device not available", device_not_available) -DO_ERROR( 8, SIGSEGV, "double fault", double_fault) -DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) -DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) -DO_ERROR(12, SIGBUS, "stack segment", stack_segment) -DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) -DO_ERROR(18, SIGBUS, "machine check", machine_check) - -asmlinkage void do_general_protection(struct pt_regs * regs, long error_code) -{ - /* - * If we trapped on an LDT access then ensure that the default_ldt is - * loaded, if nothing else. We load default_ldt lazily because LDT - * switching costs time and many applications don't need it. 
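Each DO_ERROR/DO_ERROR_INFO line above stamps out a complete do_<name>() handler that funnels into do_trap with a fixed trap number, signal and name string. The same generation trick reduced to a runnable toy (the do_trap below and the signal numbers are simplified stand-ins, not the kernel's):

    #include <stdio.h>

    /* Simplified stand-in for the kernel's do_trap(). */
    static void do_trap(int trapnr, int signr, const char *str, long err)
    {
        printf("trap %d (%s): signal %d, error_code %ld\n",
               trapnr, str, signr, err);
    }

    /* One macro, many handlers - the DO_ERROR pattern. */
    #define DO_ERROR(trapnr, signr, str, name)       \
        void do_##name(long error_code)              \
        {                                            \
            do_trap(trapnr, signr, str, error_code); \
        }

    DO_ERROR(3, 5, "int3", int3)           /* 5 standing in for SIGTRAP */
    DO_ERROR(4, 11, "overflow", overflow)  /* 11 standing in for SIGSEGV */

    int main(void)
    {
        do_int3(0);
        do_overflow(0);
        return 0;
    }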
- */ - if ( unlikely((error_code & 6) == 4) ) - { - unsigned long ldt; - __asm__ __volatile__ ( "sldt %0" : "=r" (ldt) ); - if ( ldt == 0 ) - { - xen_set_ldt((unsigned long)&default_ldt[0], 5); - return; - } - } - - if (!(regs->xcs & 2)) - goto gp_in_kernel; - - current->thread.error_code = error_code; - current->thread.trap_no = 13; - force_sig(SIGSEGV, current); - return; - -gp_in_kernel: - { - unsigned long fixup; - fixup = search_exception_table(regs->eip); - if (fixup) { - regs->eip = fixup; - return; - } - die("general protection fault", regs, error_code); - } -} - - -asmlinkage void do_debug(struct pt_regs * regs, long error_code) -{ - unsigned int condition; - struct task_struct *tsk = current; - siginfo_t info; - - condition = HYPERVISOR_get_debugreg(6); - - /* Mask out spurious debug traps due to lazy DR7 setting */ - if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg[7]) - goto clear_dr7; - } - - /* Save debug status register where ptrace can see it */ - tsk->thread.debugreg[6] = condition; - - /* Mask out spurious TF errors due to lazy TF clearing */ - if (condition & DR_STEP) { - /* - * The TF error should be masked out only if the current - * process is not traced and if the TRAP flag has been set - * previously by a tracing process (condition detected by - * the PT_DTRACE flag); remember that the i386 TRAP flag - * can be modified by the process itself in user mode, - * allowing programs to debug themselves without the ptrace() - * interface. - */ - if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE) - goto clear_TF; - } - - /* Ok, finally something we can handle */ - tsk->thread.trap_no = 1; - tsk->thread.error_code = error_code; - info.si_signo = SIGTRAP; - info.si_errno = 0; - info.si_code = TRAP_BRKPT; - - /* If this is a kernel mode trap, save the user PC on entry to - * the kernel, that's what the debugger can make sense of. - */ - info.si_addr = ((regs->xcs & 2) == 0) ? (void *)tsk->thread.eip : - (void *)regs->eip; - force_sig_info(SIGTRAP, &info, tsk); - - /* Disable additional traps. They'll be re-enabled when - * the signal is delivered. - */ - clear_dr7: - HYPERVISOR_set_debugreg(7, 0); - return; - - clear_TF: - regs->eflags &= ~TF_MASK; - return; -} - - -/* - * Note that we play around with the 'TS' bit in an attempt to get - * the correct behaviour even in the presence of the asynchronous - * IRQ13 behaviour - */ -void math_error(void *eip) -{ - struct task_struct * task; - siginfo_t info; - unsigned short cwd, swd; - - /* - * Save the info for the exception handler and clear the error. - */ - task = current; - save_init_fpu(task); - task->thread.trap_no = 16; - task->thread.error_code = 0; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_code = __SI_FAULT; - info.si_addr = eip; - /* - * (~cwd & swd) will mask out exceptions that are not set to unmasked - * status. 0x3f is the exception bits in these regs, 0x200 is the - * C1 reg you need in case of a stack fault, 0x040 is the stack - * fault bit. 
We should only be taking one exception at a time, - * so if this combination doesn't produce any single exception, - * then we have a bad program that isn't synchronizing its FPU usage - * and it will suffer the consequences since we won't be able to - * fully reproduce the context of the exception - */ - cwd = get_fpu_cwd(task); - swd = get_fpu_swd(task); - switch (((~cwd) & swd & 0x3f) | (swd & 0x240)) { - case 0x000: - default: - break; - case 0x001: /* Invalid Op */ - case 0x041: /* Stack Fault */ - case 0x241: /* Stack Fault | Direction */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; - } - force_sig_info(SIGFPE, &info, task); -} - -asmlinkage void do_coprocessor_error(struct pt_regs * regs, long error_code) -{ - ignore_irq13 = 1; - math_error((void *)regs->eip); -} - -void simd_math_error(void *eip) -{ - struct task_struct * task; - siginfo_t info; - unsigned short mxcsr; - - /* - * Save the info for the exception handler and clear the error. - */ - task = current; - save_init_fpu(task); - task->thread.trap_no = 19; - task->thread.error_code = 0; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_code = __SI_FAULT; - info.si_addr = eip; - /* - * The SIMD FPU exceptions are handled a little differently, as there - * is only a single status/control register. Thus, to determine which - * unmasked exception was caught we must mask the exception mask bits - * at 0x1f80, and then use these to mask the exception bits at 0x3f. - */ - mxcsr = get_fpu_mxcsr(task); - switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { - case 0x000: - default: - break; - case 0x001: /* Invalid Op */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; - } - force_sig_info(SIGFPE, &info, task); -} - -asmlinkage void do_simd_coprocessor_error(struct pt_regs * regs, - long error_code) -{ - if (cpu_has_xmm) { - /* Handle SIMD FPU exceptions on PIII+ processors. */ - ignore_irq13 = 1; - simd_math_error((void *)regs->eip); - } else { - die_if_kernel("cache flush denied", regs, error_code); - current->thread.trap_no = 19; - current->thread.error_code = error_code; - force_sig(SIGSEGV, current); - } -} - -/* - * 'math_state_restore()' saves the current math information in the - * old math state array, and gets the new ones from the current task - * - * Careful.. There are problems with IBM-designed IRQ13 behaviour. - * Don't touch unless you *really* know how it works. - */ -asmlinkage void math_state_restore(struct pt_regs regs) -{ - /* - * A trap in kernel mode can be ignored. It'll be the fast XOR or - * copying libraries, which will correctly save/restore state and - * reset the TS bit in CR0.
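The selector that math_error switches on packs the interesting x87 state into one value: the unmasked pending exceptions ((~cwd) & swd & 0x3f) plus the C1 and stack-fault bits (swd & 0x240). A worked decode with concrete register values (0x037b is the default control word with only the zero-divide exception unmasked; an illustration, not the handler itself):

    #include <stdio.h>

    int main(void)
    {
        unsigned short cwd = 0x037b;  /* control word, ZM bit (0x4) clear */
        unsigned short swd = 0x0004;  /* status word: zero-divide pending */

        unsigned short which = ((~cwd) & swd & 0x3f) | (swd & 0x240);

        switch (which) {
        case 0x004: printf("FPE_FLTDIV (zero divide)\n"); break;
        case 0x008: printf("FPE_FLTOVF (overflow)\n");    break;
        default:    printf("selector 0x%03x\n", which);   break;
        }
        return 0;   /* prints FPE_FLTDIV */
    }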
- */ - if ( (regs.xcs & 2) == 0 ) - return; - - if (current->used_math) { - restore_fpu(current); - } else { - init_fpu(); - } - current->flags |= PF_USEDFPU; /* So we fnsave on switch_to() */ -} - - -#define _set_gate(gate_addr,type,dpl,addr) \ -do { \ - int __d0, __d1; \ - __asm__ __volatile__ ("movw %%dx,%%ax\n\t" \ - "movw %4,%%dx\n\t" \ - "movl %%eax,%0\n\t" \ - "movl %%edx,%1" \ - :"=m" (*((long *) (gate_addr))), \ - "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), "=&d" (__d1) \ - :"i" ((short) (0x8000+(dpl<<13)+(type<<8))), \ - "3" ((char *) (addr)),"2" (__KERNEL_CS << 16)); \ -} while (0) - -static void __init set_call_gate(void *a, void *addr) -{ - _set_gate(a,12,3,addr); -} - - -/* NB. All these are "trap gates" (i.e. events_mask isn't cleared). */ -static trap_info_t trap_table[] = { - { 0, 0, __KERNEL_CS, (unsigned long)divide_error }, - { 1, 0, __KERNEL_CS, (unsigned long)debug }, - { 3, 3, __KERNEL_CS, (unsigned long)int3 }, - { 4, 3, __KERNEL_CS, (unsigned long)overflow }, - { 5, 3, __KERNEL_CS, (unsigned long)bounds }, - { 6, 0, __KERNEL_CS, (unsigned long)invalid_op }, - { 7, 0, __KERNEL_CS, (unsigned long)device_not_available }, - { 8, 0, __KERNEL_CS, (unsigned long)double_fault }, - { 9, 0, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun }, - { 10, 0, __KERNEL_CS, (unsigned long)invalid_TSS }, - { 11, 0, __KERNEL_CS, (unsigned long)segment_not_present }, - { 12, 0, __KERNEL_CS, (unsigned long)stack_segment }, - { 13, 0, __KERNEL_CS, (unsigned long)general_protection }, - { 14, 0, __KERNEL_CS, (unsigned long)page_fault }, - { 15, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment }, - { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error }, - { 17, 0, __KERNEL_CS, (unsigned long)alignment_check }, - { 18, 0, __KERNEL_CS, (unsigned long)machine_check }, - { 19, 0, __KERNEL_CS, (unsigned long)simd_coprocessor_error }, - { SYSCALL_VECTOR, - 3, __KERNEL_CS, (unsigned long)system_call }, - { 0, 0, 0, 0 } -}; - - -void __init trap_init(void) -{ - HYPERVISOR_set_trap_table(trap_table); - - /* - * The default LDT is a single-entry callgate to lcall7 for iBCS and a - * callgate to lcall27 for Solaris/x86 binaries. - */ - clear_page(&default_ldt[0]); - set_call_gate(&default_ldt[0],lcall7); - set_call_gate(&default_ldt[4],lcall27); - __make_page_readonly(&default_ldt[0]); - - cpu_init(); -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/lib/Makefile --- a/linux-2.4.30-xen-sparse/arch/xen/lib/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,15 +0,0 @@ - -.S.o: - $(CC) $(AFLAGS) -c $< -o $*.o - -L_TARGET = lib.a - -obj-y = checksum.o old-checksum.o delay.o \ - usercopy.o getuser.o \ - memcpy.o strstr.o xen_proc.o - -obj-$(CONFIG_X86_USE_3DNOW) += mmx.o -obj-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o -obj-$(CONFIG_DEBUG_IOVIRT) += iodebug.o - -include $(TOPDIR)/Rules.make diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/lib/delay.c --- a/linux-2.4.30-xen-sparse/arch/xen/lib/delay.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,52 +0,0 @@ -/* - * Precise Delay Loops for i386 - * - * Copyright (C) 1993 Linus Torvalds - * Copyright (C) 1997 Martin Mares <mj@xxxxxxxxxxxxxxxxxxxxxxxx> - * - * The __delay function must _NOT_ be inlined as its execution time - * depends wildly on alignment on many x86 processors. The additional - * jump magic is needed to get the timing stable on all the CPU's - * we have to worry about. 
- */ - -#include <linux/config.h> -#include <linux/sched.h> -#include <linux/delay.h> -#include <asm/processor.h> -#include <asm/delay.h> - -#ifdef CONFIG_SMP -#include <asm/smp.h> -#endif - -void __delay(unsigned long loops) -{ - unsigned long bclock, now; - - rdtscl(bclock); - do - { - rep_nop(); - rdtscl(now); - } while ((now-bclock) < loops); -} - -inline void __const_udelay(unsigned long xloops) -{ - int d0; - __asm__("mull %0" - :"=d" (xloops), "=&a" (d0) - :"1" (xloops),"0" (current_cpu_data.loops_per_jiffy)); - __delay(xloops * HZ); -} - -void __udelay(unsigned long usecs) -{ - __const_udelay(usecs * 0x000010c6); /* 2**32 / 1000000 */ -} - -void __ndelay(unsigned long nsecs) -{ - __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/mm/Makefile --- a/linux-2.4.30-xen-sparse/arch/xen/mm/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,16 +0,0 @@ -# -# Makefile for the linux i386-specific parts of the memory manager. -# -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# -# Note 2! The CFLAGS definition is now in the main makefile... - -O_TARGET := mm.o - -obj-y := init.o fault.o extable.o pageattr.o hypervisor.o ioremap.o - -export-objs := pageattr.o - -include $(TOPDIR)/Rules.make diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/mm/fault.c --- a/linux-2.4.30-xen-sparse/arch/xen/mm/fault.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,302 +0,0 @@ -/* - * linux/arch/i386/mm/fault.c - * - * Copyright (C) 1995 Linus Torvalds - */ - -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/ptrace.h> -#include <linux/mman.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/smp_lock.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/tty.h> -#include <linux/vt_kern.h> /* For unblank_screen() */ - -#include <asm/system.h> -#include <asm/uaccess.h> -#include <asm/pgalloc.h> -#include <asm/hardirq.h> - -extern void die(const char *,struct pt_regs *,long); - -pgd_t *cur_pgd; - -extern spinlock_t timerlist_lock; - -/* - * Unlock any spinlocks which will prevent us from getting the - * message out (timerlist_lock is acquired through the - * console unblank code) - */ -void bust_spinlocks(int yes) -{ - spin_lock_init(&timerlist_lock); - if (yes) { - oops_in_progress = 1; - } else { - int loglevel_save = console_loglevel; -#ifdef CONFIG_VT - unblank_screen(); -#endif - oops_in_progress = 0; - /* - * OK, the message is on the console. Now we call printk() - * without oops_in_progress set so that printk will give klogd - * a poke. Hold onto your hats... - */ - console_loglevel = 15; /* NMI oopser may have shut the console up */ - printk(" "); - console_loglevel = loglevel_save; - } -} - -/* - * This routine handles page faults. It determines the address, - * and the problem, and then passes it off to one of the appropriate - * routines. 
- * - * error_code: - * bit 0 == 0 means no page found, 1 means protection fault - * bit 1 == 0 means read, 1 means write - * bit 2 == 0 means kernel, 1 means user-mode - */ -asmlinkage void do_page_fault(struct pt_regs *regs, - unsigned long error_code, - unsigned long address) -{ - struct task_struct *tsk = current; - struct mm_struct *mm; - struct vm_area_struct * vma; - unsigned long page; - unsigned long fixup; - int write; - siginfo_t info; - - /* Set the "privileged fault" bit to something sane. */ - error_code &= 3; - error_code |= (regs->xcs & 2) << 1; - - /* - * We fault-in kernel-space virtual memory on-demand. The - * 'reference' page table is init_mm.pgd. - * - * NOTE! We MUST NOT take any locks for this case. We may - * be in an interrupt or a critical region, and should - * only copy the information from the master page table, - * nothing more. - * - * This verifies that the fault happens in kernel space - * (error_code & 4) == 0, and that the fault was not a - * protection error (error_code & 1) == 0. - */ - if (address >= TASK_SIZE && !(error_code & 5)) - goto vmalloc_fault; - - mm = tsk->mm; - info.si_code = SEGV_MAPERR; - - /* - * If we're in an interrupt or have no user - * context, we must not take the fault.. - */ - if (in_interrupt() || !mm) - goto no_context; - - down_read(&mm->mmap_sem); - - vma = find_vma(mm, address); - if (!vma) - goto bad_area; - if (vma->vm_start <= address) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (error_code & 4) { - /* - * accessing the stack below %esp is always a bug. - * The "+ 32" is there due to some instructions (like - * pusha) doing post-decrement on the stack and that - * doesn't show up until later.. - */ - if (address + 32 < regs->esp) - goto bad_area; - } - if (expand_stack(vma, address)) - goto bad_area; -/* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ -good_area: - info.si_code = SEGV_ACCERR; - write = 0; - switch (error_code & 3) { - default: /* 3: write, present */ - /* fall through */ - case 2: /* write, not present */ - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - write++; - break; - case 1: /* read, present */ - goto bad_area; - case 0: /* read, not present */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC))) - goto bad_area; - } - - survive: - /* - * If for any reason at all we couldn't handle the fault, - * make sure we exit gracefully rather than endlessly redo - * the fault. - */ - switch (handle_mm_fault(mm, vma, address, write)) { - case 1: - tsk->min_flt++; - break; - case 2: - tsk->maj_flt++; - break; - case 0: - goto do_sigbus; - default: - goto out_of_memory; - } - - up_read(&mm->mmap_sem); - return; - -/* - * Something tried to access memory that isn't in our memory map.. - * Fix it, but check if it's kernel or user first.. - */ -bad_area: - up_read(&mm->mmap_sem); - - /* User mode accesses just cause a SIGSEGV */ - if (error_code & 4) { - tsk->thread.cr2 = address; - /* Kernel addresses are always protection faults */ - tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; - info.si_signo = SIGSEGV; - info.si_errno = 0; - /* info.si_code has been set above */ - info.si_addr = (void *)address; - force_sig_info(SIGSEGV, &info, tsk); - return; - } - -no_context: - /* Are we prepared to handle this kernel fault? */ - if ((fixup = search_exception_table(regs->eip)) != 0) { - regs->eip = fixup; - return; - } - -/* - * Oops. The kernel tried to access some bad page. 
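The three error_code bits documented at the top of do_page_fault fully classify a fault. A tiny decoder makes the encoding concrete (values chosen for illustration):

    #include <stdio.h>

    static void decode(unsigned long error_code)
    {
        printf("%s, %s, %s\n",
               (error_code & 1) ? "protection fault" : "page not present",
               (error_code & 2) ? "write"            : "read",
               (error_code & 4) ? "user mode"        : "kernel mode");
    }

    int main(void)
    {
        decode(6);  /* not-present write from user space (e.g. first touch) */
        decode(1);  /* protection fault on a kernel-mode read */
        return 0;
    }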
We'll have to - * terminate things with extreme prejudice. - */ - - bust_spinlocks(1); - - if (address < PAGE_SIZE) - printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); - else - printk(KERN_ALERT "Unable to handle kernel paging request"); - printk(" at virtual address %08lx\n",address); - printk(" printing eip:\n"); - printk("%08lx\n", regs->eip); - page = ((unsigned long *) cur_pgd)[address >> 22]; - printk(KERN_ALERT "*pde=%08lx(%08lx)\n", page, machine_to_phys(page)); - if (page & 1) { - page &= PAGE_MASK; - address &= 0x003ff000; - page = machine_to_phys(page); - page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT]; - printk(KERN_ALERT "*pte=%08lx(%08lx)\n", page, - machine_to_phys(page)); - } - die("Oops", regs, error_code); - bust_spinlocks(0); - do_exit(SIGKILL); - -/* - * We ran out of memory, or some other thing happened to us that made - * us unable to handle the page fault gracefully. - */ -out_of_memory: - if (tsk->pid == 1) { - yield(); - goto survive; - } - up_read(&mm->mmap_sem); - printk("VM: killing process %s\n", tsk->comm); - if (error_code & 4) - do_exit(SIGKILL); - goto no_context; - -do_sigbus: - up_read(&mm->mmap_sem); - - /* - * Send a sigbus, regardless of whether we were in kernel - * or user mode. - */ - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; - info.si_signo = SIGBUS; - info.si_errno = 0; - info.si_code = BUS_ADRERR; - info.si_addr = (void *)address; - force_sig_info(SIGBUS, &info, tsk); - - /* Kernel mode? Handle exceptions or die */ - if (!(error_code & 4)) - goto no_context; - return; - -vmalloc_fault: - { - /* - * Synchronize this task's top level page-table - * with the 'reference' page table. - * - * Do _not_ use "tsk" here. We might be inside - * an interrupt in the middle of a task switch.. - */ - int offset = __pgd_offset(address); - pgd_t *pgd, *pgd_k; - pmd_t *pmd, *pmd_k; - pte_t *pte_k; - - pgd = offset + cur_pgd; - pgd_k = init_mm.pgd + offset; - - if (!pgd_present(*pgd_k)) - goto no_context; - set_pgd(pgd, *pgd_k); - - pmd = pmd_offset(pgd, address); - pmd_k = pmd_offset(pgd_k, address); - if (!pmd_present(*pmd_k)) - goto no_context; - set_pmd(pmd, *pmd_k); - - pte_k = pte_offset(pmd_k, address); - if (!pte_present(*pte_k)) - goto no_context; - return; - } -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/mm/init.c --- a/linux-2.4.30-xen-sparse/arch/xen/mm/init.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,482 +0,0 @@ -/* - * linux/arch/i386/mm/init.c - * - * Copyright (C) 1995 Linus Torvalds - * - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 - */ - -#include <linux/config.h> -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/ptrace.h> -#include <linux/mman.h> -#include <linux/mm.h> -#include <linux/swap.h> -#include <linux/smp.h> -#include <linux/init.h> -#ifdef CONFIG_BLK_DEV_INITRD -#include <linux/blk.h> -#endif -#include <linux/highmem.h> -#include <linux/pagemap.h> -#include <linux/bootmem.h> -#include <linux/slab.h> - -#include <asm/processor.h> -#include <asm/system.h> -#include <asm/uaccess.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> -#include <asm/dma.h> -#include <asm/apic.h> -#include <asm/tlb.h> - -/* XEN: We *cannot* use mmx_clear_page() this early. Force dumb memset(). 
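The oops path above walks the two-level i386 page table by hand: the top 10 address bits (address >> 22) index the page directory and the next 10 (address & 0x003ff000, shifted down by PAGE_SHIFT) index the page table. The index arithmetic in isolation:

    #include <stdio.h>

    #define PAGE_SHIFT 12

    int main(void)
    {
        unsigned long address = 0xc0123456UL;   /* example faulting address */

        unsigned long pgd_index = address >> 22;
        unsigned long pte_index = (address & 0x003ff000UL) >> PAGE_SHIFT;
        unsigned long offset    = address & 0xfffUL;

        printf("pgd[%lu] -> pte[%lu] -> offset 0x%lx\n",
               pgd_index, pte_index, offset);   /* pgd[768] pte[291] 0x456 */
        return 0;
    }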
*/ -#undef clear_page -#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE) - -mmu_gather_t mmu_gathers[NR_CPUS]; -unsigned long highstart_pfn, highend_pfn; -static unsigned long totalram_pages; -static unsigned long totalhigh_pages; - -int do_check_pgt_cache(int low, int high) -{ - int freed = 0; - if(pgtable_cache_size > high) { - do { - if (!QUICKLIST_EMPTY(pgd_quicklist)) { - free_pgd_slow(get_pgd_fast()); - freed++; - } - if (!QUICKLIST_EMPTY(pte_quicklist)) { - pte_free_slow(pte_alloc_one_fast(NULL, 0)); - freed++; - } - } while(pgtable_cache_size > low); - } - return freed; -} - -/* - * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the - * physical space so we can cache the place of the first one and move - * around without checking the pgd every time. - */ - -#if CONFIG_HIGHMEM -pte_t *kmap_pte; -pgprot_t kmap_prot; - -#define kmap_get_fixmap_pte(vaddr) \ - pte_offset(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)) - -void __init kmap_init(void) -{ - unsigned long kmap_vstart; - - /* cache the first kmap pte */ - kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); - kmap_pte = kmap_get_fixmap_pte(kmap_vstart); - - kmap_prot = PAGE_KERNEL; -} -#endif /* CONFIG_HIGHMEM */ - -void show_mem(void) -{ - int i, total = 0, reserved = 0; - int shared = 0, cached = 0; - int highmem = 0; - - printk("Mem-info:\n"); - show_free_areas(); - printk("Free swap: %6dkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); - i = max_mapnr; - while (i-- > 0) { - total++; - if (PageHighMem(mem_map+i)) - highmem++; - if (PageReserved(mem_map+i)) - reserved++; - else if (PageSwapCache(mem_map+i)) - cached++; - else if (page_count(mem_map+i)) - shared += page_count(mem_map+i) - 1; - } - printk("%d pages of RAM\n", total); - printk("%d pages of HIGHMEM\n",highmem); - printk("%d reserved pages\n",reserved); - printk("%d pages shared\n",shared); - printk("%d pages swap cached\n",cached); - printk("%ld pages in page table cache\n",pgtable_cache_size); - show_buffers(); -} - -/* References to section boundaries */ - -extern char _text, _etext, _edata, __bss_start, _end; -extern char __init_begin, __init_end; - -static inline void set_pte_phys (unsigned long vaddr, - unsigned long phys, pgprot_t prot) -{ - pgd_t *pgd; - pmd_t *pmd; - pte_t *pte; - - pgd = init_mm.pgd + __pgd_offset(vaddr); - if (pgd_none(*pgd)) { - printk("PAE BUG #00!\n"); - return; - } - pmd = pmd_offset(pgd, vaddr); - if (pmd_none(*pmd)) { - printk("PAE BUG #01!\n"); - return; - } - pte = pte_offset(pmd, vaddr); - - set_pte(pte, (pte_t) { phys | pgprot_val(prot) }); - - /* - * It's enough to flush this one mapping. 
- * (PGE mappings get flushed as well) - */ - __flush_tlb_one(vaddr); -} - -void __set_fixmap(enum fixed_addresses idx, unsigned long phys, - pgprot_t flags) -{ - unsigned long address = __fix_to_virt(idx); - - if (idx >= __end_of_fixed_addresses) { - printk("Invalid __set_fixmap\n"); - return; - } - set_pte_phys(address, phys, flags); -} - -void clear_fixmap(enum fixed_addresses idx) -{ - set_pte_phys(__fix_to_virt(idx), 0, __pgprot(0)); -} - -static void __init fixrange_init (unsigned long start, - unsigned long end, pgd_t *pgd_base) -{ - pgd_t *pgd, *kpgd; - pmd_t *pmd, *kpmd; - pte_t *pte, *kpte; - int i, j; - unsigned long vaddr; - - vaddr = start; - i = __pgd_offset(vaddr); - j = __pmd_offset(vaddr); - pgd = pgd_base + i; - - for ( ; (i < PTRS_PER_PGD) && (vaddr != end); pgd++, i++) { -#if CONFIG_X86_PAE - if (pgd_none(*pgd)) { - pmd = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); - set_pgd(pgd, __pgd(__pa(pmd) + 0x1)); - if (pmd != pmd_offset(pgd, 0)) - printk("PAE BUG #02!\n"); - } - pmd = pmd_offset(pgd, vaddr); -#else - pmd = (pmd_t *)pgd; -#endif - for (; (j < PTRS_PER_PMD) && (vaddr != end); pmd++, j++) { - if (pmd_none(*pmd)) { - pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); - clear_page(pte); - kpgd = pgd_offset_k((unsigned long)pte); - kpmd = pmd_offset(kpgd, (unsigned long)pte); - kpte = pte_offset(kpmd, (unsigned long)pte); - set_pte(kpte, pte_wrprotect(*kpte)); - set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte))); - } - vaddr += PMD_SIZE; - } - j = 0; - } -} - - -static void __init pagetable_init (void) -{ - unsigned long vaddr, end, ram_end; - pgd_t *kpgd, *pgd, *pgd_base; - int i, j, k; - pmd_t *kpmd, *pmd; - pte_t *kpte, *pte, *pte_base; - - ram_end = end = (unsigned long)__va(max_low_pfn * PAGE_SIZE); - if ( xen_start_info.nr_pages < max_low_pfn ) - ram_end = (unsigned long)__va(xen_start_info.nr_pages * PAGE_SIZE); - - pgd_base = init_mm.pgd; - i = __pgd_offset(PAGE_OFFSET); - pgd = pgd_base + i; - - for (; i < PTRS_PER_PGD; pgd++, i++) { - vaddr = i*PGDIR_SIZE; - if (vaddr >= end) - break; - pmd = (pmd_t *)pgd; - for (j = 0; j < PTRS_PER_PMD; pmd++, j++) { - vaddr = i*PGDIR_SIZE + j*PMD_SIZE; - if (vaddr >= end) - break; - - /* Filled in for us already? 
*/ - if ( pmd_val(*pmd) & _PAGE_PRESENT ) - continue; - - pte_base = pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); - clear_page(pte_base); - - for (k = 0; k < PTRS_PER_PTE; pte++, k++) { - vaddr = i*PGDIR_SIZE + j*PMD_SIZE + k*PAGE_SIZE; - if (vaddr >= ram_end) - break; - *pte = mk_pte_phys(__pa(vaddr), PAGE_KERNEL); - } - kpgd = pgd_offset_k((unsigned long)pte_base); - kpmd = pmd_offset(kpgd, (unsigned long)pte_base); - kpte = pte_offset(kpmd, (unsigned long)pte_base); - set_pte(kpte, pte_wrprotect(*kpte)); - set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte_base))); - } - } - - /* - * Fixed mappings, only the page table structure has to be - * created - mappings will be set by set_fixmap(): - */ - vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; - fixrange_init(vaddr, HYPERVISOR_VIRT_START, init_mm.pgd); - -#if CONFIG_HIGHMEM - /* - * Permanent kmaps: - */ - vaddr = PKMAP_BASE; - fixrange_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, init_mm.pgd); - - pgd = init_mm.pgd + __pgd_offset(vaddr); - pmd = pmd_offset(pgd, vaddr); - pte = pte_offset(pmd, vaddr); - pkmap_page_table = pte; -#endif -} - -static void __init zone_sizes_init(void) -{ - unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; - unsigned int max_dma, high, low; - - max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; - low = max_low_pfn; - high = highend_pfn; - - if (low < max_dma) - zones_size[ZONE_DMA] = low; - else { - zones_size[ZONE_DMA] = max_dma; - zones_size[ZONE_NORMAL] = low - max_dma; -#ifdef CONFIG_HIGHMEM - zones_size[ZONE_HIGHMEM] = high - low; -#endif - } - free_area_init(zones_size); -} - -void __init paging_init(void) -{ - pagetable_init(); - - zone_sizes_init(); - - /* Switch to the real shared_info page, and clear the dummy page. */ - set_fixmap(FIX_SHARED_INFO, xen_start_info.shared_info); - HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); - memset(empty_zero_page, 0, sizeof(empty_zero_page)); - -#ifdef CONFIG_HIGHMEM - kmap_init(); -#endif -} - -static inline int page_is_ram (unsigned long pagenr) -{ - return 1; -} - -#ifdef CONFIG_HIGHMEM -void __init one_highpage_init(struct page *page, int free_page) -{ - ClearPageReserved(page); - set_bit(PG_highmem, &page->flags); - atomic_set(&page->count, 1); - if ( free_page ) - __free_page(page); - totalhigh_pages++; -} -#endif /* CONFIG_HIGHMEM */ - -static void __init set_max_mapnr_init(void) -{ -#ifdef CONFIG_HIGHMEM - highmem_start_page = mem_map + highstart_pfn; - max_mapnr = num_physpages = highend_pfn; - num_mappedpages = max_low_pfn; -#else - max_mapnr = num_mappedpages = num_physpages = max_low_pfn; -#endif -} - -static int __init free_pages_init(void) -{ -#ifdef CONFIG_HIGHMEM - int bad_ppro = 0; -#endif - int reservedpages, pfn; - - /* add only boot_pfn pages of low memory to free list. - * max_low_pfn may be sized for - * pages yet to be allocated from the hypervisor, or it may be set - * to override the xen_start_info amount of memory - */ - int boot_pfn = min(xen_start_info.nr_pages,max_low_pfn); - - /* this will put all low memory onto the freelists */ - totalram_pages += free_all_bootmem(); - /* XEN: init and count low-mem pages outside initial allocation. 
*/ - for (pfn = boot_pfn; pfn < max_low_pfn; pfn++) { - ClearPageReserved(&mem_map[pfn]); - atomic_set(&mem_map[pfn].count, 1); - totalram_pages++; - } - - reservedpages = 0; - for (pfn = 0; pfn < boot_pfn ; pfn++) { - /* - * Only count reserved RAM pages - */ - if (page_is_ram(pfn) && PageReserved(mem_map+pfn)) - reservedpages++; - } -#ifdef CONFIG_HIGHMEM - for (pfn = highend_pfn-1; pfn >= highstart_pfn; pfn--) - one_highpage_init((struct page *) (mem_map + pfn), - (pfn < xen_start_info.nr_pages)); - totalram_pages += totalhigh_pages; -#endif - return reservedpages; -} - -void __init mem_init(void) -{ - int codesize, reservedpages, datasize, initsize; - - if (!mem_map) - BUG(); - -#ifdef CONFIG_HIGHMEM - /* check that fixmap and pkmap do not overlap */ - if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { - printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n"); - printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n", - PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START); - BUG(); - } -#endif - - set_max_mapnr_init(); - - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); - - /* clear the zero-page */ - memset(empty_zero_page, 0, PAGE_SIZE); - - reservedpages = free_pages_init(); - - codesize = (unsigned long) &_etext - (unsigned long) &_text; - datasize = (unsigned long) &_edata - (unsigned long) &_etext; - initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; - - printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n", - (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), - max_mapnr << (PAGE_SHIFT-10), - codesize >> 10, - reservedpages << (PAGE_SHIFT-10), - datasize >> 10, - initsize >> 10, - (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) - ); - - boot_cpu_data.wp_works_ok = 1; -} - -void free_initmem(void) -{ - unsigned long addr; - - addr = (unsigned long)(&__init_begin); - for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { - ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); - free_page(addr); - totalram_pages++; - } - printk (KERN_INFO "Freeing unused kernel memory: %dk freed\n", (&__init_end - &__init_begin) >> 10); -} - -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - if (start < end) - printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); - for (; start < end; start += PAGE_SIZE) { - ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); - free_page(start); - totalram_pages++; - } -} -#endif - -void si_meminfo(struct sysinfo *val) -{ - val->totalram = max_pfn; - val->sharedram = 0; - val->freeram = nr_free_pages(); - val->bufferram = atomic_read(&buffermem_pages); - val->totalhigh = max_pfn-max_low_pfn; - val->freehigh = nr_free_highpages(); - val->mem_unit = PAGE_SIZE; - return; -} - -#if defined(CONFIG_X86_PAE) -struct kmem_cache_s *pae_pgd_cachep; -void __init pgtable_cache_init(void) -{ - /* - * PAE pgds must be 16-byte aligned: - */ - pae_pgd_cachep = kmem_cache_create("pae_pgd", 32, 0, - SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, NULL, NULL); - if (!pae_pgd_cachep) - panic("init_pae(): Cannot alloc pae_pgd SLAB cache"); -} -#endif /* CONFIG_X86_PAE */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/mm/ioremap.c --- a/linux-2.4.30-xen-sparse/arch/xen/mm/ioremap.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,266 +0,0 @@ -/* - * arch/xen/mm/ioremap.c - * - * Re-map IO 
memory to kernel address space so that we can access it. - * - * (C) Copyright 1995 1996 Linus Torvalds - * - * Modifications for Xenolinux (c) 2003-2004 Keir Fraser - */ - -#include <linux/slab.h> -#include <linux/mm.h> -#include <linux/mman.h> -#include <linux/vmalloc.h> -#include <asm/io.h> -#include <asm/pgalloc.h> -#include <asm/uaccess.h> -#include <asm/tlb.h> -#include <asm/mmu.h> - -#if defined(CONFIG_XEN_PRIVILEGED_GUEST) - -/* These hacky macros avoid phys->machine translations. */ -#define __direct_pte(x) ((pte_t) { (x) } ) -#define __direct_mk_pte(page_nr,pgprot) \ - __direct_pte(((page_nr) << PAGE_SHIFT) | pgprot_val(pgprot)) -#define direct_mk_pte_phys(physpage, pgprot) \ - __direct_mk_pte((physpage) >> PAGE_SHIFT, pgprot) - -static inline void direct_remap_area_pte(pte_t *pte, - unsigned long address, - unsigned long size, - mmu_update_t **v) -{ - unsigned long end; - - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; - if (address >= end) - BUG(); - - do { - (*v)->ptr = virt_to_machine(pte); - (*v)++; - address += PAGE_SIZE; - pte++; - } while (address && (address < end)); -} - -static inline int direct_remap_area_pmd(struct mm_struct *mm, - pmd_t *pmd, - unsigned long address, - unsigned long size, - mmu_update_t **v) -{ - unsigned long end; - - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - if (address >= end) - BUG(); - do { - pte_t *pte = pte_alloc(mm, pmd, address); - if (!pte) - return -ENOMEM; - direct_remap_area_pte(pte, address, end - address, v); - - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); - return 0; -} - -int __direct_remap_area_pages(struct mm_struct *mm, - unsigned long address, - unsigned long size, - mmu_update_t *v) -{ - pgd_t * dir; - unsigned long end = address + size; - - dir = pgd_offset(mm, address); - flush_cache_all(); - if (address >= end) - BUG(); - spin_lock(&mm->page_table_lock); - do { - pmd_t *pmd = pmd_alloc(mm, dir, address); - if (!pmd) - return -ENOMEM; - direct_remap_area_pmd(mm, pmd, address, end - address, &v); - address = (address + PGDIR_SIZE) & PGDIR_MASK; - dir++; - - } while (address && (address < end)); - spin_unlock(&mm->page_table_lock); - flush_tlb_all(); - return 0; -} - - -int direct_remap_area_pages(struct mm_struct *mm, - unsigned long address, - unsigned long machine_addr, - unsigned long size, - pgprot_t prot, - domid_t domid) -{ - int i; - unsigned long start_address; -#define MAX_DIRECTMAP_MMU_QUEUE 130 - mmu_update_t u[MAX_DIRECTMAP_MMU_QUEUE], *v = u; - - start_address = address; - - for( i = 0; i < size; i += PAGE_SIZE ) - { - if ( (v - u) == MAX_DIRECTMAP_MMU_QUEUE ) - { - /* Fill in the PTE pointers. */ - __direct_remap_area_pages( mm, - start_address, - address-start_address, - u); - - if ( HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0 ) - return -EFAULT; - v = u; - start_address = address; - } - - /* - * Fill in the machine address: PTE ptr is done later by - * __direct_remap_area_pages(). 
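direct_remap_area_pages batches PTE rewrites: it fills a fixed-size mmu_update_t queue and issues one HYPERVISOR_mmu_update hypercall whenever the queue fills, plus a final flush for the remainder, amortizing the hypercall cost over many pages. The batching skeleton on its own (queue size shrunk and the hypercall replaced by a printing stub; purely illustrative):

    #include <stdio.h>

    #define QUEUE_MAX 4   /* stands in for MAX_DIRECTMAP_MMU_QUEUE (130) */

    struct update { unsigned long ptr, val; };

    /* Stub for HYPERVISOR_mmu_update(): apply a whole batch in one call. */
    static int flush(struct update *q, int n)
    {
        (void)q;
        printf("flush: %d updates in one call\n", n);
        return 0;
    }

    int main(void)
    {
        struct update q[QUEUE_MAX];
        int n = 0, i;

        for (i = 0; i < 10; i++) {      /* ten PTEs to rewrite */
            if (n == QUEUE_MAX) {       /* queue full: flush early */
                flush(q, n);
                n = 0;
            }
            q[n].ptr = 0x1000UL * i;    /* fake addresses/values */
            q[n].val = 0x2000UL * i;
            n++;
        }
        if (n)                          /* final partial batch */
            flush(q, n);
        return 0;                       /* flushes batches of 4, 4, 2 */
    }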
- */ - v->val = (machine_addr & PAGE_MASK) | pgprot_val(prot); - - machine_addr += PAGE_SIZE; - address += PAGE_SIZE; - v++; - } - - if ( v != u ) - { - /* get the ptep's filled in */ - __direct_remap_area_pages(mm, - start_address, - address-start_address, - u); - if ( unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0) ) - return -EFAULT; - } - - return 0; -} - - -#endif /* CONFIG_XEN_PRIVILEGED_GUEST */ - - -/* - * Remap an arbitrary machine address space into the kernel virtual - * address space. Needed when a privileged instance of Xenolinux wants - * to access space outside its world directly. - * - * NOTE! We need to allow non-page-aligned mappings too: we will obviously - * have to convert them into an offset in a page-aligned mapping, but the - * caller shouldn't need to know that small detail. - */ -void * __ioremap(unsigned long machine_addr, - unsigned long size, - unsigned long flags) -{ -#if defined(CONFIG_XEN_PRIVILEGED_GUEST) - void * addr; - struct vm_struct * area; - unsigned long offset, last_addr; - pgprot_t prot; - - /* Don't allow wraparound or zero size */ - last_addr = machine_addr + size - 1; - if (!size || last_addr < machine_addr) - return NULL; - - /* Mappings have to be page-aligned */ - offset = machine_addr & ~PAGE_MASK; - machine_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr+1) - machine_addr; - - /* Ok, go for it */ - area = get_vm_area(size, VM_IOREMAP); - if (!area) - return NULL; - addr = area->addr; - prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | - _PAGE_ACCESSED | flags); - if (direct_remap_area_pages(&init_mm, VMALLOC_VMADDR(addr), - machine_addr, size, prot, 0)) { - vfree(addr); - return NULL; - } - return (void *) (offset + (char *)addr); -#else - return NULL; -#endif -} - -void iounmap(void *addr) -{ - vfree((void *)((unsigned long)addr & PAGE_MASK)); -} - -/* implementation of boot time ioremap for purpose of provising access -to the vga console for privileged domains. Unlike boot time ioremap on -other architectures, ours is permanent and not reclaimed when then vmalloc -infrastructure is started */ - -void __init *bt_ioremap(unsigned long machine_addr, unsigned long size) -{ - unsigned long offset, last_addr; - unsigned int nrpages; - enum fixed_addresses idx; - - /* Don't allow wraparound or zero size */ - last_addr = machine_addr + size - 1; - if (!size || last_addr < machine_addr) - return NULL; - - /* - * Mappings have to be page-aligned - */ - offset = machine_addr & ~PAGE_MASK; - machine_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr) - machine_addr; - - /* - * Mappings have to fit in the FIX_BTMAP area. - */ - nrpages = size >> PAGE_SHIFT; - if (nrpages > NR_FIX_BTMAPS) - return NULL; - - /* - * Ok, go for it.. - */ - idx = FIX_BTMAP_BEGIN; - while (nrpages > 0) { - __set_fixmap(idx, machine_addr, PAGE_KERNEL); - machine_addr += PAGE_SIZE; - --idx; - --nrpages; - } - - flush_tlb_all(); - - return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN)); -} - - -#if 0 /* We don't support these functions. They shouldn't be required. */ -void __init bt_iounmap(void *addr, unsigned long size) {} -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/arch/xen/vmlinux.lds --- a/linux-2.4.30-xen-sparse/arch/xen/vmlinux.lds Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,75 +0,0 @@ -/* ld script to make i386 Linux kernel - * Written by Martin Mares <mj@xxxxxxxxxxxxxxxxxxxxxxxx>; - */ -OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") -OUTPUT_ARCH(i386) -ENTRY(_start) -SECTIONS -{ - . 
= 0xC0000000 + 0x100000; - _text = .; /* Text and read-only data */ - .text : { - *(.text) - *(.fixup) - *(.gnu.warning) - } = 0x9090 - - _etext = .; /* End of text section */ - - .rodata : { *(.rodata) *(.rodata.*) } - .kstrtab : { *(.kstrtab) } - - . = ALIGN(16); /* Exception table */ - __start___ex_table = .; - __ex_table : { *(__ex_table) } - __stop___ex_table = .; - - __start___ksymtab = .; /* Kernel symbol table */ - __ksymtab : { *(__ksymtab) } - __stop___ksymtab = .; - - .data : { /* Data */ - *(.data) - CONSTRUCTORS - } - - _edata = .; /* End of data section */ - - . = ALIGN(8192); /* init_task */ - .data.init_task : { *(.data.init_task) } - - . = ALIGN(4096); /* Init code and data */ - __init_begin = .; - .text.init : { *(.text.init) } - .data.init : { *(.data.init) } - . = ALIGN(16); - __setup_start = .; - .setup.init : { *(.setup.init) } - __setup_end = .; - __initcall_start = .; - .initcall.init : { *(.initcall.init) } - __initcall_end = .; - . = ALIGN(4096); - __init_end = .; - - . = ALIGN(4096); - .data.page_aligned : { *(.data.idt) } - - . = ALIGN(32); - .data.cacheline_aligned : { *(.data.cacheline_aligned) } - - __bss_start = .; /* BSS */ - .bss : { - *(.bss) - } - _end = . ; - - /* Stabs debugging sections. */ - .stab 0 : { *(.stab) } - .stabstr 0 : { *(.stabstr) } - .stab.excl 0 : { *(.stab.excl) } - .stab.exclstr 0 : { *(.stab.exclstr) } - .stab.index 0 : { *(.stab.index) } - .stab.indexstr 0 : { *(.stab.indexstr) } - .comment 0 : { *(.comment) } -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/drivers/block/ll_rw_blk.c --- a/linux-2.4.30-xen-sparse/drivers/block/ll_rw_blk.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,1663 +0,0 @@ -/* - * linux/drivers/block/ll_rw_blk.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 1994, Karl Keyte: Added support for disk statistics - * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@xxxxxxx> SuSE - * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@xxxxxxx> - * kernel-doc documentation started by NeilBrown <neilb@xxxxxxxxxxxxxxx> - July2000 - */ - -/* - * This handles all read/write requests to block devices - */ -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/kernel_stat.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/config.h> -#include <linux/locks.h> -#include <linux/mm.h> -#include <linux/swap.h> -#include <linux/init.h> -#include <linux/smp_lock.h> -#include <linux/completion.h> -#include <linux/bootmem.h> - -#include <asm/system.h> -#include <asm/io.h> -#include <linux/blk.h> -#include <linux/highmem.h> -#include <linux/slab.h> -#include <linux/module.h> - -/* - * MAC Floppy IWM hooks - */ - -#ifdef CONFIG_MAC_FLOPPY_IWM -extern int mac_floppy_init(void); -#endif - -/* - * For the allocated request tables - */ -static kmem_cache_t *request_cachep; - -/* - * The "disk" task queue is used to start the actual requests - * after a plug - */ -DECLARE_TASK_QUEUE(tq_disk); - -/* - * Protect the request list against multiple users.. - * - * With this spinlock the Linux block IO subsystem is 100% SMP threaded - * from the IRQ event side, and almost 100% SMP threaded from the syscall - * side (we still have protect against block device array operations, and - * the do_request() side is casually still unsafe. The kernel lock protects - * this part currently.). - * - * there is a fair chance that things will work just OK if these functions - * are called with no global kernel lock held ... 
- */ -spinlock_t io_request_lock = SPIN_LOCK_UNLOCKED; - -/* This specifies how many sectors to read ahead on the disk. */ - -int read_ahead[MAX_BLKDEV]; - -/* blk_dev_struct is: - * *request_fn - * *current_request - */ -struct blk_dev_struct blk_dev[MAX_BLKDEV]; /* initialized by blk_dev_init() */ - -/* - * blk_size contains the size of all block-devices in units of 1024 byte - * sectors: - * - * blk_size[MAJOR][MINOR] - * - * if (!blk_size[MAJOR]) then no minor size checking is done. - */ -int * blk_size[MAX_BLKDEV]; - -/* - * blksize_size contains the size of all block-devices: - * - * blksize_size[MAJOR][MINOR] - * - * if (!blksize_size[MAJOR]) then 1024 bytes is assumed. - */ -int * blksize_size[MAX_BLKDEV]; - -/* - * hardsect_size contains the size of the hardware sector of a device. - * - * hardsect_size[MAJOR][MINOR] - * - * if (!hardsect_size[MAJOR]) - * then 512 bytes is assumed. - * else - * sector_size is hardsect_size[MAJOR][MINOR] - * This is currently set by some scsi devices and read by the msdos fs driver. - * Other uses may appear later. - */ -int * hardsect_size[MAX_BLKDEV]; - -/* - * The following tunes the read-ahead algorithm in mm/filemap.c - */ -int * max_readahead[MAX_BLKDEV]; - -/* - * Max number of sectors per request - */ -int * max_sectors[MAX_BLKDEV]; - -unsigned long blk_max_low_pfn, blk_max_pfn; -int blk_nohighio = 0; - -int block_dump = 0; - -static struct timer_list writeback_timer; - -static inline int get_max_sectors(kdev_t dev) -{ - if (!max_sectors[MAJOR(dev)]) - return MAX_SECTORS; - return max_sectors[MAJOR(dev)][MINOR(dev)]; -} - -static inline request_queue_t *__blk_get_queue(kdev_t dev) -{ - struct blk_dev_struct *bdev = blk_dev + MAJOR(dev); - - if (bdev->queue) - return bdev->queue(dev); - else - return &blk_dev[MAJOR(dev)].request_queue; -} - -request_queue_t *blk_get_queue(kdev_t dev) -{ - return __blk_get_queue(dev); -} - -static int __blk_cleanup_queue(struct request_list *list) -{ - struct list_head *head = &list->free; - struct request *rq; - int i = 0; - - while (!list_empty(head)) { - rq = list_entry(head->next, struct request, queue); - list_del(&rq->queue); - kmem_cache_free(request_cachep, rq); - i++; - }; - - if (i != list->count) - printk("request list leak!\n"); - - list->count = 0; - return i; -} - -/** - * blk_cleanup_queue: - release a &request_queue_t when it is no longer needed - * @q: the request queue to be released - * - * Description: - * blk_cleanup_queue is the pair to blk_init_queue(). It should - * be called when a request queue is being released; typically - * when a block device is being de-registered. Currently, its - * primary task it to free all the &struct request structures that - * were allocated to the queue. - * Caveat: - * Hopefully the low level driver will have finished any - * outstanding requests first... - **/ -void blk_cleanup_queue(request_queue_t * q) -{ - int count = q->nr_requests; - - count -= __blk_cleanup_queue(&q->rq); - - if (count) - printk("blk_cleanup_queue: leaked requests (%d)\n", count); - if (atomic_read(&q->nr_sectors)) - printk("blk_cleanup_queue: leaked sectors (%d)\n", atomic_read(&q->nr_sectors)); - - memset(q, 0, sizeof(*q)); -} - -/** - * blk_queue_headactive - indicate whether head of request queue may be active - * @q: The queue which this applies to. - * @active: A flag indication where the head of the queue is active. 
- * - * Description: - * The driver for a block device may choose to leave the currently active - * request on the request queue, removing it only when it has completed. - * The queue handling routines assume this by default for safety reasons - * and will not involve the head of the request queue in any merging or - * reordering of requests when the queue is unplugged (and thus may be - * working on this particular request). - * - * If a driver removes requests from the queue before processing them, then - * it may indicate that it does so, there by allowing the head of the queue - * to be involved in merging and reordering. This is done be calling - * blk_queue_headactive() with an @active flag of %0. - * - * If a driver processes several requests at once, it must remove them (or - * at least all but one of them) from the request queue. - * - * When a queue is plugged the head will be assumed to be inactive. - **/ - -void blk_queue_headactive(request_queue_t * q, int active) -{ - q->head_active = active; -} - -/** - * blk_queue_throttle_sectors - indicates you will call sector throttling funcs - * @q: The queue which this applies to. - * @active: A flag indication if you want sector throttling on - * - * Description: - * The sector throttling code allows us to put a limit on the number of - * sectors pending io to the disk at a given time, sending @active nonzero - * indicates you will call blk_started_sectors and blk_finished_sectors in - * addition to calling blk_started_io and blk_finished_io in order to - * keep track of the number of sectors in flight. - **/ - -void blk_queue_throttle_sectors(request_queue_t * q, int active) -{ - q->can_throttle = active; -} - -/** - * blk_queue_make_request - define an alternate make_request function for a device - * @q: the request queue for the device to be affected - * @mfn: the alternate make_request function - * - * Description: - * The normal way for &struct buffer_heads to be passed to a device - * driver is for them to be collected into requests on a request - * queue, and then to allow the device driver to select requests - * off that queue when it is ready. This works well for many block - * devices. However some block devices (typically virtual devices - * such as md or lvm) do not benefit from the processing on the - * request queue, and are served best by having the requests passed - * directly to them. This can be achieved by providing a function - * to blk_queue_make_request(). - * - * Caveat: - * The driver that does this *must* be able to deal appropriately - * with buffers in "highmemory", either by calling bh_kmap() to get - * a kernel mapping, to by calling create_bounce() to create a - * buffer in normal memory. - **/ - -void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn) -{ - q->make_request_fn = mfn; -} - -/** - * blk_queue_bounce_limit - set bounce buffer limit for queue - * @q: the request queue for the device - * @dma_addr: bus address limit - * - * Description: - * Different hardware can have different requirements as to what pages - * it can do I/O directly to. A low level driver can call - * blk_queue_bounce_limit to have lower memory pages allocated as bounce - * buffers for doing I/O to pages residing above @page. By default - * the block layer sets this to the highest numbered "low" memory page. 
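
Taking the description above at face value, a driver whose hardware can
only reach the low 16MB (an ISA-style DMA limit) would set its bounce
limit once while setting up its queue. A hedged sketch; the hypo_*
names are invented and not from this file:

    static request_queue_t hypo_queue;
    static void hypo_request_fn(request_queue_t *q);   /* driver-defined */

    static void hypo_setup_queue(void)
    {
        blk_init_queue(&hypo_queue, hypo_request_fn);
        /* pages above the 24-bit ISA DMA mask get bounce buffers */
        blk_queue_bounce_limit(&hypo_queue, 0x00fffffful);
    }

A driver with no addressing restriction would pass BLK_BOUNCE_ANY
instead, as handled in the function body below.
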
- **/ -void blk_queue_bounce_limit(request_queue_t *q, u64 dma_addr) -{ - unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT; - unsigned long mb = dma_addr >> 20; - static request_queue_t *old_q; - - /* - * keep this for debugging for now... - */ - if (dma_addr != BLK_BOUNCE_HIGH && q != old_q) { - old_q = q; - printk("blk: queue %p, ", q); - if (dma_addr == BLK_BOUNCE_ANY) - printk("no I/O memory limit\n"); - else - printk("I/O limit %luMb (mask 0x%Lx)\n", mb, - (long long) dma_addr); - } - - q->bounce_pfn = bounce_pfn; -} - - -/* - * can we merge the two segments, or do we need to start a new one? - */ -static inline int __blk_seg_merge_ok(struct buffer_head *bh, struct buffer_head *nxt) -{ - /* - * if bh and nxt are contigous and don't cross a 4g boundary, it's ok - */ - if (BH_CONTIG(bh, nxt) && BH_PHYS_4G(bh, nxt)) - return 1; - - return 0; -} - -int blk_seg_merge_ok(struct buffer_head *bh, struct buffer_head *nxt) -{ - return __blk_seg_merge_ok(bh, nxt); -} - -static inline int ll_new_segment(request_queue_t *q, struct request *req, int max_segments) -{ - if (req->nr_segments < max_segments) { - req->nr_segments++; - return 1; - } - return 0; -} - -static int ll_back_merge_fn(request_queue_t *q, struct request *req, - struct buffer_head *bh, int max_segments) -{ - if (__blk_seg_merge_ok(req->bhtail, bh)) - return 1; - - return ll_new_segment(q, req, max_segments); -} - -static int ll_front_merge_fn(request_queue_t *q, struct request *req, - struct buffer_head *bh, int max_segments) -{ - if (__blk_seg_merge_ok(bh, req->bh)) - return 1; - - return ll_new_segment(q, req, max_segments); -} - -static int ll_merge_requests_fn(request_queue_t *q, struct request *req, - struct request *next, int max_segments) -{ - int total_segments = req->nr_segments + next->nr_segments; - - if (__blk_seg_merge_ok(req->bhtail, next->bh)) - total_segments--; - - if (total_segments > max_segments) - return 0; - - req->nr_segments = total_segments; - return 1; -} - -/* - * "plug" the device if there are no outstanding requests: this will - * force the transfer to start only after we have put all the requests - * on the list. - * - * This is called with interrupts off and no requests on the queue. - * (and with the request spinlock acquired) - */ -static void generic_plug_device(request_queue_t *q, kdev_t dev) -{ - /* - * no need to replug device - */ - if (!list_empty(&q->queue_head) || q->plugged) - return; - - q->plugged = 1; - queue_task(&q->plug_tq, &tq_disk); -} - -/* - * remove the plug and let it rip.. - */ -static inline void __generic_unplug_device(request_queue_t *q) -{ - if (q->plugged) { - q->plugged = 0; - if (!list_empty(&q->queue_head)) - q->request_fn(q); - } -} - -void generic_unplug_device(void *data) -{ - request_queue_t *q = (request_queue_t *) data; - unsigned long flags; - - spin_lock_irqsave(&io_request_lock, flags); - __generic_unplug_device(q); - spin_unlock_irqrestore(&io_request_lock, flags); -} - -/** blk_grow_request_list - * @q: The &request_queue_t - * @nr_requests: how many requests are desired - * - * More free requests are added to the queue's free lists, bringing - * the total number of requests to @nr_requests. - * - * The requests are added equally to the request queue's read - * and write freelists. - * - * This function can sleep. - * - * Returns the (new) number of requests which the queue has available. 
- */ -int blk_grow_request_list(request_queue_t *q, int nr_requests, int max_queue_sectors) -{ - unsigned long flags; - /* Several broken drivers assume that this function doesn't sleep, - * this causes system hangs during boot. - * As a temporary fix, make the function non-blocking. - */ - spin_lock_irqsave(&io_request_lock, flags); - while (q->nr_requests < nr_requests) { - struct request *rq; - - rq = kmem_cache_alloc(request_cachep, SLAB_ATOMIC); - if (rq == NULL) - break; - memset(rq, 0, sizeof(*rq)); - rq->rq_status = RQ_INACTIVE; - list_add(&rq->queue, &q->rq.free); - q->rq.count++; - - q->nr_requests++; - } - - /* - * Wakeup waiters after both one quarter of the - * max-in-fligh queue and one quarter of the requests - * are available again. - */ - - q->batch_requests = q->nr_requests / 4; - if (q->batch_requests > 32) - q->batch_requests = 32; - q->batch_sectors = max_queue_sectors / 4; - - q->max_queue_sectors = max_queue_sectors; - - BUG_ON(!q->batch_sectors); - atomic_set(&q->nr_sectors, 0); - - spin_unlock_irqrestore(&io_request_lock, flags); - return q->nr_requests; -} - -static void blk_init_free_list(request_queue_t *q) -{ - struct sysinfo si; - int megs; /* Total memory, in megabytes */ - int nr_requests, max_queue_sectors = MAX_QUEUE_SECTORS; - - INIT_LIST_HEAD(&q->rq.free); - q->rq.count = 0; - q->rq.pending[READ] = q->rq.pending[WRITE] = 0; - q->nr_requests = 0; - - si_meminfo(&si); - megs = si.totalram >> (20 - PAGE_SHIFT); - nr_requests = MAX_NR_REQUESTS; - if (megs < 30) { - nr_requests /= 2; - max_queue_sectors /= 2; - } - /* notice early if anybody screwed the defaults */ - BUG_ON(!nr_requests); - BUG_ON(!max_queue_sectors); - - blk_grow_request_list(q, nr_requests, max_queue_sectors); - - init_waitqueue_head(&q->wait_for_requests); - - spin_lock_init(&q->queue_lock); -} - -static int __make_request(request_queue_t * q, int rw, struct buffer_head * bh); - -/** - * blk_init_queue - prepare a request queue for use with a block device - * @q: The &request_queue_t to be initialised - * @rfn: The function to be called to process requests that have been - * placed on the queue. - * - * Description: - * If a block device wishes to use the standard request handling procedures, - * which sorts requests and coalesces adjacent requests, then it must - * call blk_init_queue(). The function @rfn will be called when there - * are requests on the queue that need to be processed. If the device - * supports plugging, then @rfn may not be called immediately when requests - * are available on the queue, but may be called at some time later instead. - * Plugged queues are generally unplugged when a buffer belonging to one - * of the requests on the queue is needed, or due to memory pressure. - * - * @rfn is not required, or even expected, to remove all requests off the - * queue, but only as many as it can handle at a time. If it does leave - * requests on the queue, it is responsible for arranging that the requests - * get dealt with eventually. - * - * A global spin lock $io_request_lock must be held while manipulating the - * requests on the request queue. - * - * The request on the head of the queue is by default assumed to be - * potentially active, and it is not considered for re-ordering or merging - * whenever the given queue is unplugged. This behaviour can be changed with - * blk_queue_headactive(). - * - * Note: - * blk_init_queue() must be paired with a blk_cleanup_queue() call - * when the block device is deactivated (such as at module unload). 
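
The pairing requirement in the note above is the whole lifecycle: one
blk_init_queue() when the device comes up, one blk_cleanup_queue() when
it goes away. A minimal sketch of that pattern for a 2.4-era driver
module; the hypo_* names are invented and not from this file:

    static request_queue_t hypo_queue;

    static void hypo_request_fn(request_queue_t *q)
    {
        /* pull requests off q->queue_head and service them */
    }

    static int __init hypo_init(void)
    {
        blk_init_queue(&hypo_queue, hypo_request_fn);
        return 0;
    }

    static void __exit hypo_exit(void)
    {
        blk_cleanup_queue(&hypo_queue);   /* required pairing */
    }

    module_init(hypo_init);
    module_exit(hypo_exit);
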
- **/ -void blk_init_queue(request_queue_t * q, request_fn_proc * rfn) -{ - INIT_LIST_HEAD(&q->queue_head); - elevator_init(&q->elevator, ELEVATOR_LINUS); - blk_init_free_list(q); - q->request_fn = rfn; - q->back_merge_fn = ll_back_merge_fn; - q->front_merge_fn = ll_front_merge_fn; - q->merge_requests_fn = ll_merge_requests_fn; - q->make_request_fn = __make_request; - q->plug_tq.sync = 0; - q->plug_tq.routine = &generic_unplug_device; - q->plug_tq.data = q; - q->plugged = 0; - q->can_throttle = 0; - - /* - * These booleans describe the queue properties. We set the - * default (and most common) values here. Other drivers can - * use the appropriate functions to alter the queue properties. - * as appropriate. - */ - q->plug_device_fn = generic_plug_device; - q->head_active = 1; - - blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); -} - -#define blkdev_free_rq(list) list_entry((list)->next, struct request, queue); -/* - * Get a free request. io_request_lock must be held and interrupts - * disabled on the way in. Returns NULL if there are no free requests. - */ -static struct request *get_request(request_queue_t *q, int rw) -{ - struct request *rq = NULL; - struct request_list *rl = &q->rq; - - if (blk_oversized_queue(q)) { - int rlim = q->nr_requests >> 5; - - if (rlim < 4) - rlim = 4; - - /* - * if its a write, or we have more than a handful of reads - * pending, bail out - */ - if ((rw == WRITE) || (rw == READ && rl->pending[READ] > rlim)) - return NULL; - if (blk_oversized_queue_reads(q)) - return NULL; - } - - if (!list_empty(&rl->free)) { - rq = blkdev_free_rq(&rl->free); - list_del(&rq->queue); - rl->count--; - rl->pending[rw]++; - rq->rq_status = RQ_ACTIVE; - rq->cmd = rw; - rq->special = NULL; - rq->q = q; - } - - return rq; -} - -/* - * Here's the request allocation design, low latency version: - * - * 1: Blocking on request exhaustion is a key part of I/O throttling. - * - * 2: We want to be `fair' to all requesters. We must avoid starvation, and - * attempt to ensure that all requesters sleep for a similar duration. Hence - * no stealing requests when there are other processes waiting. - * - * There used to be more here, attempting to allow a process to send in a - * number of requests once it has woken up. But, there's no way to - * tell if a process has just been woken up, or if it is a new process - * coming in to steal requests from the waiters. So, we give up and force - * everyone to wait fairly. - * - * So here's what we do: - * - * a) A READA requester fails if free_requests < batch_requests - * - * We don't want READA requests to prevent sleepers from ever - * waking. Note that READA is used extremely rarely - a few - * filesystems use it for directory readahead. - * - * When a process wants a new request: - * - * b) If free_requests == 0, the requester sleeps in FIFO manner, and - * the queue full condition is set. The full condition is not - * cleared until there are no longer any waiters. Once the full - * condition is set, all new io must wait, hopefully for a very - * short period of time. - * - * When a request is released: - * - * c) If free_requests < batch_requests, do nothing. - * - * d) If free_requests >= batch_requests, wake up a single waiter. - * - * As each waiter gets a request, he wakes another waiter. We do this - * to prevent a race where an unplug might get run before a request makes - * it's way onto the queue. 
The result is a cascade of wakeups, so delaying - * the initial wakeup until we've got batch_requests available helps avoid - * wakeups where there aren't any requests available yet. - */ - -static struct request *__get_request_wait(request_queue_t *q, int rw) -{ - register struct request *rq; - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue_exclusive(&q->wait_for_requests, &wait); - - do { - set_current_state(TASK_UNINTERRUPTIBLE); - spin_lock_irq(&io_request_lock); - if (blk_oversized_queue(q) || q->rq.count == 0) { - __generic_unplug_device(q); - spin_unlock_irq(&io_request_lock); - schedule(); - spin_lock_irq(&io_request_lock); - } - rq = get_request(q, rw); - spin_unlock_irq(&io_request_lock); - } while (rq == NULL); - remove_wait_queue(&q->wait_for_requests, &wait); - current->state = TASK_RUNNING; - - return rq; -} - -static void get_request_wait_wakeup(request_queue_t *q, int rw) -{ - /* - * avoid losing an unplug if a second __get_request_wait did the - * generic_unplug_device while our __get_request_wait was running - * w/o the queue_lock held and w/ our request out of the queue. - */ - if (waitqueue_active(&q->wait_for_requests)) - wake_up(&q->wait_for_requests); -} - -/* RO fail safe mechanism */ - -static long ro_bits[MAX_BLKDEV][8]; - -int is_read_only(kdev_t dev) -{ - int minor,major; - - major = MAJOR(dev); - minor = MINOR(dev); - if (major < 0 || major >= MAX_BLKDEV) return 0; - return ro_bits[major][minor >> 5] & (1 << (minor & 31)); -} - -void set_device_ro(kdev_t dev,int flag) -{ - int minor,major; - - major = MAJOR(dev); - minor = MINOR(dev); - if (major < 0 || major >= MAX_BLKDEV) return; - if (flag) ro_bits[major][minor >> 5] |= 1 << (minor & 31); - else ro_bits[major][minor >> 5] &= ~(1 << (minor & 31)); -} - -inline void drive_stat_acct (kdev_t dev, int rw, - unsigned long nr_sectors, int new_io) -{ - unsigned int major = MAJOR(dev); - unsigned int index; - - index = disk_index(dev); - if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR)) - return; - - kstat.dk_drive[major][index] += new_io; - if (rw == READ) { - kstat.dk_drive_rio[major][index] += new_io; - kstat.dk_drive_rblk[major][index] += nr_sectors; - } else if (rw == WRITE) { - kstat.dk_drive_wio[major][index] += new_io; - kstat.dk_drive_wblk[major][index] += nr_sectors; - } else - printk(KERN_ERR "drive_stat_acct: cmd not R/W?\n"); -} - -#ifdef CONFIG_BLK_STATS -/* - * Return up to two hd_structs on which to do IO accounting for a given - * request. - * - * On a partitioned device, we want to account both against the partition - * and against the whole disk. - */ -static void locate_hd_struct(struct request *req, - struct hd_struct **hd1, - struct hd_struct **hd2) -{ - struct gendisk *gd; - - *hd1 = NULL; - *hd2 = NULL; - - gd = get_gendisk(req->rq_dev); - if (gd && gd->part) { - /* Mask out the partition bits: account for the entire disk */ - int devnr = MINOR(req->rq_dev) >> gd->minor_shift; - int whole_minor = devnr << gd->minor_shift; - - *hd1 = &gd->part[whole_minor]; - if (whole_minor != MINOR(req->rq_dev)) - *hd2= &gd->part[MINOR(req->rq_dev)]; - } -} - -/* - * Round off the performance stats on an hd_struct. - * - * The average IO queue length and utilisation statistics are maintained - * by observing the current state of the queue length and the amount of - * time it has been in this state for. - * Normally, that accounting is done on IO completion, but that can result - * in more than a second's worth of IO being accounted for within any one - * second, leading to >100% utilisation. 
To deal with that, we do a - * round-off before returning the results when reading /proc/partitions, - * accounting immediately for all queue usage up to the current jiffies and - * restarting the counters again. - */ -void disk_round_stats(struct hd_struct *hd) -{ - unsigned long now = jiffies; - - hd->aveq += (hd->ios_in_flight * (jiffies - hd->last_queue_change)); - hd->last_queue_change = now; - - if (hd->ios_in_flight) - hd->io_ticks += (now - hd->last_idle_time); - hd->last_idle_time = now; -} - -static inline void down_ios(struct hd_struct *hd) -{ - disk_round_stats(hd); - --hd->ios_in_flight; -} - -static inline void up_ios(struct hd_struct *hd) -{ - disk_round_stats(hd); - ++hd->ios_in_flight; -} - -static void account_io_start(struct hd_struct *hd, struct request *req, - int merge, int sectors) -{ - switch (req->cmd) { - case READ: - if (merge) - hd->rd_merges++; - hd->rd_sectors += sectors; - break; - case WRITE: - if (merge) - hd->wr_merges++; - hd->wr_sectors += sectors; - break; - } - if (!merge) - up_ios(hd); -} - -static void account_io_end(struct hd_struct *hd, struct request *req) -{ - unsigned long duration = jiffies - req->start_time; - switch (req->cmd) { - case READ: - hd->rd_ticks += duration; - hd->rd_ios++; - break; - case WRITE: - hd->wr_ticks += duration; - hd->wr_ios++; - break; - } - down_ios(hd); -} - -void req_new_io(struct request *req, int merge, int sectors) -{ - struct hd_struct *hd1, *hd2; - - locate_hd_struct(req, &hd1, &hd2); - if (hd1) - account_io_start(hd1, req, merge, sectors); - if (hd2) - account_io_start(hd2, req, merge, sectors); -} - -void req_merged_io(struct request *req) -{ - struct hd_struct *hd1, *hd2; - - locate_hd_struct(req, &hd1, &hd2); - if (hd1) - down_ios(hd1); - if (hd2) - down_ios(hd2); -} - -void req_finished_io(struct request *req) -{ - struct hd_struct *hd1, *hd2; - - locate_hd_struct(req, &hd1, &hd2); - if (hd1) - account_io_end(hd1, req); - if (hd2) - account_io_end(hd2, req); -} -EXPORT_SYMBOL(req_finished_io); -#endif /* CONFIG_BLK_STATS */ - -/* - * add-request adds a request to the linked list. - * io_request_lock is held and interrupts disabled, as we muck with the - * request queue list. - * - * By this point, req->cmd is always either READ/WRITE, never READA, - * which is important for drive_stat_acct() above. - */ -static inline void add_request(request_queue_t * q, struct request * req, - struct list_head *insert_here) -{ - drive_stat_acct(req->rq_dev, req->cmd, req->nr_sectors, 1); - - if (!q->plugged && q->head_active && insert_here == &q->queue_head) { - spin_unlock_irq(&io_request_lock); - BUG(); - } - - /* - * elevator indicated where it wants this request to be - * inserted at elevator_merge time - */ - list_add(&req->queue, insert_here); -} - -/* - * Must be called with io_request_lock held and interrupts disabled - */ -void blkdev_release_request(struct request *req) -{ - request_queue_t *q = req->q; - - req->rq_status = RQ_INACTIVE; - req->q = NULL; - - /* - * Request may not have originated from ll_rw_blk. 
if not, - * assume it has free buffers and check waiters - */ - if (q) { - struct request_list *rl = &q->rq; - int oversized_batch = 0; - - if (q->can_throttle) - oversized_batch = blk_oversized_queue_batch(q); - rl->count++; - /* - * paranoia check - */ - if (req->cmd == READ || req->cmd == WRITE) - rl->pending[req->cmd]--; - if (rl->pending[READ] > q->nr_requests) - printk("blk: reads: %u\n", rl->pending[READ]); - if (rl->pending[WRITE] > q->nr_requests) - printk("blk: writes: %u\n", rl->pending[WRITE]); - if (rl->pending[READ] + rl->pending[WRITE] > q->nr_requests) - printk("blk: r/w: %u + %u > %u\n", rl->pending[READ], rl->pending[WRITE], q->nr_requests); - list_add(&req->queue, &rl->free); - if (rl->count >= q->batch_requests && !oversized_batch) { - smp_mb(); - if (waitqueue_active(&q->wait_for_requests)) - wake_up(&q->wait_for_requests); - } - } -} - -/* - * Has to be called with the request spinlock acquired - */ -static void attempt_merge(request_queue_t * q, - struct request *req, - int max_sectors, - int max_segments) -{ - struct request *next; - - next = blkdev_next_request(req); - if (req->sector + req->nr_sectors != next->sector) - return; - if (req->cmd != next->cmd - || req->rq_dev != next->rq_dev - || req->nr_sectors + next->nr_sectors > max_sectors - || next->waiting) - return; - /* - * If we are not allowed to merge these requests, then - * return. If we are allowed to merge, then the count - * will have been updated to the appropriate number, - * and we shouldn't do it here too. - */ - if (!q->merge_requests_fn(q, req, next, max_segments)) - return; - - q->elevator.elevator_merge_req_fn(req, next); - - /* At this point we have either done a back merge - * or front merge. We need the smaller start_time of - * the merged requests to be the current request - * for accounting purposes. - */ - if (time_after(req->start_time, next->start_time)) - req->start_time = next->start_time; - - req->bhtail->b_reqnext = next->bh; - req->bhtail = next->bhtail; - req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors; - list_del(&next->queue); - - /* One last thing: we have removed a request, so we now have one - less expected IO to complete for accounting purposes. 
*/ - req_merged_io(req); - - blkdev_release_request(next); -} - -static inline void attempt_back_merge(request_queue_t * q, - struct request *req, - int max_sectors, - int max_segments) -{ - if (&req->queue == q->queue_head.prev) - return; - attempt_merge(q, req, max_sectors, max_segments); -} - -static inline void attempt_front_merge(request_queue_t * q, - struct list_head * head, - struct request *req, - int max_sectors, - int max_segments) -{ - struct list_head * prev; - - prev = req->queue.prev; - if (head == prev) - return; - attempt_merge(q, blkdev_entry_to_request(prev), max_sectors, max_segments); -} - -static int __make_request(request_queue_t * q, int rw, - struct buffer_head * bh) -{ - unsigned int sector, count, sync; - int max_segments = MAX_SEGMENTS; - struct request * req, *freereq = NULL; - int rw_ahead, max_sectors, el_ret; - struct list_head *head, *insert_here; - int latency; - elevator_t *elevator = &q->elevator; - int should_wake = 0; - - count = bh->b_size >> 9; - sector = bh->b_rsector; - sync = test_and_clear_bit(BH_Sync, &bh->b_state); - - rw_ahead = 0; /* normal case; gets changed below for READA */ - switch (rw) { - case READA: -#if 0 /* bread() misinterprets failed READA attempts as IO errors on SMP */ - rw_ahead = 1; -#endif - rw = READ; /* drop into READ */ - case READ: - case WRITE: - latency = elevator_request_latency(elevator, rw); - break; - default: - BUG(); - goto end_io; - } - - /* We'd better have a real physical mapping! - Check this bit only if the buffer was dirty and just locked - down by us so at this point flushpage will block and - won't clear the mapped bit under us. */ - if (!buffer_mapped(bh)) - BUG(); - - /* - * Temporary solution - in 2.5 this will be done by the lowlevel - * driver. Create a bounce buffer if the buffer data points into - * high memory - keep the original buffer otherwise. - */ - bh = blk_queue_bounce(q, rw, bh); - -/* look for a free request. 
*/ - /* - * Try to coalesce the new request with old requests - */ - max_sectors = get_max_sectors(bh->b_rdev); - - req = NULL; - head = &q->queue_head; - /* - * Now we acquire the request spinlock, we have to be mega careful - * not to schedule or do something nonatomic - */ - spin_lock_irq(&io_request_lock); - -again: - insert_here = head->prev; - - if (list_empty(head)) { - q->plug_device_fn(q, bh->b_rdev); /* is atomic */ - goto get_rq; - } else if (q->head_active && !q->plugged) - head = head->next; - - el_ret = elevator->elevator_merge_fn(q, &req, head, bh, rw,max_sectors); - switch (el_ret) { - - case ELEVATOR_BACK_MERGE: - if (!q->back_merge_fn(q, req, bh, max_segments)) { - insert_here = &req->queue; - break; - } - req->bhtail->b_reqnext = bh; - req->bhtail = bh; - req->nr_sectors = req->hard_nr_sectors += count; - blk_started_io(count); - blk_started_sectors(req, count); - drive_stat_acct(req->rq_dev, req->cmd, count, 0); - req_new_io(req, 1, count); - attempt_back_merge(q, req, max_sectors, max_segments); - goto out; - - case ELEVATOR_FRONT_MERGE: - if (!q->front_merge_fn(q, req, bh, max_segments)) { - insert_here = req->queue.prev; - break; - } - bh->b_reqnext = req->bh; - req->bh = bh; - /* - * may not be valid, but queues not having bounce - * enabled for highmem pages must not look at - * ->buffer anyway - */ - req->buffer = bh->b_data; - req->current_nr_sectors = req->hard_cur_sectors = count; - req->sector = req->hard_sector = sector; - req->nr_sectors = req->hard_nr_sectors += count; - blk_started_io(count); - blk_started_sectors(req, count); - drive_stat_acct(req->rq_dev, req->cmd, count, 0); - req_new_io(req, 1, count); - attempt_front_merge(q, head, req, max_sectors, max_segments); - goto out; - - /* - * elevator says don't/can't merge. get new request - */ - case ELEVATOR_NO_MERGE: - /* - * use elevator hints as to where to insert the - * request. if no hints, just add it to the back - * of the queue - */ - if (req) - insert_here = &req->queue; - break; - - default: - printk("elevator returned crap (%d)\n", el_ret); - BUG(); - } - -get_rq: - if (freereq) { - req = freereq; - freereq = NULL; - } else { - /* - * See description above __get_request_wait() - */ - if (rw_ahead) { - if (q->rq.count < q->batch_requests || blk_oversized_queue_batch(q)) { - spin_unlock_irq(&io_request_lock); - goto end_io; - } - req = get_request(q, rw); - if (req == NULL) - BUG(); - } else { - req = get_request(q, rw); - if (req == NULL) { - spin_unlock_irq(&io_request_lock); - freereq = __get_request_wait(q, rw); - head = &q->queue_head; - spin_lock_irq(&io_request_lock); - should_wake = 1; - goto again; - } - } - } - -/* fill up the request-info, and add it to the queue */ - req->elevator_sequence = latency; - req->cmd = rw; - req->errors = 0; - req->hard_sector = req->sector = sector; - req->hard_nr_sectors = req->nr_sectors = count; - req->current_nr_sectors = req->hard_cur_sectors = count; - req->nr_segments = 1; /* Always 1 for a new request. */ - req->nr_hw_segments = 1; /* Always 1 for a new request. 
*/ - req->buffer = bh->b_data; - req->waiting = NULL; - req->bh = bh; - req->bhtail = bh; - req->rq_dev = bh->b_rdev; - req->start_time = jiffies; - req_new_io(req, 0, count); - blk_started_io(count); - blk_started_sectors(req, count); - add_request(q, req, insert_here); -out: - if (freereq) - blkdev_release_request(freereq); - if (should_wake) - get_request_wait_wakeup(q, rw); - if (sync) - __generic_unplug_device(q); - spin_unlock_irq(&io_request_lock); - return 0; -end_io: - bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state)); - return 0; -} - -/** - * generic_make_request: hand a buffer head to it's device driver for I/O - * @rw: READ, WRITE, or READA - what sort of I/O is desired. - * @bh: The buffer head describing the location in memory and on the device. - * - * generic_make_request() is used to make I/O requests of block - * devices. It is passed a &struct buffer_head and a &rw value. The - * %READ and %WRITE options are (hopefully) obvious in meaning. The - * %READA value means that a read is required, but that the driver is - * free to fail the request if, for example, it cannot get needed - * resources immediately. - * - * generic_make_request() does not return any status. The - * success/failure status of the request, along with notification of - * completion, is delivered asynchronously through the bh->b_end_io - * function described (one day) else where. - * - * The caller of generic_make_request must make sure that b_page, - * b_addr, b_size are set to describe the memory buffer, that b_rdev - * and b_rsector are set to describe the device address, and the - * b_end_io and optionally b_private are set to describe how - * completion notification should be signaled. BH_Mapped should also - * be set (to confirm that b_dev and b_blocknr are valid). - * - * generic_make_request and the drivers it calls may use b_reqnext, - * and may change b_rdev and b_rsector. So the values of these fields - * should NOT be depended on after the call to generic_make_request. - * Because of this, the caller should record the device address - * information in b_dev and b_blocknr. - * - * Apart from those fields mentioned above, no other fields, and in - * particular, no other flags, are changed by generic_make_request or - * any lower level drivers. - * */ -void generic_make_request (int rw, struct buffer_head * bh) -{ - int major = MAJOR(bh->b_rdev); - int minorsize = 0; - request_queue_t *q; - - if (!bh->b_end_io) - BUG(); - - /* Test device size, when known. */ - if (blk_size[major]) - minorsize = blk_size[major][MINOR(bh->b_rdev)]; - if (minorsize) { - unsigned long maxsector = (minorsize << 1) + 1; - unsigned long sector = bh->b_rsector; - unsigned int count = bh->b_size >> 9; - - if (maxsector < count || maxsector - count < sector) { - /* Yecch */ - bh->b_state &= ~(1 << BH_Dirty); - - /* This may well happen - the kernel calls bread() - without checking the size of the device, e.g., - when mounting a device. */ - printk(KERN_INFO - "attempt to access beyond end of device\n"); - printk(KERN_INFO "%s: rw=%d, want=%ld, limit=%d\n", - kdevname(bh->b_rdev), rw, - (sector + count)>>1, minorsize); - - bh->b_end_io(bh, 0); - return; - } - } - - /* - * Resolve the mapping until finished. (drivers are - * still free to implement/resolve their own stacking - * by explicitly returning 0) - */ - /* NOTE: we don't repeat the blk_size check for each new device. - * Stacking drivers are expected to know what they are doing. 
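
In the stacking arrangement described above, a virtual device's
make_request function remaps the buffer onto the underlying device and
returns nonzero, so the loop below re-resolves the queue for the new
b_rdev; returning zero tells the loop the request has been fully
handled. A sketch of the remapping side, with invented helper names
(real_device, sector_offset) standing in for the driver's own lookup:

    /* installed with blk_queue_make_request() at init time */
    static int hypo_remap_request(request_queue_t *q, int rw,
                                  struct buffer_head *bh)
    {
        bh->b_rdev = real_device(bh->b_rdev);       /* hypothetical */
        bh->b_rsector += sector_offset(bh->b_rdev); /* hypothetical */
        return 1;   /* nonzero: resolve the new device's queue too */
    }
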
- */ - do { - q = __blk_get_queue(bh->b_rdev); - if (!q) { - printk(KERN_ERR - "generic_make_request: Trying to access " - "nonexistent block-device %s (%ld)\n", - kdevname(bh->b_rdev), bh->b_rsector); - buffer_IO_error(bh); - break; - } - } while (q->make_request_fn(q, rw, bh)); -} - - -/** - * submit_bh: submit a buffer_head to the block device later for I/O - * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) - * @bh: The &struct buffer_head which describes the I/O - * - * submit_bh() is very similar in purpose to generic_make_request(), and - * uses that function to do most of the work. - * - * The extra functionality provided by submit_bh is to determine - * b_rsector from b_blocknr and b_size, and to set b_rdev from b_dev. - * This is is appropriate for IO requests that come from the buffer - * cache and page cache which (currently) always use aligned blocks. - */ -void submit_bh(int rw, struct buffer_head * bh) -{ - int count = bh->b_size >> 9; - - if (!test_bit(BH_Lock, &bh->b_state)) - BUG(); - - set_bit(BH_Req, &bh->b_state); - set_bit(BH_Launder, &bh->b_state); - - /* - * First step, 'identity mapping' - RAID or LVM might - * further remap this. - */ - bh->b_rdev = bh->b_dev; - bh->b_rsector = bh->b_blocknr * count; - - get_bh(bh); - generic_make_request(rw, bh); - - /* fix race condition with wait_on_buffer() */ - smp_mb(); /* spin_unlock may have inclusive semantics */ - if (waitqueue_active(&bh->b_wait)) - wake_up(&bh->b_wait); - - if (block_dump) - printk(KERN_DEBUG "%s: %s block %lu/%u on %s\n", current->comm, rw == WRITE ? "WRITE" : "READ", bh->b_rsector, count, kdevname(bh->b_rdev)); - - put_bh(bh); - switch (rw) { - case WRITE: - kstat.pgpgout += count; - break; - default: - kstat.pgpgin += count; - break; - } -} - -/** - * ll_rw_block: low-level access to block devices - * @rw: whether to %READ or %WRITE or maybe %READA (readahead) - * @nr: number of &struct buffer_heads in the array - * @bhs: array of pointers to &struct buffer_head - * - * ll_rw_block() takes an array of pointers to &struct buffer_heads, - * and requests an I/O operation on them, either a %READ or a %WRITE. - * The third %READA option is described in the documentation for - * generic_make_request() which ll_rw_block() calls. - * - * This function provides extra functionality that is not in - * generic_make_request() that is relevant to buffers in the buffer - * cache or page cache. In particular it drops any buffer that it - * cannot get a lock on (with the BH_Lock state bit), any buffer that - * appears to be clean when doing a write request, and any buffer that - * appears to be up-to-date when doing read request. Further it marks - * as clean buffers that are processed for writing (the buffer cache - * wont assume that they are actually clean until the buffer gets - * unlocked). - * - * ll_rw_block sets b_end_io to simple completion handler that marks - * the buffer up-to-date (if approriate), unlocks the buffer and wakes - * any waiters. As client that needs a more interesting completion - * routine should call submit_bh() (or generic_make_request()) - * directly. - * - * Caveat: - * All of the buffers must be for the same device, and must also be - * of the current approved size for the device. */ - -void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]) -{ - unsigned int major; - int correct_size; - int i; - - if (!nr) - return; - - major = MAJOR(bhs[0]->b_dev); - - /* Determine correct block size for this device. 
*/ - correct_size = get_hardsect_size(bhs[0]->b_dev); - - /* Verify requested block sizes. */ - for (i = 0; i < nr; i++) { - struct buffer_head *bh = bhs[i]; - if (bh->b_size % correct_size) { - printk(KERN_NOTICE "ll_rw_block: device %s: " - "only %d-char blocks implemented (%u)\n", - kdevname(bhs[0]->b_dev), - correct_size, bh->b_size); - goto sorry; - } - } - - if ((rw & WRITE) && is_read_only(bhs[0]->b_dev)) { - printk(KERN_NOTICE "Can't write to read-only device %s\n", - kdevname(bhs[0]->b_dev)); - goto sorry; - } - - for (i = 0; i < nr; i++) { - struct buffer_head *bh = bhs[i]; - - lock_buffer(bh); - - /* We have the buffer lock */ - atomic_inc(&bh->b_count); - bh->b_end_io = end_buffer_io_sync; - - switch(rw) { - case WRITE: - if (!atomic_set_buffer_clean(bh)) - /* Hmmph! Nothing to write */ - goto end_io; - __mark_buffer_clean(bh); - break; - - case READA: - case READ: - if (buffer_uptodate(bh)) - /* Hmmph! Already have it */ - goto end_io; - break; - default: - BUG(); - end_io: - bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state)); - continue; - } - - submit_bh(rw, bh); - } - return; - -sorry: - /* Make sure we don't get infinite dirty retries.. */ - for (i = 0; i < nr; i++) - mark_buffer_clean(bhs[i]); -} - -#ifdef CONFIG_STRAM_SWAP -extern int stram_device_init (void); -#endif - -static void blk_writeback_timer(unsigned long data) -{ - wakeup_bdflush(); - wakeup_kupdate(); -} - -/** - * end_that_request_first - end I/O on one buffer. - * @req: the request being processed - * @uptodate: 0 for I/O error - * @name: the name printed for an I/O error - * - * Description: - * Ends I/O on the first buffer attached to @req, and sets it up - * for the next buffer_head (if any) in the cluster. - * - * Return: - * 0 - we are done with this request, call end_that_request_last() - * 1 - still buffers pending for this request - * - * Caveat: - * Drivers implementing their own end_request handling must call - * blk_finished_io() appropriately. 
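
Put together, the contract above gives the canonical 2.4 completion
sequence: keep calling end_that_request_first() while it returns 1,
then dequeue the request and call end_that_request_last(). A minimal
sketch, assuming the caller already holds io_request_lock as this file
requires, and using the stock 2.4 helper blkdev_dequeue_request(); the
"hypo" name is invented:

    static void hypo_end_request(struct request *req, int uptodate)
    {
        /* complete every buffer attached to the request */
        while (end_that_request_first(req, uptodate, "hypo"))
            ;
        blkdev_dequeue_request(req);   /* take it off the queue */
        end_that_request_last(req);    /* release it, wake any waiter */
    }
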
- **/ - -int end_that_request_first (struct request *req, int uptodate, char *name) -{ - struct buffer_head * bh; - int nsect; - - req->errors = 0; - if (!uptodate) - printk("end_request: I/O error, dev %s (%s), sector %lu\n", - kdevname(req->rq_dev), name, req->sector); - - if ((bh = req->bh) != NULL) { - nsect = bh->b_size >> 9; - blk_finished_io(nsect); - blk_finished_sectors(req, nsect); - req->bh = bh->b_reqnext; - bh->b_reqnext = NULL; - bh->b_end_io(bh, uptodate); - if ((bh = req->bh) != NULL) { - req->hard_sector += nsect; - req->hard_nr_sectors -= nsect; - req->sector = req->hard_sector; - req->nr_sectors = req->hard_nr_sectors; - - req->current_nr_sectors = bh->b_size >> 9; - req->hard_cur_sectors = req->current_nr_sectors; - if (req->nr_sectors < req->current_nr_sectors) { - req->nr_sectors = req->current_nr_sectors; - printk("end_request: buffer-list destroyed\n"); - } - req->buffer = bh->b_data; - return 1; - } - } - return 0; -} - -extern int laptop_mode; - -void end_that_request_last(struct request *req) -{ - struct completion *waiting = req->waiting; - - /* - * schedule the writeout of pending dirty data when the disk is idle - */ - if (laptop_mode && req->cmd == READ) - mod_timer(&writeback_timer, jiffies + 5 * HZ); - - req_finished_io(req); - blkdev_release_request(req); - if (waiting) - complete(waiting); -} - -int __init blk_dev_init(void) -{ - struct blk_dev_struct *dev; - - request_cachep = kmem_cache_create("blkdev_requests", - sizeof(struct request), - 0, SLAB_HWCACHE_ALIGN, NULL, NULL); - - if (!request_cachep) - panic("Can't create request pool slab cache\n"); - - for (dev = blk_dev + MAX_BLKDEV; dev-- != blk_dev;) - dev->queue = NULL; - - memset(ro_bits,0,sizeof(ro_bits)); - memset(max_readahead, 0, sizeof(max_readahead)); - memset(max_sectors, 0, sizeof(max_sectors)); - - blk_max_low_pfn = max_low_pfn - 1; - blk_max_pfn = max_pfn - 1; - - init_timer(&writeback_timer); - writeback_timer.function = blk_writeback_timer; - -#ifdef CONFIG_AMIGA_Z2RAM - z2_init(); -#endif -#ifdef CONFIG_STRAM_SWAP - stram_device_init(); -#endif -#ifdef CONFIG_ISP16_CDI - isp16_init(); -#endif -#ifdef CONFIG_BLK_DEV_PS2 - ps2esdi_init(); -#endif -#ifdef CONFIG_BLK_DEV_XD - xd_init(); -#endif -#ifdef CONFIG_BLK_DEV_MFM - mfm_init(); -#endif -#ifdef CONFIG_PARIDE - { extern void paride_init(void); paride_init(); }; -#endif -#ifdef CONFIG_MAC_FLOPPY - swim3_init(); -#endif -#ifdef CONFIG_BLK_DEV_SWIM_IOP - swimiop_init(); -#endif -#ifdef CONFIG_AMIGA_FLOPPY - amiga_floppy_init(); -#endif -#ifdef CONFIG_ATARI_FLOPPY - atari_floppy_init(); -#endif -#ifdef CONFIG_BLK_DEV_FD - floppy_init(); -#else -#if defined(__i386__) && !defined(CONFIG_XEN) /* Do we even need this? 
*/ - outb_p(0xc, 0x3f2); -#endif -#endif -#ifdef CONFIG_CDU31A - cdu31a_init(); -#endif -#ifdef CONFIG_ATARI_ACSI - acsi_init(); -#endif -#ifdef CONFIG_MCD - mcd_init(); -#endif -#ifdef CONFIG_MCDX - mcdx_init(); -#endif -#ifdef CONFIG_SBPCD - sbpcd_init(); -#endif -#ifdef CONFIG_AZTCD - aztcd_init(); -#endif -#ifdef CONFIG_CDU535 - sony535_init(); -#endif -#ifdef CONFIG_GSCD - gscd_init(); -#endif -#ifdef CONFIG_CM206 - cm206_init(); -#endif -#ifdef CONFIG_OPTCD - optcd_init(); -#endif -#ifdef CONFIG_SJCD - sjcd_init(); -#endif -#ifdef CONFIG_APBLOCK - ap_init(); -#endif -#ifdef CONFIG_DDV - ddv_init(); -#endif -#ifdef CONFIG_MDISK - mdisk_init(); -#endif -#ifdef CONFIG_DASD - dasd_init(); -#endif -#if defined(CONFIG_S390_TAPE) && defined(CONFIG_S390_TAPE_BLOCK) - tapeblock_init(); -#endif -#ifdef CONFIG_BLK_DEV_XPRAM - xpram_init(); -#endif - -#ifdef CONFIG_SUN_JSFLASH - jsfd_init(); -#endif - -#if defined(CONFIG_XEN_BLKDEV_FRONTEND) - xlblk_init(); -#endif - - return 0; -}; - -EXPORT_SYMBOL(io_request_lock); -EXPORT_SYMBOL(end_that_request_first); -EXPORT_SYMBOL(end_that_request_last); -EXPORT_SYMBOL(blk_grow_request_list); -EXPORT_SYMBOL(blk_init_queue); -EXPORT_SYMBOL(blk_get_queue); -EXPORT_SYMBOL(blk_cleanup_queue); -EXPORT_SYMBOL(blk_queue_headactive); -EXPORT_SYMBOL(blk_queue_throttle_sectors); -EXPORT_SYMBOL(blk_queue_make_request); -EXPORT_SYMBOL(generic_make_request); -EXPORT_SYMBOL(blkdev_release_request); -EXPORT_SYMBOL(generic_unplug_device); -EXPORT_SYMBOL(blk_queue_bounce_limit); -EXPORT_SYMBOL(blk_max_low_pfn); -EXPORT_SYMBOL(blk_max_pfn); -EXPORT_SYMBOL(blk_seg_merge_ok); -EXPORT_SYMBOL(blk_nohighio); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/drivers/char/Makefile --- a/linux-2.4.30-xen-sparse/drivers/char/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,361 +0,0 @@ -# -# Makefile for the kernel character device drivers. -# -# Note! Dependencies are done automagically by 'make dep', which also -# removes any old dependencies. DON'T put your own dependencies here -# unless it's something special (ie not a .c file). -# -# Note 2! The CFLAGS definitions are now inherited from the -# parent makes.. -# - -# -# This file contains the font map for the default (hardware) font -# -FONTMAPFILE = cp437.uni - -O_TARGET := char.o - -obj-y += mem.o tty_io.o n_tty.o tty_ioctl.o raw.o pty.o misc.o random.o - -# All of the (potential) objects that export symbols. -# This list comes from 'grep -l EXPORT_SYMBOL *.[hc]'. 
- -export-objs := busmouse.o console.o keyboard.o sysrq.o \ - misc.o pty.o random.o selection.o serial.o \ - sonypi.o tty_io.o tty_ioctl.o generic_serial.o \ - au1000_gpio.o vac-serial.o hp_psaux.o nvram.o \ - scx200.o fetchop.o - -mod-subdirs := joystick ftape drm drm-4.0 pcmcia - -list-multi := - -KEYMAP =defkeymap.o -KEYBD =pc_keyb.o -CONSOLE =console.o -SERIAL =serial.o - -ifeq ($(ARCH),xen) - ifneq ($(CONFIG_XEN_PHYSDEV_ACCESS),y) - KEYBD = - endif -endif - -ifeq ($(ARCH),s390) - KEYMAP = - KEYBD = - CONSOLE = - SERIAL = -endif - -ifeq ($(ARCH),mips) - ifneq ($(CONFIG_PC_KEYB),y) - KEYBD = - endif - ifeq ($(CONFIG_VR41XX_KIU),y) - KEYMAP = - KEYBD = vr41xx_keyb.o - endif -endif - -ifeq ($(ARCH),s390x) - KEYMAP = - KEYBD = - CONSOLE = - SERIAL = -endif - -ifeq ($(ARCH),m68k) - ifdef CONFIG_AMIGA - KEYBD = amikeyb.o - else - ifndef CONFIG_MAC - KEYBD = - endif - endif - SERIAL = -endif - -ifeq ($(ARCH),parisc) - ifdef CONFIG_GSC_PS2 - KEYBD = hp_psaux.o hp_keyb.o - else - KEYBD = - endif - ifdef CONFIG_SERIAL_MUX - CONSOLE += mux.o - endif - ifdef CONFIG_PDC_CONSOLE - CONSOLE += pdc_console.o - endif -endif - -ifdef CONFIG_Q40 - KEYBD += q40_keyb.o - SERIAL = serial.o -endif - -ifdef CONFIG_APOLLO - KEYBD += dn_keyb.o -endif - -ifeq ($(ARCH),parisc) - ifdef CONFIG_GSC_PS2 - KEYBD = hp_psaux.o hp_keyb.o - else - KEYBD = - endif - ifdef CONFIG_PDC_CONSOLE - CONSOLE += pdc_console.o - endif -endif - -ifeq ($(ARCH),arm) - ifneq ($(CONFIG_PC_KEYMAP),y) - KEYMAP = - endif - ifneq ($(CONFIG_PC_KEYB),y) - KEYBD = - endif -endif - -ifeq ($(ARCH),sh) - KEYMAP = - KEYBD = - CONSOLE = - ifeq ($(CONFIG_SH_HP600),y) - KEYMAP = defkeymap.o - KEYBD = scan_keyb.o hp600_keyb.o - CONSOLE = console.o - endif - ifeq ($(CONFIG_SH_DMIDA),y) - # DMIDA does not connect the HD64465 PS/2 keyboard port - # but we allow for USB keyboards to be plugged in. 
- KEYMAP = defkeymap.o - KEYBD = # hd64465_keyb.o pc_keyb.o - CONSOLE = console.o - endif - ifeq ($(CONFIG_SH_EC3104),y) - KEYMAP = defkeymap.o - KEYBD = ec3104_keyb.o - CONSOLE = console.o - endif - ifeq ($(CONFIG_SH_DREAMCAST),y) - KEYMAP = defkeymap.o - KEYBD = - CONSOLE = console.o - endif -endif - -ifeq ($(CONFIG_DECSTATION),y) - KEYMAP = - KEYBD = -endif - -ifeq ($(CONFIG_BAGET_MIPS),y) - KEYBD = - SERIAL = vac-serial.o -endif - -ifeq ($(CONFIG_NINO),y) - SERIAL = -endif - -ifneq ($(CONFIG_SUN_SERIAL),) - SERIAL = -endif - -ifeq ($(CONFIG_QTRONIX_KEYBOARD),y) - KEYBD = qtronix.o - KEYMAP = qtronixmap.o -endif - -ifeq ($(CONFIG_DUMMY_KEYB),y) - KEYBD = dummy_keyb.o -endif - -obj-$(CONFIG_VT) += vt.o vc_screen.o consolemap.o consolemap_deftbl.o $(CONSOLE) selection.o -obj-$(CONFIG_SERIAL) += $(SERIAL) -obj-$(CONFIG_PARPORT_SERIAL) += parport_serial.o -obj-$(CONFIG_SERIAL_HCDP) += hcdp_serial.o -obj-$(CONFIG_SERIAL_21285) += serial_21285.o -obj-$(CONFIG_SERIAL_SA1100) += serial_sa1100.o -obj-$(CONFIG_SERIAL_AMBA) += serial_amba.o -obj-$(CONFIG_TS_AU1X00_ADS7846) += au1000_ts.o -obj-$(CONFIG_SERIAL_DEC) += decserial.o - -ifndef CONFIG_SUN_KEYBOARD - obj-$(CONFIG_VT) += keyboard.o $(KEYMAP) $(KEYBD) -else - obj-$(CONFIG_PCI) += keyboard.o $(KEYMAP) -endif - -obj-$(CONFIG_HIL) += hp_keyb.o -obj-$(CONFIG_MAGIC_SYSRQ) += sysrq.o -obj-$(CONFIG_ATARI_DSP56K) += dsp56k.o -obj-$(CONFIG_ROCKETPORT) += rocket.o -obj-$(CONFIG_MOXA_SMARTIO) += mxser.o -obj-$(CONFIG_MOXA_INTELLIO) += moxa.o -obj-$(CONFIG_DIGI) += pcxx.o -obj-$(CONFIG_DIGIEPCA) += epca.o -obj-$(CONFIG_CYCLADES) += cyclades.o -obj-$(CONFIG_STALLION) += stallion.o -obj-$(CONFIG_ISTALLION) += istallion.o -obj-$(CONFIG_SIBYTE_SB1250_DUART) += sb1250_duart.o -obj-$(CONFIG_COMPUTONE) += ip2.o ip2main.o -obj-$(CONFIG_RISCOM8) += riscom8.o -obj-$(CONFIG_ISI) += isicom.o -obj-$(CONFIG_ESPSERIAL) += esp.o -obj-$(CONFIG_SYNCLINK) += synclink.o -obj-$(CONFIG_SYNCLINKMP) += synclinkmp.o -obj-$(CONFIG_N_HDLC) += n_hdlc.o -obj-$(CONFIG_SPECIALIX) += specialix.o -obj-$(CONFIG_AMIGA_BUILTIN_SERIAL) += amiserial.o -obj-$(CONFIG_A2232) += ser_a2232.o generic_serial.o -obj-$(CONFIG_SX) += sx.o generic_serial.o -obj-$(CONFIG_RIO) += rio/rio.o generic_serial.o -obj-$(CONFIG_SH_SCI) += sh-sci.o generic_serial.o -obj-$(CONFIG_SERIAL167) += serial167.o -obj-$(CONFIG_MVME147_SCC) += generic_serial.o vme_scc.o -obj-$(CONFIG_MVME162_SCC) += generic_serial.o vme_scc.o -obj-$(CONFIG_BVME6000_SCC) += generic_serial.o vme_scc.o -obj-$(CONFIG_HVC_CONSOLE) += hvc_console.o -obj-$(CONFIG_SERIAL_TX3912) += generic_serial.o serial_tx3912.o -obj-$(CONFIG_TXX927_SERIAL) += serial_txx927.o -obj-$(CONFIG_SERIAL_TXX9) += generic_serial.o serial_txx9.o -obj-$(CONFIG_IP22_SERIAL) += sgiserial.o -obj-$(CONFIG_AU1X00_UART) += au1x00-serial.o -obj-$(CONFIG_SGI_L1_SERIAL) += sn_serial.o - -subdir-$(CONFIG_RIO) += rio -subdir-$(CONFIG_INPUT) += joystick - -obj-$(CONFIG_ATIXL_BUSMOUSE) += atixlmouse.o -obj-$(CONFIG_LOGIBUSMOUSE) += logibusmouse.o -obj-$(CONFIG_PRINTER) += lp.o -obj-$(CONFIG_TIPAR) += tipar.o -obj-$(CONFIG_OBMOUSE) += obmouse.o - -ifeq ($(CONFIG_INPUT),y) -obj-y += joystick/js.o -endif - -obj-$(CONFIG_FETCHOP) += fetchop.o -obj-$(CONFIG_BUSMOUSE) += busmouse.o -obj-$(CONFIG_DTLK) += dtlk.o -obj-$(CONFIG_R3964) += n_r3964.o -obj-$(CONFIG_APPLICOM) += applicom.o -obj-$(CONFIG_SONYPI) += sonypi.o -obj-$(CONFIG_MS_BUSMOUSE) += msbusmouse.o -obj-$(CONFIG_82C710_MOUSE) += qpmouse.o -obj-$(CONFIG_AMIGAMOUSE) += amigamouse.o -obj-$(CONFIG_ATARIMOUSE) += atarimouse.o 
-obj-$(CONFIG_ADBMOUSE) += adbmouse.o -obj-$(CONFIG_PC110_PAD) += pc110pad.o -obj-$(CONFIG_MK712_MOUSE) += mk712.o -obj-$(CONFIG_RTC) += rtc.o -obj-$(CONFIG_GEN_RTC) += genrtc.o -obj-$(CONFIG_EFI_RTC) += efirtc.o -obj-$(CONFIG_SGI_DS1286) += ds1286.o -obj-$(CONFIG_MIPS_RTC) += mips_rtc.o -obj-$(CONFIG_SGI_IP27_RTC) += ip27-rtc.o -ifeq ($(CONFIG_PPC),) - obj-$(CONFIG_NVRAM) += nvram.o -endif -obj-$(CONFIG_TOSHIBA) += toshiba.o -obj-$(CONFIG_I8K) += i8k.o -obj-$(CONFIG_DS1620) += ds1620.o -obj-$(CONFIG_DS1742) += ds1742.o -obj-$(CONFIG_INTEL_RNG) += i810_rng.o -obj-$(CONFIG_AMD_RNG) += amd768_rng.o -obj-$(CONFIG_HW_RANDOM) += hw_random.o -obj-$(CONFIG_AMD_PM768) += amd76x_pm.o -obj-$(CONFIG_BRIQ_PANEL) += briq_panel.o - -obj-$(CONFIG_ITE_GPIO) += ite_gpio.o -obj-$(CONFIG_AU1X00_GPIO) += au1000_gpio.o -obj-$(CONFIG_AU1X00_USB_TTY) += au1000_usbtty.o -obj-$(CONFIG_AU1X00_USB_RAW) += au1000_usbraw.o -obj-$(CONFIG_COBALT_LCD) += lcd.o - -obj-$(CONFIG_QIC02_TAPE) += tpqic02.o - -subdir-$(CONFIG_FTAPE) += ftape -subdir-$(CONFIG_DRM_OLD) += drm-4.0 -subdir-$(CONFIG_DRM_NEW) += drm -subdir-$(CONFIG_PCMCIA) += pcmcia -subdir-$(CONFIG_AGP) += agp - -ifeq ($(CONFIG_FTAPE),y) -obj-y += ftape/ftape.o -endif - -obj-$(CONFIG_H8) += h8.o -obj-$(CONFIG_PPDEV) += ppdev.o -obj-$(CONFIG_DZ) += dz.o -obj-$(CONFIG_NWBUTTON) += nwbutton.o -obj-$(CONFIG_NWFLASH) += nwflash.o -obj-$(CONFIG_SCx200) += scx200.o -obj-$(CONFIG_SCx200_GPIO) += scx200_gpio.o - -# Only one watchdog can succeed. We probe the hardware watchdog -# drivers first, then the softdog driver. This means if your hardware -# watchdog dies or is 'borrowed' for some reason the software watchdog -# still gives you some cover. - -obj-$(CONFIG_PCWATCHDOG) += pcwd.o -obj-$(CONFIG_ACQUIRE_WDT) += acquirewdt.o -obj-$(CONFIG_ADVANTECH_WDT) += advantechwdt.o -obj-$(CONFIG_IB700_WDT) += ib700wdt.o -obj-$(CONFIG_MIXCOMWD) += mixcomwd.o -obj-$(CONFIG_60XX_WDT) += sbc60xxwdt.o -obj-$(CONFIG_W83877F_WDT) += w83877f_wdt.o -obj-$(CONFIG_SC520_WDT) += sc520_wdt.o -obj-$(CONFIG_WDT) += wdt.o -obj-$(CONFIG_WDTPCI) += wdt_pci.o -obj-$(CONFIG_21285_WATCHDOG) += wdt285.o -obj-$(CONFIG_977_WATCHDOG) += wdt977.o -obj-$(CONFIG_I810_TCO) += i810-tco.o -obj-$(CONFIG_MACHZ_WDT) += machzwd.o -obj-$(CONFIG_SH_WDT) += shwdt.o -obj-$(CONFIG_EUROTECH_WDT) += eurotechwdt.o -obj-$(CONFIG_ALIM7101_WDT) += alim7101_wdt.o -obj-$(CONFIG_ALIM1535_WDT) += alim1535d_wdt.o -obj-$(CONFIG_INDYDOG) += indydog.o -obj-$(CONFIG_SC1200_WDT) += sc1200wdt.o -obj-$(CONFIG_SCx200_WDT) += scx200_wdt.o -obj-$(CONFIG_WAFER_WDT) += wafer5823wdt.o -obj-$(CONFIG_SOFT_WATCHDOG) += softdog.o -obj-$(CONFIG_INDYDOG) += indydog.o -obj-$(CONFIG_8xx_WDT) += mpc8xx_wdt.o - -subdir-$(CONFIG_MWAVE) += mwave -ifeq ($(CONFIG_MWAVE),y) - obj-y += mwave/mwave.o -endif - -subdir-$(CONFIG_IPMI_HANDLER) += ipmi -ifeq ($(CONFIG_IPMI_HANDLER),y) - obj-y += ipmi/ipmi.o -endif - -include $(TOPDIR)/Rules.make - -fastdep: - -conmakehash: conmakehash.c - $(HOSTCC) $(HOSTCFLAGS) -o conmakehash conmakehash.c - -consolemap_deftbl.c: $(FONTMAPFILE) conmakehash - ./conmakehash $(FONTMAPFILE) > consolemap_deftbl.c - -consolemap_deftbl.o: consolemap_deftbl.c $(TOPDIR)/include/linux/types.h - -.DELETE_ON_ERROR: - -defkeymap.c: defkeymap.map - set -e ; loadkeys --mktable $< | sed -e 's/^static *//' > $@ - -qtronixmap.c: qtronixmap.map - set -e ; loadkeys --mktable $< | sed -e 's/^static *//' > $@ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/drivers/char/mem.c --- a/linux-2.4.30-xen-sparse/drivers/char/mem.c Fri Jul 15 19:57:12 
2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,812 +0,0 @@ -/* - * linux/drivers/char/mem.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * Added devfs support. - * Jan-11-1998, C. Scott Ananian <cananian@xxxxxxxxxxxxxxxxxxxx> - * Shared /dev/zero mmaping support, Feb 2000, Kanoj Sarcar <kanoj@xxxxxxx> - * - * MODIFIED FOR XEN by Keir Fraser, 10th July 2003. - * Linux running on Xen has strange semantics for /dev/mem and /dev/kmem!! - * 1. mmap will not work on /dev/kmem - * 2. mmap on /dev/mem interprets the 'file offset' as a machine address - * rather than a physical address. - * I don't believe anyone sane mmaps /dev/kmem, but /dev/mem is mmapped - * to get at memory-mapped I/O spaces (eg. the VESA X server does this). - * For this to work at all we need to expect machine addresses. - * Reading/writing of /dev/kmem expects kernel virtual addresses, as usual. - * Reading/writing of /dev/mem expects 'physical addresses' as usual -- this - * is because /dev/mem can only read/write existing kernel mappings, which - * will be normal RAM, and we should present pseudo-physical layout for all - * except I/O (which is the sticky case that mmap is hacked to deal with). - */ - -#include <linux/config.h> -#include <linux/mm.h> -#include <linux/miscdevice.h> -#include <linux/tpqic02.h> -#include <linux/ftape.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/mman.h> -#include <linux/random.h> -#include <linux/init.h> -#include <linux/raw.h> -#include <linux/tty.h> -#include <linux/capability.h> -#include <linux/ptrace.h> - -#include <asm/uaccess.h> -#include <asm/io.h> -#include <asm/pgalloc.h> - -#ifdef CONFIG_I2C -extern int i2c_init_all(void); -#endif -#ifdef CONFIG_FB -extern void fbmem_init(void); -#endif -#ifdef CONFIG_PROM_CONSOLE -extern void prom_con_init(void); -#endif -#ifdef CONFIG_MDA_CONSOLE -extern void mda_console_init(void); -#endif -#if defined(CONFIG_S390_TAPE) && defined(CONFIG_S390_TAPE_CHAR) -extern void tapechar_init(void); -#endif - -static ssize_t do_write_mem(struct file * file, void *p, unsigned long realp, - const char * buf, size_t count, loff_t *ppos) -{ - ssize_t written; - - written = 0; -#if defined(__sparc__) || defined(__mc68000__) - /* we don't have page 0 mapped on sparc and m68k.. */ - if (realp < PAGE_SIZE) { - unsigned long sz = PAGE_SIZE-realp; - if (sz > count) sz = count; - /* Hmm. Do something? */ - buf+=sz; - p+=sz; - count-=sz; - written+=sz; - } -#endif - if (copy_from_user(p, buf, count)) - return -EFAULT; - written += count; - *ppos = realp + written; - return written; -} - - -/* - * This funcion reads the *physical* memory. The f_pos points directly to the - * memory location. - */ -static ssize_t read_mem(struct file * file, char * buf, - size_t count, loff_t *ppos) -{ - unsigned long p = *ppos; - unsigned long end_mem; - ssize_t read; - - end_mem = __pa(high_memory); - if (p >= end_mem) - return 0; - if (count > end_mem - p) - count = end_mem - p; - read = 0; -#if defined(__sparc__) || defined(__mc68000__) - /* we don't have page 0 mapped on sparc and m68k.. 
*/ - if (p < PAGE_SIZE) { - unsigned long sz = PAGE_SIZE-p; - if (sz > count) - sz = count; - if (sz > 0) { - if (clear_user(buf, sz)) - return -EFAULT; - buf += sz; - p += sz; - count -= sz; - read += sz; - } - } -#endif - if (copy_to_user(buf, __va(p), count)) - return -EFAULT; - read += count; - *ppos = p + read; - return read; -} - -static ssize_t write_mem(struct file * file, const char * buf, - size_t count, loff_t *ppos) -{ - unsigned long p = *ppos; - unsigned long end_mem; - - end_mem = __pa(high_memory); - if (p >= end_mem) - return 0; - if (count > end_mem - p) - count = end_mem - p; - return do_write_mem(file, __va(p), p, buf, count, ppos); -} - -#ifndef pgprot_noncached - -/* - * This should probably be per-architecture in <asm/pgtable.h> - */ -static inline pgprot_t pgprot_noncached(pgprot_t _prot) -{ - unsigned long prot = pgprot_val(_prot); - -#if defined(__i386__) || defined(__x86_64__) - /* On PPro and successors, PCD alone doesn't always mean - uncached because of interactions with the MTRRs. PCD | PWT - means definitely uncached. */ - if (boot_cpu_data.x86 > 3) - prot |= _PAGE_PCD | _PAGE_PWT; -#elif defined(__powerpc__) - prot |= _PAGE_NO_CACHE | _PAGE_GUARDED; -#elif defined(__mc68000__) -#ifdef SUN3_PAGE_NOCACHE - if (MMU_IS_SUN3) - prot |= SUN3_PAGE_NOCACHE; - else -#endif - if (MMU_IS_851 || MMU_IS_030) - prot |= _PAGE_NOCACHE030; - /* Use no-cache mode, serialized */ - else if (MMU_IS_040 || MMU_IS_060) - prot = (prot & _CACHEMASK040) | _PAGE_NOCACHE_S; -#endif - - return __pgprot(prot); -} - -#endif /* !pgprot_noncached */ - -/* - * Architectures vary in how they handle caching for addresses - * outside of main memory. - */ -static inline int noncached_address(unsigned long addr) -{ -#if defined(__i386__) - /* - * On the PPro and successors, the MTRRs are used to set - * memory types for physical addresses outside main memory, - * so blindly setting PCD or PWT on those pages is wrong. - * For Pentiums and earlier, the surround logic should disable - * caching for the high addresses through the KEN pin, but - * we maintain the tradition of paranoia in this code. - */ - return !( test_bit(X86_FEATURE_MTRR, &boot_cpu_data.x86_capability) || - test_bit(X86_FEATURE_K6_MTRR, &boot_cpu_data.x86_capability) || - test_bit(X86_FEATURE_CYRIX_ARR, &boot_cpu_data.x86_capability) || - test_bit(X86_FEATURE_CENTAUR_MCR, &boot_cpu_data.x86_capability) ) - && addr >= __pa(high_memory); -#else - return addr >= __pa(high_memory); -#endif -} - -#if !defined(CONFIG_XEN) -static int mmap_mem(struct file * file, struct vm_area_struct * vma) -{ - unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; - - /* - * Accessing memory above the top the kernel knows about or - * through a file pointer that was marked O_SYNC will be - * done non-cached. - */ - if (noncached_address(offset) || (file->f_flags & O_SYNC)) - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - - /* Don't try to swap out physical pages.. */ - vma->vm_flags |= VM_RESERVED; - - /* - * Don't dump addresses that are not real memory to a core file. 
- */ - if (offset >= __pa(high_memory) || (file->f_flags & O_SYNC)) - vma->vm_flags |= VM_IO; - - if (remap_page_range(vma->vm_start, offset, vma->vm_end-vma->vm_start, - vma->vm_page_prot)) - return -EAGAIN; - return 0; -} -#elif !defined(CONFIG_XEN_PRIVILEGED_GUEST) -static int mmap_mem(struct file * file, struct vm_area_struct * vma) -{ - return -ENXIO; -} -#else -static int mmap_mem(struct file * file, struct vm_area_struct * vma) -{ - unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; - - if (!(xen_start_info.flags & SIF_PRIVILEGED)) - return -ENXIO; - - /* DONTCOPY is essential for Xen as copy_page_range is broken. */ - vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY; - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - if (direct_remap_area_pages(vma->vm_mm, vma->vm_start, offset, - vma->vm_end-vma->vm_start, vma->vm_page_prot, - DOMID_IO)) - return -EAGAIN; - return 0; -} -#endif /* CONFIG_XEN */ - -/* - * This function reads the *virtual* memory as seen by the kernel. - */ -static ssize_t read_kmem(struct file *file, char *buf, - size_t count, loff_t *ppos) -{ - unsigned long p = *ppos; - ssize_t read = 0; - ssize_t virtr = 0; - char * kbuf; /* k-addr because vread() takes vmlist_lock rwlock */ - - if (p < (unsigned long) high_memory) { - read = count; - if (count > (unsigned long) high_memory - p) - read = (unsigned long) high_memory - p; - -#if defined(__sparc__) || defined(__mc68000__) - /* we don't have page 0 mapped on sparc and m68k.. */ - if (p < PAGE_SIZE && read > 0) { - size_t tmp = PAGE_SIZE - p; - if (tmp > read) tmp = read; - if (clear_user(buf, tmp)) - return -EFAULT; - buf += tmp; - p += tmp; - read -= tmp; - count -= tmp; - } -#endif - if (copy_to_user(buf, (char *)p, read)) - return -EFAULT; - p += read; - buf += read; - count -= read; - } - - if (count > 0) { - kbuf = (char *)__get_free_page(GFP_KERNEL); - if (!kbuf) - return -ENOMEM; - while (count > 0) { - int len = count; - - if (len > PAGE_SIZE) - len = PAGE_SIZE; - len = vread(kbuf, (char *)p, len); - if (!len) - break; - if (copy_to_user(buf, kbuf, len)) { - free_page((unsigned long)kbuf); - return -EFAULT; - } - count -= len; - buf += len; - virtr += len; - p += len; - } - free_page((unsigned long)kbuf); - } - *ppos = p; - return virtr + read; -} - -extern long vwrite(char *buf, char *addr, unsigned long count); - -/* - * This function writes to the *virtual* memory as seen by the kernel. 
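
/*
 * A minimal sketch (condensed from the Xen mmap_mem() variant above,
 * not a drop-in replacement; example_mmap_machine is a hypothetical
 * name) of the privileged-guest mapping path: only a domain with
 * SIF_PRIVILEGED may map machine memory, the VMA is marked so it is
 * never swapped, dumped, or copied on fork, and the machine-address
 * range is handed to the hypervisor-aware remapping helper.
 */
static int example_mmap_machine(struct file *file, struct vm_area_struct *vma)
{
	/* under Xen the file offset is a machine address, not physical */
	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;

	if (!(xen_start_info.flags & SIF_PRIVILEGED))
		return -ENXIO;		/* unprivileged guests get nothing */

	/* DONTCOPY matters: copy_page_range cannot handle these mappings */
	vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY;
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

	if (direct_remap_area_pages(vma->vm_mm, vma->vm_start, offset,
				    vma->vm_end - vma->vm_start,
				    vma->vm_page_prot, DOMID_IO))
		return -EAGAIN;
	return 0;
}
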
- */ -static ssize_t write_kmem(struct file * file, const char * buf, - size_t count, loff_t *ppos) -{ - unsigned long p = *ppos; - ssize_t wrote = 0; - ssize_t virtr = 0; - char * kbuf; /* k-addr because vwrite() takes vmlist_lock rwlock */ - - if (p < (unsigned long) high_memory) { - wrote = count; - if (count > (unsigned long) high_memory - p) - wrote = (unsigned long) high_memory - p; - - wrote = do_write_mem(file, (void*)p, p, buf, wrote, ppos); - - p += wrote; - buf += wrote; - count -= wrote; - } - - if (count > 0) { - kbuf = (char *)__get_free_page(GFP_KERNEL); - if (!kbuf) - return -ENOMEM; - while (count > 0) { - int len = count; - - if (len > PAGE_SIZE) - len = PAGE_SIZE; - if (len && copy_from_user(kbuf, buf, len)) { - free_page((unsigned long)kbuf); - return -EFAULT; - } - len = vwrite(kbuf, (char *)p, len); - count -= len; - buf += len; - virtr += len; - p += len; - } - free_page((unsigned long)kbuf); - } - - *ppos = p; - return virtr + wrote; -} - -#if defined(CONFIG_ISA) || !defined(__mc68000__) -static ssize_t read_port(struct file * file, char * buf, - size_t count, loff_t *ppos) -{ - unsigned long i = *ppos; - char *tmp = buf; - - if (verify_area(VERIFY_WRITE,buf,count)) - return -EFAULT; - while (count-- > 0 && i < 65536) { - if (__put_user(inb(i),tmp) < 0) - return -EFAULT; - i++; - tmp++; - } - *ppos = i; - return tmp-buf; -} - -static ssize_t write_port(struct file * file, const char * buf, - size_t count, loff_t *ppos) -{ - unsigned long i = *ppos; - const char * tmp = buf; - - if (verify_area(VERIFY_READ,buf,count)) - return -EFAULT; - while (count-- > 0 && i < 65536) { - char c; - if (__get_user(c, tmp)) - return -EFAULT; - outb(c,i); - i++; - tmp++; - } - *ppos = i; - return tmp-buf; -} -#endif - -static ssize_t read_null(struct file * file, char * buf, - size_t count, loff_t *ppos) -{ - return 0; -} - -static ssize_t write_null(struct file * file, const char * buf, - size_t count, loff_t *ppos) -{ - return count; -} - -/* - * For fun, we are using the MMU for this. - */ -static inline size_t read_zero_pagealigned(char * buf, size_t size) -{ - struct mm_struct *mm; - struct vm_area_struct * vma; - unsigned long addr=(unsigned long)buf; - - mm = current->mm; - /* Oops, this was forgotten before. -ben */ - down_read(&mm->mmap_sem); - - /* For private mappings, just map in zero pages. */ - for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) { - unsigned long count; - - if (vma->vm_start > addr || (vma->vm_flags & VM_WRITE) == 0) - goto out_up; - if (vma->vm_flags & VM_SHARED) - break; - count = vma->vm_end - addr; - if (count > size) - count = size; - - zap_page_range(mm, addr, count); - zeromap_page_range(addr, count, PAGE_COPY); - - size -= count; - buf += count; - addr += count; - if (size == 0) - goto out_up; - } - - up_read(&mm->mmap_sem); - - /* The shared case is hard. Let's do the conventional zeroing. */ - do { - unsigned long unwritten = clear_user(buf, PAGE_SIZE); - if (unwritten) - return size + unwritten - PAGE_SIZE; - if (current->need_resched) - schedule(); - buf += PAGE_SIZE; - size -= PAGE_SIZE; - } while (size); - - return size; -out_up: - up_read(&mm->mmap_sem); - return size; -} - -static ssize_t read_zero(struct file * file, char * buf, - size_t count, loff_t *ppos) -{ - unsigned long left, unwritten, written = 0; - - if (!count) - return 0; - - if (!access_ok(VERIFY_WRITE, buf, count)) - return -EFAULT; - - left = count; - - /* do we want to be clever? 
Arbitrary cut-off */ - if (count >= PAGE_SIZE*4) { - unsigned long partial; - - /* How much left of the page? */ - partial = (PAGE_SIZE-1) & -(unsigned long) buf; - unwritten = clear_user(buf, partial); - written = partial - unwritten; - if (unwritten) - goto out; - left -= partial; - buf += partial; - unwritten = read_zero_pagealigned(buf, left & PAGE_MASK); - written += (left & PAGE_MASK) - unwritten; - if (unwritten) - goto out; - buf += left & PAGE_MASK; - left &= ~PAGE_MASK; - } - unwritten = clear_user(buf, left); - written += left - unwritten; -out: - return written ? written : -EFAULT; -} - -static int mmap_zero(struct file * file, struct vm_area_struct * vma) -{ - if (vma->vm_flags & VM_SHARED) - return shmem_zero_setup(vma); - if (zeromap_page_range(vma->vm_start, vma->vm_end - vma->vm_start, vma->vm_page_prot)) - return -EAGAIN; - return 0; -} - -static ssize_t write_full(struct file * file, const char * buf, - size_t count, loff_t *ppos) -{ - return -ENOSPC; -} - -/* - * Special lseek() function for /dev/null and /dev/zero. Most notably, you - * can fopen() both devices with "a" now. This was previously impossible. - * -- SRB. - */ - -static loff_t null_lseek(struct file * file, loff_t offset, int orig) -{ - return file->f_pos = 0; -} - -/* - * The memory devices use the full 32/64 bits of the offset, and so we cannot - * check against negative addresses: they are ok. The return value is weird, - * though, in that case (0). - * - * also note that seeking relative to the "end of file" isn't supported: - * it has no meaning, so it returns -EINVAL. - */ -static loff_t memory_lseek(struct file * file, loff_t offset, int orig) -{ - loff_t ret; - - switch (orig) { - case 0: - file->f_pos = offset; - ret = file->f_pos; - force_successful_syscall_return(); - break; - case 1: - file->f_pos += offset; - ret = file->f_pos; - force_successful_syscall_return(); - break; - default: - ret = -EINVAL; - } - return ret; -} - -static int open_port(struct inode * inode, struct file * filp) -{ - return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; -} - -struct page *kmem_vm_nopage(struct vm_area_struct *vma, unsigned long address, int write) -{ - unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; - unsigned long kaddr; - pgd_t *pgd; - pmd_t *pmd; - pte_t *ptep, pte; - struct page *page = NULL; - - /* address is user VA; convert to kernel VA of desired page */ - kaddr = (address - vma->vm_start) + offset; - kaddr = VMALLOC_VMADDR(kaddr); - - spin_lock(&init_mm.page_table_lock); - - /* Lookup page structure for kernel VA */ - pgd = pgd_offset(&init_mm, kaddr); - if (pgd_none(*pgd) || pgd_bad(*pgd)) - goto out; - pmd = pmd_offset(pgd, kaddr); - if (pmd_none(*pmd) || pmd_bad(*pmd)) - goto out; - ptep = pte_offset(pmd, kaddr); - if (!ptep) - goto out; - pte = *ptep; - if (!pte_present(pte)) - goto out; - if (write && !pte_write(pte)) - goto out; - page = pte_page(pte); - if (!VALID_PAGE(page)) { - page = NULL; - goto out; - } - - /* Increment reference count on page */ - get_page(page); - -out: - spin_unlock(&init_mm.page_table_lock); - - return page; -} - -struct vm_operations_struct kmem_vm_ops = { - nopage: kmem_vm_nopage, -}; - -static int mmap_kmem(struct file * file, struct vm_area_struct * vma) -{ - unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; - unsigned long size = vma->vm_end - vma->vm_start; - - /* - * If the user is not attempting to mmap a high memory address then - * the standard mmap_mem mechanism will work. 
High memory addresses - * need special handling, as remap_page_range expects a physically- - * contiguous range of kernel addresses (such as obtained in kmalloc). - */ - if ((offset + size) < (unsigned long) high_memory) - return mmap_mem(file, vma); - - /* - * Accessing memory above the top the kernel knows about or - * through a file pointer that was marked O_SYNC will be - * done non-cached. - */ - if (noncached_address(offset) || (file->f_flags & O_SYNC)) - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - - /* Don't do anything here; "nopage" will fill the holes */ - vma->vm_ops = &kmem_vm_ops; - - /* Don't try to swap out physical pages.. */ - vma->vm_flags |= VM_RESERVED; - - /* - * Don't dump addresses that are not real memory to a core file. - */ - vma->vm_flags |= VM_IO; - - return 0; -} - -#define zero_lseek null_lseek -#define full_lseek null_lseek -#define write_zero write_null -#define read_full read_zero -#define open_mem open_port -#define open_kmem open_mem - -static struct file_operations mem_fops = { - llseek: memory_lseek, - read: read_mem, - write: write_mem, - mmap: mmap_mem, - open: open_mem, -}; - -static struct file_operations kmem_fops = { - llseek: memory_lseek, - read: read_kmem, - write: write_kmem, -#if !defined(CONFIG_XEN) - mmap: mmap_kmem, -#endif - open: open_kmem, -}; - -static struct file_operations null_fops = { - llseek: null_lseek, - read: read_null, - write: write_null, -}; - -#if defined(CONFIG_ISA) || !defined(__mc68000__) -static struct file_operations port_fops = { - llseek: memory_lseek, - read: read_port, - write: write_port, - open: open_port, -}; -#endif - -static struct file_operations zero_fops = { - llseek: zero_lseek, - read: read_zero, - write: write_zero, - mmap: mmap_zero, -}; - -static struct file_operations full_fops = { - llseek: full_lseek, - read: read_full, - write: write_full, -}; - -static int memory_open(struct inode * inode, struct file * filp) -{ - switch (MINOR(inode->i_rdev)) { - case 1: - filp->f_op = &mem_fops; - break; - case 2: - filp->f_op = &kmem_fops; - break; - case 3: - filp->f_op = &null_fops; - break; -#if defined(CONFIG_ISA) || !defined(__mc68000__) - case 4: - filp->f_op = &port_fops; - break; -#endif - case 5: - filp->f_op = &zero_fops; - break; - case 7: - filp->f_op = &full_fops; - break; - case 8: - filp->f_op = &random_fops; - break; - case 9: - filp->f_op = &urandom_fops; - break; - default: - return -ENXIO; - } - if (filp->f_op && filp->f_op->open) - return filp->f_op->open(inode,filp); - return 0; -} - -void __init memory_devfs_register (void) -{ - /* These are never unregistered */ - static const struct { - unsigned short minor; - char *name; - umode_t mode; - struct file_operations *fops; - } list[] = { /* list of minor devices */ - {1, "mem", S_IRUSR | S_IWUSR | S_IRGRP, &mem_fops}, - {2, "kmem", S_IRUSR | S_IWUSR | S_IRGRP, &kmem_fops}, - {3, "null", S_IRUGO | S_IWUGO, &null_fops}, -#if defined(CONFIG_ISA) || !defined(__mc68000__) - {4, "port", S_IRUSR | S_IWUSR | S_IRGRP, &port_fops}, -#endif - {5, "zero", S_IRUGO | S_IWUGO, &zero_fops}, - {7, "full", S_IRUGO | S_IWUGO, &full_fops}, - {8, "random", S_IRUGO | S_IWUSR, &random_fops}, - {9, "urandom", S_IRUGO | S_IWUSR, &urandom_fops} - }; - int i; - - for (i=0; i<(sizeof(list)/sizeof(*list)); i++) - devfs_register (NULL, list[i].name, DEVFS_FL_NONE, - MEM_MAJOR, list[i].minor, - list[i].mode | S_IFCHR, - list[i].fops, NULL); -} - -static struct file_operations memory_fops = { - open: memory_open, /* just a selector for the real open */ -}; 
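
/*
 * A minimal sketch (names reuse definitions from the deleted file;
 * example_selector_open itself is hypothetical) of the "selector"
 * pattern memory_open() implements above: one char major is registered
 * with a single fops whose open() installs the real file_operations
 * for the requested minor, then re-dispatches the open to it.
 */
static int example_selector_open(struct inode *inode, struct file *filp)
{
	switch (MINOR(inode->i_rdev)) {
	case 1: filp->f_op = &mem_fops;  break;	/* /dev/mem  */
	case 3: filp->f_op = &null_fops; break;	/* /dev/null */
	default: return -ENXIO;
	}
	/* later read/write/mmap calls now go straight to the real fops */
	if (filp->f_op->open)
		return filp->f_op->open(inode, filp);
	return 0;
}
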
- -int __init chr_dev_init(void) -{ - if (devfs_register_chrdev(MEM_MAJOR,"mem",&memory_fops)) - printk("unable to get major %d for memory devs\n", MEM_MAJOR); - memory_devfs_register(); - rand_initialize(); -#ifdef CONFIG_I2C - i2c_init_all(); -#endif -#if defined (CONFIG_FB) - fbmem_init(); -#endif -#if defined (CONFIG_PROM_CONSOLE) - prom_con_init(); -#endif -#if defined (CONFIG_MDA_CONSOLE) - mda_console_init(); -#endif - tty_init(); -#ifdef CONFIG_M68K_PRINTER - lp_m68k_init(); -#endif - misc_init(); -#if CONFIG_QIC02_TAPE - qic02_tape_init(); -#endif -#ifdef CONFIG_FTAPE - ftape_init(); -#endif -#if defined(CONFIG_S390_TAPE) && defined(CONFIG_S390_TAPE_CHAR) - tapechar_init(); -#endif - return 0; -} - -__initcall(chr_dev_init); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/drivers/char/tty_io.c --- a/linux-2.4.30-xen-sparse/drivers/char/tty_io.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,2891 +0,0 @@ -/* - * linux/drivers/char/tty_io.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -/* - * 'tty_io.c' gives an orthogonal feeling to tty's, be they consoles - * or rs-channels. It also implements echoing, cooked mode etc. - * - * Kill-line thanks to John T Kohl, who also corrected VMIN = VTIME = 0. - * - * Modified by Theodore Ts'o, 9/14/92, to dynamically allocate the - * tty_struct and tty_queue structures. Previously there was an array - * of 256 tty_struct's which was statically allocated, and the - * tty_queue structures were allocated at boot time. Both are now - * dynamically allocated only when the tty is open. - * - * Also restructured routines so that there is more of a separation - * between the high-level tty routines (tty_io.c and tty_ioctl.c) and - * the low-level tty routines (serial.c, pty.c, console.c). This - * makes for cleaner and more compact code. -TYT, 9/17/92 - * - * Modified by Fred N. van Kempen, 01/29/93, to add line disciplines - * which can be dynamically activated and de-activated by the line - * discipline handling modules (like SLIP). - * - * NOTE: pay no attention to the line discipline code (yet); its - * interface is still subject to change in this version... - * -- TYT, 1/31/92 - * - * Added functionality to the OPOST tty handling. No delays, but all - * other bits should be there. - * -- Nick Holloway <alfie@xxxxxxxxxxxxxxxxx>, 27th May 1993. - * - * Rewrote canonical mode and added more termios flags. - * -- julian@xxxxxxxxxxxxxxxxxxxxxx (J. Cowley), 13Jan94 - * - * Reorganized FASYNC support so mouse code can share it. - * -- ctm@xxxxxxxx, 9Sep95 - * - * New TIOCLINUX variants added. - * -- mj@xxxxxxxxxxxxxxxxx, 19-Nov-95 - * - * Restrict vt switching via ioctl() - * -- grif@xxxxxxxxxx, 5-Dec-95 - * - * Move console and virtual terminal code to more appropriate files, - * implement CONFIG_VT and generalize console device interface. - * -- Marko Kohtala <Marko.Kohtala@xxxxxx>, March 97 - * - * Rewrote init_dev and release_dev to eliminate races. - * -- Bill Hawes <whawes@xxxxxxxx>, June 97 - * - * Added devfs support. - * -- C. Scott Ananian <cananian@xxxxxxxxxxxxxxxxxxxx>, 13-Jan-1998 - * - * Added support for a Unix98-style ptmx device. - * -- C. Scott Ananian <cananian@xxxxxxxxxxxxxxxxxxxx>, 14-Jan-1998 - * - * Reduced memory usage for older ARM systems - * -- Russell King <rmk@xxxxxxxxxxxxxxxx> - * - * Move do_SAK() into process context. Less stack use in devfs functions. 
- * alloc_tty_struct() always uses kmalloc() -- Andrew Morton <andrewm@xxxxxxxxxx> 17Mar01 - */ - -#include <linux/config.h> -#include <linux/types.h> -#include <linux/major.h> -#include <linux/errno.h> -#include <linux/signal.h> -#include <linux/fcntl.h> -#include <linux/sched.h> -#include <linux/interrupt.h> -#include <linux/tty.h> -#include <linux/tty_driver.h> -#include <linux/tty_flip.h> -#include <linux/devpts_fs.h> -#include <linux/file.h> -#include <linux/console.h> -#include <linux/timer.h> -#include <linux/ctype.h> -#include <linux/kd.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/slab.h> -#include <linux/poll.h> -#include <linux/proc_fs.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/smp_lock.h> - -#include <asm/uaccess.h> -#include <asm/system.h> -#include <asm/bitops.h> - -#include <linux/kbd_kern.h> -#include <linux/vt_kern.h> -#include <linux/selection.h> -#include <linux/devfs_fs_kernel.h> - -#include <linux/kmod.h> - -#ifdef CONFIG_XEN_CONSOLE -extern void xen_console_init(void); -#endif - -#ifdef CONFIG_VT -extern void con_init_devfs (void); -#endif - -extern void disable_early_printk(void); - -#define CONSOLE_DEV MKDEV(TTY_MAJOR,0) -#define TTY_DEV MKDEV(TTYAUX_MAJOR,0) -#define SYSCONS_DEV MKDEV(TTYAUX_MAJOR,1) -#define PTMX_DEV MKDEV(TTYAUX_MAJOR,2) - -#undef TTY_DEBUG_HANGUP - -#define TTY_PARANOIA_CHECK 1 -#define CHECK_TTY_COUNT 1 - -struct termios tty_std_termios; /* for the benefit of tty drivers */ -struct tty_driver *tty_drivers; /* linked list of tty drivers */ - -#ifdef CONFIG_UNIX98_PTYS -extern struct tty_driver ptm_driver[]; /* Unix98 pty masters; for /dev/ptmx */ -extern struct tty_driver pts_driver[]; /* Unix98 pty slaves; for /dev/ptmx */ -#endif - -static void initialize_tty_struct(struct tty_struct *tty); - -static ssize_t tty_read(struct file *, char *, size_t, loff_t *); -static ssize_t tty_write(struct file *, const char *, size_t, loff_t *); -static unsigned int tty_poll(struct file *, poll_table *); -static int tty_open(struct inode *, struct file *); -static int tty_release(struct inode *, struct file *); -int tty_ioctl(struct inode * inode, struct file * file, - unsigned int cmd, unsigned long arg); -static int tty_fasync(int fd, struct file * filp, int on); -extern int vme_scc_init (void); -extern long vme_scc_console_init(void); -extern int serial167_init(void); -extern long serial167_console_init(void); -extern void console_8xx_init(void); -extern void au1x00_serial_console_init(void); -extern int rs_8xx_init(void); -extern void mac_scc_console_init(void); -extern void hwc_console_init(void); -extern void hwc_tty_init(void); -extern void con3215_init(void); -extern void tty3215_init(void); -extern void tub3270_con_init(void); -extern void tub3270_init(void); -extern void rs285_console_init(void); -extern void sa1100_rs_console_init(void); -extern void sgi_serial_console_init(void); -extern void sn_sal_serial_console_init(void); -extern void sci_console_init(void); -extern void dec_serial_console_init(void); -extern void tx3912_console_init(void); -extern void tx3912_rs_init(void); -extern void txx927_console_init(void); -extern void txx9_rs_init(void); -extern void txx9_serial_console_init(void); -extern void sb1250_serial_console_init(void); -extern void arc_console_init(void); -extern int hvc_console_init(void); - -#ifndef MIN -#define MIN(a,b) ((a) < (b) ? (a) : (b)) -#endif -#ifndef MAX -#define MAX(a,b) ((a) < (b) ? 
(b) : (a)) -#endif - -static struct tty_struct *alloc_tty_struct(void) -{ - struct tty_struct *tty; - - tty = kmalloc(sizeof(struct tty_struct), GFP_KERNEL); - if (tty) - memset(tty, 0, sizeof(struct tty_struct)); - return tty; -} - -static inline void free_tty_struct(struct tty_struct *tty) -{ - kfree(tty); -} - -/* - * This routine returns the name of tty. - */ -static char * -_tty_make_name(struct tty_struct *tty, const char *name, char *buf) -{ - int idx = (tty)?MINOR(tty->device) - tty->driver.minor_start:0; - - if (!tty) /* Hmm. NULL pointer. That's fun. */ - strcpy(buf, "NULL tty"); - else - sprintf(buf, name, - idx + tty->driver.name_base); - - return buf; -} - -#define TTY_NUMBER(tty) (MINOR((tty)->device) - (tty)->driver.minor_start + \ - (tty)->driver.name_base) - -char *tty_name(struct tty_struct *tty, char *buf) -{ - return _tty_make_name(tty, (tty)?tty->driver.name:NULL, buf); -} - -inline int tty_paranoia_check(struct tty_struct *tty, kdev_t device, - const char *routine) -{ -#ifdef TTY_PARANOIA_CHECK - static const char badmagic[] = KERN_WARNING - "Warning: bad magic number for tty struct (%s) in %s\n"; - static const char badtty[] = KERN_WARNING - "Warning: null TTY for (%s) in %s\n"; - - if (!tty) { - printk(badtty, kdevname(device), routine); - return 1; - } - if (tty->magic != TTY_MAGIC) { - printk(badmagic, kdevname(device), routine); - return 1; - } -#endif - return 0; -} - -static int check_tty_count(struct tty_struct *tty, const char *routine) -{ -#ifdef CHECK_TTY_COUNT - struct list_head *p; - int count = 0; - - file_list_lock(); - for(p = tty->tty_files.next; p != &tty->tty_files; p = p->next) { - if(list_entry(p, struct file, f_list)->private_data == tty) - count++; - } - file_list_unlock(); - if (tty->driver.type == TTY_DRIVER_TYPE_PTY && - tty->driver.subtype == PTY_TYPE_SLAVE && - tty->link && tty->link->count) - count++; - if (tty->count != count) { - printk(KERN_WARNING "Warning: dev (%s) tty->count(%d) " - "!= #fd's(%d) in %s\n", - kdevname(tty->device), tty->count, count, routine); - return count; - } -#endif - return 0; -} - -/* - * This is probably overkill for real world processors but - * they are not on hot paths so a little discipline won't do - * any harm. - */ - -static void tty_set_termios_ldisc(struct tty_struct *tty, int num) -{ - down(&tty->termios_sem); - tty->termios->c_line = num; - up(&tty->termios_sem); -} - -/* - * This guards the refcounted line discipline lists. The lock - * must be taken with irqs off because there are hangup path - * callers who will do ldisc lookups and cannot sleep. 
- */ - -spinlock_t tty_ldisc_lock = SPIN_LOCK_UNLOCKED; -DECLARE_WAIT_QUEUE_HEAD(tty_ldisc_wait); -struct tty_ldisc tty_ldiscs[NR_LDISCS]; /* line disc dispatch table */ - -int tty_register_ldisc(int disc, struct tty_ldisc *new_ldisc) -{ - - unsigned long flags; - int ret = 0; - - if (disc < N_TTY || disc >= NR_LDISCS) - return -EINVAL; - - spin_lock_irqsave(&tty_ldisc_lock, flags); - if (new_ldisc) { - tty_ldiscs[disc] = *new_ldisc; - tty_ldiscs[disc].num = disc; - tty_ldiscs[disc].flags |= LDISC_FLAG_DEFINED; - tty_ldiscs[disc].refcount = 0; - } else { - if(tty_ldiscs[disc].refcount) - ret = -EBUSY; - else - tty_ldiscs[disc].flags &= ~LDISC_FLAG_DEFINED; - } - spin_unlock_irqrestore(&tty_ldisc_lock, flags); - - return ret; - -} - - -EXPORT_SYMBOL(tty_register_ldisc); - -struct tty_ldisc *tty_ldisc_get(int disc) -{ - unsigned long flags; - struct tty_ldisc *ld; - - if (disc < N_TTY || disc >= NR_LDISCS) - return NULL; - - spin_lock_irqsave(&tty_ldisc_lock, flags); - - ld = &tty_ldiscs[disc]; - /* Check the entry is defined */ - if(ld->flags & LDISC_FLAG_DEFINED) - ld->refcount++; - else - ld = NULL; - spin_unlock_irqrestore(&tty_ldisc_lock, flags); - return ld; -} - -EXPORT_SYMBOL_GPL(tty_ldisc_get); - -void tty_ldisc_put(int disc) -{ - struct tty_ldisc *ld; - unsigned long flags; - - if (disc < N_TTY || disc >= NR_LDISCS) - BUG(); - - spin_lock_irqsave(&tty_ldisc_lock, flags); - ld = &tty_ldiscs[disc]; - if(ld->refcount <= 0) - BUG(); - ld->refcount--; - spin_unlock_irqrestore(&tty_ldisc_lock, flags); -} - -EXPORT_SYMBOL_GPL(tty_ldisc_put); - -void tty_ldisc_assign(struct tty_struct *tty, struct tty_ldisc *ld) -{ - tty->ldisc = *ld; - tty->ldisc.refcount = 0; -} - -/** - * tty_ldisc_try - internal helper - * @tty: the tty - * - * Make a single attempt to grab and bump the refcount on - * the tty ldisc. Return 0 on failure or 1 on success. This is - * used to implement both the waiting and non waiting versions - * of tty_ldisc_ref - */ - -static int tty_ldisc_try(struct tty_struct *tty) -{ - unsigned long flags; - struct tty_ldisc *ld; - int ret = 0; - - spin_lock_irqsave(&tty_ldisc_lock, flags); - ld = &tty->ldisc; - if(test_bit(TTY_LDISC, &tty->flags)) - { - ld->refcount++; - ret = 1; - } - spin_unlock_irqrestore(&tty_ldisc_lock, flags); - return ret; -} - -/** - * tty_ldisc_ref_wait - wait for the tty ldisc - * @tty: tty device - * - * Dereference the line discipline for the terminal and take a - * reference to it. If the line discipline is in flux then - * wait patiently until it changes. - * - * Note: Must not be called from an IRQ/timer context. The caller - * must also be careful not to hold other locks that will deadlock - * against a discipline change, such as an existing ldisc reference - * (which we check for) - */ - -struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *tty) -{ - /* wait_event is a macro */ - wait_event(tty_ldisc_wait, tty_ldisc_try(tty)); - return &tty->ldisc; -} - -EXPORT_SYMBOL_GPL(tty_ldisc_ref_wait); - -/** - * tty_ldisc_ref - get the tty ldisc - * @tty: tty device - * - * Dereference the line discipline for the terminal and take a - * reference to it. If the line discipline is in flux then - * return NULL. Can be called from IRQ and timer functions. 
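
/*
 * A minimal sketch (example_kick_ldisc is hypothetical; the helpers
 * are the ones defined here) of the usage pattern driver-side code is
 * expected to follow: take a counted reference, use the ldisc while it
 * is pinned, then drop the reference so a pending discipline change
 * can proceed. From IRQ/timer context only the non-blocking
 * tty_ldisc_ref() is legal, and a NULL return means the discipline is
 * in flux and the call must simply be skipped.
 */
static void example_kick_ldisc(struct tty_struct *tty)
{
	struct tty_ldisc *ld = tty_ldisc_ref(tty);	/* may return NULL */

	if (ld) {
		if (ld->write_wakeup)
			ld->write_wakeup(tty);	/* use it while pinned */
		tty_ldisc_deref(ld);		/* wakes waiters at zero */
	}
}
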
- */ - -struct tty_ldisc *tty_ldisc_ref(struct tty_struct *tty) -{ - if(tty_ldisc_try(tty)) - return &tty->ldisc; - return NULL; -} - -EXPORT_SYMBOL_GPL(tty_ldisc_ref); - - -void tty_ldisc_deref(struct tty_ldisc *ld) -{ - - unsigned long flags; - - if(ld == NULL) - BUG(); - - spin_lock_irqsave(&tty_ldisc_lock, flags); - if(ld->refcount == 0) - printk(KERN_EMERG "tty_ldisc_deref: no references.\n"); - else - ld->refcount--; - if(ld->refcount == 0) - wake_up(&tty_ldisc_wait); - spin_unlock_irqrestore(&tty_ldisc_lock, flags); -} - -EXPORT_SYMBOL_GPL(tty_ldisc_deref); - -/** - * tty_ldisc_enable - allow ldisc use - * @tty: terminal to activate ldisc on - * - * Set the TTY_LDISC flag when the line discipline can be called - * again. Do neccessary wakeups for existing sleepers. - * - * Note: nobody should set this bit except via this function. Clearing - * directly is allowed. - */ - -static void tty_ldisc_enable(struct tty_struct *tty) -{ - set_bit(TTY_LDISC, &tty->flags); - wake_up(&tty_ldisc_wait); -} - -/** - * tty_set_ldisc - set line discipline - * @tty: the terminal to set - * @ldisc: the line discipline - * - * Set the discipline of a tty line. Must be called from a process - * context. - */ - -static int tty_set_ldisc(struct tty_struct *tty, int ldisc) -{ - int retval = 0; - struct tty_ldisc o_ldisc; - char buf[64]; - unsigned long flags; - struct tty_ldisc *ld; - - if ((ldisc < N_TTY) || (ldisc >= NR_LDISCS)) - return -EINVAL; - -restart: - - if (tty->ldisc.num == ldisc) - return 0; /* We are already in the desired discipline */ - - ld = tty_ldisc_get(ldisc); - /* Eduardo Blanco <ejbs@xxxxxxxxxxxx> */ - /* Cyrus Durgin <cider@xxxxxxxxxxxxx> */ - if (ld == NULL) - { - char modname [20]; - sprintf(modname, "tty-ldisc-%d", ldisc); - request_module (modname); - ld = tty_ldisc_get(ldisc); - } - - if (ld == NULL) - return -EINVAL; - - - o_ldisc = tty->ldisc; - tty_wait_until_sent(tty, 0); - - /* - * Make sure we don't change while someone holds a - * reference to the line discipline. The TTY_LDISC bit - * prevents anyone taking a reference once it is clear. - * We need the lock to avoid racing reference takers. - */ - - spin_lock_irqsave(&tty_ldisc_lock, flags); - if(tty->ldisc.refcount) - { - /* Free the new ldisc we grabbed. Must drop the lock - first. */ - spin_unlock_irqrestore(&tty_ldisc_lock, flags); - tty_ldisc_put(ldisc); - /* - * There are several reasons we may be busy, including - * random momentary I/O traffic. We must therefore - * retry. We could distinguish between blocking ops - * and retries if we made tty_ldisc_wait() smarter. That - * is up for discussion. - */ - if(wait_event_interruptible(tty_ldisc_wait, tty->ldisc.refcount == 0) < 0) - return -ERESTARTSYS; - goto restart; - } - clear_bit(TTY_LDISC, &tty->flags); - clear_bit(TTY_DONT_FLIP, &tty->flags); - spin_unlock_irqrestore(&tty_ldisc_lock, flags); - - /* - * From this point on we know nobody has an ldisc - * usage reference, nor can they obtain one until - * we say so later on. - */ - - /* - * Wait for ->hangup_work and ->flip.work handlers to terminate - */ - run_task_queue(&tq_timer); - flush_scheduled_tasks(); - - /* Shutdown the current discipline. */ - if (tty->ldisc.close) - (tty->ldisc.close)(tty); - - /* Now set up the new line discipline. 
*/ - tty_ldisc_assign(tty, ld); - tty_set_termios_ldisc(tty, ldisc); - if (tty->ldisc.open) - retval = (tty->ldisc.open)(tty); - if (retval < 0) { - tty_ldisc_put(ldisc); - /* There is an outstanding reference here so this is safe */ - tty_ldisc_assign(tty, tty_ldisc_get(o_ldisc.num)); - tty_set_termios_ldisc(tty, tty->ldisc.num); - if (tty->ldisc.open && (tty->ldisc.open(tty) < 0)) { - tty_ldisc_put(o_ldisc.num); - /* This driver is always present */ - tty_ldisc_assign(tty, tty_ldisc_get(N_TTY)); - tty_set_termios_ldisc(tty, N_TTY); - if (tty->ldisc.open) { - int r = tty->ldisc.open(tty); - - if (r < 0) - panic("Couldn't open N_TTY ldisc for " - "%s --- error %d.", - tty_name(tty, buf), r); - } - } - } - /* At this point we hold a reference to the new ldisc and a - reference to the old ldisc. If we ended up flipping back - to the existing ldisc we have two references to it */ - - if (tty->ldisc.num != o_ldisc.num && tty->driver.set_ldisc) - tty->driver.set_ldisc(tty); - - tty_ldisc_put(o_ldisc.num); - - /* - * Allow ldisc referencing to occur as soon as the driver - * ldisc callback completes. - */ - tty_ldisc_enable(tty); - - return retval; -} - -/* - * This routine returns a tty driver structure, given a device number - */ -struct tty_driver *get_tty_driver(kdev_t device) -{ - int major, minor; - struct tty_driver *p; - - minor = MINOR(device); - major = MAJOR(device); - - for (p = tty_drivers; p; p = p->next) { - if (p->major != major) - continue; - if (minor < p->minor_start) - continue; - if (minor >= p->minor_start + p->num) - continue; - return p; - } - return NULL; -} - -/* - * If we try to write to, or set the state of, a terminal and we're - * not in the foreground, send a SIGTTOU. If the signal is blocked or - * ignored, go ahead and perform the operation. (POSIX 7.2) - */ -int tty_check_change(struct tty_struct * tty) -{ - if (current->tty != tty) - return 0; - if (tty->pgrp <= 0) { - printk(KERN_WARNING "tty_check_change: tty->pgrp <= 0!\n"); - return 0; - } - if (current->pgrp == tty->pgrp) - return 0; - if (is_ignored(SIGTTOU)) - return 0; - if (is_orphaned_pgrp(current->pgrp)) - return -EIO; - (void) kill_pg(current->pgrp,SIGTTOU,1); - return -ERESTARTSYS; -} - -static ssize_t hung_up_tty_read(struct file * file, char * buf, - size_t count, loff_t *ppos) -{ - /* Can't seek (pread) on ttys. */ - if (ppos != &file->f_pos) - return -ESPIPE; - return 0; -} - -static ssize_t hung_up_tty_write(struct file * file, const char * buf, - size_t count, loff_t *ppos) -{ - /* Can't seek (pwrite) on ttys. */ - if (ppos != &file->f_pos) - return -ESPIPE; - return -EIO; -} - -/* No kernel lock held - none needed ;) */ -static unsigned int hung_up_tty_poll(struct file * filp, poll_table * wait) -{ - return POLLIN | POLLOUT | POLLERR | POLLHUP | POLLRDNORM | POLLWRNORM; -} - -static int hung_up_tty_ioctl(struct inode * inode, struct file * file, - unsigned int cmd, unsigned long arg) -{ - return cmd == TIOCSPGRP ? 
-ENOTTY : -EIO; -} - -static struct file_operations tty_fops = { - llseek: no_llseek, - read: tty_read, - write: tty_write, - poll: tty_poll, - ioctl: tty_ioctl, - open: tty_open, - release: tty_release, - fasync: tty_fasync, -}; - -static struct file_operations hung_up_tty_fops = { - llseek: no_llseek, - read: hung_up_tty_read, - write: hung_up_tty_write, - poll: hung_up_tty_poll, - ioctl: hung_up_tty_ioctl, - release: tty_release, -}; - -static spinlock_t redirect_lock = SPIN_LOCK_UNLOCKED; -static struct file *redirect; - -/** - * tty_wakeup - request more data - * @tty: terminal - * - * Internal and external helper for wakeups of tty. This function - * informs the line discipline if present that the driver is ready\ - * to receive more output data. - */ - -void tty_wakeup(struct tty_struct *tty) -{ - struct tty_ldisc *ld; - - if (test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) { - ld = tty_ldisc_ref(tty); - if(ld) { - if(ld->write_wakeup) - ld->write_wakeup(tty); - tty_ldisc_deref(ld); - } - } - wake_up_interruptible(&tty->write_wait); -} - -/* - * tty_wakeup/tty_ldisc_flush are actually _GPL exports but we can't do - * that in 2.4 for modutils compat reasons. - */ -EXPORT_SYMBOL(tty_wakeup); - - -void tty_ldisc_flush(struct tty_struct *tty) -{ - struct tty_ldisc *ld = tty_ldisc_ref(tty); - if(ld) { - if(ld->flush_buffer) - ld->flush_buffer(tty); - tty_ldisc_deref(ld); - } -} - - -/* - * tty_wakeup/tty_ldisc_flush are actually _GPL exports but we can't do - * that in 2.4 for modutils compat reasons. - */ -EXPORT_SYMBOL(tty_ldisc_flush); - -void do_tty_hangup(void *data) -{ - struct tty_struct *tty = (struct tty_struct *) data; - struct file * cons_filp = NULL; - struct file *f = NULL; - struct task_struct *p; - struct list_head *l; - struct tty_ldisc *ld; - int closecount = 0, n; - - if (!tty) - return; - - /* inuse_filps is protected by the single kernel lock */ - lock_kernel(); - - spin_lock(&redirect_lock); - if (redirect && redirect->private_data == tty) { - f = redirect; - redirect = NULL; - } - spin_unlock(&redirect_lock); - - check_tty_count(tty, "do_tty_hangup"); - file_list_lock(); - for (l = tty->tty_files.next; l != &tty->tty_files; l = l->next) { - struct file * filp = list_entry(l, struct file, f_list); - if (filp->f_dentry->d_inode->i_rdev == CONSOLE_DEV || - filp->f_dentry->d_inode->i_rdev == SYSCONS_DEV) { - cons_filp = filp; - continue; - } - if (filp->f_op != &tty_fops) - continue; - closecount++; - tty_fasync(-1, filp, 0); /* can't block */ - filp->f_op = &hung_up_tty_fops; - } - file_list_unlock(); - - /* FIXME! What are the locking issues here? This may me overdoing things.. */ - ld = tty_ldisc_ref(tty); - if(ld != NULL) - { - if (ld->flush_buffer) - ld->flush_buffer(tty); - if (tty->driver.flush_buffer) - tty->driver.flush_buffer(tty); - if ((test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) && ld->write_wakeup) - ld->write_wakeup(tty); - if (ld->hangup) - ld->hangup(tty); - } - - /* FIXME: Once we trust the LDISC code better we can wait here for - ldisc completion and fix the driver call race */ - - wake_up_interruptible(&tty->write_wait); - wake_up_interruptible(&tty->read_wait); - - /* - * Shutdown the current line discipline, and reset it to - * N_TTY. 
- */ - - if (tty->driver.flags & TTY_DRIVER_RESET_TERMIOS) - { - down(&tty->termios_sem); - *tty->termios = tty->driver.init_termios; - up(&tty->termios_sem); - } - - /* Defer ldisc switch */ - /* tty_deferred_ldisc_switch(N_TTY) - This should get done automatically when the port closes and - tty_release is called */ - - read_lock(&tasklist_lock); - for_each_task(p) { - if ((tty->session > 0) && (p->session == tty->session) && - p->leader) { - send_sig(SIGHUP,p,1); - send_sig(SIGCONT,p,1); - if (tty->pgrp > 0) - p->tty_old_pgrp = tty->pgrp; - } - if (p->tty == tty) - p->tty = NULL; - } - read_unlock(&tasklist_lock); - - tty->flags = 0; - tty->session = 0; - tty->pgrp = -1; - tty->ctrl_status = 0; - /* - * If one of the devices matches a console pointer, we - * cannot just call hangup() because that will cause - * tty->count and state->count to go out of sync. - * So we just call close() the right number of times. - */ - if (cons_filp) { - if (tty->driver.close) - for (n = 0; n < closecount; n++) - tty->driver.close(tty, cons_filp); - } else if (tty->driver.hangup) - (tty->driver.hangup)(tty); - - /* We don't want to have driver/ldisc interactions beyond - the ones we did here. The driver layer expects no - calls after ->hangup() from the ldisc side. However we - can't yet guarantee all that */ - - set_bit(TTY_HUPPED, &tty->flags); - if(ld) { - tty_ldisc_enable(tty); - tty_ldisc_deref(ld); - } - unlock_kernel(); - if (f) - fput(f); -} - -void tty_hangup(struct tty_struct * tty) -{ -#ifdef TTY_DEBUG_HANGUP - char buf[64]; - - printk(KERN_DEBUG "%s hangup...\n", tty_name(tty, buf)); -#endif - schedule_task(&tty->tq_hangup); -} - -void tty_vhangup(struct tty_struct * tty) -{ -#ifdef TTY_DEBUG_HANGUP - char buf[64]; - - printk(KERN_DEBUG "%s vhangup...\n", tty_name(tty, buf)); -#endif - do_tty_hangup((void *) tty); -} - -int tty_hung_up_p(struct file * filp) -{ - return (filp->f_op == &hung_up_tty_fops); -} - -/* - * This function is typically called only by the session leader, when - * it wants to disassociate itself from its controlling tty. - * - * It performs the following functions: - * (1) Sends a SIGHUP and SIGCONT to the foreground process group - * (2) Clears the tty from being controlling the session - * (3) Clears the controlling tty for all processes in the - * session group. - * - * The argument on_exit is set to 1 if called when a process is - * exiting; it is 0 if called by the ioctl TIOCNOTTY. 
- */ -void disassociate_ctty(int on_exit) -{ - struct tty_struct *tty = current->tty; - struct task_struct *p; - int tty_pgrp = -1; - - if (tty) { - tty_pgrp = tty->pgrp; - if (on_exit && tty->driver.type != TTY_DRIVER_TYPE_PTY) - tty_vhangup(tty); - } else { - if (current->tty_old_pgrp) { - kill_pg(current->tty_old_pgrp, SIGHUP, on_exit); - kill_pg(current->tty_old_pgrp, SIGCONT, on_exit); - } - return; - } - if (tty_pgrp > 0) { - kill_pg(tty_pgrp, SIGHUP, on_exit); - if (!on_exit) - kill_pg(tty_pgrp, SIGCONT, on_exit); - } - - current->tty_old_pgrp = 0; - tty->session = 0; - tty->pgrp = -1; - - read_lock(&tasklist_lock); - for_each_task(p) - if (p->session == current->session) - p->tty = NULL; - read_unlock(&tasklist_lock); -} - -void stop_tty(struct tty_struct *tty) -{ - if (tty->stopped) - return; - tty->stopped = 1; - if (tty->link && tty->link->packet) { - tty->ctrl_status &= ~TIOCPKT_START; - tty->ctrl_status |= TIOCPKT_STOP; - wake_up_interruptible(&tty->link->read_wait); - } - if (tty->driver.stop) - (tty->driver.stop)(tty); -} - -void start_tty(struct tty_struct *tty) -{ - if (!tty->stopped || tty->flow_stopped) - return; - tty->stopped = 0; - if (tty->link && tty->link->packet) { - tty->ctrl_status &= ~TIOCPKT_STOP; - tty->ctrl_status |= TIOCPKT_START; - wake_up_interruptible(&tty->link->read_wait); - } - if (tty->driver.start) - (tty->driver.start)(tty); - /* If we have a running line discipline it may need kicking */ - tty_wakeup(tty); -} - -static ssize_t tty_read(struct file * file, char * buf, size_t count, - loff_t *ppos) -{ - int i; - struct tty_struct * tty; - struct inode *inode; - struct tty_ldisc *ld; - - /* Can't seek (pread) on ttys. */ - if (ppos != &file->f_pos) - return -ESPIPE; - - tty = (struct tty_struct *)file->private_data; - inode = file->f_dentry->d_inode; - if (tty_paranoia_check(tty, inode->i_rdev, "tty_read")) - return -EIO; - if (!tty || (test_bit(TTY_IO_ERROR, &tty->flags))) - return -EIO; - - /* This check not only needs to be done before reading, but also - whenever read_chan() gets woken up after sleeping, so I've - moved it to there. This should only be done for the N_TTY - line discipline, anyway. Same goes for write_chan(). -- jlc. 
*/ -#if 0 - if ((inode->i_rdev != CONSOLE_DEV) && /* don't stop on /dev/console */ - (tty->pgrp > 0) && - (current->tty == tty) && - (tty->pgrp != current->pgrp)) - if (is_ignored(SIGTTIN) || is_orphaned_pgrp(current->pgrp)) - return -EIO; - else { - (void) kill_pg(current->pgrp, SIGTTIN, 1); - return -ERESTARTSYS; - } -#endif - /* We want to wait for the line discipline to sort out in this - situation */ - ld = tty_ldisc_ref_wait(tty); - lock_kernel(); - if (ld->read) - i = (ld->read)(tty,file,buf,count); - else - i = -EIO; - tty_ldisc_deref(ld); - unlock_kernel(); - if (i > 0) - inode->i_atime = CURRENT_TIME; - return i; -} - -/* - * Split writes up in sane blocksizes to avoid - * denial-of-service type attacks - */ -static inline ssize_t do_tty_write( - ssize_t (*write)(struct tty_struct *, struct file *, const unsigned char *, size_t), - struct tty_struct *tty, - struct file *file, - const unsigned char *buf, - size_t count) -{ - ssize_t ret = 0, written = 0; - - if (file->f_flags & O_NONBLOCK) { - if (down_trylock(&tty->atomic_write)) - return -EAGAIN; - } - else { - if (down_interruptible(&tty->atomic_write)) - return -ERESTARTSYS; - } - if ( test_bit(TTY_NO_WRITE_SPLIT, &tty->flags) ) { - lock_kernel(); - written = write(tty, file, buf, count); - unlock_kernel(); - } else { - for (;;) { - unsigned long size = MAX(PAGE_SIZE*2,16384); - if (size > count) - size = count; - lock_kernel(); - ret = write(tty, file, buf, size); - unlock_kernel(); - if (ret <= 0) - break; - written += ret; - buf += ret; - count -= ret; - if (!count) - break; - ret = -ERESTARTSYS; - if (signal_pending(current)) - break; - if (current->need_resched) - schedule(); - } - } - if (written) { - file->f_dentry->d_inode->i_mtime = CURRENT_TIME; - ret = written; - } - up(&tty->atomic_write); - return ret; -} - - -static ssize_t tty_write(struct file * file, const char * buf, size_t count, - loff_t *ppos) -{ - int is_console; - struct tty_struct * tty; - struct inode *inode = file->f_dentry->d_inode; - ssize_t ret; - struct tty_ldisc *ld; - - /* Can't seek (pwrite) on ttys. */ - if (ppos != &file->f_pos) - return -ESPIPE; - - /* - * For now, we redirect writes from /dev/console as - * well as /dev/tty0. - */ - inode = file->f_dentry->d_inode; - is_console = (inode->i_rdev == SYSCONS_DEV || - inode->i_rdev == CONSOLE_DEV); - - if (is_console) { - struct file *p = NULL; - - spin_lock(&redirect_lock); - if (redirect) { - get_file(redirect); - p = redirect; - } - spin_unlock(&redirect_lock); - - if (p) { - ssize_t res = p->f_op->write(p, buf, count, &p->f_pos); - fput(p); - return res; - } - } - - tty = (struct tty_struct *)file->private_data; - if (tty_paranoia_check(tty, inode->i_rdev, "tty_write")) - return -EIO; - if (!tty || !tty->driver.write || (test_bit(TTY_IO_ERROR, &tty->flags))) - return -EIO; -#if 0 - if (!is_console && L_TOSTOP(tty) && (tty->pgrp > 0) && - (current->tty == tty) && (tty->pgrp != current->pgrp)) { - if (is_orphaned_pgrp(current->pgrp)) - return -EIO; - if (!is_ignored(SIGTTOU)) { - (void) kill_pg(current->pgrp, SIGTTOU, 1); - return -ERESTARTSYS; - } - } -#endif - - ld = tty_ldisc_ref_wait(tty); - if (!ld->write) - ret = -EIO; - else - ret = do_tty_write(ld->write, tty, file, - (const unsigned char __user *)buf, count); - tty_ldisc_deref(ld); - return ret; -} - -/* Semaphore to protect creating and releasing a tty. 
This is shared with - vt.c for deeply disgusting hack reasons */ -static DECLARE_MUTEX(tty_sem); - -static void down_tty_sem(int index) -{ - down(&tty_sem); -} - -static void up_tty_sem(int index) -{ - up(&tty_sem); -} - -static void release_mem(struct tty_struct *tty, int idx); - -/* - * WSH 06/09/97: Rewritten to remove races and properly clean up after a - * failed open. The new code protects the open with a semaphore, so it's - * really quite straightforward. The semaphore locking can probably be - * relaxed for the (most common) case of reopening a tty. - */ -static int init_dev(kdev_t device, struct tty_struct **ret_tty) -{ - struct tty_struct *tty, *o_tty; - struct termios *tp, **tp_loc, *o_tp, **o_tp_loc; - struct termios *ltp, **ltp_loc, *o_ltp, **o_ltp_loc; - struct tty_driver *driver; - int retval=0; - int idx; - - driver = get_tty_driver(device); - if (!driver) - return -ENODEV; - - idx = MINOR(device) - driver->minor_start; - - /* - * Check whether we need to acquire the tty semaphore to avoid - * race conditions. For now, play it safe. - */ - down_tty_sem(idx); - - /* check whether we're reopening an existing tty */ - tty = driver->table[idx]; - if (tty) goto fast_track; - - /* - * First time open is complex, especially for PTY devices. - * This code guarantees that either everything succeeds and the - * TTY is ready for operation, or else the table slots are vacated - * and the allocated memory released. (Except that the termios - * and locked termios may be retained.) - */ - - o_tty = NULL; - tp = o_tp = NULL; - ltp = o_ltp = NULL; - - tty = alloc_tty_struct(); - if(!tty) - goto fail_no_mem; - initialize_tty_struct(tty); - tty->device = device; - tty->driver = *driver; - - tp_loc = &driver->termios[idx]; - if (!*tp_loc) { - tp = (struct termios *) kmalloc(sizeof(struct termios), - GFP_KERNEL); - if (!tp) - goto free_mem_out; - *tp = driver->init_termios; - } - - ltp_loc = &driver->termios_locked[idx]; - if (!*ltp_loc) { - ltp = (struct termios *) kmalloc(sizeof(struct termios), - GFP_KERNEL); - if (!ltp) - goto free_mem_out; - memset(ltp, 0, sizeof(struct termios)); - } - - if (driver->type == TTY_DRIVER_TYPE_PTY) { - o_tty = alloc_tty_struct(); - if (!o_tty) - goto free_mem_out; - initialize_tty_struct(o_tty); - o_tty->device = (kdev_t) MKDEV(driver->other->major, - driver->other->minor_start + idx); - o_tty->driver = *driver->other; - - o_tp_loc = &driver->other->termios[idx]; - if (!*o_tp_loc) { - o_tp = (struct termios *) - kmalloc(sizeof(struct termios), GFP_KERNEL); - if (!o_tp) - goto free_mem_out; - *o_tp = driver->other->init_termios; - } - - o_ltp_loc = &driver->other->termios_locked[idx]; - if (!*o_ltp_loc) { - o_ltp = (struct termios *) - kmalloc(sizeof(struct termios), GFP_KERNEL); - if (!o_ltp) - goto free_mem_out; - memset(o_ltp, 0, sizeof(struct termios)); - } - - /* - * Everything allocated ... set up the o_tty structure. - */ - driver->other->table[idx] = o_tty; - if (!*o_tp_loc) - *o_tp_loc = o_tp; - if (!*o_ltp_loc) - *o_ltp_loc = o_ltp; - o_tty->termios = *o_tp_loc; - o_tty->termios_locked = *o_ltp_loc; - (*driver->other->refcount)++; - if (driver->subtype == PTY_TYPE_MASTER) - o_tty->count++; - - /* Establish the links in both directions */ - tty->link = o_tty; - o_tty->link = tty; - } - - /* - * All structures have been allocated, so now we install them. - * Failures after this point use release_mem to clean up, so - * there's no need to null out the local pointers. 
- */ - driver->table[idx] = tty; - - if (!*tp_loc) - *tp_loc = tp; - if (!*ltp_loc) - *ltp_loc = ltp; - tty->termios = *tp_loc; - tty->termios_locked = *ltp_loc; - (*driver->refcount)++; - tty->count++; - - /* - * Structures all installed ... call the ldisc open routines. - * If we fail here just call release_mem to clean up. No need - * to decrement the use counts, as release_mem doesn't care. - */ - if (tty->ldisc.open) { - retval = (tty->ldisc.open)(tty); - if (retval) - goto release_mem_out; - } - if (o_tty && o_tty->ldisc.open) { - retval = (o_tty->ldisc.open)(o_tty); - if (retval) { - if (tty->ldisc.close) - (tty->ldisc.close)(tty); - goto release_mem_out; - } - set_bit(TTY_LDISC, &o_tty->flags); - tty_ldisc_enable(o_tty); - } - tty_ldisc_enable(tty); - goto success; - - /* - * This fast open can be used if the tty is already open. - * No memory is allocated, and the only failures are from - * attempting to open a closing tty or attempting multiple - * opens on a pty master. - */ -fast_track: - if (test_bit(TTY_CLOSING, &tty->flags)) { - retval = -EIO; - goto end_init; - } - if (driver->type == TTY_DRIVER_TYPE_PTY && - driver->subtype == PTY_TYPE_MASTER) { - /* - * special case for PTY masters: only one open permitted, - * and the slave side open count is incremented as well. - */ - if (tty->count) { - retval = -EIO; - goto end_init; - } - tty->link->count++; - } - tty->count++; - tty->driver = *driver; /* N.B. why do this every time?? */ - /* FIXME */ - if(!test_bit(TTY_LDISC, &tty->flags)) - printk(KERN_ERR "init_dev but no ldisc\n"); -success: - *ret_tty = tty; - - /* All paths come through here to release the semaphore */ -end_init: - up_tty_sem(idx); - return retval; - - /* Release locally allocated memory ... nothing placed in slots */ -free_mem_out: - if (o_tp) - kfree(o_tp); - if (o_tty) - free_tty_struct(o_tty); - if (ltp) - kfree(ltp); - if (tp) - kfree(tp); - free_tty_struct(tty); - -fail_no_mem: - retval = -ENOMEM; - goto end_init; - - /* call the tty release_mem routine to clean out this slot */ -release_mem_out: - printk(KERN_INFO "init_dev: ldisc open failed, " - "clearing slot %d\n", idx); - release_mem(tty, idx); - goto end_init; -} - -/* - * Releases memory associated with a tty structure, and clears out the - * driver table slots. - */ -static void release_mem(struct tty_struct *tty, int idx) -{ - struct tty_struct *o_tty; - struct termios *tp; - - if ((o_tty = tty->link) != NULL) { - o_tty->driver.table[idx] = NULL; - if (o_tty->driver.flags & TTY_DRIVER_RESET_TERMIOS) { - tp = o_tty->driver.termios[idx]; - o_tty->driver.termios[idx] = NULL; - kfree(tp); - } - o_tty->magic = 0; - (*o_tty->driver.refcount)--; - list_del_init(&o_tty->tty_files); - free_tty_struct(o_tty); - } - - tty->driver.table[idx] = NULL; - if (tty->driver.flags & TTY_DRIVER_RESET_TERMIOS) { - tp = tty->driver.termios[idx]; - tty->driver.termios[idx] = NULL; - kfree(tp); - } - tty->magic = 0; - (*tty->driver.refcount)--; - list_del_init(&tty->tty_files); - free_tty_struct(tty); -} - -/* - * Even releasing the tty structures is a tricky business.. We have - * to be very careful that the structures are all released at the - * same time, as interrupts might otherwise get the wrong pointers. - * - * WSH 09/09/97: rewritten to avoid some nasty race conditions that could - * lead to double frees or releasing memory still in use. 
- */ -static void release_dev(struct file * filp) -{ - struct tty_struct *tty, *o_tty; - int pty_master, tty_closing, o_tty_closing, do_sleep; - int idx; - char buf[64]; - unsigned long flags; - - tty = (struct tty_struct *)filp->private_data; - if (tty_paranoia_check(tty, filp->f_dentry->d_inode->i_rdev, "release_dev")) - return; - - check_tty_count(tty, "release_dev"); - - tty_fasync(-1, filp, 0); - - idx = MINOR(tty->device) - tty->driver.minor_start; - pty_master = (tty->driver.type == TTY_DRIVER_TYPE_PTY && - tty->driver.subtype == PTY_TYPE_MASTER); - o_tty = tty->link; - -#ifdef TTY_PARANOIA_CHECK - if (idx < 0 || idx >= tty->driver.num) { - printk(KERN_DEBUG "release_dev: bad idx when trying to " - "free (%s)\n", kdevname(tty->device)); - return; - } - if (tty != tty->driver.table[idx]) { - printk(KERN_DEBUG "release_dev: driver.table[%d] not tty " - "for (%s)\n", idx, kdevname(tty->device)); - return; - } - if (tty->termios != tty->driver.termios[idx]) { - printk(KERN_DEBUG "release_dev: driver.termios[%d] not termios " - "for (%s)\n", - idx, kdevname(tty->device)); - return; - } - if (tty->termios_locked != tty->driver.termios_locked[idx]) { - printk(KERN_DEBUG "release_dev: driver.termios_locked[%d] not " - "termios_locked for (%s)\n", - idx, kdevname(tty->device)); - return; - } -#endif - -#ifdef TTY_DEBUG_HANGUP - printk(KERN_DEBUG "release_dev of %s (tty count=%d)...", - tty_name(tty, buf), tty->count); -#endif - -#ifdef TTY_PARANOIA_CHECK - if (tty->driver.other) { - if (o_tty != tty->driver.other->table[idx]) { - printk(KERN_DEBUG "release_dev: other->table[%d] " - "not o_tty for (%s)\n", - idx, kdevname(tty->device)); - return; - } - if (o_tty->termios != tty->driver.other->termios[idx]) { - printk(KERN_DEBUG "release_dev: other->termios[%d] " - "not o_termios for (%s)\n", - idx, kdevname(tty->device)); - return; - } - if (o_tty->termios_locked != - tty->driver.other->termios_locked[idx]) { - printk(KERN_DEBUG "release_dev: other->termios_locked[" - "%d] not o_termios_locked for (%s)\n", - idx, kdevname(tty->device)); - return; - } - if (o_tty->link != tty) { - printk(KERN_DEBUG "release_dev: bad pty pointers\n"); - return; - } - } -#endif - - if (tty->driver.close) - tty->driver.close(tty, filp); - - /* - * Sanity check: if tty->count is going to zero, there shouldn't be - * any waiters on tty->read_wait or tty->write_wait. We test the - * wait queues and kick everyone out _before_ actually starting to - * close. This ensures that we won't block while releasing the tty - * structure. - * - * The test for the o_tty closing is necessary, since the master and - * slave sides may close in any order. If the slave side closes out - * first, its count will be one, since the master side holds an open. - * Thus this test wouldn't be triggered at the time the slave closes, - * so we do it now. - * - * Note that it's possible for the tty to be opened again while we're - * flushing out waiters. By recalculating the closing flags before - * each iteration we avoid any problems. - */ - while (1) { - tty_closing = tty->count <= 1; - o_tty_closing = o_tty && - (o_tty->count <= (pty_master ? 
1 : 0)); - do_sleep = 0; - - if (tty_closing) { - if (waitqueue_active(&tty->read_wait)) { - wake_up(&tty->read_wait); - do_sleep++; - } - if (waitqueue_active(&tty->write_wait)) { - wake_up(&tty->write_wait); - do_sleep++; - } - } - if (o_tty_closing) { - if (waitqueue_active(&o_tty->read_wait)) { - wake_up(&o_tty->read_wait); - do_sleep++; - } - if (waitqueue_active(&o_tty->write_wait)) { - wake_up(&o_tty->write_wait); - do_sleep++; - } - } - if (!do_sleep) - break; - - printk(KERN_WARNING "release_dev: %s: read/write wait queue " - "active!\n", tty_name(tty, buf)); - schedule(); - } - - /* - * The closing flags are now consistent with the open counts on - * both sides, and we've completed the last operation that could - * block, so it's safe to proceed with closing. - */ - if (pty_master) { - if (--o_tty->count < 0) { - printk(KERN_WARNING "release_dev: bad pty slave count " - "(%d) for %s\n", - o_tty->count, tty_name(o_tty, buf)); - o_tty->count = 0; - } - } - if (--tty->count < 0) { - printk(KERN_WARNING "release_dev: bad tty->count (%d) for %s\n", - tty->count, tty_name(tty, buf)); - tty->count = 0; - } - - /* - * We've decremented tty->count, so we should zero out - * filp->private_data, to break the link between the tty and - * the file descriptor. Otherwise if filp_close() blocks before - * the file descriptor is removed from the inuse_filp - * list, check_tty_count() could observe a discrepancy and - * printk a warning message to the user. - */ - filp->private_data = 0; - - /* - * Perform some housekeeping before deciding whether to return. - * - * Set the TTY_CLOSING flag if this was the last open. In the - * case of a pty we may have to wait around for the other side - * to close, and TTY_CLOSING makes sure we can't be reopened. - */ - if(tty_closing) - set_bit(TTY_CLOSING, &tty->flags); - if(o_tty_closing) - set_bit(TTY_CLOSING, &o_tty->flags); - - /* - * If _either_ side is closing, make sure there aren't any - * processes that still think tty or o_tty is their controlling - * tty. - */ - if (tty_closing || o_tty_closing) { - struct task_struct *p; - - read_lock(&tasklist_lock); - for_each_task(p) { - if (p->tty == tty || (o_tty && p->tty == o_tty)) - p->tty = NULL; - } - read_unlock(&tasklist_lock); - } - - /* check whether both sides are closing ... */ - if (!tty_closing || (o_tty && !o_tty_closing)) - return; - -#ifdef TTY_DEBUG_HANGUP - printk(KERN_DEBUG "freeing tty structure..."); -#endif - - /* - * Prevent flush_to_ldisc() from rescheduling the work for later. Then - * kill any delayed work. As this is the final close it does not - * race with the set_ldisc code path. - */ - clear_bit(TTY_LDISC, &tty->flags); - clear_bit(TTY_DONT_FLIP, &tty->flags); - - /* - * Wait for ->hangup_work and ->flip.work handlers to terminate - */ - - run_task_queue(&tq_timer); - flush_scheduled_tasks(); - - /* - * Wait for any short term users (we know they are just driver - * side waiters as the file is closing so user count on the file - * side is zero. - */ - - spin_lock_irqsave(&tty_ldisc_lock, flags); - while(tty->ldisc.refcount) - { - spin_unlock_irqrestore(&tty_ldisc_lock, flags); - wait_event(tty_ldisc_wait, tty->ldisc.refcount == 0); - spin_lock_irqsave(&tty_ldisc_lock, flags); - } - spin_unlock_irqrestore(&tty_ldisc_lock, flags); - - /* - * Shutdown the current line discipline, and reset it to N_TTY. - * N.B. why reset ldisc when we're releasing the memory?? 
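The loop a few lines up that drops tty_ldisc_lock, sleeps in wait_event(), and retakes the lock is the kernel's way of waiting for short-term references to drain. A hedged user-space analogue using POSIX condition variables (struct ref is illustrative):

#include <pthread.h>

struct ref {
	pthread_mutex_t lock;
	pthread_cond_t  drained;
	int             refcount;
};

/* Block until every short-term user has dropped its reference. */
static void wait_for_refs(struct ref *r)
{
	pthread_mutex_lock(&r->lock);
	while (r->refcount > 0)
		pthread_cond_wait(&r->drained, &r->lock); /* lock dropped while asleep */
	pthread_mutex_unlock(&r->lock);
}

static void put_ref(struct ref *r)
{
	pthread_mutex_lock(&r->lock);
	if (--r->refcount == 0)
		pthread_cond_broadcast(&r->drained);
	pthread_mutex_unlock(&r->lock);
}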
- * FIXME: this MUST get fixed for the new reflocking - */ - if (tty->ldisc.close) - (tty->ldisc.close)(tty); - tty_ldisc_put(tty->ldisc.num); - - /* - * Switch the line discipline back - */ - tty_ldisc_assign(tty, tty_ldisc_get(N_TTY)); - tty_set_termios_ldisc(tty,N_TTY); - - if (o_tty) { - /* FIXME: could o_tty be in setldisc here ? */ - clear_bit(TTY_LDISC, &o_tty->flags); - if (o_tty->ldisc.close) - (o_tty->ldisc.close)(o_tty); - tty_ldisc_put(o_tty->ldisc.num); - tty_ldisc_assign(o_tty, tty_ldisc_get(N_TTY)); - tty_set_termios_ldisc(o_tty,N_TTY); - } - - /* - * The release_mem function takes care of the details of clearing - * the slots and preserving the termios structure. - */ - release_mem(tty, idx); -} - -/* - * tty_open and tty_release keep up the tty count that contains the - * number of opens done on a tty. We cannot use the inode-count, as - * different inodes might point to the same tty. - * - * Open-counting is needed for pty masters, as well as for keeping - * track of serial lines: DTR is dropped when the last close happens. - * (This is not done solely through tty->count, now. - Ted 1/27/92) - * - * The termios state of a pty is reset on first open so that - * settings don't persist across reuse. - */ -static int tty_open(struct inode * inode, struct file * filp) -{ - struct tty_struct *tty; - int noctty, retval; - kdev_t device; - unsigned short saved_flags; - char buf[64]; - - saved_flags = filp->f_flags; -retry_open: - noctty = filp->f_flags & O_NOCTTY; - device = inode->i_rdev; - if (device == TTY_DEV) { - if (!current->tty) - return -ENXIO; - device = current->tty->device; - filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */ - /* noctty = 1; */ - } -#ifdef CONFIG_VT - if (device == CONSOLE_DEV) { - extern int fg_console; - device = MKDEV(TTY_MAJOR, fg_console + 1); - noctty = 1; - } -#endif - if (device == SYSCONS_DEV) { - struct console *c = console_drivers; - while(c && !c->device) - c = c->next; - if (!c) - return -ENODEV; - device = c->device(c); - filp->f_flags |= O_NONBLOCK; /* Don't let /dev/console block */ - noctty = 1; - } - - if (device == PTMX_DEV) { -#ifdef CONFIG_UNIX98_PTYS - - /* find a free pty. */ - int major, minor; - struct tty_driver *driver; - - /* find a device that is not in use. */ - retval = -1; - for ( major = 0 ; major < UNIX98_NR_MAJORS ; major++ ) { - driver = &ptm_driver[major]; - for (minor = driver->minor_start ; - minor < driver->minor_start + driver->num ; - minor++) { - device = MKDEV(driver->major, minor); - if (!init_dev(device, &tty)) goto ptmx_found; /* ok! 
*/ - } - } - return -EIO; /* no free ptys */ - ptmx_found: - set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */ - minor -= driver->minor_start; - devpts_pty_new(driver->other->name_base + minor, MKDEV(driver->other->major, minor + driver->other->minor_start)); - tty_register_devfs(&pts_driver[major], DEVFS_FL_DEFAULT, - pts_driver[major].minor_start + minor); - noctty = 1; - goto init_dev_done; - -#else /* CONFIG_UNIX_98_PTYS */ - - return -ENODEV; - -#endif /* CONFIG_UNIX_98_PTYS */ - } - - retval = init_dev(device, &tty); - if (retval) - return retval; - -#ifdef CONFIG_UNIX98_PTYS -init_dev_done: -#endif - filp->private_data = tty; - file_move(filp, &tty->tty_files); - check_tty_count(tty, "tty_open"); - if (tty->driver.type == TTY_DRIVER_TYPE_PTY && - tty->driver.subtype == PTY_TYPE_MASTER) - noctty = 1; -#ifdef TTY_DEBUG_HANGUP - printk(KERN_DEBUG "opening %s...", tty_name(tty, buf)); -#endif - if (tty->driver.open) - retval = tty->driver.open(tty, filp); - else - retval = -ENODEV; - filp->f_flags = saved_flags; - - if (!retval && test_bit(TTY_EXCLUSIVE, &tty->flags) && !suser()) - retval = -EBUSY; - - if (retval) { -#ifdef TTY_DEBUG_HANGUP - printk(KERN_DEBUG "error %d in opening %s...", retval, - tty_name(tty, buf)); -#endif - - release_dev(filp); - if (retval != -ERESTARTSYS) - return retval; - if (signal_pending(current)) - return retval; - schedule(); - /* - * Need to reset f_op in case a hangup happened. - */ - filp->f_op = &tty_fops; - goto retry_open; - } - if (!noctty && - current->leader && - !current->tty && - tty->session == 0) { - task_lock(current); - current->tty = tty; - task_unlock(current); - current->tty_old_pgrp = 0; - tty->session = current->session; - tty->pgrp = current->pgrp; - } - if ((tty->driver.type == TTY_DRIVER_TYPE_SERIAL) && - (tty->driver.subtype == SERIAL_TYPE_CALLOUT) && - (tty->count == 1)) { - static int nr_warns; - if (nr_warns < 5) { - printk(KERN_WARNING "tty_io.c: " - "process %d (%s) used obsolete /dev/%s - " - "update software to use /dev/ttyS%d\n", - current->pid, current->comm, - tty_name(tty, buf), TTY_NUMBER(tty)); - nr_warns++; - } - } - return 0; -} - -static int tty_release(struct inode * inode, struct file * filp) -{ - lock_kernel(); - release_dev(filp); - unlock_kernel(); - return 0; -} - -/* No kernel lock held - fine */ -static unsigned int tty_poll(struct file * filp, poll_table * wait) -{ - struct tty_struct * tty; - struct tty_ldisc *ld; - int ret = 0; - - tty = (struct tty_struct *)filp->private_data; - if (tty_paranoia_check(tty, filp->f_dentry->d_inode->i_rdev, "tty_poll")) - return 0; - - ld = tty_ldisc_ref_wait(tty); - if (ld->poll) - ret = (ld->poll)(tty, filp, wait); - tty_ldisc_deref(ld); - return ret; -} - -static int tty_fasync(int fd, struct file * filp, int on) -{ - struct tty_struct * tty; - int retval; - - tty = (struct tty_struct *)filp->private_data; - if (tty_paranoia_check(tty, filp->f_dentry->d_inode->i_rdev, "tty_fasync")) - return 0; - - retval = fasync_helper(fd, filp, on, &tty->fasync); - if (retval <= 0) - return retval; - - if (on) { - if (!waitqueue_active(&tty->read_wait)) - tty->minimum_to_wake = 1; - if (filp->f_owner.pid == 0) { - filp->f_owner.pid = (-tty->pgrp) ? 
: current->pid; - filp->f_owner.uid = current->uid; - filp->f_owner.euid = current->euid; - } - } else { - if (!tty->fasync && !waitqueue_active(&tty->read_wait)) - tty->minimum_to_wake = N_TTY_BUF_SIZE; - } - return 0; -} - -static int tiocsti(struct tty_struct *tty, char * arg) -{ - char ch, mbz = 0; - struct tty_ldisc *ld; - - if ((current->tty != tty) && !suser()) - return -EPERM; - if (get_user(ch, arg)) - return -EFAULT; - ld = tty_ldisc_ref_wait(tty); - ld->receive_buf(tty, &ch, &mbz, 1); - tty_ldisc_deref(ld); - return 0; -} - -static int tiocgwinsz(struct tty_struct *tty, struct winsize * arg) -{ - if (copy_to_user(arg, &tty->winsize, sizeof(*arg))) - return -EFAULT; - return 0; -} - -static int tiocswinsz(struct tty_struct *tty, struct tty_struct *real_tty, - struct winsize * arg) -{ - struct winsize tmp_ws; - - if (copy_from_user(&tmp_ws, arg, sizeof(*arg))) - return -EFAULT; - if (!memcmp(&tmp_ws, &tty->winsize, sizeof(*arg))) - return 0; - if (tty->pgrp > 0) - kill_pg(tty->pgrp, SIGWINCH, 1); - if ((real_tty->pgrp != tty->pgrp) && (real_tty->pgrp > 0)) - kill_pg(real_tty->pgrp, SIGWINCH, 1); - tty->winsize = tmp_ws; - real_tty->winsize = tmp_ws; - return 0; -} - -static int tioccons(struct inode *inode, struct file *file) -{ - if (inode->i_rdev == SYSCONS_DEV || - inode->i_rdev == CONSOLE_DEV) { - struct file *f; - if (!suser()) - return -EPERM; - spin_lock(&redirect_lock); - f = redirect; - redirect = NULL; - spin_unlock(&redirect_lock); - if (f) - fput(f); - return 0; - } - spin_lock(&redirect_lock); - if (redirect) { - spin_unlock(&redirect_lock); - return -EBUSY; - } - get_file(file); - redirect = file; - spin_unlock(&redirect_lock); - return 0; -} - - -static int fionbio(struct file *file, int *arg) -{ - int nonblock; - - if (get_user(nonblock, arg)) - return -EFAULT; - - if (nonblock) - file->f_flags |= O_NONBLOCK; - else - file->f_flags &= ~O_NONBLOCK; - return 0; -} - -static int tiocsctty(struct tty_struct *tty, int arg) -{ - if (current->leader && - (current->session == tty->session)) - return 0; - /* - * The process must be a session leader and - * not have a controlling tty already. - */ - if (!current->leader || current->tty) - return -EPERM; - if (tty->session > 0) { - /* - * This tty is already the controlling - * tty for another session group! - */ - if ((arg == 1) && suser()) { - /* - * Steal it away - */ - struct task_struct *p; - - read_lock(&tasklist_lock); - for_each_task(p) - if (p->tty == tty) - p->tty = NULL; - read_unlock(&tasklist_lock); - } else - return -EPERM; - } - task_lock(current); - current->tty = tty; - task_unlock(current); - current->tty_old_pgrp = 0; - tty->session = current->session; - tty->pgrp = current->pgrp; - return 0; -} - -static int tiocgpgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t *arg) -{ - /* - * (tty == real_tty) is a cheap way of - * testing if the tty is NOT a master pty. 
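The tiocgwinsz()/tiocswinsz() pair above is what services the familiar window-size ioctls; from user space the same path is exercised like this (standard API, shown for orientation):

#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	struct winsize ws;

	/* TIOCGWINSZ copies tty->winsize out, exactly as tiocgwinsz() does */
	if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) < 0) {
		perror("TIOCGWINSZ");
		return 1;
	}
	printf("%u rows x %u cols\n", ws.ws_row, ws.ws_col);
	return 0;
}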
- */ - if (tty == real_tty && current->tty != real_tty) - return -ENOTTY; - return put_user(real_tty->pgrp, arg); -} - -static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t *arg) -{ - pid_t pgrp; - int retval = tty_check_change(real_tty); - - if (retval == -EIO) - return -ENOTTY; - if (retval) - return retval; - if (!current->tty || - (current->tty != real_tty) || - (real_tty->session != current->session)) - return -ENOTTY; - if (get_user(pgrp, (pid_t *) arg)) - return -EFAULT; - if (pgrp < 0) - return -EINVAL; - if (session_of_pgrp(pgrp) != current->session) - return -EPERM; - real_tty->pgrp = pgrp; - return 0; -} - -static int tiocgsid(struct tty_struct *tty, struct tty_struct *real_tty, pid_t *arg) -{ - /* - * (tty == real_tty) is a cheap way of - * testing if the tty is NOT a master pty. - */ - if (tty == real_tty && current->tty != real_tty) - return -ENOTTY; - if (real_tty->session <= 0) - return -ENOTTY; - return put_user(real_tty->session, arg); -} - -static int tiocttygstruct(struct tty_struct *tty, struct tty_struct *arg) -{ - if (copy_to_user(arg, tty, sizeof(*arg))) - return -EFAULT; - return 0; -} - -static int tiocsetd(struct tty_struct *tty, int *arg) -{ - int ldisc; - - if (get_user(ldisc, arg)) - return -EFAULT; - return tty_set_ldisc(tty, ldisc); -} - -static int send_break(struct tty_struct *tty, int duration) -{ - tty->driver.break_ctl(tty, -1); - if (!signal_pending(current)) { - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(duration); - } - tty->driver.break_ctl(tty, 0); - if (signal_pending(current)) - return -EINTR; - return 0; -} - -static int tty_generic_brk(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg) -{ - if (cmd == TCSBRK && arg) - { - /* tcdrain case */ - int retval = tty_check_change(tty); - if (retval) - return retval; - tty_wait_until_sent(tty, 0); - if (signal_pending(current)) - return -EINTR; - } - return 0; -} - -/* - * Split this up, as gcc can choke on it otherwise.. - */ -int tty_ioctl(struct inode * inode, struct file * file, - unsigned int cmd, unsigned long arg) -{ - struct tty_struct *tty, *real_tty; - int retval; - struct tty_ldisc *ld; - - tty = (struct tty_struct *)file->private_data; - if (tty_paranoia_check(tty, inode->i_rdev, "tty_ioctl")) - return -EINVAL; - - real_tty = tty; - if (tty->driver.type == TTY_DRIVER_TYPE_PTY && - tty->driver.subtype == PTY_TYPE_MASTER) - real_tty = tty->link; - - /* - * Break handling by driver - */ - if (!tty->driver.break_ctl) { - switch(cmd) { - case TIOCSBRK: - case TIOCCBRK: - if (tty->driver.ioctl) - return tty->driver.ioctl(tty, file, cmd, arg); - return -EINVAL; - - /* These two ioctl's always return success; even if */ - /* the driver doesn't support them. 
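send_break() above is what ultimately services tcsendbreak(3); the user-space call is simply:

#include <termios.h>

/* A zero duration requests the default break (about 0.25s, HZ/4 above). */
int send_serial_break(int fd)
{
	return tcsendbreak(fd, 0);
}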
*/ - case TCSBRK: - case TCSBRKP: - retval = -ENOIOCTLCMD; - if (tty->driver.ioctl) - retval = tty->driver.ioctl(tty, file, cmd, arg); - /* Not driver handled */ - if (retval == -ENOIOCTLCMD) - retval = tty_generic_brk(tty, file, cmd, arg); - return retval; - } - } - - /* - * Factor out some common prep work - */ - switch (cmd) { - case TIOCSETD: - case TIOCSBRK: - case TIOCCBRK: - case TCSBRK: - case TCSBRKP: - retval = tty_check_change(tty); - if (retval) - return retval; - if (cmd != TIOCCBRK) { - tty_wait_until_sent(tty, 0); - if (signal_pending(current)) - return -EINTR; - } - break; - } - - switch (cmd) { - case TIOCSTI: - return tiocsti(tty, (char *)arg); - case TIOCGWINSZ: - return tiocgwinsz(tty, (struct winsize *) arg); - case TIOCSWINSZ: - return tiocswinsz(tty, real_tty, (struct winsize *) arg); - case TIOCCONS: - return real_tty!=tty ? -EINVAL : tioccons(inode, file); - case FIONBIO: - return fionbio(file, (int *) arg); - case TIOCEXCL: - set_bit(TTY_EXCLUSIVE, &tty->flags); - return 0; - case TIOCNXCL: - clear_bit(TTY_EXCLUSIVE, &tty->flags); - return 0; - case TIOCNOTTY: - if (current->tty != tty) - return -ENOTTY; - if (current->leader) - disassociate_ctty(0); - task_lock(current); - current->tty = NULL; - task_unlock(current); - return 0; - case TIOCSCTTY: - return tiocsctty(tty, arg); - case TIOCGPGRP: - return tiocgpgrp(tty, real_tty, (pid_t *) arg); - case TIOCSPGRP: - return tiocspgrp(tty, real_tty, (pid_t *) arg); - case TIOCGSID: - return tiocgsid(tty, real_tty, (pid_t *) arg); - case TIOCGETD: - /* FIXME: check this is ok */ - return put_user(tty->ldisc.num, (int *) arg); - case TIOCSETD: - return tiocsetd(tty, (int *) arg); -#ifdef CONFIG_VT - case TIOCLINUX: - return tioclinux(tty, arg); -#endif - case TIOCTTYGSTRUCT: - return tiocttygstruct(tty, (struct tty_struct *) arg); - - /* - * Break handling - */ - case TIOCSBRK: /* Turn break on, unconditionally */ - tty->driver.break_ctl(tty, -1); - return 0; - - case TIOCCBRK: /* Turn break off, unconditionally */ - tty->driver.break_ctl(tty, 0); - return 0; - case TCSBRK: /* SVID version: non-zero arg --> no break */ - /* - * XXX is the above comment correct, or the - * code below correct? Is this ioctl used at - * all by anyone? - */ - if (!arg) - return send_break(tty, HZ/4); - return 0; - case TCSBRKP: /* support for POSIX tcsendbreak() */ - return send_break(tty, arg ? arg*(HZ/10) : HZ/4); - } - if (tty->driver.ioctl) { - retval = (tty->driver.ioctl)(tty, file, cmd, arg); - if (retval != -ENOIOCTLCMD) - return retval; - } - ld = tty_ldisc_ref_wait(tty); - retval = -EINVAL; - if (ld->ioctl) { - retval = ld->ioctl(tty, file, cmd, arg); - if (retval == -ENOIOCTLCMD) - retval = -EINVAL; - } - tty_ldisc_deref(ld); - return retval; -} - - -/* - * This implements the "Secure Attention Key" --- the idea is to - * prevent trojan horses by killing all processes associated with this - * tty when the user hits the "Secure Attention Key". Required for - * super-paranoid applications --- see the Orange Book for more details. - * - * This code could be nicer; ideally it should send a HUP, wait a few - * seconds, then send a INT, and then a KILL signal. But you then - * have to coordinate with the init process, since all processes associated - * with the current tty must be dead before the new getty is allowed - * to spawn. - * - * Now, if it would be correct ;-/ The current code has a nasty hole - - * it doesn't catch files in flight. 
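The tail of tty_ioctl() above tries the driver first and then the line discipline, with -ENOIOCTLCMD meaning "pass it down". The chain in isolation (a sketch; ENOIOCTLCMD is the kernel-internal value and the function-pointer typedef is invented for the example):

#include <errno.h>

#define ENOIOCTLCMD 515	/* kernel-internal "not handled by this layer" */

typedef int (*ioctl_fn)(unsigned int cmd, unsigned long arg);

static int dispatch(ioctl_fn driver, ioctl_fn ldisc,
		    unsigned int cmd, unsigned long arg)
{
	int ret = -ENOIOCTLCMD;

	if (driver)
		ret = driver(cmd, arg);
	if (ret == -ENOIOCTLCMD && ldisc)
		ret = ldisc(cmd, arg);
	/* nobody claimed the command */
	return ret == -ENOIOCTLCMD ? -EINVAL : ret;
}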
We may send the descriptor to ourselves - * via AF_UNIX socket, close it and later fetch from socket. FIXME. - * - * Nasty bug: do_SAK is being called in interrupt context. This can - * deadlock. We punt it up to process context. AKPM - 16Mar2001 - */ -static void __do_SAK(void *arg) -{ -#ifdef TTY_SOFT_SAK - tty_hangup(tty); -#else - struct tty_struct *tty = arg; - struct task_struct *p; - int session; - int i; - struct file *filp; - struct tty_ldisc *disc; - - if (!tty) - return; - session = tty->session; - /* We don't want an ldisc switch during this */ - disc = tty_ldisc_ref(tty); - if (disc && disc->flush_buffer) - disc->flush_buffer(tty); - tty_ldisc_deref(disc); - - if (tty->driver.flush_buffer) - tty->driver.flush_buffer(tty); - - read_lock(&tasklist_lock); - for_each_task(p) { - if ((p->tty == tty) || - ((session > 0) && (p->session == session))) { - send_sig(SIGKILL, p, 1); - continue; - } - task_lock(p); - if (p->files) { - read_lock(&p->files->file_lock); - for (i=0; i < p->files->max_fds; i++) { - filp = fcheck_files(p->files, i); - if (filp && (filp->f_op == &tty_fops) && - (filp->private_data == tty)) { - send_sig(SIGKILL, p, 1); - break; - } - } - read_unlock(&p->files->file_lock); - } - task_unlock(p); - } - read_unlock(&tasklist_lock); -#endif -} - -/* - * The tq handling here is a little racy - tty->SAK_tq may already be queued. - * But there's no mechanism to fix that without futzing with tqueue_lock. - * Fortunately we don't need to worry, because if ->SAK_tq is already queued, - * the values which we write to it will be identical to the values which it - * already has. --akpm - */ -void do_SAK(struct tty_struct *tty) -{ - if (!tty) - return; - PREPARE_TQUEUE(&tty->SAK_tq, __do_SAK, tty); - schedule_task(&tty->SAK_tq); -} - -/* - * This routine is called out of the software interrupt to flush data - * from the flip buffer to the line discipline. - */ -static void flush_to_ldisc(void *private_) -{ - struct tty_struct *tty = (struct tty_struct *) private_; - unsigned char *cp; - char *fp; - int count; - unsigned long flags; - struct tty_ldisc *disc; - - disc = tty_ldisc_ref(tty); - if (disc == NULL) /* !TTY_LDISC */ - return; - - if (test_bit(TTY_DONT_FLIP, &tty->flags)) { - queue_task(&tty->flip.tqueue, &tq_timer); - goto out; - } - if (tty->flip.buf_num) { - cp = tty->flip.char_buf + TTY_FLIPBUF_SIZE; - fp = tty->flip.flag_buf + TTY_FLIPBUF_SIZE; - tty->flip.buf_num = 0; - - save_flags(flags); cli(); - tty->flip.char_buf_ptr = tty->flip.char_buf; - tty->flip.flag_buf_ptr = tty->flip.flag_buf; - } else { - cp = tty->flip.char_buf; - fp = tty->flip.flag_buf; - tty->flip.buf_num = 1; - - save_flags(flags); cli(); - tty->flip.char_buf_ptr = tty->flip.char_buf + TTY_FLIPBUF_SIZE; - tty->flip.flag_buf_ptr = tty->flip.flag_buf + TTY_FLIPBUF_SIZE; - } - count = tty->flip.count; - tty->flip.count = 0; - restore_flags(flags); - - disc->receive_buf(tty, cp, fp, count); -out: - tty_ldisc_deref(disc); -} - -/* - * Call the ldisc flush directly from a driver. This function may - * return an error and need retrying by the user. 
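flush_to_ldisc() above drains one half of a two-half flip buffer while interrupts refill the other. The swap in miniature (the size and struct flip are stand-ins; the real swap runs with interrupts disabled):

#define FLIPBUF_SIZE 512	/* stand-in for TTY_FLIPBUF_SIZE */

struct flip {
	unsigned char buf[2 * FLIPBUF_SIZE];
	int buf_num;	/* half currently being filled */
	int count;	/* bytes pending in that half */
};

/* Returns the half just filled and flips the producer to the other half. */
static unsigned char *flip_swap(struct flip *f, int *count)
{
	unsigned char *full = f->buf + (f->buf_num ? FLIPBUF_SIZE : 0);

	f->buf_num = !f->buf_num;
	*count = f->count;
	f->count = 0;
	return full;
}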
- */ - -int tty_push_data(struct tty_struct *tty, unsigned char *cp, unsigned char *fp, int count) -{ - int ret = 0; - struct tty_ldisc *disc; - - disc = tty_ldisc_ref(tty); - if(test_bit(TTY_DONT_FLIP, &tty->flags)) - ret = -EAGAIN; - else if(disc == NULL) - ret = -EIO; - else - disc->receive_buf(tty, cp, fp, count); - tty_ldisc_deref(disc); - return ret; - -} - -/* - * Routine which returns the baud rate of the tty - * - * Note that the baud_table needs to be kept in sync with the - * include/asm/termbits.h file. - */ -static int baud_table[] = { - 0, 50, 75, 110, 134, 150, 200, 300, 600, 1200, 1800, 2400, 4800, - 9600, 19200, 38400, 57600, 115200, 230400, 460800, -#ifdef __sparc__ - 76800, 153600, 307200, 614400, 921600 -#else - 500000, 576000, 921600, 1000000, 1152000, 1500000, 2000000, - 2500000, 3000000, 3500000, 4000000 -#endif -}; - -static int n_baud_table = sizeof(baud_table)/sizeof(int); - -int tty_get_baud_rate(struct tty_struct *tty) -{ - unsigned int cflag, i; - - cflag = tty->termios->c_cflag; - - i = cflag & CBAUD; - if (i & CBAUDEX) { - i &= ~CBAUDEX; - if (i < 1 || i+15 >= n_baud_table) - tty->termios->c_cflag &= ~CBAUDEX; - else - i += 15; - } - if (i==15 && tty->alt_speed) { - if (!tty->warned) { - printk(KERN_WARNING "Use of setserial/setrocket to " - "set SPD_* flags is deprecated\n"); - tty->warned = 1; - } - return(tty->alt_speed); - } - - return baud_table[i]; -} - -void tty_flip_buffer_push(struct tty_struct *tty) -{ - if (tty->low_latency) - flush_to_ldisc((void *) tty); - else - queue_task(&tty->flip.tqueue, &tq_timer); -} - -/* - * This subroutine initializes a tty structure. - */ -static void initialize_tty_struct(struct tty_struct *tty) -{ - memset(tty, 0, sizeof(struct tty_struct)); - tty->magic = TTY_MAGIC; - tty_ldisc_assign(tty, tty_ldisc_get(N_TTY)); - tty->pgrp = -1; - tty->flip.char_buf_ptr = tty->flip.char_buf; - tty->flip.flag_buf_ptr = tty->flip.flag_buf; - tty->flip.tqueue.routine = flush_to_ldisc; - tty->flip.tqueue.data = tty; - init_MUTEX(&tty->flip.pty_sem); - init_MUTEX(&tty->termios_sem); - init_waitqueue_head(&tty->write_wait); - init_waitqueue_head(&tty->read_wait); - tty->tq_hangup.routine = do_tty_hangup; - tty->tq_hangup.data = tty; - sema_init(&tty->atomic_read, 1); - sema_init(&tty->atomic_write, 1); - spin_lock_init(&tty->read_lock); - INIT_LIST_HEAD(&tty->tty_files); - INIT_TQUEUE(&tty->SAK_tq, 0, 0); -} - -/* - * The default put_char routine if the driver did not define one. - */ -void tty_default_put_char(struct tty_struct *tty, unsigned char ch) -{ - tty->driver.write(tty, 0, &ch, 1); -} - -/* - * Register a tty device described by <driver>, with minor number <minor>. 
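tty_get_baud_rate() above decodes the CBAUD bits of c_cflag through baud_table[]. User space sees the same encoding through the standard termios calls:

#include <stdio.h>
#include <termios.h>

int print_speed(int fd)
{
	struct termios t;

	if (tcgetattr(fd, &t) < 0)
		return -1;
	/* cfgetospeed() returns a Bxxxx code (e.g. B9600), not a number */
	printf("output speed code: %lu\n", (unsigned long)cfgetospeed(&t));
	return 0;
}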
- */ -void tty_register_devfs (struct tty_driver *driver, unsigned int flags, unsigned minor) -{ -#ifdef CONFIG_DEVFS_FS - umode_t mode = S_IFCHR | S_IRUSR | S_IWUSR; - kdev_t device = MKDEV (driver->major, minor); - int idx = minor - driver->minor_start; - char buf[32]; - - switch (device) { - case TTY_DEV: - case PTMX_DEV: - mode |= S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH; - break; - default: - if (driver->major == PTY_MASTER_MAJOR) - mode |= S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH; - break; - } - if ( (minor < driver->minor_start) || - (minor >= driver->minor_start + driver->num) ) { - printk(KERN_ERR "Attempt to register invalid minor number " - "with devfs (%d:%d).\n", (int)driver->major,(int)minor); - return; - } -# ifdef CONFIG_UNIX98_PTYS - if ( (driver->major >= UNIX98_PTY_SLAVE_MAJOR) && - (driver->major < UNIX98_PTY_SLAVE_MAJOR + UNIX98_NR_MAJORS) ) - flags |= DEVFS_FL_CURRENT_OWNER; -# endif - sprintf(buf, driver->name, idx + driver->name_base); - devfs_register (NULL, buf, flags | DEVFS_FL_DEFAULT, - driver->major, minor, mode, &tty_fops, NULL); -#endif /* CONFIG_DEVFS_FS */ -} - -void tty_unregister_devfs (struct tty_driver *driver, unsigned minor) -{ -#ifdef CONFIG_DEVFS_FS - void * handle; - int idx = minor - driver->minor_start; - char buf[32]; - - sprintf(buf, driver->name, idx + driver->name_base); - handle = devfs_find_handle (NULL, buf, driver->major, minor, - DEVFS_SPECIAL_CHR, 0); - devfs_unregister (handle); -#endif /* CONFIG_DEVFS_FS */ -} - -EXPORT_SYMBOL(tty_register_devfs); -EXPORT_SYMBOL(tty_unregister_devfs); - -/* - * Called by a tty driver to register itself. - */ -int tty_register_driver(struct tty_driver *driver) -{ - int error; - int i; - - if (driver->flags & TTY_DRIVER_INSTALLED) - return 0; - - error = devfs_register_chrdev(driver->major, driver->name, &tty_fops); - if (error < 0) - return error; - else if(driver->major == 0) - driver->major = error; - - if (!driver->put_char) - driver->put_char = tty_default_put_char; - - driver->prev = 0; - driver->next = tty_drivers; - if (tty_drivers) tty_drivers->prev = driver; - tty_drivers = driver; - - if ( !(driver->flags & TTY_DRIVER_NO_DEVFS) ) { - for(i = 0; i < driver->num; i++) - tty_register_devfs(driver, 0, driver->minor_start + i); - } - proc_tty_register_driver(driver); - return error; -} - -/* - * Called by a tty driver to unregister itself. - */ -int tty_unregister_driver(struct tty_driver *driver) -{ - int retval; - struct tty_driver *p; - int i, found = 0; - struct termios *tp; - const char *othername = NULL; - - if (*driver->refcount) - return -EBUSY; - - for (p = tty_drivers; p; p = p->next) { - if (p == driver) - found++; - else if (p->major == driver->major) - othername = p->name; - } - - if (!found) - return -ENOENT; - - if (othername == NULL) { - retval = devfs_unregister_chrdev(driver->major, driver->name); - if (retval) - return retval; - } else - devfs_register_chrdev(driver->major, othername, &tty_fops); - - if (driver->prev) - driver->prev->next = driver->next; - else - tty_drivers = driver->next; - - if (driver->next) - driver->next->prev = driver->prev; - - /* - * Free the termios and termios_locked structures because - * we don't want to get memory leaks when modular tty - * drivers are removed from the kernel. 
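tty_register_driver() above links the driver into a doubly linked list by head insertion; the pointer dance on its own (no locking, struct drv is a stand-in):

struct drv { struct drv *prev, *next; };

static struct drv *drivers;	/* list head, like the tty_drivers global */

static void drv_register(struct drv *d)
{
	d->prev = 0;
	d->next = drivers;
	if (drivers)
		drivers->prev = d;
	drivers = d;
}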
- */ - for (i = 0; i < driver->num; i++) { - tp = driver->termios[i]; - if (tp) { - driver->termios[i] = NULL; - kfree(tp); - } - tp = driver->termios_locked[i]; - if (tp) { - driver->termios_locked[i] = NULL; - kfree(tp); - } - tty_unregister_devfs(driver, driver->minor_start + i); - } - proc_tty_unregister_driver(driver); - return 0; -} - - -/* - * Initialize the console device. This is called *early*, so - * we can't necessarily depend on lots of kernel help here. - * Just do some early initializations, and do the complex setup - * later. - */ -void __init console_init(void) -{ - /* Setup the default TTY line discipline. */ - memset(tty_ldiscs, 0, NR_LDISCS*sizeof(struct tty_ldisc)); - (void) tty_register_ldisc(N_TTY, &tty_ldisc_N_TTY); - - /* - * Set up the standard termios. Individual tty drivers may - * deviate from this; this is used as a template. - */ - memset(&tty_std_termios, 0, sizeof(struct termios)); - memcpy(tty_std_termios.c_cc, INIT_C_CC, NCCS); - tty_std_termios.c_iflag = ICRNL | IXON; - tty_std_termios.c_oflag = OPOST | ONLCR; - tty_std_termios.c_cflag = B38400 | CS8 | CREAD | HUPCL; - tty_std_termios.c_lflag = ISIG | ICANON | ECHO | ECHOE | ECHOK | - ECHOCTL | ECHOKE | IEXTEN; - - /* - * set up the console device so that later boot sequences can - * inform about problems etc.. - */ -#ifdef CONFIG_EARLY_PRINTK - disable_early_printk(); -#endif - -#ifdef CONFIG_XEN_CONSOLE - xen_console_init(); -#endif - -#ifdef CONFIG_VT - con_init(); -#endif -#ifdef CONFIG_AU1X00_SERIAL_CONSOLE - au1x00_serial_console_init(); -#endif -#ifdef CONFIG_SERIAL_CONSOLE -#if (defined(CONFIG_8xx) || defined(CONFIG_CPM2)) - console_8xx_init(); -#elif defined(CONFIG_MAC_SERIAL) && defined(CONFIG_SERIAL) - if (_machine == _MACH_Pmac) - mac_scc_console_init(); - else - serial_console_init(); -#elif defined(CONFIG_MAC_SERIAL) - mac_scc_console_init(); -#elif defined(CONFIG_PARISC) - pdc_console_init(); -#elif defined(CONFIG_SERIAL) - serial_console_init(); -#endif /* CONFIG_8xx */ -#if defined(CONFIG_MVME162_SCC) || defined(CONFIG_BVME6000_SCC) || defined(CONFIG_MVME147_SCC) - vme_scc_console_init(); -#endif -#if defined(CONFIG_SERIAL167) - serial167_console_init(); -#endif -#if defined(CONFIG_SH_SCI) - sci_console_init(); -#endif -#endif -#ifdef CONFIG_SERIAL_DEC_CONSOLE - dec_serial_console_init(); -#endif -#ifdef CONFIG_TN3270_CONSOLE - tub3270_con_init(); -#endif -#ifdef CONFIG_TN3215 - con3215_init(); -#endif -#ifdef CONFIG_HWC - hwc_console_init(); -#endif -#ifdef CONFIG_STDIO_CONSOLE - stdio_console_init(); -#endif -#ifdef CONFIG_SERIAL_21285_CONSOLE - rs285_console_init(); -#endif -#ifdef CONFIG_SERIAL_SA1100_CONSOLE - sa1100_rs_console_init(); -#endif -#ifdef CONFIG_ARC_CONSOLE - arc_console_init(); -#endif -#ifdef CONFIG_SERIAL_AMBA_CONSOLE - ambauart_console_init(); -#endif -#ifdef CONFIG_SERIAL_TX3912_CONSOLE - tx3912_console_init(); -#endif -#ifdef CONFIG_TXX927_SERIAL_CONSOLE - txx927_console_init(); -#endif -#ifdef CONFIG_SERIAL_TXX9_CONSOLE - txx9_serial_console_init(); -#endif -#ifdef CONFIG_SIBYTE_SB1250_DUART_CONSOLE - sb1250_serial_console_init(); -#endif -#ifdef CONFIG_IP22_SERIAL - sgi_serial_console_init(); -#endif -} - -static struct tty_driver dev_tty_driver, dev_syscons_driver; -#ifdef CONFIG_UNIX98_PTYS -static struct tty_driver dev_ptmx_driver; -#endif -#ifdef CONFIG_HVC_CONSOLE - hvc_console_init(); -#endif -#ifdef CONFIG_VT -static struct tty_driver dev_console_driver; -#endif - -/* - * Ok, now we can initialize the rest of the tty devices and can count - * on memory 
allocations, interrupts etc.. - */ -void __init tty_init(void) -{ - /* - * dev_tty_driver and dev_console_driver are actually magic - * devices which get redirected at open time. Nevertheless, - * we register them so that register_chrdev is called - * appropriately. - */ - memset(&dev_tty_driver, 0, sizeof(struct tty_driver)); - dev_tty_driver.magic = TTY_DRIVER_MAGIC; - dev_tty_driver.driver_name = "/dev/tty"; - dev_tty_driver.name = dev_tty_driver.driver_name + 5; - dev_tty_driver.name_base = 0; - dev_tty_driver.major = TTYAUX_MAJOR; - dev_tty_driver.minor_start = 0; - dev_tty_driver.num = 1; - dev_tty_driver.type = TTY_DRIVER_TYPE_SYSTEM; - dev_tty_driver.subtype = SYSTEM_TYPE_TTY; - - if (tty_register_driver(&dev_tty_driver)) - panic("Couldn't register /dev/tty driver\n"); - - dev_syscons_driver = dev_tty_driver; - dev_syscons_driver.driver_name = "/dev/console"; - dev_syscons_driver.name = dev_syscons_driver.driver_name + 5; - dev_syscons_driver.major = TTYAUX_MAJOR; - dev_syscons_driver.minor_start = 1; - dev_syscons_driver.type = TTY_DRIVER_TYPE_SYSTEM; - dev_syscons_driver.subtype = SYSTEM_TYPE_SYSCONS; - - if (tty_register_driver(&dev_syscons_driver)) - panic("Couldn't register /dev/console driver\n"); - - /* console calls tty_register_driver() before kmalloc() works. - * Thus, we can't devfs_register() then. Do so now, instead. - */ -#ifdef CONFIG_VT - con_init_devfs(); -#endif - -#ifdef CONFIG_UNIX98_PTYS - dev_ptmx_driver = dev_tty_driver; - dev_ptmx_driver.driver_name = "/dev/ptmx"; - dev_ptmx_driver.name = dev_ptmx_driver.driver_name + 5; - dev_ptmx_driver.major= MAJOR(PTMX_DEV); - dev_ptmx_driver.minor_start = MINOR(PTMX_DEV); - dev_ptmx_driver.type = TTY_DRIVER_TYPE_SYSTEM; - dev_ptmx_driver.subtype = SYSTEM_TYPE_SYSPTMX; - - if (tty_register_driver(&dev_ptmx_driver)) - panic("Couldn't register /dev/ptmx driver\n"); -#endif - -#ifdef CONFIG_VT - dev_console_driver = dev_tty_driver; - dev_console_driver.driver_name = "/dev/vc/0"; - dev_console_driver.name = dev_console_driver.driver_name + 5; - dev_console_driver.major = TTY_MAJOR; - dev_console_driver.type = TTY_DRIVER_TYPE_SYSTEM; - dev_console_driver.subtype = SYSTEM_TYPE_CONSOLE; - - if (tty_register_driver(&dev_console_driver)) - panic("Couldn't register /dev/tty0 driver\n"); - - kbd_init(); -#endif - -#ifdef CONFIG_SGI_L1_SERIAL_CONSOLE - if (ia64_platform_is("sn2")) { - sn_sal_serial_console_init(); - return; /* only one console right now for SN2 */ - } -#endif -#ifdef CONFIG_ESPSERIAL /* init ESP before rs, so rs doesn't see the port */ - espserial_init(); -#endif -#if defined(CONFIG_MVME162_SCC) || defined(CONFIG_BVME6000_SCC) || defined(CONFIG_MVME147_SCC) - vme_scc_init(); -#endif -#ifdef CONFIG_SERIAL_TX3912 - tx3912_rs_init(); -#endif -#ifdef CONFIG_ROCKETPORT - rp_init(); -#endif -#ifdef CONFIG_SERIAL167 - serial167_init(); -#endif -#ifdef CONFIG_CYCLADES - cy_init(); -#endif -#ifdef CONFIG_STALLION - stl_init(); -#endif -#ifdef CONFIG_ISTALLION - stli_init(); -#endif -#ifdef CONFIG_DIGI - pcxe_init(); -#endif -#ifdef CONFIG_DIGIEPCA - pc_init(); -#endif -#ifdef CONFIG_SPECIALIX - specialix_init(); -#endif -#if (defined(CONFIG_8xx) || defined(CONFIG_CPM2)) - rs_8xx_init(); -#endif /* CONFIG_8xx */ - pty_init(); -#ifdef CONFIG_MOXA_SMARTIO - mxser_init(); -#endif -#ifdef CONFIG_MOXA_INTELLIO - moxa_init(); -#endif -#ifdef CONFIG_VT - vcs_init(); -#endif -#ifdef CONFIG_TN3270 - tub3270_init(); -#endif -#ifdef CONFIG_TN3215 - tty3215_init(); -#endif -#ifdef CONFIG_HWC - hwc_tty_init(); -#endif -#ifdef CONFIG_A2232 
- a2232board_init(); -#endif -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/drivers/scsi/aic7xxx/Makefile --- a/linux-2.4.30-xen-sparse/drivers/scsi/aic7xxx/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,97 +0,0 @@ -# -# drivers/scsi/aic7xxx/Makefile -# -# Makefile for the Linux aic7xxx SCSI driver. -# - -O_TARGET := aic7xxx_drv.o - -list-multi := aic7xxx.o aic79xx.o - -obj-$(CONFIG_SCSI_AIC7XXX) += aic7xxx.o -ifeq ($(CONFIG_PCI),y) -obj-$(CONFIG_SCSI_AIC79XX) += aic79xx.o -endif - -EXTRA_CFLAGS += -I$(TOPDIR)/drivers/scsi -Werror -#EXTRA_CFLAGS += -g - -# Platform Specific Files -obj-aic7xxx = aic7xxx_osm.o aic7xxx_proc.o - -# Core Files -obj-aic7xxx += aic7xxx_core.o aic7xxx_93cx6.o -ifeq ($(CONFIG_AIC7XXX_REG_PRETTY_PRINT),y) -obj-aic7xxx += aic7xxx_reg_print.o -endif - -#EISA Specific Files -AIC7XXX_EISA_ARCH = $(filter i386 alpha xen,$(ARCH)) -ifneq ($(AIC7XXX_EISA_ARCH),) -obj-aic7xxx += aic7770.o -# Platform Specific EISA Files -obj-aic7xxx += aic7770_osm.o -endif - -#PCI Specific Files -ifeq ($(CONFIG_PCI),y) -obj-aic7xxx += aic7xxx_pci.o -# Platform Specific PCI Files -obj-aic7xxx += aic7xxx_osm_pci.o -endif - -# Platform Specific U320 Files -obj-aic79xx = aic79xx_osm.o aic79xx_proc.o aic79xx_osm_pci.o -# Core Files -obj-aic79xx += aic79xx_core.o aic79xx_pci.o -ifeq ($(CONFIG_AIC79XX_REG_PRETTY_PRINT),y) -obj-aic79xx += aic79xx_reg_print.o -endif - -# Override our module desitnation -MOD_DESTDIR = $(shell cd .. && $(CONFIG_SHELL) $(TOPDIR)/scripts/pathdown.sh) - -include $(TOPDIR)/Rules.make - -aic7xxx_core.o: aic7xxx_seq.h -$(obj-aic7xxx): aic7xxx_reg.h -aic7xxx.o: aic7xxx_seq.h aic7xxx_reg.h $(obj-aic7xxx) - $(LD) $(LD_RFLAG) -r -o $@ $(obj-aic7xxx) - -aic79xx_core.o: aic79xx_seq.h -$(obj-aic79xx): aic79xx_reg.h -aic79xx.o: aic79xx_seq.h aic79xx_reg.h $(obj-aic79xx) - $(LD) $(LD_RFLAG) -r -o $@ $(obj-aic79xx) - -ifeq ($(CONFIG_AIC7XXX_BUILD_FIRMWARE),y) -aic7xxx_gen = aic7xxx_seq.h aic7xxx_reg.h -ifeq ($(CONFIG_AIC7XXX_REG_PRETTY_PRINT),y) -aic7xxx_gen += aic7xxx_reg_print.c -aic7xxx_asm_cmd = aicasm/aicasm -I. -r aic7xxx_reg.h \ - -p aic7xxx_reg_print.c -i aic7xxx_osm.h \ - -o aic7xxx_seq.h aic7xxx.seq -else -aic7xxx_asm_cmd = aicasm/aicasm -I. -r aic7xxx_reg.h \ - -o aic7xxx_seq.h aic7xxx.seq -endif -$(aic7xxx_gen): aic7xxx.seq aic7xxx.reg aicasm/aicasm - $(aic7xxx_asm_cmd) -endif - -ifeq ($(CONFIG_AIC79XX_BUILD_FIRMWARE),y) -aic79xx_gen = aic79xx_seq.h aic79xx_reg.h -ifeq ($(CONFIG_AIC79XX_REG_PRETTY_PRINT),y) -aic79xx_gen += aic79xx_reg_print.c -aic79xx_asm_cmd = aicasm/aicasm -I. -r aic79xx_reg.h \ - -p aic79xx_reg_print.c -i aic79xx_osm.h \ - -o aic79xx_seq.h aic79xx.seq -else -aic79xx_asm_cmd = aicasm/aicasm -I. -r aic79xx_reg.h \ - -o aic79xx_seq.h aic79xx.seq -endif -$(aic79xx_gen): aic79xx.seq aic79xx.reg aicasm/aicasm - $(aic79xx_asm_cmd) -endif - -aicasm/aicasm: aicasm/*.[chyl] - $(MAKE) -C aicasm diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/asm-xen/bugs.h --- a/linux-2.4.30-xen-sparse/include/asm-xen/bugs.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,53 +0,0 @@ -/* - * include/asm-i386/bugs.h - * - * Copyright (C) 1994 Linus Torvalds - * - * Cyrix stuff, June 1998 by: - * - Rafael R. Reilova (moved everything from head.S), - * <rreilova@xxxxxxxxxxxx> - * - Channing Corn (tests & fixes), - * - Andrew D. Balsa (code cleanup). 
- * - * Pentium III FXSR, SSE support - * Gareth Hughes <gareth@xxxxxxxxxxx>, May 2000 - */ - -/* - * This is included by init/main.c to check for architecture-dependent bugs. - * - * Needs: - * void check_bugs(void); - */ - -#include <linux/config.h> -#include <asm/processor.h> -#include <asm/i387.h> -#include <asm/msr.h> - - -static void __init check_fpu(void) -{ - boot_cpu_data.fdiv_bug = 0; -} - -static void __init check_hlt(void) -{ - boot_cpu_data.hlt_works_ok = 1; -} - -static void __init check_bugs(void) -{ - extern void __init boot_init_fpu(void); - - identify_cpu(&boot_cpu_data); - boot_init_fpu(); -#ifndef CONFIG_SMP - printk("CPU: "); - print_cpu_info(&boot_cpu_data); -#endif - check_fpu(); - check_hlt(); - system_utsname.machine[1] = '0' + - (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/asm-xen/desc.h --- a/linux-2.4.30-xen-sparse/include/asm-xen/desc.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,37 +0,0 @@ -#ifndef __ARCH_DESC_H -#define __ARCH_DESC_H - -#include <asm/ldt.h> - -#ifndef __ASSEMBLY__ - -struct desc_struct { - unsigned long a,b; -}; - -struct Xgt_desc_struct { - unsigned short size; - unsigned long address __attribute__((packed)); -}; - -extern struct desc_struct default_ldt[]; - -static inline void clear_LDT(void) -{ - xen_set_ldt(0, 0); -} - -static inline void load_LDT(mm_context_t *pc) -{ - void *segments = pc->ldt; - int count = pc->size; - - if ( count == 0 ) - segments = NULL; - - xen_set_ldt((unsigned long)segments, count); -} - -#endif /* __ASSEMBLY__ */ - -#endif /* __ARCH_DESC_H__ */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/asm-xen/fixmap.h --- a/linux-2.4.30-xen-sparse/include/asm-xen/fixmap.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,107 +0,0 @@ -/* - * fixmap.h: compile-time virtual memory allocation - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 1998 Ingo Molnar - * - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 - */ - -#ifndef _ASM_FIXMAP_H -#define _ASM_FIXMAP_H - -#include <linux/config.h> -#include <linux/kernel.h> -#include <asm/apicdef.h> -#include <asm/page.h> -#include <asm-xen/gnttab.h> -#ifdef CONFIG_HIGHMEM -#include <linux/threads.h> -#include <asm/kmap_types.h> -#endif - -/* - * Here we define all the compile-time 'special' virtual - * addresses. The point is to have a constant address at - * compile time, but to set the physical address only - * in the boot process. We allocate these special addresses - * from the end of virtual memory (0xfffff000) backwards. - * Also this lets us do fail-safe vmalloc(), we - * can guarantee that these special addresses and - * vmalloc()-ed addresses never overlap. - * - * these 'compile-time allocated' memory buffers are - * fixed-size 4k pages. (or larger if used with an increment - * highger than 1) use fixmap_set(idx,phys) to associate - * physical memory with fixmap indices. - * - * TLB entries of such buffers will not be flushed across - * task switches. 
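The fixmap scheme described in this comment hands out one page per enum index, counting down from a compile-time top address; the __fix_to_virt() arithmetic defined below is just this (constants here are illustrative, the real FIXADDR_TOP derives from HYPERVISOR_VIRT_START):

#define PAGE_SHIFT  12
#define FIXADDR_TOP 0xfffff000UL	/* made-up top for the demo */

static unsigned long fix_to_virt_demo(unsigned int idx)
{
	return FIXADDR_TOP - ((unsigned long)idx << PAGE_SHIFT);
}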
- */ - -enum fixed_addresses { -#ifdef CONFIG_HIGHMEM - FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ - FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, -#endif - FIX_BLKRING_BASE, - FIX_NETRING0_BASE, - FIX_NETRING1_BASE, - FIX_NETRING2_BASE, - FIX_NETRING3_BASE, - FIX_SHARED_INFO, - FIX_GNTTAB_BEGIN, - FIX_GNTTAB_END = FIX_GNTTAB_BEGIN + NR_GRANT_FRAMES - 1, -#ifdef CONFIG_VGA_CONSOLE -#define NR_FIX_BTMAPS 32 /* 128KB For the Dom0 VGA Console A0000-C0000 */ -#else -#define NR_FIX_BTMAPS 1 /* in case anyone wants it in future... */ -#endif - FIX_BTMAP_END, - FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1, - /* our bt_ioremap is permanent, unlike other architectures */ - - __end_of_permanent_fixed_addresses, - __end_of_fixed_addresses = __end_of_permanent_fixed_addresses -}; - -extern void __set_fixmap (enum fixed_addresses idx, - unsigned long phys, pgprot_t flags); - -#define set_fixmap(idx, phys) \ - __set_fixmap(idx, phys, PAGE_KERNEL) -/* - * Some hardware wants to get fixmapped without caching. - */ -#define set_fixmap_nocache(idx, phys) \ - __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE) - -extern void clear_fixmap(enum fixed_addresses idx); - -/* - * used by vmalloc.c. - * - * Leave one empty page between vmalloc'ed areas and - * the start of the fixmap, and leave one page empty - * at the top of mem.. - */ -#define FIXADDR_TOP (HYPERVISOR_VIRT_START - 2*PAGE_SIZE) -#define __FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) -#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE) - -#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) - -/* - * 'index to address' translation. If anyone tries to use the idx - * directly without tranlation, we catch the bug with a NULL-deference - * kernel oops. Illegal ranges of incoming indices are caught too. - */ -static inline unsigned long fix_to_virt(unsigned int idx) -{ - return __fix_to_virt(idx); -} - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/asm-xen/highmem.h --- a/linux-2.4.30-xen-sparse/include/asm-xen/highmem.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,132 +0,0 @@ -/* - * highmem.h: virtual kernel memory mappings for high memory - * - * Used in CONFIG_HIGHMEM systems for memory pages which - * are not addressable by direct kernel virtual addresses. - * - * Copyright (C) 1999 Gerhard Wichert, Siemens AG - * Gerhard.Wichert@xxxxxxxxxxxxxx - * - * - * Redesigned the x86 32-bit VM architecture to deal with - * up to 16 Terabyte physical memory. With current x86 CPUs - * we now support up to 64 Gigabytes physical RAM. - * - * Copyright (C) 1999 Ingo Molnar <mingo@xxxxxxxxxx> - */ - -#ifndef _ASM_HIGHMEM_H -#define _ASM_HIGHMEM_H - -#ifdef __KERNEL__ - -#include <linux/config.h> -#include <linux/init.h> -#include <linux/interrupt.h> -#include <asm/kmap_types.h> -#include <asm/pgtable.h> - -#ifdef CONFIG_DEBUG_HIGHMEM -#define HIGHMEM_DEBUG 1 -#else -#define HIGHMEM_DEBUG 0 -#endif - -/* declarations for highmem.c */ -extern unsigned long highstart_pfn, highend_pfn; - -extern pte_t *kmap_pte; -extern pgprot_t kmap_prot; -extern pte_t *pkmap_page_table; - -extern void kmap_init(void) __init; - -/* - * Right now we initialize only a single pte table. It can be extended - * easily, subsequent pte tables have to be allocated in one physical - * chunk of RAM. 
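The pkmap window introduced here is LAST_PKMAP pages starting at PKMAP_BASE; the PKMAP_NR()/PKMAP_ADDR() macros just below are inverse page-granular maps. A round-trip check, with stand-in constants:

#define PAGE_SHIFT 12
#define PKMAP_BASE 0xff800000UL	/* illustrative; the real base is Xen-relative */
#define PKMAP_NR(virt)  (((virt) - PKMAP_BASE) >> PAGE_SHIFT)
#define PKMAP_ADDR(nr)  (PKMAP_BASE + ((unsigned long)(nr) << PAGE_SHIFT))

/* Holds for any page-aligned address inside the window. */
static int pkmap_roundtrip_ok(unsigned long v)
{
	return PKMAP_ADDR(PKMAP_NR(v)) == v;
}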
- */ -#define PKMAP_BASE (HYPERVISOR_VIRT_START - (1<<23)) -#ifdef CONFIG_X86_PAE -#define LAST_PKMAP 512 -#else -#define LAST_PKMAP 1024 -#endif -#define LAST_PKMAP_MASK (LAST_PKMAP-1) -#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) -#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) - -extern void * FASTCALL(kmap_high(struct page *page, int nonblocking)); -extern void FASTCALL(kunmap_high(struct page *page)); - -#define kmap(page) __kmap(page, 0) -#define kmap_nonblock(page) __kmap(page, 1) - -static inline void *__kmap(struct page *page, int nonblocking) -{ - if (in_interrupt()) - out_of_line_bug(); - if (page < highmem_start_page) - return page_address(page); - return kmap_high(page, nonblocking); -} - -static inline void kunmap(struct page *page) -{ - if (in_interrupt()) - out_of_line_bug(); - if (page < highmem_start_page) - return; - kunmap_high(page); -} - -/* - * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap - * gives a more generic (and caching) interface. But kmap_atomic can - * be used in IRQ contexts, so in some (very limited) cases we need - * it. - */ -static inline void *kmap_atomic(struct page *page, enum km_type type) -{ - enum fixed_addresses idx; - unsigned long vaddr; - - if (page < highmem_start_page) - return page_address(page); - - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); -#if HIGHMEM_DEBUG - if (!pte_none(*(kmap_pte-idx))) - out_of_line_bug(); -#endif - set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); - __flush_tlb_one(vaddr); - - return (void*) vaddr; -} - -static inline void kunmap_atomic(void *kvaddr, enum km_type type) -{ -#if HIGHMEM_DEBUG - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); - - if (vaddr < FIXADDR_START) // FIXME - return; - - if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)) - out_of_line_bug(); - - /* - * force other mappings to Oops if they'll try to access - * this pte without first remap it - */ - pte_clear(kmap_pte-idx); - __flush_tlb_one(vaddr); -#endif -} - -#endif /* __KERNEL__ */ - -#endif /* _ASM_HIGHMEM_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/asm-xen/hw_irq.h --- a/linux-2.4.30-xen-sparse/include/asm-xen/hw_irq.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,61 +0,0 @@ -#ifndef _ASM_HW_IRQ_H -#define _ASM_HW_IRQ_H - -/* - * linux/include/asm/hw_irq.h - * - * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar - */ - -#include <linux/config.h> -#include <linux/smp.h> -#include <asm/atomic.h> -#include <asm/irq.h> - -#define SYSCALL_VECTOR 0x80 - -extern int irq_vector[NR_IRQS]; - -extern atomic_t irq_err_count; -extern atomic_t irq_mis_count; - -extern char _stext, _etext; - -extern unsigned long prof_cpu_mask; -extern unsigned int * prof_buffer; -extern unsigned long prof_len; -extern unsigned long prof_shift; - -/* - * x86 profiling function, SMP safe. We might want to do this in - * assembly totally? - */ -static inline void x86_do_profile (unsigned long eip) -{ - if (!prof_buffer) - return; - - /* - * Only measure the CPUs specified by /proc/irq/prof_cpu_mask. - * (default is all CPUs.) - */ - if (!((1<<smp_processor_id()) & prof_cpu_mask)) - return; - - eip -= (unsigned long) &_stext; - eip >>= prof_shift; - /* - * Don't ignore out-of-bounds EIP values silently, - * put them into the last histogram slot, so if - * present, they will show up as a sharp peak. 
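x86_do_profile() maps a program counter to a histogram bucket: subtract _stext, shift by prof_shift, and clamp strays into the last slot, which is exactly what the lines below do. The arithmetic alone:

static unsigned long prof_bucket(unsigned long eip, unsigned long stext,
				 unsigned long shift, unsigned long len)
{
	eip = (eip - stext) >> shift;
	return eip > len - 1 ? len - 1 : eip;	/* out-of-bounds -> last bucket */
}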
- */ - if (eip > prof_len-1) - eip = prof_len-1; - atomic_inc((atomic_t *)&prof_buffer[eip]); -} - -static inline void hw_resend_irq(struct hw_interrupt_type *h, - unsigned int i) -{} - -#endif /* _ASM_HW_IRQ_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/asm-xen/io.h --- a/linux-2.4.30-xen-sparse/include/asm-xen/io.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,457 +0,0 @@ -#ifndef _ASM_IO_H -#define _ASM_IO_H - -#include <linux/config.h> - -/* - * This file contains the definitions for the x86 IO instructions - * inb/inw/inl/outb/outw/outl and the "string versions" of the same - * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing" - * versions of the single-IO instructions (inb_p/inw_p/..). - * - * This file is not meant to be obfuscating: it's just complicated - * to (a) handle it all in a way that makes gcc able to optimize it - * as well as possible and (b) trying to avoid writing the same thing - * over and over again with slight variations and possibly making a - * mistake somewhere. - */ - -/* - * Thanks to James van Artsdalen for a better timing-fix than - * the two short jumps: using outb's to a nonexistent port seems - * to guarantee better timings even on fast machines. - * - * On the other hand, I'd like to be sure of a non-existent port: - * I feel a bit unsafe about using 0x80 (should be safe, though) - * - * Linus - */ - - /* - * Bit simplified and optimized by Jan Hubicka - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999. - * - * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added, - * isa_read[wl] and isa_write[wl] fixed - * - Arnaldo Carvalho de Melo <acme@xxxxxxxxxxxxxxxx> - */ - -#define IO_SPACE_LIMIT 0xffff - -#define XQUAD_PORTIO_BASE 0xfe400000 -#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */ -#define XQUAD_PORTIO_LEN 0x80000 /* Only remapping first 2 quads */ - -#ifdef __KERNEL__ - -#include <linux/vmalloc.h> - -/* - * Temporary debugging check to catch old code using - * unmapped ISA addresses. Will be removed in 2.4. - */ -#if CONFIG_DEBUG_IOVIRT - extern void *__io_virt_debug(unsigned long x, const char *file, int line); - extern unsigned long __io_phys_debug(unsigned long x, const char *file, int line); - #define __io_virt(x) __io_virt_debug((unsigned long)(x), __FILE__, __LINE__) -//#define __io_phys(x) __io_phys_debug((unsigned long)(x), __FILE__, __LINE__) -#else - #define __io_virt(x) ((void *)(x)) -//#define __io_phys(x) __pa(x) -#endif - -/** - * virt_to_phys - map virtual addresses to physical - * @address: address to remap - * - * The returned physical address is the physical (CPU) mapping for - * the memory address given. It is only valid to use this function on - * addresses directly mapped or allocated via kmalloc. - * - * This function does not give bus mappings for DMA transfers. In - * almost all conceivable cases a device driver should not be using - * this function - */ - -static inline unsigned long virt_to_phys(volatile void * address) -{ - return __pa(address); -} - -/** - * phys_to_virt - map physical address to virtual - * @address: address to remap - * - * The returned virtual address is a current CPU mapping for - * the memory address given. It is only valid to use this function on - * addresses that have a kernel mapping - * - * This function does not handle bus mappings for DMA transfers. 
In - * almost all conceivable cases a device driver should not be using - * this function - */ - -static inline void * phys_to_virt(unsigned long address) -{ - return __va(address); -} - -/* - * We define page_to_phys 'incorrectly' because it is used when merging blkdev - * requests, and the correct thing to do there is to use machine addresses. - */ -#define page_to_phys(_x) phys_to_machine(((_x) - mem_map) << PAGE_SHIFT) - -extern void * __ioremap(unsigned long offset, unsigned long size, unsigned long flags); - -/** - * ioremap - map bus memory into CPU space - * @offset: bus address of the memory - * @size: size of the resource to map - * - * ioremap performs a platform specific sequence of operations to - * make bus memory CPU accessible via the readb/readw/readl/writeb/ - * writew/writel functions and the other mmio helpers. The returned - * address is not guaranteed to be usable directly as a virtual - * address. - */ - -static inline void * ioremap (unsigned long offset, unsigned long size) -{ - return __ioremap(offset, size, 0); -} - -/** - * ioremap_nocache - map bus memory into CPU space - * @offset: bus address of the memory - * @size: size of the resource to map - * - * ioremap_nocache performs a platform specific sequence of operations to - * make bus memory CPU accessible via the readb/readw/readl/writeb/ - * writew/writel functions and the other mmio helpers. The returned - * address is not guaranteed to be usable directly as a virtual - * address. - * - * This version of ioremap ensures that the memory is marked uncachable - * on the CPU as well as honouring existing caching rules from things like - * the PCI bus. Note that there are other caches and buffers on many - * busses. In paticular driver authors should read up on PCI writes - * - * It's useful if some control registers are in such an area and - * write combining or read caching is not desirable: - */ - -static inline void * ioremap_nocache (unsigned long offset, unsigned long size) -{ - return __ioremap(offset, size, _PAGE_PCD); -} - -extern void iounmap(void *addr); - -/* - * bt_ioremap() and bt_iounmap() are for temporary early boot-time - * mappings, before the real ioremap() is functional. - * A boot-time mapping is currently limited to at most 16 pages. - */ -extern void *bt_ioremap(unsigned long offset, unsigned long size); -extern void bt_iounmap(void *addr, unsigned long size); - -#define virt_to_bus(_x) phys_to_machine(virt_to_phys(_x)) -#define bus_to_virt(_x) phys_to_virt(machine_to_phys(_x)) -#define page_to_bus(_x) phys_to_machine(((_x) - mem_map) << PAGE_SHIFT) -#define bus_to_phys(_x) machine_to_phys(_x) -#define bus_to_page(_x) (mem_map + (bus_to_phys(_x) >> PAGE_SHIFT)) - -/* - * readX/writeX() are used to access memory mapped devices. On some - * architectures the memory mapped IO stuff needs to be accessed - * differently. On the x86 architecture, we just read/write the - * memory location directly. 
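On x86 the readX()/writeX() macros defined just below are plain volatile memory accesses; spelled out as functions, the 32-bit pair is:

/* volatile forces exactly one load/store per call, never cached in a register */
static unsigned int mmio_read32(unsigned long addr)
{
	return *(volatile unsigned int *)addr;
}

static void mmio_write32(unsigned int val, unsigned long addr)
{
	*(volatile unsigned int *)addr = val;
}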
- */ - -#define readb(addr) (*(volatile unsigned char *) __io_virt(addr)) -#define readw(addr) (*(volatile unsigned short *) __io_virt(addr)) -#define readl(addr) (*(volatile unsigned int *) __io_virt(addr)) -#define __raw_readb readb -#define __raw_readw readw -#define __raw_readl readl - -#define writeb(b,addr) (*(volatile unsigned char *) __io_virt(addr) = (b)) -#define writew(b,addr) (*(volatile unsigned short *) __io_virt(addr) = (b)) -#define writel(b,addr) (*(volatile unsigned int *) __io_virt(addr) = (b)) -#define __raw_writeb writeb -#define __raw_writew writew -#define __raw_writel writel - -#define memset_io(a,b,c) __memset(__io_virt(a),(b),(c)) -#define memcpy_fromio(a,b,c) __memcpy((a),__io_virt(b),(c)) -#define memcpy_toio(a,b,c) __memcpy(__io_virt(a),(b),(c)) - -/* - * ISA space is 'always mapped' on a typical x86 system, no need to - * explicitly ioremap() it. The fact that the ISA IO space is mapped - * to PAGE_OFFSET is pure coincidence - it does not mean ISA values - * are physical addresses. The following constant pointer can be - * used as the IO-area pointer (it can be iounmapped as well, so the - * analogy with PCI is quite large): - */ -#define __ISA_IO_base ((char *)(PAGE_OFFSET)) - -#define isa_readb(a) readb(__ISA_IO_base + (a)) -#define isa_readw(a) readw(__ISA_IO_base + (a)) -#define isa_readl(a) readl(__ISA_IO_base + (a)) -#define isa_writeb(b,a) writeb(b,__ISA_IO_base + (a)) -#define isa_writew(w,a) writew(w,__ISA_IO_base + (a)) -#define isa_writel(l,a) writel(l,__ISA_IO_base + (a)) -#define isa_memset_io(a,b,c) memset_io(__ISA_IO_base + (a),(b),(c)) -#define isa_memcpy_fromio(a,b,c) memcpy_fromio((a),__ISA_IO_base + (b),(c)) -#define isa_memcpy_toio(a,b,c) memcpy_toio(__ISA_IO_base + (a),(b),(c)) - - -/* - * Again, i386 does not require mem IO specific function. - */ - -#define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),__io_virt(b),(c),(d)) -#define isa_eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),__io_virt(__ISA_IO_base + (b)),(c),(d)) - -/** - * check_signature - find BIOS signatures - * @io_addr: mmio address to check - * @signature: signature block - * @length: length of signature - * - * Perform a signature comparison with the mmio address io_addr. This - * address should have been obtained by ioremap. - * Returns 1 on a match. - */ - -static inline int check_signature(unsigned long io_addr, - const unsigned char *signature, int length) -{ - int retval = 0; - do { - if (readb(io_addr) != *signature) - goto out; - io_addr++; - signature++; - length--; - } while (length); - retval = 1; -out: - return retval; -} - -/** - * isa_check_signature - find BIOS signatures - * @io_addr: mmio address to check - * @signature: signature block - * @length: length of signature - * - * Perform a signature comparison with the ISA mmio address io_addr. - * Returns 1 on a match. - * - * This function is deprecated. New drivers should use ioremap and - * check_signature. - */ - - -static inline int isa_check_signature(unsigned long io_addr, - const unsigned char *signature, int length) -{ - int retval = 0; - do { - if (isa_readb(io_addr) != *signature) - goto out; - io_addr++; - signature++; - length--; - } while (length); - retval = 1; -out: - return retval; -} - -/* - * Cache management - * - * This needed for two cases - * 1. Out of order aware processors - * 2. 
Accidentally out of order processors (PPro errata #51) - */ - -#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE) - -static inline void flush_write_buffers(void) -{ - __asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory"); -} - -#define dma_cache_inv(_start,_size) flush_write_buffers() -#define dma_cache_wback(_start,_size) flush_write_buffers() -#define dma_cache_wback_inv(_start,_size) flush_write_buffers() - -#else - -/* Nothing to do */ - -#define dma_cache_inv(_start,_size) do { } while (0) -#define dma_cache_wback(_start,_size) do { } while (0) -#define dma_cache_wback_inv(_start,_size) do { } while (0) -#define flush_write_buffers() - -#endif - -#endif /* __KERNEL__ */ - -#ifdef SLOW_IO_BY_JUMPING -#define __SLOW_DOWN_IO "\njmp 1f\n1:\tjmp 1f\n1:" -#elif defined(__UNSAFE_IO__) -#define __SLOW_DOWN_IO "\noutb %%al,$0x80" -#else -#define __SLOW_DOWN_IO "\n1: outb %%al,$0x80\n" \ - "2:\n" \ - ".section __ex_table,\"a\"\n\t" \ - ".align 4\n\t" \ - ".long 1b,2b\n" \ - ".previous" -#endif - -#ifdef REALLY_SLOW_IO -#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO -#else -#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO -#endif - -#ifdef CONFIG_MULTIQUAD -extern void *xquad_portio; /* Where the IO area was mapped */ -#endif /* CONFIG_MULTIQUAD */ - -/* - * Talk about misusing macros.. - */ -#define __OUT1(s,x) \ -static inline void out##s(unsigned x value, unsigned short port) { - -#ifdef __UNSAFE_IO__ -#define __OUT2(s,s1,s2) \ -__asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1" -#else -#define __OUT2(s,s1,s2) \ -__asm__ __volatile__ ("1: out" #s " %" s1 "0,%" s2 "1\n" \ - "2:\n" \ - ".section __ex_table,\"a\"\n\t" \ - ".align 4\n\t" \ - ".long 1b,2b\n" \ - ".previous" -#endif - -#if defined (CONFIG_MULTIQUAD) && !defined(STANDALONE) -#define __OUTQ(s,ss,x) /* Do the equivalent of the portio op on quads */ \ -static inline void out##ss(unsigned x value, unsigned short port) { \ - if (xquad_portio) \ - write##s(value, (unsigned long) xquad_portio + port); \ - else /* We're still in early boot, running on quad 0 */ \ - out##ss##_local(value, port); \ -} \ -static inline void out##ss##_quad(unsigned x value, unsigned short port, int quad) { \ - if (xquad_portio) \ - write##s(value, (unsigned long) xquad_portio + (XQUAD_PORTIO_QUAD*quad)\ - + port); \ -} - -#define __INQ(s,ss) /* Do the equivalent of the portio op on quads */ \ -static inline RETURN_TYPE in##ss(unsigned short port) { \ - if (xquad_portio) \ - return read##s((unsigned long) xquad_portio + port); \ - else /* We're still in early boot, running on quad 0 */ \ - return in##ss##_local(port); \ -} \ -static inline RETURN_TYPE in##ss##_quad(unsigned short port, int quad) { \ - if (xquad_portio) \ - return read##s((unsigned long) xquad_portio + (XQUAD_PORTIO_QUAD*quad)\ - + port); \ - else\ - return 0;\ -} -#endif /* CONFIG_MULTIQUAD && !STANDALONE */ - -#if !defined(CONFIG_MULTIQUAD) || defined(STANDALONE) -#define __OUT(s,s1,x) \ -__OUT1(s,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \ -__OUT1(s##_p,x) __OUT2(s,s1,"w") __FULL_SLOW_DOWN_IO : : "a" (value), "Nd" (port));} -#else -/* Make the default portio routines operate on quad 0 */ -#define __OUT(s,s1,x) \ -__OUT1(s##_local,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \ -__OUT1(s##_p_local,x) __OUT2(s,s1,"w") __FULL_SLOW_DOWN_IO : : "a" (value), "Nd" (port));} \ -__OUTQ(s,s,x) \ -__OUTQ(s,s##_p,x) -#endif /* !CONFIG_MULTIQUAD || STANDALONE */ - -#define __IN1(s) \ -static inline RETURN_TYPE in##s(unsigned short 
port) { RETURN_TYPE _v; - -#ifdef __UNSAFE_IO__ -#define __IN2(s,s1,s2) \ -__asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0" -#else -#define __IN2(s,s1,s2) \ -__asm__ __volatile__ ("1: in" #s " %" s2 "1,%" s1 "0\n" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: mov" #s " $~0,%" s1 "0\n\t" \ - "jmp 2b\n" \ - ".previous\n" \ - ".section __ex_table,\"a\"\n\t" \ - ".align 4\n\t" \ - ".long 1b,3b\n" \ - ".previous" -#endif - -#if !defined(CONFIG_MULTIQUAD) || defined(STANDALONE) -#define __IN(s,s1,i...) \ -__IN1(s) __IN2(s,s1,"w") : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \ -__IN1(s##_p) __IN2(s,s1,"w") __FULL_SLOW_DOWN_IO : "=a" (_v) : "Nd" (port) ,##i ); return _v; } -#else -/* Make the default portio routines operate on quad 0 */ -#define __IN(s,s1,i...) \ -__IN1(s##_local) __IN2(s,s1,"w") : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \ -__IN1(s##_p_local) __IN2(s,s1,"w") __FULL_SLOW_DOWN_IO : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \ -__INQ(s,s) \ -__INQ(s,s##_p) -#endif /* !CONFIG_MULTIQUAD || STANDALONE */ - -#define __INS(s) \ -static inline void ins##s(unsigned short port, void * addr, unsigned long count) \ -{ __asm__ __volatile__ ("rep ; ins" #s \ -: "=D" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); } - -#define __OUTS(s) \ -static inline void outs##s(unsigned short port, const void * addr, unsigned long count) \ -{ __asm__ __volatile__ ("rep ; outs" #s \ -: "=S" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); } - -#define RETURN_TYPE unsigned char -__IN(b,"") -#undef RETURN_TYPE -#define RETURN_TYPE unsigned short -__IN(w,"") -#undef RETURN_TYPE -#define RETURN_TYPE unsigned int -__IN(l,"") -#undef RETURN_TYPE - -__OUT(b,"b",char) -__OUT(w,"w",short) -__OUT(l,,int) - -__INS(b) -__INS(w) -__INS(l) - -__OUTS(b) -__OUTS(w) -__OUTS(l) - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/asm-xen/irq.h --- a/linux-2.4.30-xen-sparse/include/asm-xen/irq.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,65 +0,0 @@ -#ifndef _ASM_IRQ_H -#define _ASM_IRQ_H - -/* - * linux/include/asm/irq.h - * - * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar - * - * IRQ/IPI changes taken from work by Thomas Radke - * <tomsoft@xxxxxxxxxxxxxxxxxxxxxxxxx> - */ - -#include <linux/config.h> -#include <asm/hypervisor.h> -#include <asm/ptrace.h> - -/* - * The flat IRQ space is divided into two regions: - * 1. A one-to-one mapping of real physical IRQs. This space is only used - * if we have physical device-access privilege. This region is at the - * start of the IRQ space so that existing device drivers do not need - * to be modified to translate physical IRQ numbers into our IRQ space. - * 3. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These - * are bound using the provided bind/unbind functions. - */ - -#define PIRQ_BASE 0 -#define NR_PIRQS 128 - -#define DYNIRQ_BASE (PIRQ_BASE + NR_PIRQS) -#define NR_DYNIRQS 128 - -#define NR_IRQS (NR_PIRQS + NR_DYNIRQS) - -#define pirq_to_irq(_x) ((_x) + PIRQ_BASE) -#define irq_to_pirq(_x) ((_x) - PIRQ_BASE) - -#define dynirq_to_irq(_x) ((_x) + DYNIRQ_BASE) -#define irq_to_dynirq(_x) ((_x) - DYNIRQ_BASE) - -/* Dynamic binding of event channels and VIRQ sources to Linux IRQ space. */ -extern int bind_virq_to_irq(int virq); -extern void unbind_virq_from_irq(int virq); -extern int bind_evtchn_to_irq(int evtchn); -extern void unbind_evtchn_from_irq(int evtchn); - -static __inline__ int irq_cannonicalize(int irq) -{ - return (irq == 2) ? 
9 : irq; -} - -extern void disable_irq(unsigned int); -extern void disable_irq_nosync(unsigned int); -extern void enable_irq(unsigned int); - -extern void irq_suspend(void); -extern void irq_resume(void); - - -#define CPU_MASK_NONE 0 - -/* XXX SMH: no-op for compat w/ 2.6 shared files */ -#define irq_ctx_init(cpu) do { ; } while (0) - -#endif /* _ASM_IRQ_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/asm-xen/keyboard.h --- a/linux-2.4.30-xen-sparse/include/asm-xen/keyboard.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,74 +0,0 @@ -/* - * linux/include/asm-i386/keyboard.h - * - * Created 3 Nov 1996 by Geert Uytterhoeven - */ - -/* - * This file contains the i386 architecture specific keyboard definitions - */ - -#ifndef _I386_KEYBOARD_H -#define _I386_KEYBOARD_H - -#ifdef __KERNEL__ - -#include <linux/kernel.h> -#include <linux/ioport.h> -#include <linux/kd.h> -#include <linux/pm.h> -#include <asm/io.h> - -#define KEYBOARD_IRQ 1 -#define DISABLE_KBD_DURING_INTERRUPTS 0 - -extern int pckbd_setkeycode(unsigned int scancode, unsigned int keycode); -extern int pckbd_getkeycode(unsigned int scancode); -extern int pckbd_translate(unsigned char scancode, unsigned char *keycode, - char raw_mode); -extern char pckbd_unexpected_up(unsigned char keycode); -extern void pckbd_leds(unsigned char leds); -extern void pckbd_init_hw(void); -extern int pckbd_pm_resume(struct pm_dev *, pm_request_t, void *); -extern pm_callback pm_kbd_request_override; -extern unsigned char pckbd_sysrq_xlate[128]; - -#define kbd_setkeycode pckbd_setkeycode -#define kbd_getkeycode pckbd_getkeycode -#define kbd_translate pckbd_translate -#define kbd_unexpected_up pckbd_unexpected_up -#define kbd_leds pckbd_leds -#define kbd_init_hw pckbd_init_hw -#define kbd_sysrq_xlate pckbd_sysrq_xlate - -#define SYSRQ_KEY 0x54 - -#define kbd_controller_present() (xen_start_info.flags & SIF_INITDOMAIN) - -/* resource allocation */ -#define kbd_request_region() -#define kbd_request_irq(handler) request_irq(KEYBOARD_IRQ, handler, 0, \ - "keyboard", NULL) - -/* How to access the keyboard macros on this platform. */ -#define kbd_read_input() inb(KBD_DATA_REG) -#define kbd_read_status() inb(KBD_STATUS_REG) -#define kbd_write_output(val) outb(val, KBD_DATA_REG) -#define kbd_write_command(val) outb(val, KBD_CNTL_REG) - -/* Some stoneage hardware needs delays after some operations. */ -#define kbd_pause() do { } while(0) - -/* - * Machine specific bits for the PS/2 driver - */ - -#define AUX_IRQ 12 - -#define aux_request_irq(hand, dev_id) \ - request_irq(AUX_IRQ, hand, SA_SHIRQ, "PS/2 Mouse", dev_id) - -#define aux_free_irq(dev_id) free_irq(AUX_IRQ, dev_id) - -#endif /* __KERNEL__ */ -#endif /* _I386_KEYBOARD_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/asm-xen/mmu_context.h --- a/linux-2.4.30-xen-sparse/include/asm-xen/mmu_context.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,59 +0,0 @@ -#ifndef __I386_MMU_CONTEXT_H -#define __I386_MMU_CONTEXT_H - -#include <linux/config.h> -#include <asm/desc.h> -#include <asm/atomic.h> -#include <asm/pgalloc.h> - -/* - * hooks to add arch specific data into the mm struct. - * Note that destroy_context is called even if init_new_context - * fails. 
- */ -int init_new_context(struct task_struct *tsk, struct mm_struct *mm); -void destroy_context(struct mm_struct *mm); - -#ifdef CONFIG_SMP - -static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk, unsigned cpu) -{ - if(cpu_tlbstate[cpu].state == TLBSTATE_OK) - cpu_tlbstate[cpu].state = TLBSTATE_LAZY; -} -#else -static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk, unsigned cpu) -{ -} -#endif - -extern pgd_t *cur_pgd; - -static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk, unsigned cpu) -{ - struct mmuext_op _op[2], *op = _op; - if (prev != next) { - /* stop flush ipis for the previous mm */ - clear_bit(cpu, &prev->cpu_vm_mask); - /* Re-load page tables */ - cur_pgd = next->pgd; - op->cmd = MMUEXT_NEW_BASEPTR; - op->mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT); - op++; - /* load_LDT, if either the previous or next thread - * has a non-default LDT. - */ - if (next->context.size+prev->context.size) { - op->cmd = MMUEXT_SET_LDT; - op->linear_addr = (unsigned long)next->context.ldt; - op->nr_ents = next->context.size; - op++; - } - BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF)); - } -} - -#define activate_mm(prev, next) \ - switch_mm((prev),(next),NULL,smp_processor_id()) - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/asm-xen/module.h --- a/linux-2.4.30-xen-sparse/include/asm-xen/module.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,14 +0,0 @@ -#ifndef _ASM_I386_MODULE_H -#define _ASM_I386_MODULE_H -/* - * This file contains the i386 architecture specific module code. - */ - -extern int xen_module_init(struct module *mod); - -#define module_map(x) vmalloc(x) -#define module_unmap(x) vfree(x) -#define module_arch_init(x) xen_module_init(x) -#define arch_init_modules(x) do { } while (0) - -#endif /* _ASM_I386_MODULE_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/asm-xen/page.h --- a/linux-2.4.30-xen-sparse/include/asm-xen/page.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,178 +0,0 @@ -#ifndef _I386_PAGE_H -#define _I386_PAGE_H - -/* PAGE_SHIFT determines the page size */ -#define PAGE_SHIFT 12 -#define PAGE_SIZE (1UL << PAGE_SHIFT) -#define PAGE_MASK (~(PAGE_SIZE-1)) - -#ifdef __KERNEL__ -#ifndef __ASSEMBLY__ - -#include <linux/config.h> -#include <linux/string.h> -#include <asm/types.h> -#include <asm-xen/xen-public/xen.h> - -#ifdef CONFIG_XEN_SCRUB_PAGES -#define scrub_pages(_p,_n) memset((void *)(_p), 0, (_n) << PAGE_SHIFT) -#else -#define scrub_pages(_p,_n) ((void)0) -#endif - -#ifdef CONFIG_X86_USE_3DNOW - -#include <asm/mmx.h> - -#define clear_page(page) mmx_clear_page((void *)(page)) -#define copy_page(to,from) mmx_copy_page(to,from) - -#else - -/* - * On older X86 processors its not a win to use MMX here it seems. - * Maybe the K6-III ? 
- */ - -#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE) -#define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE) - -#endif - -#define clear_user_page(page, vaddr) clear_page(page) -#define copy_user_page(to, from, vaddr) copy_page(to, from) - -/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ -extern unsigned int *phys_to_machine_mapping; -#define pfn_to_mfn(_pfn) ((unsigned long)(phys_to_machine_mapping[(_pfn)])) -#define mfn_to_pfn(_mfn) ((unsigned long)(machine_to_phys_mapping[(_mfn)])) -static inline unsigned long phys_to_machine(unsigned long phys) -{ - unsigned long machine = pfn_to_mfn(phys >> PAGE_SHIFT); - machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK); - return machine; -} -static inline unsigned long machine_to_phys(unsigned long machine) -{ - unsigned long phys = mfn_to_pfn(machine >> PAGE_SHIFT); - phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK); - return phys; -} - -/* - * These are used to make use of C type-checking.. - */ -#if CONFIG_X86_PAE -typedef struct { unsigned long pte_low, pte_high; } pte_t; -typedef struct { unsigned long long pmd; } pmd_t; -typedef struct { unsigned long long pgd; } pgd_t; -#define pte_val(x) ((x).pte_low | ((unsigned long long)(x).pte_high << 32)) -#else -typedef struct { unsigned long pte_low; } pte_t; -typedef struct { unsigned long pmd; } pmd_t; -typedef struct { unsigned long pgd; } pgd_t; -static inline unsigned long pte_val(pte_t x) -{ - unsigned long ret = x.pte_low; - if ( (ret & 1) ) ret = machine_to_phys(ret); - return ret; -} -#define pte_val_ma(x) ((x).pte_low) -#endif -#define PTE_MASK PAGE_MASK - -typedef struct { unsigned long pgprot; } pgprot_t; - -static inline unsigned long pmd_val(pmd_t x) -{ - unsigned long ret = x.pmd; - if ( ret ) ret = machine_to_phys(ret) | 1; - return ret; -} -#define pmd_val_ma(x) ((x).pmd) -#define pgd_val(x) ({ BUG(); (unsigned long)0; }) -#define pgprot_val(x) ((x).pgprot) - -#define __pte(x) ({ unsigned long _x = (x); \ - (((_x)&1) ? ((pte_t) {phys_to_machine(_x)}) : ((pte_t) {(_x)})); }) -#define __pte_ma(x) ((pte_t) { (x) } ) -#define __pmd(x) ({ unsigned long _x = (x); \ - (((_x)&1) ? ((pmd_t) {phys_to_machine(_x)}) : ((pmd_t) {(_x)})); }) -#define __pgd(x) ({ BUG(); (pgprot_t) { 0 }; }) -#define __pgprot(x) ((pgprot_t) { (x) } ) - -#endif /* !__ASSEMBLY__ */ - -/* to align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) - -/* - * This handles the memory map.. We could make this a config - * option, but too many people screw it up, and too few need - * it. - * - * A __PAGE_OFFSET of 0xC0000000 means that the kernel has - * a virtual address space of one gigabyte, which limits the - * amount of physical memory you can use to about 950MB. - * - * If you want more physical memory than this then see the CONFIG_HIGHMEM4G - * and CONFIG_HIGHMEM64G options in the kernel configuration. - */ - -#define __PAGE_OFFSET (0xC0000000) - -#ifndef __ASSEMBLY__ - -/* - * Tell the user there is some problem. Beep too, so we can - * see^H^H^Hhear bugs in early bootup as well! - * The offending file and line are encoded after the "officially - * undefined" opcode for parsing in the trap handler. 
- */ - -#if 1 /* Set to zero for a slightly smaller kernel */ -#define BUG() \ - __asm__ __volatile__( "ud2\n" \ - "\t.word %c0\n" \ - "\t.long %c1\n" \ - : : "i" (__LINE__), "i" (__FILE__)) -#else -#define BUG() __asm__ __volatile__("ud2\n") -#endif - -#define PAGE_BUG(page) do { \ - BUG(); \ -} while (0) - -/* Pure 2^n version of get_order */ -static __inline__ int get_order(unsigned long size) -{ - int order; - - size = (size-1) >> (PAGE_SHIFT-1); - order = -1; - do { - size >>= 1; - order++; - } while (size); - return order; -} - -#endif /* __ASSEMBLY__ */ - -#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) -#define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) -#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) -#define virt_to_page(kaddr) (mem_map + (__pa(kaddr) >> PAGE_SHIFT)) -#define VALID_PAGE(page) ((page - mem_map) < max_mapnr) - -#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) - -/* VIRT <-> MACHINE conversion */ -#define virt_to_machine(_a) (phys_to_machine(__pa(_a))) -#define machine_to_virt(_m) (__va(machine_to_phys(_m))) - -#endif /* __KERNEL__ */ - -#endif /* _I386_PAGE_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/asm-xen/pci.h --- a/linux-2.4.30-xen-sparse/include/asm-xen/pci.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,283 +0,0 @@ -#ifndef __i386_PCI_H -#define __i386_PCI_H - -#include <linux/config.h> - -#ifdef __KERNEL__ - -/* Can be used to override the logic in pci_scan_bus for skipping - already-configured bus numbers - to be used for buggy BIOSes - or architectures with incomplete PCI setup by the loader */ - -#ifdef CONFIG_PCI -extern unsigned int pcibios_assign_all_busses(void); -#else -#define pcibios_assign_all_busses() 0 -#endif -#define pcibios_scan_all_fns() 0 - -extern unsigned long pci_mem_start; -#define PCIBIOS_MIN_IO 0x1000 -#define PCIBIOS_MIN_MEM (pci_mem_start) - -void pcibios_config_init(void); -struct pci_bus * pcibios_scan_root(int bus); -extern int (*pci_config_read)(int seg, int bus, int dev, int fn, int reg, int len, u32 *value); -extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value); - -void pcibios_set_master(struct pci_dev *dev); -void pcibios_penalize_isa_irq(int irq); -struct irq_routing_table *pcibios_get_irq_routing_table(void); -int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); - -/* Dynamic DMA mapping stuff. - * i386 has everything mapped statically. - */ - -#include <linux/types.h> -#include <linux/slab.h> -#include <asm/scatterlist.h> -#include <linux/string.h> -#include <asm/io.h> - -struct pci_dev; - -/* The networking and block device layers use this boolean for bounce - * buffer decisions. - */ -#define PCI_DMA_BUS_IS_PHYS (0) - -/* Allocate and map kernel buffer using consistent mode DMA for a device. - * hwdev should be valid struct pci_dev pointer for PCI devices, - * NULL for PCI-like buses (ISA, EISA). - * Returns non-NULL cpu-view pointer to the buffer if successful and - * sets *dma_addrp to the pci side dma address as well, else *dma_addrp - * is undefined. - */ -extern void *pci_alloc_consistent(struct pci_dev *hwdev, size_t size, - dma_addr_t *dma_handle); - -/* Free and unmap a consistent DMA buffer. - * cpu_addr is what was returned from pci_alloc_consistent, - * size must be the same as what as passed into pci_alloc_consistent, - * and likewise dma_addr must be the same as what *dma_addrp was set to. 
- * - * References to the memory and mappings associated with cpu_addr/dma_addr - * past this call are illegal. - */ -extern void pci_free_consistent(struct pci_dev *hwdev, size_t size, - void *vaddr, dma_addr_t dma_handle); - -/* Map a single buffer of the indicated size for DMA in streaming mode. - * The 32-bit bus address to use is returned. - * - * Once the device is given the dma address, the device owns this memory - * until either pci_unmap_single or pci_dma_sync_single is performed. - */ -static inline dma_addr_t pci_map_single(struct pci_dev *hwdev, void *ptr, - size_t size, int direction) -{ - if (direction == PCI_DMA_NONE) - out_of_line_bug(); - flush_write_buffers(); - return virt_to_bus(ptr); -} - -/* Unmap a single streaming mode DMA translation. The dma_addr and size - * must match what was provided for in a previous pci_map_single call. All - * other usages are undefined. - * - * After this call, reads by the cpu to the buffer are guarenteed to see - * whatever the device wrote there. - */ -static inline void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr, - size_t size, int direction) -{ - if (direction == PCI_DMA_NONE) - out_of_line_bug(); - /* Nothing to do */ -} - -/* - * pci_{map,unmap}_single_page maps a kernel page to a dma_addr_t. identical - * to pci_map_single, but takes a struct page instead of a virtual address - */ -static inline dma_addr_t pci_map_page(struct pci_dev *hwdev, struct page *page, - unsigned long offset, size_t size, int direction) -{ - if (direction == PCI_DMA_NONE) - out_of_line_bug(); - - return page_to_bus(page) + offset; -} - -static inline void pci_unmap_page(struct pci_dev *hwdev, dma_addr_t dma_address, - size_t size, int direction) -{ - if (direction == PCI_DMA_NONE) - out_of_line_bug(); - /* Nothing to do */ -} - -/* pci_unmap_{page,single} is a nop so... */ -#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) -#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) -#define pci_unmap_addr(PTR, ADDR_NAME) (0) -#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0) -#define pci_unmap_len(PTR, LEN_NAME) (0) -#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0) - -/* Map a set of buffers described by scatterlist in streaming - * mode for DMA. This is the scather-gather version of the - * above pci_map_single interface. Here the scatter gather list - * elements are each tagged with the appropriate dma address - * and length. They are obtained via sg_dma_{address,length}(SG). - * - * NOTE: An implementation may be able to use a smaller number of - * DMA address/length pairs than there are SG table elements. - * (for example via virtual mapping capabilities) - * The routine returns the number of addr/length pairs actually - * used, at most nents. - * - * Device ownership issues as mentioned above for pci_map_single are - * the same here. - */ -static inline int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg, - int nents, int direction) -{ - int i; - - if (direction == PCI_DMA_NONE) - out_of_line_bug(); - - /* - * temporary 2.4 hack - */ - for (i = 0; i < nents; i++ ) { - if (sg[i].address && sg[i].page) - out_of_line_bug(); - else if (!sg[i].address && !sg[i].page) - out_of_line_bug(); - - if (sg[i].address) - sg[i].dma_address = virt_to_bus(sg[i].address); - else - sg[i].dma_address = page_to_bus(sg[i].page) + sg[i].offset; - } - - flush_write_buffers(); - return nents; -} - -/* Unmap a set of streaming mode DMA translations. - * Again, cpu read rules concerning calls here are the same as for - * pci_unmap_single() above. 
- */ -static inline void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg, - int nents, int direction) -{ - if (direction == PCI_DMA_NONE) - out_of_line_bug(); - /* Nothing to do */ -} - -/* Make physical memory consistent for a single - * streaming mode DMA translation after a transfer. - * - * If you perform a pci_map_single() but wish to interrogate the - * buffer using the cpu, yet do not wish to teardown the PCI dma - * mapping, you must call this function before doing so. At the - * next point you give the PCI dma address back to the card, the - * device again owns the buffer. - */ -static inline void pci_dma_sync_single(struct pci_dev *hwdev, - dma_addr_t dma_handle, - size_t size, int direction) -{ - if (direction == PCI_DMA_NONE) - out_of_line_bug(); - flush_write_buffers(); -} - -/* Make physical memory consistent for a set of streaming - * mode DMA translations after a transfer. - * - * The same as pci_dma_sync_single but for a scatter-gather list, - * same rules and usage. - */ -static inline void pci_dma_sync_sg(struct pci_dev *hwdev, - struct scatterlist *sg, - int nelems, int direction) -{ - if (direction == PCI_DMA_NONE) - out_of_line_bug(); - flush_write_buffers(); -} - -/* Return whether the given PCI device DMA address mask can - * be supported properly. For example, if your device can - * only drive the low 24-bits during PCI bus mastering, then - * you would pass 0x00ffffff as the mask to this function. - */ -static inline int pci_dma_supported(struct pci_dev *hwdev, u64 mask) -{ - /* - * we fall back to GFP_DMA when the mask isn't all 1s, - * so we can't guarantee allocations that must be - * within a tighter range than GFP_DMA.. - */ - if(mask < 0x00ffffff) - return 0; - - return 1; -} - -/* This is always fine. */ -#define pci_dac_dma_supported(pci_dev, mask) (1) - -static __inline__ dma64_addr_t -pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction) -{ - return ((dma64_addr_t) page_to_bus(page) + - (dma64_addr_t) offset); -} - -static __inline__ struct page * -pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr) -{ - return bus_to_page(dma_addr); -} - -static __inline__ unsigned long -pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr) -{ - return (dma_addr & ~PAGE_MASK); -} - -static __inline__ void -pci_dac_dma_sync_single(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction) -{ - flush_write_buffers(); -} - -/* These macros should be used after a pci_map_sg call has been done - * to get bus addresses of each of the SG entries and their lengths. - * You should only work with the number of sg entries pci_map_sg - * returns. - */ -#define sg_dma_address(sg) ((sg)->dma_address) -#define sg_dma_len(sg) ((sg)->length) - -/* Return the index of the PCI controller for device. 
*/ -static inline int pci_controller_num(struct pci_dev *dev) -{ - return 0; -} - -#define HAVE_PCI_MMAP -extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, - enum pci_mmap_state mmap_state, int write_combine); - -#endif /* __KERNEL__ */ - -#endif /* __i386_PCI_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/asm-xen/pgalloc.h --- a/linux-2.4.30-xen-sparse/include/asm-xen/pgalloc.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,280 +0,0 @@ -#ifndef _I386_PGALLOC_H -#define _I386_PGALLOC_H - -#include <linux/config.h> -#include <asm/processor.h> -#include <asm/fixmap.h> -#include <asm/hypervisor.h> -#include <linux/threads.h> - -/* - * Quick lists are aligned so that least significant bits of array pointer - * are all zero when list is empty, and all one when list is full. - */ -#define QUICKLIST_ENTRIES 256 -#define QUICKLIST_EMPTY(_l) !((unsigned long)(_l) & ((QUICKLIST_ENTRIES*4)-1)) -#define QUICKLIST_FULL(_l) QUICKLIST_EMPTY((_l)+1) -#define pgd_quicklist (current_cpu_data.pgd_quick) -#define pmd_quicklist (current_cpu_data.pmd_quick) -#define pte_quicklist (current_cpu_data.pte_quick) -#define pgtable_cache_size (current_cpu_data.pgtable_cache_sz) - -#define pmd_populate(mm, pmd, pte) \ - do { \ - set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); \ - } while ( 0 ) - -/* - * Allocate and free page tables. - */ - -#if defined (CONFIG_X86_PAE) - -#error "no PAE support as yet" - -/* - * We can't include <linux/slab.h> here, thus these uglinesses. - */ -struct kmem_cache_s; - -extern struct kmem_cache_s *pae_pgd_cachep; -extern void *kmem_cache_alloc(struct kmem_cache_s *, int); -extern void kmem_cache_free(struct kmem_cache_s *, void *); - - -static inline pgd_t *get_pgd_slow(void) -{ - int i; - pgd_t *pgd = kmem_cache_alloc(pae_pgd_cachep, GFP_KERNEL); - - if (pgd) { - for (i = 0; i < USER_PTRS_PER_PGD; i++) { - unsigned long pmd = __get_free_page(GFP_KERNEL); - if (!pmd) - goto out_oom; - clear_page(pmd); - set_pgd(pgd + i, __pgd(1 + __pa(pmd))); - } - memcpy(pgd + USER_PTRS_PER_PGD, - init_mm.pgd + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - } - return pgd; -out_oom: - for (i--; i >= 0; i--) - free_page((unsigned long)__va(pgd_val(pgd[i])-1)); - kmem_cache_free(pae_pgd_cachep, pgd); - return NULL; -} - -#else - -static inline pgd_t *get_pgd_slow(void) -{ - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL); - - if (pgd) { - memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); - memcpy(pgd + USER_PTRS_PER_PGD, - init_mm.pgd + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - __make_page_readonly(pgd); - xen_pgd_pin(__pa(pgd)); - } - return pgd; -} - -#endif /* CONFIG_X86_PAE */ - -static inline pgd_t *get_pgd_fast(void) -{ - unsigned long ret; - - if ( !QUICKLIST_EMPTY(pgd_quicklist) ) { - ret = *(--pgd_quicklist); - pgtable_cache_size--; - - } else - ret = (unsigned long)get_pgd_slow(); - return (pgd_t *)ret; -} - -static inline void free_pgd_slow(pgd_t *pgd) -{ -#if defined(CONFIG_X86_PAE) -#error - int i; - - for (i = 0; i < USER_PTRS_PER_PGD; i++) - free_page((unsigned long)__va(pgd_val(pgd[i])-1)); - kmem_cache_free(pae_pgd_cachep, pgd); -#else - xen_pgd_unpin(__pa(pgd)); - __make_page_writable(pgd); - free_page((unsigned long)pgd); -#endif -} - -static inline void free_pgd_fast(pgd_t *pgd) -{ - if ( !QUICKLIST_FULL(pgd_quicklist) ) { - *(pgd_quicklist++) = (unsigned long)pgd; - pgtable_cache_size++; - } else - free_pgd_slow(pgd); -} - -static inline 
pte_t *pte_alloc_one(struct mm_struct *mm, unsigned long address) -{ - pte_t *pte; - - pte = (pte_t *) __get_free_page(GFP_KERNEL); - if (pte) - { - clear_page(pte); - __make_page_readonly(pte); - xen_pte_pin(__pa(pte)); - } - return pte; - -} - -static inline pte_t *pte_alloc_one_fast(struct mm_struct *mm, - unsigned long address) -{ - unsigned long ret = 0; - if ( !QUICKLIST_EMPTY(pte_quicklist) ) { - ret = *(--pte_quicklist); - pgtable_cache_size--; - } - return (pte_t *)ret; -} - -static __inline__ void pte_free_slow(pte_t *pte) -{ - xen_pte_unpin(__pa(pte)); - __make_page_writable(pte); - free_page((unsigned long)pte); -} - -static inline void pte_free_fast(pte_t *pte) -{ - if ( !QUICKLIST_FULL(pte_quicklist) ) { - *(pte_quicklist++) = (unsigned long)pte; - pgtable_cache_size++; - } else - pte_free_slow(pte); -} - -#define pte_free(pte) pte_free_fast(pte) -#define pgd_free(pgd) free_pgd_fast(pgd) -#define pgd_alloc(mm) get_pgd_fast() - -/* - * allocating and freeing a pmd is trivial: the 1-entry pmd is - * inside the pgd, so has no extra memory associated with it. - * (In the PAE case we free the pmds as part of the pgd.) - */ - -#define pmd_alloc_one_fast(mm, addr) ({ BUG(); ((pmd_t *)1); }) -#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) -#define pmd_free_slow(x) do { } while (0) -#define pmd_free_fast(x) do { } while (0) -#define pmd_free(x) do { } while (0) -#define pgd_populate(mm, pmd, pte) BUG() - -extern int do_check_pgt_cache(int, int); - -/* - * TLB flushing: - * - * - flush_tlb() flushes the current mm struct TLBs - * - flush_tlb_all() flushes all processes TLBs - * - flush_tlb_mm(mm) flushes the specified mm context TLB's - * - flush_tlb_page(vma, vmaddr) flushes one page - * - flush_tlb_range(mm, start, end) flushes a range of pages - * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables - * - * ..but the i386 has somewhat limited tlb flushing capabilities, - * and page-granular flushes are available only on i486 and up. - */ - -#ifndef CONFIG_SMP - -#define flush_tlb() __flush_tlb() -#define flush_tlb_all() __flush_tlb_all() -#define local_flush_tlb() __flush_tlb() - -static inline void flush_tlb_mm(struct mm_struct *mm) -{ - if (mm == current->active_mm) xen_tlb_flush(); -} - -static inline void flush_tlb_page(struct vm_area_struct *vma, - unsigned long addr) -{ - if (vma->vm_mm == current->active_mm) xen_invlpg(addr); -} - -static inline void flush_tlb_range(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - if (mm == current->active_mm) xen_tlb_flush(); -} - -#else -#error no kernel SMP support yet... -#include <asm/smp.h> - -#define local_flush_tlb() \ - __flush_tlb() - -extern void flush_tlb_all(void); -extern void flush_tlb_current_task(void); -extern void flush_tlb_mm(struct mm_struct *); -extern void flush_tlb_page(struct vm_area_struct *, unsigned long); - -#define flush_tlb() flush_tlb_current_task() - -static inline void flush_tlb_range(struct mm_struct * mm, unsigned long start, unsigned long end) -{ - flush_tlb_mm(mm); -} - -#define TLBSTATE_OK 1 -#define TLBSTATE_LAZY 2 - -struct tlb_state -{ - struct mm_struct *active_mm; - int state; -} ____cacheline_aligned; -extern struct tlb_state cpu_tlbstate[NR_CPUS]; - -#endif /* CONFIG_SMP */ - -static inline void flush_tlb_pgtables(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - /* i386 does not keep any page table caches in TLB */ -} - -/* - * NB. The 'domid' field should be zero if mapping I/O space (non RAM). 
- * Otherwise it identifies the owner of the memory that is being mapped. - */ -extern int direct_remap_area_pages(struct mm_struct *mm, - unsigned long address, - unsigned long machine_addr, - unsigned long size, - pgprot_t prot, - domid_t domid); - -extern int __direct_remap_area_pages(struct mm_struct *mm, - unsigned long address, - unsigned long size, - mmu_update_t *v); - - - -#endif /* _I386_PGALLOC_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/asm-xen/pgtable-2level.h --- a/linux-2.4.30-xen-sparse/include/asm-xen/pgtable-2level.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,97 +0,0 @@ -#ifndef _I386_PGTABLE_2LEVEL_H -#define _I386_PGTABLE_2LEVEL_H - -/* - * traditional i386 two-level paging structure: - */ - -#define PGDIR_SHIFT 22 -#define PTRS_PER_PGD 1024 - -/* - * the i386 is two-level, so we don't really have any - * PMD directory physically. - */ -#define PMD_SHIFT 22 -#define PTRS_PER_PMD 1 - -#define PTRS_PER_PTE 1024 - -#define pte_ERROR(e) \ - printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low) -#define pmd_ERROR(e) \ - printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e)) -#define pgd_ERROR(e) \ - printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) - -/* - * The "pgd_xxx()" functions here are trivial for a folded two-level - * setup: the pgd is never bad, and a pmd always exists (as it's folded - * into the pgd entry) - */ -static inline int pgd_none(pgd_t pgd) { return 0; } -static inline int pgd_bad(pgd_t pgd) { return 0; } -static inline int pgd_present(pgd_t pgd) { return 1; } -#define pgd_clear(xp) do { } while (0) - -/* - * Certain architectures need to do special things when PTEs - * within a page table are directly modified. Thus, the following - * hook is made available. - */ -#define set_pte(pteptr, pteval) (*(pteptr) = pteval) -#define set_pte_atomic(pteptr, pteval) (*(pteptr) = pteval) - -/* - * (pmds are folded into pgds so this doesnt get actually called, - * but the define is needed for a generic inline function.) - */ -#define set_pmd(pmdptr, pmdval) xen_l2_entry_update((pmdptr), (pmdval)) -#define set_pgd(pgdptr, pgdval) ((void)0) - -#define pgd_page(pgd) \ -((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) - -static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) -{ - return (pmd_t *) dir; -} - -#define ptep_get_and_clear(xp) __pte_ma(xchg(&(xp)->pte_low, 0)) -#define pte_same(a, b) ((a).pte_low == (b).pte_low) - -/* - * We detect special mappings in one of two ways: - * 1. If the MFN is an I/O page then Xen will set the m2p entry - * to be outside our maximum possible pseudophys range. - * 2. If the MFN belongs to a different domain then we will certainly - * not have MFN in our p2m table. Conversely, if the page is ours, - * then we'll have p2m(m2p(MFN))==MFN. - * If we detect a special mapping then it doesn't have a 'struct page'. - * We force !VALID_PAGE() by returning an out-of-range pointer. - * - * NB. These checks require that, for any MFN that is not in our reservation, - * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if - * we are foreign-mapping the MFN, and the other domain as m2p(MFN) == PFN. - * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety. - * - * NB2. When deliberately mapping foreign pages into the p2m table, you *must* - * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we - * require. 
In all the cases we care about, the high bit gets shifted out - * (e.g., phys_to_machine()) so behaviour there is correct. - */ -#define INVALID_P2M_ENTRY (~0U) -#define FOREIGN_FRAME(_m) ((_m) | (1UL<<((sizeof(unsigned long)*8)-1))) -#define pte_page(_pte) \ -({ \ - unsigned long mfn = (_pte).pte_low >> PAGE_SHIFT; \ - unsigned long pfn = mfn_to_pfn(mfn); \ - if ( (pfn >= max_mapnr) || (pfn_to_mfn(pfn) != mfn) ) \ - pfn = max_mapnr; /* specia: force !VALID_PAGE() */ \ - &mem_map[pfn]; \ -}) - -#define pte_none(x) (!(x).pte_low) -#define __mk_pte(page_nr,pgprot) __pte(((page_nr) << PAGE_SHIFT) | pgprot_val(pgprot)) - -#endif /* _I386_PGTABLE_2LEVEL_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/asm-xen/pgtable.h --- a/linux-2.4.30-xen-sparse/include/asm-xen/pgtable.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,371 +0,0 @@ -#ifndef _I386_PGTABLE_H -#define _I386_PGTABLE_H - -#include <linux/config.h> - -/* - * The Linux memory management assumes a three-level page table setup. On - * the i386, we use that, but "fold" the mid level into the top-level page - * table, so that we physically have the same two-level page table as the - * i386 mmu expects. - * - * This file contains the functions and defines necessary to modify and use - * the i386 page table tree. - */ -#ifndef __ASSEMBLY__ -#include <asm/processor.h> -#include <asm/hypervisor.h> -#include <linux/threads.h> -#include <asm/fixmap.h> - -#ifndef _I386_BITOPS_H -#include <asm/bitops.h> -#endif - -#define swapper_pg_dir 0 -extern void paging_init(void); - -/* Caches aren't brain-dead on the intel. */ -#define flush_cache_all() do { } while (0) -#define flush_cache_mm(mm) do { } while (0) -#define flush_cache_range(mm, start, end) do { } while (0) -#define flush_cache_page(vma, vmaddr) do { } while (0) -#define flush_page_to_ram(page) do { } while (0) -#define flush_dcache_page(page) do { } while (0) -#define flush_icache_range(start, end) do { } while (0) -#define flush_icache_page(vma,pg) do { } while (0) -#define flush_icache_user_range(vma,pg,adr,len) do { } while (0) - -extern unsigned long pgkern_mask; - -#define __flush_tlb() xen_tlb_flush() -#define __flush_tlb_global() __flush_tlb() -#define __flush_tlb_all() __flush_tlb_global() -#define __flush_tlb_one(addr) xen_invlpg(addr) -#define __flush_tlb_single(addr) xen_invlpg(addr) - -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. - */ -extern unsigned long empty_zero_page[1024]; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) - -#endif /* !__ASSEMBLY__ */ - -/* - * The Linux x86 paging architecture is 'compile-time dual-mode', it - * implements both the traditional 2-level x86 page tables and the - * newer 3-level PAE-mode page tables. 
- */ -#ifndef __ASSEMBLY__ -#if CONFIG_X86_PAE -# include <asm/pgtable-3level.h> - -/* - * Need to initialise the X86 PAE caches - */ -extern void pgtable_cache_init(void); - -#else -# include <asm/pgtable-2level.h> - -/* - * No page table caches to initialise - */ -#define pgtable_cache_init() do { } while (0) - -#endif -#endif - -#define PMD_SIZE (1UL << PMD_SHIFT) -#define PMD_MASK (~(PMD_SIZE-1)) -#define PGDIR_SIZE (1UL << PGDIR_SHIFT) -#define PGDIR_MASK (~(PGDIR_SIZE-1)) - -#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) -#define FIRST_USER_PGD_NR 0 - -#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) -#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS) - -#define TWOLEVEL_PGDIR_SHIFT 22 -#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT) -#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS) - - -#ifndef __ASSEMBLY__ -/* 4MB is just a nice "safety zone". Also, we align to a fresh pde. */ -#define VMALLOC_OFFSET (4*1024*1024) -extern void * high_memory; -#define VMALLOC_START (((unsigned long) high_memory + 2*VMALLOC_OFFSET-1) & \ - ~(VMALLOC_OFFSET-1)) -#define VMALLOC_VMADDR(x) ((unsigned long)(x)) -#if CONFIG_HIGHMEM -# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE) -#else -# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE) -#endif - -#define _PAGE_BIT_PRESENT 0 -#define _PAGE_BIT_RW 1 -#define _PAGE_BIT_USER 2 -#define _PAGE_BIT_PWT 3 -#define _PAGE_BIT_PCD 4 -#define _PAGE_BIT_ACCESSED 5 -#define _PAGE_BIT_DIRTY 6 -#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page, Pentium+, if present.. */ -#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ - -#define _PAGE_PRESENT 0x001 -#define _PAGE_RW 0x002 -#define _PAGE_USER 0x004 -#define _PAGE_PWT 0x008 -#define _PAGE_PCD 0x010 -#define _PAGE_ACCESSED 0x020 -#define _PAGE_DIRTY 0x040 -#define _PAGE_PSE 0x080 /* 4 MB (or 2MB) page, Pentium+, if present.. */ -#define _PAGE_GLOBAL 0x100 /* Global TLB entry PPro+ */ - -#define _PAGE_PROTNONE 0x080 /* If not present */ - -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) -#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) -#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY) - -#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) -#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED) -#define PAGE_COPY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) -#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) - -#define __PAGE_KERNEL \ - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED) -#define __PAGE_KERNEL_NOCACHE \ - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED) -#define __PAGE_KERNEL_RO \ - (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED) - -#if 0 -#define MAKE_GLOBAL(x) __pgprot((x) | _PAGE_GLOBAL) -#else -#define MAKE_GLOBAL(x) __pgprot(x) -#endif - -#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL) -#define PAGE_KERNEL_RO MAKE_GLOBAL(__PAGE_KERNEL_RO) -#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE) - -/* - * The i386 can't do page protection for execute, and considers that - * the same are read. Also, write permissions imply read permissions. - * This is the closest we can get.. 
- */ -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY -#define __P100 PAGE_READONLY -#define __P101 PAGE_READONLY -#define __P110 PAGE_COPY -#define __P111 PAGE_COPY - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED -#define __S100 PAGE_READONLY -#define __S101 PAGE_READONLY -#define __S110 PAGE_SHARED -#define __S111 PAGE_SHARED - -#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE)) -#define pte_clear(xp) do { set_pte(xp, __pte(0)); } while (0) - -#define pmd_none(x) (!pmd_val(x)) -/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t. - can temporarily clear it. */ -#define pmd_present(x) (pmd_val(x)) -#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) -#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT)) - - -#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) - -/* - * The following only work if pte_present() is true. - * Undefined behaviour if not.. - */ -static inline int pte_read(pte_t pte) { return (pte).pte_low & _PAGE_USER; } -static inline int pte_exec(pte_t pte) { return (pte).pte_low & _PAGE_USER; } -static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; } -static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; } -static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; } - -static inline pte_t pte_rdprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_USER; return pte; } -static inline pte_t pte_exprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_USER; return pte; } -static inline pte_t pte_mkclean(pte_t pte) { (pte).pte_low &= ~_PAGE_DIRTY; return pte; } -static inline pte_t pte_mkold(pte_t pte) { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; } -static inline pte_t pte_wrprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_RW; return pte; } -static inline pte_t pte_mkread(pte_t pte) { (pte).pte_low |= _PAGE_USER; return pte; } -static inline pte_t pte_mkexec(pte_t pte) { (pte).pte_low |= _PAGE_USER; return pte; } -static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; } -static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; } -static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; } - -static inline int ptep_test_and_clear_dirty(pte_t *ptep) -{ - if (!pte_dirty(*ptep)) - return 0; - return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte_low); -} - -static inline int ptep_test_and_clear_young(pte_t *ptep) -{ - if (!pte_young(*ptep)) - return 0; - return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte_low); -} - -static inline void ptep_set_wrprotect(pte_t *ptep) -{ - if (pte_write(*ptep)) - clear_bit(_PAGE_BIT_RW, &ptep->pte_low); -} - -static inline void ptep_mkdirty(pte_t *ptep) -{ - if (!pte_dirty(*ptep)) - set_bit(_PAGE_BIT_DIRTY, &ptep->pte_low); -} - -/* - * Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. 
- */ - -#define mk_pte(page, pgprot) __mk_pte((page) - mem_map, (pgprot)) - -/* This takes a physical page address that is used by the remapping functions */ -#define mk_pte_phys(physpage, pgprot) __mk_pte((physpage) >> PAGE_SHIFT, pgprot) - -static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) -{ - pte.pte_low &= _PAGE_CHG_MASK; - pte.pte_low |= pgprot_val(newprot); - return pte; -} - -#define page_pte(page) page_pte_prot(page, __pgprot(0)) - -#define pmd_page(pmd) \ -((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) - -/* to find an entry in a page-table-directory. */ -#define pgd_index(address) ((address >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) - -#define __pgd_offset(address) pgd_index(address) - -#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address)) - -/* to find an entry in a kernel page-table-directory */ -#define pgd_offset_k(address) pgd_offset(&init_mm, address) - -#define __pmd_offset(address) \ - (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) - -/* Find an entry in the third-level page table.. */ -#define __pte_offset(address) \ - ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) -#define pte_offset(dir, address) ((pte_t *) pmd_page(*(dir)) + \ - __pte_offset(address)) - -/* - * The i386 doesn't have any external MMU info: the kernel page - * tables contain all the necessary information. - */ -#define update_mmu_cache(vma,address,pte) do { } while (0) - -/* Encode and de-code a swap entry */ -#define SWP_TYPE(x) (((x).val >> 1) & 0x3f) -#define SWP_OFFSET(x) ((x).val >> 8) -#define SWP_ENTRY(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) }) -#define pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low }) -#define swp_entry_to_pte(x) ((pte_t) { (x).val }) - -struct page; -int change_page_attr(struct page *, int, pgprot_t prot); - -static inline void __make_page_readonly(void *va) -{ - pgd_t *pgd = pgd_offset_k((unsigned long)va); - pmd_t *pmd = pmd_offset(pgd, (unsigned long)va); - pte_t *pte = pte_offset(pmd, (unsigned long)va); - set_pte(pte, pte_wrprotect(*pte)); -} - -static inline void __make_page_writable(void *va) -{ - pgd_t *pgd = pgd_offset_k((unsigned long)va); - pmd_t *pmd = pmd_offset(pgd, (unsigned long)va); - pte_t *pte = pte_offset(pmd, (unsigned long)va); - set_pte(pte, pte_mkwrite(*pte)); -} - -static inline void make_page_readonly(void *va) -{ - pgd_t *pgd = pgd_offset_k((unsigned long)va); - pmd_t *pmd = pmd_offset(pgd, (unsigned long)va); - pte_t *pte = pte_offset(pmd, (unsigned long)va); - set_pte(pte, pte_wrprotect(*pte)); - if ( (unsigned long)va >= VMALLOC_START ) - __make_page_readonly(machine_to_virt( - *(unsigned long *)pte&PAGE_MASK)); -} - -static inline void make_page_writable(void *va) -{ - pgd_t *pgd = pgd_offset_k((unsigned long)va); - pmd_t *pmd = pmd_offset(pgd, (unsigned long)va); - pte_t *pte = pte_offset(pmd, (unsigned long)va); - set_pte(pte, pte_mkwrite(*pte)); - if ( (unsigned long)va >= VMALLOC_START ) - __make_page_writable(machine_to_virt( - *(unsigned long *)pte&PAGE_MASK)); -} - -static inline void make_pages_readonly(void *va, unsigned int nr) -{ - while ( nr-- != 0 ) - { - make_page_readonly(va); - va = (void *)((unsigned long)va + PAGE_SIZE); - } -} - -static inline void make_pages_writable(void *va, unsigned int nr) -{ - while ( nr-- != 0 ) - { - make_page_writable(va); - va = (void *)((unsigned long)va + PAGE_SIZE); - } -} - -static inline unsigned long arbitrary_virt_to_machine(void *va) -{ - pgd_t *pgd = pgd_offset_k((unsigned long)va); - pmd_t *pmd = pmd_offset(pgd, (unsigned long)va); - pte_t *pte = 
pte_offset(pmd, (unsigned long)va); - unsigned long pa = (*(unsigned long *)pte) & PAGE_MASK; - return pa | ((unsigned long)va & (PAGE_SIZE-1)); -} - -#endif /* !__ASSEMBLY__ */ - -/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */ -#define PageSkip(page) (0) -#define kern_addr_valid(addr) (1) - -#define io_remap_page_range remap_page_range - -#endif /* _I386_PGTABLE_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/asm-xen/processor.h --- a/linux-2.4.30-xen-sparse/include/asm-xen/processor.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,483 +0,0 @@ -/* - * include/asm-i386/processor.h - * - * Copyright (C) 1994 Linus Torvalds - */ - -#ifndef __ASM_I386_PROCESSOR_H -#define __ASM_I386_PROCESSOR_H - -#include <asm/vm86.h> -#include <asm/math_emu.h> -#include <asm/segment.h> -#include <asm/page.h> -#include <asm/types.h> -#include <asm/sigcontext.h> -#include <asm/cpufeature.h> -#include <linux/cache.h> -#include <linux/config.h> -#include <linux/threads.h> - -/* - * Default implementation of macro that returns current - * instruction pointer ("program counter"). - */ -#define current_text_addr() ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; }) - -/* - * CPU type and hardware bug flags. Kept separately for each CPU. - * Members of this structure are referenced in head.S, so think twice - * before touching them. [mj] - */ - -struct cpuinfo_x86 { - __u8 x86; /* CPU family */ - __u8 x86_vendor; /* CPU vendor */ - __u8 x86_model; - __u8 x86_mask; - char wp_works_ok; /* It doesn't on 386's */ - char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */ - char hard_math; - char rfu; - int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */ - __u32 x86_capability[NCAPINTS]; - char x86_vendor_id[16]; - char x86_model_id[64]; - int x86_cache_size; /* in KB - valid for CPUS which support this - call */ - int fdiv_bug; - int f00f_bug; - int coma_bug; - unsigned long loops_per_jiffy; - unsigned long *pgd_quick; - unsigned long *pmd_quick; - unsigned long *pte_quick; - unsigned long pgtable_cache_sz; -} __attribute__((__aligned__(SMP_CACHE_BYTES))); - -#define X86_VENDOR_INTEL 0 -#define X86_VENDOR_CYRIX 1 -#define X86_VENDOR_AMD 2 -#define X86_VENDOR_UMC 3 -#define X86_VENDOR_NEXGEN 4 -#define X86_VENDOR_CENTAUR 5 -#define X86_VENDOR_RISE 6 -#define X86_VENDOR_TRANSMETA 7 -#define X86_VENDOR_NSC 8 -#define X86_VENDOR_SIS 9 -#define X86_VENDOR_UNKNOWN 0xff - -/* - * capabilities of CPUs - */ - -extern struct cpuinfo_x86 boot_cpu_data; -extern struct tss_struct init_tss[NR_CPUS]; - -#ifdef CONFIG_SMP -extern struct cpuinfo_x86 cpu_data[]; -#define current_cpu_data cpu_data[smp_processor_id()] -#else -#define cpu_data (&boot_cpu_data) -#define current_cpu_data boot_cpu_data -#endif - -extern char ignore_irq13; - -extern void identify_cpu(struct cpuinfo_x86 *); -extern void print_cpu_info(struct cpuinfo_x86 *); - -/* - * EFLAGS bits - */ -#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ -#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ -#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */ -#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ -#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ -#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ -#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */ -#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */ -#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */ -#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */ -#define X86_EFLAGS_NT 0x00004000 /* Nested Task */ 
-#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */ -#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */ -#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */ -#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */ -#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ -#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ - -/* - * Generic CPUID function - */ -static inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx) -{ - __asm__("cpuid" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (op)); -} - -/* - * CPUID functions returning a single datum - */ -static inline unsigned int cpuid_eax(unsigned int op) -{ - unsigned int eax; - - __asm__("cpuid" - : "=a" (eax) - : "0" (op) - : "bx", "cx", "dx"); - return eax; -} -static inline unsigned int cpuid_ebx(unsigned int op) -{ - unsigned int eax, ebx; - - __asm__("cpuid" - : "=a" (eax), "=b" (ebx) - : "0" (op) - : "cx", "dx" ); - return ebx; -} -static inline unsigned int cpuid_ecx(unsigned int op) -{ - unsigned int eax, ecx; - - __asm__("cpuid" - : "=a" (eax), "=c" (ecx) - : "0" (op) - : "bx", "dx" ); - return ecx; -} -static inline unsigned int cpuid_edx(unsigned int op) -{ - unsigned int eax, edx; - - __asm__("cpuid" - : "=a" (eax), "=d" (edx) - : "0" (op) - : "bx", "cx"); - return edx; -} - -/* - * Intel CPU features in CR4 - */ -#define X86_CR4_VME 0x0001 /* enable vm86 extensions */ -#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */ -#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */ -#define X86_CR4_DE 0x0008 /* enable debugging extensions */ -#define X86_CR4_PSE 0x0010 /* enable page size extensions */ -#define X86_CR4_PAE 0x0020 /* enable physical address extensions */ -#define X86_CR4_MCE 0x0040 /* Machine check enable */ -#define X86_CR4_PGE 0x0080 /* enable global pages */ -#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */ -#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */ -#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */ - -#define load_cr3(pgdir) \ - asm volatile("movl %0,%%cr3": :"r" (__pa(pgdir))); - -extern unsigned long mmu_cr4_features; - -#include <asm/hypervisor.h> - -static inline void set_in_cr4 (unsigned long mask) -{ - BUG(); -} - -static inline void clear_in_cr4 (unsigned long mask) -{ - BUG(); -} - -/* - * Cyrix CPU configuration register indexes - */ -#define CX86_CCR0 0xc0 -#define CX86_CCR1 0xc1 -#define CX86_CCR2 0xc2 -#define CX86_CCR3 0xc3 -#define CX86_CCR4 0xe8 -#define CX86_CCR5 0xe9 -#define CX86_CCR6 0xea -#define CX86_CCR7 0xeb -#define CX86_DIR0 0xfe -#define CX86_DIR1 0xff -#define CX86_ARR_BASE 0xc4 -#define CX86_RCR_BASE 0xdc - -/* - * Cyrix CPU indexed register access macros - */ - -#define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); }) - -#define setCx86(reg, data) do { \ - outb((reg), 0x22); \ - outb((data), 0x23); \ -} while (0) - -/* - * Bus types (default is ISA, but people can check others with these..) - */ -#ifdef CONFIG_EISA -extern int EISA_bus; -#else -#define EISA_bus (0) -#endif -extern int MCA_bus; - -/* from system description table in BIOS. Mostly for MCA use, but -others may find it useful. */ -extern unsigned int machine_id; -extern unsigned int machine_submodel_id; -extern unsigned int BIOS_revision; -extern unsigned int mca_pentium_flag; - -/* - * User space process size: 3GB (default). - */ -#define TASK_SIZE (PAGE_OFFSET) - -/* This decides where the kernel will search for a free chunk of vm - * space during mmap's. 
- */ -#define TASK_UNMAPPED_BASE (TASK_SIZE / 3) - -/* - * Size of io_bitmap in longwords: 32 is ports 0-0x3ff. - */ -#define IO_BITMAP_SIZE 32 -#define IO_BITMAP_BYTES (IO_BITMAP_SIZE * 4) -#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap) -#define INVALID_IO_BITMAP_OFFSET 0x8000 - -struct i387_fsave_struct { - long cwd; - long swd; - long twd; - long fip; - long fcs; - long foo; - long fos; - long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ - long status; /* software status information */ -}; - -struct i387_fxsave_struct { - unsigned short cwd; - unsigned short swd; - unsigned short twd; - unsigned short fop; - long fip; - long fcs; - long foo; - long fos; - long mxcsr; - long reserved; - long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ - long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ - long padding[56]; -} __attribute__ ((aligned (16))); - -struct i387_soft_struct { - long cwd; - long swd; - long twd; - long fip; - long fcs; - long foo; - long fos; - long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ - unsigned char ftop, changed, lookahead, no_update, rm, alimit; - struct info *info; - unsigned long entry_eip; -}; - -union i387_union { - struct i387_fsave_struct fsave; - struct i387_fxsave_struct fxsave; - struct i387_soft_struct soft; -}; - -typedef struct { - unsigned long seg; -} mm_segment_t; - -struct tss_struct { - unsigned short back_link,__blh; - unsigned long esp0; - unsigned short ss0,__ss0h; - unsigned long esp1; - unsigned short ss1,__ss1h; - unsigned long esp2; - unsigned short ss2,__ss2h; - unsigned long __cr3; - unsigned long eip; - unsigned long eflags; - unsigned long eax,ecx,edx,ebx; - unsigned long esp; - unsigned long ebp; - unsigned long esi; - unsigned long edi; - unsigned short es, __esh; - unsigned short cs, __csh; - unsigned short ss, __ssh; - unsigned short ds, __dsh; - unsigned short fs, __fsh; - unsigned short gs, __gsh; - unsigned short ldt, __ldth; - unsigned short trace, bitmap; - unsigned long io_bitmap[IO_BITMAP_SIZE+1]; - /* - * pads the TSS to be cacheline-aligned (size is 0x100) - */ - unsigned long __cacheline_filler[5]; -}; - -struct thread_struct { - unsigned long esp0; - unsigned long eip; - unsigned long esp; - unsigned long fs; - unsigned long gs; - unsigned int io_pl; -/* Hardware debugging registers */ - unsigned long debugreg[8]; /* %%db0-7 debug registers */ -/* fault info */ - unsigned long cr2, trap_no, error_code; -/* floating point info */ - union i387_union i387; -/* virtual 86 mode info */ - struct vm86_struct * vm86_info; - unsigned long screen_bitmap; - unsigned long v86flags, v86mask, saved_esp0; -}; - -#define INIT_THREAD { sizeof(init_stack) + (long) &init_stack, \ - 0, 0, 0, 0, 0, 0, {0}, 0, 0, 0, {{0}}, 0, 0, 0, 0, 0 } - -#define INIT_TSS { \ - 0,0, /* back_link, __blh */ \ - sizeof(init_stack) + (long) &init_stack, /* esp0 */ \ - __KERNEL_DS, 0, /* ss0 */ \ - 0,0,0,0,0,0, /* stack1, stack2 */ \ - 0, /* cr3 */ \ - 0,0, /* eip,eflags */ \ - 0,0,0,0, /* eax,ecx,edx,ebx */ \ - 0,0,0,0, /* esp,ebp,esi,edi */ \ - 0,0,0,0,0,0, /* es,cs,ss */ \ - 0,0,0,0,0,0, /* ds,fs,gs */ \ - 0,0, /* ldt */ \ - 0, INVALID_IO_BITMAP_OFFSET, /* tace, bitmap */ \ - {~0, } /* ioperm */ \ -} - -#define start_thread(regs, new_eip, new_esp) do { \ - __asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0)); \ - set_fs(USER_DS); \ - regs->xds = __USER_DS; \ - regs->xes = __USER_DS; \ - regs->xss = __USER_DS; \ - regs->xcs = __USER_CS; \ - regs->eip = new_eip; \ - regs->esp = new_esp; \ -} while 
(0) - -/* Forward declaration, a strange C thing */ -struct task_struct; -struct mm_struct; - -/* Free all resources held by a thread. */ -extern void release_thread(struct task_struct *); -/* - * create a kernel thread without removing it from tasklists - */ -extern int arch_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); - -/* Copy and release all segment info associated with a VM - * Unusable due to lack of error handling, use {init_new,destroy}_context - * instead. - */ -static inline void copy_segments(struct task_struct *p, struct mm_struct * mm) { } -static inline void release_segments(struct mm_struct * mm) { } - -/* - * Return saved PC of a blocked thread. - */ -static inline unsigned long thread_saved_pc(struct thread_struct *t) -{ - return ((unsigned long *)t->esp)[3]; -} - -unsigned long get_wchan(struct task_struct *p); -#define KSTK_EIP(tsk) (((unsigned long *)(4096+(unsigned long)(tsk)))[1019]) -#define KSTK_ESP(tsk) (((unsigned long *)(4096+(unsigned long)(tsk)))[1022]) - -#define THREAD_SIZE (2*PAGE_SIZE) -#define alloc_task_struct() ((struct task_struct *) __get_free_pages(GFP_KERNEL,1)) -#define free_task_struct(p) free_pages((unsigned long) (p), 1) -#define get_task_struct(tsk) atomic_inc(&virt_to_page(tsk)->count) - -#define init_task (init_task_union.task) -#define init_stack (init_task_union.stack) - -struct microcode { - unsigned int hdrver; - unsigned int rev; - unsigned int date; - unsigned int sig; - unsigned int cksum; - unsigned int ldrver; - unsigned int pf; - unsigned int reserved[5]; - unsigned int bits[500]; -}; - -/* '6' because it used to be for P6 only (but now covers Pentium 4 as well) */ -#define MICROCODE_IOCFREE _IO('6',0) - -/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ -static inline void rep_nop(void) -{ - __asm__ __volatile__("rep;nop" ::: "memory"); -} - -#define cpu_relax() rep_nop() - -/* Prefetch instructions for Pentium III and AMD Athlon */ -#if defined(CONFIG_MPENTIUMIII) || defined (CONFIG_MPENTIUM4) - -#define ARCH_HAS_PREFETCH -extern inline void prefetch(const void *x) -{ - __asm__ __volatile__ ("prefetchnta (%0)" : : "r"(x)); -} - -#elif CONFIG_X86_USE_3DNOW - -#define ARCH_HAS_PREFETCH -#define ARCH_HAS_PREFETCHW -#define ARCH_HAS_SPINLOCK_PREFETCH - -extern inline void prefetch(const void *x) -{ - __asm__ __volatile__ ("prefetch (%0)" : : "r"(x)); -} - -extern inline void prefetchw(const void *x) -{ - __asm__ __volatile__ ("prefetchw (%0)" : : "r"(x)); -} -#define spin_lock_prefetch(x) prefetchw(x) - -#endif - -#endif /* __ASM_I386_PROCESSOR_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/asm-xen/queues.h --- a/linux-2.4.30-xen-sparse/include/asm-xen/queues.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,20 +0,0 @@ - -/* Work-queue emulation over task queues. Pretty simple. 
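 *
 * Minimal usage sketch -- my_fn and my_work are hypothetical names,
 * shown only to illustrate how the macros below map onto 2.4 task
 * queues:
 *
 *     static void my_fn(void *arg);
 *     static DECLARE_WORK(my_work, my_fn, NULL);
 *     ...
 *     schedule_work(&my_work);     (expands to schedule_task())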
*/ - -#ifndef __QUEUES_H__ -#define __QUEUES_H__ - -#include <linux/version.h> -#include <linux/list.h> -#include <linux/tqueue.h> - -#define DECLARE_TQUEUE(_name, _fn, _arg) \ - struct tq_struct _name = { LIST_HEAD_INIT((_name).list), 0, _fn, _arg } -#define DECLARE_WORK(_name, _fn, _arg) DECLARE_TQUEUE(_name, _fn, _arg) - -#define work_struct tq_struct -#define INIT_WORK(_work, _fn, _arg) INIT_TQUEUE(_work, _fn, _arg) - -#define schedule_work(_w) schedule_task(_w) - -#endif /* __QUEUES_H__ */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/asm-xen/segment.h --- a/linux-2.4.30-xen-sparse/include/asm-xen/segment.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,15 +0,0 @@ -#ifndef _ASM_SEGMENT_H -#define _ASM_SEGMENT_H - -#ifndef __ASSEMBLY__ -#include <linux/types.h> -#endif -#include <asm-xen/xen-public/xen.h> - -#define __KERNEL_CS FLAT_RING1_CS -#define __KERNEL_DS FLAT_RING1_DS - -#define __USER_CS FLAT_RING3_CS -#define __USER_DS FLAT_RING3_DS - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/asm-xen/smp.h --- a/linux-2.4.30-xen-sparse/include/asm-xen/smp.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,102 +0,0 @@ -#ifndef __ASM_SMP_H -#define __ASM_SMP_H - -/* - * We need the APIC definitions automatically as part of 'smp.h' - */ -#ifndef __ASSEMBLY__ -#include <linux/config.h> -#include <linux/threads.h> -#include <linux/ptrace.h> -#endif - -#ifdef CONFIG_X86_LOCAL_APIC -#ifndef __ASSEMBLY__ -#include <asm/bitops.h> -#include <asm/mpspec.h> -#ifdef CONFIG_X86_IO_APIC -#include <asm/io_apic.h> -#endif -#include <asm/apic.h> -#endif -#endif - -#ifdef CONFIG_SMP -#ifndef __ASSEMBLY__ - -/* - * Private routines/data - */ - -extern void smp_alloc_memory(void); -extern unsigned long phys_cpu_present_map; -extern unsigned long cpu_online_map; -extern volatile unsigned long smp_invalidate_needed; -extern int pic_mode; -extern int smp_num_siblings; -extern int cpu_sibling_map[]; - -extern void smp_flush_tlb(void); -extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); -extern void fastcall smp_send_reschedule(int cpu); -extern void smp_invalidate_rcv(void); /* Process an NMI */ -extern void (*mtrr_hook) (void); -extern void zap_low_mappings (void); - -/* - * On x86 all CPUs are mapped 1:1 to the APIC space. - * This simplifies scheduling and IPI sending and - * compresses data structures. - */ -static inline int cpu_logical_map(int cpu) -{ - return cpu; -} -static inline int cpu_number_map(int cpu) -{ - return cpu; -} - -/* - * Some lowlevel functions might want to know about - * the real APIC ID <-> CPU # mapping. - */ -#define MAX_APICID 256 -extern volatile int cpu_to_physical_apicid[NR_CPUS]; -extern volatile int physical_apicid_to_cpu[MAX_APICID]; -extern volatile int cpu_to_logical_apicid[NR_CPUS]; -extern volatile int logical_apicid_to_cpu[MAX_APICID]; - -/* - * General functions that each host system must provide. - */ - -extern void smp_boot_cpus(void); -extern void smp_store_cpu_info(int id); /* Store per CPU info (like the initial udelay numbers */ - -/* - * This function is needed by all SMP systems. It must _always_ be valid - * from the initial startup. We map APIC_BASE very early in page_setup(), - * so this is correct in the x86 case. 
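 * (One caveat, as far as this tree is concerned: the CONFIG_SMP half
 * of this header appears to be dead code here, since asm-xen/system.h
 * -- deleted further down in this same patch -- carries an explicit
 * '#error no SMP'.)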
- */ - -#define smp_processor_id() (current->processor) - -#endif /* !__ASSEMBLY__ */ - -#define NO_PROC_ID 0xFF /* No processor magic marker */ - -/* - * This magic constant controls our willingness to transfer - * a process across CPUs. Such a transfer incurs misses on the L1 - * cache, and on a P6 or P5 with multiple L2 caches L2 hits. My - * gut feeling is this will vary by board in value. For a board - * with separate L2 cache it probably depends also on the RSS, and - * for a board with shared L2 cache it ought to decay fast as other - * processes are run. - */ - -#define PROC_CHANGE_PENALTY 15 /* Schedule penalty */ - -#endif -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/asm-xen/system.h --- a/linux-2.4.30-xen-sparse/include/asm-xen/system.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,424 +0,0 @@ -#ifndef __ASM_SYSTEM_H -#define __ASM_SYSTEM_H - -#include <linux/config.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/bitops.h> -#include <asm/synch_bitops.h> -#include <asm/segment.h> -#include <asm/hypervisor.h> -#include <asm/evtchn.h> - -#ifdef __KERNEL__ - -struct task_struct; -extern void FASTCALL(__switch_to(struct task_struct *prev, - struct task_struct *next)); - -#define prepare_to_switch() \ -do { \ - struct thread_struct *__t = ¤t->thread; \ - __asm__ __volatile__ ( "movl %%fs,%0" : "=m" (*(int *)&__t->fs) ); \ - __asm__ __volatile__ ( "movl %%gs,%0" : "=m" (*(int *)&__t->gs) ); \ -} while (0) -#define switch_to(prev,next,last) do { \ - asm volatile("pushl %%esi\n\t" \ - "pushl %%edi\n\t" \ - "pushl %%ebp\n\t" \ - "movl %%esp,%0\n\t" /* save ESP */ \ - "movl %3,%%esp\n\t" /* restore ESP */ \ - "movl $1f,%1\n\t" /* save EIP */ \ - "pushl %4\n\t" /* restore EIP */ \ - "jmp __switch_to\n" \ - "1:\t" \ - "popl %%ebp\n\t" \ - "popl %%edi\n\t" \ - "popl %%esi\n\t" \ - :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \ - "=b" (last) \ - :"m" (next->thread.esp),"m" (next->thread.eip), \ - "a" (prev), "d" (next), \ - "b" (prev)); \ -} while (0) - -#define _set_base(addr,base) do { unsigned long __pr; \ -__asm__ __volatile__ ("movw %%dx,%1\n\t" \ - "rorl $16,%%edx\n\t" \ - "movb %%dl,%2\n\t" \ - "movb %%dh,%3" \ - :"=&d" (__pr) \ - :"m" (*((addr)+2)), \ - "m" (*((addr)+4)), \ - "m" (*((addr)+7)), \ - "0" (base) \ - ); } while(0) - -#define _set_limit(addr,limit) do { unsigned long __lr; \ -__asm__ __volatile__ ("movw %%dx,%1\n\t" \ - "rorl $16,%%edx\n\t" \ - "movb %2,%%dh\n\t" \ - "andb $0xf0,%%dh\n\t" \ - "orb %%dh,%%dl\n\t" \ - "movb %%dl,%2" \ - :"=&d" (__lr) \ - :"m" (*(addr)), \ - "m" (*((addr)+6)), \ - "0" (limit) \ - ); } while(0) - -#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) ) -#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1)>>12 ) - -static inline unsigned long _get_base(char * addr) -{ - unsigned long __base; - __asm__("movb %3,%%dh\n\t" - "movb %2,%%dl\n\t" - "shll $16,%%edx\n\t" - "movw %1,%%dx" - :"=&d" (__base) - :"m" (*((addr)+2)), - "m" (*((addr)+4)), - "m" (*((addr)+7))); - return __base; -} - -#define get_base(ldt) _get_base( ((char *)&(ldt)) ) - -/* - * Load a segment. Fall back on loading the zero - * segment if something goes wrong.. 
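 * Typical use is reloading a user segment register on context switch,
 * e.g. loadsegment(fs, next->fs), with 'next' a hypothetical incoming
 * task: if the saved selector is stale, the .fixup entry below quietly
 * loads the register with the null selector (0) rather than letting
 * the faulting mov bring down the kernel.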
- */ -#define loadsegment(seg,value) \ - asm volatile("\n" \ - "1:\t" \ - "movl %0,%%" #seg "\n" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3:\t" \ - "pushl $0\n\t" \ - "popl %%" #seg "\n\t" \ - "jmp 2b\n" \ - ".previous\n" \ - ".section __ex_table,\"a\"\n\t" \ - ".align 4\n\t" \ - ".long 1b,3b\n" \ - ".previous" \ - : :"m" (*(unsigned int *)&(value))) - -/* NB. 'clts' is done for us by Xen during virtual trap. */ -#define clts() ((void)0) -#define stts() (HYPERVISOR_fpu_taskswitch(1)) - -#endif /* __KERNEL__ */ - -/** - * __ffs - find first bit in word. - * @word: The word to search - * - * Undefined if no bit exists, so code should check against 0 first. - * - * Taken from 2.6 for Xen. - */ -static inline unsigned long __ffs(unsigned long word) -{ - __asm__("bsfl %1,%0" - :"=r" (word) - :"rm" (word)); - return word; -} - -static inline unsigned long get_limit(unsigned long segment) -{ - unsigned long __limit; - __asm__("lsll %1,%0" - :"=r" (__limit):"r" (segment)); - return __limit+1; -} - -#define nop() __asm__ __volatile__ ("nop") - -#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr)))) - -#define tas(ptr) (xchg((ptr),1)) - -struct __xchg_dummy { unsigned long a[100]; }; -#define __xg(x) ((struct __xchg_dummy *)(x)) - - -/* - * The semantics of XCHGCMP8B are a bit strange, this is why - * there is a loop and the loading of %%eax and %%edx has to - * be inside. This inlines well in most cases, the cached - * cost is around ~38 cycles. (in the future we might want - * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that - * might have an implicit FPU-save as a cost, so it's not - * clear which path to go.) - * - * chmxchg8b must be used with the lock prefix here to allow - * the instruction to be executed atomically, see page 3-102 - * of the instruction set reference 24319102.pdf. We need - * the reader side to see the coherent 64bit value. - */ -static inline void __set_64bit (unsigned long long * ptr, - unsigned int low, unsigned int high) -{ - __asm__ __volatile__ ( - "\n1:\t" - "movl (%0), %%eax\n\t" - "movl 4(%0), %%edx\n\t" - "lock cmpxchg8b (%0)\n\t" - "jnz 1b" - : /* no outputs */ - : "D"(ptr), - "b"(low), - "c"(high) - : "ax","dx","memory"); -} - -static inline void __set_64bit_constant (unsigned long long *ptr, - unsigned long long value) -{ - __set_64bit(ptr,(unsigned int)(value), (unsigned int)((value)>>32ULL)); -} -#define ll_low(x) *(((unsigned int*)&(x))+0) -#define ll_high(x) *(((unsigned int*)&(x))+1) - -static inline void __set_64bit_var (unsigned long long *ptr, - unsigned long long value) -{ - __set_64bit(ptr,ll_low(value), ll_high(value)); -} - -#define set_64bit(ptr,value) \ -(__builtin_constant_p(value) ? \ - __set_64bit_constant(ptr, value) : \ - __set_64bit_var(ptr, value) ) - -#define _set_64bit(ptr,value) \ -(__builtin_constant_p(value) ? \ - __set_64bit(ptr, (unsigned int)(value), (unsigned int)((value)>>32ULL) ) : \ - __set_64bit(ptr, ll_low(value), ll_high(value)) ) - -/* - * Note: no "lock" prefix even on SMP: xchg always implies lock anyway - * Note 2: xchg has side effect, so that attribute volatile is necessary, - * but generally the primitive is invalid, *ptr is output argument. 
--ANK - */ -static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size) -{ - switch (size) { - case 1: - __asm__ __volatile__("xchgb %b0,%1" - :"=q" (x) - :"m" (*__xg(ptr)), "0" (x) - :"memory"); - break; - case 2: - __asm__ __volatile__("xchgw %w0,%1" - :"=r" (x) - :"m" (*__xg(ptr)), "0" (x) - :"memory"); - break; - case 4: - __asm__ __volatile__("xchgl %0,%1" - :"=r" (x) - :"m" (*__xg(ptr)), "0" (x) - :"memory"); - break; - } - return x; -} - -/* - * Atomic compare and exchange. Compare OLD with MEM, if identical, - * store NEW in MEM. Return the initial value in MEM. Success is - * indicated by comparing RETURN with OLD. - */ - -#ifdef CONFIG_X86_CMPXCHG -#define __HAVE_ARCH_CMPXCHG 1 - -static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, - unsigned long new, int size) -{ - unsigned long prev; - switch (size) { - case 1: - __asm__ __volatile__("lock cmpxchgb %b1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 2: - __asm__ __volatile__("lock cmpxchgw %w1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 4: - __asm__ __volatile__("lock cmpxchgl %1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - } - return old; -} - -#define cmpxchg(ptr,o,n)\ - ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\ - (unsigned long)(n),sizeof(*(ptr)))) - -#else -/* Compiling for a 386 proper. Is it worth implementing via cli/sti? */ -#endif - -/* - * Force strict CPU ordering. - * And yes, this is required on UP too when we're talking - * to devices. - * - * For now, "wmb()" doesn't actually do anything, as all - * Intel CPU's follow what Intel calls a *Processor Order*, - * in which all writes are seen in the program order even - * outside the CPU. - * - * I expect future Intel CPU's to have a weaker ordering, - * but I'd also expect them to finally get their act together - * and add some real memory barriers if so. - * - * Some non intel clones support out of order store. wmb() ceases to be a - * nop for these. - */ - -#define mb() __asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory") -#define rmb() mb() - -#ifdef CONFIG_X86_OOSTORE -#define wmb() __asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory") -#else -#define wmb() __asm__ __volatile__ ("": : :"memory") -#endif - -#ifdef CONFIG_SMP -#define smp_mb() mb() -#define smp_rmb() rmb() -#define smp_wmb() wmb() -#define set_mb(var, value) do { xchg(&var, value); } while (0) -#else -#define smp_mb() barrier() -#define smp_rmb() barrier() -#define smp_wmb() barrier() -#define set_mb(var, value) do { var = value; barrier(); } while (0) -#endif - -#define set_wmb(var, value) do { var = value; wmb(); } while (0) - -#define safe_halt() ((void)0) - -/* - * The use of 'barrier' in the following reflects their use as local-lock - * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following - * critical operations are executed. All critical operatiosn must complete - * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also - * includes these barriers, for example. 
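 *
 * The 'unmask then check' sequence in __sti() and friends below is the
 * paravirtual analogue of re-enabling interrupt delivery with 'sti':
 * evtchn_upcall_mask is cleared first, then evtchn_upcall_pending is
 * re-tested and force_evtchn_callback() invoked if it is set, so an
 * event that arrived while events were masked is replayed rather than
 * lost. Callers use the usual kernel idiom, e.g.:
 *
 *     unsigned long flags;
 *     local_irq_save(flags);      (mask events, remember old mask)
 *     ...critical section...
 *     local_irq_restore(flags);   (unmask + replay anything pending)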
- */ - -#define __cli() \ -do { \ - HYPERVISOR_shared_info->vcpu_data[0].evtchn_upcall_mask = 1; \ - barrier(); \ -} while (0) - -#define __sti() \ -do { \ - shared_info_t *_shared = HYPERVISOR_shared_info; \ - barrier(); \ - _shared->vcpu_data[0].evtchn_upcall_mask = 0; \ - barrier(); /* unmask then check (avoid races) */ \ - if ( unlikely(_shared->vcpu_data[0].evtchn_upcall_pending) ) \ - force_evtchn_callback(); \ -} while (0) - -#define __save_flags(x) \ -do { \ - (x) = HYPERVISOR_shared_info->vcpu_data[0].evtchn_upcall_mask; \ -} while (0) - -#define __restore_flags(x) \ -do { \ - shared_info_t *_shared = HYPERVISOR_shared_info; \ - barrier(); \ - if ( (_shared->vcpu_data[0].evtchn_upcall_mask = x) == 0 ) { \ - barrier(); /* unmask then check (avoid races) */ \ - if ( unlikely(_shared->vcpu_data[0].evtchn_upcall_pending) ) \ - force_evtchn_callback(); \ - } \ -} while (0) - -#define __save_and_cli(x) \ -do { \ - (x) = HYPERVISOR_shared_info->vcpu_data[0].evtchn_upcall_mask; \ - HYPERVISOR_shared_info->vcpu_data[0].evtchn_upcall_mask = 1; \ - barrier(); \ -} while (0) - -#define __save_and_sti(x) \ -do { \ - shared_info_t *_shared = HYPERVISOR_shared_info; \ - barrier(); \ - (x) = _shared->vcpu_data[0].evtchn_upcall_mask; \ - _shared->vcpu_data[0].evtchn_upcall_mask = 0; \ - barrier(); /* unmask then check (avoid races) */ \ - if ( unlikely(_shared->vcpu_data[0].evtchn_upcall_pending) ) \ - force_evtchn_callback(); \ -} while (0) - -#define local_irq_save(x) __save_and_cli(x) -#define local_irq_set(x) __save_and_sti(x) -#define local_irq_restore(x) __restore_flags(x) -#define local_irq_disable() __cli() -#define local_irq_enable() __sti() - - -#ifdef CONFIG_SMP -#error no SMP -extern void __global_cli(void); -extern void __global_sti(void); -extern unsigned long __global_save_flags(void); -extern void __global_restore_flags(unsigned long); -#define cli() __global_cli() -#define sti() __global_sti() -#define save_flags(x) ((x)=__global_save_flags()) -#define restore_flags(x) __global_restore_flags(x) -#define save_and_cli(x) do { save_flags(x); cli(); } while(0); -#define save_and_sti(x) do { save_flags(x); sti(); } while(0); - -#else - -#define cli() __cli() -#define sti() __sti() -#define save_flags(x) __save_flags(x) -#define restore_flags(x) __restore_flags(x) -#define save_and_cli(x) __save_and_cli(x) -#define save_and_sti(x) __save_and_sti(x) - -#endif - -/* - * disable hlt during certain critical i/o operations - */ -#define HAVE_DISABLE_HLT -void disable_hlt(void); -void enable_hlt(void); - -extern unsigned long dmi_broken; -extern int is_sony_vaio_laptop; - -#define BROKEN_ACPI_Sx 0x0001 -#define BROKEN_INIT_AFTER_S1 0x0002 -#define BROKEN_PNP_BIOS 0x0004 - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/asm-xen/vga.h --- a/linux-2.4.30-xen-sparse/include/asm-xen/vga.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,42 +0,0 @@ -/* - * Access to VGA videoram - * - * (c) 1998 Martin Mares <mj@xxxxxx> - */ - -#ifndef _LINUX_ASM_VGA_H_ -#define _LINUX_ASM_VGA_H_ - -#include <asm/io.h> - -extern unsigned char *vgacon_mmap; - -static unsigned long VGA_MAP_MEM(unsigned long x) -{ - if( vgacon_mmap == NULL ) - { - /* This is our first time in this function. This whole thing - is a rather grim hack. We know we're going to get asked - to map a 32KB region between 0xb0000 and 0xb8000 because - that's what VGAs are. We used the boot time permanent - fixed map region, and map it to machine pages. - */ - if( x != 0xb8000 ) - panic("Argghh! 
VGA Console is weird. 1:%08lx\n",x); - - vgacon_mmap = (unsigned char*) bt_ioremap( 0xa0000, 128*1024 ); - return (unsigned long) (vgacon_mmap+x-0xa0000); - } - else - { - if( x != 0xc0000 && x != 0xa0000 ) /* vidmem_end or charmap fonts */ - panic("Argghh! VGA Console is weird. 2:%08lx\n",x); - return (unsigned long) (vgacon_mmap+x-0xa0000); - } - return 0; -} - -static inline unsigned char vga_readb(unsigned char * x) { return (*(x)); } -static inline void vga_writeb(unsigned char x, unsigned char *y) { *(y) = (x); } - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/asm-xen/xor.h --- a/linux-2.4.30-xen-sparse/include/asm-xen/xor.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,879 +0,0 @@ -/* - * include/asm-i386/xor.h - * - * Optimized RAID-5 checksumming functions for MMX and SSE. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * You should have received a copy of the GNU General Public License - * (for example /usr/src/linux/COPYING); if not, write to the Free - * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -/* - * High-speed RAID5 checksumming functions utilizing MMX instructions. - * Copyright (C) 1998 Ingo Molnar. - */ - -#define FPU_SAVE \ - do { \ - if (!(current->flags & PF_USEDFPU)) \ - clts(); \ - __asm__ __volatile__ ("fsave %0; fwait": "=m"(fpu_save[0])); \ - } while (0) - -#define FPU_RESTORE \ - do { \ - __asm__ __volatile__ ("frstor %0": : "m"(fpu_save[0])); \ - if (!(current->flags & PF_USEDFPU)) \ - stts(); \ - } while (0) - -#define LD(x,y) " movq 8*("#x")(%1), %%mm"#y" ;\n" -#define ST(x,y) " movq %%mm"#y", 8*("#x")(%1) ;\n" -#define XO1(x,y) " pxor 8*("#x")(%2), %%mm"#y" ;\n" -#define XO2(x,y) " pxor 8*("#x")(%3), %%mm"#y" ;\n" -#define XO3(x,y) " pxor 8*("#x")(%4), %%mm"#y" ;\n" -#define XO4(x,y) " pxor 8*("#x")(%5), %%mm"#y" ;\n" - - -static void -xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) -{ - unsigned long lines = bytes >> 7; - char fpu_save[108]; - - FPU_SAVE; - - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - XO1(i,0) \ - ST(i,0) \ - XO1(i+1,1) \ - ST(i+1,1) \ - XO1(i+2,2) \ - ST(i+2,2) \ - XO1(i+3,3) \ - ST(i+3,3) - - " .align 32 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $128, %1 ;\n" - " addl $128, %2 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - : "+r" (lines), - "+r" (p1), "+r" (p2) - : - : "memory"); - - FPU_RESTORE; -} - -static void -xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3) -{ - unsigned long lines = bytes >> 7; - char fpu_save[108]; - - FPU_SAVE; - - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - XO2(i,0) \ - ST(i,0) \ - XO2(i+1,1) \ - ST(i+1,1) \ - XO2(i+2,2) \ - ST(i+2,2) \ - XO2(i+3,3) \ - ST(i+3,3) - - " .align 32 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $128, %1 ;\n" - " addl $128, %2 ;\n" - " addl $128, %3 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - : "+r" (lines), - "+r" (p1), "+r" (p2), "+r" (p3) - : - : "memory"); - - FPU_RESTORE; -} - -static void -xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, 
unsigned long *p4) -{ - unsigned long lines = bytes >> 7; - char fpu_save[108]; - - FPU_SAVE; - - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - XO2(i,0) \ - XO2(i+1,1) \ - XO2(i+2,2) \ - XO2(i+3,3) \ - XO3(i,0) \ - ST(i,0) \ - XO3(i+1,1) \ - ST(i+1,1) \ - XO3(i+2,2) \ - ST(i+2,2) \ - XO3(i+3,3) \ - ST(i+3,3) - - " .align 32 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $128, %1 ;\n" - " addl $128, %2 ;\n" - " addl $128, %3 ;\n" - " addl $128, %4 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - : "+r" (lines), - "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4) - : - : "memory"); - - FPU_RESTORE; -} - - -static void -xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4, unsigned long *p5) -{ - unsigned long lines = bytes >> 7; - char fpu_save[108]; - - FPU_SAVE; - - /* need to save/restore p4/p5 manually otherwise gcc's 10 argument - limit gets exceeded (+ counts as two arguments) */ - __asm__ __volatile__ ( - " pushl %4\n" - " pushl %5\n" -#undef BLOCK -#define BLOCK(i) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - XO2(i,0) \ - XO2(i+1,1) \ - XO2(i+2,2) \ - XO2(i+3,3) \ - XO3(i,0) \ - XO3(i+1,1) \ - XO3(i+2,2) \ - XO3(i+3,3) \ - XO4(i,0) \ - ST(i,0) \ - XO4(i+1,1) \ - ST(i+1,1) \ - XO4(i+2,2) \ - ST(i+2,2) \ - XO4(i+3,3) \ - ST(i+3,3) - - " .align 32 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $128, %1 ;\n" - " addl $128, %2 ;\n" - " addl $128, %3 ;\n" - " addl $128, %4 ;\n" - " addl $128, %5 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - " popl %5\n" - " popl %4\n" - : "+r" (lines), - "+r" (p1), "+r" (p2), "+r" (p3) - : "r" (p4), "r" (p5) - : "memory"); - - FPU_RESTORE; -} - -#undef LD -#undef XO1 -#undef XO2 -#undef XO3 -#undef XO4 -#undef ST -#undef BLOCK - -static void -xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) -{ - unsigned long lines = bytes >> 6; - char fpu_save[108]; - - FPU_SAVE; - - __asm__ __volatile__ ( - " .align 32 ;\n" - " 1: ;\n" - " movq (%1), %%mm0 ;\n" - " movq 8(%1), %%mm1 ;\n" - " pxor (%2), %%mm0 ;\n" - " movq 16(%1), %%mm2 ;\n" - " movq %%mm0, (%1) ;\n" - " pxor 8(%2), %%mm1 ;\n" - " movq 24(%1), %%mm3 ;\n" - " movq %%mm1, 8(%1) ;\n" - " pxor 16(%2), %%mm2 ;\n" - " movq 32(%1), %%mm4 ;\n" - " movq %%mm2, 16(%1) ;\n" - " pxor 24(%2), %%mm3 ;\n" - " movq 40(%1), %%mm5 ;\n" - " movq %%mm3, 24(%1) ;\n" - " pxor 32(%2), %%mm4 ;\n" - " movq 48(%1), %%mm6 ;\n" - " movq %%mm4, 32(%1) ;\n" - " pxor 40(%2), %%mm5 ;\n" - " movq 56(%1), %%mm7 ;\n" - " movq %%mm5, 40(%1) ;\n" - " pxor 48(%2), %%mm6 ;\n" - " pxor 56(%2), %%mm7 ;\n" - " movq %%mm6, 48(%1) ;\n" - " movq %%mm7, 56(%1) ;\n" - - " addl $64, %1 ;\n" - " addl $64, %2 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - : "+r" (lines), - "+r" (p1), "+r" (p2) - : - : "memory"); - - FPU_RESTORE; -} - -static void -xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3) -{ - unsigned long lines = bytes >> 6; - char fpu_save[108]; - - FPU_SAVE; - - __asm__ __volatile__ ( - " .align 32,0x90 ;\n" - " 1: ;\n" - " movq (%1), %%mm0 ;\n" - " movq 8(%1), %%mm1 ;\n" - " pxor (%2), %%mm0 ;\n" - " movq 16(%1), %%mm2 ;\n" - " pxor 8(%2), %%mm1 ;\n" - " pxor (%3), %%mm0 ;\n" - " pxor 16(%2), %%mm2 ;\n" - " movq %%mm0, (%1) ;\n" - " pxor 8(%3), %%mm1 ;\n" - " pxor 16(%3), %%mm2 ;\n" - " movq 24(%1), 
%%mm3 ;\n" - " movq %%mm1, 8(%1) ;\n" - " movq 32(%1), %%mm4 ;\n" - " movq 40(%1), %%mm5 ;\n" - " pxor 24(%2), %%mm3 ;\n" - " movq %%mm2, 16(%1) ;\n" - " pxor 32(%2), %%mm4 ;\n" - " pxor 24(%3), %%mm3 ;\n" - " pxor 40(%2), %%mm5 ;\n" - " movq %%mm3, 24(%1) ;\n" - " pxor 32(%3), %%mm4 ;\n" - " pxor 40(%3), %%mm5 ;\n" - " movq 48(%1), %%mm6 ;\n" - " movq %%mm4, 32(%1) ;\n" - " movq 56(%1), %%mm7 ;\n" - " pxor 48(%2), %%mm6 ;\n" - " movq %%mm5, 40(%1) ;\n" - " pxor 56(%2), %%mm7 ;\n" - " pxor 48(%3), %%mm6 ;\n" - " pxor 56(%3), %%mm7 ;\n" - " movq %%mm6, 48(%1) ;\n" - " movq %%mm7, 56(%1) ;\n" - - " addl $64, %1 ;\n" - " addl $64, %2 ;\n" - " addl $64, %3 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - : "+r" (lines), - "+r" (p1), "+r" (p2), "+r" (p3) - : - : "memory" ); - - FPU_RESTORE; -} - -static void -xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4) -{ - unsigned long lines = bytes >> 6; - char fpu_save[108]; - - FPU_SAVE; - - __asm__ __volatile__ ( - " .align 32,0x90 ;\n" - " 1: ;\n" - " movq (%1), %%mm0 ;\n" - " movq 8(%1), %%mm1 ;\n" - " pxor (%2), %%mm0 ;\n" - " movq 16(%1), %%mm2 ;\n" - " pxor 8(%2), %%mm1 ;\n" - " pxor (%3), %%mm0 ;\n" - " pxor 16(%2), %%mm2 ;\n" - " pxor 8(%3), %%mm1 ;\n" - " pxor (%4), %%mm0 ;\n" - " movq 24(%1), %%mm3 ;\n" - " pxor 16(%3), %%mm2 ;\n" - " pxor 8(%4), %%mm1 ;\n" - " movq %%mm0, (%1) ;\n" - " movq 32(%1), %%mm4 ;\n" - " pxor 24(%2), %%mm3 ;\n" - " pxor 16(%4), %%mm2 ;\n" - " movq %%mm1, 8(%1) ;\n" - " movq 40(%1), %%mm5 ;\n" - " pxor 32(%2), %%mm4 ;\n" - " pxor 24(%3), %%mm3 ;\n" - " movq %%mm2, 16(%1) ;\n" - " pxor 40(%2), %%mm5 ;\n" - " pxor 32(%3), %%mm4 ;\n" - " pxor 24(%4), %%mm3 ;\n" - " movq %%mm3, 24(%1) ;\n" - " movq 56(%1), %%mm7 ;\n" - " movq 48(%1), %%mm6 ;\n" - " pxor 40(%3), %%mm5 ;\n" - " pxor 32(%4), %%mm4 ;\n" - " pxor 48(%2), %%mm6 ;\n" - " movq %%mm4, 32(%1) ;\n" - " pxor 56(%2), %%mm7 ;\n" - " pxor 40(%4), %%mm5 ;\n" - " pxor 48(%3), %%mm6 ;\n" - " pxor 56(%3), %%mm7 ;\n" - " movq %%mm5, 40(%1) ;\n" - " pxor 48(%4), %%mm6 ;\n" - " pxor 56(%4), %%mm7 ;\n" - " movq %%mm6, 48(%1) ;\n" - " movq %%mm7, 56(%1) ;\n" - - " addl $64, %1 ;\n" - " addl $64, %2 ;\n" - " addl $64, %3 ;\n" - " addl $64, %4 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - : "+r" (lines), - "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4) - : - : "memory"); - - FPU_RESTORE; -} - -static void -xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4, unsigned long *p5) -{ - unsigned long lines = bytes >> 6; - char fpu_save[108]; - - FPU_SAVE; - - /* need to save p4/p5 manually to not exceed gcc's 10 argument limit */ - __asm__ __volatile__ ( - " pushl %4\n" - " pushl %5\n" - " .align 32,0x90 ;\n" - " 1: ;\n" - " movq (%1), %%mm0 ;\n" - " movq 8(%1), %%mm1 ;\n" - " pxor (%2), %%mm0 ;\n" - " pxor 8(%2), %%mm1 ;\n" - " movq 16(%1), %%mm2 ;\n" - " pxor (%3), %%mm0 ;\n" - " pxor 8(%3), %%mm1 ;\n" - " pxor 16(%2), %%mm2 ;\n" - " pxor (%4), %%mm0 ;\n" - " pxor 8(%4), %%mm1 ;\n" - " pxor 16(%3), %%mm2 ;\n" - " movq 24(%1), %%mm3 ;\n" - " pxor (%5), %%mm0 ;\n" - " pxor 8(%5), %%mm1 ;\n" - " movq %%mm0, (%1) ;\n" - " pxor 16(%4), %%mm2 ;\n" - " pxor 24(%2), %%mm3 ;\n" - " movq %%mm1, 8(%1) ;\n" - " pxor 16(%5), %%mm2 ;\n" - " pxor 24(%3), %%mm3 ;\n" - " movq 32(%1), %%mm4 ;\n" - " movq %%mm2, 16(%1) ;\n" - " pxor 24(%4), %%mm3 ;\n" - " pxor 32(%2), %%mm4 ;\n" - " movq 40(%1), %%mm5 ;\n" - " pxor 24(%5), %%mm3 ;\n" - " pxor 32(%3), %%mm4 ;\n" - " pxor 40(%2), %%mm5 ;\n" - " movq 
%%mm3, 24(%1) ;\n" - " pxor 32(%4), %%mm4 ;\n" - " pxor 40(%3), %%mm5 ;\n" - " movq 48(%1), %%mm6 ;\n" - " movq 56(%1), %%mm7 ;\n" - " pxor 32(%5), %%mm4 ;\n" - " pxor 40(%4), %%mm5 ;\n" - " pxor 48(%2), %%mm6 ;\n" - " pxor 56(%2), %%mm7 ;\n" - " movq %%mm4, 32(%1) ;\n" - " pxor 48(%3), %%mm6 ;\n" - " pxor 56(%3), %%mm7 ;\n" - " pxor 40(%5), %%mm5 ;\n" - " pxor 48(%4), %%mm6 ;\n" - " pxor 56(%4), %%mm7 ;\n" - " movq %%mm5, 40(%1) ;\n" - " pxor 48(%5), %%mm6 ;\n" - " pxor 56(%5), %%mm7 ;\n" - " movq %%mm6, 48(%1) ;\n" - " movq %%mm7, 56(%1) ;\n" - - " addl $64, %1 ;\n" - " addl $64, %2 ;\n" - " addl $64, %3 ;\n" - " addl $64, %4 ;\n" - " addl $64, %5 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - " popl %5\n" - " popl %4\n" - : "+g" (lines), - "+r" (p1), "+r" (p2), "+r" (p3) - : "r" (p4), "r" (p5) - : "memory"); - - FPU_RESTORE; -} - -static struct xor_block_template xor_block_pII_mmx = { - name: "pII_mmx", - do_2: xor_pII_mmx_2, - do_3: xor_pII_mmx_3, - do_4: xor_pII_mmx_4, - do_5: xor_pII_mmx_5, -}; - -static struct xor_block_template xor_block_p5_mmx = { - name: "p5_mmx", - do_2: xor_p5_mmx_2, - do_3: xor_p5_mmx_3, - do_4: xor_p5_mmx_4, - do_5: xor_p5_mmx_5, -}; - -#undef FPU_SAVE -#undef FPU_RESTORE - -/* - * Cache avoiding checksumming functions utilizing KNI instructions - * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo) - */ - -#define XMMS_SAVE \ - if (!(current->flags & PF_USEDFPU)) \ - clts(); \ - __asm__ __volatile__ ( \ - "movups %%xmm0,(%1) ;\n\t" \ - "movups %%xmm1,0x10(%1) ;\n\t" \ - "movups %%xmm2,0x20(%1) ;\n\t" \ - "movups %%xmm3,0x30(%1) ;\n\t" \ - : "=&r" (cr0) \ - : "r" (xmm_save) \ - : "memory") - -#define XMMS_RESTORE \ - __asm__ __volatile__ ( \ - "sfence ;\n\t" \ - "movups (%1),%%xmm0 ;\n\t" \ - "movups 0x10(%1),%%xmm1 ;\n\t" \ - "movups 0x20(%1),%%xmm2 ;\n\t" \ - "movups 0x30(%1),%%xmm3 ;\n\t" \ - : \ - : "r" (cr0), "r" (xmm_save) \ - : "memory"); \ - if (!(current->flags & PF_USEDFPU)) \ - stts() - -#define ALIGN16 __attribute__((aligned(16))) - -#define OFFS(x) "16*("#x")" -#define PF_OFFS(x) "256+16*("#x")" -#define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n" -#define LD(x,y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n" -#define ST(x,y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n" -#define PF1(x) " prefetchnta "PF_OFFS(x)"(%2) ;\n" -#define PF2(x) " prefetchnta "PF_OFFS(x)"(%3) ;\n" -#define PF3(x) " prefetchnta "PF_OFFS(x)"(%4) ;\n" -#define PF4(x) " prefetchnta "PF_OFFS(x)"(%5) ;\n" -#define PF5(x) " prefetchnta "PF_OFFS(x)"(%6) ;\n" -#define XO1(x,y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n" -#define XO2(x,y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n" -#define XO3(x,y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n" -#define XO4(x,y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n" -#define XO5(x,y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n" - - -static void -xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) -{ - unsigned long lines = bytes >> 8; - char xmm_save[16*4] ALIGN16; - int cr0; - - XMMS_SAVE; - - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - LD(i,0) \ - LD(i+1,1) \ - PF1(i) \ - PF1(i+2) \ - LD(i+2,2) \ - LD(i+3,3) \ - PF0(i+4) \ - PF0(i+6) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - ST(i,0) \ - ST(i+1,1) \ - ST(i+2,2) \ - ST(i+3,3) \ - - - PF0(0) - PF0(2) - - " .align 32 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $256, %1 ;\n" - " addl $256, %2 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - : "+r" (lines), - "+r" (p1), "+r" (p2) - : - : "memory"); - - XMMS_RESTORE; -} - -static void -xor_sse_3(unsigned long bytes, 
unsigned long *p1, unsigned long *p2, - unsigned long *p3) -{ - unsigned long lines = bytes >> 8; - char xmm_save[16*4] ALIGN16; - int cr0; - - XMMS_SAVE; - - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - PF1(i) \ - PF1(i+2) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - PF2(i) \ - PF2(i+2) \ - PF0(i+4) \ - PF0(i+6) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - XO2(i,0) \ - XO2(i+1,1) \ - XO2(i+2,2) \ - XO2(i+3,3) \ - ST(i,0) \ - ST(i+1,1) \ - ST(i+2,2) \ - ST(i+3,3) \ - - - PF0(0) - PF0(2) - - " .align 32 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $256, %1 ;\n" - " addl $256, %2 ;\n" - " addl $256, %3 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - : "+r" (lines), - "+r" (p1), "+r"(p2), "+r"(p3) - : - : "memory" ); - - XMMS_RESTORE; -} - -static void -xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4) -{ - unsigned long lines = bytes >> 8; - char xmm_save[16*4] ALIGN16; - int cr0; - - XMMS_SAVE; - - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - PF1(i) \ - PF1(i+2) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - PF2(i) \ - PF2(i+2) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - PF3(i) \ - PF3(i+2) \ - PF0(i+4) \ - PF0(i+6) \ - XO2(i,0) \ - XO2(i+1,1) \ - XO2(i+2,2) \ - XO2(i+3,3) \ - XO3(i,0) \ - XO3(i+1,1) \ - XO3(i+2,2) \ - XO3(i+3,3) \ - ST(i,0) \ - ST(i+1,1) \ - ST(i+2,2) \ - ST(i+3,3) \ - - - PF0(0) - PF0(2) - - " .align 32 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $256, %1 ;\n" - " addl $256, %2 ;\n" - " addl $256, %3 ;\n" - " addl $256, %4 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - : "+r" (lines), - "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4) - : - : "memory" ); - - XMMS_RESTORE; -} - -static void -xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4, unsigned long *p5) -{ - unsigned long lines = bytes >> 8; - char xmm_save[16*4] ALIGN16; - int cr0; - - XMMS_SAVE; - - /* need to save p4/p5 manually to not exceed gcc's 10 argument limit */ - __asm__ __volatile__ ( - " pushl %4\n" - " pushl %5\n" -#undef BLOCK -#define BLOCK(i) \ - PF1(i) \ - PF1(i+2) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - PF2(i) \ - PF2(i+2) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - PF3(i) \ - PF3(i+2) \ - XO2(i,0) \ - XO2(i+1,1) \ - XO2(i+2,2) \ - XO2(i+3,3) \ - PF4(i) \ - PF4(i+2) \ - PF0(i+4) \ - PF0(i+6) \ - XO3(i,0) \ - XO3(i+1,1) \ - XO3(i+2,2) \ - XO3(i+3,3) \ - XO4(i,0) \ - XO4(i+1,1) \ - XO4(i+2,2) \ - XO4(i+3,3) \ - ST(i,0) \ - ST(i+1,1) \ - ST(i+2,2) \ - ST(i+3,3) \ - - - PF0(0) - PF0(2) - - " .align 32 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addl $256, %1 ;\n" - " addl $256, %2 ;\n" - " addl $256, %3 ;\n" - " addl $256, %4 ;\n" - " addl $256, %5 ;\n" - " decl %0 ;\n" - " jnz 1b ;\n" - " popl %5\n" - " popl %4\n" - : "+r" (lines), - "+r" (p1), "+r" (p2), "+r" (p3) - : "r" (p4), "r" (p5) - : "memory"); - - XMMS_RESTORE; -} - -static struct xor_block_template xor_block_pIII_sse = { - name: "pIII_sse", - do_2: xor_sse_2, - do_3: xor_sse_3, - do_4: xor_sse_4, - do_5: xor_sse_5, -}; - -/* Also try the generic routines. 
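 *
 * (Selection logic, briefly: at boot the RAID code times each template
 * registered by XOR_TRY_TEMPLATES via xor_speed() and keeps the
 * fastest, but XOR_SELECT_TEMPLATE below overrides the result whenever
 * cpu_has_xmm is set, since the SSE routines' non-temporal stores can
 * write around the L2 cache.)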
*/ -#include <asm-generic/xor.h> - -#undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ - do { \ - xor_speed(&xor_block_8regs); \ - xor_speed(&xor_block_32regs); \ - if (cpu_has_xmm) \ - xor_speed(&xor_block_pIII_sse); \ - if (md_cpu_has_mmx()) { \ - xor_speed(&xor_block_pII_mmx); \ - xor_speed(&xor_block_p5_mmx); \ - } \ - } while (0) - -/* We force the use of the SSE xor block because it can write around L2. - We may also be able to load into the L1 only depending on how the cpu - deals with a load to a line that is being prefetched. */ -#define XOR_SELECT_TEMPLATE(FASTEST) \ - (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST) diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/linux/blk.h --- a/linux-2.4.30-xen-sparse/include/linux/blk.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,409 +0,0 @@ -#ifndef _BLK_H -#define _BLK_H - -#include <linux/blkdev.h> -#include <linux/locks.h> -#include <linux/config.h> -#include <linux/spinlock.h> - -/* - * Spinlock for protecting the request queue which - * is mucked around with in interrupts on potentially - * multiple CPU's.. - */ -extern spinlock_t io_request_lock; - -/* - * Initialization functions. - */ -extern int isp16_init(void); -extern int cdu31a_init(void); -extern int acsi_init(void); -extern int mcd_init(void); -extern int mcdx_init(void); -extern int sbpcd_init(void); -extern int aztcd_init(void); -extern int sony535_init(void); -extern int gscd_init(void); -extern int cm206_init(void); -extern int optcd_init(void); -extern int sjcd_init(void); -extern int cdi_init(void); -extern int hd_init(void); -extern int ide_init(void); -extern int xd_init(void); -extern int mfm_init(void); -extern int loop_init(void); -extern int md_init(void); -extern int ap_init(void); -extern int ddv_init(void); -extern int z2_init(void); -extern int swim3_init(void); -extern int swimiop_init(void); -extern int amiga_floppy_init(void); -extern int atari_floppy_init(void); -extern int ez_init(void); -extern int bpcd_init(void); -extern int ps2esdi_init(void); -extern int jsfd_init(void); -extern int viodasd_init(void); -extern int viocd_init(void); - -#if defined(CONFIG_ARCH_S390) -extern int dasd_init(void); -extern int xpram_init(void); -extern int tapeblock_init(void); -#endif /* CONFIG_ARCH_S390 */ - -#if defined(CONFIG_XEN) -extern int xlblk_init(void); -#endif /* CONFIG_XEN */ - -extern void set_device_ro(kdev_t dev,int flag); -void add_blkdev_randomness(int major); - -extern int floppy_init(void); -extern int rd_doload; /* 1 = load ramdisk, 0 = don't load */ -extern int rd_prompt; /* 1 = prompt for ramdisk, 0 = don't prompt */ -extern int rd_image_start; /* starting block # of image */ - -#ifdef CONFIG_BLK_DEV_INITRD - -#define INITRD_MINOR 250 /* shouldn't collide with /dev/ram* too soon ... */ - -extern unsigned long initrd_start,initrd_end; -extern int initrd_below_start_ok; /* 1 if it is not an error if initrd_start < memory_start */ -void initrd_init(void); - -#endif - - -/* - * end_request() and friends. Must be called with the request queue spinlock - * acquired. All functions called within end_request() _must_be_ atomic. - * - * Several drivers define their own end_request and call - * end_that_request_first() and end_that_request_last() - * for parts of the original function. This prevents - * code duplication in drivers. 
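 *
 * A driver-private completion routine built from these halves usually
 * follows this sketch ('req' and the "mydev" name are placeholders):
 *
 *     if (!end_that_request_first(req, uptodate, "mydev")) {
 *         blkdev_dequeue_request(req);
 *         end_that_request_last(req);
 *     }
 *
 * i.e. only once end_that_request_first() reports the request fully
 * complete is it unlinked from the queue and finally released.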
- */ - -static inline void blkdev_dequeue_request(struct request * req) -{ - list_del(&req->queue); -} - -int end_that_request_first(struct request *req, int uptodate, char *name); -void end_that_request_last(struct request *req); - -#if defined(MAJOR_NR) || defined(IDE_DRIVER) - -#undef DEVICE_ON -#undef DEVICE_OFF - -/* - * Add entries as needed. - */ - -#ifdef IDE_DRIVER - -#define DEVICE_NR(device) (MINOR(device) >> PARTN_BITS) -#define DEVICE_NAME "ide" - -#elif (MAJOR_NR == RAMDISK_MAJOR) - -/* ram disk */ -#define DEVICE_NAME "ramdisk" -#define DEVICE_NR(device) (MINOR(device)) -#define DEVICE_NO_RANDOM - -#elif (MAJOR_NR == Z2RAM_MAJOR) - -/* Zorro II Ram */ -#define DEVICE_NAME "Z2RAM" -#define DEVICE_REQUEST do_z2_request -#define DEVICE_NR(device) (MINOR(device)) - -#elif (MAJOR_NR == FLOPPY_MAJOR) - -static void floppy_off(unsigned int nr); - -#define DEVICE_NAME "floppy" -#define DEVICE_INTR do_floppy -#define DEVICE_REQUEST do_fd_request -#define DEVICE_NR(device) ( (MINOR(device) & 3) | ((MINOR(device) & 0x80 ) >> 5 )) -#define DEVICE_OFF(device) floppy_off(DEVICE_NR(device)) - -#elif (MAJOR_NR == HD_MAJOR) - -/* Hard disk: timeout is 6 seconds. */ -#define DEVICE_NAME "hard disk" -#define DEVICE_INTR do_hd -#define TIMEOUT_VALUE (6*HZ) -#define DEVICE_REQUEST do_hd_request -#define DEVICE_NR(device) (MINOR(device)>>6) - -#elif (SCSI_DISK_MAJOR(MAJOR_NR)) - -#define DEVICE_NAME "scsidisk" -#define TIMEOUT_VALUE (2*HZ) -#define DEVICE_NR(device) (((MAJOR(device) & SD_MAJOR_MASK) << (8 - 4)) + (MINOR(device) >> 4)) - -/* Kludge to use the same number for both char and block major numbers */ -#elif (MAJOR_NR == MD_MAJOR) && defined(MD_DRIVER) - -#define DEVICE_NAME "Multiple devices driver" -#define DEVICE_REQUEST do_md_request -#define DEVICE_NR(device) (MINOR(device)) - -#elif (MAJOR_NR == SCSI_TAPE_MAJOR) - -#define DEVICE_NAME "scsitape" -#define DEVICE_INTR do_st -#define DEVICE_NR(device) (MINOR(device) & 0x7f) - -#elif (MAJOR_NR == OSST_MAJOR) - -#define DEVICE_NAME "onstream" -#define DEVICE_INTR do_osst -#define DEVICE_NR(device) (MINOR(device) & 0x7f) -#define DEVICE_ON(device) -#define DEVICE_OFF(device) - -#elif (MAJOR_NR == SCSI_CDROM_MAJOR) - -#define DEVICE_NAME "CD-ROM" -#define DEVICE_NR(device) (MINOR(device)) - -#elif (MAJOR_NR == XT_DISK_MAJOR) - -#define DEVICE_NAME "xt disk" -#define DEVICE_REQUEST do_xd_request -#define DEVICE_NR(device) (MINOR(device) >> 6) - -#elif (MAJOR_NR == PS2ESDI_MAJOR) - -#define DEVICE_NAME "PS/2 ESDI" -#define DEVICE_REQUEST do_ps2esdi_request -#define DEVICE_NR(device) (MINOR(device) >> 6) - -#elif (MAJOR_NR == CDU31A_CDROM_MAJOR) - -#define DEVICE_NAME "CDU31A" -#define DEVICE_REQUEST do_cdu31a_request -#define DEVICE_NR(device) (MINOR(device)) - -#elif (MAJOR_NR == ACSI_MAJOR) && (defined(CONFIG_ATARI_ACSI) || defined(CONFIG_ATARI_ACSI_MODULE)) - -#define DEVICE_NAME "ACSI" -#define DEVICE_INTR do_acsi -#define DEVICE_REQUEST do_acsi_request -#define DEVICE_NR(device) (MINOR(device) >> 4) - -#elif (MAJOR_NR == MITSUMI_CDROM_MAJOR) - -#define DEVICE_NAME "Mitsumi CD-ROM" -/* #define DEVICE_INTR do_mcd */ -#define DEVICE_REQUEST do_mcd_request -#define DEVICE_NR(device) (MINOR(device)) - -#elif (MAJOR_NR == MITSUMI_X_CDROM_MAJOR) - -#define DEVICE_NAME "Mitsumi CD-ROM" -/* #define DEVICE_INTR do_mcdx */ -#define DEVICE_REQUEST do_mcdx_request -#define DEVICE_NR(device) (MINOR(device)) - -#elif (MAJOR_NR == MATSUSHITA_CDROM_MAJOR) - -#define DEVICE_NAME "Matsushita CD-ROM controller #1" -#define DEVICE_REQUEST do_sbpcd_request 
-#define DEVICE_NR(device) (MINOR(device)) - -#elif (MAJOR_NR == MATSUSHITA_CDROM2_MAJOR) - -#define DEVICE_NAME "Matsushita CD-ROM controller #2" -#define DEVICE_REQUEST do_sbpcd2_request -#define DEVICE_NR(device) (MINOR(device)) - -#elif (MAJOR_NR == MATSUSHITA_CDROM3_MAJOR) - -#define DEVICE_NAME "Matsushita CD-ROM controller #3" -#define DEVICE_REQUEST do_sbpcd3_request -#define DEVICE_NR(device) (MINOR(device)) - -#elif (MAJOR_NR == MATSUSHITA_CDROM4_MAJOR) - -#define DEVICE_NAME "Matsushita CD-ROM controller #4" -#define DEVICE_REQUEST do_sbpcd4_request -#define DEVICE_NR(device) (MINOR(device)) - -#elif (MAJOR_NR == AZTECH_CDROM_MAJOR) - -#define DEVICE_NAME "Aztech CD-ROM" -#define DEVICE_REQUEST do_aztcd_request -#define DEVICE_NR(device) (MINOR(device)) - -#elif (MAJOR_NR == CDU535_CDROM_MAJOR) - -#define DEVICE_NAME "SONY-CDU535" -#define DEVICE_INTR do_cdu535 -#define DEVICE_REQUEST do_cdu535_request -#define DEVICE_NR(device) (MINOR(device)) - -#elif (MAJOR_NR == GOLDSTAR_CDROM_MAJOR) - -#define DEVICE_NAME "Goldstar R420" -#define DEVICE_REQUEST do_gscd_request -#define DEVICE_NR(device) (MINOR(device)) - -#elif (MAJOR_NR == CM206_CDROM_MAJOR) -#define DEVICE_NAME "Philips/LMS CD-ROM cm206" -#define DEVICE_REQUEST do_cm206_request -#define DEVICE_NR(device) (MINOR(device)) - -#elif (MAJOR_NR == OPTICS_CDROM_MAJOR) - -#define DEVICE_NAME "DOLPHIN 8000AT CD-ROM" -#define DEVICE_REQUEST do_optcd_request -#define DEVICE_NR(device) (MINOR(device)) - -#elif (MAJOR_NR == SANYO_CDROM_MAJOR) - -#define DEVICE_NAME "Sanyo H94A CD-ROM" -#define DEVICE_REQUEST do_sjcd_request -#define DEVICE_NR(device) (MINOR(device)) - -#elif (MAJOR_NR == APBLOCK_MAJOR) - -#define DEVICE_NAME "apblock" -#define DEVICE_REQUEST ap_request -#define DEVICE_NR(device) (MINOR(device)) - -#elif (MAJOR_NR == DDV_MAJOR) - -#define DEVICE_NAME "ddv" -#define DEVICE_REQUEST ddv_request -#define DEVICE_NR(device) (MINOR(device)>>PARTN_BITS) - -#elif (MAJOR_NR == MFM_ACORN_MAJOR) - -#define DEVICE_NAME "mfm disk" -#define DEVICE_INTR do_mfm -#define DEVICE_REQUEST do_mfm_request -#define DEVICE_NR(device) (MINOR(device) >> 6) - -#elif (MAJOR_NR == NBD_MAJOR) - -#define DEVICE_NAME "nbd" -#define DEVICE_REQUEST do_nbd_request -#define DEVICE_NR(device) (MINOR(device)) - -#elif (MAJOR_NR == MDISK_MAJOR) - -#define DEVICE_NAME "mdisk" -#define DEVICE_REQUEST mdisk_request -#define DEVICE_NR(device) (MINOR(device)) - -#elif (MAJOR_NR == DASD_MAJOR) - -#define DEVICE_NAME "dasd" -#define DEVICE_REQUEST do_dasd_request -#define DEVICE_NR(device) (MINOR(device) >> PARTN_BITS) - -#elif (MAJOR_NR == I2O_MAJOR) - -#define DEVICE_NAME "I2O block" -#define DEVICE_REQUEST i2ob_request -#define DEVICE_NR(device) (MINOR(device)>>4) - -#elif (MAJOR_NR == COMPAQ_SMART2_MAJOR) - -#define DEVICE_NAME "ida" -#define TIMEOUT_VALUE (25*HZ) -#define DEVICE_REQUEST do_ida_request -#define DEVICE_NR(device) (MINOR(device) >> 4) - -#endif /* MAJOR_NR == whatever */ - -/* provide DEVICE_xxx defaults, if not explicitly defined - * above in the MAJOR_NR==xxx if-elif tree */ -#ifndef DEVICE_ON -#define DEVICE_ON(device) do {} while (0) -#endif -#ifndef DEVICE_OFF -#define DEVICE_OFF(device) do {} while (0) -#endif - -#if (MAJOR_NR != SCSI_TAPE_MAJOR) && (MAJOR_NR != OSST_MAJOR) -#if !defined(IDE_DRIVER) - -#ifndef CURRENT -#define CURRENT blkdev_entry_next_request(&blk_dev[MAJOR_NR].request_queue.queue_head) -#endif -#ifndef QUEUE_EMPTY -#define QUEUE_EMPTY list_empty(&blk_dev[MAJOR_NR].request_queue.queue_head) -#endif - -#ifndef DEVICE_NAME 
-#define DEVICE_NAME "unknown" -#endif - -#define CURRENT_DEV DEVICE_NR(CURRENT->rq_dev) - -#ifdef DEVICE_INTR -static void (*DEVICE_INTR)(void) = NULL; -#endif - -#define SET_INTR(x) (DEVICE_INTR = (x)) - -#ifdef DEVICE_REQUEST -static void (DEVICE_REQUEST)(request_queue_t *); -#endif - -#ifdef DEVICE_INTR -#define CLEAR_INTR SET_INTR(NULL) -#else -#define CLEAR_INTR -#endif - -#define INIT_REQUEST \ - if (QUEUE_EMPTY) {\ - CLEAR_INTR; \ - return; \ - } \ - if (MAJOR(CURRENT->rq_dev) != MAJOR_NR) \ - panic(DEVICE_NAME ": request list destroyed"); \ - if (CURRENT->bh) { \ - if (!buffer_locked(CURRENT->bh)) \ - panic(DEVICE_NAME ": block not locked"); \ - } - -#endif /* !defined(IDE_DRIVER) */ - - -#ifndef LOCAL_END_REQUEST /* If we have our own end_request, we do not want to include this mess */ - -#if ! SCSI_BLK_MAJOR(MAJOR_NR) && (MAJOR_NR != COMPAQ_SMART2_MAJOR) - -static inline void end_request(int uptodate) { - struct request *req = CURRENT; - - if (end_that_request_first(req, uptodate, DEVICE_NAME)) - return; - -#ifndef DEVICE_NO_RANDOM - add_blkdev_randomness(MAJOR(req->rq_dev)); -#endif - DEVICE_OFF(req->rq_dev); - blkdev_dequeue_request(req); - end_that_request_last(req); -} - -#endif /* ! SCSI_BLK_MAJOR(MAJOR_NR) */ -#endif /* LOCAL_END_REQUEST */ - -#endif /* (MAJOR_NR != SCSI_TAPE_MAJOR) */ -#endif /* defined(MAJOR_NR) || defined(IDE_DRIVER) */ - -#endif /* _BLK_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/linux/highmem.h --- a/linux-2.4.30-xen-sparse/include/linux/highmem.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,137 +0,0 @@ -#ifndef _LINUX_HIGHMEM_H -#define _LINUX_HIGHMEM_H - -#include <linux/config.h> -#include <asm/pgalloc.h> - -#ifdef CONFIG_HIGHMEM - -extern struct page *highmem_start_page; - -#include <asm/highmem.h> - -/* declarations for linux/mm/highmem.c */ -unsigned int nr_free_highpages(void); -void kmap_flush_unused(void); - -extern struct buffer_head *create_bounce(int rw, struct buffer_head * bh_orig); - -static inline char *bh_kmap(struct buffer_head *bh) -{ - return kmap(bh->b_page) + bh_offset(bh); -} - -static inline void bh_kunmap(struct buffer_head *bh) -{ - kunmap(bh->b_page); -} - -/* - * remember to add offset! and never ever reenable interrupts between a - * bh_kmap_irq and bh_kunmap_irq!! 
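 *
 * Sketch of the intended calling pattern (bh and flags belong to the
 * caller):
 *
 *     unsigned long flags;
 *     char *data = bh_kmap_irq(bh, &flags);
 *     ...use data, with events still disabled...
 *     bh_kunmap_irq(data, &flags);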
- */ -static inline char *bh_kmap_irq(struct buffer_head *bh, unsigned long *flags) -{ - unsigned long addr; - - __save_flags(*flags); - - /* - * could be low - */ - if (!PageHighMem(bh->b_page)) - return bh->b_data; - - /* - * it's a highmem page - */ - __cli(); - addr = (unsigned long) kmap_atomic(bh->b_page, KM_BH_IRQ); - - if (addr & ~PAGE_MASK) - BUG(); - - return (char *) addr + bh_offset(bh); -} - -static inline void bh_kunmap_irq(char *buffer, unsigned long *flags) -{ - unsigned long ptr = (unsigned long) buffer & PAGE_MASK; - - kunmap_atomic((void *) ptr, KM_BH_IRQ); - __restore_flags(*flags); -} - -#else /* CONFIG_HIGHMEM */ - -static inline unsigned int nr_free_highpages(void) { return 0; } -static inline void kmap_flush_unused(void) { } - -static inline void *kmap(struct page *page) { return page_address(page); } - -#define kunmap(page) do { } while (0) - -#define kmap_atomic(page,idx) kmap(page) -#define kunmap_atomic(page,idx) kunmap(page) - -#define bh_kmap(bh) ((bh)->b_data) -#define bh_kunmap(bh) do { } while (0) -#define kmap_nonblock(page) kmap(page) -#define bh_kmap_irq(bh, flags) ((bh)->b_data) -#define bh_kunmap_irq(bh, flags) do { *(flags) = 0; } while (0) - -#endif /* CONFIG_HIGHMEM */ - -/* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ -static inline void clear_user_highpage(struct page *page, unsigned long vaddr) -{ - void *addr = kmap_atomic(page, KM_USER0); - clear_user_page(addr, vaddr); - kunmap_atomic(addr, KM_USER0); -} - -static inline void clear_highpage(struct page *page) -{ - clear_page(kmap(page)); - kunmap(page); -} - -/* - * Same but also flushes aliased cache contents to RAM. - */ -static inline void memclear_highpage_flush(struct page *page, unsigned int offset, unsigned int size) -{ - char *kaddr; - - if (offset + size > PAGE_SIZE) - out_of_line_bug(); - kaddr = kmap(page); - memset(kaddr + offset, 0, size); - flush_dcache_page(page); - flush_page_to_ram(page); - kunmap(page); -} - -static inline void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr) -{ - char *vfrom, *vto; - - vfrom = kmap_atomic(from, KM_USER0); - vto = kmap_atomic(to, KM_USER1); - copy_user_page(vto, vfrom, vaddr); - kunmap_atomic(vfrom, KM_USER0); - kunmap_atomic(vto, KM_USER1); -} - -static inline void copy_highpage(struct page *to, struct page *from) -{ - char *vfrom, *vto; - - vfrom = kmap_atomic(from, KM_USER0); - vto = kmap_atomic(to, KM_USER1); - copy_page(vto, vfrom); - kunmap_atomic(vfrom, KM_USER0); - kunmap_atomic(vto, KM_USER1); -} - -#endif /* _LINUX_HIGHMEM_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/linux/irq.h --- a/linux-2.4.30-xen-sparse/include/linux/irq.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,80 +0,0 @@ -#ifndef __irq_h -#define __irq_h - -/* - * Please do not include this file in generic code. There is currently - * no requirement for any architecture to implement anything held - * within this file. - * - * Thanks. --rmk - */ - -#include <linux/config.h> - -#if !defined(CONFIG_ARCH_S390) - -#include <linux/cache.h> -#include <linux/spinlock.h> - -#include <asm/irq.h> -#include <asm/ptrace.h> - -/* - * IRQ line status. - */ -#define IRQ_INPROGRESS 1 /* IRQ handler active - do not enter! */ -#define IRQ_DISABLED 2 /* IRQ disabled - do not enter! 
*/ -#define IRQ_PENDING 4 /* IRQ pending - replay on enable */ -#define IRQ_REPLAY 8 /* IRQ has been replayed but not acked yet */ -#define IRQ_AUTODETECT 16 /* IRQ is being autodetected */ -#define IRQ_WAITING 32 /* IRQ not yet seen - for autodetection */ -#define IRQ_LEVEL 64 /* IRQ level triggered */ -#define IRQ_MASKED 128 /* IRQ masked - shouldn't be seen again */ -#define IRQ_PER_CPU 256 /* IRQ is per CPU */ - -/* - * Interrupt controller descriptor. This is all we need - * to describe about the low-level hardware. - */ -struct hw_interrupt_type { - const char * typename; - unsigned int (*startup)(unsigned int irq); - void (*shutdown)(unsigned int irq); - void (*enable)(unsigned int irq); - void (*disable)(unsigned int irq); - void (*ack)(unsigned int irq); - void (*end)(unsigned int irq); - void (*set_affinity)(unsigned int irq, unsigned long mask); -}; - -typedef struct hw_interrupt_type hw_irq_controller; - -/* - * This is the "IRQ descriptor", which contains various information - * about the irq, including what kind of hardware handling it has, - * whether it is disabled etc etc. - * - * Pad this out to 32 bytes for cache and indexing reasons. - */ -typedef struct { - unsigned int status; /* IRQ status */ - hw_irq_controller *handler; - struct irqaction *action; /* IRQ action list */ - unsigned int depth; /* nested irq disables */ - spinlock_t lock; -} ____cacheline_aligned irq_desc_t; - -extern irq_desc_t irq_desc [NR_IRQS]; - -#include <asm/hw_irq.h> /* the arch dependent stuff */ - -extern int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *); -extern int setup_irq(unsigned int , struct irqaction * ); -extern int teardown_irq(unsigned int , struct irqaction * ); - -extern hw_irq_controller no_irq_type; /* needed in every arch ? */ -extern void no_action(int cpl, void *dev_id, struct pt_regs *regs); - -#endif - -#endif /* __irq_h */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/linux/mm.h --- a/linux-2.4.30-xen-sparse/include/linux/mm.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,727 +0,0 @@ -#ifndef _LINUX_MM_H -#define _LINUX_MM_H - -#include <linux/sched.h> -#include <linux/errno.h> - -#ifdef __KERNEL__ - -#include <linux/config.h> -#include <linux/string.h> -#include <linux/list.h> -#include <linux/mmzone.h> -#include <linux/swap.h> -#include <linux/rbtree.h> - -extern unsigned long max_mapnr; -extern unsigned long num_physpages; -extern unsigned long num_mappedpages; -extern void * high_memory; -extern int page_cluster; -/* The inactive_clean lists are per zone. */ -extern struct list_head active_list; -extern struct list_head inactive_list; - -#include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/atomic.h> - -/* - * Linux kernel virtual memory manager primitives. - * The idea being to have a "virtual" mm in the same way - * we have a virtual fs - giving a cleaner interface to the - * mm details, and allowing different kinds of memory mappings - * (from shared memory to executable loading to arbitrary - * mmap() functions). - */ - -/* - * This struct defines a memory VMM memory area. There is one of these - * per VM-area/task. A VM area is any part of the process virtual memory - * space that has a special rule for the page-fault handlers (ie a shared - * library, the executable area etc). - */ -struct vm_area_struct { - struct mm_struct * vm_mm; /* The address space we belong to. */ - unsigned long vm_start; /* Our start address within vm_mm. 
*/ - unsigned long vm_end; /* The first byte after our end address - within vm_mm. */ - - /* linked list of VM areas per task, sorted by address */ - struct vm_area_struct *vm_next; - - pgprot_t vm_page_prot; /* Access permissions of this VMA. */ - unsigned long vm_flags; /* Flags, listed below. */ - - rb_node_t vm_rb; - - /* - * For areas with an address space and backing store, - * one of the address_space->i_mmap{,shared} lists, - * for shm areas, the list of attaches, otherwise unused. - */ - struct vm_area_struct *vm_next_share; - struct vm_area_struct **vm_pprev_share; - - /* Function pointers to deal with this struct. */ - struct vm_operations_struct * vm_ops; - - /* Information about our backing store: */ - unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE - units, *not* PAGE_CACHE_SIZE */ - struct file * vm_file; /* File we map to (can be NULL). */ - unsigned long vm_raend; /* XXX: put full readahead info here. */ - void * vm_private_data; /* was vm_pte (shared mem) */ -}; - -/* - * vm_flags.. - */ -#define VM_READ 0x00000001 /* currently active flags */ -#define VM_WRITE 0x00000002 -#define VM_EXEC 0x00000004 -#define VM_SHARED 0x00000008 - -#define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */ -#define VM_MAYWRITE 0x00000020 -#define VM_MAYEXEC 0x00000040 -#define VM_MAYSHARE 0x00000080 - -#define VM_GROWSDOWN 0x00000100 /* general info on the segment */ -#define VM_GROWSUP 0x00000200 -#define VM_SHM 0x00000400 /* shared memory area, don't swap out */ -#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ - -#define VM_EXECUTABLE 0x00001000 -#define VM_LOCKED 0x00002000 -#define VM_IO 0x00004000 /* Memory mapped I/O or similar */ - - /* Used by sys_madvise() */ -#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ -#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ - -#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ -#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ -#define VM_RESERVED 0x00080000 /* Don't unmap it from swap_out */ - -#ifndef VM_STACK_FLAGS -#define VM_STACK_FLAGS 0x00000177 -#endif - -#define VM_READHINTMASK (VM_SEQ_READ | VM_RAND_READ) -#define VM_ClearReadHint(v) (v)->vm_flags &= ~VM_READHINTMASK -#define VM_NormalReadHint(v) (!((v)->vm_flags & VM_READHINTMASK)) -#define VM_SequentialReadHint(v) ((v)->vm_flags & VM_SEQ_READ) -#define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ) - -/* read ahead limits */ -extern int vm_min_readahead; -extern int vm_max_readahead; - -/* - * mapping from the currently active vm_flags protection bits (the - * low four bits) to a page protection mask.. - */ -extern pgprot_t protection_map[16]; - - -/* - * These are the virtual MM functions - opening of an area, closing and - * unmapping it (needed to keep files on disk up-to-date etc), pointer - * to the functions called when a no-page or a wp-page exception occurs. - */ -struct vm_operations_struct { - void (*open)(struct vm_area_struct * area); - void (*close)(struct vm_area_struct * area); - struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int unused); -}; - -/* - * Each physical page in the system has a struct page associated with - * it to keep track of whatever it is we are using the page for at the - * moment. Note that we have no way to track which tasks are using - * a page. - * - * Try to keep the most commonly accessed fields in single cache lines - * here (16 bytes or greater). 
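
The vm_operations_struct above is all a driver needs for a fault-served mapping. A sketch of a nopage handler that hands back a freshly zeroed page (demo_* names hypothetical; NOPAGE_OOM and alloc_page() are defined further down in this header set, and real drivers usually return an existing page with its count raised instead):

	static struct page *demo_nopage(struct vm_area_struct *area,
					unsigned long address, int unused)
	{
		struct page *page = alloc_page(GFP_HIGHUSER);

		if (!page)
			return NOPAGE_OOM;	/* tells the fault path to OOM */
		clear_user_highpage(page, address);
		return page;		/* mapped at 'address' by the caller */
	}

	static struct vm_operations_struct demo_vm_ops = {
		nopage:	demo_nopage,
	};
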
This ordering should be particularly - * beneficial on 32-bit processors. - * - * The first line is data used in page cache lookup, the second line - * is used for linear searches (eg. clock algorithm scans). - * - * TODO: make this structure smaller, it could be as small as 32 bytes. - */ -typedef struct page { - struct list_head list; /* ->mapping has some page lists. */ - struct address_space *mapping; /* The inode (or ...) we belong to. */ - unsigned long index; /* Our offset within mapping. */ - struct page *next_hash; /* Next page sharing our hash bucket in - the pagecache hash table. */ - atomic_t count; /* Usage count, see below. */ - unsigned long flags; /* atomic flags, some possibly - updated asynchronously */ - struct list_head lru; /* Pageout list, eg. active_list; - protected by pagemap_lru_lock !! */ - struct page **pprev_hash; /* Complement to *next_hash. */ - struct buffer_head * buffers; /* Buffer maps us to a disk block. */ - - /* - * On machines where all RAM is mapped into kernel address space, - * we can simply calculate the virtual address. On machines with - * highmem some memory is mapped into kernel virtual memory - * dynamically, so we need a place to store that address. - * Note that this field could be 16 bits on x86 ... ;) - * - * Architectures with slow multiplication can define - * WANT_PAGE_VIRTUAL in asm/page.h - */ -#if defined(CONFIG_HIGHMEM) || defined(WANT_PAGE_VIRTUAL) - void *virtual; /* Kernel virtual address (NULL if - not kmapped, ie. highmem) */ -#endif /* CONFIG_HIGMEM || WANT_PAGE_VIRTUAL */ -} mem_map_t; - -/* - * Methods to modify the page usage count. - * - * What counts for a page usage: - * - cache mapping (page->mapping) - * - disk mapping (page->buffers) - * - page mapped in a task's page tables, each mapping - * is counted separately - * - * Also, many kernel routines increase the page count before a critical - * routine so they can be sure the page doesn't go away from under them. - */ -#define get_page(p) atomic_inc(&(p)->count) -#define put_page(p) __free_page(p) -#define put_page_testzero(p) atomic_dec_and_test(&(p)->count) -#define page_count(p) atomic_read(&(p)->count) -#define set_page_count(p,v) atomic_set(&(p)->count, v) - -static inline struct page *nth_page(struct page *page, int n) -{ - return page + n; -} - -/* - * Various page->flags bits: - * - * PG_reserved is set for special pages, which can never be swapped - * out. Some of them might not even exist (eg empty_bad_page)... - * - * Multiple processes may "see" the same page. E.g. for untouched - * mappings of /dev/null, all processes see the same page full of - * zeroes, and text pages of executables and shared libraries have - * only one copy in memory, at most, normally. - * - * For the non-reserved pages, page->count denotes a reference count. - * page->count == 0 means the page is free. - * page->count == 1 means the page is used for exactly one purpose - * (e.g. a private data page of one process). - * - * A page may be used for kmalloc() or anyone else who does a - * __get_free_page(). In this case the page->count is at least 1, and - * all other fields are unused but should be 0 or NULL. The - * management of this page is the responsibility of the one who uses - * it. - * - * The other pages (we may call them "process pages") are completely - * managed by the Linux memory manager: I/O, buffers, swapping etc. - * The following discussion applies only to them. - * - * A page may belong to an inode's memory mapping. 
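
In terms of the count macros just defined, a page's lifetime looks like this (sketch; note that put_page() in this header is literally __free_page(), so it drops the reference and frees on zero):

	struct page *page = alloc_page(GFP_KERNEL);	/* count == 1 */

	if (page) {
		get_page(page);		/* pin it: count == 2 */
		/* ... safe to use the page here ... */
		put_page(page);		/* unpin: count == 1 */
		__free_page(page);	/* count == 0: back to the allocator */
	}
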
In this case, - * page->mapping is the pointer to the inode, and page->index is the - * file offset of the page, in units of PAGE_CACHE_SIZE. - * - * A page may have buffers allocated to it. In this case, - * page->buffers is a circular list of these buffer heads. Else, - * page->buffers == NULL. - * - * For pages belonging to inodes, the page->count is the number of - * attaches, plus 1 if buffers are allocated to the page, plus one - * for the page cache itself. - * - * All pages belonging to an inode are in these doubly linked lists: - * mapping->clean_pages, mapping->dirty_pages and mapping->locked_pages; - * using the page->list list_head. These fields are also used for - * freelist managemet (when page->count==0). - * - * There is also a hash table mapping (mapping,index) to the page - * in memory if present. The lists for this hash table use the fields - * page->next_hash and page->pprev_hash. - * - * All process pages can do I/O: - * - inode pages may need to be read from disk, - * - inode pages which have been modified and are MAP_SHARED may need - * to be written to disk, - * - private pages which have been modified may need to be swapped out - * to swap space and (later) to be read back into memory. - * During disk I/O, PG_locked is used. This bit is set before I/O - * and reset when I/O completes. page_waitqueue(page) is a wait queue of all - * tasks waiting for the I/O on this page to complete. - * PG_uptodate tells whether the page's contents is valid. - * When a read completes, the page becomes uptodate, unless a disk I/O - * error happened. - * - * For choosing which pages to swap out, inode pages carry a - * PG_referenced bit, which is set any time the system accesses - * that page through the (mapping,index) hash table. This referenced - * bit, together with the referenced bit in the page tables, is used - * to manipulate page->age and move the page across the active, - * inactive_dirty and inactive_clean lists. - * - * Note that the referenced bit, the page->lru list_head and the - * active, inactive_dirty and inactive_clean lists are protected by - * the pagemap_lru_lock, and *NOT* by the usual PG_locked bit! - * - * PG_skip is used on sparc/sparc64 architectures to "skip" certain - * parts of the address space. - * - * PG_error is set to indicate that an I/O error occurred on this page. - * - * PG_arch_1 is an architecture specific page state bit. The generic - * code guarantees that this bit is cleared for a page when it first - * is entered into the page cache. - * - * PG_highmem pages are not permanently mapped into the kernel virtual - * address space, they need to be kmapped separately for doing IO on - * the pages. The struct page (these bits with information) are always - * mapped into kernel address space... - */ -#define PG_locked 0 /* Page is locked. Don't touch. */ -#define PG_error 1 -#define PG_referenced 2 -#define PG_uptodate 3 -#define PG_dirty 4 -#define PG_unused 5 -#define PG_lru 6 -#define PG_active 7 -#define PG_slab 8 -#define PG_skip 10 -#define PG_highmem 11 -#define PG_checked 12 /* kill me in 2.5.<early>. */ -#define PG_arch_1 13 -#define PG_reserved 14 -#define PG_launder 15 /* written out by VM pressure.. */ -#define PG_fs_1 16 /* Filesystem specific */ -#define PG_foreign 21 /* Page belongs to foreign allocator */ - -#ifndef arch_set_page_uptodate -#define arch_set_page_uptodate(page) -#endif - -/* Make it prettier to test the above... 
*/ -#define UnlockPage(page) unlock_page(page) -#define Page_Uptodate(page) test_bit(PG_uptodate, &(page)->flags) -#ifndef SetPageUptodate -#define SetPageUptodate(page) set_bit(PG_uptodate, &(page)->flags) -#endif -#define ClearPageUptodate(page) clear_bit(PG_uptodate, &(page)->flags) -#define PageDirty(page) test_bit(PG_dirty, &(page)->flags) -#define SetPageDirty(page) set_bit(PG_dirty, &(page)->flags) -#define ClearPageDirty(page) clear_bit(PG_dirty, &(page)->flags) -#define PageLocked(page) test_bit(PG_locked, &(page)->flags) -#define LockPage(page) set_bit(PG_locked, &(page)->flags) -#define TryLockPage(page) test_and_set_bit(PG_locked, &(page)->flags) -#define PageChecked(page) test_bit(PG_checked, &(page)->flags) -#define SetPageChecked(page) set_bit(PG_checked, &(page)->flags) -#define ClearPageChecked(page) clear_bit(PG_checked, &(page)->flags) -#define PageLaunder(page) test_bit(PG_launder, &(page)->flags) -#define SetPageLaunder(page) set_bit(PG_launder, &(page)->flags) -#define ClearPageLaunder(page) clear_bit(PG_launder, &(page)->flags) -#define ClearPageArch1(page) clear_bit(PG_arch_1, &(page)->flags) - -/* A foreign page uses a custom destructor rather than the buddy allocator. */ -#ifdef CONFIG_FOREIGN_PAGES -#define PageForeign(page) test_bit(PG_foreign, &(page)->flags) -#define SetPageForeign(page, dtor) do { \ - set_bit(PG_foreign, &(page)->flags); \ - (page)->mapping = (void *)dtor; \ -} while (0) -#define ClearPageForeign(page) do { \ - clear_bit(PG_foreign, &(page)->flags); \ - (page)->mapping = NULL; \ -} while (0) -#define PageForeignDestructor(page) \ - ( (void (*) (struct page *)) (page)->mapping ) -#else -#define PageForeign(page) 0 -#define PageForeignDestructor(page) void -#endif - -/* - * The zone field is never updated after free_area_init_core() - * sets it, so none of the operations on it need to be atomic. - */ -#define NODE_SHIFT 4 -#define ZONE_SHIFT (BITS_PER_LONG - 8) - -struct zone_struct; -extern struct zone_struct *zone_table[]; - -static inline zone_t *page_zone(struct page *page) -{ - return zone_table[page->flags >> ZONE_SHIFT]; -} - -static inline void set_page_zone(struct page *page, unsigned long zone_num) -{ - page->flags &= ~(~0UL << ZONE_SHIFT); - page->flags |= zone_num << ZONE_SHIFT; -} - -/* - * In order to avoid #ifdefs within C code itself, we define - * set_page_address to a noop for non-highmem machines, where - * the field isn't useful. - * The same is true for page_address() in arch-dependent code. - */ -#if defined(CONFIG_HIGHMEM) || defined(WANT_PAGE_VIRTUAL) - -#define set_page_address(page, address) \ - do { \ - (page)->virtual = (address); \ - } while(0) - -#else /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */ -#define set_page_address(page, address) do { } while(0) -#endif /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */ - -/* - * Permanent address of a page. Obviously must never be - * called on a highmem page. 
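
Put together, the wrappers above give the usual I/O completion idiom (sketch; demo_end_io() is hypothetical, SetPageUptodate()/SetPageError() appear nearby in this header):

	static void demo_end_io(struct page *page, int uptodate)
	{
		if (uptodate)
			SetPageUptodate(page);	/* contents now valid */
		else
			SetPageError(page);	/* I/O failed */
		UnlockPage(page);	/* clears PG_locked, wakes waiters */
	}
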
- */ -#if defined(CONFIG_HIGHMEM) || defined(WANT_PAGE_VIRTUAL) - -#define page_address(page) ((page)->virtual) - -#else /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */ - -#define page_address(page) \ - __va( (((page) - page_zone(page)->zone_mem_map) << PAGE_SHIFT) \ - + page_zone(page)->zone_start_paddr) - -#endif /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */ - -extern void FASTCALL(set_page_dirty(struct page *)); - -/* - * The first mb is necessary to safely close the critical section opened by the - * TryLockPage(), the second mb is necessary to enforce ordering between - * the clear_bit and the read of the waitqueue (to avoid SMP races with a - * parallel wait_on_page). - */ -#define PageError(page) test_bit(PG_error, &(page)->flags) -#define SetPageError(page) set_bit(PG_error, &(page)->flags) -#define ClearPageError(page) clear_bit(PG_error, &(page)->flags) -#define PageReferenced(page) test_bit(PG_referenced, &(page)->flags) -#define SetPageReferenced(page) set_bit(PG_referenced, &(page)->flags) -#define ClearPageReferenced(page) clear_bit(PG_referenced, &(page)->flags) -#define PageTestandClearReferenced(page) test_and_clear_bit(PG_referenced, &(page)->flags) -#define PageSlab(page) test_bit(PG_slab, &(page)->flags) -#define PageSetSlab(page) set_bit(PG_slab, &(page)->flags) -#define PageClearSlab(page) clear_bit(PG_slab, &(page)->flags) -#define PageReserved(page) test_bit(PG_reserved, &(page)->flags) - -#define PageActive(page) test_bit(PG_active, &(page)->flags) -#define SetPageActive(page) set_bit(PG_active, &(page)->flags) -#define ClearPageActive(page) clear_bit(PG_active, &(page)->flags) - -#define PageLRU(page) test_bit(PG_lru, &(page)->flags) -#define TestSetPageLRU(page) test_and_set_bit(PG_lru, &(page)->flags) -#define TestClearPageLRU(page) test_and_clear_bit(PG_lru, &(page)->flags) - -#ifdef CONFIG_HIGHMEM -#define PageHighMem(page) test_bit(PG_highmem, &(page)->flags) -#else -#define PageHighMem(page) 0 /* needed to optimize away at compile time */ -#endif - -#define SetPageReserved(page) set_bit(PG_reserved, &(page)->flags) -#define ClearPageReserved(page) clear_bit(PG_reserved, &(page)->flags) - -/* - * Error return values for the *_nopage functions - */ -#define NOPAGE_SIGBUS (NULL) -#define NOPAGE_OOM ((struct page *) (-1)) - -/* The array of struct pages */ -extern mem_map_t * mem_map; - -/* - * There is only one page-allocator function, and two main namespaces to - * it. The alloc_page*() variants return 'struct page *' and as such - * can allocate highmem pages, the *get*page*() variants return - * virtual kernel addresses to the allocated page(s). - */ -extern struct page * FASTCALL(_alloc_pages(unsigned int gfp_mask, unsigned int order)); -extern struct page * FASTCALL(__alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)); -extern struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int order); - -static inline struct page * alloc_pages(unsigned int gfp_mask, unsigned int order) -{ - /* - * Gets optimized away by the compiler. 
- */ - if (order >= MAX_ORDER) - return NULL; - return _alloc_pages(gfp_mask, order); -} - -#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) - -extern unsigned long FASTCALL(__get_free_pages(unsigned int gfp_mask, unsigned int order)); -extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask)); - -#define __get_free_page(gfp_mask) \ - __get_free_pages((gfp_mask),0) - -#define __get_dma_pages(gfp_mask, order) \ - __get_free_pages((gfp_mask) | GFP_DMA,(order)) - -/* - * The old interface name will be removed in 2.5: - */ -#define get_free_page get_zeroed_page - -/* - * There is only one 'core' page-freeing function. - */ -extern void FASTCALL(__free_pages(struct page *page, unsigned int order)); -extern void FASTCALL(free_pages(unsigned long addr, unsigned int order)); - -#define __free_page(page) __free_pages((page), 0) -#define free_page(addr) free_pages((addr),0) - -extern void show_free_areas(void); -extern void show_free_areas_node(pg_data_t *pgdat); - -extern void clear_page_tables(struct mm_struct *, unsigned long, int); - -extern int fail_writepage(struct page *); -struct page * shmem_nopage(struct vm_area_struct * vma, unsigned long address, int unused); -struct file *shmem_file_setup(char * name, loff_t size); -extern void shmem_lock(struct file * file, int lock); -extern int shmem_zero_setup(struct vm_area_struct *); - -extern void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size); -extern int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); -extern int remap_page_range(unsigned long from, unsigned long to, unsigned long size, pgprot_t prot); -extern int zeromap_page_range(unsigned long from, unsigned long size, pgprot_t prot); - -extern int vmtruncate(struct inode * inode, loff_t offset); -extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)); -extern pte_t *FASTCALL(pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); -extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); -extern int make_pages_present(unsigned long addr, unsigned long end); -extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); -extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char *dst, int len); -extern int ptrace_writedata(struct task_struct *tsk, char * src, unsigned long dst, int len); -extern int ptrace_attach(struct task_struct *tsk); -extern int ptrace_detach(struct task_struct *, unsigned int); -extern void ptrace_disable(struct task_struct *); -extern int ptrace_check_attach(struct task_struct *task, int kill); - -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, - int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); - -/* - * On a two-level page table, this ends up being trivial. Thus the - * inlining and the symmetry break with pte_alloc() that does all - * of this out-of-line. 
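
The two allocator namespaces differ only in what they return; side by side (sketch; GFP_HIGHUSER and GFP_KERNEL are defined just below):

	/* 'struct page *' flavour - the result may be a highmem page */
	struct page *page = alloc_pages(GFP_HIGHUSER, 2);	/* 2^2 pages */
	if (page)
		__free_pages(page, 2);

	/* virtual-address flavour - never highmem, directly usable */
	unsigned long addr = get_zeroed_page(GFP_KERNEL);
	if (addr)
		free_page(addr);
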
- */ -static inline pmd_t *pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) -{ - if (pgd_none(*pgd)) - return __pmd_alloc(mm, pgd, address); - return pmd_offset(pgd, address); -} - -extern int pgt_cache_water[2]; -extern int check_pgt_cache(void); - -extern void free_area_init(unsigned long * zones_size); -extern void free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, - unsigned long * zones_size, unsigned long zone_start_paddr, - unsigned long *zholes_size); -extern void mem_init(void); -extern void show_mem(void); -extern void si_meminfo(struct sysinfo * val); -extern void swapin_readahead(swp_entry_t); - -extern struct address_space swapper_space; -#define PageSwapCache(page) ((page)->mapping == &swapper_space) - -static inline int is_page_cache_freeable(struct page * page) -{ - return page_count(page) - !!page->buffers == 1; -} - -extern int FASTCALL(can_share_swap_page(struct page *)); -extern int FASTCALL(remove_exclusive_swap_page(struct page *)); - -extern void __free_pte(pte_t); - -/* mmap.c */ -extern void lock_vma_mappings(struct vm_area_struct *); -extern void unlock_vma_mappings(struct vm_area_struct *); -extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); -extern void __insert_vm_struct(struct mm_struct *, struct vm_area_struct *); -extern void build_mmap_rb(struct mm_struct *); -extern void exit_mmap(struct mm_struct *); - -extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); - -extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, - unsigned long len, unsigned long prot, - unsigned long flag, unsigned long pgoff); - -static inline unsigned long do_mmap(struct file *file, unsigned long addr, - unsigned long len, unsigned long prot, - unsigned long flag, unsigned long offset) -{ - unsigned long ret = -EINVAL; - if ((offset + PAGE_ALIGN(len)) < offset) - goto out; - if (!(offset & ~PAGE_MASK)) - ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); -out: - return ret; -} - -extern int do_munmap(struct mm_struct *, unsigned long, size_t); - -extern unsigned long do_brk(unsigned long, unsigned long); - -static inline void __vma_unlink(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev) -{ - prev->vm_next = vma->vm_next; - rb_erase(&vma->vm_rb, &mm->mm_rb); - if (mm->mmap_cache == vma) - mm->mmap_cache = prev; -} - -static inline int can_vma_merge(struct vm_area_struct * vma, unsigned long vm_flags) -{ - if (!vma->vm_file && vma->vm_flags == vm_flags) - return 1; - else - return 0; -} - -struct zone_t; -/* filemap.c */ -extern void remove_inode_page(struct page *); -extern unsigned long page_unuse(struct page *); -extern void truncate_inode_pages(struct address_space *, loff_t); - -/* generic vm_area_ops exported for stackable file systems */ -extern int filemap_sync(struct vm_area_struct *, unsigned long, size_t, unsigned int); -extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int); - -/* - * GFP bitmasks.. - */ -/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low four bits) */ -#define __GFP_DMA 0x01 -#define __GFP_HIGHMEM 0x02 - -/* Action modifiers - doesn't change the zoning */ -#define __GFP_WAIT 0x10 /* Can wait and reschedule? */ -#define __GFP_HIGH 0x20 /* Should access emergency pools? */ -#define __GFP_IO 0x40 /* Can start low memory physical IO? */ -#define __GFP_HIGHIO 0x80 /* Can start high mem physical IO? 
*/ -#define __GFP_FS 0x100 /* Can call down to low-level FS? */ - -#define GFP_NOHIGHIO (__GFP_HIGH | __GFP_WAIT | __GFP_IO) -#define GFP_NOIO (__GFP_HIGH | __GFP_WAIT) -#define GFP_NOFS (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO) -#define GFP_ATOMIC (__GFP_HIGH) -#define GFP_USER ( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS) -#define GFP_HIGHUSER ( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS | __GFP_HIGHMEM) -#define GFP_KERNEL (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS) -#define GFP_NFS (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS) -#define GFP_KSWAPD ( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS) - -/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some - platforms, used as appropriate on others */ - -#define GFP_DMA __GFP_DMA - -static inline unsigned int pf_gfp_mask(unsigned int gfp_mask) -{ - /* avoid all memory balancing I/O methods if this task cannot block on I/O */ - if (current->flags & PF_NOIO) - gfp_mask &= ~(__GFP_IO | __GFP_HIGHIO | __GFP_FS); - - return gfp_mask; -} - -/* vma is the first one with address < vma->vm_end, - * and even address < vma->vm_start. Have to extend vma. */ -static inline int expand_stack(struct vm_area_struct * vma, unsigned long address) -{ - unsigned long grow; - - /* - * vma->vm_start/vm_end cannot change under us because the caller - * is required to hold the mmap_sem in read mode. We need the - * page_table_lock lock to serialize against concurrent expand_stacks. - */ - address &= PAGE_MASK; - spin_lock(&vma->vm_mm->page_table_lock); - - /* already expanded while we were spinning? */ - if (vma->vm_start <= address) { - spin_unlock(&vma->vm_mm->page_table_lock); - return 0; - } - - grow = (vma->vm_start - address) >> PAGE_SHIFT; - if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur || - ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur) { - spin_unlock(&vma->vm_mm->page_table_lock); - return -ENOMEM; - } - - if ((vma->vm_flags & VM_LOCKED) && - ((vma->vm_mm->locked_vm + grow) << PAGE_SHIFT) > current->rlim[RLIMIT_MEMLOCK].rlim_cur) { - spin_unlock(&vma->vm_mm->page_table_lock); - return -ENOMEM; - } - - - vma->vm_start = address; - vma->vm_pgoff -= grow; - vma->vm_mm->total_vm += grow; - if (vma->vm_flags & VM_LOCKED) - vma->vm_mm->locked_vm += grow; - spin_unlock(&vma->vm_mm->page_table_lock); - return 0; -} - -/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ -extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); -extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, - struct vm_area_struct **pprev); - -/* Look up the first VMA which intersects the interval start_addr..end_addr-1, - NULL if none. Assume start_addr < end_addr. 
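
The mask encodes what the caller is allowed to do while the allocator works; the classic split (sketch; in_interrupt() comes from asm/hardirq.h):

	void *buf;

	if (in_interrupt())
		/* no __GFP_WAIT: may not sleep, may not start I/O */
		buf = (void *)__get_free_page(GFP_ATOMIC);
	else
		/* may sleep, swap and write back to make room */
		buf = (void *)__get_free_page(GFP_KERNEL);
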
*/ -static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) -{ - struct vm_area_struct * vma = find_vma(mm,start_addr); - - if (vma && end_addr <= vma->vm_start) - vma = NULL; - return vma; -} - -extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr); - -extern struct page * vmalloc_to_page(void *addr); - -#endif /* __KERNEL__ */ - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/linux/sched.h --- a/linux-2.4.30-xen-sparse/include/linux/sched.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,971 +0,0 @@ -#ifndef _LINUX_SCHED_H -#define _LINUX_SCHED_H - -#include <asm/param.h> /* for HZ */ - -extern unsigned long event; - -#include <linux/config.h> -#include <linux/binfmts.h> -#include <linux/threads.h> -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/times.h> -#include <linux/timex.h> -#include <linux/rbtree.h> - -#include <asm/system.h> -#include <asm/semaphore.h> -#include <asm/page.h> -#include <asm/ptrace.h> -#include <asm/mmu.h> - -#include <linux/smp.h> -#include <linux/tty.h> -#include <linux/sem.h> -#include <linux/signal.h> -#include <linux/securebits.h> -#include <linux/fs_struct.h> - -struct exec_domain; - -/* - * cloning flags: - */ -#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ -#define CLONE_VM 0x00000100 /* set if VM shared between processes */ -#define CLONE_FS 0x00000200 /* set if fs info shared between processes */ -#define CLONE_FILES 0x00000400 /* set if open files shared between processes */ -#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ -#define CLONE_PID 0x00001000 /* set if pid shared */ -#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ -#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ -#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ -#define CLONE_THREAD 0x00010000 /* Same thread group? */ -#define CLONE_NEWNS 0x00020000 /* New namespace group? */ - -#define CLONE_SIGNAL (CLONE_SIGHAND | CLONE_THREAD) - -/* - * These are the constant used to fake the fixed-point load-average - * counting. Some notes: - * - 11 bit fractions expand to 22 bits by the multiplies: this gives - * a load-average precision of 10 bits integer + 11 bits fractional - * - if you want to count load-averages more often, you need more - * precision, or rounding will get you. With 2-second counting freq, - * the EXP_n values would be 1981, 2034 and 2043 if still using only - * 11 bit fractions. 
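
Because find_vma() only guarantees addr < vm_end, a caller must still check vm_start before trusting the hit (sketch):

	struct vm_area_struct *vma = find_vma(mm, addr);

	if (vma && vma->vm_start <= addr) {
		/* addr really falls inside this mapping */
	} else {
		/* addr lies in a hole, or above all mappings */
	}
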
- */ -extern unsigned long avenrun[]; /* Load averages */ - -#define FSHIFT 11 /* nr of bits of precision */ -#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */ -#define LOAD_FREQ (5*HZ) /* 5 sec intervals */ -#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */ -#define EXP_5 2014 /* 1/exp(5sec/5min) */ -#define EXP_15 2037 /* 1/exp(5sec/15min) */ - -#define CALC_LOAD(load,exp,n) \ - load *= exp; \ - load += n*(FIXED_1-exp); \ - load >>= FSHIFT; - -#define CT_TO_SECS(x) ((x) / HZ) -#define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ) - -extern int nr_running, nr_threads; -extern int last_pid; - -#include <linux/fs.h> -#include <linux/time.h> -#include <linux/param.h> -#include <linux/resource.h> -#ifdef __KERNEL__ -#include <linux/timer.h> -#endif - -#include <asm/processor.h> - -#define TASK_RUNNING 0 -#define TASK_INTERRUPTIBLE 1 -#define TASK_UNINTERRUPTIBLE 2 -#define TASK_ZOMBIE 4 -#define TASK_STOPPED 8 - -#define __set_task_state(tsk, state_value) \ - do { (tsk)->state = (state_value); } while (0) -#define set_task_state(tsk, state_value) \ - set_mb((tsk)->state, (state_value)) - -#define __set_current_state(state_value) \ - do { current->state = (state_value); } while (0) -#define set_current_state(state_value) \ - set_mb(current->state, (state_value)) - -/* - * Scheduling policies - */ -#define SCHED_OTHER 0 -#define SCHED_FIFO 1 -#define SCHED_RR 2 - -/* - * This is an additional bit set when we want to - * yield the CPU for one re-schedule.. - */ -#define SCHED_YIELD 0x10 - -struct sched_param { - int sched_priority; -}; - -struct completion; - -#ifdef __KERNEL__ - -#include <linux/spinlock.h> - -/* - * This serializes "schedule()" and also protects - * the run-queue from deletions/modifications (but - * _adding_ to the beginning of the run-queue has - * a separate lock). - */ -extern rwlock_t tasklist_lock; -extern spinlock_t runqueue_lock; -extern spinlock_t mmlist_lock; - -extern void sched_init(void); -extern void init_idle(void); -extern void show_state(void); -extern void cpu_init (void); -extern void trap_init(void); -extern void update_process_times(int user); -#ifdef CONFIG_NO_IDLE_HZ -extern void update_process_times_us(int user, int system); -#endif -extern void update_one_process(struct task_struct *p, unsigned long user, - unsigned long system, int cpu); - -#define MAX_SCHEDULE_TIMEOUT LONG_MAX -extern signed long FASTCALL(schedule_timeout(signed long timeout)); -asmlinkage void schedule(void); - -extern int schedule_task(struct tq_struct *task); -extern void flush_scheduled_tasks(void); -extern int start_context_thread(void); -extern int current_is_keventd(void); - -#if CONFIG_SMP -extern void set_cpus_allowed(struct task_struct *p, unsigned long new_mask); -#else -# define set_cpus_allowed(p, new_mask) do { } while (0) -#endif - -/* - * The default fd array needs to be at least BITS_PER_LONG, - * as this is the granularity returned by copy_fdset(). - */ -#define NR_OPEN_DEFAULT BITS_PER_LONG - -struct namespace; -/* - * Open file table structure - */ -struct files_struct { - atomic_t count; - rwlock_t file_lock; /* Protects all the below members. 
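
CALC_LOAD() is plain fixed-point arithmetic: FIXED_1 is 1.0 represented as 1<<11. One worked tick, assuming the 1-minute average currently reads 0.5 (1024) and one task is runnable (n = FIXED_1 = 2048):

	unsigned long load = 1024;	/* 0.5 in 11-bit fixed point */

	CALC_LOAD(load, EXP_1, 2048);
	/*
	 * load = (1024*1884 + 2048*(2048 - 1884)) >> 11
	 *      = (1929216 + 335872) >> 11
	 *      = 2265088 >> 11 = 1106  ~=  0.54
	 */
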
Nests inside tsk->alloc_lock */ - int max_fds; - int max_fdset; - int next_fd; - struct file ** fd; /* current fd array */ - fd_set *close_on_exec; - fd_set *open_fds; - fd_set close_on_exec_init; - fd_set open_fds_init; - struct file * fd_array[NR_OPEN_DEFAULT]; -}; - -#define INIT_FILES \ -{ \ - count: ATOMIC_INIT(1), \ - file_lock: RW_LOCK_UNLOCKED, \ - max_fds: NR_OPEN_DEFAULT, \ - max_fdset: __FD_SETSIZE, \ - next_fd: 0, \ - fd: &init_files.fd_array[0], \ - close_on_exec: &init_files.close_on_exec_init, \ - open_fds: &init_files.open_fds_init, \ - close_on_exec_init: { { 0, } }, \ - open_fds_init: { { 0, } }, \ - fd_array: { NULL, } \ -} - -/* Maximum number of active map areas.. This is a random (large) number */ -#define DEFAULT_MAX_MAP_COUNT (65536) - -extern int max_map_count; - -struct mm_struct { - struct vm_area_struct * mmap; /* list of VMAs */ - rb_root_t mm_rb; - struct vm_area_struct * mmap_cache; /* last find_vma result */ - pgd_t * pgd; - atomic_t mm_users; /* How many users with user space? */ - atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ - int map_count; /* number of VMAs */ - struct rw_semaphore mmap_sem; - spinlock_t page_table_lock; /* Protects task page tables and mm->rss */ - - struct list_head mmlist; /* List of all active mm's. These are globally strung - * together off init_mm.mmlist, and are protected - * by mmlist_lock - */ - - unsigned long start_code, end_code, start_data, end_data; - unsigned long start_brk, brk, start_stack; - unsigned long arg_start, arg_end, env_start, env_end; - unsigned long rss, total_vm, locked_vm; - unsigned long def_flags; - unsigned long cpu_vm_mask; - unsigned long swap_address; - - unsigned dumpable:1; - - /* Architecture-specific MM context */ - mm_context_t context; -}; - -extern int mmlist_nr; - -#define INIT_MM(name) \ -{ \ - mm_rb: RB_ROOT, \ - pgd: swapper_pg_dir, \ - mm_users: ATOMIC_INIT(2), \ - mm_count: ATOMIC_INIT(1), \ - mmap_sem: __RWSEM_INITIALIZER(name.mmap_sem), \ - page_table_lock: SPIN_LOCK_UNLOCKED, \ - mmlist: LIST_HEAD_INIT(name.mmlist), \ -} - -struct signal_struct { - atomic_t count; - struct k_sigaction action[_NSIG]; - spinlock_t siglock; -}; - - -#define INIT_SIGNALS { \ - count: ATOMIC_INIT(1), \ - action: { {{0,}}, }, \ - siglock: SPIN_LOCK_UNLOCKED \ -} - -/* - * Some day this will be a full-fledged user tracking system.. - */ -struct user_struct { - atomic_t __count; /* reference count */ - atomic_t processes; /* How many processes does this user have? */ - atomic_t files; /* How many open files does this user have? */ - - /* Hash table maintenance information */ - struct user_struct *next, **pprev; - uid_t uid; -}; - -#define get_current_user() ({ \ - struct user_struct *__tmp_user = current->user; \ - atomic_inc(&__tmp_user->__count); \ - __tmp_user; }) - -extern struct user_struct root_user; -#define INIT_USER (&root_user) - -struct task_struct { - /* - * offsets of these are hardcoded elsewhere - touch with care - */ - volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ - unsigned long flags; /* per process flags, defined below */ - int sigpending; - mm_segment_t addr_limit; /* thread address space: - 0-0xBFFFFFFF for user-thead - 0-0xFFFFFFFF for kernel-thread - */ - struct exec_domain *exec_domain; - volatile long need_resched; - unsigned long ptrace; - - int lock_depth; /* Lock depth */ - -/* - * offset 32 begins here on 32-bit platforms. 
We keep - * all fields in a single cacheline that are needed for - * the goodness() loop in schedule(). - */ - long counter; - long nice; - unsigned long policy; - struct mm_struct *mm; - int processor; - /* - * cpus_runnable is ~0 if the process is not running on any - * CPU. It's (1 << cpu) if it's running on a CPU. This mask - * is updated under the runqueue lock. - * - * To determine whether a process might run on a CPU, this - * mask is AND-ed with cpus_allowed. - */ - unsigned long cpus_runnable, cpus_allowed; - /* - * (only the 'next' pointer fits into the cacheline, but - * that's just fine.) - */ - struct list_head run_list; - unsigned long sleep_time; - - struct task_struct *next_task, *prev_task; - struct mm_struct *active_mm; - struct list_head local_pages; - unsigned int allocation_order, nr_local_pages; - -/* task state */ - struct linux_binfmt *binfmt; - int exit_code, exit_signal; - int pdeath_signal; /* The signal sent when the parent dies */ - /* ??? */ - unsigned long personality; - int did_exec:1; - unsigned task_dumpable:1; - pid_t pid; - pid_t pgrp; - pid_t tty_old_pgrp; - pid_t session; - pid_t tgid; - /* boolean value for session group leader */ - int leader; - /* - * pointers to (original) parent process, youngest child, younger sibling, - * older sibling, respectively. (p->father can be replaced with - * p->p_pptr->pid) - */ - struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr; - struct list_head thread_group; - - /* PID hash table linkage. */ - struct task_struct *pidhash_next; - struct task_struct **pidhash_pprev; - - wait_queue_head_t wait_chldexit; /* for wait4() */ - struct completion *vfork_done; /* for vfork() */ - unsigned long rt_priority; - unsigned long it_real_value, it_prof_value, it_virt_value; - unsigned long it_real_incr, it_prof_incr, it_virt_incr; - struct timer_list real_timer; - struct tms times; - unsigned long start_time; - long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS]; -/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ - unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap; - int swappable:1; -/* process credentials */ - uid_t uid,euid,suid,fsuid; - gid_t gid,egid,sgid,fsgid; - int ngroups; - gid_t groups[NGROUPS]; - kernel_cap_t cap_effective, cap_inheritable, cap_permitted; - int keep_capabilities:1; - struct user_struct *user; -/* limits */ - struct rlimit rlim[RLIM_NLIMITS]; - unsigned short used_math; - char comm[16]; -/* file system info */ - int link_count, total_link_count; - struct tty_struct *tty; /* NULL if no tty */ - unsigned int locks; /* How many file locks are being held */ -/* ipc stuff */ - struct sem_undo *semundo; - struct sem_queue *semsleeping; -/* CPU-specific state of this task */ - struct thread_struct thread; -/* filesystem information */ - struct fs_struct *fs; -/* open file information */ - struct files_struct *files; -/* namespace */ - struct namespace *namespace; -/* signal handlers */ - spinlock_t sigmask_lock; /* Protects signal and blocked */ - struct signal_struct *sig; - - sigset_t blocked; - struct sigpending pending; - - unsigned long sas_ss_sp; - size_t sas_ss_size; - int (*notifier)(void *priv); - void *notifier_data; - sigset_t *notifier_mask; - -/* Thread group tracking */ - u32 parent_exec_id; - u32 self_exec_id; -/* Protection of (de-)allocation: mm, files, fs, tty */ - spinlock_t alloc_lock; - -/* journalling filesystem info */ - void *journal_info; -}; - -/* - * Per process flags - */ -#define PF_ALIGNWARN 0x00000001 
/* Print alignment warning msgs */ - /* Not implemented yet, only for 486*/ -#define PF_STARTING 0x00000002 /* being created */ -#define PF_EXITING 0x00000004 /* getting shut down */ -#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ -#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ -#define PF_DUMPCORE 0x00000200 /* dumped core */ -#define PF_SIGNALED 0x00000400 /* killed by a signal */ -#define PF_MEMALLOC 0x00000800 /* Allocating memory */ -#define PF_MEMDIE 0x00001000 /* Killed for out-of-memory */ -#define PF_FREE_PAGES 0x00002000 /* per process page freeing */ -#define PF_NOIO 0x00004000 /* avoid generating further I/O */ -#define PF_FSTRANS 0x00008000 /* inside a filesystem transaction */ - -#define PF_USEDFPU 0x00100000 /* task used FPU this quantum (SMP) */ - -/* - * Ptrace flags - */ - -#define PT_PTRACED 0x00000001 -#define PT_TRACESYS 0x00000002 -#define PT_DTRACE 0x00000004 /* delayed trace (used on m68k, i386) */ -#define PT_TRACESYSGOOD 0x00000008 -#define PT_PTRACE_CAP 0x00000010 /* ptracer can follow suid-exec */ - -#define is_dumpable(tsk) ((tsk)->task_dumpable && (tsk)->mm && (tsk)->mm->dumpable) - -/* - * Limit the stack by to some sane default: root can always - * increase this limit if needed.. 8MB seems reasonable. - */ -#define _STK_LIM (8*1024*1024) - -#define DEF_COUNTER (10*HZ/100) /* 100 ms time slice */ -#define MAX_COUNTER (20*HZ/100) -#define DEF_NICE (0) - -extern void yield(void); - -/* - * The default (Linux) execution domain. - */ -extern struct exec_domain default_exec_domain; - -/* - * INIT_TASK is used to set up the first task table, touch at - * your own risk!. Base=0, limit=0x1fffff (=2MB) - */ -#define INIT_TASK(tsk) \ -{ \ - state: 0, \ - flags: 0, \ - sigpending: 0, \ - addr_limit: KERNEL_DS, \ - exec_domain: &default_exec_domain, \ - lock_depth: -1, \ - counter: DEF_COUNTER, \ - nice: DEF_NICE, \ - policy: SCHED_OTHER, \ - mm: NULL, \ - active_mm: &init_mm, \ - cpus_runnable: ~0UL, \ - cpus_allowed: ~0UL, \ - run_list: LIST_HEAD_INIT(tsk.run_list), \ - next_task: &tsk, \ - prev_task: &tsk, \ - p_opptr: &tsk, \ - p_pptr: &tsk, \ - thread_group: LIST_HEAD_INIT(tsk.thread_group), \ - wait_chldexit: __WAIT_QUEUE_HEAD_INITIALIZER(tsk.wait_chldexit),\ - real_timer: { \ - function: it_real_fn \ - }, \ - cap_effective: CAP_INIT_EFF_SET, \ - cap_inheritable: CAP_INIT_INH_SET, \ - cap_permitted: CAP_FULL_SET, \ - keep_capabilities: 0, \ - rlim: INIT_RLIMITS, \ - user: INIT_USER, \ - comm: "swapper", \ - thread: INIT_THREAD, \ - fs: &init_fs, \ - files: &init_files, \ - sigmask_lock: SPIN_LOCK_UNLOCKED, \ - sig: &init_signals, \ - pending: { NULL, &tsk.pending.head, {{0}}}, \ - blocked: {{0}}, \ - alloc_lock: SPIN_LOCK_UNLOCKED, \ - journal_info: NULL, \ -} - - -#ifndef INIT_TASK_SIZE -# define INIT_TASK_SIZE 2048*sizeof(long) -#endif - -union task_union { - struct task_struct task; - unsigned long stack[INIT_TASK_SIZE/sizeof(long)]; -}; - -extern union task_union init_task_union; - -extern struct mm_struct init_mm; -extern struct task_struct *init_tasks[NR_CPUS]; - -/* PID hashing. (shouldnt this be dynamic?) 
*/ -#define PIDHASH_SZ (4096 >> 2) -extern struct task_struct *pidhash[PIDHASH_SZ]; - -#define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1)) - -static inline void hash_pid(struct task_struct *p) -{ - struct task_struct **htable = &pidhash[pid_hashfn(p->pid)]; - - if((p->pidhash_next = *htable) != NULL) - (*htable)->pidhash_pprev = &p->pidhash_next; - *htable = p; - p->pidhash_pprev = htable; -} - -static inline void unhash_pid(struct task_struct *p) -{ - if(p->pidhash_next) - p->pidhash_next->pidhash_pprev = p->pidhash_pprev; - *p->pidhash_pprev = p->pidhash_next; -} - -static inline struct task_struct *find_task_by_pid(int pid) -{ - struct task_struct *p, **htable = &pidhash[pid_hashfn(pid)]; - - for(p = *htable; p && p->pid != pid; p = p->pidhash_next) - ; - - return p; -} - -#define task_has_cpu(tsk) ((tsk)->cpus_runnable != ~0UL) - -static inline void task_set_cpu(struct task_struct *tsk, unsigned int cpu) -{ - tsk->processor = cpu; - tsk->cpus_runnable = 1UL << cpu; -} - -static inline void task_release_cpu(struct task_struct *tsk) -{ - tsk->cpus_runnable = ~0UL; -} - -/* per-UID process charging. */ -extern struct user_struct * alloc_uid(uid_t); -extern void free_uid(struct user_struct *); -extern void switch_uid(struct user_struct *); - -#include <asm/current.h> - -extern unsigned long volatile jiffies; -extern unsigned long itimer_ticks; -extern unsigned long itimer_next; -extern struct timeval xtime; -extern void do_timer(struct pt_regs *); -#ifdef CONFIG_NO_IDLE_HZ -extern void do_timer_ticks(int ticks); -#endif - -extern unsigned int * prof_buffer; -extern unsigned long prof_len; -extern unsigned long prof_shift; - -#define CURRENT_TIME (xtime.tv_sec) - -extern void FASTCALL(__wake_up(wait_queue_head_t *q, unsigned int mode, int nr)); -extern void FASTCALL(__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr)); -extern void FASTCALL(sleep_on(wait_queue_head_t *q)); -extern long FASTCALL(sleep_on_timeout(wait_queue_head_t *q, - signed long timeout)); -extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q)); -extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q, - signed long timeout)); -extern int FASTCALL(wake_up_process(struct task_struct * tsk)); - -#define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1) -#define wake_up_nr(x, nr) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr) -#define wake_up_all(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0) -#define wake_up_sync(x) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1) -#define wake_up_sync_nr(x, nr) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr) -#define wake_up_interruptible(x) __wake_up((x),TASK_INTERRUPTIBLE, 1) -#define wake_up_interruptible_nr(x, nr) __wake_up((x),TASK_INTERRUPTIBLE, nr) -#define wake_up_interruptible_all(x) __wake_up((x),TASK_INTERRUPTIBLE, 0) -#define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1) -#define wake_up_interruptible_sync_nr(x, nr) __wake_up_sync((x),TASK_INTERRUPTIBLE, nr) -asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru); - -extern int in_group_p(gid_t); -extern int in_egroup_p(gid_t); - -extern void proc_caches_init(void); -extern void flush_signals(struct task_struct *); -extern void flush_signal_handlers(struct task_struct *); -extern void sig_exit(int, int, struct siginfo *); -extern int dequeue_signal(sigset_t *, siginfo_t *); -extern void block_all_signals(int (*notifier)(void 
*priv), void *priv, - sigset_t *mask); -extern void unblock_all_signals(void); -extern int send_sig_info(int, struct siginfo *, struct task_struct *); -extern int force_sig_info(int, struct siginfo *, struct task_struct *); -extern int kill_pg_info(int, struct siginfo *, pid_t); -extern int kill_sl_info(int, struct siginfo *, pid_t); -extern int kill_proc_info(int, struct siginfo *, pid_t); -extern void notify_parent(struct task_struct *, int); -extern void do_notify_parent(struct task_struct *, int); -extern void force_sig(int, struct task_struct *); -extern int send_sig(int, struct task_struct *, int); -extern int kill_pg(pid_t, int, int); -extern int kill_sl(pid_t, int, int); -extern int kill_proc(pid_t, int, int); -extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *); -extern int do_sigaltstack(const stack_t *, stack_t *, unsigned long); - -static inline int signal_pending(struct task_struct *p) -{ - return (p->sigpending != 0); -} - -/* - * Re-calculate pending state from the set of locally pending - * signals, globally pending signals, and blocked signals. - */ -static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) -{ - unsigned long ready; - long i; - - switch (_NSIG_WORDS) { - default: - for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;) - ready |= signal->sig[i] &~ blocked->sig[i]; - break; - - case 4: ready = signal->sig[3] &~ blocked->sig[3]; - ready |= signal->sig[2] &~ blocked->sig[2]; - ready |= signal->sig[1] &~ blocked->sig[1]; - ready |= signal->sig[0] &~ blocked->sig[0]; - break; - - case 2: ready = signal->sig[1] &~ blocked->sig[1]; - ready |= signal->sig[0] &~ blocked->sig[0]; - break; - - case 1: ready = signal->sig[0] &~ blocked->sig[0]; - } - return ready != 0; -} - -/* Reevaluate whether the task has signals pending delivery. - This is required every time the blocked sigset_t changes. - All callers should have t->sigmask_lock. */ - -static inline void recalc_sigpending(struct task_struct *t) -{ - t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked); -} - -/* True if we are on the alternate signal stack. */ - -static inline int on_sig_stack(unsigned long sp) -{ - return (sp - current->sas_ss_sp < current->sas_ss_size); -} - -static inline int sas_ss_flags(unsigned long sp) -{ - return (current->sas_ss_size == 0 ? SS_DISABLE - : on_sig_stack(sp) ? SS_ONSTACK : 0); -} - -extern int request_irq(unsigned int, - void (*handler)(int, void *, struct pt_regs *), - unsigned long, const char *, void *); -extern void free_irq(unsigned int, void *); - -/* - * This has now become a routine instead of a macro, it sets a flag if - * it returns true (to do BSD-style accounting where the process is flagged - * if it uses root privs). The implication of this is that you should do - * normal permissions checks first, and check suser() last. - * - * [Dec 1997 -- Chris Evans] - * For correctness, the above considerations need to be extended to - * fsuser(). This is done, along with moving fsuser() checks to be - * last. - * - * These will be removed, but in the mean time, when the SECURE_NOROOT - * flag is set, uids don't grant privilege. - */ -static inline int suser(void) -{ - if (!issecure(SECURE_NOROOT) && current->euid == 0) { - current->flags |= PF_SUPERPRIV; - return 1; - } - return 0; -} - -static inline int fsuser(void) -{ - if (!issecure(SECURE_NOROOT) && current->fsuid == 0) { - current->flags |= PF_SUPERPRIV; - return 1; - } - return 0; -} - -/* - * capable() checks for a particular capability. 
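
recalc_sigpending() has to run under sigmask_lock every time ->blocked changes; the canonical shape (a sketch of what the sigprocmask path does):

	spin_lock_irq(&current->sigmask_lock);
	sigaddset(&current->blocked, SIGUSR1);	/* block SIGUSR1 */
	recalc_sigpending(current);		/* refresh ->sigpending */
	spin_unlock_irq(&current->sigmask_lock);
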
- * New privilege checks should use this interface, rather than suser() or - * fsuser(). See include/linux/capability.h for defined capabilities. - */ - -static inline int capable(int cap) -{ -#if 1 /* ok now */ - if (cap_raised(current->cap_effective, cap)) -#else - if (cap_is_fs_cap(cap) ? current->fsuid == 0 : current->euid == 0) -#endif - { - current->flags |= PF_SUPERPRIV; - return 1; - } - return 0; -} - -/* - * Routines for handling mm_structs - */ -extern struct mm_struct * mm_alloc(void); - -extern struct mm_struct * start_lazy_tlb(void); -extern void end_lazy_tlb(struct mm_struct *mm); - -/* mmdrop drops the mm and the page tables */ -extern void FASTCALL(__mmdrop(struct mm_struct *)); -static inline void mmdrop(struct mm_struct * mm) -{ - if (atomic_dec_and_test(&mm->mm_count)) - __mmdrop(mm); -} - -/* mmput gets rid of the mappings and all user-space */ -extern void mmput(struct mm_struct *); -/* Remove the current tasks stale references to the old mm_struct */ -extern void mm_release(void); - -/* - * Routines for handling the fd arrays - */ -extern struct file ** alloc_fd_array(int); -extern int expand_fd_array(struct files_struct *, int nr); -extern void free_fd_array(struct file **, int); - -extern fd_set *alloc_fdset(int); -extern int expand_fdset(struct files_struct *, int nr); -extern void free_fdset(fd_set *, int); - -extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); -extern void flush_thread(void); -extern void exit_thread(void); - -extern void exit_mm(struct task_struct *); -extern void exit_files(struct task_struct *); -extern void exit_sighand(struct task_struct *); - -extern void reparent_to_init(void); -extern void daemonize(void); - -extern int do_execve(char *, char **, char **, struct pt_regs *); -extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long); - -extern void set_task_comm(struct task_struct *tsk, char *from); -extern void get_task_comm(char *to, struct task_struct *tsk); - -extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); -extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); -extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); - -extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); - -#define __wait_event(wq, condition) \ -do { \ - wait_queue_t __wait; \ - init_waitqueue_entry(&__wait, current); \ - \ - add_wait_queue(&wq, &__wait); \ - for (;;) { \ - set_current_state(TASK_UNINTERRUPTIBLE); \ - if (condition) \ - break; \ - schedule(); \ - } \ - current->state = TASK_RUNNING; \ - remove_wait_queue(&wq, &__wait); \ -} while (0) - -#define wait_event(wq, condition) \ -do { \ - if (condition) \ - break; \ - __wait_event(wq, condition); \ -} while (0) - -#define __wait_event_interruptible(wq, condition, ret) \ -do { \ - wait_queue_t __wait; \ - init_waitqueue_entry(&__wait, current); \ - \ - add_wait_queue(&wq, &__wait); \ - for (;;) { \ - set_current_state(TASK_INTERRUPTIBLE); \ - if (condition) \ - break; \ - if (!signal_pending(current)) { \ - schedule(); \ - continue; \ - } \ - ret = -ERESTARTSYS; \ - break; \ - } \ - current->state = TASK_RUNNING; \ - remove_wait_queue(&wq, &__wait); \ -} while (0) - -#define wait_event_interruptible(wq, condition) \ -({ \ - int __ret = 0; \ - if (!(condition)) \ - __wait_event_interruptible(wq, condition, __ret); \ - __ret; \ -}) - -#define REMOVE_LINKS(p) do { \ - (p)->next_task->prev_task = 
(p)->prev_task; \ - (p)->prev_task->next_task = (p)->next_task; \ - if ((p)->p_osptr) \ - (p)->p_osptr->p_ysptr = (p)->p_ysptr; \ - if ((p)->p_ysptr) \ - (p)->p_ysptr->p_osptr = (p)->p_osptr; \ - else \ - (p)->p_pptr->p_cptr = (p)->p_osptr; \ - } while (0) - -#define SET_LINKS(p) do { \ - (p)->next_task = &init_task; \ - (p)->prev_task = init_task.prev_task; \ - init_task.prev_task->next_task = (p); \ - init_task.prev_task = (p); \ - (p)->p_ysptr = NULL; \ - if (((p)->p_osptr = (p)->p_pptr->p_cptr) != NULL) \ - (p)->p_osptr->p_ysptr = p; \ - (p)->p_pptr->p_cptr = p; \ - } while (0) - -#define for_each_task(p) \ - for (p = &init_task ; (p = p->next_task) != &init_task ; ) - -#define for_each_thread(task) \ - for (task = next_thread(current) ; task != current ; task = next_thread(task)) - -#define next_thread(p) \ - list_entry((p)->thread_group.next, struct task_struct, thread_group) - -#define thread_group_leader(p) (p->pid == p->tgid) - -static inline void del_from_runqueue(struct task_struct * p) -{ - nr_running--; - p->sleep_time = jiffies; - list_del(&p->run_list); - p->run_list.next = NULL; -} - -static inline int task_on_runqueue(struct task_struct *p) -{ - return (p->run_list.next != NULL); -} - -static inline void unhash_process(struct task_struct *p) -{ - if (task_on_runqueue(p)) - out_of_line_bug(); - write_lock_irq(&tasklist_lock); - nr_threads--; - unhash_pid(p); - REMOVE_LINKS(p); - list_del(&p->thread_group); - write_unlock_irq(&tasklist_lock); -} - -/* Protects ->fs, ->files, ->mm, and synchronises with wait4(). Nests inside tasklist_lock */ -static inline void task_lock(struct task_struct *p) -{ - spin_lock(&p->alloc_lock); -} - -static inline void task_unlock(struct task_struct *p) -{ - spin_unlock(&p->alloc_lock); -} - -/* write full pathname into buffer and return start of pathname */ -static inline char * d_path(struct dentry *dentry, struct vfsmount *vfsmnt, - char *buf, int buflen) -{ - char *res; - struct vfsmount *rootmnt; - struct dentry *root; - read_lock(¤t->fs->lock); - rootmnt = mntget(current->fs->rootmnt); - root = dget(current->fs->root); - read_unlock(¤t->fs->lock); - spin_lock(&dcache_lock); - res = __d_path(dentry, vfsmnt, root, rootmnt, buf, buflen); - spin_unlock(&dcache_lock); - dput(root); - mntput(rootmnt); - return res; -} - -static inline int need_resched(void) -{ - return (unlikely(current->need_resched)); -} - -extern void __cond_resched(void); -static inline void cond_resched(void) -{ - if (need_resched()) - __cond_resched(); -} - -#endif /* __KERNEL__ */ -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/linux/skbuff.h --- a/linux-2.4.30-xen-sparse/include/linux/skbuff.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,1181 +0,0 @@ -/* - * Definitions for the 'struct sk_buff' memory handlers. - * - * Authors: - * Alan Cox, <gw4pts@xxxxxxxxxxxxxxx> - * Florian La Roche, <rzsfl@xxxxxxxxxxxx> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
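
The wait_event family above pairs with the wake_up macros on the same queue head; a minimal producer/consumer (demo_* names hypothetical; DECLARE_WAIT_QUEUE_HEAD comes from linux/wait.h):

	static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
	static int demo_ready;

	/* consumer: sleeps until the condition holds or a signal arrives */
	static int demo_wait(void)
	{
		return wait_event_interruptible(demo_wq, demo_ready != 0);
	}

	/* producer: make the condition true, then wake the sleepers */
	static void demo_post(void)
	{
		demo_ready = 1;
		wake_up_interruptible(&demo_wq);
	}
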
- */ - -#ifndef _LINUX_SKBUFF_H -#define _LINUX_SKBUFF_H - -#include <linux/config.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/time.h> -#include <linux/cache.h> - -#include <asm/atomic.h> -#include <asm/types.h> -#include <linux/spinlock.h> -#include <linux/mm.h> -#include <linux/highmem.h> - -#define HAVE_ALLOC_SKB /* For the drivers to know */ -#define HAVE_ALIGNABLE_SKB /* Ditto 8) */ -#define SLAB_SKB /* Slabified skbuffs */ - -#define CHECKSUM_NONE 0 -#define CHECKSUM_HW 1 -#define CHECKSUM_UNNECESSARY 2 - -#define SKB_DATA_ALIGN(X) (((X) + (SMP_CACHE_BYTES-1)) & ~(SMP_CACHE_BYTES-1)) -#define SKB_MAX_ORDER(X,ORDER) (((PAGE_SIZE<<(ORDER)) - (X) - sizeof(struct skb_shared_info))&~(SMP_CACHE_BYTES-1)) -#define SKB_MAX_HEAD(X) (SKB_MAX_ORDER((X),0)) -#define SKB_MAX_ALLOC (SKB_MAX_ORDER(0,2)) - -/* A. Checksumming of received packets by device. - * - * NONE: device failed to checksum this packet. - * skb->csum is undefined. - * - * UNNECESSARY: device parsed packet and wouldbe verified checksum. - * skb->csum is undefined. - * It is bad option, but, unfortunately, many of vendors do this. - * Apparently with secret goal to sell you new device, when you - * will add new protocol to your host. F.e. IPv6. 8) - * - * HW: the most generic way. Device supplied checksum of _all_ - * the packet as seen by netif_rx in skb->csum. - * NOTE: Even if device supports only some protocols, but - * is able to produce some skb->csum, it MUST use HW, - * not UNNECESSARY. - * - * B. Checksumming on output. - * - * NONE: skb is checksummed by protocol or csum is not required. - * - * HW: device is required to csum packet as seen by hard_start_xmit - * from skb->h.raw to the end and to record the checksum - * at skb->h.raw+skb->csum. - * - * Device must show its capabilities in dev->features, set - * at device setup time. - * NETIF_F_HW_CSUM - it is clever device, it is able to checksum - * everything. - * NETIF_F_NO_CSUM - loopback or reliable single hop media. - * NETIF_F_IP_CSUM - device is dumb. It is able to csum only - * TCP/UDP over IPv4. Sigh. Vendors like this - * way by an unknown reason. Though, see comment above - * about CHECKSUM_UNNECESSARY. 8) - * - * Any questions? No questions, good. --ANK - */ - -#ifdef __i386__ -#define NET_CALLER(arg) (*(((void**)&arg)-1)) -#else -#define NET_CALLER(arg) __builtin_return_address(0) -#endif - -#ifdef CONFIG_NETFILTER -struct nf_conntrack { - atomic_t use; - void (*destroy)(struct nf_conntrack *); -}; - -struct nf_ct_info { - struct nf_conntrack *master; -}; -#endif - -struct sk_buff_head { - /* These two members must be first. */ - struct sk_buff * next; - struct sk_buff * prev; - - __u32 qlen; - spinlock_t lock; -}; - -struct sk_buff; - -#define MAX_SKB_FRAGS 6 - -typedef struct skb_frag_struct skb_frag_t; - -struct skb_frag_struct -{ - struct page *page; - __u16 page_offset; - __u16 size; -}; - -/* This data is invariant across clones and lives at - * the end of the header data, ie. at skb->end. - */ -struct skb_shared_info { - atomic_t dataref; - unsigned int nr_frags; - struct sk_buff *frag_list; - skb_frag_t frags[MAX_SKB_FRAGS]; -}; - -struct sk_buff { - /* These two members must be first. 
*/ - struct sk_buff * next; /* Next buffer in list */ - struct sk_buff * prev; /* Previous buffer in list */ - - struct sk_buff_head * list; /* List we are on */ - struct sock *sk; /* Socket we are owned by */ - struct timeval stamp; /* Time we arrived */ - struct net_device *dev; /* Device we arrived on/are leaving by */ - struct net_device *real_dev; /* For support of point to point protocols - (e.g. 802.3ad) over bonding, we must save the - physical device that got the packet before - replacing skb->dev with the virtual device. */ - - /* Transport layer header */ - union - { - struct tcphdr *th; - struct udphdr *uh; - struct icmphdr *icmph; - struct igmphdr *igmph; - struct iphdr *ipiph; - struct spxhdr *spxh; - unsigned char *raw; - } h; - - /* Network layer header */ - union - { - struct iphdr *iph; - struct ipv6hdr *ipv6h; - struct arphdr *arph; - struct ipxhdr *ipxh; - unsigned char *raw; - } nh; - - /* Link layer header */ - union - { - struct ethhdr *ethernet; - unsigned char *raw; - } mac; - - struct dst_entry *dst; - - /* - * This is the control buffer. It is free to use for every - * layer. Please put your private variables there. If you - * want to keep them across layers you have to do a skb_clone() - * first. This is owned by whoever has the skb queued ATM. - */ - char cb[48]; - - unsigned int len; /* Length of actual data */ - unsigned int data_len; - unsigned int csum; /* Checksum */ - unsigned char __unused, /* Dead field, may be reused */ - cloned, /* head may be cloned (check refcnt to be sure). */ - pkt_type, /* Packet class */ - ip_summed; /* Driver fed us an IP checksum */ - __u32 priority; /* Packet queueing priority */ - atomic_t users; /* User count - see datagram.c,tcp.c */ - unsigned short protocol; /* Packet protocol from driver. */ - unsigned short security; /* Security level of packet */ - unsigned int truesize; /* Buffer size */ - - unsigned char *head; /* Head of buffer */ - unsigned char *data; /* Data head pointer */ - unsigned char *tail; /* Tail pointer */ - unsigned char *end; /* End pointer */ - - void (*destructor)(struct sk_buff *); /* Destruct function */ -#ifdef CONFIG_NETFILTER - /* Can be used for communication between hooks. 
*/ - unsigned long nfmark; - /* Cache info */ - __u32 nfcache; - /* Associated connection, if any */ - struct nf_ct_info *nfct; -#ifdef CONFIG_NETFILTER_DEBUG - unsigned int nf_debug; -#endif -#endif /*CONFIG_NETFILTER*/ - -#if defined(CONFIG_HIPPI) - union{ - __u32 ifield; - } private; -#endif - -#ifdef CONFIG_NET_SCHED - __u32 tc_index; /* traffic control index */ -#endif -}; - -#ifdef __KERNEL__ -/* - * Handling routines are only of interest to the kernel - */ -#include <linux/slab.h> - -#include <asm/system.h> - -extern void __kfree_skb(struct sk_buff *skb); -extern struct sk_buff * alloc_skb(unsigned int size, int priority); -extern struct sk_buff * alloc_skb_from_cache(kmem_cache_t *cp, unsigned int size, int priority); -extern void kfree_skbmem(struct sk_buff *skb); -extern struct sk_buff * skb_clone(struct sk_buff *skb, int priority); -extern struct sk_buff * skb_copy(const struct sk_buff *skb, int priority); -extern struct sk_buff * pskb_copy(struct sk_buff *skb, int gfp_mask); -extern int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, int gfp_mask); -extern struct sk_buff * skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom); -extern struct sk_buff * skb_copy_expand(const struct sk_buff *skb, - int newheadroom, - int newtailroom, - int priority); -extern struct sk_buff * skb_pad(struct sk_buff *skb, int pad); -#define dev_kfree_skb(a) kfree_skb(a) -extern void skb_over_panic(struct sk_buff *skb, int len, void *here); -extern void skb_under_panic(struct sk_buff *skb, int len, void *here); - -/* Internal */ -#define skb_shinfo(SKB) ((struct skb_shared_info *)((SKB)->end)) - -/** - * skb_queue_empty - check if a queue is empty - * @list: queue head - * - * Returns true if the queue is empty, false otherwise. - */ - -static inline int skb_queue_empty(struct sk_buff_head *list) -{ - return (list->next == (struct sk_buff *) list); -} - -/** - * skb_get - reference buffer - * @skb: buffer to reference - * - * Makes another reference to a socket buffer and returns a pointer - * to the buffer. - */ - -static inline struct sk_buff *skb_get(struct sk_buff *skb) -{ - atomic_inc(&skb->users); - return skb; -} - -/* - * If users==1, we are the only owner and are can avoid redundant - * atomic change. - */ - -/** - * kfree_skb - free an sk_buff - * @skb: buffer to free - * - * Drop a reference to the buffer and free it if the usage count has - * hit zero. - */ - -static inline void kfree_skb(struct sk_buff *skb) -{ - if (likely(atomic_read(&skb->users) == 1)) - smp_rmb(); - else if (likely(!atomic_dec_and_test(&skb->users))) - return; - __kfree_skb(skb); -} - -/** - * skb_cloned - is the buffer a clone - * @skb: buffer to check - * - * Returns true if the buffer was generated with skb_clone() and is - * one of multiple shared copies of the buffer. Cloned buffers are - * shared data so must not be written to under normal circumstances. - */ - -static inline int skb_cloned(struct sk_buff *skb) -{ - return skb->cloned && atomic_read(&skb_shinfo(skb)->dataref) != 1; -} - -/** - * skb_shared - is the buffer shared - * @skb: buffer to check - * - * Returns true if more than one person has a reference to this - * buffer. - */ - -static inline int skb_shared(struct sk_buff *skb) -{ - return (atomic_read(&skb->users) != 1); -} - -/** - * skb_share_check - check if buffer is shared and if so clone it - * @skb: buffer to check - * @pri: priority for memory allocation - * - * If the buffer is shared the buffer is cloned and the old copy - * drops a reference. 
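kfree_skb() above has a subtle fast path: when the refcount is already 1 the sole owner can skip the atomic decrement entirely (only a read barrier is needed), and otherwise the buffer is freed only when an atomic decrement-and-test reaches zero. The same shape in portable C11 atomics, as a rough sketch:

#include <stdatomic.h>
#include <stdlib.h>

struct buf {
    atomic_int users;
    char data[64];
};

static void buf_put(struct buf *b)
{
    /* Fast path: sole owner, no contended RMW. Slow path: dec-and-test. */
    if (atomic_load_explicit(&b->users, memory_order_acquire) != 1 &&
        atomic_fetch_sub(&b->users, 1) != 1)
        return;                     /* other references remain */
    free(b);
}

int main(void)
{
    struct buf *b = malloc(sizeof(*b));

    if (!b)
        return 1;
    atomic_init(&b->users, 2);      /* two notional owners */
    buf_put(b);                     /* drops to 1 */
    buf_put(b);                     /* last reference: actually frees */
    return 0;
}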
A new clone with a single reference is returned. - * If the buffer is not shared the original buffer is returned. When - * being called from interrupt status or with spinlocks held pri must - * be GFP_ATOMIC. - * - * NULL is returned on a memory allocation failure. - */ - -static inline struct sk_buff *skb_share_check(struct sk_buff *skb, int pri) -{ - if (skb_shared(skb)) { - struct sk_buff *nskb; - nskb = skb_clone(skb, pri); - kfree_skb(skb); - return nskb; - } - return skb; -} - - -/* - * Copy shared buffers into a new sk_buff. We effectively do COW on - * packets to handle cases where we have a local reader and forward - * and a couple of other messy ones. The normal one is tcpdumping - * a packet thats being forwarded. - */ - -/** - * skb_unshare - make a copy of a shared buffer - * @skb: buffer to check - * @pri: priority for memory allocation - * - * If the socket buffer is a clone then this function creates a new - * copy of the data, drops a reference count on the old copy and returns - * the new copy with the reference count at 1. If the buffer is not a clone - * the original buffer is returned. When called with a spinlock held or - * from interrupt state @pri must be %GFP_ATOMIC - * - * %NULL is returned on a memory allocation failure. - */ - -static inline struct sk_buff *skb_unshare(struct sk_buff *skb, int pri) -{ - struct sk_buff *nskb; - if(!skb_cloned(skb)) - return skb; - nskb=skb_copy(skb, pri); - kfree_skb(skb); /* Free our shared copy */ - return nskb; -} - -/** - * skb_peek - * @list_: list to peek at - * - * Peek an &sk_buff. Unlike most other operations you _MUST_ - * be careful with this one. A peek leaves the buffer on the - * list and someone else may run off with it. You must hold - * the appropriate locks or have a private queue to do this. - * - * Returns %NULL for an empty list or a pointer to the head element. - * The reference count is not incremented and the reference is therefore - * volatile. Use with caution. - */ - -static inline struct sk_buff *skb_peek(struct sk_buff_head *list_) -{ - struct sk_buff *list = ((struct sk_buff *)list_)->next; - if (list == (struct sk_buff *)list_) - list = NULL; - return list; -} - -/** - * skb_peek_tail - * @list_: list to peek at - * - * Peek an &sk_buff. Unlike most other operations you _MUST_ - * be careful with this one. A peek leaves the buffer on the - * list and someone else may run off with it. You must hold - * the appropriate locks or have a private queue to do this. - * - * Returns %NULL for an empty list or a pointer to the tail element. - * The reference count is not incremented and the reference is therefore - * volatile. Use with caution. - */ - -static inline struct sk_buff *skb_peek_tail(struct sk_buff_head *list_) -{ - struct sk_buff *list = ((struct sk_buff *)list_)->prev; - if (list == (struct sk_buff *)list_) - list = NULL; - return list; -} - -/** - * skb_queue_len - get queue length - * @list_: list to measure - * - * Return the length of an &sk_buff queue. - */ - -static inline __u32 skb_queue_len(struct sk_buff_head *list_) -{ - return(list_->qlen); -} - -static inline void skb_queue_head_init(struct sk_buff_head *list) -{ - spin_lock_init(&list->lock); - list->prev = (struct sk_buff *)list; - list->next = (struct sk_buff *)list; - list->qlen = 0; -} - -/* - * Insert an sk_buff at the start of a list. - * - * The "__skb_xxxx()" functions are the non-atomic ones that - * can only be called with interrupts disabled. 
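skb_peek()/skb_peek_tail() above rely on a layout pun: struct sk_buff_head deliberately begins with the same two pointers as struct sk_buff, so the head can be cast to struct sk_buff * and used as its own sentinel. A self-contained sketch of that trick with hypothetical types (the cast is exactly the kernel's idiom, though not strictly portable ISO C):

#include <stdio.h>

struct node { struct node *next, *prev; int val; };
struct list { struct node *next, *prev; };     /* first two members match */

static struct node *peek(struct list *l)
{
    struct node *n = ((struct node *)l)->next;
    return n == (struct node *)l ? NULL : n;   /* an empty list points at itself */
}

int main(void)
{
    struct list l;
    struct node a;

    l.next = l.prev = (struct node *)&l;       /* empty */
    printf("empty: %s\n", peek(&l) ? "node" : "NULL");

    a.val = 42;
    a.next = a.prev = (struct node *)&l;
    l.next = l.prev = &a;                      /* one element */
    printf("head:  %d\n", peek(&l)->val);      /* 42 */
    return 0;
}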
- */ - -/** - * __skb_queue_head - queue a buffer at the list head - * @list: list to use - * @newsk: buffer to queue - * - * Queue a buffer at the start of a list. This function takes no locks - * and you must therefore hold required locks before calling it. - * - * A buffer cannot be placed on two lists at the same time. - */ - -static inline void __skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) -{ - struct sk_buff *prev, *next; - - newsk->list = list; - list->qlen++; - prev = (struct sk_buff *)list; - next = prev->next; - newsk->next = next; - newsk->prev = prev; - next->prev = newsk; - prev->next = newsk; -} - - -/** - * skb_queue_head - queue a buffer at the list head - * @list: list to use - * @newsk: buffer to queue - * - * Queue a buffer at the start of the list. This function takes the - * list lock and can be used safely with other locking &sk_buff functions - * safely. - * - * A buffer cannot be placed on two lists at the same time. - */ - -static inline void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) -{ - unsigned long flags; - - spin_lock_irqsave(&list->lock, flags); - __skb_queue_head(list, newsk); - spin_unlock_irqrestore(&list->lock, flags); -} - -/** - * __skb_queue_tail - queue a buffer at the list tail - * @list: list to use - * @newsk: buffer to queue - * - * Queue a buffer at the end of a list. This function takes no locks - * and you must therefore hold required locks before calling it. - * - * A buffer cannot be placed on two lists at the same time. - */ - - -static inline void __skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) -{ - struct sk_buff *prev, *next; - - newsk->list = list; - list->qlen++; - next = (struct sk_buff *)list; - prev = next->prev; - newsk->next = next; - newsk->prev = prev; - next->prev = newsk; - prev->next = newsk; -} - -/** - * skb_queue_tail - queue a buffer at the list tail - * @list: list to use - * @newsk: buffer to queue - * - * Queue a buffer at the tail of the list. This function takes the - * list lock and can be used safely with other locking &sk_buff functions - * safely. - * - * A buffer cannot be placed on two lists at the same time. - */ - -static inline void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) -{ - unsigned long flags; - - spin_lock_irqsave(&list->lock, flags); - __skb_queue_tail(list, newsk); - spin_unlock_irqrestore(&list->lock, flags); -} - -/** - * __skb_dequeue - remove from the head of the queue - * @list: list to dequeue from - * - * Remove the head of the list. This function does not take any locks - * so must be used with appropriate locks held only. The head item is - * returned or %NULL if the list is empty. - */ - -static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list) -{ - struct sk_buff *next, *prev, *result; - - prev = (struct sk_buff *) list; - next = prev->next; - result = NULL; - if (next != prev) { - result = next; - next = next->next; - list->qlen--; - next->prev = prev; - prev->next = next; - result->next = NULL; - result->prev = NULL; - result->list = NULL; - } - return result; -} - -/** - * skb_dequeue - remove from the head of the queue - * @list: list to dequeue from - * - * Remove the head of the list. The list lock is taken so the function - * may be used safely with other locking list functions. The head item is - * returned or %NULL if the list is empty. 
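A pattern repeated throughout the hunk above: every queue operation exists both as a bare __skb_* helper, which assumes the caller already holds the queue lock, and as a wrapper that brackets it in spin_lock_irqsave()/spin_unlock_irqrestore(). A userspace sketch of the same layering with pthreads (compile with -pthread; the names are illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t qlock = PTHREAD_MUTEX_INITIALIZER;
static int qlen;                        /* toy queue state */

static void __enqueue(int v)            /* caller must hold qlock */
{
    (void)v;                            /* a real queue would link v in here */
    qlen++;
}

static void enqueue(int v)              /* locked wrapper over __enqueue() */
{
    pthread_mutex_lock(&qlock);
    __enqueue(v);
    pthread_mutex_unlock(&qlock);
}

int main(void)
{
    enqueue(1);
    enqueue(2);
    printf("qlen=%d\n", qlen);          /* 2 */
    return 0;
}

Keeping the unlocked variant public lets hot paths batch several operations under one lock acquisition, which is exactly why both spellings survive in this header.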
- */ - -static inline struct sk_buff *skb_dequeue(struct sk_buff_head *list) -{ - unsigned long flags; - struct sk_buff *result; - - spin_lock_irqsave(&list->lock, flags); - result = __skb_dequeue(list); - spin_unlock_irqrestore(&list->lock, flags); - return result; -} - -/* - * Insert a packet on a list. - */ - -static inline void __skb_insert(struct sk_buff *newsk, - struct sk_buff * prev, struct sk_buff *next, - struct sk_buff_head * list) -{ - newsk->next = next; - newsk->prev = prev; - next->prev = newsk; - prev->next = newsk; - newsk->list = list; - list->qlen++; -} - -/** - * skb_insert - insert a buffer - * @old: buffer to insert before - * @newsk: buffer to insert - * - * Place a packet before a given packet in a list. The list locks are taken - * and this function is atomic with respect to other list locked calls - * A buffer cannot be placed on two lists at the same time. - */ - -static inline void skb_insert(struct sk_buff *old, struct sk_buff *newsk) -{ - unsigned long flags; - - spin_lock_irqsave(&old->list->lock, flags); - __skb_insert(newsk, old->prev, old, old->list); - spin_unlock_irqrestore(&old->list->lock, flags); -} - -/* - * Place a packet after a given packet in a list. - */ - -static inline void __skb_append(struct sk_buff *old, struct sk_buff *newsk) -{ - __skb_insert(newsk, old, old->next, old->list); -} - -/** - * skb_append - append a buffer - * @old: buffer to insert after - * @newsk: buffer to insert - * - * Place a packet after a given packet in a list. The list locks are taken - * and this function is atomic with respect to other list locked calls. - * A buffer cannot be placed on two lists at the same time. - */ - - -static inline void skb_append(struct sk_buff *old, struct sk_buff *newsk) -{ - unsigned long flags; - - spin_lock_irqsave(&old->list->lock, flags); - __skb_append(old, newsk); - spin_unlock_irqrestore(&old->list->lock, flags); -} - -/* - * remove sk_buff from list. _Must_ be called atomically, and with - * the list known.. - */ - -static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) -{ - struct sk_buff * next, * prev; - - list->qlen--; - next = skb->next; - prev = skb->prev; - skb->next = NULL; - skb->prev = NULL; - skb->list = NULL; - next->prev = prev; - prev->next = next; -} - -/** - * skb_unlink - remove a buffer from a list - * @skb: buffer to remove - * - * Remove a packet from a list. The list locks are taken - * and this function is atomic with respect to other list locked calls - * - * Works even without knowing the list it is sitting on, which can be - * handy at times. It also means that THE LIST MUST EXIST when you - * unlink. Thus a list must have its contents unlinked before it is - * destroyed. - */ - -static inline void skb_unlink(struct sk_buff *skb) -{ - struct sk_buff_head *list = skb->list; - - if(list) { - unsigned long flags; - - spin_lock_irqsave(&list->lock, flags); - if(skb->list == list) - __skb_unlink(skb, skb->list); - spin_unlock_irqrestore(&list->lock, flags); - } -} - -/* XXX: more streamlined implementation */ - -/** - * __skb_dequeue_tail - remove from the tail of the queue - * @list: list to dequeue from - * - * Remove the tail of the list. This function does not take any locks - * so must be used with appropriate locks held only. The tail item is - * returned or %NULL if the list is empty.
- */ - -static inline struct sk_buff *__skb_dequeue_tail(struct sk_buff_head *list) -{ - struct sk_buff *skb = skb_peek_tail(list); - if (skb) - __skb_unlink(skb, list); - return skb; -} - -/** - * skb_dequeue_tail - remove from the tail of the queue - * @list: list to dequeue from - * - * Remove the tail of the list. The list lock is taken so the function - * may be used safely with other locking list functions. The tail item is - * returned or %NULL if the list is empty. - */ - -static inline struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) -{ - unsigned long flags; - struct sk_buff *result; - - spin_lock_irqsave(&list->lock, flags); - result = __skb_dequeue_tail(list); - spin_unlock_irqrestore(&list->lock, flags); - return result; -} - -static inline int skb_is_nonlinear(const struct sk_buff *skb) -{ - return skb->data_len; -} - -static inline unsigned int skb_headlen(const struct sk_buff *skb) -{ - return skb->len - skb->data_len; -} - -#define SKB_PAGE_ASSERT(skb) do { if (skb_shinfo(skb)->nr_frags) out_of_line_bug(); } while (0) -#define SKB_FRAG_ASSERT(skb) do { if (skb_shinfo(skb)->frag_list) out_of_line_bug(); } while (0) -#define SKB_LINEAR_ASSERT(skb) do { if (skb_is_nonlinear(skb)) out_of_line_bug(); } while (0) - -/* - * Add data to an sk_buff - */ - -static inline unsigned char *__skb_put(struct sk_buff *skb, unsigned int len) -{ - unsigned char *tmp=skb->tail; - SKB_LINEAR_ASSERT(skb); - skb->tail+=len; - skb->len+=len; - return tmp; -} - -/** - * skb_put - add data to a buffer - * @skb: buffer to use - * @len: amount of data to add - * - * This function extends the used data area of the buffer. If this would - * exceed the total buffer size the kernel will panic. A pointer to the - * first byte of the extra data is returned. - */ - -static inline unsigned char *skb_put(struct sk_buff *skb, unsigned int len) -{ - unsigned char *tmp=skb->tail; - SKB_LINEAR_ASSERT(skb); - skb->tail+=len; - skb->len+=len; - if(skb->tail>skb->end) { - skb_over_panic(skb, len, current_text_addr()); - } - return tmp; -} - -static inline unsigned char *__skb_push(struct sk_buff *skb, unsigned int len) -{ - skb->data-=len; - skb->len+=len; - return skb->data; -} - -/** - * skb_push - add data to the start of a buffer - * @skb: buffer to use - * @len: amount of data to add - * - * This function extends the used data area of the buffer at the buffer - * start. If this would exceed the total buffer headroom the kernel will - * panic. A pointer to the first byte of the extra data is returned. - */ - -static inline unsigned char *skb_push(struct sk_buff *skb, unsigned int len) -{ - skb->data-=len; - skb->len+=len; - if(skb->data<skb->head) { - skb_under_panic(skb, len, current_text_addr()); - } - return skb->data; -} - -static inline char *__skb_pull(struct sk_buff *skb, unsigned int len) -{ - skb->len-=len; - if (skb->len < skb->data_len) - out_of_line_bug(); - return skb->data+=len; -} - -/** - * skb_pull - remove data from the start of a buffer - * @skb: buffer to use - * @len: amount of data to remove - * - * This function removes data from the start of a buffer, returning - * the memory to the headroom. A pointer to the next data in the buffer - * is returned. Once the data has been pulled future pushes will overwrite - * the old data.
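The skb_put()/skb_push()/skb_pull() helpers above (and skb_reserve() just below) are pure pointer arithmetic over one flat allocation laid out as head <= data <= tail <= end. A runnable userspace sketch of the same four moves on a plain byte buffer:

#include <stdio.h>
#include <string.h>

struct pkt {                       /* stand-in for the four sk_buff cursors */
    unsigned char *head, *data, *tail, *end;
    unsigned int len;
};

int main(void)
{
    static unsigned char buffer[256];
    struct pkt p = { buffer, buffer, buffer, buffer + sizeof(buffer), 0 };

    p.data += 16; p.tail += 16;            /* skb_reserve(16): make headroom */

    memcpy(p.tail, "payload", 7);          /* skb_put(7): append at the tail */
    p.tail += 7; p.len += 7;

    p.data -= 4; p.len += 4;               /* skb_push(4): prepend a header */
    memcpy(p.data, "hdr!", 4);

    p.data += 4; p.len -= 4;               /* skb_pull(4): strip it again */

    printf("headroom=%td len=%u tailroom=%td\n",
           p.data - p.head, p.len, p.end - p.tail);   /* 16 7 233 */
    return 0;
}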
- */ - -static inline unsigned char * skb_pull(struct sk_buff *skb, unsigned int len) -{ - if (len > skb->len) - return NULL; - return __skb_pull(skb,len); -} - -extern unsigned char * __pskb_pull_tail(struct sk_buff *skb, int delta); - -static inline char *__pskb_pull(struct sk_buff *skb, unsigned int len) -{ - if (len > skb_headlen(skb) && - __pskb_pull_tail(skb, len-skb_headlen(skb)) == NULL) - return NULL; - skb->len -= len; - return skb->data += len; -} - -static inline unsigned char * pskb_pull(struct sk_buff *skb, unsigned int len) -{ - if (len > skb->len) - return NULL; - return __pskb_pull(skb,len); -} - -static inline int pskb_may_pull(struct sk_buff *skb, unsigned int len) -{ - if (len <= skb_headlen(skb)) - return 1; - if (len > skb->len) - return 0; - return (__pskb_pull_tail(skb, len-skb_headlen(skb)) != NULL); -} - -/** - * skb_headroom - bytes at buffer head - * @skb: buffer to check - * - * Return the number of bytes of free space at the head of an &sk_buff. - */ - -static inline int skb_headroom(const struct sk_buff *skb) -{ - return skb->data-skb->head; -} - -/** - * skb_tailroom - bytes at buffer end - * @skb: buffer to check - * - * Return the number of bytes of free space at the tail of an sk_buff - */ - -static inline int skb_tailroom(const struct sk_buff *skb) -{ - return skb_is_nonlinear(skb) ? 0 : skb->end-skb->tail; -} - -/** - * skb_reserve - adjust headroom - * @skb: buffer to alter - * @len: bytes to move - * - * Increase the headroom of an empty &sk_buff by reducing the tail - * room. This is only allowed for an empty buffer. - */ - -static inline void skb_reserve(struct sk_buff *skb, unsigned int len) -{ - skb->data+=len; - skb->tail+=len; -} - -extern int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc); - -static inline void __skb_trim(struct sk_buff *skb, unsigned int len) -{ - if (!skb->data_len) { - skb->len = len; - skb->tail = skb->data+len; - } else { - ___pskb_trim(skb, len, 0); - } -} - -/** - * skb_trim - remove end from a buffer - * @skb: buffer to alter - * @len: new length - * - * Cut the length of a buffer down by removing data from the tail. If - * the buffer is already under the length specified it is not modified. - */ - -static inline void skb_trim(struct sk_buff *skb, unsigned int len) -{ - if (skb->len > len) { - __skb_trim(skb, len); - } -} - - -static inline int __pskb_trim(struct sk_buff *skb, unsigned int len) -{ - if (!skb->data_len) { - skb->len = len; - skb->tail = skb->data+len; - return 0; - } else { - return ___pskb_trim(skb, len, 1); - } -} - -static inline int pskb_trim(struct sk_buff *skb, unsigned int len) -{ - if (len < skb->len) - return __pskb_trim(skb, len); - return 0; -} - -/** - * skb_orphan - orphan a buffer - * @skb: buffer to orphan - * - * If a buffer currently has an owner then we call the owner's - * destructor function and make the @skb unowned. The buffer continues - * to exist but is no longer charged to its former owner. - */ - - -static inline void skb_orphan(struct sk_buff *skb) -{ - if (skb->destructor) - skb->destructor(skb); - skb->destructor = NULL; - skb->sk = NULL; -} - -/** - * skb_queue_purge - empty a list - * @list: list to empty - * - * Delete all buffers on an &sk_buff list. Each buffer is removed from - * the list and one reference dropped. This function takes the list - * lock and is atomic with respect to other list locking functions.
- */ - - -static inline void skb_queue_purge(struct sk_buff_head *list) -{ - struct sk_buff *skb; - while ((skb=skb_dequeue(list))!=NULL) - kfree_skb(skb); -} - -/** - * __skb_queue_purge - empty a list - * @list: list to empty - * - * Delete all buffers on an &sk_buff list. Each buffer is removed from - * the list and one reference dropped. This function does not take the - * list lock and the caller must hold the relevant locks to use it. - */ - - -static inline void __skb_queue_purge(struct sk_buff_head *list) -{ - struct sk_buff *skb; - while ((skb=__skb_dequeue(list))!=NULL) - kfree_skb(skb); -} - -/** - * __dev_alloc_skb - allocate an skbuff for sending - * @length: length to allocate - * @gfp_mask: get_free_pages mask, passed to alloc_skb - * - * Allocate a new &sk_buff and assign it a usage count of one. The - * buffer has unspecified headroom built in. Users should allocate - * the headroom they think they need without accounting for the - * built in space. The built in space is used for optimisations. - * - * %NULL is returned if there is no free memory. - */ -#ifndef CONFIG_XEN -static inline struct sk_buff *__dev_alloc_skb(unsigned int length, - int gfp_mask) -{ - struct sk_buff *skb = alloc_skb(length+16, gfp_mask); - if (skb) - skb_reserve(skb,16); - return skb; -} -#else -extern struct sk_buff *__dev_alloc_skb(unsigned int length, int gfp_mask); -#endif - -/** - * dev_alloc_skb - allocate an skbuff for sending - * @length: length to allocate - * - * Allocate a new &sk_buff and assign it a usage count of one. The - * buffer has unspecified headroom built in. Users should allocate - * the headroom they think they need without accounting for the - * built in space. The built in space is used for optimisations. - * - * %NULL is returned if there is no free memory. Although this function - * allocates memory it can be called from an interrupt. - */ - -static inline struct sk_buff *dev_alloc_skb(unsigned int length) -{ - return __dev_alloc_skb(length, GFP_ATOMIC); -} - -/** - * skb_cow - copy header of skb when it is required - * @skb: buffer to cow - * @headroom: needed headroom - * - * If the skb passed lacks sufficient headroom or its data part - * is shared, data is reallocated. If reallocation fails, an error - * is returned and original skb is not changed. - * - * The result is skb with writable area skb->head...skb->tail - * and at least @headroom of space at head. - */ - -static inline int -skb_cow(struct sk_buff *skb, unsigned int headroom) -{ - int delta = (headroom > 16 ? headroom : 16) - skb_headroom(skb); - - if (delta < 0) - delta = 0; - - if (delta || skb_cloned(skb)) - return pskb_expand_head(skb, (delta+15)&~15, 0, GFP_ATOMIC); - return 0; -} - -/** - * skb_padto - pad an skbuff up to a minimal size - * @skb: buffer to pad - * @len: minimal length - * - * Pads up a buffer to ensure the trailing bytes exist and are - * blanked. If the buffer already contains sufficient data it - * is untouched. Returns the buffer, which may be a replacement - * for the original, or NULL for out of memory - in which case - * the original buffer is still freed. - */ - -static inline struct sk_buff *skb_padto(struct sk_buff *skb, unsigned int len) -{ - unsigned int size = skb->len; - if(likely(size >= len)) - return skb; - return skb_pad(skb, len-size); -} - -/** - * skb_linearize - convert paged skb to linear one - * @skb: buffer to linearize - * @gfp: allocation mode - * - * If there is no free memory -ENOMEM is returned, otherwise zero - * is returned and the old skb data released.
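skb_cow() above hides two policies in one line of arithmetic: headroom requests are floored at the same 16 bytes that __dev_alloc_skb() reserves, and any expansion is rounded up to a 16-byte multiple before pskb_expand_head() is called. That math, isolated into a runnable sketch:

#include <stdio.h>

/* Mirrors the delta computation in skb_cow() above. */
static int cow_delta(int wanted_headroom, int current_headroom)
{
    int delta = (wanted_headroom > 16 ? wanted_headroom : 16) - current_headroom;

    if (delta < 0)
        delta = 0;
    return (delta + 15) & ~15;          /* round up to a 16-byte multiple */
}

int main(void)
{
    printf("%d\n", cow_delta(2, 0));    /* 16: the 16-byte floor kicks in */
    printf("%d\n", cow_delta(40, 10));  /* 32: a 30-byte shortfall, rounded */
    printf("%d\n", cow_delta(8, 64));   /* 0: already enough headroom */
    return 0;
}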
*/ -int skb_linearize(struct sk_buff *skb, int gfp); - -static inline void *kmap_skb_frag(const skb_frag_t *frag) -{ -#ifdef CONFIG_HIGHMEM - if (in_irq()) - out_of_line_bug(); - - local_bh_disable(); -#endif - return kmap_atomic(frag->page, KM_SKB_DATA_SOFTIRQ); -} - -static inline void kunmap_skb_frag(void *vaddr) -{ - kunmap_atomic(vaddr, KM_SKB_DATA_SOFTIRQ); -#ifdef CONFIG_HIGHMEM - local_bh_enable(); -#endif -} - -#define skb_queue_walk(queue, skb) \ - for (skb = (queue)->next; \ - (skb != (struct sk_buff *)(queue)); \ - skb=skb->next) - - -extern struct sk_buff * skb_recv_datagram(struct sock *sk,unsigned flags,int noblock, int *err); -extern unsigned int datagram_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); -extern int skb_copy_datagram(const struct sk_buff *from, int offset, char *to,int size); -extern int skb_copy_datagram_iovec(const struct sk_buff *from, int offset, struct iovec *to,int size); -extern int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, u8 *to, int len, unsigned int *csump); -extern int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb, int hlen, struct iovec *iov); -extern void skb_free_datagram(struct sock * sk, struct sk_buff *skb); - -extern unsigned int skb_checksum(const struct sk_buff *skb, int offset, int len, unsigned int csum); -extern int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len); -extern unsigned int skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to, int len, unsigned int csum); -extern void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); - -extern void skb_init(void); -extern void skb_add_mtu(int mtu); - -#ifdef CONFIG_NETFILTER -static inline void -nf_conntrack_put(struct nf_ct_info *nfct) -{ - if (nfct && atomic_dec_and_test(&nfct->master->use)) - nfct->master->destroy(nfct->master); -} -static inline void -nf_conntrack_get(struct nf_ct_info *nfct) -{ - if (nfct) - atomic_inc(&nfct->master->use); -} -static inline void -nf_reset(struct sk_buff *skb) -{ - nf_conntrack_put(skb->nfct); - skb->nfct = NULL; -#ifdef CONFIG_NETFILTER_DEBUG - skb->nf_debug = 0; -#endif -} -#else /* CONFIG_NETFILTER */ -static inline void nf_reset(struct sk_buff *skb) {} -#endif /* CONFIG_NETFILTER */ - -#endif /* __KERNEL__ */ -#endif /* _LINUX_SKBUFF_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/include/linux/timer.h --- a/linux-2.4.30-xen-sparse/include/linux/timer.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,77 +0,0 @@ -#ifndef _LINUX_TIMER_H -#define _LINUX_TIMER_H - -#include <linux/config.h> -#include <linux/list.h> - -/* - * In Linux 2.4, static timers have been removed from the kernel. - * Timers may be dynamically created and destroyed, and should be initialized - * by a call to init_timer() upon creation. - * - * The "data" field enables use of a common timeout function for several - * timeouts. You can use this field to distinguish between the different - * invocations. 
- */ -struct timer_list { - struct list_head list; - unsigned long expires; - unsigned long data; - void (*function)(unsigned long); -}; - -extern void add_timer(struct timer_list * timer); -extern int del_timer(struct timer_list * timer); -#ifdef CONFIG_NO_IDLE_HZ -extern struct timer_list *next_timer_event(void); -#endif - -#ifdef CONFIG_SMP -extern int del_timer_sync(struct timer_list * timer); -extern void sync_timers(void); -#else -#define del_timer_sync(t) del_timer(t) -#define sync_timers() do { } while (0) -#endif - -/* - * mod_timer is a more efficient way to update the expire field of an - * active timer (if the timer is inactive it will be activated) - * mod_timer(a,b) is equivalent to del_timer(a); a->expires = b; add_timer(a). - * If the timer is known to be not pending (ie, in the handler), mod_timer - * is less efficient than a->expires = b; add_timer(a). - */ -int mod_timer(struct timer_list *timer, unsigned long expires); - -extern void it_real_fn(unsigned long); - -static inline void init_timer(struct timer_list * timer) -{ - timer->list.next = timer->list.prev = NULL; -} - -static inline int timer_pending (const struct timer_list * timer) -{ - return timer->list.next != NULL; -} - -/* - * These inlines deal with timer wrapping correctly. You are - * strongly encouraged to use them - * 1. Because people otherwise forget - * 2. Because if the timer wrap changes in future you wont have to - * alter your driver code. - * - * time_after(a,b) returns true if the time a is after time b. - * - * Do this with "<0" and ">=0" to only test the sign of the result. A - * good compiler would generate better code (and a really good compiler - * wouldn't care). Gcc is currently neither. - */ -#define time_after(a,b) ((long)(b) - (long)(a) < 0) -#define time_before(a,b) time_after(b,a) - -#define time_after_eq(a,b) ((long)(a) - (long)(b) >= 0) -#define time_before_eq(a,b) time_after_eq(b,a) - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/kernel/time.c --- a/linux-2.4.30-xen-sparse/kernel/time.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,415 +0,0 @@ -/* - * linux/kernel/time.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * This file contains the interface functions for the various - * time related system calls: time, stime, gettimeofday, settimeofday, - * adjtime - */ -/* - * Modification history kernel/time.c - * - * 1993-09-02 Philip Gladstone - * Created file with time related functions from sched.c and adjtimex() - * 1993-10-08 Torsten Duwe - * adjtime interface update and CMOS clock write code - * 1995-08-13 Torsten Duwe - * kernel PLL updated to 1994-12-13 specs (rfc-1589) - * 1999-01-16 Ulrich Windl - * Introduced error checking for many cases in adjtimex(). - * Updated NTP code according to technical memorandum Jan '96 - * "A Kernel Model for Precision Timekeeping" by Dave Mills - * Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10) - * (Even though the technical memorandum forbids it) - */ - -#include <linux/mm.h> -#include <linux/timex.h> -#include <linux/smp_lock.h> - -#include <asm/uaccess.h> - -/* - * The timezone where the local system is located. Used as a default by some - * programs who obtain this value by using gettimeofday. - */ -struct timezone sys_tz; - -/* The xtime_lock is not only serializing the xtime read/writes but it's also - serializing all accesses to the global NTP variables now. 
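The time_after()/time_before() macros in the timer.h hunk above compare jiffies via signed subtraction precisely so the result stays correct across the unsigned wrap, as their comment promises. A runnable demonstration, written with explicit 32-bit types so the wrap case is easy to trigger:

#include <stdio.h>
#include <stdint.h>

#define TIME_AFTER(a, b) ((int32_t)(b) - (int32_t)(a) < 0)  /* same trick as time_after() */

int main(void)
{
    uint32_t now = 0xfffffff0u;         /* 'jiffies' just before wrapping */
    uint32_t deadline = now + 0x20;     /* wraps around to 0x00000010 */

    printf("time_after(deadline, now) = %d\n", TIME_AFTER(deadline, now)); /* 1 */
    printf("time_after(now, deadline) = %d\n", TIME_AFTER(now, deadline)); /* 0 */
    printf("naive deadline > now      = %d\n", deadline > now);            /* 0: fooled by the wrap */
    return 0;
}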
*/ -extern rwlock_t xtime_lock; - -#if !defined(__alpha__) && !defined(__ia64__) - -/* - * sys_time() can be implemented in user-level using - * sys_gettimeofday(). Is this for backwards compatibility? If so, - * why not move it into the appropriate arch directory (for those - * architectures that need it). - * - * XXX This function is NOT 64-bit clean! - */ -asmlinkage long sys_time(int * tloc) -{ - struct timeval now; - int i; - - do_gettimeofday(&now); - i = now.tv_sec; - if (tloc) { - if (put_user(i,tloc)) - i = -EFAULT; - } - return i; -} - -#if !defined(CONFIG_XEN) - -/* - * sys_stime() can be implemented in user-level using - * sys_settimeofday(). Is this for backwards compatibility? If so, - * why not move it into the appropriate arch directory (for those - * architectures that need it). - */ - -asmlinkage long sys_stime(int * tptr) -{ - int value; - - if (!capable(CAP_SYS_TIME)) - return -EPERM; - if (get_user(value, tptr)) - return -EFAULT; - write_lock_irq(&xtime_lock); - vxtime_lock(); - xtime.tv_sec = value; - xtime.tv_usec = 0; - vxtime_unlock(); - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; - write_unlock_irq(&xtime_lock); - return 0; -} - -#endif - -#endif - -asmlinkage long sys_gettimeofday(struct timeval *tv, struct timezone *tz) -{ - if (tv) { - struct timeval ktv; - do_gettimeofday(&ktv); - if (copy_to_user(tv, &ktv, sizeof(ktv))) - return -EFAULT; - } - if (tz) { - if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) - return -EFAULT; - } - return 0; -} - -/* - * Adjust the time obtained from the CMOS to be UTC time instead of - * local time. - * - * This is ugly, but preferable to the alternatives. Otherwise we - * would either need to write a program to do it in /etc/rc (and risk - * confusion if the program gets run more than once; it would also be - * hard to make the program warp the clock precisely n hours) or - * compile in the timezone information into the kernel. Bad, bad.... - * - * - TYT, 1992-01-01 - * - * The best thing to do is to keep the CMOS clock in universal time (UTC) - * as real UNIX machines always do it. This avoids all headaches about - * daylight saving times and warping kernel clocks. - */ -inline static void warp_clock(void) -{ - write_lock_irq(&xtime_lock); - vxtime_lock(); - xtime.tv_sec += sys_tz.tz_minuteswest * 60; - vxtime_unlock(); - write_unlock_irq(&xtime_lock); -} - -/* - * In case for some reason the CMOS clock has not already been running - * in UTC, but in some local time: The first time we set the timezone, - * we will warp the clock so that it is ticking UTC time instead of - * local time. Presumably, if someone is setting the timezone then we - * are running in an environment where the programs understand about - * timezones. This should be done at boot time in the /etc/rc script, - * as soon as possible, so that the clock can be set right. Otherwise, - * various programs will get confused when the clock gets warped. - */ - -int do_sys_settimeofday(struct timeval *tv, struct timezone *tz) -{ - static int firsttime = 1; - - if (!capable(CAP_SYS_TIME)) - return -EPERM; - - if (tz) { - /* SMP safe, global irq locking makes it work. */ - sys_tz = *tz; - if (firsttime) { - firsttime = 0; - if (!tv) - warp_clock(); - } - } - if (tv) - { - /* SMP safe, again the code in arch/foo/time.c should - * globally block out interrupts when it runs. 
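sys_gettimeofday() and do_sys_settimeofday() above are the kernel side of the familiar libc pair, and the capable(CAP_SYS_TIME) checks are why unprivileged callers get EPERM. For reference, the matching userspace calls look like this (the set step will fail unless run with privilege):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>

int main(void)
{
    struct timeval tv;

    if (gettimeofday(&tv, NULL) == 0)
        printf("now: %ld.%06ld\n", (long)tv.tv_sec, (long)tv.tv_usec);

    tv.tv_sec += 1;                      /* try to nudge the clock forward */
    if (settimeofday(&tv, NULL) != 0)    /* EPERM without CAP_SYS_TIME */
        printf("settimeofday: %s\n", strerror(errno));
    return 0;
}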
- */ - do_settimeofday(tv); - } - return 0; -} - -asmlinkage long sys_settimeofday(struct timeval *tv, struct timezone *tz) -{ - struct timeval new_tv; - struct timezone new_tz; - - if (tv) { - if (copy_from_user(&new_tv, tv, sizeof(*tv))) - return -EFAULT; - } - if (tz) { - if (copy_from_user(&new_tz, tz, sizeof(*tz))) - return -EFAULT; - } - - return do_sys_settimeofday(tv ? &new_tv : NULL, tz ? &new_tz : NULL); -} - -long pps_offset; /* pps time offset (us) */ -long pps_jitter = MAXTIME; /* time dispersion (jitter) (us) */ - -long pps_freq; /* frequency offset (scaled ppm) */ -long pps_stabil = MAXFREQ; /* frequency dispersion (scaled ppm) */ - -long pps_valid = PPS_VALID; /* pps signal watchdog counter */ - -int pps_shift = PPS_SHIFT; /* interval duration (s) (shift) */ - -long pps_jitcnt; /* jitter limit exceeded */ -long pps_calcnt; /* calibration intervals */ -long pps_errcnt; /* calibration errors */ -long pps_stbcnt; /* stability limit exceeded */ - -/* hook for a loadable hardpps kernel module */ -void (*hardpps_ptr)(struct timeval *); - -/* adjtimex mainly allows reading (and writing, if superuser) of - * kernel time-keeping variables. used by xntpd. - */ -int do_adjtimex(struct timex *txc) -{ - long ltemp, mtemp, save_adjust; - int result; - - /* In order to modify anything, you gotta be super-user! */ - if (txc->modes && !capable(CAP_SYS_TIME)) - return -EPERM; - - /* Now we validate the data before disabling interrupts */ - - if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) - /* singleshot must not be used with any other mode bits */ - if (txc->modes != ADJ_OFFSET_SINGLESHOT) - return -EINVAL; - - if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET)) - /* adjustment Offset limited to +- .512 seconds */ - if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE ) - return -EINVAL; - - /* if the quartz is off by more than 10% something is VERY wrong ! */ - if (txc->modes & ADJ_TICK) - if (txc->tick < 900000/HZ || txc->tick > 1100000/HZ) - return -EINVAL; - - write_lock_irq(&xtime_lock); - result = time_state; /* mostly `TIME_OK' */ - - /* Save for later - semantics of adjtime is to return old value */ - save_adjust = time_adjust; - -#if 0 /* STA_CLOCKERR is never set yet */ - time_status &= ~STA_CLOCKERR; /* reset STA_CLOCKERR */ -#endif - /* If there are input parameters, then process them */ - if (txc->modes) - { - if (txc->modes & ADJ_STATUS) /* only set allowed bits */ - time_status = (txc->status & ~STA_RONLY) | - (time_status & STA_RONLY); - - if (txc->modes & ADJ_FREQUENCY) { /* p. 22 */ - if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) { - result = -EINVAL; - goto leave; - } - time_freq = txc->freq - pps_freq; - } - - if (txc->modes & ADJ_MAXERROR) { - if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) { - result = -EINVAL; - goto leave; - } - time_maxerror = txc->maxerror; - } - - if (txc->modes & ADJ_ESTERROR) { - if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) { - result = -EINVAL; - goto leave; - } - time_esterror = txc->esterror; - } - - if (txc->modes & ADJ_TIMECONST) { /* p. 
24 */ - if (txc->constant < 0) { /* NTP v4 uses values > 6 */ - result = -EINVAL; - goto leave; - } - time_constant = txc->constant; - } - - if (txc->modes & ADJ_OFFSET) { /* values checked earlier */ - if (txc->modes == ADJ_OFFSET_SINGLESHOT) { - /* adjtime() is independent from ntp_adjtime() */ - time_adjust = txc->offset; - } - else if ( time_status & (STA_PLL | STA_PPSTIME) ) { - ltemp = (time_status & (STA_PPSTIME | STA_PPSSIGNAL)) == - (STA_PPSTIME | STA_PPSSIGNAL) ? - pps_offset : txc->offset; - - /* - * Scale the phase adjustment and - * clamp to the operating range. - */ - if (ltemp > MAXPHASE) - time_offset = MAXPHASE << SHIFT_UPDATE; - else if (ltemp < -MAXPHASE) - time_offset = -(MAXPHASE << SHIFT_UPDATE); - else - time_offset = ltemp << SHIFT_UPDATE; - - /* - * Select whether the frequency is to be controlled - * and in which mode (PLL or FLL). Clamp to the operating - * range. Ugly multiply/divide should be replaced someday. - */ - - if (time_status & STA_FREQHOLD || time_reftime == 0) - time_reftime = xtime.tv_sec; - mtemp = xtime.tv_sec - time_reftime; - time_reftime = xtime.tv_sec; - if (time_status & STA_FLL) { - if (mtemp >= MINSEC) { - ltemp = (time_offset / mtemp) << (SHIFT_USEC - - SHIFT_UPDATE); - if (ltemp < 0) - time_freq -= -ltemp >> SHIFT_KH; - else - time_freq += ltemp >> SHIFT_KH; - } else /* calibration interval too short (p. 12) */ - result = TIME_ERROR; - } else { /* PLL mode */ - if (mtemp < MAXSEC) { - ltemp *= mtemp; - if (ltemp < 0) - time_freq -= -ltemp >> (time_constant + - time_constant + - SHIFT_KF - SHIFT_USEC); - else - time_freq += ltemp >> (time_constant + - time_constant + - SHIFT_KF - SHIFT_USEC); - } else /* calibration interval too long (p. 12) */ - result = TIME_ERROR; - } - if (time_freq > time_tolerance) - time_freq = time_tolerance; - else if (time_freq < -time_tolerance) - time_freq = -time_tolerance; - } /* STA_PLL || STA_PPSTIME */ - } /* txc->modes & ADJ_OFFSET */ - if (txc->modes & ADJ_TICK) { - /* if the quartz is off by more than 10% something is - VERY wrong ! */ - if (txc->tick < 900000/HZ || txc->tick > 1100000/HZ) { - result = -EINVAL; - goto leave; - } - tick = txc->tick; - } - } /* txc->modes */ -leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 - || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) != 0 - && (time_status & STA_PPSSIGNAL) == 0) - /* p. 24, (b) */ - || ((time_status & (STA_PPSTIME|STA_PPSJITTER)) - == (STA_PPSTIME|STA_PPSJITTER)) - /* p. 24, (c) */ - || ((time_status & STA_PPSFREQ) != 0 - && (time_status & (STA_PPSWANDER|STA_PPSERROR)) != 0)) - /* p. 
24, (d) */ - result = TIME_ERROR; - - if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) - txc->offset = save_adjust; - else { - if (time_offset < 0) - txc->offset = -(-time_offset >> SHIFT_UPDATE); - else - txc->offset = time_offset >> SHIFT_UPDATE; - } - txc->freq = time_freq + pps_freq; - txc->maxerror = time_maxerror; - txc->esterror = time_esterror; - txc->status = time_status; - txc->constant = time_constant; - txc->precision = time_precision; - txc->tolerance = time_tolerance; - txc->tick = tick; - txc->ppsfreq = pps_freq; - txc->jitter = pps_jitter >> PPS_AVG; - txc->shift = pps_shift; - txc->stabil = pps_stabil; - txc->jitcnt = pps_jitcnt; - txc->calcnt = pps_calcnt; - txc->errcnt = pps_errcnt; - txc->stbcnt = pps_stbcnt; - write_unlock_irq(&xtime_lock); - do_gettimeofday(&txc->time); - return(result); -} - -asmlinkage long sys_adjtimex(struct timex *txc_p) -{ - struct timex txc; /* Local copy of parameter */ - int ret; - - /* Copy the user data space into the kernel copy - * structure. But bear in mind that the structures - * may change - */ - if(copy_from_user(&txc, txc_p, sizeof(struct timex))) - return -EFAULT; - ret = do_adjtimex(&txc); - return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/kernel/timer.c --- a/linux-2.4.30-xen-sparse/kernel/timer.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,968 +0,0 @@ -/* - * linux/kernel/timer.c - * - * Kernel internal timers, kernel timekeeping, basic process system calls - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. - * - * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 - * "A Kernel Model for Precision Timekeeping" by Dave Mills - * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to - * serialize accesses to xtime/lost_ticks). - * Copyright (C) 1998 Andrea Arcangeli - * 1999-03-10 Improved NTP compatibility by Ulrich Windl - */ - -#include <linux/config.h> -#include <linux/mm.h> -#include <linux/timex.h> -#include <linux/delay.h> -#include <linux/smp_lock.h> -#include <linux/interrupt.h> -#include <linux/kernel_stat.h> - -#include <asm/uaccess.h> - -/* - * Timekeeping variables - */ - -long tick = (1000000 + HZ/2) / HZ; /* timer interrupt period */ - -/* The current time */ -struct timeval xtime __attribute__ ((aligned (16))); - -/* Don't completely fail for HZ > 500. */ -int tickadj = 500/HZ ? 
: 1; /* microsecs */ - -DECLARE_TASK_QUEUE(tq_timer); -DECLARE_TASK_QUEUE(tq_immediate); - -/* - * phase-lock loop variables - */ -/* TIME_ERROR prevents overwriting the CMOS clock */ -int time_state = TIME_OK; /* clock synchronization status */ -int time_status = STA_UNSYNC; /* clock status bits */ -long time_offset; /* time adjustment (us) */ -long time_constant = 2; /* pll time constant */ -long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ -long time_precision = 1; /* clock precision (us) */ -long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ -long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ -long time_phase; /* phase offset (scaled us) */ -long time_freq = ((1000000 + HZ/2) % HZ - HZ/2) << SHIFT_USEC; - /* frequency offset (scaled ppm)*/ -long time_adj; /* tick adjust (scaled 1 / HZ) */ -long time_reftime; /* time at last adjustment (s) */ - -long time_adjust; -long time_adjust_step; - -unsigned long event; - -extern int do_setitimer(int, struct itimerval *, struct itimerval *); - -unsigned long volatile jiffies; - -unsigned int * prof_buffer; -unsigned long prof_len; -unsigned long prof_shift; - -/* - * Event timer code - */ -#define TVN_BITS 6 -#define TVR_BITS 8 -#define TVN_SIZE (1 << TVN_BITS) -#define TVR_SIZE (1 << TVR_BITS) -#define TVN_MASK (TVN_SIZE - 1) -#define TVR_MASK (TVR_SIZE - 1) - -struct timer_vec { - int index; - struct list_head vec[TVN_SIZE]; -}; - -struct timer_vec_root { - int index; - struct list_head vec[TVR_SIZE]; -}; - -static struct timer_vec tv5; -static struct timer_vec tv4; -static struct timer_vec tv3; -static struct timer_vec tv2; -static struct timer_vec_root tv1; - -static struct timer_vec * const tvecs[] = { - (struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5 -}; - -static struct list_head * run_timer_list_running; - -#define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0])) - -void init_timervecs (void) -{ - int i; - - for (i = 0; i < TVN_SIZE; i++) { - INIT_LIST_HEAD(tv5.vec + i); - INIT_LIST_HEAD(tv4.vec + i); - INIT_LIST_HEAD(tv3.vec + i); - INIT_LIST_HEAD(tv2.vec + i); - } - for (i = 0; i < TVR_SIZE; i++) - INIT_LIST_HEAD(tv1.vec + i); -} - -static unsigned long timer_jiffies; - -static inline void internal_add_timer(struct timer_list *timer) -{ - /* - * must be cli-ed when calling this - */ - unsigned long expires = timer->expires; - unsigned long idx = expires - timer_jiffies; - struct list_head * vec; - - if (run_timer_list_running) - vec = run_timer_list_running; - else if (idx < TVR_SIZE) { - int i = expires & TVR_MASK; - vec = tv1.vec + i; - } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { - int i = (expires >> TVR_BITS) & TVN_MASK; - vec = tv2.vec + i; - } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { - int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; - vec = tv3.vec + i; - } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { - int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; - vec = tv4.vec + i; - } else if ((signed long) idx < 0) { - /* can happen if you add a timer with expires == jiffies, - * or you set a timer to go off in the past - */ - vec = tv1.vec + tv1.index; - } else if (idx <= 0xffffffffUL) { - int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; - vec = tv5.vec + i; - } else { - /* Can only get here on architectures with 64-bit jiffies */ - INIT_LIST_HEAD(&timer->list); - return; - } - /* - * Timers are FIFO! 
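internal_add_timer() above picks a bucket by distance-to-expiry: the nearest 256 jiffies map straight into tv1 via the low TVR_BITS of the expiry value, and each coarser wheel consumes another TVN_BITS. A sketch of just that bucket-selection arithmetic (ignoring the already-expired and 64-bit corner cases handled in the original):

#include <stdio.h>

#define TVN_BITS 6
#define TVR_BITS 8
#define TVN_MASK ((1 << TVN_BITS) - 1)
#define TVR_MASK ((1 << TVR_BITS) - 1)

/* Which wheel (0..4) and slot a timer lands in, mirroring the
 * cascade of range checks in internal_add_timer(). */
static void pick_bucket(unsigned long expires, unsigned long now,
                        int *wheel, int *slot)
{
    unsigned long idx = expires - now;

    if (idx < (1UL << TVR_BITS)) {
        *wheel = 0; *slot = expires & TVR_MASK;
    } else if (idx < (1UL << (TVR_BITS + TVN_BITS))) {
        *wheel = 1; *slot = (expires >> TVR_BITS) & TVN_MASK;
    } else if (idx < (1UL << (TVR_BITS + 2 * TVN_BITS))) {
        *wheel = 2; *slot = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
    } else if (idx < (1UL << (TVR_BITS + 3 * TVN_BITS))) {
        *wheel = 3; *slot = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
    } else {
        *wheel = 4; *slot = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
    }
}

int main(void)
{
    int w, s;

    pick_bucket(1000 + 10, 1000, &w, &s);      /* near expiry: wheel 0 */
    printf("wheel %d slot %d\n", w, s);
    pick_bucket(1000 + 70000, 1000, &w, &s);   /* far expiry: a coarser wheel */
    printf("wheel %d slot %d\n", w, s);
    return 0;
}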
- */ - list_add(&timer->list, vec->prev); -} - -/* Initialize both explicitly - let's try to have them in the same cache line */ -spinlock_t timerlist_lock = SPIN_LOCK_UNLOCKED; - -#ifdef CONFIG_SMP -volatile struct timer_list * volatile running_timer; -#define timer_enter(t) do { running_timer = t; mb(); } while (0) -#define timer_exit() do { running_timer = NULL; } while (0) -#define timer_is_running(t) (running_timer == t) -#define timer_synchronize(t) while (timer_is_running(t)) barrier() -#else -#define timer_enter(t) do { } while (0) -#define timer_exit() do { } while (0) -#endif - -void add_timer(struct timer_list *timer) -{ - unsigned long flags; - - spin_lock_irqsave(&timerlist_lock, flags); - if (timer_pending(timer)) - goto bug; - internal_add_timer(timer); - spin_unlock_irqrestore(&timerlist_lock, flags); - return; -bug: - spin_unlock_irqrestore(&timerlist_lock, flags); - printk("bug: kernel timer added twice at %p.\n", - __builtin_return_address(0)); -} - -static inline int detach_timer (struct timer_list *timer) -{ - if (!timer_pending(timer)) - return 0; - list_del(&timer->list); - return 1; -} - -int mod_timer(struct timer_list *timer, unsigned long expires) -{ - int ret; - unsigned long flags; - - spin_lock_irqsave(&timerlist_lock, flags); - timer->expires = expires; - ret = detach_timer(timer); - internal_add_timer(timer); - spin_unlock_irqrestore(&timerlist_lock, flags); - return ret; -} - -int del_timer(struct timer_list * timer) -{ - int ret; - unsigned long flags; - - spin_lock_irqsave(&timerlist_lock, flags); - ret = detach_timer(timer); - timer->list.next = timer->list.prev = NULL; - spin_unlock_irqrestore(&timerlist_lock, flags); - return ret; -} - -#ifdef CONFIG_SMP -void sync_timers(void) -{ - spin_unlock_wait(&global_bh_lock); -} - -/* - * SMP specific function to delete periodic timer. - * Caller must disable by some means restarting the timer - * for new. Upon exit the timer is not queued and handler is not running - * on any CPU. It returns number of times, which timer was deleted - * (for reference counting). - */ - -int del_timer_sync(struct timer_list * timer) -{ - int ret = 0; - - for (;;) { - unsigned long flags; - int running; - - spin_lock_irqsave(&timerlist_lock, flags); - ret += detach_timer(timer); - timer->list.next = timer->list.prev = 0; - running = timer_is_running(timer); - spin_unlock_irqrestore(&timerlist_lock, flags); - - if (!running) - break; - - timer_synchronize(timer); - } - - return ret; -} -#endif - - -static inline void cascade_timers(struct timer_vec *tv) -{ - /* cascade all the timers from tv up one level */ - struct list_head *head, *curr, *next; - - head = tv->vec + tv->index; - curr = head->next; - /* - * We are removing _all_ timers from the list, so we don't have to - * detach them individually, just clear the list afterwards. 
- */ - while (curr != head) { - struct timer_list *tmp; - - tmp = list_entry(curr, struct timer_list, list); - next = curr->next; - list_del(curr); // not needed - internal_add_timer(tmp); - curr = next; - } - INIT_LIST_HEAD(head); - tv->index = (tv->index + 1) & TVN_MASK; -} - -static inline void run_timer_list(void) -{ - spin_lock_irq(&timerlist_lock); - while ((long)(jiffies - timer_jiffies) >= 0) { - LIST_HEAD(queued); - struct list_head *head, *curr; - if (!tv1.index) { - int n = 1; - do { - cascade_timers(tvecs[n]); - } while (tvecs[n]->index == 1 && ++n < NOOF_TVECS); - } - run_timer_list_running = &queued; -repeat: - head = tv1.vec + tv1.index; - curr = head->next; - if (curr != head) { - struct timer_list *timer; - void (*fn)(unsigned long); - unsigned long data; - - timer = list_entry(curr, struct timer_list, list); - fn = timer->function; - data= timer->data; - - detach_timer(timer); - timer->list.next = timer->list.prev = NULL; - timer_enter(timer); - spin_unlock_irq(&timerlist_lock); - fn(data); - spin_lock_irq(&timerlist_lock); - timer_exit(); - goto repeat; - } - run_timer_list_running = NULL; - ++timer_jiffies; - tv1.index = (tv1.index + 1) & TVR_MASK; - - curr = queued.next; - while (curr != &queued) { - struct timer_list *timer; - - timer = list_entry(curr, struct timer_list, list); - curr = curr->next; - internal_add_timer(timer); - } - } - spin_unlock_irq(&timerlist_lock); -} - -#ifdef CONFIG_NO_IDLE_HZ -/* - * Find out when the next timer event is due to happen. This - * is used on S/390 to stop all activity when all cpus are idle. - * And in XenoLinux to achieve the same. - * The timerlist_lock must be acquired before calling this function. - */ -struct timer_list *next_timer_event(void) -{ - struct timer_list *nte, *tmp; - struct list_head *lst; - int i, j; - - /* Look for the next timer event in tv1. */ - i = 0; - j = tvecs[0]->index; - do { - struct list_head *head = tvecs[0]->vec + j; - if (!list_empty(head)) { - nte = list_entry(head->next, struct timer_list, list); - goto found; - } - j = (j + 1) & TVR_MASK; - } while (j != tv1.index); - - /* No event found in tv1. Check tv2-tv5. */ - for (i = 1; i < NOOF_TVECS; i++) { - j = tvecs[i]->index; - do { - nte = NULL; - list_for_each(lst, tvecs[i]->vec + j) { - tmp = list_entry(lst, struct timer_list, list); - if (nte == NULL || - time_before(tmp->expires, nte->expires)) - nte = tmp; - } - if (nte) - goto found; - j = (j + 1) & TVN_MASK; - } while (j != tvecs[i]->index); - } - return NULL; -found: - /* Found timer event in tvecs[i]->vec[j] */ - if (j < tvecs[i]->index && i < NOOF_TVECS-1) { - /* - * The search wrapped. We need to look at the next list - * from tvecs[i+1] that would cascade into tvecs[i]. - */ - list_for_each(lst, tvecs[i+1]->vec+tvecs[i+1]->index) { - tmp = list_entry(lst, struct timer_list, list); - if (time_before(tmp->expires, nte->expires)) - nte = tmp; - } - } - return nte; -} -#endif - -spinlock_t tqueue_lock = SPIN_LOCK_UNLOCKED; - -void tqueue_bh(void) -{ - run_task_queue(&tq_timer); -} - -void immediate_bh(void) -{ - run_task_queue(&tq_immediate); -} - -/* - * this routine handles the overflow of the microsecond field - * - * The tricky bits of code to handle the accurate clock support - * were provided by Dave Mills (Mills@xxxxxxxx) of NTP fame. - * They were originally developed for SUN and DEC kernels. - * All the kudos should go to Dave for this stuff. 
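second_overflow(), whose body begins just below, advances a small leap-second state machine once per second: TIME_OK arms to TIME_INS or TIME_DEL, an insert replays the final second of the day (conventionally shown as 23:59:60), and TIME_OOP/TIME_WAIT drain back to TIME_OK. A stripped-down sketch of the insert path, with states renamed for brevity:

#include <stdio.h>

enum state { OK, INS, OOP, WAIT };

int main(void)
{
    enum state st = INS;                 /* a leap insert has been armed */
    long sec = 86400L * 3 - 2;           /* two seconds before midnight UTC */
    int i;

    for (i = 0; i < 4; i++, sec++) {
        if (st == INS && sec % 86400 == 0) {
            sec--;                       /* set the clock back: replay the last second */
            st = OOP;
            printf("inserting leap second\n");
        } else if (st == OOP) {
            st = WAIT;
        } else if (st == WAIT) {
            st = OK;
        }
        printf("sec=%ld state=%d\n", sec, st);
    }
    return 0;
}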
- * - */ -static void second_overflow(void) -{ - long ltemp; - - /* Bump the maxerror field */ - time_maxerror += time_tolerance >> SHIFT_USEC; - if ( time_maxerror > NTP_PHASE_LIMIT ) { - time_maxerror = NTP_PHASE_LIMIT; - time_status |= STA_UNSYNC; - } - - /* - * Leap second processing. If in leap-insert state at - * the end of the day, the system clock is set back one - * second; if in leap-delete state, the system clock is - * set ahead one second. The microtime() routine or - * external clock driver will insure that reported time - * is always monotonic. The ugly divides should be - * replaced. - */ - switch (time_state) { - - case TIME_OK: - if (time_status & STA_INS) - time_state = TIME_INS; - else if (time_status & STA_DEL) - time_state = TIME_DEL; - break; - - case TIME_INS: - if (xtime.tv_sec % 86400 == 0) { - xtime.tv_sec--; - time_state = TIME_OOP; - printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); - } - break; - - case TIME_DEL: - if ((xtime.tv_sec + 1) % 86400 == 0) { - xtime.tv_sec++; - time_state = TIME_WAIT; - printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); - } - break; - - case TIME_OOP: - time_state = TIME_WAIT; - break; - - case TIME_WAIT: - if (!(time_status & (STA_INS | STA_DEL))) - time_state = TIME_OK; - } - - /* - * Compute the phase adjustment for the next second. In - * PLL mode, the offset is reduced by a fixed factor - * times the time constant. In FLL mode the offset is - * used directly. In either mode, the maximum phase - * adjustment for each second is clamped so as to spread - * the adjustment over not more than the number of - * seconds between updates. - */ - if (time_offset < 0) { - ltemp = -time_offset; - if (!(time_status & STA_FLL)) - ltemp >>= SHIFT_KG + time_constant; - if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) - ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; - time_offset += ltemp; - time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); - } else { - ltemp = time_offset; - if (!(time_status & STA_FLL)) - ltemp >>= SHIFT_KG + time_constant; - if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) - ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; - time_offset -= ltemp; - time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); - } - - /* - * Compute the frequency estimate and additional phase - * adjustment due to frequency error for the next - * second. When the PPS signal is engaged, gnaw on the - * watchdog counter and update the frequency computed by - * the pll and the PPS signal. - */ - pps_valid++; - if (pps_valid == PPS_VALID) { /* PPS signal lost */ - pps_jitter = MAXTIME; - pps_stabil = MAXFREQ; - time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | - STA_PPSWANDER | STA_PPSERROR); - } - ltemp = time_freq + pps_freq; - if (ltemp < 0) - time_adj -= -ltemp >> - (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); - else - time_adj += ltemp >> - (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); - -#if HZ == 100 - /* Compensate for (HZ==100) != (1 << SHIFT_HZ). - * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14) - */ - if (time_adj < 0) - time_adj -= (-time_adj >> 2) + (-time_adj >> 5); - else - time_adj += (time_adj >> 2) + (time_adj >> 5); -#endif -} - -/* in the NTP reference this is called "hardclock()" */ -static void update_wall_time_one_tick(void) -{ - if ( (time_adjust_step = time_adjust) != 0 ) { - /* We are doing an adjtime thing. - * - * Prepare time_adjust_step to be within bounds. - * Note that a positive time_adjust means we want the clock - * to run faster. 
- * - * Limit the amount of the step to be in the range - * -tickadj .. +tickadj - */ - if (time_adjust > tickadj) - time_adjust_step = tickadj; - else if (time_adjust < -tickadj) - time_adjust_step = -tickadj; - - /* Reduce by this step the amount of time left */ - time_adjust -= time_adjust_step; - } - xtime.tv_usec += tick + time_adjust_step; - /* - * Advance the phase, once it gets to one microsecond, then - * advance the tick more. - */ - time_phase += time_adj; - if (time_phase <= -FINEUSEC) { - long ltemp = -time_phase >> SHIFT_SCALE; - time_phase += ltemp << SHIFT_SCALE; - xtime.tv_usec -= ltemp; - } - else if (time_phase >= FINEUSEC) { - long ltemp = time_phase >> SHIFT_SCALE; - time_phase -= ltemp << SHIFT_SCALE; - xtime.tv_usec += ltemp; - } -} - -/* - * Using a loop looks inefficient, but "ticks" is - * usually just one (we shouldn't be losing ticks, - * we're doing this this way mainly for interrupt - * latency reasons, not because we think we'll - * have lots of lost timer ticks - */ -static void update_wall_time(unsigned long ticks) -{ - do { - ticks--; - update_wall_time_one_tick(); - } while (ticks); - - while (xtime.tv_usec >= 1000000) { - xtime.tv_usec -= 1000000; - xtime.tv_sec++; - second_overflow(); - } -} - -static inline void do_process_times(struct task_struct *p, - unsigned long user, unsigned long system) -{ - unsigned long psecs; - - psecs = (p->times.tms_utime += user); - psecs += (p->times.tms_stime += system); - if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_cur) { - /* Send SIGXCPU every second.. */ - if (!(psecs % HZ)) - send_sig(SIGXCPU, p, 1); - /* and SIGKILL when we go over max.. */ - if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_max) - send_sig(SIGKILL, p, 1); - } -} - -static inline void do_it_virt(struct task_struct * p, unsigned long ticks) -{ - unsigned long it_virt = p->it_virt_value; - - if (it_virt) { - it_virt -= ticks; - if (!it_virt) { - it_virt = p->it_virt_incr; - send_sig(SIGVTALRM, p, 1); - } - p->it_virt_value = it_virt; - } -} - -static inline void do_it_prof(struct task_struct *p) -{ - unsigned long it_prof = p->it_prof_value; - - if (it_prof) { - if (--it_prof == 0) { - it_prof = p->it_prof_incr; - send_sig(SIGPROF, p, 1); - } - p->it_prof_value = it_prof; - } -} - -void update_one_process(struct task_struct *p, unsigned long user, - unsigned long system, int cpu) -{ - p->per_cpu_utime[cpu] += user; - p->per_cpu_stime[cpu] += system; - do_process_times(p, user, system); - do_it_virt(p, user); - do_it_prof(p); -} - -/* - * Called from the timer interrupt handler to charge one tick to the current - * process. user_tick is 1 if the tick is user time, 0 for system. - */ -void update_process_times(int user_tick) -{ - struct task_struct *p = current; - int cpu = smp_processor_id(), system = user_tick ^ 1; - - update_one_process(p, user_tick, system, cpu); - if (p->pid) { - if (--p->counter <= 0) { - p->counter = 0; - /* - * SCHED_FIFO is priority preemption, so this is - * not the place to decide whether to reschedule a - * SCHED_FIFO task or not - Bhavesh Davda - */ - if (p->policy != SCHED_FIFO) { - p->need_resched = 1; - } - } - if (p->nice > 0) - kstat.per_cpu_nice[cpu] += user_tick; - else - kstat.per_cpu_user[cpu] += user_tick; - kstat.per_cpu_system[cpu] += system; - } else if (local_bh_count(cpu) || local_irq_count(cpu) > 1) - kstat.per_cpu_system[cpu] += system; -} - -/* - * Called from the timer interrupt handler to charge a couple of ticks - * to the current process. 
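- *
- * [Editor's note: the single-tick update_process_times() above
- *  charges each tick to exactly one of the per-cpu user, nice or
- *  system counters (system = user_tick ^ 1) and decrements
- *  p->counter, the dynamic timeslice, forcing a reschedule of
- *  non-SCHED_FIFO tasks when it reaches zero. The _us variant below
- *  appears to be the batched form added for the Xen port, where
- *  several ticks' worth of lost time can be delivered at once.]
- *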
- */ -void update_process_times_us(int user_ticks, int system_ticks) -{ - struct task_struct *p = current; - int cpu = smp_processor_id(); - - update_one_process(p, user_ticks, system_ticks, cpu); - if (p->pid) { - p->counter -= user_ticks + system_ticks; - if (p->counter <= 0) { - p->counter = 0; - p->need_resched = 1; - } - if (p->nice > 0) - kstat.per_cpu_nice[cpu] += user_ticks; - else - kstat.per_cpu_user[cpu] += user_ticks; - kstat.per_cpu_system[cpu] += system_ticks; - } else if (local_bh_count(cpu) || local_irq_count(cpu) > 1) - kstat.per_cpu_system[cpu] += system_ticks; -} - -/* - * Nr of active tasks - counted in fixed-point numbers - */ -static unsigned long count_active_tasks(void) -{ - struct task_struct *p; - unsigned long nr = 0; - - read_lock(&tasklist_lock); - for_each_task(p) { - if ((p->state == TASK_RUNNING || - (p->state & TASK_UNINTERRUPTIBLE))) - nr += FIXED_1; - } - read_unlock(&tasklist_lock); - return nr; -} - -/* - * Hmm.. Changed this, as the GNU make sources (load.c) seems to - * imply that avenrun[] is the standard name for this kind of thing. - * Nothing else seems to be standardized: the fractional size etc - * all seem to differ on different machines. - */ -unsigned long avenrun[3]; - -static inline void calc_load(unsigned long ticks) -{ - unsigned long active_tasks; /* fixed-point */ - static int count = LOAD_FREQ; - - count -= ticks; - while (count < 0) { - count += LOAD_FREQ; - active_tasks = count_active_tasks(); - CALC_LOAD(avenrun[0], EXP_1, active_tasks); - CALC_LOAD(avenrun[1], EXP_5, active_tasks); - CALC_LOAD(avenrun[2], EXP_15, active_tasks); - } -} - -/* jiffies at the most recent update of wall time */ -unsigned long wall_jiffies; - -/* - * This spinlock protect us from races in SMP while playing with xtime. -arca - */ -rwlock_t xtime_lock = RW_LOCK_UNLOCKED; - -static inline void update_times(void) -{ - unsigned long ticks; - - /* - * update_times() is run from the raw timer_bh handler so we - * just know that the irqs are locally enabled and so we don't - * need to save/restore the flags of the local CPU here. -arca - */ - write_lock_irq(&xtime_lock); - vxtime_lock(); - - ticks = jiffies - wall_jiffies; - if (ticks) { - wall_jiffies += ticks; - update_wall_time(ticks); - } - vxtime_unlock(); - write_unlock_irq(&xtime_lock); - calc_load(ticks); -} - -void timer_bh(void) -{ - update_times(); - run_timer_list(); -} - -void do_timer(struct pt_regs *regs) -{ - (*(unsigned long *)&jiffies)++; -#ifndef CONFIG_SMP - /* SMP process accounting uses the local APIC timer */ - - update_process_times(user_mode(regs)); -#endif - mark_bh(TIMER_BH); - if (TQ_ACTIVE(tq_timer)) - mark_bh(TQUEUE_BH); -} - -void do_timer_ticks(int ticks) -{ - (*(unsigned long *)&jiffies) += ticks; - mark_bh(TIMER_BH); - if (TQ_ACTIVE(tq_timer)) - mark_bh(TQUEUE_BH); -} - -#if !defined(__alpha__) && !defined(__ia64__) - -/* - * For backwards compatibility? This can be done in libc so Alpha - * and all newer ports shouldn't need it. - */ -asmlinkage unsigned long sys_alarm(unsigned int seconds) -{ - struct itimerval it_new, it_old; - unsigned int oldalarm; - - it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; - it_new.it_value.tv_sec = seconds; - it_new.it_value.tv_usec = 0; - do_setitimer(ITIMER_REAL, &it_new, &it_old); - oldalarm = it_old.it_value.tv_sec; - /* ehhh.. We can't return 0 if we have an alarm pending.. 
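- *
- * [Editor's note: calc_load() above keeps the 1/5/15-minute load
- *  averages as fixed-point values with FIXED_1 = 1 << 11. CALC_LOAD
- *  is an exponentially-weighted moving average; a sketch of the
- *  macro as defined in <linux/sched.h>:
- *
- *      load *= exp;
- *      load += active * (FIXED_1 - exp);
- *      load >>= FSHIFT;
- *
- *  where exp = EXP_1 = 1884 for the 1-minute average, i.e.
- *  2048 * e^(-5s/1min) rounded, with LOAD_FREQ = 5*HZ.]
- *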
*/ - /* And we'd better return too much than too little anyway */ - if (it_old.it_value.tv_usec) - oldalarm++; - return oldalarm; -} - -#endif - -#ifndef __alpha__ - -/* - * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this - * should be moved into arch/i386 instead? - */ - -/** - * sys_getpid - return the thread group id of the current process - * - * Note, despite the name, this returns the tgid not the pid. The tgid and - * the pid are identical unless CLONE_THREAD was specified on clone() in - * which case the tgid is the same in all threads of the same group. - * - * This is SMP safe as current->tgid does not change. - */ -asmlinkage long sys_getpid(void) -{ - return current->tgid; -} - -/* - * This is not strictly SMP safe: p_opptr could change - * from under us. However, rather than getting any lock - * we can use an optimistic algorithm: get the parent - * pid, and go back and check that the parent is still - * the same. If it has changed (which is extremely unlikely - * indeed), we just try again.. - * - * NOTE! This depends on the fact that even if we _do_ - * get an old value of "parent", we can happily dereference - * the pointer: we just can't necessarily trust the result - * until we know that the parent pointer is valid. - * - * The "mb()" macro is a memory barrier - a synchronizing - * event. It also makes sure that gcc doesn't optimize - * away the necessary memory references.. The barrier doesn't - * have to have all that strong semantics: on x86 we don't - * really require a synchronizing instruction, for example. - * The barrier is more important for code generation than - * for any real memory ordering semantics (even if there is - * a small window for a race, using the old pointer is - * harmless for a while). - */ -asmlinkage long sys_getppid(void) -{ - int pid; - struct task_struct * me = current; - struct task_struct * parent; - - parent = me->p_opptr; - for (;;) { - pid = parent->pid; -#if CONFIG_SMP -{ - struct task_struct *old = parent; - mb(); - parent = me->p_opptr; - if (old != parent) - continue; -} -#endif - break; - } - return pid; -} - -asmlinkage long sys_getuid(void) -{ - /* Only we change this so SMP safe */ - return current->uid; -} - -asmlinkage long sys_geteuid(void) -{ - /* Only we change this so SMP safe */ - return current->euid; -} - -asmlinkage long sys_getgid(void) -{ - /* Only we change this so SMP safe */ - return current->gid; -} - -asmlinkage long sys_getegid(void) -{ - /* Only we change this so SMP safe */ - return current->egid; -} - -#endif - -/* Thread ID - the internal kernel "pid" */ -asmlinkage long sys_gettid(void) -{ - return current->pid; -} - -asmlinkage long sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp) -{ - struct timespec t; - unsigned long expire; - - if(copy_from_user(&t, rqtp, sizeof(struct timespec))) - return -EFAULT; - - if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 || t.tv_sec < 0) - return -EINVAL; - - - if (t.tv_sec == 0 && t.tv_nsec <= 2000000L && - current->policy != SCHED_OTHER) - { - /* - * Short delay requests up to 2 ms will be handled with - * high precision by a busy wait for all real-time processes. - * - * Its important on SMP not to do this holding locks. 
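- *
- * [Editor's note: the lock-free read in sys_getppid() above is an
- *  optimistic retry loop; in sketch form for the SMP case:
- *
- *      do {
- *          parent = me->p_opptr;
- *          pid = parent->pid;
- *          mb();                    ... order the reads
- *      } while (parent != me->p_opptr);
- *
- *  Dereferencing a momentarily stale parent is harmless here, as the
- *  original comment notes; only the returned pid must be consistent
- *  with a parent pointer that was valid at some instant.]
- *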
- */ - udelay((t.tv_nsec + 999) / 1000); - return 0; - } - - expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); - - current->state = TASK_INTERRUPTIBLE; - expire = schedule_timeout(expire); - - if (expire) { - if (rmtp) { - jiffies_to_timespec(expire, &t); - if (copy_to_user(rmtp, &t, sizeof(struct timespec))) - return -EFAULT; - } - return -EINTR; - } - return 0; -} - diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/mkbuildtree --- a/linux-2.4.30-xen-sparse/mkbuildtree Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,292 +0,0 @@ -#!/bin/bash - -# mkbuildtree <build tree> -# -# Creates symbolic links in <build tree> for the sparse tree -# in the current directory. - -# Script to determine the relative path between two directories. -# Copyright (c) D. J. Hawkey Jr. 2002 -# Fixed for Xen project by K. Fraser in 2003. -abs_to_rel () -{ - local CWD SRCPATH - - if [ "$1" != "/" -a "${1##*[^/]}" = "/" ]; then - SRCPATH=${1%?} - else - SRCPATH=$1 - fi - if [ "$2" != "/" -a "${2##*[^/]}" = "/" ]; then - DESTPATH=${2%?} - else - DESTPATH=$2 - fi - - CWD=$PWD - [ "${1%%[^/]*}" != "/" ] && cd $1 && SRCPATH=$PWD - [ "${2%%[^/]*}" != "/" ] && cd $2 && DESTPATH=$PWD - [ "$CWD" != "$PWD" ] && cd $CWD - - BASEPATH=$SRCPATH - - [ "$SRCPATH" = "$DESTPATH" ] && DESTPATH="." && return - [ "$SRCPATH" = "/" ] && DESTPATH=${DESTPATH#?} && return - - while [ "$BASEPATH/" != "${DESTPATH%${DESTPATH#$BASEPATH/}}" ]; do - BASEPATH=${BASEPATH%/*} - done - - SRCPATH=${SRCPATH#$BASEPATH} - DESTPATH=${DESTPATH#$BASEPATH} - DESTPATH=${DESTPATH#?} - while [ -n "$SRCPATH" ]; do - SRCPATH=${SRCPATH%/*} - DESTPATH="../$DESTPATH" - done - - [ -z "$BASEPATH" ] && BASEPATH="/" - [ "${DESTPATH##*[^/]}" = "/" ] && DESTPATH=${DESTPATH%?} -} - -# relative_lndir <target_dir> -# Creates a tree of symlinks in the current working directory that mirror -# real files in <target_dir>. <target_dir> should be relative to the current -# working directory. Symlinks in <target_dir> are ignored. Source-control files -# are ignored. -relative_lndir () -{ - local SYMLINK_DIR REAL_DIR pref i j - SYMLINK_DIR=$PWD - REAL_DIR=$1 - ( - cd $REAL_DIR - for i in `find . -type d | grep -v SCCS`; do - [ -d $SYMLINK_DIR/$i ] || mkdir -p $SYMLINK_DIR/$i - ( - cd $i - pref=`echo $i | sed -e 's#/[^/]*#../#g' -e 's#^\.##'` - for j in `find . -maxdepth 1 -type f -o -type l`; do - ln -sf ${pref}${REAL_DIR}/$i/$j ${SYMLINK_DIR}/$i/$j - done - ) - done - ) -} - -[ "$1" == "" ] && { echo "Syntax: $0 <linux tree to xenify>"; exit 1; } - -# Get absolute path to the destination directory -pushd . >/dev/null -cd ${1} || { echo "cannot cd to ${1}"; exit 1; } -AD=$PWD -popd >/dev/null - -# Get absolute path to the source directory -AS=`pwd` - -# Get path to source, relative to destination -abs_to_rel ${AD} ${AS} -RS=$DESTPATH - -# Remove old copies of files and directories at the destination -for i in `find . -type f -o -type l` ; do rm -f ${AD}/${i#./} ; done - -# We now work from the destination directory -cd ${AD} || { echo "cannot cd to ${AD}"; exit 1; } - -# Remove old symlinks -for i in `find . -type l`; do rm -f $i; done - -# Create symlinks of files and directories which exist in the sparse source -relative_lndir ${RS} -rm -f mkbuildtree - -set ${RS}/../linux-2.6.*-xen-sparse -[ "$1" == "${RS}/../linux-2.6.*-xen-parse" ] && { echo "no Linux 2.6 sparse tree at ${RS}/../linux-2.6.*-xen-sparse"; exit 1; } -LINUX_26="$1" - - -# Create links to the shared definitions of the Xen interfaces. 
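-# [Editor's note: abs_to_rel above derives the relative path from its
-#  first argument to its second purely with shell prefix/suffix
-#  stripping; e.g. "abs_to_rel /a/b/c /a/d" leaves DESTPATH=../../d.
-#  relative_lndir then mirrors a real tree into the current directory
-#  as symlinks, skipping SCCS metadata. The commands below use it to
-#  populate include/asm-xen/xen-public from xen/include/public.]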
-rm -rf ${AD}/include/asm-xen/xen-public -mkdir ${AD}/include/asm-xen/xen-public -cd ${AD}/include/asm-xen/xen-public -relative_lndir ../../../${RS}/../xen/include/public - -# Create a link to the shared definitions for the control interface -cd ${AD}/include/asm-xen - -## Symlinks for files: -## - which are identical in the i386 and xen-i386 architecture-dependent -## subdirectories. -## - which are identical in the Linux 2.6 and Linux 2.4 ports. - -cd ${AD}/include/asm-xen -ln -sf ../asm-i386/a.out.h -ln -sf ../asm-i386/apicdef.h -ln -sf ../asm-i386/apic.h -ln -sf ../asm-i386/atomic.h -ln -sf ../asm-i386/bitops.h -ln -sf ../asm-i386/boot.h -ln -sf ../asm-i386/byteorder.h -ln -sf ../asm-i386/cache.h -ln -sf ../asm-i386/checksum.h -ln -sf ../asm-i386/cpufeature.h -ln -sf ../asm-i386/current.h -ln -sf ../asm-i386/debugreg.h -ln -sf ../asm-i386/delay.h -ln -sf ../asm-i386/div64.h -ln -sf ../asm-i386/dma.h -ln -sf ../asm-i386/elf.h -ln -sf ../asm-i386/errno.h -ln -sf ../asm-i386/fcntl.h -ln -sf ../asm-i386/floppy.h -ln -sf ../asm-i386/hardirq.h -ln -sf ../asm-i386/hdreg.h -ln -sf ../asm-i386/i387.h -ln -sf ../asm-i386/ide.h -ln -sf ../asm-i386/init.h -ln -sf ../asm-i386/io_apic.h -ln -sf ../asm-i386/ioctl.h -ln -sf ../asm-i386/ioctls.h -ln -sf ../asm-i386/ipcbuf.h -ln -sf ../asm-i386/ipc.h -ln -sf ../asm-i386/kmap_types.h -ln -sf ../asm-i386/ldt.h -ln -sf ../asm-i386/linux_logo.h -ln -sf ../asm-i386/locks.h -ln -sf ../asm-i386/math_emu.h -ln -sf ../asm-i386/mc146818rtc.h -ln -sf ../asm-i386/mca_dma.h -ln -sf ../asm-i386/mman.h -ln -sf ../asm-i386/mmu.h -ln -sf ../asm-i386/mmx.h -ln -sf ../asm-i386/mpspec.h -ln -sf ../asm-i386/msgbuf.h -ln -sf ../asm-i386/msr.h -ln -sf ../asm-i386/mtrr.h -ln -sf ../asm-i386/namei.h -ln -sf ../asm-i386/param.h -ln -sf ../asm-i386/parport.h -ln -sf ../asm-i386/pgtable-3level.h -ln -sf ../asm-i386/poll.h -ln -sf ../asm-i386/posix_types.h -ln -sf ../asm-i386/ptrace.h -ln -sf ../asm-i386/resource.h -ln -sf ../asm-i386/rwlock.h -ln -sf ../asm-i386/rwsem.h -ln -sf ../asm-i386/scatterlist.h -ln -sf ../asm-i386/semaphore.h -ln -sf ../asm-i386/sembuf.h -ln -sf ../asm-i386/serial.h -ln -sf ../asm-i386/setup.h -ln -sf ../asm-i386/shmbuf.h -ln -sf ../asm-i386/shmparam.h -ln -sf ../asm-i386/sigcontext.h -ln -sf ../asm-i386/siginfo.h -ln -sf ../asm-i386/signal.h -ln -sf ../asm-i386/smplock.h -ln -sf ../asm-i386/socket.h -ln -sf ../asm-i386/sockios.h -ln -sf ../asm-i386/softirq.h -ln -sf ../asm-i386/spinlock.h -ln -sf ../asm-i386/statfs.h -ln -sf ../asm-i386/stat.h -ln -sf ../asm-i386/string-486.h -ln -sf ../asm-i386/string.h -ln -sf ../asm-i386/termbits.h -ln -sf ../asm-i386/termios.h -ln -sf ../asm-i386/timex.h -ln -sf ../asm-i386/tlb.h -ln -sf ../asm-i386/types.h -ln -sf ../asm-i386/uaccess.h -ln -sf ../asm-i386/ucontext.h -ln -sf ../asm-i386/unaligned.h -ln -sf ../asm-i386/unistd.h -ln -sf ../asm-i386/user.h -ln -sf ../asm-i386/vm86.h -ln -sf ../../${LINUX_26}/include/asm-xen/balloon.h -ln -sf ../../${LINUX_26}/include/asm-xen/ctrl_if.h -ln -sf ../../${LINUX_26}/include/asm-xen/evtchn.h -ln -sf ../../${LINUX_26}/include/asm-xen/gnttab.h -ln -sf ../../${LINUX_26}/include/asm-xen/hypervisor.h -ln -sf ../../${LINUX_26}/include/asm-xen/xen_proc.h -ln -sf ../../${LINUX_26}/include/asm-xen/asm-i386/synch_bitops.h -ln -sf ../../${LINUX_26}/include/asm-xen/asm-i386/hypercall.h - -mkdir -p linux-public && cd linux-public -ln -sf ../../../${LINUX_26}/include/asm-xen/linux-public/privcmd.h -ln -sf ../../../${LINUX_26}/include/asm-xen/linux-public/suspend.h - -cd 
${AD}/arch/xen/kernel -ln -sf ../../i386/kernel/i387.c -ln -sf ../../i386/kernel/init_task.c -ln -sf ../../i386/kernel/pci-i386.c -ln -sf ../../i386/kernel/pci-i386.h -ln -sf ../../i386/kernel/ptrace.c -ln -sf ../../i386/kernel/semaphore.c -ln -sf ../../i386/kernel/sys_i386.c -ln -sf ../../../${LINUX_26}/arch/xen/kernel/ctrl_if.c -ln -sf ../../../${LINUX_26}/arch/xen/kernel/evtchn.c -ln -sf ../../../${LINUX_26}/arch/xen/kernel/fixup.c -ln -sf ../../../${LINUX_26}/arch/xen/kernel/gnttab.c -ln -sf ../../../${LINUX_26}/arch/xen/kernel/reboot.c -ln -sf ../../../${LINUX_26}/arch/xen/kernel/skbuff.c - -cd ${AD}/arch/xen/lib -ln -sf ../../i386/lib/checksum.S -ln -sf ../../i386/lib/dec_and_lock.c -ln -sf ../../i386/lib/getuser.S -ln -sf ../../i386/lib/iodebug.c -ln -sf ../../i386/lib/memcpy.c -ln -sf ../../i386/lib/mmx.c -ln -sf ../../i386/lib/old-checksum.c -ln -sf ../../i386/lib/strstr.c -ln -sf ../../i386/lib/usercopy.c -ln -sf ../../../${LINUX_26}/arch/xen/kernel/xen_proc.c - -cd ${AD}/arch/xen/mm -ln -sf ../../i386/mm/extable.c -ln -sf ../../i386/mm/pageattr.c -ln -sf ../../../${LINUX_26}/arch/xen/i386/mm/hypervisor.c - -cd ${AD}/arch/xen/drivers/balloon -ln -sf ../../../../${LINUX_26}/drivers/xen/balloon/balloon.c - -cd ${AD}/arch/xen/drivers/console -ln -sf ../../../../${LINUX_26}/drivers/xen/console/console.c - -cd ${AD}/arch/xen/drivers/dom0 -ln -sf ../../../../${LINUX_26}/drivers/xen/privcmd/privcmd.c core.c - -cd ${AD}/arch/xen/drivers/evtchn -ln -sf ../../../../${LINUX_26}/drivers/xen/evtchn/evtchn.c - -cd ${AD}/arch/xen/drivers/netif/frontend -ln -sf ../../../../../${LINUX_26}/drivers/xen/netfront/netfront.c main.c - -cd ${AD}/arch/xen/drivers/netif/backend -ln -sf ../../../../../${LINUX_26}/drivers/xen/netback/common.h -ln -sf ../../../../../${LINUX_26}/drivers/xen/netback/control.c -ln -sf ../../../../../${LINUX_26}/drivers/xen/netback/interface.c -ln -sf ../../../../../${LINUX_26}/drivers/xen/netback/netback.c main.c - -cd ${AD}/arch/xen/drivers/blkif/backend -ln -sf ../../../../../${LINUX_26}/drivers/xen/blkback/common.h -ln -sf ../../../../../${LINUX_26}/drivers/xen/blkback/blkback.c main.c -ln -sf ../../../../../${LINUX_26}/drivers/xen/blkback/control.c -ln -sf ../../../../../${LINUX_26}/drivers/xen/blkback/interface.c -ln -sf ../../../../../${LINUX_26}/drivers/xen/blkback/vbd.c - -cd ${AD}/arch/xen/drivers/blkif/frontend -ln -sf ../../../../../${LINUX_26}/drivers/xen/blkfront/blkfront.c - -cd ${AD}/arch/xen/drivers/usbif/frontend -ln -sf ../../../../../${LINUX_26}/drivers/xen/usbfront/usbfront.c main.c -ln -sf ../../../../../${LINUX_26}/drivers/xen/usbfront/xhci.h - -cd ${AD}/arch/xen/drivers/usbif/backend -ln -sf ../../../../../${LINUX_26}/drivers/xen/usbback/common.h -ln -sf ../../../../../${LINUX_26}/drivers/xen/usbback/control.c -ln -sf ../../../../../${LINUX_26}/drivers/xen/usbback/interface.c -ln -sf ../../../../../${LINUX_26}/drivers/xen/usbback/usbback.c main.c diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/mm/highmem.c --- a/linux-2.4.30-xen-sparse/mm/highmem.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,461 +0,0 @@ -/* - * High memory handling common code and variables. - * - * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@xxxxxxx - * Gerhard Wichert, Siemens AG, Gerhard.Wichert@xxxxxxxxxxxxxx - * - * - * Redesigned the x86 32-bit VM architecture to deal with - * 64-bit physical space. With current x86 CPUs this - * means up to 64 Gigabytes physical RAM. 
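- *
- * [Editor's note: on i386 this is the PAE configuration, where only
- *  the low ~896 MB of RAM is permanently mapped into the kernel's
- *  virtual address space; pages above that ("highmem") must be mapped
- *  on demand through the pkmap window managed in this file, e.g.:
- *
- *      char *p = kmap(page);    ... may sleep; pins a pkmap slot
- *      memset(p, 0, PAGE_SIZE);
- *      kunmap(page);            ... drops the slot's reference
- *
- *  kmap()/kunmap() reduce to kmap_high()/kunmap_high() below for
- *  genuinely high pages and to page_address() otherwise.]
- *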
- * - * Rewrote high memory support to move the page cache into - * high memory. Implemented permanent (schedulable) kmaps - * based on Linus' idea. - * - * Copyright (C) 1999 Ingo Molnar <mingo@xxxxxxxxxx> - */ - -#include <linux/mm.h> -#include <linux/pagemap.h> -#include <linux/highmem.h> -#include <linux/swap.h> -#include <linux/slab.h> - -/* - * Virtual_count is not a pure "count". - * 0 means that it is not mapped, and has not been mapped - * since a TLB flush - it is usable. - * 1 means that there are no users, but it has been mapped - * since the last TLB flush - so we can't use it. - * n means that there are (n-1) current users of it. - */ -static int pkmap_count[LAST_PKMAP]; -static unsigned int last_pkmap_nr; -static spinlock_cacheline_t kmap_lock_cacheline = {SPIN_LOCK_UNLOCKED}; -#define kmap_lock kmap_lock_cacheline.lock - -pte_t * pkmap_page_table; - -static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); - -static void flush_all_zero_pkmaps(void) -{ - int i; - - flush_cache_all(); - - for (i = 0; i < LAST_PKMAP; i++) { - struct page *page; - - /* - * zero means we don't have anything to do, - * >1 means that it is still in use. Only - * a count of 1 means that it is free but - * needs to be unmapped - */ - if (pkmap_count[i] != 1) - continue; - pkmap_count[i] = 0; - - /* sanity check */ - if (pte_none(pkmap_page_table[i])) - BUG(); - - /* - * Don't need an atomic fetch-and-clear op here; - * no-one has the page mapped, and cannot get at - * its virtual address (and hence PTE) without first - * getting the kmap_lock (which is held here). - * So no dangers, even with speculative execution. - */ - page = pte_page(pkmap_page_table[i]); - pte_clear(&pkmap_page_table[i]); - - page->virtual = NULL; - } - flush_tlb_all(); -} - -static inline unsigned long map_new_virtual(struct page *page, int nonblocking) -{ - unsigned long vaddr; - int count; - -start: - count = LAST_PKMAP; - /* Find an empty entry */ - for (;;) { - last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK; - if (!last_pkmap_nr) { - flush_all_zero_pkmaps(); - count = LAST_PKMAP; - } - if (!pkmap_count[last_pkmap_nr]) - break; /* Found a usable entry */ - if (--count) - continue; - - if (nonblocking) - return 0; - - /* - * Sleep for somebody else to unmap their entries - */ - { - DECLARE_WAITQUEUE(wait, current); - - current->state = TASK_UNINTERRUPTIBLE; - add_wait_queue(&pkmap_map_wait, &wait); - spin_unlock(&kmap_lock); - schedule(); - remove_wait_queue(&pkmap_map_wait, &wait); - spin_lock(&kmap_lock); - - /* Somebody else might have mapped it while we slept */ - if (page->virtual) - return (unsigned long) page->virtual; - - /* Re-start */ - goto start; - } - } - vaddr = PKMAP_ADDR(last_pkmap_nr); - set_pte(&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot)); - - pkmap_count[last_pkmap_nr] = 1; - page->virtual = (void *) vaddr; - - return vaddr; -} - -void kmap_flush_unused(void) -{ - spin_lock(&kmap_lock); - flush_all_zero_pkmaps(); - spin_unlock(&kmap_lock); -} - -void fastcall *kmap_high(struct page *page, int nonblocking) -{ - unsigned long vaddr; - - /* - * For highmem pages, we can't trust "virtual" until - * after we have the lock. 
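- *
- * [Editor's note: restating the pkmap_count[] states documented
- *  above as a table:
- *
- *      0       free and TLB-clean    -> usable immediately
- *      1       free but stale TLB    -> flush_all_zero_pkmaps() first
- *      n >= 2  (n-1) active users    -> kmap_high() increments,
- *                                       kunmap_high() decrements
- *
- *  flush_all_zero_pkmaps() runs only when last_pkmap_nr wraps, so one
- *  global TLB flush is amortized over up to LAST_PKMAP mappings.]
- *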
- * - * We cannot call this from interrupts, as it may block - */ - spin_lock(&kmap_lock); - vaddr = (unsigned long) page->virtual; - if (!vaddr) { - vaddr = map_new_virtual(page, nonblocking); - if (!vaddr) - goto out; - } - pkmap_count[PKMAP_NR(vaddr)]++; - if (pkmap_count[PKMAP_NR(vaddr)] < 2) - BUG(); - out: - spin_unlock(&kmap_lock); - return (void*) vaddr; -} - -void fastcall kunmap_high(struct page *page) -{ - unsigned long vaddr; - unsigned long nr; - int need_wakeup; - - spin_lock(&kmap_lock); - vaddr = (unsigned long) page->virtual; - if (!vaddr) - BUG(); - nr = PKMAP_NR(vaddr); - - /* - * A count must never go down to zero - * without a TLB flush! - */ - need_wakeup = 0; - switch (--pkmap_count[nr]) { - case 0: - BUG(); - case 1: - /* - * Avoid an unnecessary wake_up() function call. - * The common case is pkmap_count[] == 1, but - * no waiters. - * The tasks queued in the wait-queue are guarded - * by both the lock in the wait-queue-head and by - * the kmap_lock. As the kmap_lock is held here, - * no need for the wait-queue-head's lock. Simply - * test if the queue is empty. - */ - need_wakeup = waitqueue_active(&pkmap_map_wait); - } - spin_unlock(&kmap_lock); - - /* do wake-up, if needed, race-free outside of the spin lock */ - if (need_wakeup) - wake_up(&pkmap_map_wait); -} - -#define POOL_SIZE 32 - -/* - * This lock gets no contention at all, normally. - */ -static spinlock_t emergency_lock = SPIN_LOCK_UNLOCKED; - -int nr_emergency_pages; -static LIST_HEAD(emergency_pages); - -int nr_emergency_bhs; -static LIST_HEAD(emergency_bhs); - -/* - * Simple bounce buffer support for highmem pages. - * This will be moved to the block layer in 2.5. - */ - -static inline void copy_from_high_bh (struct buffer_head *to, - struct buffer_head *from) -{ - struct page *p_from; - char *vfrom; - - p_from = from->b_page; - - vfrom = kmap_atomic(p_from, KM_USER0); - memcpy(to->b_data, vfrom + bh_offset(from), to->b_size); - kunmap_atomic(vfrom, KM_USER0); -} - -static inline void copy_to_high_bh_irq (struct buffer_head *to, - struct buffer_head *from) -{ - struct page *p_to; - char *vto; - unsigned long flags; - - p_to = to->b_page; - __save_flags(flags); - __cli(); - vto = kmap_atomic(p_to, KM_BOUNCE_READ); - memcpy(vto + bh_offset(to), from->b_data, to->b_size); - kunmap_atomic(vto, KM_BOUNCE_READ); - __restore_flags(flags); -} - -static inline void bounce_end_io (struct buffer_head *bh, int uptodate) -{ - struct page *page; - struct buffer_head *bh_orig = (struct buffer_head *)(bh->b_private); - unsigned long flags; - - bh_orig->b_end_io(bh_orig, uptodate); - - page = bh->b_page; - - spin_lock_irqsave(&emergency_lock, flags); - if (nr_emergency_pages >= POOL_SIZE) - __free_page(page); - else { - /* - * We are abusing page->list to manage - * the highmem emergency pool: - */ - list_add(&page->list, &emergency_pages); - nr_emergency_pages++; - } - - if (nr_emergency_bhs >= POOL_SIZE) { -#ifdef HIGHMEM_DEBUG - /* Don't clobber the constructed slab cache */ - init_waitqueue_head(&bh->b_wait); -#endif - kmem_cache_free(bh_cachep, bh); - } else { - /* - * Ditto in the bh case, here we abuse b_inode_buffers: - */ - list_add(&bh->b_inode_buffers, &emergency_bhs); - nr_emergency_bhs++; - } - spin_unlock_irqrestore(&emergency_lock, flags); -} - -static __init int init_emergency_pool(void) -{ - struct sysinfo i; - si_meminfo(&i); - si_swapinfo(&i); - - if (!i.totalhigh) - return 0; - - spin_lock_irq(&emergency_lock); - while (nr_emergency_pages < POOL_SIZE) { - struct page * page = 
alloc_page(GFP_ATOMIC); - if (!page) { - printk("couldn't refill highmem emergency pages"); - break; - } - list_add(&page->list, &emergency_pages); - nr_emergency_pages++; - } - while (nr_emergency_bhs < POOL_SIZE) { - struct buffer_head * bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC); - if (!bh) { - printk("couldn't refill highmem emergency bhs"); - break; - } - list_add(&bh->b_inode_buffers, &emergency_bhs); - nr_emergency_bhs++; - } - spin_unlock_irq(&emergency_lock); - printk("allocated %d pages and %d bhs reserved for the highmem bounces\n", - nr_emergency_pages, nr_emergency_bhs); - - return 0; -} - -__initcall(init_emergency_pool); - -static void bounce_end_io_write (struct buffer_head *bh, int uptodate) -{ - bounce_end_io(bh, uptodate); -} - -static void bounce_end_io_read (struct buffer_head *bh, int uptodate) -{ - struct buffer_head *bh_orig = (struct buffer_head *)(bh->b_private); - - if (uptodate) - copy_to_high_bh_irq(bh_orig, bh); - bounce_end_io(bh, uptodate); -} - -struct page *alloc_bounce_page (void) -{ - struct list_head *tmp; - struct page *page; - - page = alloc_page(GFP_NOHIGHIO); - if (page) - return page; - /* - * No luck. First, kick the VM so it doesn't idle around while - * we are using up our emergency rations. - */ - wakeup_bdflush(); - -repeat_alloc: - /* - * Try to allocate from the emergency pool. - */ - tmp = &emergency_pages; - spin_lock_irq(&emergency_lock); - if (!list_empty(tmp)) { - page = list_entry(tmp->next, struct page, list); - list_del(tmp->next); - nr_emergency_pages--; - } - spin_unlock_irq(&emergency_lock); - if (page) - return page; - - /* we need to wait I/O completion */ - run_task_queue(&tq_disk); - - yield(); - goto repeat_alloc; -} - -struct buffer_head *alloc_bounce_bh (void) -{ - struct list_head *tmp; - struct buffer_head *bh; - - bh = kmem_cache_alloc(bh_cachep, SLAB_NOHIGHIO); - if (bh) - return bh; - /* - * No luck. First, kick the VM so it doesn't idle around while - * we are using up our emergency rations. - */ - wakeup_bdflush(); - -repeat_alloc: - /* - * Try to allocate from the emergency pool. - */ - tmp = &emergency_bhs; - spin_lock_irq(&emergency_lock); - if (!list_empty(tmp)) { - bh = list_entry(tmp->next, struct buffer_head, b_inode_buffers); - list_del(tmp->next); - nr_emergency_bhs--; - } - spin_unlock_irq(&emergency_lock); - if (bh) - return bh; - - /* we need to wait I/O completion */ - run_task_queue(&tq_disk); - - yield(); - goto repeat_alloc; -} - -struct buffer_head * create_bounce(int rw, struct buffer_head * bh_orig) -{ - struct page *page; - struct buffer_head *bh; - - if (!PageHighMem(bh_orig->b_page)) - return bh_orig; - - bh = alloc_bounce_bh(); - /* - * This is wasteful for 1k buffers, but this is a stopgap measure - * and we are being ineffective anyway. This approach simplifies - * things immensly. On boxes with more than 4GB RAM this should - * not be an issue anyway. 
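- *
- * [Editor's note: alloc_bounce_page()/alloc_bounce_bh() above are
- *  written never to fail: a normal GFP_NOHIGHIO allocation first,
- *  then the POOL_SIZE (32) entry emergency pool that bounce_end_io()
- *  refills, and as a last resort a loop that kicks tq_disk and
- *  yield()s. In sketch form, with take_from_pool() a hypothetical
- *  stand-in for the locked list_entry/list_del sequence:
- *
- *      page = alloc_page(GFP_NOHIGHIO);
- *      while (!page) {
- *          page = take_from_pool();
- *          if (!page) { run_task_queue(&tq_disk); yield(); }
- *      }
- *
- *  Failing here could deadlock writeout against memory reclaim.]
- *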
- */ - page = alloc_bounce_page(); - - set_bh_page(bh, page, 0); - - bh->b_next = NULL; - bh->b_blocknr = bh_orig->b_blocknr; - bh->b_size = bh_orig->b_size; - bh->b_list = -1; - bh->b_dev = bh_orig->b_dev; - bh->b_count = bh_orig->b_count; - bh->b_rdev = bh_orig->b_rdev; - bh->b_state = bh_orig->b_state; -#ifdef HIGHMEM_DEBUG - bh->b_flushtime = jiffies; - bh->b_next_free = NULL; - bh->b_prev_free = NULL; - /* bh->b_this_page */ - bh->b_reqnext = NULL; - bh->b_pprev = NULL; -#endif - /* bh->b_page */ - if (rw == WRITE) { - bh->b_end_io = bounce_end_io_write; - copy_from_high_bh(bh, bh_orig); - } else - bh->b_end_io = bounce_end_io_read; - bh->b_private = (void *)bh_orig; - bh->b_rsector = bh_orig->b_rsector; -#ifdef HIGHMEM_DEBUG - memset(&bh->b_wait, -1, sizeof(bh->b_wait)); -#endif - - return bh; -} - diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/mm/memory.c --- a/linux-2.4.30-xen-sparse/mm/memory.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,1534 +0,0 @@ -/* - * linux/mm/memory.c - * - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - */ - -/* - * demand-loading started 01.12.91 - seems it is high on the list of - * things wanted, and it should be easy to implement. - Linus - */ - -/* - * Ok, demand-loading was easy, shared pages a little bit tricker. Shared - * pages started 02.12.91, seems to work. - Linus. - * - * Tested sharing by executing about 30 /bin/sh: under the old kernel it - * would have taken more than the 6M I have free, but it worked well as - * far as I could see. - * - * Also corrected some "invalidate()"s - I wasn't doing enough of them. - */ - -/* - * Real VM (paging to/from disk) started 18.12.91. Much more work and - * thought has to go into this. Oh, well.. - * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. - * Found it. Everything seems to work now. - * 20.12.91 - Ok, making the swap-device changeable like the root. - */ - -/* - * 05.04.94 - Multi-page memory management added for v1.1. - * Idea by Alex Bligh (alex@xxxxxxxxxxxxxxx) - * - * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG - * (Gerhard.Wichert@xxxxxxxxxxxxxx) - */ - -#include <linux/mm.h> -#include <linux/mman.h> -#include <linux/swap.h> -#include <linux/smp_lock.h> -#include <linux/swapctl.h> -#include <linux/iobuf.h> -#include <linux/highmem.h> -#include <linux/pagemap.h> -#include <linux/module.h> - -#include <asm/pgalloc.h> -#include <asm/uaccess.h> -#include <asm/tlb.h> - -unsigned long max_mapnr; -unsigned long num_physpages; -unsigned long num_mappedpages; -void * high_memory; -struct page *highmem_start_page; - -/* - * We special-case the C-O-W ZERO_PAGE, because it's such - * a common occurrence (no need to read the page to know - * that it's zero - better for the cache and memory subsystem). - */ -static inline void copy_cow_page(struct page * from, struct page * to, unsigned long address) -{ - if (from == ZERO_PAGE(address)) { - clear_user_highpage(to, address); - return; - } - copy_user_highpage(to, from, address); -} - -mem_map_t * mem_map; - -/* - * Called by TLB shootdown - */ -void __free_pte(pte_t pte) -{ - struct page *page = pte_page(pte); - if ((!VALID_PAGE(page)) || PageReserved(page)) - return; - if (pte_dirty(pte)) - set_page_dirty(page); - free_page_and_swap_cache(page); -} - - -/* - * Note: this doesn't free the actual pages themselves. That - * has been handled earlier when unmapping all the memory regions. 
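- *
- * [Editor's note: the teardown below is strictly top-down: the upper
- *  entry is detached before the lower table is freed,
- *
- *      pgd_clear(dir);      ... detach first
- *      ... free each pmd's pte page via free_one_pmd() ...
- *      pmd_free(pmd);       ... free last
- *
- *  so a concurrent hardware walk can never follow a live pointer
- *  into a freed page table.]
- *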
- */ -static inline void free_one_pmd(pmd_t * dir) -{ - pte_t * pte; - - if (pmd_none(*dir)) - return; - if (pmd_bad(*dir)) { - pmd_ERROR(*dir); - pmd_clear(dir); - return; - } - pte = pte_offset(dir, 0); - pmd_clear(dir); - pte_free(pte); -} - -static inline void free_one_pgd(pgd_t * dir) -{ - int j; - pmd_t * pmd; - - if (pgd_none(*dir)) - return; - if (pgd_bad(*dir)) { - pgd_ERROR(*dir); - pgd_clear(dir); - return; - } - pmd = pmd_offset(dir, 0); - pgd_clear(dir); - for (j = 0; j < PTRS_PER_PMD ; j++) { - prefetchw(pmd+j+(PREFETCH_STRIDE/16)); - free_one_pmd(pmd+j); - } - pmd_free(pmd); -} - -/* Low and high watermarks for page table cache. - The system should try to have pgt_water[0] <= cache elements <= pgt_water[1] - */ -int pgt_cache_water[2] = { 25, 50 }; - -/* Returns the number of pages freed */ -int check_pgt_cache(void) -{ - return do_check_pgt_cache(pgt_cache_water[0], pgt_cache_water[1]); -} - - -/* - * This function clears all user-level page tables of a process - this - * is needed by execve(), so that old pages aren't in the way. - */ -void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr) -{ - pgd_t * page_dir = mm->pgd; - - spin_lock(&mm->page_table_lock); - page_dir += first; - do { - free_one_pgd(page_dir); - page_dir++; - } while (--nr); - spin_unlock(&mm->page_table_lock); - - /* keep the page table cache within bounds */ - check_pgt_cache(); -} - -#define PTE_TABLE_MASK ((PTRS_PER_PTE-1) * sizeof(pte_t)) -#define PMD_TABLE_MASK ((PTRS_PER_PMD-1) * sizeof(pmd_t)) - -/* - * copy one vm_area from one task to the other. Assumes the page tables - * already present in the new task to be cleared in the whole range - * covered by this vma. - * - * 08Jan98 Merged into one routine from several inline routines to reduce - * variable count and make things faster. -jj - * - * dst->page_table_lock is held on entry and exit, - * but may be dropped within pmd_alloc() and pte_alloc(). 
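- *
- * [Editor's note: the cow predicate computed at the top of
- *  copy_page_range() below,
- *
- *      cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
- *
- *  is the classic test for a private, potentially-writable mapping.
- *  For such ptes the copy loop write-protects parent and child alike
- *  with ptep_set_wrprotect(), so the first write from either side
- *  faults into do_wp_page() and receives its own copy.]
- *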
- */ -int copy_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma) -{ - pgd_t * src_pgd, * dst_pgd; - unsigned long address = vma->vm_start; - unsigned long end = vma->vm_end; - unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; - - src_pgd = pgd_offset(src, address)-1; - dst_pgd = pgd_offset(dst, address)-1; - - for (;;) { - pmd_t * src_pmd, * dst_pmd; - - src_pgd++; dst_pgd++; - - /* copy_pmd_range */ - - if (pgd_none(*src_pgd)) - goto skip_copy_pmd_range; - if (pgd_bad(*src_pgd)) { - pgd_ERROR(*src_pgd); - pgd_clear(src_pgd); -skip_copy_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK; - if (!address || (address >= end)) - goto out; - continue; - } - - src_pmd = pmd_offset(src_pgd, address); - dst_pmd = pmd_alloc(dst, dst_pgd, address); - if (!dst_pmd) - goto nomem; - - do { - pte_t * src_pte, * dst_pte; - - /* copy_pte_range */ - - if (pmd_none(*src_pmd)) - goto skip_copy_pte_range; - if (pmd_bad(*src_pmd)) { - pmd_ERROR(*src_pmd); - pmd_clear(src_pmd); -skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; - if (address >= end) - goto out; - goto cont_copy_pmd_range; - } - - src_pte = pte_offset(src_pmd, address); - dst_pte = pte_alloc(dst, dst_pmd, address); - if (!dst_pte) - goto nomem; - - spin_lock(&src->page_table_lock); - do { - pte_t pte = *src_pte; - struct page *ptepage; - - /* copy_one_pte */ - - if (pte_none(pte)) - goto cont_copy_pte_range_noset; - if (!pte_present(pte)) { - swap_duplicate(pte_to_swp_entry(pte)); - goto cont_copy_pte_range; - } - ptepage = pte_page(pte); - if ((!VALID_PAGE(ptepage)) || - PageReserved(ptepage)) - goto cont_copy_pte_range; - - /* If it's a COW mapping, write protect it both in the parent and the child */ - if (cow && pte_write(pte)) { - ptep_set_wrprotect(src_pte); - pte = *src_pte; - } - - /* If it's a shared mapping, mark it clean in the child */ - if (vma->vm_flags & VM_SHARED) - pte = pte_mkclean(pte); - pte = pte_mkold(pte); - get_page(ptepage); - dst->rss++; - -cont_copy_pte_range: set_pte(dst_pte, pte); -cont_copy_pte_range_noset: address += PAGE_SIZE; - if (address >= end) - goto out_unlock; - src_pte++; - dst_pte++; - } while ((unsigned long)src_pte & PTE_TABLE_MASK); - spin_unlock(&src->page_table_lock); - -cont_copy_pmd_range: src_pmd++; - dst_pmd++; - } while ((unsigned long)src_pmd & PMD_TABLE_MASK); - } -out_unlock: - spin_unlock(&src->page_table_lock); -out: - return 0; -nomem: - return -ENOMEM; -} - -/* - * Return indicates whether a page was freed so caller can adjust rss - */ -static inline void forget_pte(pte_t page) -{ - if (!pte_none(page)) { - printk("forget_pte: old mapping existed!\n"); - BUG(); - } -} - -static inline int zap_pte_range(mmu_gather_t *tlb, pmd_t * pmd, unsigned long address, unsigned long size) -{ - unsigned long offset; - pte_t * ptep; - int freed = 0; - - if (pmd_none(*pmd)) - return 0; - if (pmd_bad(*pmd)) { - pmd_ERROR(*pmd); - pmd_clear(pmd); - return 0; - } - ptep = pte_offset(pmd, address); - offset = address & ~PMD_MASK; - if (offset + size > PMD_SIZE) - size = PMD_SIZE - offset; - size &= PAGE_MASK; - for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) { - pte_t pte = *ptep; - if (pte_none(pte)) - continue; - if (pte_present(pte)) { - struct page *page = pte_page(pte); - if (VALID_PAGE(page) && !PageReserved(page)) - freed ++; - /* This will eventually call __free_pte on the pte. 
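- *
- * [Editor's note: tlb_remove_page() records the pte in the
- *  mmu_gather_t rather than flushing at once; tlb_finish_mmu() in
- *  zap_page_range() below then does one batched TLB flush and frees
- *  the pages, which is markedly cheaper than a flush per pte.]
- *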
*/ - tlb_remove_page(tlb, ptep, address + offset); - } else { - free_swap_and_cache(pte_to_swp_entry(pte)); - pte_clear(ptep); - } - } - - return freed; -} - -static inline int zap_pmd_range(mmu_gather_t *tlb, pgd_t * dir, unsigned long address, unsigned long size) -{ - pmd_t * pmd; - unsigned long end; - int freed; - - if (pgd_none(*dir)) - return 0; - if (pgd_bad(*dir)) { - pgd_ERROR(*dir); - pgd_clear(dir); - return 0; - } - pmd = pmd_offset(dir, address); - end = address + size; - if (end > ((address + PGDIR_SIZE) & PGDIR_MASK)) - end = ((address + PGDIR_SIZE) & PGDIR_MASK); - freed = 0; - do { - freed += zap_pte_range(tlb, pmd, address, end - address); - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address < end); - return freed; -} - -/* - * remove user pages in a given range. - */ -void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size) -{ - mmu_gather_t *tlb; - pgd_t * dir; - unsigned long start = address, end = address + size; - int freed = 0; - - dir = pgd_offset(mm, address); - - /* - * This is a long-lived spinlock. That's fine. - * There's no contention, because the page table - * lock only protects against kswapd anyway, and - * even if kswapd happened to be looking at this - * process we _want_ it to get stuck. - */ - if (address >= end) - BUG(); - spin_lock(&mm->page_table_lock); - flush_cache_range(mm, address, end); - tlb = tlb_gather_mmu(mm); - - do { - freed += zap_pmd_range(tlb, dir, address, end - address); - address = (address + PGDIR_SIZE) & PGDIR_MASK; - dir++; - } while (address && (address < end)); - - /* this will flush any remaining tlb entries */ - tlb_finish_mmu(tlb, start, end); - - /* - * Update rss for the mm_struct (not necessarily current->mm) - * Notice that rss is an unsigned long. - */ - if (mm->rss > freed) - mm->rss -= freed; - else - mm->rss = 0; - spin_unlock(&mm->page_table_lock); -} - -/* - * Do a quick page-table lookup for a single page. - */ -static struct page * follow_page(struct mm_struct *mm, unsigned long address, int write) -{ - pgd_t *pgd; - pmd_t *pmd; - pte_t *ptep, pte; - - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || pgd_bad(*pgd)) - goto out; - - pmd = pmd_offset(pgd, address); - if (pmd_none(*pmd) || pmd_bad(*pmd)) - goto out; - - ptep = pte_offset(pmd, address); - if (!ptep) - goto out; - - pte = *ptep; - if (pte_present(pte)) { - if (!write || - (pte_write(pte) && pte_dirty(pte))) - return pte_page(pte); - } - -out: - return 0; -} - -/* - * Given a physical address, is there a useful struct page pointing to - * it? This may become more complex in the future if we start dealing - * with IO-aperture pages in kiobufs. - */ - -static inline struct page * get_page_map(struct page *page) -{ - if (!VALID_PAGE(page)) - return 0; - return page; -} - -/* - * Please read Documentation/cachetlb.txt before using this function, - * accessing foreign memory spaces can cause cache coherency problems. - * - * Accessing a VM_IO area is even more dangerous, therefore the function - * fails if pages is != NULL and a VM_IO area is found. - */ -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, - int len, int write, int force, struct page **pages, struct vm_area_struct **vmas) -{ - int i; - unsigned int flags; - - /* - * Require read or write permissions. - * If 'force' is set, we only require the "MAY" flags. - */ - flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); - flags &= force ? 
(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); - i = 0; - - do { - struct vm_area_struct * vma; - - vma = find_extend_vma(mm, start); - - if ( !vma || (pages && vma->vm_flags & VM_IO) || !(flags & vma->vm_flags) ) - return i ? : -EFAULT; - - spin_lock(&mm->page_table_lock); - do { - struct page *map; - while (!(map = follow_page(mm, start, write))) { - spin_unlock(&mm->page_table_lock); - switch (handle_mm_fault(mm, vma, start, write)) { - case 1: - tsk->min_flt++; - break; - case 2: - tsk->maj_flt++; - break; - case 0: - if (i) return i; - return -EFAULT; - default: - if (i) return i; - return -ENOMEM; - } - spin_lock(&mm->page_table_lock); - } - if (pages) { - pages[i] = get_page_map(map); - /* FIXME: call the correct function, - * depending on the type of the found page - */ - if (!pages[i] || PageReserved(pages[i])) { - if (pages[i] != ZERO_PAGE(start)) - goto bad_page; - } else - page_cache_get(pages[i]); - } - if (vmas) - vmas[i] = vma; - i++; - start += PAGE_SIZE; - len--; - } while(len && start < vma->vm_end); - spin_unlock(&mm->page_table_lock); - } while(len); -out: - return i; - - /* - * We found an invalid page in the VMA. Release all we have - * so far and fail. - */ -bad_page: - spin_unlock(&mm->page_table_lock); - while (i--) - page_cache_release(pages[i]); - i = -EFAULT; - goto out; -} - -EXPORT_SYMBOL(get_user_pages); - -/* - * Force in an entire range of pages from the current process's user VA, - * and pin them in physical memory. - */ -#define dprintk(x...) - -int map_user_kiobuf(int rw, struct kiobuf *iobuf, unsigned long va, size_t len) -{ - int pgcount, err; - struct mm_struct * mm; - - /* Make sure the iobuf is not already mapped somewhere. */ - if (iobuf->nr_pages) - return -EINVAL; - - mm = current->mm; - dprintk ("map_user_kiobuf: begin\n"); - - pgcount = (va + len + PAGE_SIZE - 1)/PAGE_SIZE - va/PAGE_SIZE; - /* mapping 0 bytes is not permitted */ - if (!pgcount) BUG(); - err = expand_kiobuf(iobuf, pgcount); - if (err) - return err; - - iobuf->locked = 0; - iobuf->offset = va & (PAGE_SIZE-1); - iobuf->length = len; - - /* Try to fault in all of the necessary pages */ - down_read(&mm->mmap_sem); - /* rw==READ means read from disk, write into memory area */ - err = get_user_pages(current, mm, va, pgcount, - (rw==READ), 0, iobuf->maplist, NULL); - up_read(&mm->mmap_sem); - if (err < 0) { - unmap_kiobuf(iobuf); - dprintk ("map_user_kiobuf: end %d\n", err); - return err; - } - iobuf->nr_pages = err; - while (pgcount--) { - /* FIXME: flush superflous for rw==READ, - * probably wrong function for rw==WRITE - */ - flush_dcache_page(iobuf->maplist[pgcount]); - } - dprintk ("map_user_kiobuf: end OK\n"); - return 0; -} - -/* - * Mark all of the pages in a kiobuf as dirty - * - * We need to be able to deal with short reads from disk: if an IO error - * occurs, the number of bytes read into memory may be less than the - * size of the kiobuf, so we have to stop marking pages dirty once the - * requested byte count has been reached. - * - * Must be called from process context - set_page_dirty() takes VFS locks. 
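- *
- * [Editor's note: map_user_kiobuf() above is the 2.4 pin-user-pages
- *  path used by raw and direct I/O. A minimal usage sketch with a
- *  hypothetical buffer; error handling elided:
- *
- *      struct kiobuf *iobuf;
- *      alloc_kiovec(1, &iobuf);
- *      if (map_user_kiobuf(READ, iobuf, user_va, len) == 0) {
- *          ... DMA into iobuf->maplist[0 .. nr_pages-1] ...
- *          mark_dirty_kiobuf(iobuf, len);
- *          unmap_kiobuf(iobuf);
- *      }
- *      free_kiovec(1, &iobuf);
- *
- *  rw == READ means the device writes into memory, which is why it
- *  is passed as the write flag to get_user_pages() above.]
- *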
- */ - -void mark_dirty_kiobuf(struct kiobuf *iobuf, int bytes) -{ - int index, offset, remaining; - struct page *page; - - index = iobuf->offset >> PAGE_SHIFT; - offset = iobuf->offset & ~PAGE_MASK; - remaining = bytes; - if (remaining > iobuf->length) - remaining = iobuf->length; - - while (remaining > 0 && index < iobuf->nr_pages) { - page = iobuf->maplist[index]; - - if (!PageReserved(page)) - set_page_dirty(page); - - remaining -= (PAGE_SIZE - offset); - offset = 0; - index++; - } -} - -/* - * Unmap all of the pages referenced by a kiobuf. We release the pages, - * and unlock them if they were locked. - */ - -void unmap_kiobuf (struct kiobuf *iobuf) -{ - int i; - struct page *map; - - for (i = 0; i < iobuf->nr_pages; i++) { - map = iobuf->maplist[i]; - if (map) { - if (iobuf->locked) - UnlockPage(map); - /* FIXME: cache flush missing for rw==READ - * FIXME: call the correct reference counting function - */ - page_cache_release(map); - } - } - - iobuf->nr_pages = 0; - iobuf->locked = 0; -} - - -/* - * Lock down all of the pages of a kiovec for IO. - * - * If any page is mapped twice in the kiovec, we return the error -EINVAL. - * - * The optional wait parameter causes the lock call to block until all - * pages can be locked if set. If wait==0, the lock operation is - * aborted if any locked pages are found and -EAGAIN is returned. - */ - -int lock_kiovec(int nr, struct kiobuf *iovec[], int wait) -{ - struct kiobuf *iobuf; - int i, j; - struct page *page, **ppage; - int doublepage = 0; - int repeat = 0; - - repeat: - - for (i = 0; i < nr; i++) { - iobuf = iovec[i]; - - if (iobuf->locked) - continue; - - ppage = iobuf->maplist; - for (j = 0; j < iobuf->nr_pages; ppage++, j++) { - page = *ppage; - if (!page) - continue; - - if (TryLockPage(page)) { - while (j--) { - struct page *tmp = *--ppage; - if (tmp) - UnlockPage(tmp); - } - goto retry; - } - } - iobuf->locked = 1; - } - - return 0; - - retry: - - /* - * We couldn't lock one of the pages. Undo the locking so far, - * wait on the page we got to, and try again. - */ - - unlock_kiovec(nr, iovec); - if (!wait) - return -EAGAIN; - - /* - * Did the release also unlock the page we got stuck on? - */ - if (!PageLocked(page)) { - /* - * If so, we may well have the page mapped twice - * in the IO address range. Bad news. Of - * course, it _might_ just be a coincidence, - * but if it happens more than once, chances - * are we have a double-mapped page. - */ - if (++doublepage >= 3) - return -EINVAL; - - /* Try again... */ - wait_on_page(page); - } - - if (++repeat < 16) - goto repeat; - return -EAGAIN; -} - -/* - * Unlock all of the pages of a kiovec after IO. 
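- *
- * [Editor's note: lock_kiovec() above avoids lock-ordering deadlock
- *  by optimism: on any TryLockPage() failure it unlocks everything it
- *  holds, waits for the page it lost, and restarts. If that page is
- *  found already unlocked after the global unlock, that is taken as
- *  evidence of the same page appearing twice in the kiovec; three
- *  such rounds yield -EINVAL, and sixteen restarts yield -EAGAIN.]
- *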
- */ - -int unlock_kiovec(int nr, struct kiobuf *iovec[]) -{ - struct kiobuf *iobuf; - int i, j; - struct page *page, **ppage; - - for (i = 0; i < nr; i++) { - iobuf = iovec[i]; - - if (!iobuf->locked) - continue; - iobuf->locked = 0; - - ppage = iobuf->maplist; - for (j = 0; j < iobuf->nr_pages; ppage++, j++) { - page = *ppage; - if (!page) - continue; - UnlockPage(page); - } - } - return 0; -} - -static inline void zeromap_pte_range(pte_t * pte, unsigned long address, - unsigned long size, pgprot_t prot) -{ - unsigned long end; - - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; - do { - pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot)); - pte_t oldpage = ptep_get_and_clear(pte); - set_pte(pte, zero_pte); - forget_pte(oldpage); - address += PAGE_SIZE; - pte++; - } while (address && (address < end)); -} - -static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, - unsigned long size, pgprot_t prot) -{ - unsigned long end; - - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - do { - pte_t * pte = pte_alloc(mm, pmd, address); - if (!pte) - return -ENOMEM; - zeromap_pte_range(pte, address, end - address, prot); - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); - return 0; -} - -int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot) -{ - int error = 0; - pgd_t * dir; - unsigned long beg = address; - unsigned long end = address + size; - struct mm_struct *mm = current->mm; - - dir = pgd_offset(mm, address); - flush_cache_range(mm, beg, end); - if (address >= end) - BUG(); - - spin_lock(&mm->page_table_lock); - do { - pmd_t *pmd = pmd_alloc(mm, dir, address); - error = -ENOMEM; - if (!pmd) - break; - error = zeromap_pmd_range(mm, pmd, address, end - address, prot); - if (error) - break; - address = (address + PGDIR_SIZE) & PGDIR_MASK; - dir++; - } while (address && (address < end)); - spin_unlock(&mm->page_table_lock); - flush_tlb_range(mm, beg, end); - return error; -} - -/* - * maps a range of physical memory into the requested pages. the old - * mappings are removed. any references to nonexistent pages results - * in null mappings (currently treated as "copy-on-access") - */ -static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size, - unsigned long phys_addr, pgprot_t prot) -{ - unsigned long end; - - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; - do { - struct page *page; - pte_t oldpage; - oldpage = ptep_get_and_clear(pte); - - page = virt_to_page(__va(phys_addr)); - if ((!VALID_PAGE(page)) || PageReserved(page)) - set_pte(pte, mk_pte_phys(phys_addr, prot)); - forget_pte(oldpage); - address += PAGE_SIZE; - phys_addr += PAGE_SIZE; - pte++; - } while (address && (address < end)); -} - -static inline int remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size, - unsigned long phys_addr, pgprot_t prot) -{ - unsigned long end; - - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - phys_addr -= address; - do { - pte_t * pte = pte_alloc(mm, pmd, address); - if (!pte) - return -ENOMEM; - remap_pte_range(pte, address, end - address, address + phys_addr, prot); - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); - return 0; -} - -/* Note: this is only safe if the mm semaphore is held when called. 
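- *
- * [Editor's note: remap_page_range() below uses a small idiom worth
- *  spelling out: it subtracts the starting virtual address from
- *  phys_addr once,
- *
- *      phys_addr -= from;
- *      ...
- *      remap_pmd_range(mm, pmd, from, end - from, phys_addr + from, prot);
- *
- *  so that at every level "phys_addr + virtual address" is the
- *  physical address to map, and only one cursor advances through
- *  the nested loops.]
- *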
*/ -int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot) -{ - int error = 0; - pgd_t * dir; - unsigned long beg = from; - unsigned long end = from + size; - struct mm_struct *mm = current->mm; - - phys_addr -= from; - dir = pgd_offset(mm, from); - flush_cache_range(mm, beg, end); - if (from >= end) - BUG(); - - spin_lock(&mm->page_table_lock); - do { - pmd_t *pmd = pmd_alloc(mm, dir, from); - error = -ENOMEM; - if (!pmd) - break; - error = remap_pmd_range(mm, pmd, from, end - from, phys_addr + from, prot); - if (error) - break; - from = (from + PGDIR_SIZE) & PGDIR_MASK; - dir++; - } while (from && (from < end)); - spin_unlock(&mm->page_table_lock); - flush_tlb_range(mm, beg, end); - return error; -} - -/* - * Establish a new mapping: - * - flush the old one - * - update the page tables - * - inform the TLB about the new one - * - * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock - */ -static inline void establish_pte(struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t entry) -{ -#ifdef CONFIG_XEN - if ( likely(vma->vm_mm == current->mm) ) { - HYPERVISOR_update_va_mapping(address, entry, UVMF_INVLPG|UVMF_LOCAL); - } else { - set_pte(page_table, entry); - flush_tlb_page(vma, address); - } -#else - set_pte(page_table, entry); - flush_tlb_page(vma, address); -#endif - update_mmu_cache(vma, address, entry); -} - -/* - * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock - */ -static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, - pte_t *page_table) -{ - flush_page_to_ram(new_page); - flush_cache_page(vma, address); - establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); -} - -/* - * This routine handles present pages, when users try to write - * to a shared page. It is done by copying the page to a new address - * and decrementing the shared-page counter for the old page. - * - * Goto-purists beware: the only reason for goto's here is that it results - * in better assembly code.. The "default" path will see no jumps at all. - * - * Note that this routine assumes that the protection checks have been - * done by the caller (the low-level page fault routine in most cases). - * Thus we can safely just mark it writable once we've done any necessary - * COW. - * - * We also mark the page dirty at this point even though the page will - * change only once the write actually happens. This avoids a few races, - * and potentially makes it more efficient. - * - * We hold the mm semaphore and the page_table_lock on entry and exit - * with the page_table_lock released. - */ -static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, - unsigned long address, pte_t *page_table, pte_t pte) -{ - struct page *old_page, *new_page; - - old_page = pte_page(pte); - if (!VALID_PAGE(old_page)) - goto bad_wp_page; - - if (!TryLockPage(old_page)) { - int reuse = can_share_swap_page(old_page); - unlock_page(old_page); - if (reuse) { - flush_cache_page(vma, address); - establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte)))); - spin_unlock(&mm->page_table_lock); - return 1; /* Minor fault */ - } - } - - /* - * Ok, we need to copy. Oh, well.. 
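- *
- * [Editor's note: establish_pte() above is where this port diverges
- *  from stock 2.4: under CONFIG_XEN a pte in the current address
- *  space is installed with a single
- *  HYPERVISOR_update_va_mapping(address, entry, UVMF_INVLPG|UVMF_LOCAL)
- *  hypercall, which validates the write to the (read-only to the
- *  guest) page table and invalidates the local TLB entry in one trap,
- *  replacing the separate set_pte() and flush_tlb_page() calls.]
- *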
- */ - page_cache_get(old_page); - spin_unlock(&mm->page_table_lock); - - new_page = alloc_page(GFP_HIGHUSER); - if (!new_page) - goto no_mem; - copy_cow_page(old_page,new_page,address); - - /* - * Re-check the pte - we dropped the lock - */ - spin_lock(&mm->page_table_lock); - if (pte_same(*page_table, pte)) { - if (PageReserved(old_page)) - ++mm->rss; - break_cow(vma, new_page, address, page_table); - if (vm_anon_lru) - lru_cache_add(new_page); - - /* Free the old page.. */ - new_page = old_page; - } - spin_unlock(&mm->page_table_lock); - page_cache_release(new_page); - page_cache_release(old_page); - return 1; /* Minor fault */ - -bad_wp_page: - spin_unlock(&mm->page_table_lock); - printk("do_wp_page: bogus page at address %08lx (page 0x%lx)\n",address,(unsigned long)old_page); - return -1; -no_mem: - page_cache_release(old_page); - return -1; -} - -static void vmtruncate_list(struct vm_area_struct *mpnt, unsigned long pgoff) -{ - do { - struct mm_struct *mm = mpnt->vm_mm; - unsigned long start = mpnt->vm_start; - unsigned long end = mpnt->vm_end; - unsigned long len = end - start; - unsigned long diff; - - /* mapping wholly truncated? */ - if (mpnt->vm_pgoff >= pgoff) { - zap_page_range(mm, start, len); - continue; - } - - /* mapping wholly unaffected? */ - len = len >> PAGE_SHIFT; - diff = pgoff - mpnt->vm_pgoff; - if (diff >= len) - continue; - - /* Ok, partially affected.. */ - start += diff << PAGE_SHIFT; - len = (len - diff) << PAGE_SHIFT; - zap_page_range(mm, start, len); - } while ((mpnt = mpnt->vm_next_share) != NULL); -} - -/* - * Handle all mappings that got truncated by a "truncate()" - * system call. - * - * NOTE! We have to be ready to update the memory sharing - * between the file and the memory map for a potential last - * incomplete page. Ugly, but necessary. - */ -int vmtruncate(struct inode * inode, loff_t offset) -{ - unsigned long pgoff; - struct address_space *mapping = inode->i_mapping; - unsigned long limit; - - if (inode->i_size < offset) - goto do_expand; - inode->i_size = offset; - spin_lock(&mapping->i_shared_lock); - if (!mapping->i_mmap && !mapping->i_mmap_shared) - goto out_unlock; - - pgoff = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (mapping->i_mmap != NULL) - vmtruncate_list(mapping->i_mmap, pgoff); - if (mapping->i_mmap_shared != NULL) - vmtruncate_list(mapping->i_mmap_shared, pgoff); - -out_unlock: - spin_unlock(&mapping->i_shared_lock); - truncate_inode_pages(mapping, offset); - goto out_truncate; - -do_expand: - limit = current->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && offset > limit) - goto out_sig; - if (offset > inode->i_sb->s_maxbytes) - goto out; - inode->i_size = offset; - -out_truncate: - if (inode->i_op && inode->i_op->truncate) { - lock_kernel(); - inode->i_op->truncate(inode); - unlock_kernel(); - } - return 0; -out_sig: - send_sig(SIGXFSZ, current, 0); -out: - return -EFBIG; -} - -/* - * Primitive swap readahead code. We simply read an aligned block of - * (1 << page_cluster) entries in the swap area. This method is chosen - * because it doesn't cost us any seek time. We also make sure to queue - * the 'original' request together with the readahead ones... - */ -void swapin_readahead(swp_entry_t entry) -{ - int i, num; - struct page *new_page; - unsigned long offset; - - /* - * Get the number of handles we should do readahead io to. 
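- *
- * [Editor's note: valid_swaphandles() rounds the faulting entry down
- *  to a (1 << page_cluster)-aligned block and returns how many
- *  consecutive slots are safe to read, so the loop below queues the
- *  faulting page together with its neighbours and pays one seek for
- *  the lot; with page_cluster = 3, for instance, up to 8 pages.]
- *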
- */ - num = valid_swaphandles(entry, &offset); - for (i = 0; i < num; offset++, i++) { - /* Ok, do the async read-ahead now */ - new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset)); - if (!new_page) - break; - page_cache_release(new_page); - } - return; -} - -/* - * We hold the mm semaphore and the page_table_lock on entry and - * should release the pagetable lock on exit.. - */ -static int do_swap_page(struct mm_struct * mm, - struct vm_area_struct * vma, unsigned long address, - pte_t * page_table, pte_t orig_pte, int write_access) -{ - struct page *page; - swp_entry_t entry = pte_to_swp_entry(orig_pte); - pte_t pte; - int ret = 1; - - spin_unlock(&mm->page_table_lock); - page = lookup_swap_cache(entry); - if (!page) { - swapin_readahead(entry); - page = read_swap_cache_async(entry); - if (!page) { - /* - * Back out if somebody else faulted in this pte while - * we released the page table lock. - */ - int retval; - spin_lock(&mm->page_table_lock); - retval = pte_same(*page_table, orig_pte) ? -1 : 1; - spin_unlock(&mm->page_table_lock); - return retval; - } - - /* Had to read the page from swap area: Major fault */ - ret = 2; - } - - mark_page_accessed(page); - - lock_page(page); - - /* - * Back out if somebody else faulted in this pte while we - * released the page table lock. - */ - spin_lock(&mm->page_table_lock); - if (!pte_same(*page_table, orig_pte)) { - spin_unlock(&mm->page_table_lock); - unlock_page(page); - page_cache_release(page); - return 1; - } - - /* The page isn't present yet, go ahead with the fault. */ - - swap_free(entry); - if (vm_swap_full()) - remove_exclusive_swap_page(page); - - mm->rss++; - pte = mk_pte(page, vma->vm_page_prot); - if (write_access && can_share_swap_page(page)) - pte = pte_mkdirty(pte_mkwrite(pte)); - unlock_page(page); - - flush_page_to_ram(page); - flush_icache_page(vma, page); -#ifdef CONFIG_XEN - if ( likely(vma->vm_mm == current->mm) ) - HYPERVISOR_update_va_mapping(address, pte, 0); - else - set_pte(page_table, pte); -#else - set_pte(page_table, pte); -#endif - - /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, address, pte); - spin_unlock(&mm->page_table_lock); - return ret; -} - -/* - * We are called with the MM semaphore and page_table_lock - * spinlock held to protect against concurrent faults in - * multithreaded programs. - */ -static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr) -{ - pte_t entry; - - /* Read-only mapping of ZERO_PAGE. */ - entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); - - /* ..except if it's a write access */ - if (write_access) { - struct page *page; - - /* Allocate our own private page. 
*/ - spin_unlock(&mm->page_table_lock); - - page = alloc_page(GFP_HIGHUSER); - if (!page) - goto no_mem; - clear_user_highpage(page, addr); - - spin_lock(&mm->page_table_lock); - if (!pte_none(*page_table)) { - page_cache_release(page); - spin_unlock(&mm->page_table_lock); - return 1; - } - mm->rss++; - flush_page_to_ram(page); - entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); - if (vm_anon_lru) - lru_cache_add(page); - mark_page_accessed(page); - } - -#ifdef CONFIG_XEN - if ( likely(vma->vm_mm == current->mm) ) - HYPERVISOR_update_va_mapping(addr, entry, 0); - else - set_pte(page_table, entry); -#else - set_pte(page_table, entry); -#endif - - /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, addr, entry); - spin_unlock(&mm->page_table_lock); - return 1; /* Minor fault */ - -no_mem: - return -1; -} - -/* - * do_no_page() tries to create a new page mapping. It aggressively - * tries to share with existing pages, but makes a separate copy if - * the "write_access" parameter is true in order to avoid the next - * page fault. - * - * As this is called only for pages that do not currently exist, we - * do not need to flush old virtual caches or the TLB. - * - * This is called with the MM semaphore held and the page table - * spinlock held. Exit with the spinlock released. - */ -static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma, - unsigned long address, int write_access, pte_t *page_table) -{ - struct page * new_page; - pte_t entry; - - if (!vma->vm_ops || !vma->vm_ops->nopage) - return do_anonymous_page(mm, vma, page_table, write_access, address); - spin_unlock(&mm->page_table_lock); - - new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0); - - if (new_page == NULL) /* no page was available -- SIGBUS */ - return 0; - if (new_page == NOPAGE_OOM) - return -1; - - /* - * Should we do an early C-O-W break? - */ - if (write_access && !(vma->vm_flags & VM_SHARED)) { - struct page * page = alloc_page(GFP_HIGHUSER); - if (!page) { - page_cache_release(new_page); - return -1; - } - copy_user_highpage(page, new_page, address); - page_cache_release(new_page); - if (vm_anon_lru) - lru_cache_add(page); - new_page = page; - } - - spin_lock(&mm->page_table_lock); - /* - * This silly early PAGE_DIRTY setting removes a race - * due to the bad i386 page protection. But it's valid - * for other architectures too. - * - * Note that if write_access is true, we either now have - * an exclusive copy of the page, or this is a shared mapping, - * so we can make it writable and dirty to avoid having to - * handle that later. - */ - /* Only go through if we didn't race with anybody else... */ - if (pte_none(*page_table)) { - if (!PageReserved(new_page)) - ++mm->rss; - flush_page_to_ram(new_page); - flush_icache_page(vma, new_page); - entry = mk_pte(new_page, vma->vm_page_prot); - if (write_access) - entry = pte_mkwrite(pte_mkdirty(entry)); -#ifdef CONFIG_XEN - if ( likely(vma->vm_mm == current->mm) ) - HYPERVISOR_update_va_mapping(address, entry, 0); - else - set_pte(page_table, entry); -#else - set_pte(page_table, entry); -#endif - } else { - /* One of our sibling threads was faster, back out. 
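[The back-out just above is one instance of an idiom this file leans on: an allocation that may sleep forces the page_table_lock to be dropped, so the entry must be re-tested before anything is installed. A distilled pthreads sketch; `slot', install_new() and the mutex are stand-ins, not kernel names.]

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int *slot;                       /* stands in for a pte/pmd entry */

/* Called with the lock held; returns with it held again. */
static int *install_new(void)
{
    int *fresh;

    pthread_mutex_unlock(&lock);        /* allocation may sleep */
    fresh = malloc(sizeof(*fresh));
    pthread_mutex_lock(&lock);

    if (!fresh)
        return slot;                    /* OOM: whatever is there, if anything */
    if (slot != NULL) {                 /* somebody else won the race */
        free(fresh);                    /* back out, keep the winner's entry */
        return slot;
    }
    *fresh = 42;
    slot = fresh;
    return slot;
}

int main(void)
{
    int *p;

    pthread_mutex_lock(&lock);
    p = install_new();
    printf("slot = %d\n", p ? *p : -1);
    pthread_mutex_unlock(&lock);
    return 0;
}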
*/ - page_cache_release(new_page); - spin_unlock(&mm->page_table_lock); - return 1; - } - - /* no need to invalidate: a not-present page shouldn't be cached */ - update_mmu_cache(vma, address, entry); - spin_unlock(&mm->page_table_lock); - return 2; /* Major fault */ -} - -/* - * These routines also need to handle stuff like marking pages dirty - * and/or accessed for architectures that don't do it in hardware (most - * RISC architectures). The early dirtying is also good on the i386. - * - * There is also a hook called "update_mmu_cache()" that architectures - * with external mmu caches can use to update those (ie the Sparc or - * PowerPC hashed page tables that act as extended TLBs). - * - * Note the "page_table_lock". It is to protect against kswapd removing - * pages from under us. Note that kswapd only ever _removes_ pages, never - * adds them. As such, once we have noticed that the page is not present, - * we can drop the lock early. - * - * The adding of pages is protected by the MM semaphore (which we hold), - * so we don't need to worry about a page being suddenly been added into - * our VM. - * - * We enter with the pagetable spinlock held, we are supposed to - * release it when done. - */ -static inline int handle_pte_fault(struct mm_struct *mm, - struct vm_area_struct * vma, unsigned long address, - int write_access, pte_t * pte) -{ - pte_t entry; - - entry = *pte; - if (!pte_present(entry)) { - /* - * If it truly wasn't present, we know that kswapd - * and the PTE updates will not touch it later. So - * drop the lock. - */ - if (pte_none(entry)) - return do_no_page(mm, vma, address, write_access, pte); - return do_swap_page(mm, vma, address, pte, entry, write_access); - } - - if (write_access) { - if (!pte_write(entry)) - return do_wp_page(mm, vma, address, pte, entry); - - entry = pte_mkdirty(entry); - } - entry = pte_mkyoung(entry); - establish_pte(vma, address, pte, entry); - spin_unlock(&mm->page_table_lock); - return 1; -} - -/* - * By the time we get here, we already hold the mm semaphore - */ -int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, - unsigned long address, int write_access) -{ - pgd_t *pgd; - pmd_t *pmd; - - current->state = TASK_RUNNING; - pgd = pgd_offset(mm, address); - - /* - * We need the page table lock to synchronize with kswapd - * and the SMP-safe atomic PTE updates. - */ - spin_lock(&mm->page_table_lock); - pmd = pmd_alloc(mm, pgd, address); - - if (pmd) { - pte_t * pte = pte_alloc(mm, pmd, address); - if (pte) - return handle_pte_fault(mm, vma, address, write_access, pte); - } - spin_unlock(&mm->page_table_lock); - return -1; -} - -/* - * Allocate page middle directory. - * - * We've already handled the fast-path in-line, and we own the - * page table lock. - * - * On a two-level page table, this ends up actually being entirely - * optimized away. - */ -pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) -{ - pmd_t *new; - - /* "fast" allocation can happen without dropping the lock.. */ - new = pmd_alloc_one_fast(mm, address); - if (!new) { - spin_unlock(&mm->page_table_lock); - new = pmd_alloc_one(mm, address); - spin_lock(&mm->page_table_lock); - if (!new) - return NULL; - - /* - * Because we dropped the lock, we should re-check the - * entry, as somebody else could have populated it.. - */ - if (!pgd_none(*pgd)) { - pmd_free(new); - check_pgt_cache(); - goto out; - } - } - pgd_populate(mm, pgd, new); -out: - return pmd_offset(pgd, address); -} - -/* - * Allocate the page table directory. 
- * - * We've already handled the fast-path in-line, and we own the - * page table lock. - */ -pte_t fastcall *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) -{ - if (pmd_none(*pmd)) { - pte_t *new; - - /* "fast" allocation can happen without dropping the lock.. */ - new = pte_alloc_one_fast(mm, address); - if (!new) { - spin_unlock(&mm->page_table_lock); - new = pte_alloc_one(mm, address); - spin_lock(&mm->page_table_lock); - if (!new) - return NULL; - - /* - * Because we dropped the lock, we should re-check the - * entry, as somebody else could have populated it.. - */ - if (!pmd_none(*pmd)) { - pte_free(new); - check_pgt_cache(); - goto out; - } - } - pmd_populate(mm, pmd, new); - } -out: - return pte_offset(pmd, address); -} - -int make_pages_present(unsigned long addr, unsigned long end) -{ - int ret, len, write; - struct vm_area_struct * vma; - - vma = find_vma(current->mm, addr); - write = (vma->vm_flags & VM_WRITE) != 0; - if (addr >= end) - BUG(); - if (end > vma->vm_end) - BUG(); - len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE; - ret = get_user_pages(current, current->mm, addr, - len, write, 0, NULL, NULL); - return ret == len ? 0 : -1; -} - -struct page * vmalloc_to_page(void * vmalloc_addr) -{ - unsigned long addr = (unsigned long) vmalloc_addr; - struct page *page = NULL; - pmd_t *pmd; - pte_t *pte; - pgd_t *pgd; - - pgd = pgd_offset_k(addr); - if (!pgd_none(*pgd)) { - pmd = pmd_offset(pgd, addr); - if (!pmd_none(*pmd)) { - pte = pte_offset(pmd, addr); - if (pte_present(*pte)) { - page = pte_page(*pte); - } - } - } - return page; -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/mm/mprotect.c --- a/linux-2.4.30-xen-sparse/mm/mprotect.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,344 +0,0 @@ -/* - * linux/mm/mprotect.c - * - * (C) Copyright 1994 Linus Torvalds - */ -#include <linux/slab.h> -#include <linux/smp_lock.h> -#include <linux/shm.h> -#include <linux/mman.h> - -#include <asm/uaccess.h> -#include <asm/pgalloc.h> -#include <asm/pgtable.h> - -static inline void change_pte_range(pmd_t * pmd, unsigned long address, - unsigned long size, pgprot_t newprot) -{ - pte_t * pte; - unsigned long end; - - if (pmd_none(*pmd)) - return; - if (pmd_bad(*pmd)) { - pmd_ERROR(*pmd); - pmd_clear(pmd); - return; - } - pte = pte_offset(pmd, address); - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; - do { - if (pte_present(*pte)) { - pte_t entry; - - /* Avoid an SMP race with hardware updated dirty/clean - * bits by wiping the pte and then setting the new pte - * into place. 
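[Why the wipe matters, as a standalone check: a plain read-modify-write of a live pte can lose a dirty bit the MMU sets concurrently, whereas atomically exchanging the entry to zero (what ptep_get_and_clear() amounts to) captures it. GCC's __atomic builtin stands in for the hardware-safe clear; PTE_DIRTY and the bit values are illustrative.]

#include <stdio.h>

#define PTE_DIRTY   0x40UL
#define PTE_NEWPROT 0x2UL

int main(void)
{
    unsigned long pte = 0x1000UL;       /* clean, present entry */
    unsigned long snap, racy, old, safe;

    snap = pte;                         /* racy: snapshot first ...        */
    pte |= PTE_DIRTY;                   /* ... MMU dirties the page ...    */
    racy = snap | PTE_NEWPROT;          /* ... write-back loses the bit    */

    old = __atomic_exchange_n(&pte, 0UL, __ATOMIC_SEQ_CST);
    safe = old | PTE_NEWPROT;           /* wipe-then-set keeps the bit     */

    printf("racy dirty? %s  wiped dirty? %s\n",
           (racy & PTE_DIRTY) ? "yes" : "no",
           (safe & PTE_DIRTY) ? "yes" : "no");
    return 0;
}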
- */ - entry = ptep_get_and_clear(pte); - set_pte(pte, pte_modify(entry, newprot)); - } - address += PAGE_SIZE; - pte++; - } while (address && (address < end)); -} - -static inline void change_pmd_range(pgd_t * pgd, unsigned long address, - unsigned long size, pgprot_t newprot) -{ - pmd_t * pmd; - unsigned long end; - - if (pgd_none(*pgd)) - return; - if (pgd_bad(*pgd)) { - pgd_ERROR(*pgd); - pgd_clear(pgd); - return; - } - pmd = pmd_offset(pgd, address); - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - do { - change_pte_range(pmd, address, end - address, newprot); - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); -} - -static void change_protection(unsigned long start, unsigned long end, pgprot_t newprot) -{ - pgd_t *dir; - unsigned long beg = start; - - dir = pgd_offset(current->mm, start); - flush_cache_range(current->mm, beg, end); - if (start >= end) - BUG(); - spin_lock(¤t->mm->page_table_lock); - do { - change_pmd_range(dir, start, end - start, newprot); - start = (start + PGDIR_SIZE) & PGDIR_MASK; - dir++; - } while (start && (start < end)); - spin_unlock(¤t->mm->page_table_lock); - flush_tlb_range(current->mm, beg, end); - return; -} - -static inline int mprotect_fixup_all(struct vm_area_struct * vma, struct vm_area_struct ** pprev, - int newflags, pgprot_t prot) -{ - struct vm_area_struct * prev = *pprev; - struct mm_struct * mm = vma->vm_mm; - - if (prev && prev->vm_end == vma->vm_start && can_vma_merge(prev, newflags) && - !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { - spin_lock(&mm->page_table_lock); - prev->vm_end = vma->vm_end; - __vma_unlink(mm, vma, prev); - spin_unlock(&mm->page_table_lock); - - kmem_cache_free(vm_area_cachep, vma); - mm->map_count--; - - return 0; - } - - spin_lock(&mm->page_table_lock); - vma->vm_flags = newflags; - vma->vm_page_prot = prot; - spin_unlock(&mm->page_table_lock); - - *pprev = vma; - - return 0; -} - -static inline int mprotect_fixup_start(struct vm_area_struct * vma, struct vm_area_struct ** pprev, - unsigned long end, - int newflags, pgprot_t prot) -{ - struct vm_area_struct * n, * prev = *pprev; - - *pprev = vma; - - if (prev && prev->vm_end == vma->vm_start && can_vma_merge(prev, newflags) && - !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { - spin_lock(&vma->vm_mm->page_table_lock); - prev->vm_end = end; - vma->vm_start = end; - spin_unlock(&vma->vm_mm->page_table_lock); - - return 0; - } - n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); - if (!n) - return -ENOMEM; - *n = *vma; - n->vm_end = end; - n->vm_flags = newflags; - n->vm_raend = 0; - n->vm_page_prot = prot; - if (n->vm_file) - get_file(n->vm_file); - if (n->vm_ops && n->vm_ops->open) - n->vm_ops->open(n); - vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT; - lock_vma_mappings(vma); - spin_lock(&vma->vm_mm->page_table_lock); - vma->vm_start = end; - __insert_vm_struct(current->mm, n); - spin_unlock(&vma->vm_mm->page_table_lock); - unlock_vma_mappings(vma); - - return 0; -} - -static inline int mprotect_fixup_end(struct vm_area_struct * vma, struct vm_area_struct ** pprev, - unsigned long start, - int newflags, pgprot_t prot) -{ - struct vm_area_struct * n; - - n = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - if (!n) - return -ENOMEM; - *n = *vma; - n->vm_start = start; - n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT; - n->vm_flags = newflags; - n->vm_raend = 0; - n->vm_page_prot = prot; - if (n->vm_file) - get_file(n->vm_file); - if (n->vm_ops && 
n->vm_ops->open) - n->vm_ops->open(n); - lock_vma_mappings(vma); - spin_lock(&vma->vm_mm->page_table_lock); - vma->vm_end = start; - __insert_vm_struct(current->mm, n); - spin_unlock(&vma->vm_mm->page_table_lock); - unlock_vma_mappings(vma); - - *pprev = n; - - return 0; -} - -static inline int mprotect_fixup_middle(struct vm_area_struct * vma, struct vm_area_struct ** pprev, - unsigned long start, unsigned long end, - int newflags, pgprot_t prot) -{ - struct vm_area_struct * left, * right; - - left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); - if (!left) - return -ENOMEM; - right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); - if (!right) { - kmem_cache_free(vm_area_cachep, left); - return -ENOMEM; - } - *left = *vma; - *right = *vma; - left->vm_end = start; - right->vm_start = end; - right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT; - left->vm_raend = 0; - right->vm_raend = 0; - if (vma->vm_file) - atomic_add(2,&vma->vm_file->f_count); - if (vma->vm_ops && vma->vm_ops->open) { - vma->vm_ops->open(left); - vma->vm_ops->open(right); - } - vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT; - vma->vm_raend = 0; - vma->vm_page_prot = prot; - lock_vma_mappings(vma); - spin_lock(&vma->vm_mm->page_table_lock); - vma->vm_start = start; - vma->vm_end = end; - vma->vm_flags = newflags; - __insert_vm_struct(current->mm, left); - __insert_vm_struct(current->mm, right); - spin_unlock(&vma->vm_mm->page_table_lock); - unlock_vma_mappings(vma); - - *pprev = right; - - return 0; -} - -static int mprotect_fixup(struct vm_area_struct * vma, struct vm_area_struct ** pprev, - unsigned long start, unsigned long end, unsigned int newflags) -{ - pgprot_t newprot; - int error; - - if (newflags == vma->vm_flags) { - *pprev = vma; - return 0; - } - newprot = protection_map[newflags & 0xf]; - if (start == vma->vm_start) { - if (end == vma->vm_end) - error = mprotect_fixup_all(vma, pprev, newflags, newprot); - else - error = mprotect_fixup_start(vma, pprev, end, newflags, newprot); - } else if (end == vma->vm_end) - error = mprotect_fixup_end(vma, pprev, start, newflags, newprot); - else - error = mprotect_fixup_middle(vma, pprev, start, end, newflags, newprot); - - if (error) - return error; - - change_protection(start, end, newprot); - return 0; -} - -asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot) -{ - unsigned long nstart, end, tmp; - struct vm_area_struct * vma, * next, * prev; - int error = -EINVAL; - - if (start & ~PAGE_MASK) - return -EINVAL; - len = PAGE_ALIGN(len); - end = start + len; - if (end < start) - return -ENOMEM; - if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) - return -EINVAL; - if (end == start) - return 0; - - down_write(¤t->mm->mmap_sem); - - vma = find_vma_prev(current->mm, start, &prev); - error = -ENOMEM; - if (!vma || vma->vm_start > start) - goto out; - -#if defined(CONFIG_XEN_PRIVILEGED_GUEST) - /* mprotect() unsupported for I/O mappings in Xenolinux. */ - error = -EINVAL; - if (vma->vm_flags & VM_IO) - goto out; -#endif - - for (nstart = start ; ; ) { - unsigned int newflags; - int last = 0; - - /* Here we know that vma->vm_start <= nstart < vma->vm_end. 
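[The access test a few lines below leans on the vma flag layout: VM_READ/VM_WRITE/VM_EXEC/VM_SHARED occupy bits 0-3 and their VM_MAY* counterparts bits 4-7, so shifting right by four lines each "may" bit up under the permission it licenses. A standalone check of that trick, with flag values as in the 2.4 headers:]

#include <stdio.h>

#define VM_READ     0x01
#define VM_MAYREAD  0x10
#define VM_MAYWRITE 0x20
#define VM_MAYEXEC  0x40

int main(void)
{
    unsigned int vm_flags = VM_READ | VM_MAYREAD | VM_MAYEXEC;
    unsigned int prot = 0x3;            /* PROT_READ | PROT_WRITE */
    unsigned int newflags = prot | (vm_flags & ~0x7U);

    /* write requested without VM_MAYWRITE -> a bit survives the mask */
    if ((newflags & ~(newflags >> 4)) & 0xf)
        printf("-EACCES\n");
    else
        printf("allowed\n");
    return 0;
}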
*/ - - newflags = prot | (vma->vm_flags & ~(PROT_READ | PROT_WRITE | PROT_EXEC)); - if ((newflags & ~(newflags >> 4)) & 0xf) { - error = -EACCES; - goto out; - } - - if (vma->vm_end > end) { - error = mprotect_fixup(vma, &prev, nstart, end, newflags); - goto out; - } - if (vma->vm_end == end) - last = 1; - - tmp = vma->vm_end; - next = vma->vm_next; - error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); - if (error) - goto out; - if (last) - break; - nstart = tmp; - vma = next; - if (!vma || vma->vm_start != nstart) { - error = -ENOMEM; - goto out; - } - } - if (next && prev->vm_end == next->vm_start && can_vma_merge(next, prev->vm_flags) && - !prev->vm_file && !(prev->vm_flags & VM_SHARED)) { - spin_lock(&prev->vm_mm->page_table_lock); - prev->vm_end = next->vm_end; - __vma_unlink(prev->vm_mm, next, prev); - spin_unlock(&prev->vm_mm->page_table_lock); - - kmem_cache_free(vm_area_cachep, next); - prev->vm_mm->map_count--; - } -out: - up_write(¤t->mm->mmap_sem); - return error; -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/mm/mremap.c --- a/linux-2.4.30-xen-sparse/mm/mremap.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,390 +0,0 @@ -/* - * linux/mm/remap.c - * - * (C) Copyright 1996 Linus Torvalds - */ - -#include <linux/slab.h> -#include <linux/smp_lock.h> -#include <linux/shm.h> -#include <linux/mman.h> -#include <linux/swap.h> - -#include <asm/uaccess.h> -#include <asm/pgalloc.h> - -extern int vm_enough_memory(long pages); - -static inline pte_t *get_one_pte(struct mm_struct *mm, unsigned long addr) -{ - pgd_t * pgd; - pmd_t * pmd; - pte_t * pte = NULL; - - pgd = pgd_offset(mm, addr); - if (pgd_none(*pgd)) - goto end; - if (pgd_bad(*pgd)) { - pgd_ERROR(*pgd); - pgd_clear(pgd); - goto end; - } - - pmd = pmd_offset(pgd, addr); - if (pmd_none(*pmd)) - goto end; - if (pmd_bad(*pmd)) { - pmd_ERROR(*pmd); - pmd_clear(pmd); - goto end; - } - - pte = pte_offset(pmd, addr); - if (pte_none(*pte)) - pte = NULL; -end: - return pte; -} - -static inline pte_t *alloc_one_pte(struct mm_struct *mm, unsigned long addr) -{ - pmd_t * pmd; - pte_t * pte = NULL; - - pmd = pmd_alloc(mm, pgd_offset(mm, addr), addr); - if (pmd) - pte = pte_alloc(mm, pmd, addr); - return pte; -} - -static inline int copy_one_pte(struct mm_struct *mm, pte_t * src, pte_t * dst) -{ - int error = 0; - pte_t pte; - - if (!pte_none(*src)) { - pte = ptep_get_and_clear(src); - if (!dst) { - /* No dest? We must put it back. */ - dst = src; - error++; - } - set_pte(dst, pte); - } - return error; -} - -static int move_one_page(struct mm_struct *mm, unsigned long old_addr, unsigned long new_addr) -{ - int error = 0; - pte_t * src, * dst; - - spin_lock(&mm->page_table_lock); - src = get_one_pte(mm, old_addr); - if (src) { - dst = alloc_one_pte(mm, new_addr); - src = get_one_pte(mm, old_addr); - if (src) - error = copy_one_pte(mm, src, dst); - } - spin_unlock(&mm->page_table_lock); - return error; -} - -static int move_page_tables(struct mm_struct * mm, - unsigned long new_addr, unsigned long old_addr, unsigned long len) -{ - unsigned long offset = len; - - flush_cache_range(mm, old_addr, old_addr + len); - - /* - * This is not the clever way to do this, but we're taking the - * easy way out on the assumption that most remappings will be - * only a few pages.. This also makes error recovery easier. 
- */ - while (offset) { - offset -= PAGE_SIZE; - if (move_one_page(mm, old_addr + offset, new_addr + offset)) - goto oops_we_failed; - } - flush_tlb_range(mm, old_addr, old_addr + len); - return 0; - - /* - * Ok, the move failed because we didn't have enough pages for - * the new page table tree. This is unlikely, but we have to - * take the possibility into account. In that case we just move - * all the pages back (this will work, because we still have - * the old page tables) - */ -oops_we_failed: - flush_cache_range(mm, new_addr, new_addr + len); - while ((offset += PAGE_SIZE) < len) - move_one_page(mm, new_addr + offset, old_addr + offset); - zap_page_range(mm, new_addr, len); - return -1; -} - -static inline unsigned long move_vma(struct vm_area_struct * vma, - unsigned long addr, unsigned long old_len, unsigned long new_len, - unsigned long new_addr) -{ - struct mm_struct * mm = vma->vm_mm; - struct vm_area_struct * new_vma, * next, * prev; - int allocated_vma; - - new_vma = NULL; - next = find_vma_prev(mm, new_addr, &prev); - if (next) { - if (prev && prev->vm_end == new_addr && - can_vma_merge(prev, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { - spin_lock(&mm->page_table_lock); - prev->vm_end = new_addr + new_len; - spin_unlock(&mm->page_table_lock); - new_vma = prev; - if (next != prev->vm_next) - BUG(); - if (prev->vm_end == next->vm_start && can_vma_merge(next, prev->vm_flags)) { - spin_lock(&mm->page_table_lock); - prev->vm_end = next->vm_end; - __vma_unlink(mm, next, prev); - spin_unlock(&mm->page_table_lock); - - mm->map_count--; - kmem_cache_free(vm_area_cachep, next); - } - } else if (next->vm_start == new_addr + new_len && - can_vma_merge(next, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { - spin_lock(&mm->page_table_lock); - next->vm_start = new_addr; - spin_unlock(&mm->page_table_lock); - new_vma = next; - } - } else { - prev = find_vma(mm, new_addr-1); - if (prev && prev->vm_end == new_addr && - can_vma_merge(prev, vma->vm_flags) && !vma->vm_file && !(vma->vm_flags & VM_SHARED)) { - spin_lock(&mm->page_table_lock); - prev->vm_end = new_addr + new_len; - spin_unlock(&mm->page_table_lock); - new_vma = prev; - } - } - - allocated_vma = 0; - if (!new_vma) { - new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); - if (!new_vma) - goto out; - allocated_vma = 1; - } - - if (!move_page_tables(current->mm, new_addr, addr, old_len)) { - unsigned long vm_locked = vma->vm_flags & VM_LOCKED; - - if (allocated_vma) { - *new_vma = *vma; - new_vma->vm_start = new_addr; - new_vma->vm_end = new_addr+new_len; - new_vma->vm_pgoff += (addr-vma->vm_start) >> PAGE_SHIFT; - new_vma->vm_raend = 0; - if (new_vma->vm_file) - get_file(new_vma->vm_file); - if (new_vma->vm_ops && new_vma->vm_ops->open) - new_vma->vm_ops->open(new_vma); - insert_vm_struct(current->mm, new_vma); - } - - /* XXX: possible errors masked, mapping might remain */ - do_munmap(current->mm, addr, old_len); - - current->mm->total_vm += new_len >> PAGE_SHIFT; - if (vm_locked) { - current->mm->locked_vm += new_len >> PAGE_SHIFT; - if (new_len > old_len) - make_pages_present(new_addr + old_len, - new_addr + new_len); - } - return new_addr; - } - if (allocated_vma) - kmem_cache_free(vm_area_cachep, new_vma); - out: - return -ENOMEM; -} - -/* - * Expand (or shrink) an existing mapping, potentially moving it at the - * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) - * - * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise - * This option implies 
MREMAP_MAYMOVE. - */ -unsigned long do_mremap(unsigned long addr, - unsigned long old_len, unsigned long new_len, - unsigned long flags, unsigned long new_addr) -{ - struct vm_area_struct *vma; - unsigned long ret = -EINVAL; - - if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) - goto out; - - if (addr & ~PAGE_MASK) - goto out; - - old_len = PAGE_ALIGN(old_len); - new_len = PAGE_ALIGN(new_len); - - if (old_len > TASK_SIZE || addr > TASK_SIZE - old_len) - goto out; - - if (addr >= TASK_SIZE) - goto out; - - /* new_addr is only valid if MREMAP_FIXED is specified */ - if (flags & MREMAP_FIXED) { - if (new_addr & ~PAGE_MASK) - goto out; - if (!(flags & MREMAP_MAYMOVE)) - goto out; - - if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) - goto out; - - if (new_addr >= TASK_SIZE) - goto out; - - /* - * Allow new_len == 0 only if new_addr == addr - * to preserve truncation in place (that was working - * safe and some app may depend on it). - */ - if (unlikely(!new_len && new_addr != addr)) - goto out; - - /* Check if the location we're moving into overlaps the - * old location at all, and fail if it does. - */ - if ((new_addr <= addr) && (new_addr+new_len) > addr) - goto out; - - if ((addr <= new_addr) && (addr+old_len) > new_addr) - goto out; - - ret = do_munmap(current->mm, new_addr, new_len); - if (ret && new_len) - goto out; - } - - /* - * Always allow a shrinking remap: that just unmaps - * the unnecessary pages.. - */ - if (old_len >= new_len) { - ret = do_munmap(current->mm, addr+new_len, old_len - new_len); - if (ret && old_len != new_len) - goto out; - ret = addr; - if (!(flags & MREMAP_FIXED) || (new_addr == addr)) - goto out; - } - - /* - * Ok, we need to grow.. or relocate. - */ - ret = -EFAULT; - vma = find_vma(current->mm, addr); - if (!vma || vma->vm_start > addr) - goto out; - /* We can't remap across vm area boundaries */ - if (old_len > vma->vm_end - addr) - goto out; - if (vma->vm_flags & VM_DONTEXPAND) { - if (new_len > old_len) - goto out; - } - if (vma->vm_flags & VM_LOCKED) { - unsigned long locked = current->mm->locked_vm << PAGE_SHIFT; - locked += new_len - old_len; - ret = -EAGAIN; - if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) - goto out; - } - ret = -ENOMEM; - if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len) - > current->rlim[RLIMIT_AS].rlim_cur) - goto out; - /* Private writable mapping? Check memory availability.. */ - if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE && - !(flags & MAP_NORESERVE) && - !vm_enough_memory((new_len - old_len) >> PAGE_SHIFT)) - goto out; - -#if defined(CONFIG_XEN_PRIVILEGED_GUEST) - /* mremap() unsupported for I/O mappings in Xenolinux. */ - ret = -EINVAL; - if (vma->vm_flags & VM_IO) - goto out; -#endif - - /* old_len exactly to the end of the area.. - * And we're not relocating the area. - */ - if (old_len == vma->vm_end - addr && - !((flags & MREMAP_FIXED) && (addr != new_addr)) && - (old_len != new_len || !(flags & MREMAP_MAYMOVE))) { - unsigned long max_addr = TASK_SIZE; - if (vma->vm_next) - max_addr = vma->vm_next->vm_start; - /* can we just expand the current mapping? 
*/ - if (max_addr - addr >= new_len) { - int pages = (new_len - old_len) >> PAGE_SHIFT; - spin_lock(&vma->vm_mm->page_table_lock); - vma->vm_end = addr + new_len; - spin_unlock(&vma->vm_mm->page_table_lock); - current->mm->total_vm += pages; - if (vma->vm_flags & VM_LOCKED) { - current->mm->locked_vm += pages; - make_pages_present(addr + old_len, - addr + new_len); - } - ret = addr; - goto out; - } - } - - /* - * We weren't able to just expand or shrink the area, - * we need to create a new one and move it.. - */ - ret = -ENOMEM; - if (flags & MREMAP_MAYMOVE) { - if (!(flags & MREMAP_FIXED)) { - unsigned long map_flags = 0; - if (vma->vm_flags & VM_SHARED) - map_flags |= MAP_SHARED; - - new_addr = get_unmapped_area(vma->vm_file, 0, new_len, vma->vm_pgoff, map_flags); - ret = new_addr; - if (new_addr & ~PAGE_MASK) - goto out; - } - ret = move_vma(vma, addr, old_len, new_len, new_addr); - } -out: - return ret; -} - -asmlinkage unsigned long sys_mremap(unsigned long addr, - unsigned long old_len, unsigned long new_len, - unsigned long flags, unsigned long new_addr) -{ - unsigned long ret; - - down_write(¤t->mm->mmap_sem); - ret = do_mremap(addr, old_len, new_len, flags, new_addr); - up_write(¤t->mm->mmap_sem); - return ret; -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/mm/page_alloc.c --- a/linux-2.4.30-xen-sparse/mm/page_alloc.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,972 +0,0 @@ -/* - * linux/mm/page_alloc.c - * - * Manages the free list, the system allocates free pages here. - * Note that kmalloc() lives in slab.c - * - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * Swap reorganised 29.12.95, Stephen Tweedie - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 - * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 - * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 - * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 - */ - -#include <linux/config.h> -#include <linux/mm.h> -#include <linux/swap.h> -#include <linux/swapctl.h> -#include <linux/interrupt.h> -#include <linux/pagemap.h> -#include <linux/bootmem.h> -#include <linux/slab.h> -#include <linux/module.h> - -int nr_swap_pages; -int nr_active_pages; -int nr_inactive_pages; -LIST_HEAD(inactive_list); -LIST_HEAD(active_list); -pg_data_t *pgdat_list; - -/* - * - * The zone_table array is used to look up the address of the - * struct zone corresponding to a given zone number (ZONE_DMA, - * ZONE_NORMAL, or ZONE_HIGHMEM). 
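[A small illustration of the flat indexing this table implies: node id and zone number fold into a single index, and free_area_init_core() later in this file stamps the same index into every page via set_page_zone(). MAX_NR_ZONES == 3 as in the array just below; the node id is made up.]

#include <stdio.h>

#define MAX_NR_ZONES 3

int main(void)
{
    int nid = 2;                        /* third node */
    int j = 2;                          /* ZONE_HIGHMEM */

    printf("zone_table[%d] -> node %d, HighMem\n",
           nid * MAX_NR_ZONES + j, nid);    /* index 8 */
    return 0;
}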
- */ -zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES]; -EXPORT_SYMBOL(zone_table); - -static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; -static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, }; -static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, }; -static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, }; -static int lower_zone_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 }; - -int vm_gfp_debug = 0; - -static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order)); - -static spinlock_t free_pages_ok_no_irq_lock = SPIN_LOCK_UNLOCKED; -struct page * free_pages_ok_no_irq_head; - -static void do_free_pages_ok_no_irq(void * arg) -{ - struct page * page, * __page; - - spin_lock_irq(&free_pages_ok_no_irq_lock); - - page = free_pages_ok_no_irq_head; - free_pages_ok_no_irq_head = NULL; - - spin_unlock_irq(&free_pages_ok_no_irq_lock); - - while (page) { - __page = page; - page = page->next_hash; - __free_pages_ok(__page, __page->index); - } -} - -static struct tq_struct free_pages_ok_no_irq_task = { - .routine = do_free_pages_ok_no_irq, -}; - - -/* - * Temporary debugging check. - */ -#define BAD_RANGE(zone, page) \ -( \ - (((page) - mem_map) >= ((zone)->zone_start_mapnr+(zone)->size)) \ - || (((page) - mem_map) < (zone)->zone_start_mapnr) \ - || ((zone) != page_zone(page)) \ -) - -/* - * Freeing function for a buddy system allocator. - * Contrary to prior comments, this is *NOT* hairy, and there - * is no reason for anyone not to understand it. - * - * The concept of a buddy system is to maintain direct-mapped tables - * (containing bit values) for memory blocks of various "orders". - * The bottom level table contains the map for the smallest allocatable - * units of memory (here, pages), and each level above it describes - * pairs of units from the levels below, hence, "buddies". - * At a high level, all that happens here is marking the table entry - * at the bottom level available, and propagating the changes upward - * as necessary, plus some accounting needed to play nicely with other - * parts of the VM system. - * At each level, we keep one bit for each pair of blocks, which - * is set to 1 iff only one of the pair is allocated. So when we - * are allocating or freeing one, we can derive the state of the - * other. That is, if we allocate a small block, and both were - * free, the remainder of the region must be split into blocks. - * If a block is freed, and its buddy is also free, then this - * triggers coalescing into a block of larger size. - * - * -- wli - */ - -static void fastcall __free_pages_ok (struct page *page, unsigned int order) -{ - unsigned long index, page_idx, mask, flags; - free_area_t *area; - struct page *base; - zone_t *zone; - - if (PageForeign(page)) - return (PageForeignDestructor(page))(page); - - /* - * Yes, think what happens when other parts of the kernel take - * a reference to a page in order to pin it for io. 
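[The buddy arithmetic described in the long comment above can be checked standalone: flipping bit `order' of a block's page index names its buddy, and the pair shares map bit index >> (1 + order), exactly as __free_pages_ok() computes below. A worked case with illustrative values:]

#include <stdio.h>

int main(void)
{
    unsigned long page_idx = 12;            /* block starting at page 12 */
    unsigned int order = 2;                 /* 2^2 = 4 pages */
    unsigned long mask = (~0UL) << order;
    unsigned long buddy = page_idx ^ -mask; /* -mask == 1UL << order */
    unsigned long index = page_idx >> (1 + order);

    printf("buddy of block %lu: %lu, shared map bit: %lu\n",
           page_idx, buddy, index);         /* buddy 8, bit 1 */
    printf("merged order-%u block starts at page %lu\n",
           order + 1, page_idx & (mask << 1));  /* page 8 */
    return 0;
}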
-ben - */ - if (PageLRU(page)) { - if (unlikely(in_interrupt())) { - unsigned long flags; - - spin_lock_irqsave(&free_pages_ok_no_irq_lock, flags); - page->next_hash = free_pages_ok_no_irq_head; - free_pages_ok_no_irq_head = page; - page->index = order; - - spin_unlock_irqrestore(&free_pages_ok_no_irq_lock, flags); - - schedule_task(&free_pages_ok_no_irq_task); - return; - } - - lru_cache_del(page); - } - - if (page->buffers) - BUG(); - if (page->mapping) - BUG(); - if (!VALID_PAGE(page)) - BUG(); - if (PageLocked(page)) - BUG(); - if (PageActive(page)) - BUG(); - ClearPageReferenced(page); - ClearPageDirty(page); - - if (current->flags & PF_FREE_PAGES) - goto local_freelist; - back_local_freelist: - - zone = page_zone(page); - - mask = (~0UL) << order; - base = zone->zone_mem_map; - page_idx = page - base; - if (page_idx & ~mask) - BUG(); - index = page_idx >> (1 + order); - - area = zone->free_area + order; - - spin_lock_irqsave(&zone->lock, flags); - - zone->free_pages -= mask; - - while (mask + (1 << (MAX_ORDER-1))) { - struct page *buddy1, *buddy2; - - if (area >= zone->free_area + MAX_ORDER) - BUG(); - if (!__test_and_change_bit(index, area->map)) - /* - * the buddy page is still allocated. - */ - break; - /* - * Move the buddy up one level. - * This code is taking advantage of the identity: - * -mask = 1+~mask - */ - buddy1 = base + (page_idx ^ -mask); - buddy2 = base + page_idx; - if (BAD_RANGE(zone,buddy1)) - BUG(); - if (BAD_RANGE(zone,buddy2)) - BUG(); - - list_del(&buddy1->list); - mask <<= 1; - area++; - index >>= 1; - page_idx &= mask; - } - list_add(&(base + page_idx)->list, &area->free_list); - - spin_unlock_irqrestore(&zone->lock, flags); - return; - - local_freelist: - if (current->nr_local_pages) - goto back_local_freelist; - if (in_interrupt()) - goto back_local_freelist; - - list_add(&page->list, ¤t->local_pages); - page->index = order; - current->nr_local_pages++; -} - -#define MARK_USED(index, order, area) \ - __change_bit((index) >> (1+(order)), (area)->map) - -static inline struct page * expand (zone_t *zone, struct page *page, - unsigned long index, int low, int high, free_area_t * area) -{ - unsigned long size = 1 << high; - - while (high > low) { - if (BAD_RANGE(zone,page)) - BUG(); - area--; - high--; - size >>= 1; - list_add(&(page)->list, &(area)->free_list); - MARK_USED(index, high, area); - index += size; - page += size; - } - if (BAD_RANGE(zone,page)) - BUG(); - return page; -} - -static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order)); -static struct page * fastcall rmqueue(zone_t *zone, unsigned int order) -{ - free_area_t * area = zone->free_area + order; - unsigned int curr_order = order; - struct list_head *head, *curr; - unsigned long flags; - struct page *page; - - spin_lock_irqsave(&zone->lock, flags); - do { - head = &area->free_list; - curr = head->next; - - if (curr != head) { - unsigned int index; - - page = list_entry(curr, struct page, list); - if (BAD_RANGE(zone,page)) - BUG(); - list_del(curr); - index = page - zone->zone_mem_map; - if (curr_order != MAX_ORDER-1) - MARK_USED(index, curr_order, area); - zone->free_pages -= 1UL << order; - - page = expand(zone, page, index, order, curr_order, area); - spin_unlock_irqrestore(&zone->lock, flags); - - set_page_count(page, 1); - if (BAD_RANGE(zone,page)) - BUG(); - if (PageLRU(page)) - BUG(); - if (PageActive(page)) - BUG(); - return page; - } - curr_order++; - area++; - } while (curr_order < MAX_ORDER); - spin_unlock_irqrestore(&zone->lock, flags); - - return NULL; -} - -#ifndef 
CONFIG_DISCONTIGMEM -struct page * fastcall _alloc_pages(unsigned int gfp_mask, unsigned int order) -{ - return __alloc_pages(gfp_mask, order, - contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK)); -} -#endif - -static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *)); -static struct page * fastcall balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed) -{ - struct page * page = NULL; - int __freed; - - if (in_interrupt()) - BUG(); - - current->allocation_order = order; - current->flags |= PF_MEMALLOC | PF_FREE_PAGES; - - __freed = try_to_free_pages_zone(classzone, gfp_mask); - - current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES); - - if (current->nr_local_pages) { - struct list_head * entry, * local_pages; - struct page * tmp; - int nr_pages; - - local_pages = ¤t->local_pages; - - if (likely(__freed)) { - /* pick from the last inserted so we're lifo */ - entry = local_pages->next; - do { - tmp = list_entry(entry, struct page, list); - if (tmp->index == order && memclass(page_zone(tmp), classzone)) { - list_del(entry); - current->nr_local_pages--; - set_page_count(tmp, 1); - page = tmp; - - if (page->buffers) - BUG(); - if (page->mapping) - BUG(); - if (!VALID_PAGE(page)) - BUG(); - if (PageLocked(page)) - BUG(); - if (PageLRU(page)) - BUG(); - if (PageActive(page)) - BUG(); - if (PageDirty(page)) - BUG(); - - break; - } - } while ((entry = entry->next) != local_pages); - } - - nr_pages = current->nr_local_pages; - /* free in reverse order so that the global order will be lifo */ - while ((entry = local_pages->prev) != local_pages) { - list_del(entry); - tmp = list_entry(entry, struct page, list); - __free_pages_ok(tmp, tmp->index); - if (!nr_pages--) - BUG(); - } - current->nr_local_pages = 0; - } - - *freed = __freed; - return page; -} - -static inline unsigned long zone_free_pages(zone_t * zone, unsigned int order) -{ - long free = zone->free_pages - (1UL << order); - return free >= 0 ? 
free : 0; -} - -/* - * This is the 'heart' of the zoned buddy allocator: - */ -struct page * fastcall __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist) -{ - zone_t **zone, * classzone; - struct page * page; - int freed, class_idx; - - zone = zonelist->zones; - classzone = *zone; - class_idx = zone_idx(classzone); - - for (;;) { - zone_t *z = *(zone++); - if (!z) - break; - - if (zone_free_pages(z, order) > z->watermarks[class_idx].low) { - page = rmqueue(z, order); - if (page) - return page; - } - } - - classzone->need_balance = 1; - mb(); - if (waitqueue_active(&kswapd_wait)) - wake_up_interruptible(&kswapd_wait); - - zone = zonelist->zones; - for (;;) { - unsigned long min; - zone_t *z = *(zone++); - if (!z) - break; - - min = z->watermarks[class_idx].min; - if (!(gfp_mask & __GFP_WAIT)) - min >>= 2; - if (zone_free_pages(z, order) > min) { - page = rmqueue(z, order); - if (page) - return page; - } - } - - /* here we're in the low on memory slow path */ - - if ((current->flags & PF_MEMALLOC) && - (!in_interrupt() || (current->flags & PF_MEMDIE))) { - zone = zonelist->zones; - for (;;) { - zone_t *z = *(zone++); - if (!z) - break; - - page = rmqueue(z, order); - if (page) - return page; - } - return NULL; - } - - /* Atomic allocations - we can't balance anything */ - if (!(gfp_mask & __GFP_WAIT)) - goto out; - - rebalance: - page = balance_classzone(classzone, gfp_mask, order, &freed); - if (page) - return page; - - zone = zonelist->zones; - if (likely(freed)) { - for (;;) { - zone_t *z = *(zone++); - if (!z) - break; - - if (zone_free_pages(z, order) > z->watermarks[class_idx].min) { - page = rmqueue(z, order); - if (page) - return page; - } - } - goto rebalance; - } else { - /* - * Check that no other task is been killed meanwhile, - * in such a case we can succeed the allocation. - */ - for (;;) { - zone_t *z = *(zone++); - if (!z) - break; - - if (zone_free_pages(z, order) > z->watermarks[class_idx].high) { - page = rmqueue(z, order); - if (page) - return page; - } - } - } - - out: - printk(KERN_NOTICE "__alloc_pages: %u-order allocation failed (gfp=0x%x/%i)\n", - order, gfp_mask, !!(current->flags & PF_MEMALLOC)); - if (unlikely(vm_gfp_debug)) - dump_stack(); - return NULL; -} - -/* - * Common helper functions. 
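[Before those helpers, a distilled model of the watermark cascade in __alloc_pages() above: one pass over the zonelist against the roomy "low" watermark, wake kswapd on failure, then a second pass against "min". The three-field struct zone here is a toy stand-in, not the kernel's; zone_free_pages() is mirrored inline.]

#include <stdio.h>

struct zone { const char *name; long free, low, min; };

static struct zone *scan(struct zone **zl, unsigned int order, int use_min)
{
    struct zone *z;
    int i;

    for (i = 0; (z = zl[i]) != NULL; i++)
        if (z->free - (1L << order) > (use_min ? z->min : z->low))
            return z;
    return NULL;
}

int main(void)
{
    struct zone high = { "HighMem", 10, 64, 16 };
    struct zone norm = { "Normal", 200, 256, 64 };
    struct zone *zl[] = { &high, &norm, NULL };
    struct zone *z = scan(zl, 0, 0);

    if (!z)                         /* first pass failed: kswapd woken */
        z = scan(zl, 0, 1);
    printf("allocated from %s\n", z ? z->name : "nowhere");  /* Normal */
    return 0;
}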
- */ -fastcall unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order) -{ - struct page * page; - - page = alloc_pages(gfp_mask, order); - if (!page) - return 0; - return (unsigned long) page_address(page); -} - -fastcall unsigned long get_zeroed_page(unsigned int gfp_mask) -{ - struct page * page; - - page = alloc_pages(gfp_mask, 0); - if (page) { - void *address = page_address(page); - clear_page(address); - return (unsigned long) address; - } - return 0; -} - -fastcall void __free_pages(struct page *page, unsigned int order) -{ - if (!PageReserved(page) && put_page_testzero(page)) - __free_pages_ok(page, order); -} - -fastcall void free_pages(unsigned long addr, unsigned int order) -{ - if (addr != 0) - __free_pages(virt_to_page(addr), order); -} - -/* - * Total amount of free (allocatable) RAM: - */ -unsigned int nr_free_pages (void) -{ - unsigned int sum = 0; - zone_t *zone; - - for_each_zone(zone) - sum += zone->free_pages; - - return sum; -} - -/* - * Amount of free RAM allocatable as buffer memory: - */ -unsigned int nr_free_buffer_pages (void) -{ - pg_data_t *pgdat; - unsigned int sum = 0; - zonelist_t *zonelist; - zone_t **zonep, *zone; - - for_each_pgdat(pgdat) { - int class_idx; - zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK); - zonep = zonelist->zones; - zone = *zonep; - class_idx = zone_idx(zone); - - sum += zone->nr_cache_pages; - for (; zone; zone = *zonep++) { - int free = zone->free_pages - zone->watermarks[class_idx].high; - if (free <= 0) - continue; - sum += free; - } - } - - return sum; -} - -#if CONFIG_HIGHMEM -unsigned int nr_free_highpages (void) -{ - pg_data_t *pgdat; - unsigned int pages = 0; - - for_each_pgdat(pgdat) - pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; - - return pages; -} - -unsigned int freeable_lowmem(void) -{ - unsigned int pages = 0; - pg_data_t *pgdat; - - for_each_pgdat(pgdat) { - pages += pgdat->node_zones[ZONE_DMA].free_pages; - pages += pgdat->node_zones[ZONE_DMA].nr_active_pages; - pages += pgdat->node_zones[ZONE_DMA].nr_inactive_pages; - pages += pgdat->node_zones[ZONE_NORMAL].free_pages; - pages += pgdat->node_zones[ZONE_NORMAL].nr_active_pages; - pages += pgdat->node_zones[ZONE_NORMAL].nr_inactive_pages; - } - - return pages; -} -#endif - -#define K(x) ((x) << (PAGE_SHIFT-10)) - -/* - * Show free area list (used inside shift_scroll-lock stuff) - * We also calculate the percentage fragmentation. We do this by counting the - * memory on each free list with the exception of the first item on the list. 
- */ -void show_free_areas_core(pg_data_t *pgdat) -{ - unsigned int order; - unsigned type; - pg_data_t *tmpdat = pgdat; - - printk("Free pages: %6dkB (%6dkB HighMem)\n", - K(nr_free_pages()), - K(nr_free_highpages())); - - while (tmpdat) { - zone_t *zone; - for (zone = tmpdat->node_zones; - zone < tmpdat->node_zones + MAX_NR_ZONES; zone++) - printk("Zone:%s freepages:%6lukB\n", - zone->name, - K(zone->free_pages)); - - tmpdat = tmpdat->node_next; - } - - printk("( Active: %d, inactive: %d, free: %d )\n", - nr_active_pages, - nr_inactive_pages, - nr_free_pages()); - - for (type = 0; type < MAX_NR_ZONES; type++) { - struct list_head *head, *curr; - zone_t *zone = pgdat->node_zones + type; - unsigned long nr, total, flags; - - total = 0; - if (zone->size) { - spin_lock_irqsave(&zone->lock, flags); - for (order = 0; order < MAX_ORDER; order++) { - head = &(zone->free_area + order)->free_list; - curr = head; - nr = 0; - for (;;) { - if ((curr = curr->next) == head) - break; - nr++; - } - total += nr * (1 << order); - printk("%lu*%lukB ", nr, K(1UL) << order); - } - spin_unlock_irqrestore(&zone->lock, flags); - } - printk("= %lukB)\n", K(total)); - } - -#ifdef SWAP_CACHE_INFO - show_swap_cache_info(); -#endif -} - -void show_free_areas(void) -{ - show_free_areas_core(pgdat_list); -} - -/* - * Builds allocation fallback zone lists. - */ -static inline void build_zonelists(pg_data_t *pgdat) -{ - int i, j, k; - - for (i = 0; i <= GFP_ZONEMASK; i++) { - zonelist_t *zonelist; - zone_t *zone; - - zonelist = pgdat->node_zonelists + i; - memset(zonelist, 0, sizeof(*zonelist)); - - j = 0; - k = ZONE_NORMAL; - if (i & __GFP_HIGHMEM) - k = ZONE_HIGHMEM; - if (i & __GFP_DMA) - k = ZONE_DMA; - - switch (k) { - default: - BUG(); - /* - * fallthrough: - */ - case ZONE_HIGHMEM: - zone = pgdat->node_zones + ZONE_HIGHMEM; - if (zone->size) { -#ifndef CONFIG_HIGHMEM - BUG(); -#endif - zonelist->zones[j++] = zone; - } - case ZONE_NORMAL: - zone = pgdat->node_zones + ZONE_NORMAL; - if (zone->size) - zonelist->zones[j++] = zone; - case ZONE_DMA: - zone = pgdat->node_zones + ZONE_DMA; - if (zone->size) - zonelist->zones[j++] = zone; - } - zonelist->zones[j++] = NULL; - } -} - -/* - * Helper functions to size the waitqueue hash table. - * Essentially these want to choose hash table sizes sufficiently - * large so that collisions trying to wait on pages are rare. - * But in fact, the number of active page waitqueues on typical - * systems is ridiculously low, less than 200. So this is even - * conservative, even though it seems large. - * - * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to - * waitqueues, i.e. the size of the waitq table given the number of pages. - */ -#define PAGES_PER_WAITQUEUE 256 - -static inline unsigned long wait_table_size(unsigned long pages) -{ - unsigned long size = 1; - - pages /= PAGES_PER_WAITQUEUE; - - while (size < pages) - size <<= 1; - - /* - * Once we have dozens or even hundreds of threads sleeping - * on IO we've got bigger problems than wait queue collision. - * Limit the size of the wait table to a reasonable size. - */ - size = min(size, 4096UL); - - return size; -} - -/* - * This is an integer logarithm so that shifts can be used later - * to extract the more random high bits from the multiplicative - * hash function before the remainder is taken. 
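[A worked instance of the sizing described above: with PAGES_PER_WAITQUEUE at 256, a zone of 131072 pages (512 MB of 4 KB pages) rounds up to 512 hash buckets, and wait_table_bits() just below is then simply log2 of that size.]

#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256

static unsigned long wait_table_size(unsigned long pages)
{
    unsigned long size = 1;

    pages /= PAGES_PER_WAITQUEUE;
    while (size < pages)
        size <<= 1;
    return size < 4096UL ? size : 4096UL;   /* same clamp as above */
}

int main(void)
{
    printf("131072 pages -> %lu buckets\n", wait_table_size(131072));
    return 0;
}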
- */ -static inline unsigned long wait_table_bits(unsigned long size) -{ - return ffz(~size); -} - -#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) - -/* - * Set up the zone data structures: - * - mark all pages reserved - * - mark all memory queues empty - * - clear the memory bitmaps - */ -void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, - unsigned long *zones_size, unsigned long zone_start_paddr, - unsigned long *zholes_size, struct page *lmem_map) -{ - unsigned long i, j; - unsigned long map_size; - unsigned long totalpages, offset, realtotalpages; - const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); - - if (zone_start_paddr & ~PAGE_MASK) - BUG(); - - totalpages = 0; - for (i = 0; i < MAX_NR_ZONES; i++) { - unsigned long size = zones_size[i]; - totalpages += size; - } - realtotalpages = totalpages; - if (zholes_size) - for (i = 0; i < MAX_NR_ZONES; i++) - realtotalpages -= zholes_size[i]; - - printk("On node %d totalpages: %lu\n", nid, realtotalpages); - - /* - * Some architectures (with lots of mem and discontinous memory - * maps) have to search for a good mem_map area: - * For discontigmem, the conceptual mem map array starts from - * PAGE_OFFSET, we need to align the actual array onto a mem map - * boundary, so that MAP_NR works. - */ - map_size = (totalpages + 1)*sizeof(struct page); - if (lmem_map == (struct page *)0) { - lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size); - lmem_map = (struct page *)(PAGE_OFFSET + - MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET)); - } - *gmap = pgdat->node_mem_map = lmem_map; - pgdat->node_size = totalpages; - pgdat->node_start_paddr = zone_start_paddr; - pgdat->node_start_mapnr = (lmem_map - mem_map); - pgdat->nr_zones = 0; - - offset = lmem_map - mem_map; - for (j = 0; j < MAX_NR_ZONES; j++) { - zone_t *zone = pgdat->node_zones + j; - unsigned long mask; - unsigned long size, realsize; - int idx; - - zone_table[nid * MAX_NR_ZONES + j] = zone; - realsize = size = zones_size[j]; - if (zholes_size) - realsize -= zholes_size[j]; - - printk("zone(%lu): %lu pages.\n", j, size); - zone->size = size; - zone->realsize = realsize; - zone->name = zone_names[j]; - zone->lock = SPIN_LOCK_UNLOCKED; - zone->zone_pgdat = pgdat; - zone->free_pages = 0; - zone->need_balance = 0; - zone->nr_active_pages = zone->nr_inactive_pages = 0; - - - if (!size) - continue; - - /* - * The per-page waitqueue mechanism uses hashed waitqueues - * per zone. 
- */ - zone->wait_table_size = wait_table_size(size); - zone->wait_table_shift = - BITS_PER_LONG - wait_table_bits(zone->wait_table_size); - zone->wait_table = (wait_queue_head_t *) - alloc_bootmem_node(pgdat, zone->wait_table_size - * sizeof(wait_queue_head_t)); - - for(i = 0; i < zone->wait_table_size; ++i) - init_waitqueue_head(zone->wait_table + i); - - pgdat->nr_zones = j+1; - - mask = (realsize / zone_balance_ratio[j]); - if (mask < zone_balance_min[j]) - mask = zone_balance_min[j]; - else if (mask > zone_balance_max[j]) - mask = zone_balance_max[j]; - zone->watermarks[j].min = mask; - zone->watermarks[j].low = mask*2; - zone->watermarks[j].high = mask*3; - /* now set the watermarks of the lower zones in the "j" classzone */ - for (idx = j-1; idx >= 0; idx--) { - zone_t * lower_zone = pgdat->node_zones + idx; - unsigned long lower_zone_reserve; - if (!lower_zone->size) - continue; - - mask = lower_zone->watermarks[idx].min; - lower_zone->watermarks[j].min = mask; - lower_zone->watermarks[j].low = mask*2; - lower_zone->watermarks[j].high = mask*3; - - /* now the brainer part */ - lower_zone_reserve = realsize / lower_zone_reserve_ratio[idx]; - lower_zone->watermarks[j].min += lower_zone_reserve; - lower_zone->watermarks[j].low += lower_zone_reserve; - lower_zone->watermarks[j].high += lower_zone_reserve; - - realsize += lower_zone->realsize; - } - - zone->zone_mem_map = mem_map + offset; - zone->zone_start_mapnr = offset; - zone->zone_start_paddr = zone_start_paddr; - - if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1)) - printk("BUG: wrong zone alignment, it will crash\n"); - - /* - * Initially all pages are reserved - free ones are freed - * up by free_all_bootmem() once the early boot process is - * done. Non-atomic initialization, single-pass. - */ - for (i = 0; i < size; i++) { - struct page *page = mem_map + offset + i; - set_page_zone(page, nid * MAX_NR_ZONES + j); - set_page_count(page, 0); - SetPageReserved(page); - INIT_LIST_HEAD(&page->list); - if (j != ZONE_HIGHMEM) - set_page_address(page, __va(zone_start_paddr)); - zone_start_paddr += PAGE_SIZE; - } - - offset += size; - for (i = 0; ; i++) { - unsigned long bitmap_size; - - INIT_LIST_HEAD(&zone->free_area[i].free_list); - if (i == MAX_ORDER-1) { - zone->free_area[i].map = NULL; - break; - } - - /* - * Page buddy system uses "index >> (i+1)", - * where "index" is at most "size-1". - * - * The extra "+3" is to round down to byte - * size (8 bits per byte assumption). Thus - * we get "(size-1) >> (i+4)" as the last byte - * we can access. - * - * The "+1" is because we want to round the - * byte allocation up rather than down. So - * we should have had a "+7" before we shifted - * down by three. Also, we have to add one as - * we actually _use_ the last bit (it's [0,n] - * inclusive, not [0,n[). - * - * So we actually had +7+1 before we shift - * down by 3. But (n+8) >> 3 == (n >> 3) + 1 - * (modulo overflows, which we do not have). - * - * Finally, we LONG_ALIGN because all bitmap - * operations are on longs. 
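[Checking that arithmetic on a concrete zone: 1<<20 pages at order 0 means one map bit per pair of pages, 1<<19 bits, i.e. 65536 bytes, and the formula lands on exactly that after its round-up.]

#include <stdio.h>

#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))

int main(void)
{
    unsigned long size = 1UL << 20;     /* pages in the zone */
    int i = 0;                          /* order */
    unsigned long bitmap_size = (size - 1) >> (i + 4);

    bitmap_size = LONG_ALIGN(bitmap_size + 1);
    printf("order %d map: %lu bytes\n", i, bitmap_size);    /* 65536 */
    return 0;
}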
- */ - bitmap_size = (size-1) >> (i+4); - bitmap_size = LONG_ALIGN(bitmap_size+1); - zone->free_area[i].map = - (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size); - } - } - build_zonelists(pgdat); -} - -void __init free_area_init(unsigned long *zones_size) -{ - free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0); -} - -static int __init setup_mem_frac(char *str) -{ - int j = 0; - - while (get_option(&str, &zone_balance_ratio[j++]) == 2); - printk("setup_mem_frac: "); - for (j = 0; j < MAX_NR_ZONES; j++) printk("%d ", zone_balance_ratio[j]); - printk("\n"); - return 1; -} - -__setup("memfrac=", setup_mem_frac); - -static int __init setup_lower_zone_reserve(char *str) -{ - int j = 0; - - while (get_option(&str, &lower_zone_reserve_ratio[j++]) == 2); - printk("setup_lower_zone_reserve: "); - for (j = 0; j < MAX_NR_ZONES-1; j++) printk("%d ", lower_zone_reserve_ratio[j]); - printk("\n"); - return 1; -} - -__setup("lower_zone_reserve=", setup_lower_zone_reserve); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.4.30-xen-sparse/net/core/skbuff.c --- a/linux-2.4.30-xen-sparse/net/core/skbuff.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,1309 +0,0 @@ -/* - * Routines having to do with the 'struct sk_buff' memory handlers. - * - * Authors: Alan Cox <iiitac@xxxxxxxxxxxxxx> - * Florian La Roche <rzsfl@xxxxxxxxxxxx> - * - * Version: $Id: skbuff.c,v 1.90 2001/11/07 05:56:19 davem Exp $ - * - * Fixes: - * Alan Cox : Fixed the worst of the load balancer bugs. - * Dave Platt : Interrupt stacking fix. - * Richard Kooijman : Timestamp fixes. - * Alan Cox : Changed buffer format. - * Alan Cox : destructor hook for AF_UNIX etc. - * Linus Torvalds : Better skb_clone. - * Alan Cox : Added skb_copy. - * Alan Cox : Added all the changed routines Linus - * only put in the headers - * Ray VanTassle : Fixed --skb->lock in free - * Alan Cox : skb_copy copy arp field - * Andi Kleen : slabified it. - * - * NOTE: - * The __skb_ routines should be called with interrupts - * disabled, or you better be *real* sure that the operation is atomic - * with respect to whatever list is being frobbed (e.g. via lock_sock() - * or via disabling bottom half handlers, etc). - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -/* - * The functions in this file will not compile correctly with gcc 2.4.x - */ - -#include <linux/config.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/interrupt.h> -#include <linux/in.h> -#include <linux/inet.h> -#include <linux/slab.h> -#include <linux/netdevice.h> -#include <linux/string.h> -#include <linux/skbuff.h> -#include <linux/cache.h> -#include <linux/rtnetlink.h> -#include <linux/init.h> -#include <linux/highmem.h> - -#include <net/protocol.h> -#include <net/dst.h> -#include <net/sock.h> -#include <net/checksum.h> - -#include <asm/uaccess.h> -#include <asm/system.h> - -int sysctl_hot_list_len = 128; - -static kmem_cache_t *skbuff_head_cache; - -static union { - struct sk_buff_head list; - char pad[SMP_CACHE_BYTES]; -} skb_head_pool[NR_CPUS]; - -/* - * Keep out-of-line to prevent kernel bloat. - * __builtin_return_address is not used because it is not always - * reliable. 
- */ - -/** - * skb_over_panic - private function - * @skb: buffer - * @sz: size - * @here: address - * - * Out of line support code for skb_put(). Not user callable. - */ - -void skb_over_panic(struct sk_buff *skb, int sz, void *here) -{ - printk("skput:over: %p:%d put:%d dev:%s", - here, skb->len, sz, skb->dev ? skb->dev->name : "<NULL>"); - BUG(); -} - -/** - * skb_under_panic - private function - * @skb: buffer - * @sz: size - * @here: address - * - * Out of line support code for skb_push(). Not user callable. - */ - - -void skb_under_panic(struct sk_buff *skb, int sz, void *here) -{ - printk("skput:under: %p:%d put:%d dev:%s", - here, skb->len, sz, skb->dev ? skb->dev->name : "<NULL>"); - BUG(); -} - -static __inline__ struct sk_buff *skb_head_from_pool(void) -{ - struct sk_buff_head *list = &skb_head_pool[smp_processor_id()].list; - - if (skb_queue_len(list)) { - struct sk_buff *skb; - unsigned long flags; - - local_irq_save(flags); - skb = __skb_dequeue(list); - local_irq_restore(flags); - return skb; - } - return NULL; -} - -static __inline__ void skb_head_to_pool(struct sk_buff *skb) -{ - struct sk_buff_head *list = &skb_head_pool[smp_processor_id()].list; - - if (skb_queue_len(list) < sysctl_hot_list_len) { - unsigned long flags; - - local_irq_save(flags); - __skb_queue_head(list, skb); - local_irq_restore(flags); - - return; - } - kmem_cache_free(skbuff_head_cache, skb); -} - - -/* Allocate a new skbuff. We do this ourselves so we can fill in a few - * 'private' fields and also do memory statistics to find all the - * [BEEP] leaks. - * - */ - -/** - * alloc_skb - allocate a network buffer - * @size: size to allocate - * @gfp_mask: allocation mask - * - * Allocate a new &sk_buff. The returned buffer has no headroom and a - * tail room of size bytes. The object has a reference count of one. - * The return is the buffer. On a failure the return is %NULL. - * - * Buffers may only be allocated from interrupts using a @gfp_mask of - * %GFP_ATOMIC. - */ - -struct sk_buff *alloc_skb(unsigned int size,int gfp_mask) -{ - struct sk_buff *skb; - u8 *data; - - if (in_interrupt() && (gfp_mask & __GFP_WAIT)) { - static int count = 0; - if (++count < 5) { - printk(KERN_ERR "alloc_skb called nonatomically " - "from interrupt %p\n", NET_CALLER(size)); - BUG(); - } - gfp_mask &= ~__GFP_WAIT; - } - - /* Get the HEAD */ - skb = skb_head_from_pool(); - if (skb == NULL) { - skb = kmem_cache_alloc(skbuff_head_cache, gfp_mask & ~__GFP_DMA); - if (skb == NULL) - goto nohead; - } - - /* Get the DATA. Size must match skb_add_mtu(). */ - size = SKB_DATA_ALIGN(size); - data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); - if (data == NULL) - goto nodata; - - /* XXX: does not include slab overhead */ - skb->truesize = size + sizeof(struct sk_buff); - - /* Load the data pointers. */ - skb->head = data; - skb->data = data; - skb->tail = data; - skb->end = data + size; - - /* Set up other state */ - skb->len = 0; - skb->cloned = 0; - skb->data_len = 0; - - atomic_set(&skb->users, 1); - atomic_set(&(skb_shinfo(skb)->dataref), 1); - skb_shinfo(skb)->nr_frags = 0; - skb_shinfo(skb)->frag_list = NULL; - return skb; - -nodata: - skb_head_to_pool(skb); -nohead: - return NULL; -} - -/** - * alloc_skb_from_cache - allocate a network buffer - * @cp: kmem_cache from which to allocate the data area - * (object size must be big enough for @size bytes + skb overheads) - * @size: size to allocate - * @gfp_mask: allocation mask - * - * Allocate a new &sk_buff. 
The returned buffer has no headroom and a - * tail room of size bytes. The object has a reference count of one. - * The return is the buffer. On a failure the return is %NULL. - * - * Buffers may only be allocated from interrupts using a @gfp_mask of - * %GFP_ATOMIC. - */ - -struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp, - unsigned int size, int gfp_mask) -{ - struct sk_buff *skb; - u8 *data; - - if (in_interrupt() && (gfp_mask & __GFP_WAIT)) { - static int count = 0; - if (++count < 5) { - printk(KERN_ERR "alloc_skb called nonatomically " - "from interrupt %p\n", NET_CALLER(size)); - BUG(); - } - gfp_mask &= ~__GFP_WAIT; - } - - /* Get the HEAD */ - skb = skb_head_from_pool(); - if (skb == NULL) { - skb = kmem_cache_alloc(skbuff_head_cache, gfp_mask & ~__GFP_DMA); - if (skb == NULL) - goto nohead; - } - - /* Get the DATA. */ - size = SKB_DATA_ALIGN(size); - data = kmem_cache_alloc(cp, gfp_mask); - if (data == NULL) - goto nodata; - - /* XXX: does not include slab overhead */ - skb->truesize = size + sizeof(struct sk_buff); - - /* Load the data pointers. */ - skb->head = data; - skb->data = data; - skb->tail = data; - skb->end = data + size; - - /* Set up other state */ - skb->len = 0; - skb->cloned = 0; - skb->data_len = 0; - - atomic_set(&skb->users, 1); - atomic_set(&(skb_shinfo(skb)->dataref), 1); - skb_shinfo(skb)->nr_frags = 0; - skb_shinfo(skb)->frag_list = NULL; - return skb; - -nodata: - skb_head_to_pool(skb); -nohead: - return NULL; -} - - -/* - * Slab constructor for a skb head. - */ -static inline void skb_headerinit(void *p, kmem_cache_t *cache, - unsigned long flags) -{ - struct sk_buff *skb = p; - - skb->next = NULL; - skb->prev = NULL; - skb->list = NULL; - skb->sk = NULL; - skb->stamp.tv_sec=0; /* No idea about time */ - skb->dev = NULL; - skb->real_dev = NULL; - skb->dst = NULL; - memset(skb->cb, 0, sizeof(skb->cb)); - skb->pkt_type = PACKET_HOST; /* Default type */ - skb->ip_summed = 0; - skb->priority = 0; - skb->security = 0; /* By default packets are insecure */ - skb->destructor = NULL; - -#ifdef CONFIG_NETFILTER - skb->nfmark = skb->nfcache = 0; - skb->nfct = NULL; -#ifdef CONFIG_NETFILTER_DEBUG - skb->nf_debug = 0; -#endif -#endif -#ifdef CONFIG_NET_SCHED - skb->tc_index = 0; -#endif -} - -static void skb_drop_fraglist(struct sk_buff *skb) -{ - struct sk_buff *list = skb_shinfo(skb)->frag_list; - - skb_shinfo(skb)->frag_list = NULL; - - do { - struct sk_buff *this = list; - list = list->next; - kfree_skb(this); - } while (list); -} - -static void skb_clone_fraglist(struct sk_buff *skb) -{ - struct sk_buff *list; - - for (list = skb_shinfo(skb)->frag_list; list; list=list->next) - skb_get(list); -} - -static void skb_release_data(struct sk_buff *skb) -{ - if (!skb->cloned || - atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) { - if (skb_shinfo(skb)->nr_frags) { - int i; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - put_page(skb_shinfo(skb)->frags[i].page); - } - - if (skb_shinfo(skb)->frag_list) - skb_drop_fraglist(skb); - - kfree(skb->head); - } -} - -/* - * Free an skbuff by memory without cleaning the state. - */ -void kfree_skbmem(struct sk_buff *skb) -{ - skb_release_data(skb); - skb_head_to_pool(skb); -} - -/** - * __kfree_skb - private function - * @skb: buffer - * - * Free an sk_buff. Release anything attached to the buffer. - * Clean the state. This is an internal helper function. 
Users should - * always call kfree_skb - */ - -void __kfree_skb(struct sk_buff *skb) -{ - if (skb->list) { - printk(KERN_WARNING "Warning: kfree_skb passed an skb still " - "on a list (from %p).\n", NET_CALLER(skb)); - BUG(); - } - - dst_release(skb->dst); - if(skb->destructor) { - if (in_irq()) { - printk(KERN_WARNING "Warning: kfree_skb on hard IRQ %p\n", - NET_CALLER(skb)); - } - skb->destructor(skb); - } -#ifdef CONFIG_NETFILTER - nf_conntrack_put(skb->nfct); -#endif - skb_headerinit(skb, NULL, 0); /* clean state */ - kfree_skbmem(skb); -} - -/** - * skb_clone - duplicate an sk_buff - * @skb: buffer to clone - * @gfp_mask: allocation priority - * - * Duplicate an &sk_buff. The new one is not owned by a socket. Both - * copies share the same packet data but not structure. The new - * buffer has a reference count of 1. If the allocation fails the - * function returns %NULL otherwise the new buffer is returned. - * - * If this function is called from an interrupt gfp_mask() must be - * %GFP_ATOMIC. - */ - -struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask) -{ - struct sk_buff *n; - - n = skb_head_from_pool(); - if (!n) { - n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); - if (!n) - return NULL; - } - -#define C(x) n->x = skb->x - - n->next = n->prev = NULL; - n->list = NULL; - n->sk = NULL; - C(stamp); - C(dev); - C(real_dev); - C(h); - C(nh); - C(mac); - C(dst); - dst_clone(n->dst); - memcpy(n->cb, skb->cb, sizeof(skb->cb)); - C(len); - C(data_len); - C(csum); - n->cloned = 1; - C(pkt_type); - C(ip_summed); - C(priority); - atomic_set(&n->users, 1); - C(protocol); - C(security); - C(truesize); - C(head); - C(data); - C(tail); - C(end); - n->destructor = NULL; -#ifdef CONFIG_NETFILTER - C(nfmark); - C(nfcache); - C(nfct); -#ifdef CONFIG_NETFILTER_DEBUG - C(nf_debug); -#endif -#endif /*CONFIG_NETFILTER*/ -#if defined(CONFIG_HIPPI) - C(private); -#endif -#ifdef CONFIG_NET_SCHED - C(tc_index); -#endif - - atomic_inc(&(skb_shinfo(skb)->dataref)); - skb->cloned = 1; -#ifdef CONFIG_NETFILTER - nf_conntrack_get(skb->nfct); -#endif - return n; -} - -static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) -{ - /* - * Shift between the two data areas in bytes - */ - unsigned long offset = new->data - old->data; - - new->list=NULL; - new->sk=NULL; - new->dev=old->dev; - new->real_dev=old->real_dev; - new->priority=old->priority; - new->protocol=old->protocol; - new->dst=dst_clone(old->dst); - new->h.raw=old->h.raw+offset; - new->nh.raw=old->nh.raw+offset; - new->mac.raw=old->mac.raw+offset; - memcpy(new->cb, old->cb, sizeof(old->cb)); - atomic_set(&new->users, 1); - new->pkt_type=old->pkt_type; - new->stamp=old->stamp; - new->destructor = NULL; - new->security=old->security; -#ifdef CONFIG_NETFILTER - new->nfmark=old->nfmark; - new->nfcache=old->nfcache; - new->nfct=old->nfct; - nf_conntrack_get(new->nfct); -#ifdef CONFIG_NETFILTER_DEBUG - new->nf_debug=old->nf_debug; -#endif -#endif -#ifdef CONFIG_NET_SCHED - new->tc_index = old->tc_index; -#endif -} - -/** - * skb_copy - create private copy of an sk_buff - * @skb: buffer to copy - * @gfp_mask: allocation priority - * - * Make a copy of both an &sk_buff and its data. This is used when the - * caller wishes to modify the data and needs a private copy of the - * data to alter. Returns %NULL on failure or the pointer to the buffer - * on success. The returned buffer has a reference count of 1. 
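- *
- * (Illustrative call, added for exposition and not part of the
- * original comment: a caller that must rewrite payload bytes
- * would typically do "nskb = skb_copy(skb, GFP_ATOMIC);", check
- * the result against %NULL, and then edit nskb freely.)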
- * - * As by-product this function converts non-linear &sk_buff to linear - * one, so that &sk_buff becomes completely private and caller is allowed - * to modify all the data of returned buffer. This means that this - * function is not recommended for use in circumstances when only - * header is going to be modified. Use pskb_copy() instead. - */ - -struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask) -{ - struct sk_buff *n; - int headerlen = skb->data-skb->head; - - /* - * Allocate the copy buffer - */ - n=alloc_skb(skb->end - skb->head + skb->data_len, gfp_mask); - if(n==NULL) - return NULL; - - /* Set the data pointer */ - skb_reserve(n,headerlen); - /* Set the tail pointer and length */ - skb_put(n,skb->len); - n->csum = skb->csum; - n->ip_summed = skb->ip_summed; - - if (skb_copy_bits(skb, -headerlen, n->head, headerlen+skb->len)) - BUG(); - - copy_skb_header(n, skb); - - return n; -} - -/* Keep head the same: replace data */ -int skb_linearize(struct sk_buff *skb, int gfp_mask) -{ - unsigned int size; - u8 *data; - long offset; - int headerlen = skb->data - skb->head; - int expand = (skb->tail+skb->data_len) - skb->end; - - if (skb_shared(skb)) - BUG(); - - if (expand <= 0) - expand = 0; - - size = (skb->end - skb->head + expand); - size = SKB_DATA_ALIGN(size); - data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); - if (data == NULL) - return -ENOMEM; - - /* Copy entire thing */ - if (skb_copy_bits(skb, -headerlen, data, headerlen+skb->len)) - BUG(); - - /* Offset between the two in bytes */ - offset = data - skb->head; - - /* Free old data. */ - skb_release_data(skb); - - skb->head = data; - skb->end = data + size; - - /* Set up new pointers */ - skb->h.raw += offset; - skb->nh.raw += offset; - skb->mac.raw += offset; - skb->tail += offset; - skb->data += offset; - - /* Set up shinfo */ - atomic_set(&(skb_shinfo(skb)->dataref), 1); - skb_shinfo(skb)->nr_frags = 0; - skb_shinfo(skb)->frag_list = NULL; - - /* We are no longer a clone, even if we were. */ - skb->cloned = 0; - - skb->tail += skb->data_len; - skb->data_len = 0; - return 0; -} - - -/** - * pskb_copy - create copy of an sk_buff with private head. - * @skb: buffer to copy - * @gfp_mask: allocation priority - * - * Make a copy of both an &sk_buff and part of its data, located - * in header. Fragmented data remain shared. This is used when - * the caller wishes to modify only header of &sk_buff and needs - * private copy of the header to alter. Returns %NULL on failure - * or the pointer to the buffer on success. - * The returned buffer has a reference count of 1. 
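- *
- * (Example, added for exposition and not present in the original
- * file: to rewrite only a header field without touching shared
- * payload, "n = pskb_copy(skb, GFP_ATOMIC); if (n) n->nh.iph->ttl--;"
- * (checksum update omitted); the page fragments stay shared, so
- * payload bytes must not be written through n.)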
- */ - -struct sk_buff *pskb_copy(struct sk_buff *skb, int gfp_mask) -{ - struct sk_buff *n; - - /* - * Allocate the copy buffer - */ - n=alloc_skb(skb->end - skb->head, gfp_mask); - if(n==NULL) - return NULL; - - /* Set the data pointer */ - skb_reserve(n,skb->data-skb->head); - /* Set the tail pointer and length */ - skb_put(n,skb_headlen(skb)); - /* Copy the bytes */ - memcpy(n->data, skb->data, n->len); - n->csum = skb->csum; - n->ip_summed = skb->ip_summed; - - n->data_len = skb->data_len; - n->len = skb->len; - - if (skb_shinfo(skb)->nr_frags) { - int i; - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; - get_page(skb_shinfo(n)->frags[i].page); - } - skb_shinfo(n)->nr_frags = i; - } - - if (skb_shinfo(skb)->frag_list) { - skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; - skb_clone_fraglist(n); - } - - copy_skb_header(n, skb); - - return n; -} - -/** - * pskb_expand_head - reallocate header of &sk_buff - * @skb: buffer to reallocate - * @nhead: room to add at head - * @ntail: room to add at tail - * @gfp_mask: allocation priority - * - * Expands (or creates identical copy, if &nhead and &ntail are zero) - * header of skb. &sk_buff itself is not changed. &sk_buff MUST have - * reference count of 1. Returns zero in the case of success or error, - * if expansion failed. In the last case, &sk_buff is not changed. - * - * All the pointers pointing into skb header may change and must be - * reloaded after call to this function. - */ - -int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, int gfp_mask) -{ - int i; - u8 *data; - int size = nhead + (skb->end - skb->head) + ntail; - long off; - - if (skb_shared(skb)) - BUG(); - - size = SKB_DATA_ALIGN(size); - - data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); - if (data == NULL) - goto nodata; - - /* Copy only real data... and, alas, header. This should be - * optimized for the cases when header is void. */ - memcpy(data+nhead, skb->head, skb->tail-skb->head); - memcpy(data+size, skb->end, sizeof(struct skb_shared_info)); - - for (i=0; i<skb_shinfo(skb)->nr_frags; i++) - get_page(skb_shinfo(skb)->frags[i].page); - - if (skb_shinfo(skb)->frag_list) - skb_clone_fraglist(skb); - - skb_release_data(skb); - - off = (data+nhead) - skb->head; - - skb->head = data; - skb->end = data+size; - - skb->data += off; - skb->tail += off; - skb->mac.raw += off; - skb->h.raw += off; - skb->nh.raw += off; - skb->cloned = 0; - atomic_set(&skb_shinfo(skb)->dataref, 1); - return 0; - -nodata: - return -ENOMEM; -} - -/* Make private copy of skb with writable head and some headroom */ - -struct sk_buff * -skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) -{ - struct sk_buff *skb2; - int delta = headroom - skb_headroom(skb); - - if (delta <= 0) - return pskb_copy(skb, GFP_ATOMIC); - - skb2 = skb_clone(skb, GFP_ATOMIC); - if (skb2 == NULL || - !pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) - return skb2; - - kfree_skb(skb2); - return NULL; -} - - -/** - * skb_copy_expand - copy and expand sk_buff - * @skb: buffer to copy - * @newheadroom: new free bytes at head - * @newtailroom: new free bytes at tail - * @gfp_mask: allocation priority - * - * Make a copy of both an &sk_buff and its data and while doing so - * allocate additional space. - * - * This is used when the caller wishes to modify the data and needs a - * private copy of the data to alter as well as more space for new fields. 
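- * (For instance -- an illustration added here, not original
- * text: a tunnel driver that must prepend its own header could
- * call skb_copy_expand(skb, sizeof(struct iphdr), 0, GFP_ATOMIC)
- * and then skb_push() the encapsulation header into the fresh
- * headroom.)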
- * Returns %NULL on failure or the pointer to the buffer - * on success. The returned buffer has a reference count of 1. - * - * You must pass %GFP_ATOMIC as the allocation priority if this function - * is called from an interrupt. - */ - - -struct sk_buff *skb_copy_expand(const struct sk_buff *skb, - int newheadroom, - int newtailroom, - int gfp_mask) -{ - struct sk_buff *n; - - /* - * Allocate the copy buffer - */ - - n=alloc_skb(newheadroom + skb->len + newtailroom, - gfp_mask); - if(n==NULL) - return NULL; - - skb_reserve(n,newheadroom); - - /* Set the tail pointer and length */ - skb_put(n,skb->len); - - /* Copy the data only. */ - if (skb_copy_bits(skb, 0, n->data, skb->len)) - BUG(); - - copy_skb_header(n, skb); - return n; -} - -/** - * skb_pad - zero pad the tail of an skb - * @skb: buffer to pad - * @pad: space to pad - * - * Ensure that a buffer is followed by a padding area that is zero - * filled. Used by network drivers which may DMA or transfer data - * beyond the buffer end onto the wire. - * - * May return NULL in out of memory cases. - */ - -struct sk_buff *skb_pad(struct sk_buff *skb, int pad) -{ - struct sk_buff *nskb; - - /* If the skbuff is non linear tailroom is always zero.. */ - if(skb_tailroom(skb) >= pad) - { - memset(skb->data+skb->len, 0, pad); - return skb; - } - - nskb = skb_copy_expand(skb, skb_headroom(skb), skb_tailroom(skb) + pad, GFP_ATOMIC); - kfree_skb(skb); - if(nskb) - memset(nskb->data+nskb->len, 0, pad); - return nskb; -} - -/* Trims skb to length len. It can change skb pointers, if "realloc" is 1. - * If realloc==0 and trimming is impossible without change of data, - * it is BUG(). - */ - -int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc) -{ - int offset = skb_headlen(skb); - int nfrags = skb_shinfo(skb)->nr_frags; - int i; - - for (i=0; i<nfrags; i++) { - int end = offset + skb_shinfo(skb)->frags[i].size; - if (end > len) { - if (skb_cloned(skb)) { - if (!realloc) - BUG(); - if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) - return -ENOMEM; - } - if (len <= offset) { - put_page(skb_shinfo(skb)->frags[i].page); - skb_shinfo(skb)->nr_frags--; - } else { - skb_shinfo(skb)->frags[i].size = len-offset; - } - } - offset = end; - } - - if (offset < len) { - skb->data_len -= skb->len - len; - skb->len = len; - } else { - if (len <= skb_headlen(skb)) { - skb->len = len; - skb->data_len = 0; - skb->tail = skb->data + len; - if (skb_shinfo(skb)->frag_list && !skb_cloned(skb)) - skb_drop_fraglist(skb); - } else { - skb->data_len -= skb->len - len; - skb->len = len; - } - } - - return 0; -} - -/** - * __pskb_pull_tail - advance tail of skb header - * @skb: buffer to reallocate - * @delta: number of bytes to advance tail - * - * The function makes a sense only on a fragmented &sk_buff, - * it expands header moving its tail forward and copying necessary - * data from fragmented part. - * - * &sk_buff MUST have reference count of 1. - * - * Returns %NULL (and &sk_buff does not change) if pull failed - * or value of new tail of skb in the case of success. - * - * All the pointers pointing into skb header may change and must be - * reloaded after call to this function. - */ - -/* Moves tail of skb head forward, copying data from fragmented part, - * when it is necessary. - * 1. It may fail due to malloc failure. - * 2. It may change skb pointers. - * - * It is pretty complicated. Luckily, it is called only in exceptional cases. 
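- *
- * (Concrete illustration, added for exposition and not in the
- * original comment: pskb_may_pull(skb, n) lands here whenever
- * the first n protocol-header bytes are not yet linear; after a
- * successful pull those bytes can be read directly at skb->data.)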
- */ -unsigned char * __pskb_pull_tail(struct sk_buff *skb, int delta) -{ - int i, k, eat; - - /* If skb has not enough free space at tail, get new one - * plus 128 bytes for future expansions. If we have enough - * room at tail, reallocate without expansion only if skb is cloned. - */ - eat = (skb->tail+delta) - skb->end; - - if (eat > 0 || skb_cloned(skb)) { - if (pskb_expand_head(skb, 0, eat>0 ? eat+128 : 0, GFP_ATOMIC)) - return NULL; - } - - if (skb_copy_bits(skb, skb_headlen(skb), skb->tail, delta)) - BUG(); - - /* Optimization: no fragments, no reasons to preestimate - * size of pulled pages. Superb. - */ - if (skb_shinfo(skb)->frag_list == NULL) - goto pull_pages; - - /* Estimate size of pulled pages. */ - eat = delta; - for (i=0; i<skb_shinfo(skb)->nr_frags; i++) { - if (skb_shinfo(skb)->frags[i].size >= eat) - goto pull_pages; - eat -= skb_shinfo(skb)->frags[i].size; - } - - /* If we need update frag list, we are in troubles. - * Certainly, it possible to add an offset to skb data, - * but taking into account that pulling is expected to - * be very rare operation, it is worth to fight against - * further bloating skb head and crucify ourselves here instead. - * Pure masohism, indeed. 8)8) - */ - if (eat) { - struct sk_buff *list = skb_shinfo(skb)->frag_list; - struct sk_buff *clone = NULL; - struct sk_buff *insp = NULL; - - do { - if (list == NULL) - BUG(); - - if (list->len <= eat) { - /* Eaten as whole. */ - eat -= list->len; - list = list->next; - insp = list; - } else { - /* Eaten partially. */ - - if (skb_shared(list)) { - /* Sucks! We need to fork list. :-( */ - clone = skb_clone(list, GFP_ATOMIC); - if (clone == NULL) - return NULL; - insp = list->next; - list = clone; - } else { - /* This may be pulled without - * problems. */ - insp = list; - } - if (pskb_pull(list, eat) == NULL) { - if (clone) - kfree_skb(clone); - return NULL; - } - break; - } - } while (eat); - - /* Free pulled out fragments. */ - while ((list = skb_shinfo(skb)->frag_list) != insp) { - skb_shinfo(skb)->frag_list = list->next; - kfree_skb(list); - } - /* And insert new clone at head. */ - if (clone) { - clone->next = list; - skb_shinfo(skb)->frag_list = clone; - } - } - /* Success! Now we may commit changes to skb data. */ - -pull_pages: - eat = delta; - k = 0; - for (i=0; i<skb_shinfo(skb)->nr_frags; i++) { - if (skb_shinfo(skb)->frags[i].size <= eat) { - put_page(skb_shinfo(skb)->frags[i].page); - eat -= skb_shinfo(skb)->frags[i].size; - } else { - skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; - if (eat) { - skb_shinfo(skb)->frags[k].page_offset += eat; - skb_shinfo(skb)->frags[k].size -= eat; - eat = 0; - } - k++; - } - } - skb_shinfo(skb)->nr_frags = k; - - skb->tail += delta; - skb->data_len -= delta; - - return skb->tail; -} - -/* Copy some data bits from skb to kernel buffer. */ - -int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) -{ - int i, copy; - int start = skb->len - skb->data_len; - - if (offset > (int)skb->len-len) - goto fault; - - /* Copy header. 
*/ - if ((copy = start-offset) > 0) { - if (copy > len) - copy = len; - memcpy(to, skb->data + offset, copy); - if ((len -= copy) == 0) - return 0; - offset += copy; - to += copy; - } - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - - BUG_TRAP(start <= offset+len); - - end = start + skb_shinfo(skb)->frags[i].size; - if ((copy = end-offset) > 0) { - u8 *vaddr; - - if (copy > len) - copy = len; - - vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]); - memcpy(to, vaddr+skb_shinfo(skb)->frags[i].page_offset+ - offset-start, copy); - kunmap_skb_frag(vaddr); - - if ((len -= copy) == 0) - return 0; - offset += copy; - to += copy; - } - start = end; - } - - if (skb_shinfo(skb)->frag_list) { - struct sk_buff *list; - - for (list = skb_shinfo(skb)->frag_list; list; list=list->next) { - int end; - - BUG_TRAP(start <= offset+len); - - end = start + list->len; - if ((copy = end-offset) > 0) { - if (copy > len) - copy = len; - if (skb_copy_bits(list, offset-start, to, copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - to += copy; - } - start = end; - } - } - if (len == 0) - return 0; - -fault: - return -EFAULT; -} - -/* Checksum skb data. */ - -unsigned int skb_checksum(const struct sk_buff *skb, int offset, int len, unsigned int csum) -{ - int i, copy; - int start = skb->len - skb->data_len; - int pos = 0; - - /* Checksum header. */ - if ((copy = start-offset) > 0) { - if (copy > len) - copy = len; - csum = csum_partial(skb->data+offset, copy, csum); - if ((len -= copy) == 0) - return csum; - offset += copy; - pos = copy; - } - - for (i=0; i<skb_shinfo(skb)->nr_frags; i++) { - int end; - - BUG_TRAP(start <= offset+len); - - end = start + skb_shinfo(skb)->frags[i].size; - if ((copy = end-offset) > 0) { - unsigned int csum2; - u8 *vaddr; - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - if (copy > len) - copy = len; - vaddr = kmap_skb_frag(frag); - csum2 = csum_partial(vaddr + frag->page_offset + - offset-start, copy, 0); - kunmap_skb_frag(vaddr); - csum = csum_block_add(csum, csum2, pos); - if (!(len -= copy)) - return csum; - offset += copy; - pos += copy; - } - start = end; - } - - if (skb_shinfo(skb)->frag_list) { - struct sk_buff *list; - - for (list = skb_shinfo(skb)->frag_list; list; list=list->next) { - int end; - - BUG_TRAP(start <= offset+len); - - end = start + list->len; - if ((copy = end-offset) > 0) { - unsigned int csum2; - if (copy > len) - copy = len; - csum2 = skb_checksum(list, offset-start, copy, 0); - csum = csum_block_add(csum, csum2, pos); - if ((len -= copy) == 0) - return csum; - offset += copy; - pos += copy; - } - start = end; - } - } - if (len == 0) - return csum; - - BUG(); - return csum; -} - -/* Both of above in one bottle. */ - -unsigned int skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to, int len, unsigned int csum) -{ - int i, copy; - int start = skb->len - skb->data_len; - int pos = 0; - - /* Copy header. 
*/ - if ((copy = start-offset) > 0) { - if (copy > len) - copy = len; - csum = csum_partial_copy_nocheck(skb->data+offset, to, copy, csum); - if ((len -= copy) == 0) - return csum; - offset += copy; - to += copy; - pos = copy; - } - - for (i=0; i<skb_shinfo(skb)->nr_frags; i++) { - int end; - - BUG_TRAP(start <= offset+len); - - end = start + skb_shinfo(skb)->frags[i].size; - if ((copy = end-offset) > 0) { - unsigned int csum2; - u8 *vaddr; - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - if (copy > len) - copy = len; - vaddr = kmap_skb_frag(frag); - csum2 = csum_partial_copy_nocheck(vaddr + frag->page_offset + - offset-start, to, copy, 0); - kunmap_skb_frag(vaddr); - csum = csum_block_add(csum, csum2, pos); - if (!(len -= copy)) - return csum; - offset += copy; - to += copy; - pos += copy; - } - start = end; - } - - if (skb_shinfo(skb)->frag_list) { - struct sk_buff *list; - - for (list = skb_shinfo(skb)->frag_list; list; list=list->next) { - unsigned int csum2; - int end; - - BUG_TRAP(start <= offset+len); - - end = start + list->len; - if ((copy = end-offset) > 0) { - if (copy > len) - copy = len; - csum2 = skb_copy_and_csum_bits(list, offset-start, to, copy, 0); - csum = csum_block_add(csum, csum2, pos); - if ((len -= copy) == 0) - return csum; - offset += copy; - to += copy; - pos += copy; - } - start = end; - } - } - if (len == 0) - return csum; - - BUG(); - return csum; -} - -void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) -{ - unsigned int csum; - long csstart; - - if (skb->ip_summed == CHECKSUM_HW) - csstart = skb->h.raw - skb->data; - else - csstart = skb->len - skb->data_len; - - if (csstart > skb->len - skb->data_len) - BUG(); - - memcpy(to, skb->data, csstart); - - csum = 0; - if (csstart != skb->len) - csum = skb_copy_and_csum_bits(skb, csstart, to+csstart, - skb->len-csstart, 0); - - if (skb->ip_summed == CHECKSUM_HW) { - long csstuff = csstart + skb->csum; - - *((unsigned short *)(to + csstuff)) = csum_fold(csum); - } -} - -#if 0 -/* - * Tune the memory allocator for a new MTU size. - */ -void skb_add_mtu(int mtu) -{ - /* Must match allocation in alloc_skb */ - mtu = SKB_DATA_ALIGN(mtu) + sizeof(struct skb_shared_info); - - kmem_add_cache_size(mtu); -} -#endif - -void __init skb_init(void) -{ - int i; - - skbuff_head_cache = kmem_cache_create("skbuff_head_cache", - sizeof(struct sk_buff), - 0, - SLAB_HWCACHE_ALIGN, - skb_headerinit, NULL); - if (!skbuff_head_cache) - panic("cannot create skbuff cache"); - - for (i=0; i<NR_CPUS; i++) - skb_queue_head_init(&skb_head_pool[i].list); -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/Kconfig --- a/linux-2.6.11-xen-sparse/arch/xen/Kconfig Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,196 +0,0 @@ -# -# For a description of the syntax of this configuration file, -# see Documentation/kbuild/kconfig-language.txt. -# - -mainmenu "Linux Kernel Configuration" - -config XEN - bool - default y - help - This is the Linux Xen port. - -config ARCH_XEN - bool - default y - - -config NO_IDLE_HZ - bool - default y - - -menu "XEN" - -config XEN_PRIVILEGED_GUEST - bool "Privileged Guest (domain 0)" - default n - select XEN_PHYSDEV_ACCESS - help - Support for privileged operation (domain 0) - -config XEN_PHYSDEV_ACCESS - bool "Physical device access" - default XEN_PRIVILEGED_GUEST - help - Assume access is available to physical hardware devices - (e.g., hard drives, network cards). 
This allows you to configure
-      such devices and also includes some low-level support that is
-      otherwise not compiled into the kernel.
-
-config XEN_BLKDEV_BACKEND
-    bool "Block-device backend driver"
-    depends on XEN_PHYSDEV_ACCESS
-    default y
-    help
-      The block-device backend driver allows the kernel to export its
-      block devices to other guests via a high-performance shared-memory
-      interface.
-
-config XEN_BLKDEV_TAP_BE
-    bool "Block Tap support for backend driver (DANGEROUS)"
-    depends on XEN_BLKDEV_BACKEND
-    default n
-    help
-      If you intend to use the block tap driver, the backend domain will
-      not know the domain id of the real frontend, and so will not be able
-      to map its data pages. This modifies the backend to attempt to map
-      from both the tap domain and the real frontend. This presents a
-      security risk, and so should ONLY be used for development
-      with the blktap. This option will be removed as the block drivers are
-      modified to use grant tables.
-
-config XEN_BLKDEV_GRANT
-    bool "Grant table substrate for block drivers"
-    depends on !XEN_BLKDEV_TAP_BE
-    default y
-    help
-      This introduces the use of grant tables as a data exchange mechanism
-      between the frontend and backend block drivers. This currently
-      conflicts with the block tap.
-
-config XEN_NETDEV_BACKEND
-    bool "Network-device backend driver"
-    depends on XEN_PHYSDEV_ACCESS
-    default y
-    help
-      The network-device backend driver allows the kernel to export its
-      network devices to other guests via a high-performance shared-memory
-      interface.
-
-config XEN_BLKDEV_FRONTEND
-    bool "Block-device frontend driver"
-    default y
-    help
-      The block-device frontend driver allows the kernel to access block
-      devices mounted within another guest OS. Unless you are building a
-      dedicated device-driver domain or the master control domain
-      (domain 0), you almost certainly want to say Y here.
-
-config XEN_NETDEV_FRONTEND
-    bool "Network-device frontend driver"
-    default y
-    help
-      The network-device frontend driver allows the kernel to access
-      network interfaces within another guest OS. Unless you are building a
-      dedicated device-driver domain or the master control domain
-      (domain 0), you almost certainly want to say Y here.
-
-config XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER
-    bool "Pipelined transmitter (DANGEROUS)"
-    depends on XEN_NETDEV_FRONTEND
-    default n
-    help
-      The driver will assume that the backend is pipelining packets for
-      transmission: whenever packets are pending in the remote backend,
-      the driver will not send asynchronous notifications when it queues
-      additional packets for transmission.
-      If the backend is a dumb domain, such as a transparent Ethernet
-      bridge with no local IP interface, it is safe to say Y here to get
-      slightly lower network overhead.
-      If the backend has a local IP interface, if it may be doing smart
-      things such as reassembling packets for firewall filtering, if you
-      are unsure, or if you experience network hangs when this option is
-      enabled, you must say N here.
-
-config XEN_BLKDEV_TAP
-    bool "Block device tap driver"
-    default n
-    help
-      This driver allows a VM to interact on block device channels
-      to other VMs. Block messages may be passed through or redirected
-      to a character device, allowing device prototyping in application
-      space. Odds are that you want to say N here.
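
(The "high-performance shared-memory interface" that the backend and
frontend help texts above refer to is, at its core, a producer/consumer
request ring living in a page shared by the two domains. The following
minimal C sketch illustrates the idea only; the names and layout are
invented for this example and are deliberately much simpler than the
real Xen ring protocol:)

    #define RING_SIZE 32

    struct request { int id; };                /* stand-in payload */

    struct ring {
        unsigned int prod, cons;               /* free-running indices */
        struct request req[RING_SIZE];         /* lives in the shared page */
    };

    /* Producer side: returns 0 on success, -1 if the ring is full. */
    static int ring_put(struct ring *r, const struct request *rq)
    {
        if (r->prod - r->cons == RING_SIZE)
            return -1;
        r->req[r->prod % RING_SIZE] = *rq;
        __sync_synchronize();                  /* publish entry before index */
        r->prod++;
        return 0;
    }
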
- -config XEN_SHADOW_MODE - bool "Fake shadow mode" - default n - help - fakes out a shadow mode kernel - - -config XEN_SCRUB_PAGES - bool "Scrub memory before freeing it to Xen" - default y - help - Erase memory contents before freeing it back to Xen's global - pool. This ensures that any secrets contained within that - memory (e.g., private keys) cannot be found by other guests that - may be running on the machine. Most people will want to say Y here. - If security is not a concern then you may increase performance by - saying N. - -choice - prompt "Processor Type" - default XEN_X86 - -config XEN_X86 - bool "X86" - help - Choose this option if your computer is a X86 architecture. - -config XEN_X86_64 - bool "X86_64" - help - Choose this option if your computer is a X86_64 architecture. - -endchoice - -endmenu - -config HAVE_ARCH_DEV_ALLOC_SKB - bool - default y - -source "init/Kconfig" - -if XEN_X86 -source "arch/xen/i386/Kconfig" -endif - -if XEN_X86_64 -source "arch/xen/x86_64/Kconfig" -endif - -menu "Executable file formats" - -source "fs/Kconfig.binfmt" - -endmenu - -source "arch/xen/Kconfig.drivers" - -if XEN_PRIVILEGED_GUEST -menu "Power management options" -source "drivers/acpi/Kconfig" -endmenu -endif - -source "fs/Kconfig" - -source "security/Kconfig" - -source "crypto/Kconfig" - -source "lib/Kconfig" diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/Kconfig.drivers --- a/linux-2.6.11-xen-sparse/arch/xen/Kconfig.drivers Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,94 +0,0 @@ -# arch/xen/Kconfig.drivers - -menu "Device Drivers" - -source "drivers/base/Kconfig" - -if XEN_PHYSDEV_ACCESS -source "drivers/mtd/Kconfig" -source "drivers/parport/Kconfig" -source "drivers/pnp/Kconfig" -endif - -source "drivers/block/Kconfig" - -if XEN_PHYSDEV_ACCESS -source "drivers/ide/Kconfig" -endif - -source "drivers/scsi/Kconfig" - -if XEN_PHYSDEV_ACCESS -source "drivers/cdrom/Kconfig" -endif - -source "drivers/md/Kconfig" - -if XEN_PHYSDEV_ACCESS -source "drivers/message/fusion/Kconfig" -source "drivers/ieee1394/Kconfig" -source "drivers/message/i2o/Kconfig" -endif - -source "net/Kconfig" - -if XEN_PHYSDEV_ACCESS -source "drivers/isdn/Kconfig" -source "drivers/telephony/Kconfig" -source "drivers/input/Kconfig" -source "drivers/char/Kconfig" -source "drivers/i2c/Kconfig" -source "drivers/w1/Kconfig" -source "drivers/misc/Kconfig" -source "drivers/media/Kconfig" -source "drivers/video/Kconfig" -source "sound/Kconfig" -source "drivers/usb/Kconfig" -source "drivers/mmc/Kconfig" -source "drivers/infiniband/Kconfig" -endif - -if !XEN_PHYSDEV_ACCESS - -menu "Character devices" - -config UNIX98_PTYS - bool - default y - -config LEGACY_PTYS - bool "Legacy (BSD) PTY support" - default y - ---help--- - A pseudo terminal (PTY) is a software device consisting of two - halves: a master and a slave. The slave device behaves identical to - a physical terminal; the master device is used by a process to - read data from and write data to the slave, thereby emulating a - terminal. Typical programs for the master side are telnet servers - and xterms. - - Linux has traditionally used the BSD-like names /dev/ptyxx - for masters and /dev/ttyxx for slaves of pseudo - terminals. This scheme has a number of problems, including - security. This option enables these legacy devices; on most - systems, it is safe to say N. 
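
(The master/slave pairing described above can be seen from userspace with
a few lines of C against the Unix98 PTY interface -- an illustrative
sketch added here for exposition, not part of the deleted file:)

    #define _XOPEN_SOURCE 600
    #include <stdio.h>
    #include <stdlib.h>
    #include <fcntl.h>

    int main(void)
    {
        /* Open the master side; the kernel allocates a slave to match. */
        int master = posix_openpt(O_RDWR | O_NOCTTY);
        if (master < 0 || grantpt(master) < 0 || unlockpt(master) < 0)
            return 1;
        /* ptsname() reports the slave device, e.g. /dev/pts/0. */
        printf("slave side: %s\n", ptsname(master));
        return 0;
    }
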
-
-
-config LEGACY_PTY_COUNT
-    int "Maximum number of legacy PTYs in use"
-    depends on LEGACY_PTYS
-    range 1 256
-    default "256"
-    ---help---
-      The maximum number of legacy PTYs that can be used at any one time.
-      The default is 256, and should be more than enough. Embedded
-      systems may want to reduce this to save memory.
-
-      When not in use, each legacy PTY occupies 12 bytes on 32-bit
-      architectures and 24 bytes on 64-bit architectures.
-
-endmenu
-
-endif
-
-endmenu
diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/Makefile
--- a/linux-2.6.11-xen-sparse/arch/xen/Makefile Fri Jul 15 19:57:12 2005
+++ /dev/null Sat Jul 16 14:02:54 2005
@@ -1,91 +0,0 @@
-#
-# xen/Makefile
-#
-# This file is included by the global makefile so that you can add your own
-# architecture-specific flags and dependencies. Remember to have actions
-# for "archclean" cleaning up for this architecture.
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License. See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-# Copyright (C) 2004 by Christian Limpach
-#
-
-XENARCH := $(subst ",,$(CONFIG_XENARCH))
-
-# pick up headers from include/asm-xen/asm in preference over include/asm
-NOSTDINC_FLAGS = -nostdinc -iwithprefix include/asm-xen -Iinclude/asm-xen -iwithprefix include
-ifneq ($(KBUILD_SRC),)
-NOSTDINC_FLAGS += -I$(srctree)/include/asm-xen
-endif
-
-# make uname return the processor arch
-UTS_MACHINE := $(XENARCH)
-
-core-y += arch/xen/kernel/
-
-.PHONY: include2/asm
-include2/asm:
-ifneq ($(KBUILD_SRC),)
-	@echo '  SYMLINK ../include/asm-$(XENARCH) -> include2/asm'
-	$(Q)ln -fsn ../include/asm-$(XENARCH) include2/asm
-endif
-
-include/.asm-ignore: include/asm
-	@rm -f include/.asm-ignore
-	@mv include/asm include/.asm-ignore
-	@echo '  SYMLINK include/asm -> include/asm-$(XENARCH)'
-	$(Q)if [ !
-d include ]; then mkdir -p include; fi; - @ln -fsn $(srctree)/include/asm-$(XENARCH) include/asm - -include/asm-xen/asm: - @echo ' SYMLINK $@ -> include/asm-xen/asm-$(XENARCH)' - @mkdir -p include/asm-xen - @ln -fsn $(srctree)/include/asm-xen/asm-$(XENARCH) $@ - -arch/xen/arch: - @rm -f $@ - @mkdir -p arch/xen - @ln -fsn $(srctree)/arch/xen/$(XENARCH) $@ - -arch/$(XENARCH)/kernel/asm-offsets.s: include/asm include/linux/version.h \ - include/config/MARKER - -include/asm-$(ARCH)/asm_offsets.h: arch/$(XENARCH)/kernel/asm-offsets.s - $(call filechk,gen-asm-offsets) - -prepare: include/.asm-ignore include/asm-xen/asm \ - arch/xen/arch include/asm-$(ARCH)/asm_offsets.h include2/asm ; - -all: vmlinuz - -vmlinuz: vmlinux - $(Q)$(MAKE) $(build)=arch/xen/boot vmlinuz - -XINSTALL_NAME ?= $(KERNELRELEASE) -install: vmlinuz - mkdir -p $(INSTALL_PATH)/boot - ln -f -s vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) $(INSTALL_PATH)/boot/vmlinuz-$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(XENGUEST)$(INSTALL_SUFFIX) - rm -f $(INSTALL_PATH)/boot/vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) - install -m0644 vmlinuz $(INSTALL_PATH)/boot/vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) - install -m0644 vmlinux $(INSTALL_PATH)/boot/vmlinux-syms-$(XINSTALL_NAME)$(INSTALL_SUFFIX) - install -m0664 .config $(INSTALL_PATH)/boot/config-$(XINSTALL_NAME)$(INSTALL_SUFFIX) - install -m0664 System.map $(INSTALL_PATH)/boot/System.map-$(XINSTALL_NAME)$(INSTALL_SUFFIX) - ln -f -s vmlinuz-$(XINSTALL_NAME)$(INSTALL_SUFFIX) $(INSTALL_PATH)/boot/vmlinuz-$(VERSION).$(PATCHLEVEL)$(XENGUEST)$(INSTALL_SUFFIX) - mkdir -p $(INSTALL_PATH)/usr/include/xen/linux - install -m0644 $(srctree)/include/asm-xen/linux-public/*.h $(INSTALL_PATH)/usr/include/xen/linux - -archclean: - @if [ -e arch/xen/arch ]; then $(MAKE) $(clean)=arch/xen/arch; fi; - @rm -f arch/xen/arch include/.asm-ignore include/asm-xen/asm - @rm -f vmlinux-stripped vmlinuz - -define archhelp - echo '* vmlinuz - Compressed kernel image' - echo ' install - Install kernel image and config file' -endef - -ifneq ($(XENARCH),) -include $(srctree)/arch/xen/$(XENARCH)/Makefile -endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/boot/Makefile --- a/linux-2.6.11-xen-sparse/arch/xen/boot/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,8 +0,0 @@ - -OBJCOPYFLAGS := -g --strip-unneeded - -vmlinuz: vmlinux-stripped FORCE - $(call if_changed,gzip) - -vmlinux-stripped: vmlinux FORCE - $(call if_changed,objcopy) diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/configs/xen0_defconfig_x86_64 --- a/linux-2.6.11-xen-sparse/arch/xen/configs/xen0_defconfig_x86_64 Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,1023 +0,0 @@ -# -# Automatically generated make config: don't edit -# Linux kernel version: 2.6.11.1-xen0 -# Tue May 10 11:07:02 2005 -# -CONFIG_XEN=y -CONFIG_ARCH_XEN=y -CONFIG_NO_IDLE_HZ=y - -# -# XEN -# -CONFIG_XEN_PRIVILEGED_GUEST=y -CONFIG_XEN_PHYSDEV_ACCESS=y -CONFIG_XEN_BLKDEV_BACKEND=y -# CONFIG_XEN_BLKDEV_TAP_BE is not set -CONFIG_XEN_BLKDEV_GRANT=y -CONFIG_XEN_NETDEV_BACKEND=y -CONFIG_XEN_BLKDEV_FRONTEND=y -CONFIG_XEN_NETDEV_FRONTEND=y -# CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set -# CONFIG_XEN_BLKDEV_TAP is not set -# CONFIG_XEN_SHADOW_MODE is not set -CONFIG_XEN_SCRUB_PAGES=y -# CONFIG_XEN_X86 is not set -CONFIG_XEN_X86_64=y -CONFIG_HAVE_ARCH_DEV_ALLOC_SKB=y - -# -# Code maturity level options -# -CONFIG_EXPERIMENTAL=y -# CONFIG_CLEAN_COMPILE is not set -CONFIG_BROKEN=y 
-CONFIG_BROKEN_ON_SMP=y - -# -# General setup -# -CONFIG_LOCALVERSION="" -CONFIG_SWAP=y -CONFIG_SYSVIPC=y -# CONFIG_POSIX_MQUEUE is not set -# CONFIG_BSD_PROCESS_ACCT is not set -CONFIG_SYSCTL=y -# CONFIG_AUDIT is not set -CONFIG_LOG_BUF_SHIFT=14 -# CONFIG_HOTPLUG is not set -CONFIG_KOBJECT_UEVENT=y -# CONFIG_IKCONFIG is not set -# CONFIG_EMBEDDED is not set -CONFIG_KALLSYMS=y -# CONFIG_KALLSYMS_EXTRA_PASS is not set -CONFIG_FUTEX=y -CONFIG_EPOLL=y -# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -CONFIG_SHMEM=y -CONFIG_CC_ALIGN_FUNCTIONS=0 -CONFIG_CC_ALIGN_LABELS=0 -CONFIG_CC_ALIGN_LOOPS=0 -CONFIG_CC_ALIGN_JUMPS=0 -# CONFIG_TINY_SHMEM is not set - -# -# Loadable module support -# -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -# CONFIG_MODULE_FORCE_UNLOAD is not set -CONFIG_OBSOLETE_MODPARM=y -# CONFIG_MODVERSIONS is not set -# CONFIG_MODULE_SRCVERSION_ALL is not set -CONFIG_KMOD=y -CONFIG_XENARCH="x86_64" -CONFIG_X86=y -CONFIG_MMU=y -CONFIG_GENERIC_ISA_DMA=y -CONFIG_GENERIC_IOMAP=y -CONFIG_X86_CMPXCHG=y -CONFIG_X86_L1_CACHE_SHIFT=7 -CONFIG_RWSEM_GENERIC_SPINLOCK=y -CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_X86_GOOD_APIC=y -# CONFIG_HPET_TIMER is not set -# CONFIG_SMP is not set -# CONFIG_PREEMPT is not set -# CONFIG_MICROCODE is not set -# CONFIG_X86_CPUID is not set -# CONFIG_NUMA is not set -# CONFIG_MTRR is not set -CONFIG_X86_LOCAL_APIC=y -CONFIG_X86_IO_APIC=y -CONFIG_PCI=y -CONFIG_PCI_DIRECT=y -# CONFIG_PCI_MMCONFIG is not set -CONFIG_EARLY_PRINTK=y -CONFIG_GENERIC_HARDIRQS=y -CONFIG_GENERIC_IRQ_PROBE=y - -# -# X86_64 processor configuration -# -CONFIG_X86_64=y -CONFIG_64BIT=y - -# -# Processor type and features -# -# CONFIG_MPSC is not set -CONFIG_GENERIC_CPU=y -CONFIG_X86_L1_CACHE_BYTES=128 -# CONFIG_X86_TSC is not set -# CONFIG_X86_MSR is not set -# CONFIG_GART_IOMMU is not set -CONFIG_DUMMY_IOMMU=y -# CONFIG_X86_MCE is not set - -# -# Power management options -# -# CONFIG_PM is not set - -# -# CPU Frequency scaling -# -# CONFIG_CPU_FREQ is not set - -# -# Bus options (PCI etc.) 
-# -# CONFIG_UNORDERED_IO is not set - -# -# Executable file formats / Emulations -# -CONFIG_IA32_EMULATION=y -# CONFIG_IA32_AOUT is not set -# -# Executable file formats -# -CONFIG_BINFMT_ELF=y -CONFIG_BINFMT_MISC=y - -# -# Device Drivers -# - -# -# Generic Driver Options -# -CONFIG_STANDALONE=y -# CONFIG_PREVENT_FIRMWARE_BUILD is not set -# CONFIG_FW_LOADER is not set - -# -# Memory Technology Devices (MTD) -# -# CONFIG_MTD is not set - -# -# Parallel port support -# -# CONFIG_PARPORT is not set - -# -# Plug and Play support -# -# CONFIG_PNP is not set - -# -# Block devices -# -CONFIG_BLK_DEV_FD=y -# CONFIG_BLK_CPQ_DA is not set -CONFIG_BLK_CPQ_CISS_DA=y -# CONFIG_CISS_SCSI_TAPE is not set -# CONFIG_BLK_DEV_DAC960 is not set -# CONFIG_BLK_DEV_UMEM is not set -# CONFIG_BLK_DEV_COW_COMMON is not set -CONFIG_BLK_DEV_LOOP=y -# CONFIG_BLK_DEV_CRYPTOLOOP is not set -# CONFIG_BLK_DEV_NBD is not set -# CONFIG_BLK_DEV_SX8 is not set -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_COUNT=16 -CONFIG_BLK_DEV_RAM_SIZE=16384 -CONFIG_BLK_DEV_INITRD=y -CONFIG_INITRAMFS_SOURCE="" -# CONFIG_LBD is not set -# CONFIG_CDROM_PKTCDVD is not set - -# -# IO Schedulers -# -CONFIG_IOSCHED_NOOP=y -CONFIG_IOSCHED_AS=y -CONFIG_IOSCHED_DEADLINE=y -CONFIG_IOSCHED_CFQ=y -# CONFIG_ATA_OVER_ETH is not set - -# -# ATA/ATAPI/MFM/RLL support -# -CONFIG_IDE=y -CONFIG_BLK_DEV_IDE=y - -# -# Please see Documentation/ide.txt for help/info on IDE drives -# -# CONFIG_BLK_DEV_IDE_SATA is not set -# CONFIG_BLK_DEV_HD_IDE is not set -CONFIG_BLK_DEV_IDEDISK=y -# CONFIG_IDEDISK_MULTI_MODE is not set -CONFIG_BLK_DEV_IDECD=y -# CONFIG_BLK_DEV_IDETAPE is not set -# CONFIG_BLK_DEV_IDEFLOPPY is not set -# CONFIG_BLK_DEV_IDESCSI is not set -# CONFIG_IDE_TASK_IOCTL is not set - -# -# IDE chipset support/bugfixes -# -CONFIG_IDE_GENERIC=y -# CONFIG_BLK_DEV_CMD640 is not set -CONFIG_BLK_DEV_IDEPCI=y -# CONFIG_IDEPCI_SHARE_IRQ is not set -# CONFIG_BLK_DEV_OFFBOARD is not set -CONFIG_BLK_DEV_GENERIC=y -# CONFIG_BLK_DEV_OPTI621 is not set -# CONFIG_BLK_DEV_RZ1000 is not set -CONFIG_BLK_DEV_IDEDMA_PCI=y -# CONFIG_BLK_DEV_IDEDMA_FORCED is not set -CONFIG_IDEDMA_PCI_AUTO=y -# CONFIG_IDEDMA_ONLYDISK is not set -# CONFIG_BLK_DEV_AEC62XX is not set -# CONFIG_BLK_DEV_ALI15X3 is not set -# CONFIG_BLK_DEV_AMD74XX is not set -# CONFIG_BLK_DEV_ATIIXP is not set -# CONFIG_BLK_DEV_CMD64X is not set -# CONFIG_BLK_DEV_TRIFLEX is not set -# CONFIG_BLK_DEV_CY82C693 is not set -# CONFIG_BLK_DEV_CS5520 is not set -# CONFIG_BLK_DEV_CS5530 is not set -# CONFIG_BLK_DEV_HPT34X is not set -# CONFIG_BLK_DEV_HPT366 is not set -# CONFIG_BLK_DEV_SC1200 is not set -CONFIG_BLK_DEV_PIIX=y -# CONFIG_BLK_DEV_NS87415 is not set -# CONFIG_BLK_DEV_PDC202XX_OLD is not set -# CONFIG_BLK_DEV_PDC202XX_NEW is not set -CONFIG_BLK_DEV_SVWKS=y -# CONFIG_BLK_DEV_SIIMAGE is not set -# CONFIG_BLK_DEV_SIS5513 is not set -# CONFIG_BLK_DEV_SLC90E66 is not set -# CONFIG_BLK_DEV_TRM290 is not set -# CONFIG_BLK_DEV_VIA82CXXX is not set -# CONFIG_IDE_ARM is not set -CONFIG_BLK_DEV_IDEDMA=y -# CONFIG_IDEDMA_IVB is not set -CONFIG_IDEDMA_AUTO=y -# CONFIG_BLK_DEV_HD is not set - -# -# SCSI device support -# -CONFIG_SCSI=y -CONFIG_SCSI_PROC_FS=y - -# -# SCSI support type (disk, tape, CD-ROM) -# -CONFIG_BLK_DEV_SD=y -# CONFIG_CHR_DEV_ST is not set -# CONFIG_CHR_DEV_OSST is not set -# CONFIG_BLK_DEV_SR is not set -# CONFIG_CHR_DEV_SG is not set - -# -# Some SCSI devices (e.g. 
CD jukebox) support multiple LUNs -# -# CONFIG_SCSI_MULTI_LUN is not set -# CONFIG_SCSI_CONSTANTS is not set -# CONFIG_SCSI_LOGGING is not set - -# -# SCSI Transport Attributes -# -# CONFIG_SCSI_SPI_ATTRS is not set -# CONFIG_SCSI_FC_ATTRS is not set -# CONFIG_SCSI_ISCSI_ATTRS is not set - -# -# SCSI low-level drivers -# -CONFIG_BLK_DEV_3W_XXXX_RAID=y -# CONFIG_SCSI_3W_9XXX is not set -# CONFIG_SCSI_ACARD is not set -CONFIG_SCSI_AACRAID=y -CONFIG_SCSI_AIC7XXX=y -CONFIG_AIC7XXX_CMDS_PER_DEVICE=32 -CONFIG_AIC7XXX_RESET_DELAY_MS=15000 -# CONFIG_AIC7XXX_BUILD_FIRMWARE is not set -CONFIG_AIC7XXX_DEBUG_ENABLE=y -CONFIG_AIC7XXX_DEBUG_MASK=0 -CONFIG_AIC7XXX_REG_PRETTY_PRINT=y -# CONFIG_SCSI_AIC7XXX_OLD is not set -CONFIG_SCSI_AIC79XX=y -CONFIG_AIC79XX_CMDS_PER_DEVICE=32 -CONFIG_AIC79XX_RESET_DELAY_MS=15000 -# CONFIG_AIC79XX_BUILD_FIRMWARE is not set -# CONFIG_AIC79XX_ENABLE_RD_STRM is not set -CONFIG_AIC79XX_DEBUG_ENABLE=y -CONFIG_AIC79XX_DEBUG_MASK=0 -CONFIG_AIC79XX_REG_PRETTY_PRINT=y -# CONFIG_SCSI_ADVANSYS is not set -CONFIG_MEGARAID_NEWGEN=y -# CONFIG_MEGARAID_MM is not set -CONFIG_SCSI_SATA=y -# CONFIG_SCSI_SATA_AHCI is not set -# CONFIG_SCSI_SATA_SVW is not set -CONFIG_SCSI_ATA_PIIX=y -# CONFIG_SCSI_SATA_NV is not set -CONFIG_SCSI_SATA_PROMISE=y -# CONFIG_SCSI_SATA_QSTOR is not set -CONFIG_SCSI_SATA_SX4=y -CONFIG_SCSI_SATA_SIL=y -# CONFIG_SCSI_SATA_SIS is not set -# CONFIG_SCSI_SATA_ULI is not set -# CONFIG_SCSI_SATA_VIA is not set -# CONFIG_SCSI_SATA_VITESSE is not set -CONFIG_SCSI_BUSLOGIC=y -# CONFIG_SCSI_OMIT_FLASHPOINT is not set -# CONFIG_SCSI_CPQFCTS is not set -# CONFIG_SCSI_DMX3191D is not set -# CONFIG_SCSI_EATA is not set -# CONFIG_SCSI_EATA_PIO is not set -# CONFIG_SCSI_FUTURE_DOMAIN is not set -# CONFIG_SCSI_GDTH is not set -# CONFIG_SCSI_IPS is not set -# CONFIG_SCSI_INITIO is not set -# CONFIG_SCSI_INIA100 is not set -# CONFIG_SCSI_SYM53C8XX_2 is not set -# CONFIG_SCSI_IPR is not set -# CONFIG_SCSI_PCI2000 is not set -# CONFIG_SCSI_PCI2220I is not set -# CONFIG_SCSI_QLOGIC_ISP is not set -# CONFIG_SCSI_QLOGIC_FC is not set -# CONFIG_SCSI_QLOGIC_1280 is not set -CONFIG_SCSI_QLA2XXX=y -# CONFIG_SCSI_QLA21XX is not set -# CONFIG_SCSI_QLA22XX is not set -# CONFIG_SCSI_QLA2300 is not set -# CONFIG_SCSI_QLA2322 is not set -# CONFIG_SCSI_QLA6312 is not set -# CONFIG_SCSI_DC395x is not set -# CONFIG_SCSI_DC390T is not set -# CONFIG_SCSI_DEBUG is not set - -# -# Multi-device support (RAID and LVM) -# -# CONFIG_MD is not set - -# -# Fusion MPT device support -# -CONFIG_FUSION=y -CONFIG_FUSION_MAX_SGE=40 -# CONFIG_FUSION_CTL is not set - -# -# IEEE 1394 (FireWire) support -# -# CONFIG_IEEE1394 is not set - -# -# I2O device support -# -# CONFIG_I2O is not set - -# -# Networking support -# -CONFIG_NET=y - -# -# Networking options -# -CONFIG_PACKET=y -# CONFIG_PACKET_MMAP is not set -# CONFIG_NETLINK_DEV is not set -CONFIG_UNIX=y -# CONFIG_NET_KEY is not set -CONFIG_INET=y -# CONFIG_IP_MULTICAST is not set -# CONFIG_IP_ADVANCED_ROUTER is not set -CONFIG_IP_PNP=y -CONFIG_IP_PNP_DHCP=y -# CONFIG_IP_PNP_BOOTP is not set -# CONFIG_IP_PNP_RARP is not set -# CONFIG_NET_IPIP is not set -# CONFIG_NET_IPGRE is not set -# CONFIG_ARPD is not set -# CONFIG_SYN_COOKIES is not set -# CONFIG_INET_AH is not set -# CONFIG_INET_ESP is not set -# CONFIG_INET_IPCOMP is not set -# CONFIG_INET_TUNNEL is not set -CONFIG_IP_TCPDIAG=y -# CONFIG_IP_TCPDIAG_IPV6 is not set - -# -# IP: Virtual Server Configuration -# -# CONFIG_IP_VS is not set -# CONFIG_IPV6 is not set -CONFIG_NETFILTER=y -# CONFIG_NETFILTER_DEBUG is 
not set -CONFIG_BRIDGE_NETFILTER=y - -# -# IP: Netfilter Configuration -# -CONFIG_IP_NF_CONNTRACK=m -CONFIG_IP_NF_CT_ACCT=y -# CONFIG_IP_NF_CONNTRACK_MARK is not set -# CONFIG_IP_NF_CT_PROTO_SCTP is not set -CONFIG_IP_NF_FTP=m -# CONFIG_IP_NF_IRC is not set -# CONFIG_IP_NF_TFTP is not set -# CONFIG_IP_NF_AMANDA is not set -# CONFIG_IP_NF_QUEUE is not set -CONFIG_IP_NF_IPTABLES=m -# CONFIG_IP_NF_MATCH_LIMIT is not set -CONFIG_IP_NF_MATCH_IPRANGE=m -# CONFIG_IP_NF_MATCH_MAC is not set -# CONFIG_IP_NF_MATCH_PKTTYPE is not set -# CONFIG_IP_NF_MATCH_MARK is not set -# CONFIG_IP_NF_MATCH_MULTIPORT is not set -# CONFIG_IP_NF_MATCH_TOS is not set -# CONFIG_IP_NF_MATCH_RECENT is not set -# CONFIG_IP_NF_MATCH_ECN is not set -# CONFIG_IP_NF_MATCH_DSCP is not set -# CONFIG_IP_NF_MATCH_AH_ESP is not set -# CONFIG_IP_NF_MATCH_LENGTH is not set -# CONFIG_IP_NF_MATCH_TTL is not set -# CONFIG_IP_NF_MATCH_TCPMSS is not set -# CONFIG_IP_NF_MATCH_HELPER is not set -# CONFIG_IP_NF_MATCH_STATE is not set -# CONFIG_IP_NF_MATCH_CONNTRACK is not set -# CONFIG_IP_NF_MATCH_OWNER is not set -# CONFIG_IP_NF_MATCH_PHYSDEV is not set -# CONFIG_IP_NF_MATCH_ADDRTYPE is not set -# CONFIG_IP_NF_MATCH_REALM is not set -# CONFIG_IP_NF_MATCH_SCTP is not set -# CONFIG_IP_NF_MATCH_COMMENT is not set -# CONFIG_IP_NF_MATCH_HASHLIMIT is not set -CONFIG_IP_NF_FILTER=m -CONFIG_IP_NF_TARGET_REJECT=m -# CONFIG_IP_NF_TARGET_LOG is not set -# CONFIG_IP_NF_TARGET_ULOG is not set -# CONFIG_IP_NF_TARGET_TCPMSS is not set -CONFIG_IP_NF_NAT=m -CONFIG_IP_NF_NAT_NEEDED=y -CONFIG_IP_NF_TARGET_MASQUERADE=m -# CONFIG_IP_NF_TARGET_REDIRECT is not set -# CONFIG_IP_NF_TARGET_NETMAP is not set -# CONFIG_IP_NF_TARGET_SAME is not set -# CONFIG_IP_NF_NAT_SNMP_BASIC is not set -CONFIG_IP_NF_NAT_FTP=m -# CONFIG_IP_NF_MANGLE is not set -# CONFIG_IP_NF_RAW is not set -# CONFIG_IP_NF_ARPTABLES is not set - -# -# Bridge: Netfilter Configuration -# -# CONFIG_BRIDGE_NF_EBTABLES is not set - -# -# SCTP Configuration (EXPERIMENTAL) -# -# CONFIG_IP_SCTP is not set -# CONFIG_ATM is not set -CONFIG_BRIDGE=y -# CONFIG_VLAN_8021Q is not set -# CONFIG_DECNET is not set -# CONFIG_LLC2 is not set -# CONFIG_IPX is not set -# CONFIG_ATALK is not set -# CONFIG_X25 is not set -# CONFIG_LAPB is not set -# CONFIG_NET_DIVERT is not set -# CONFIG_ECONET is not set -# CONFIG_WAN_ROUTER is not set - -# -# QoS and/or fair queueing -# -# CONFIG_NET_SCHED is not set -# CONFIG_NET_CLS_ROUTE is not set - -# -# Network testing -# -# CONFIG_NET_PKTGEN is not set -# CONFIG_NETPOLL is not set -# CONFIG_NET_POLL_CONTROLLER is not set -# CONFIG_HAMRADIO is not set -# CONFIG_IRDA is not set -# CONFIG_BT is not set -CONFIG_NETDEVICES=y -# CONFIG_DUMMY is not set -# CONFIG_BONDING is not set -# CONFIG_EQUALIZER is not set -CONFIG_TUN=y - -# -# ARCnet devices -# -# CONFIG_ARCNET is not set - -# -# Ethernet (10 or 100Mbit) -# -CONFIG_NET_ETHERNET=y -CONFIG_MII=y -# CONFIG_HAPPYMEAL is not set -# CONFIG_SUNGEM is not set -CONFIG_NET_VENDOR_3COM=y -CONFIG_VORTEX=y -# CONFIG_TYPHOON is not set - -# -# Tulip family network device support -# -CONFIG_NET_TULIP=y -# CONFIG_DE2104X is not set -CONFIG_TULIP=y -# CONFIG_TULIP_MWI is not set -# CONFIG_TULIP_MMIO is not set -# CONFIG_TULIP_NAPI is not set -# CONFIG_DE4X5 is not set -# CONFIG_WINBOND_840 is not set -# CONFIG_DM9102 is not set -# CONFIG_HP100 is not set -CONFIG_NET_PCI=y -CONFIG_PCNET32=y -# CONFIG_AMD8111_ETH is not set -# CONFIG_ADAPTEC_STARFIRE is not set -# CONFIG_B44 is not set -# CONFIG_FORCEDETH is not set -# CONFIG_DGRS is not set -# 
CONFIG_EEPRO100 is not set -CONFIG_E100=y -# CONFIG_E100_NAPI is not set -# CONFIG_FEALNX is not set -# CONFIG_NATSEMI is not set -CONFIG_NE2K_PCI=y -# CONFIG_8139CP is not set -CONFIG_8139TOO=y -CONFIG_8139TOO_PIO=y -# CONFIG_8139TOO_TUNE_TWISTER is not set -# CONFIG_8139TOO_8129 is not set -# CONFIG_8139_OLD_RX_RESET is not set -# CONFIG_SIS900 is not set -# CONFIG_EPIC100 is not set -# CONFIG_SUNDANCE is not set -CONFIG_VIA_RHINE=y -# CONFIG_VIA_RHINE_MMIO is not set - -# -# Ethernet (1000 Mbit) -# -CONFIG_ACENIC=y -# CONFIG_ACENIC_OMIT_TIGON_I is not set -# CONFIG_DL2K is not set -CONFIG_E1000=y -# CONFIG_E1000_NAPI is not set -# CONFIG_NS83820 is not set -# CONFIG_HAMACHI is not set -# CONFIG_YELLOWFIN is not set -# CONFIG_R8169 is not set -# CONFIG_SK98LIN is not set -# CONFIG_VIA_VELOCITY is not set -CONFIG_TIGON3=y - -# -# Ethernet (10000 Mbit) -# -# CONFIG_IXGB is not set -# CONFIG_S2IO is not set - -# -# Token Ring devices -# -# CONFIG_TR is not set - -# -# Wireless LAN (non-hamradio) -# -# CONFIG_NET_RADIO is not set - -# -# Wan interfaces -# -# CONFIG_WAN is not set -# CONFIG_FDDI is not set -# CONFIG_HIPPI is not set -# CONFIG_PPP is not set -# CONFIG_SLIP is not set -# CONFIG_NET_FC is not set -# CONFIG_SHAPER is not set -# CONFIG_NETCONSOLE is not set - -# -# ISDN subsystem -# -# CONFIG_ISDN is not set - -# -# Telephony Support -# -# CONFIG_PHONE is not set - -# -# Input device support -# -CONFIG_INPUT=y - -# -# Userland interfaces -# -CONFIG_INPUT_MOUSEDEV=y -CONFIG_INPUT_MOUSEDEV_PSAUX=y -CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 -CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 -# CONFIG_INPUT_JOYDEV is not set -# CONFIG_INPUT_TSDEV is not set -# CONFIG_INPUT_EVDEV is not set -# CONFIG_INPUT_EVBUG is not set - -# -# Input I/O drivers -# -# CONFIG_GAMEPORT is not set -CONFIG_SOUND_GAMEPORT=y -CONFIG_SERIO=y -CONFIG_SERIO_I8042=y -CONFIG_SERIO_SERPORT=y -# CONFIG_SERIO_CT82C710 is not set -# CONFIG_SERIO_PCIPS2 is not set -CONFIG_SERIO_LIBPS2=y -# CONFIG_SERIO_RAW is not set - -# -# Input Device Drivers -# -CONFIG_INPUT_KEYBOARD=y -CONFIG_KEYBOARD_ATKBD=y -# CONFIG_KEYBOARD_SUNKBD is not set -# CONFIG_KEYBOARD_LKKBD is not set -# CONFIG_KEYBOARD_XTKBD is not set -# CONFIG_KEYBOARD_NEWTON is not set -CONFIG_INPUT_MOUSE=y -CONFIG_MOUSE_PS2=y -# CONFIG_MOUSE_SERIAL is not set -# CONFIG_MOUSE_VSXXXAA is not set -# CONFIG_INPUT_JOYSTICK is not set -# CONFIG_INPUT_TOUCHSCREEN is not set -# CONFIG_INPUT_MISC is not set - -# -# Character devices -# -CONFIG_VT=y -CONFIG_VT_CONSOLE=y -CONFIG_HW_CONSOLE=y -# CONFIG_SERIAL_NONSTANDARD is not set - -# -# Serial drivers -# -# CONFIG_SERIAL_8250 is not set - -# -# Non-8250 serial port support -# -CONFIG_UNIX98_PTYS=y -CONFIG_LEGACY_PTYS=y -CONFIG_LEGACY_PTY_COUNT=256 - -# -# IPMI -# -# CONFIG_IPMI_HANDLER is not set - -# -# Watchdog Cards -# -# CONFIG_WATCHDOG is not set -# CONFIG_HW_RANDOM is not set -# CONFIG_NVRAM is not set -CONFIG_RTC=y -# CONFIG_GEN_RTC is not set -# CONFIG_DTLK is not set -# CONFIG_R3964 is not set -# CONFIG_APPLICOM is not set - -# -# Ftape, the floppy tape device driver -# -# CONFIG_FTAPE is not set -CONFIG_AGP=m -CONFIG_AGP_AMD64=m -CONFIG_AGP_INTEL_MCH=m -CONFIG_DRM=m -CONFIG_DRM_TDFX=m -# CONFIG_DRM_GAMMA is not set -CONFIG_DRM_R128=m -CONFIG_DRM_RADEON=m -CONFIG_DRM_MGA=m -CONFIG_DRM_SIS=m -# CONFIG_MWAVE is not set -# CONFIG_RAW_DRIVER is not set -# CONFIG_HPET is not set -# CONFIG_HANGCHECK_TIMER is not set - -# -# I2C support -# -# CONFIG_I2C is not set - -# -# Dallas's 1-wire bus -# -# CONFIG_W1 is not set - -# -# Misc 
devices -# -# CONFIG_IBM_ASM is not set - -# -# Multimedia devices -# -# CONFIG_VIDEO_DEV is not set - -# -# Digital Video Broadcasting Devices -# -# CONFIG_DVB is not set - -# -# Graphics support -# -# CONFIG_FB is not set -# CONFIG_VIDEO_SELECT is not set - -# -# Console display driver support -# -CONFIG_VGA_CONSOLE=y -CONFIG_DUMMY_CONSOLE=y - -# -# Sound -# -# CONFIG_SOUND is not set - -# -# USB support -# -# CONFIG_USB is not set -CONFIG_USB_ARCH_HAS_HCD=y -CONFIG_USB_ARCH_HAS_OHCI=y - -# -# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' may also be needed; see USB_STORAGE Help for more information -# - -# -# USB Gadget Support -# -# CONFIG_USB_GADGET is not set - -# -# MMC/SD Card support -# -# CONFIG_MMC is not set - -# -# InfiniBand support -# -# CONFIG_INFINIBAND is not set - -# -# Power management options -# - -# -# ACPI (Advanced Configuration and Power Interface) Support -# -CONFIG_ACPI=y -CONFIG_ACPI_BOOT=y -CONFIG_ACPI_INTERPRETER=y -CONFIG_ACPI_AC=m -CONFIG_ACPI_BATTERY=m -CONFIG_ACPI_BUTTON=m -CONFIG_ACPI_VIDEO=m -CONFIG_ACPI_FAN=m -CONFIG_ACPI_PROCESSOR=m -CONFIG_ACPI_THERMAL=m -CONFIG_ACPI_ASUS=m -CONFIG_ACPI_IBM=m -CONFIG_ACPI_TOSHIBA=m -CONFIG_ACPI_BLACKLIST_YEAR=0 -# CONFIG_ACPI_DEBUG is not set -CONFIG_ACPI_BUS=y -CONFIG_ACPI_EC=y -CONFIG_ACPI_POWER=y -CONFIG_ACPI_PCI=y -CONFIG_ACPI_SYSTEM=y -# CONFIG_ACPI_CONTAINER is not set - -# -# File systems -# -CONFIG_EXT2_FS=y -# CONFIG_EXT2_FS_XATTR is not set -CONFIG_EXT3_FS=y -CONFIG_EXT3_FS_XATTR=y -# CONFIG_EXT3_FS_POSIX_ACL is not set -# CONFIG_EXT3_FS_SECURITY is not set -CONFIG_JBD=y -# CONFIG_JBD_DEBUG is not set -CONFIG_FS_MBCACHE=y -CONFIG_REISERFS_FS=y -# CONFIG_REISERFS_CHECK is not set -# CONFIG_REISERFS_PROC_INFO is not set -# CONFIG_REISERFS_FS_XATTR is not set -# CONFIG_JFS_FS is not set - -# -# XFS support -# -# CONFIG_XFS_FS is not set -# CONFIG_MINIX_FS is not set -# CONFIG_ROMFS_FS is not set -# CONFIG_QUOTA is not set -CONFIG_DNOTIFY=y -CONFIG_AUTOFS_FS=y -CONFIG_AUTOFS4_FS=y - -# -# CD-ROM/DVD Filesystems -# -CONFIG_ISO9660_FS=y -CONFIG_JOLIET=y -CONFIG_ZISOFS=y -CONFIG_ZISOFS_FS=y -# CONFIG_UDF_FS is not set - -# -# DOS/FAT/NT Filesystems -# -CONFIG_FAT_FS=y -CONFIG_MSDOS_FS=y -CONFIG_VFAT_FS=y -CONFIG_FAT_DEFAULT_CODEPAGE=437 -CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1" -# CONFIG_NTFS_FS is not set - -# -# Pseudo filesystems -# -CONFIG_PROC_FS=y -CONFIG_PROC_KCORE=y -CONFIG_SYSFS=y -# CONFIG_DEVFS_FS is not set -# CONFIG_DEVPTS_FS_XATTR is not set -CONFIG_TMPFS=y -# CONFIG_TMPFS_XATTR is not set -# CONFIG_HUGETLBFS is not set -# CONFIG_HUGETLB_PAGE is not set -CONFIG_RAMFS=y - -# -# Miscellaneous filesystems -# -# CONFIG_ADFS_FS is not set -# CONFIG_AFFS_FS is not set -# CONFIG_HFS_FS is not set -# CONFIG_HFSPLUS_FS is not set -# CONFIG_BEFS_FS is not set -# CONFIG_BFS_FS is not set -# CONFIG_EFS_FS is not set -# CONFIG_CRAMFS is not set -# CONFIG_VXFS_FS is not set -# CONFIG_HPFS_FS is not set -# CONFIG_QNX4FS_FS is not set -# CONFIG_SYSV_FS is not set -# CONFIG_UFS_FS is not set - -# -# Network File Systems -# -CONFIG_NFS_FS=y -CONFIG_NFS_V3=y -# CONFIG_NFS_V4 is not set -# CONFIG_NFS_DIRECTIO is not set -CONFIG_NFSD=m -CONFIG_NFSD_V3=y -# CONFIG_NFSD_V4 is not set -CONFIG_NFSD_TCP=y -CONFIG_ROOT_NFS=y -CONFIG_LOCKD=y -CONFIG_LOCKD_V4=y -CONFIG_EXPORTFS=m -CONFIG_SUNRPC=y -# CONFIG_RPCSEC_GSS_KRB5 is not set -# CONFIG_RPCSEC_GSS_SPKM3 is not set -# CONFIG_SMB_FS is not set -# CONFIG_CIFS is not set -# CONFIG_NCP_FS is not set -# CONFIG_CODA_FS is not set -# CONFIG_AFS_FS is not set - -# -# 
Partition Types -# -# CONFIG_PARTITION_ADVANCED is not set -CONFIG_MSDOS_PARTITION=y - -# -# Native Language Support -# -CONFIG_NLS=y -CONFIG_NLS_DEFAULT="iso8859-1" -CONFIG_NLS_CODEPAGE_437=y -# CONFIG_NLS_CODEPAGE_737 is not set -# CONFIG_NLS_CODEPAGE_775 is not set -# CONFIG_NLS_CODEPAGE_850 is not set -# CONFIG_NLS_CODEPAGE_852 is not set -# CONFIG_NLS_CODEPAGE_855 is not set -# CONFIG_NLS_CODEPAGE_857 is not set -# CONFIG_NLS_CODEPAGE_860 is not set -# CONFIG_NLS_CODEPAGE_861 is not set -# CONFIG_NLS_CODEPAGE_862 is not set -# CONFIG_NLS_CODEPAGE_863 is not set -# CONFIG_NLS_CODEPAGE_864 is not set -# CONFIG_NLS_CODEPAGE_865 is not set -# CONFIG_NLS_CODEPAGE_866 is not set -# CONFIG_NLS_CODEPAGE_869 is not set -# CONFIG_NLS_CODEPAGE_936 is not set -# CONFIG_NLS_CODEPAGE_950 is not set -# CONFIG_NLS_CODEPAGE_932 is not set -# CONFIG_NLS_CODEPAGE_949 is not set -# CONFIG_NLS_CODEPAGE_874 is not set -# CONFIG_NLS_ISO8859_8 is not set -# CONFIG_NLS_CODEPAGE_1250 is not set -# CONFIG_NLS_CODEPAGE_1251 is not set -# CONFIG_NLS_ASCII is not set -CONFIG_NLS_ISO8859_1=y -# CONFIG_NLS_ISO8859_2 is not set -# CONFIG_NLS_ISO8859_3 is not set -# CONFIG_NLS_ISO8859_4 is not set -# CONFIG_NLS_ISO8859_5 is not set -# CONFIG_NLS_ISO8859_6 is not set -# CONFIG_NLS_ISO8859_7 is not set -# CONFIG_NLS_ISO8859_9 is not set -# CONFIG_NLS_ISO8859_13 is not set -# CONFIG_NLS_ISO8859_14 is not set -# CONFIG_NLS_ISO8859_15 is not set -# CONFIG_NLS_KOI8_R is not set -# CONFIG_NLS_KOI8_U is not set -# CONFIG_NLS_UTF8 is not set - -# -# Security options -# -# CONFIG_KEYS is not set -# CONFIG_SECURITY is not set - -# -# Cryptographic options -# -CONFIG_CRYPTO=y -CONFIG_CRYPTO_HMAC=y -# CONFIG_CRYPTO_NULL is not set -# CONFIG_CRYPTO_MD4 is not set -CONFIG_CRYPTO_MD5=m -CONFIG_CRYPTO_SHA1=m -# CONFIG_CRYPTO_SHA256 is not set -# CONFIG_CRYPTO_SHA512 is not set -# CONFIG_CRYPTO_WP512 is not set -CONFIG_CRYPTO_DES=m -# CONFIG_CRYPTO_BLOWFISH is not set -# CONFIG_CRYPTO_TWOFISH is not set -# CONFIG_CRYPTO_SERPENT is not set -# CONFIG_CRYPTO_AES is not set -# CONFIG_CRYPTO_CAST5 is not set -# CONFIG_CRYPTO_CAST6 is not set -# CONFIG_CRYPTO_TEA is not set -# CONFIG_CRYPTO_ARC4 is not set -# CONFIG_CRYPTO_KHAZAD is not set -# CONFIG_CRYPTO_ANUBIS is not set -# CONFIG_CRYPTO_DEFLATE is not set -# CONFIG_CRYPTO_MICHAEL_MIC is not set -CONFIG_CRYPTO_CRC32C=m -# CONFIG_CRYPTO_TEST is not set - -# -# Hardware crypto devices -# - -# -# Library routines -# -# CONFIG_CRC_CCITT is not set -CONFIG_CRC32=y -CONFIG_LIBCRC32C=y -CONFIG_ZLIB_INFLATE=y diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/configs/xenU_defconfig_x86_64 --- a/linux-2.6.11-xen-sparse/arch/xen/configs/xenU_defconfig_x86_64 Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,897 +0,0 @@ -# -# Automatically generated make config: don't edit -# Linux kernel version: 2.6.11.10-xenU -# Mon May 23 15:07:58 2005 -# -CONFIG_XEN=y -CONFIG_ARCH_XEN=y -CONFIG_NO_IDLE_HZ=y - -# -# XEN -# -# CONFIG_XEN_PRIVILEGED_GUEST is not set -# CONFIG_XEN_PHYSDEV_ACCESS is not set -CONFIG_XEN_BLKDEV_GRANT=y -CONFIG_XEN_BLKDEV_FRONTEND=y -CONFIG_XEN_NETDEV_FRONTEND=y -# CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set -# CONFIG_XEN_BLKDEV_TAP is not set -# CONFIG_XEN_SHADOW_MODE is not set -CONFIG_XEN_SCRUB_PAGES=y -# CONFIG_XEN_X86 is not set -CONFIG_XEN_X86_64=y -CONFIG_HAVE_ARCH_DEV_ALLOC_SKB=y - -# -# Code maturity level options -# -CONFIG_EXPERIMENTAL=y -CONFIG_CLEAN_COMPILE=y -CONFIG_BROKEN_ON_SMP=y - -# -# General setup -# 
-CONFIG_LOCALVERSION="" -CONFIG_SWAP=y -CONFIG_SYSVIPC=y -CONFIG_POSIX_MQUEUE=y -CONFIG_BSD_PROCESS_ACCT=y -# CONFIG_BSD_PROCESS_ACCT_V3 is not set -CONFIG_SYSCTL=y -CONFIG_AUDIT=y -CONFIG_AUDITSYSCALL=y -CONFIG_LOG_BUF_SHIFT=14 -CONFIG_HOTPLUG=y -CONFIG_KOBJECT_UEVENT=y -# CONFIG_IKCONFIG is not set -# CONFIG_EMBEDDED is not set -CONFIG_KALLSYMS=y -CONFIG_KALLSYMS_EXTRA_PASS=y -CONFIG_FUTEX=y -CONFIG_EPOLL=y -# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -CONFIG_SHMEM=y -CONFIG_CC_ALIGN_FUNCTIONS=0 -CONFIG_CC_ALIGN_LABELS=0 -CONFIG_CC_ALIGN_LOOPS=0 -CONFIG_CC_ALIGN_JUMPS=0 -# CONFIG_TINY_SHMEM is not set - -# -# Loadable module support -# -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -# CONFIG_MODULE_FORCE_UNLOAD is not set -CONFIG_OBSOLETE_MODPARM=y -CONFIG_MODVERSIONS=y -# CONFIG_MODULE_SRCVERSION_ALL is not set -CONFIG_KMOD=y -CONFIG_XENARCH="x86_64" -CONFIG_X86=y -CONFIG_MMU=y -CONFIG_GENERIC_ISA_DMA=y -CONFIG_GENERIC_IOMAP=y -CONFIG_X86_CMPXCHG=y -CONFIG_X86_L1_CACHE_SHIFT=7 -CONFIG_RWSEM_GENERIC_SPINLOCK=y -CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_X86_GOOD_APIC=y -# CONFIG_HPET_TIMER is not set -# CONFIG_SMP is not set -# CONFIG_PREEMPT is not set -# CONFIG_MICROCODE is not set -CONFIG_X86_CPUID=y -# CONFIG_NUMA is not set -# CONFIG_MTRR is not set -# CONFIG_X86_LOCAL_APIC is not set -# CONFIG_X86_IO_APIC is not set -# CONFIG_PCI is not set -CONFIG_EARLY_PRINTK=y -CONFIG_GENERIC_HARDIRQS=y -CONFIG_GENERIC_IRQ_PROBE=y - -# -# X86_64 processor configuration -# -CONFIG_X86_64=y -CONFIG_64BIT=y - -# -# Processor type and features -# -CONFIG_MPSC=y -# CONFIG_GENERIC_CPU is not set -CONFIG_X86_L1_CACHE_BYTES=128 -# CONFIG_X86_TSC is not set -# CONFIG_X86_MSR is not set -CONFIG_DUMMY_IOMMU=y -# CONFIG_X86_MCE is not set - -# -# Power management options -# -# CONFIG_PM is not set - -# -# CPU Frequency scaling -# -# CONFIG_CPU_FREQ is not set - -# -# Bus options (PCI etc.) -# -# CONFIG_UNORDERED_IO is not set - -# -# Executable file formats / Emulations -# -CONFIG_IA32_EMULATION=y -# CONFIG_IA32_AOUT is not set -# -# Executable file formats -# -CONFIG_BINFMT_ELF=y -CONFIG_BINFMT_MISC=y - -# -# Device Drivers -# - -# -# Generic Driver Options -# -CONFIG_STANDALONE=y -CONFIG_PREVENT_FIRMWARE_BUILD=y -CONFIG_FW_LOADER=y - -# -# Block devices -# -CONFIG_BLK_DEV_FD=m -# CONFIG_BLK_DEV_COW_COMMON is not set -CONFIG_BLK_DEV_LOOP=m -CONFIG_BLK_DEV_CRYPTOLOOP=m -CONFIG_BLK_DEV_NBD=m -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_COUNT=16 -CONFIG_BLK_DEV_RAM_SIZE=16384 -CONFIG_BLK_DEV_INITRD=y -CONFIG_INITRAMFS_SOURCE="" -CONFIG_LBD=y -# CONFIG_CDROM_PKTCDVD is not set - -# -# IO Schedulers -# -CONFIG_IOSCHED_NOOP=y -CONFIG_IOSCHED_AS=y -CONFIG_IOSCHED_DEADLINE=y -CONFIG_IOSCHED_CFQ=y -# CONFIG_ATA_OVER_ETH is not set - -# -# SCSI device support -# -CONFIG_SCSI=m -CONFIG_SCSI_PROC_FS=y - -# -# SCSI support type (disk, tape, CD-ROM) -# -CONFIG_BLK_DEV_SD=m -CONFIG_CHR_DEV_ST=m -CONFIG_CHR_DEV_OSST=m -CONFIG_BLK_DEV_SR=m -CONFIG_BLK_DEV_SR_VENDOR=y -CONFIG_CHR_DEV_SG=m - -# -# Some SCSI devices (e.g. 
CD jukebox) support multiple LUNs -# -# CONFIG_SCSI_MULTI_LUN is not set -CONFIG_SCSI_CONSTANTS=y -CONFIG_SCSI_LOGGING=y - -# -# SCSI Transport Attributes -# -CONFIG_SCSI_SPI_ATTRS=m -CONFIG_SCSI_FC_ATTRS=m -# CONFIG_SCSI_ISCSI_ATTRS is not set - -# -# SCSI low-level drivers -# -CONFIG_SCSI_SATA=y -# CONFIG_SCSI_DEBUG is not set - -# -# Multi-device support (RAID and LVM) -# -CONFIG_MD=y -CONFIG_BLK_DEV_MD=y -CONFIG_MD_LINEAR=m -CONFIG_MD_RAID0=m -CONFIG_MD_RAID1=m -CONFIG_MD_RAID10=m -CONFIG_MD_RAID5=m -CONFIG_MD_RAID6=m -CONFIG_MD_MULTIPATH=m -# CONFIG_MD_FAULTY is not set -CONFIG_BLK_DEV_DM=m -CONFIG_DM_CRYPT=m -CONFIG_DM_SNAPSHOT=m -CONFIG_DM_MIRROR=m -CONFIG_DM_ZERO=m - -# -# Networking support -# -CONFIG_NET=y - -# -# Networking options -# -CONFIG_PACKET=y -CONFIG_PACKET_MMAP=y -CONFIG_NETLINK_DEV=y -CONFIG_UNIX=y -CONFIG_NET_KEY=m -CONFIG_INET=y -CONFIG_IP_MULTICAST=y -CONFIG_IP_ADVANCED_ROUTER=y -CONFIG_IP_MULTIPLE_TABLES=y -CONFIG_IP_ROUTE_FWMARK=y -CONFIG_IP_ROUTE_MULTIPATH=y -CONFIG_IP_ROUTE_VERBOSE=y -# CONFIG_IP_PNP is not set -CONFIG_NET_IPIP=m -CONFIG_NET_IPGRE=m -CONFIG_NET_IPGRE_BROADCAST=y -CONFIG_IP_MROUTE=y -CONFIG_IP_PIMSM_V1=y -CONFIG_IP_PIMSM_V2=y -# CONFIG_ARPD is not set -CONFIG_SYN_COOKIES=y -CONFIG_INET_AH=m -CONFIG_INET_ESP=m -CONFIG_INET_IPCOMP=m -CONFIG_INET_TUNNEL=m -CONFIG_IP_TCPDIAG=y -# CONFIG_IP_TCPDIAG_IPV6 is not set - -# -# IP: Virtual Server Configuration -# -CONFIG_IP_VS=m -# CONFIG_IP_VS_DEBUG is not set -CONFIG_IP_VS_TAB_BITS=12 - -# -# IPVS transport protocol load balancing support -# -CONFIG_IP_VS_PROTO_TCP=y -CONFIG_IP_VS_PROTO_UDP=y -CONFIG_IP_VS_PROTO_ESP=y -CONFIG_IP_VS_PROTO_AH=y - -# -# IPVS scheduler -# -CONFIG_IP_VS_RR=m -CONFIG_IP_VS_WRR=m -CONFIG_IP_VS_LC=m -CONFIG_IP_VS_WLC=m -CONFIG_IP_VS_LBLC=m -CONFIG_IP_VS_LBLCR=m -CONFIG_IP_VS_DH=m -CONFIG_IP_VS_SH=m -CONFIG_IP_VS_SED=m -CONFIG_IP_VS_NQ=m - -# -# IPVS application helper -# -CONFIG_IP_VS_FTP=m -CONFIG_IPV6=m -CONFIG_IPV6_PRIVACY=y -CONFIG_INET6_AH=m -CONFIG_INET6_ESP=m -CONFIG_INET6_IPCOMP=m -CONFIG_INET6_TUNNEL=m -CONFIG_IPV6_TUNNEL=m -CONFIG_NETFILTER=y -# CONFIG_NETFILTER_DEBUG is not set -CONFIG_BRIDGE_NETFILTER=y - -# -# IP: Netfilter Configuration -# -CONFIG_IP_NF_CONNTRACK=m -CONFIG_IP_NF_CT_ACCT=y -# CONFIG_IP_NF_CONNTRACK_MARK is not set -CONFIG_IP_NF_CT_PROTO_SCTP=m -CONFIG_IP_NF_FTP=m -CONFIG_IP_NF_IRC=m -CONFIG_IP_NF_TFTP=m -CONFIG_IP_NF_AMANDA=m -CONFIG_IP_NF_QUEUE=m -CONFIG_IP_NF_IPTABLES=m -CONFIG_IP_NF_MATCH_LIMIT=m -CONFIG_IP_NF_MATCH_IPRANGE=m -CONFIG_IP_NF_MATCH_MAC=m -CONFIG_IP_NF_MATCH_PKTTYPE=m -CONFIG_IP_NF_MATCH_MARK=m -CONFIG_IP_NF_MATCH_MULTIPORT=m -CONFIG_IP_NF_MATCH_TOS=m -CONFIG_IP_NF_MATCH_RECENT=m -CONFIG_IP_NF_MATCH_ECN=m -CONFIG_IP_NF_MATCH_DSCP=m -CONFIG_IP_NF_MATCH_AH_ESP=m -CONFIG_IP_NF_MATCH_LENGTH=m -CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_MATCH_TCPMSS=m -CONFIG_IP_NF_MATCH_HELPER=m -CONFIG_IP_NF_MATCH_STATE=m -CONFIG_IP_NF_MATCH_CONNTRACK=m -CONFIG_IP_NF_MATCH_OWNER=m -CONFIG_IP_NF_MATCH_PHYSDEV=m -CONFIG_IP_NF_MATCH_ADDRTYPE=m -CONFIG_IP_NF_MATCH_REALM=m -CONFIG_IP_NF_MATCH_SCTP=m -CONFIG_IP_NF_MATCH_COMMENT=m -# CONFIG_IP_NF_MATCH_HASHLIMIT is not set -CONFIG_IP_NF_FILTER=m -CONFIG_IP_NF_TARGET_REJECT=m -CONFIG_IP_NF_TARGET_LOG=m -CONFIG_IP_NF_TARGET_ULOG=m -CONFIG_IP_NF_TARGET_TCPMSS=m -CONFIG_IP_NF_NAT=m -CONFIG_IP_NF_NAT_NEEDED=y -CONFIG_IP_NF_TARGET_MASQUERADE=m -CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_TARGET_NETMAP=m -CONFIG_IP_NF_TARGET_SAME=m -CONFIG_IP_NF_NAT_SNMP_BASIC=m -CONFIG_IP_NF_NAT_IRC=m -CONFIG_IP_NF_NAT_FTP=m 
-CONFIG_IP_NF_NAT_TFTP=m -CONFIG_IP_NF_NAT_AMANDA=m -CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_TOS=m -CONFIG_IP_NF_TARGET_ECN=m -CONFIG_IP_NF_TARGET_DSCP=m -CONFIG_IP_NF_TARGET_MARK=m -CONFIG_IP_NF_TARGET_CLASSIFY=m -CONFIG_IP_NF_RAW=m -CONFIG_IP_NF_TARGET_NOTRACK=m -CONFIG_IP_NF_ARPTABLES=m -CONFIG_IP_NF_ARPFILTER=m -CONFIG_IP_NF_ARP_MANGLE=m - -# -# IPv6: Netfilter Configuration -# -# CONFIG_IP6_NF_QUEUE is not set -CONFIG_IP6_NF_IPTABLES=m -CONFIG_IP6_NF_MATCH_LIMIT=m -CONFIG_IP6_NF_MATCH_MAC=m -CONFIG_IP6_NF_MATCH_RT=m -CONFIG_IP6_NF_MATCH_OPTS=m -CONFIG_IP6_NF_MATCH_FRAG=m -CONFIG_IP6_NF_MATCH_HL=m -CONFIG_IP6_NF_MATCH_MULTIPORT=m -CONFIG_IP6_NF_MATCH_OWNER=m -CONFIG_IP6_NF_MATCH_MARK=m -CONFIG_IP6_NF_MATCH_IPV6HEADER=m -CONFIG_IP6_NF_MATCH_AHESP=m -CONFIG_IP6_NF_MATCH_LENGTH=m -CONFIG_IP6_NF_MATCH_EUI64=m -CONFIG_IP6_NF_MATCH_PHYSDEV=m -CONFIG_IP6_NF_FILTER=m -CONFIG_IP6_NF_TARGET_LOG=m -CONFIG_IP6_NF_MANGLE=m -CONFIG_IP6_NF_TARGET_MARK=m -CONFIG_IP6_NF_RAW=m - -# -# Bridge: Netfilter Configuration -# -CONFIG_BRIDGE_NF_EBTABLES=m -CONFIG_BRIDGE_EBT_BROUTE=m -CONFIG_BRIDGE_EBT_T_FILTER=m -CONFIG_BRIDGE_EBT_T_NAT=m -CONFIG_BRIDGE_EBT_802_3=m -CONFIG_BRIDGE_EBT_AMONG=m -CONFIG_BRIDGE_EBT_ARP=m -CONFIG_BRIDGE_EBT_IP=m -CONFIG_BRIDGE_EBT_LIMIT=m -CONFIG_BRIDGE_EBT_MARK=m -CONFIG_BRIDGE_EBT_PKTTYPE=m -CONFIG_BRIDGE_EBT_STP=m -CONFIG_BRIDGE_EBT_VLAN=m -CONFIG_BRIDGE_EBT_ARPREPLY=m -CONFIG_BRIDGE_EBT_DNAT=m -CONFIG_BRIDGE_EBT_MARK_T=m -CONFIG_BRIDGE_EBT_REDIRECT=m -CONFIG_BRIDGE_EBT_SNAT=m -CONFIG_BRIDGE_EBT_LOG=m -# CONFIG_BRIDGE_EBT_ULOG is not set -CONFIG_XFRM=y -CONFIG_XFRM_USER=y - -# -# SCTP Configuration (EXPERIMENTAL) -# -CONFIG_IP_SCTP=m -# CONFIG_SCTP_DBG_MSG is not set -# CONFIG_SCTP_DBG_OBJCNT is not set -# CONFIG_SCTP_HMAC_NONE is not set -# CONFIG_SCTP_HMAC_SHA1 is not set -CONFIG_SCTP_HMAC_MD5=y -CONFIG_ATM=m -CONFIG_ATM_CLIP=m -# CONFIG_ATM_CLIP_NO_ICMP is not set -CONFIG_ATM_LANE=m -# CONFIG_ATM_MPOA is not set -CONFIG_ATM_BR2684=m -# CONFIG_ATM_BR2684_IPFILTER is not set -CONFIG_BRIDGE=m -CONFIG_VLAN_8021Q=m -# CONFIG_DECNET is not set -CONFIG_LLC=m -# CONFIG_LLC2 is not set -CONFIG_IPX=m -# CONFIG_IPX_INTERN is not set -CONFIG_ATALK=m -CONFIG_DEV_APPLETALK=y -CONFIG_IPDDP=m -CONFIG_IPDDP_ENCAP=y -CONFIG_IPDDP_DECAP=y -# CONFIG_X25 is not set -# CONFIG_LAPB is not set -CONFIG_NET_DIVERT=y -# CONFIG_ECONET is not set -CONFIG_WAN_ROUTER=m - -# -# QoS and/or fair queueing -# -CONFIG_NET_SCHED=y -CONFIG_NET_SCH_CLK_JIFFIES=y -# CONFIG_NET_SCH_CLK_GETTIMEOFDAY is not set -# CONFIG_NET_SCH_CLK_CPU is not set -CONFIG_NET_SCH_CBQ=m -CONFIG_NET_SCH_HTB=m -CONFIG_NET_SCH_HFSC=m -CONFIG_NET_SCH_ATM=m -CONFIG_NET_SCH_PRIO=m -CONFIG_NET_SCH_RED=m -CONFIG_NET_SCH_SFQ=m -CONFIG_NET_SCH_TEQL=m -CONFIG_NET_SCH_TBF=m -CONFIG_NET_SCH_GRED=m -CONFIG_NET_SCH_DSMARK=m -CONFIG_NET_SCH_NETEM=m -CONFIG_NET_SCH_INGRESS=m -CONFIG_NET_QOS=y -CONFIG_NET_ESTIMATOR=y -CONFIG_NET_CLS=y -CONFIG_NET_CLS_TCINDEX=m -CONFIG_NET_CLS_ROUTE4=m -CONFIG_NET_CLS_ROUTE=y -CONFIG_NET_CLS_FW=m -CONFIG_NET_CLS_U32=m -CONFIG_CLS_U32_PERF=y -CONFIG_NET_CLS_IND=y -# CONFIG_CLS_U32_MARK is not set -CONFIG_NET_CLS_RSVP=m -CONFIG_NET_CLS_RSVP6=m -# CONFIG_NET_CLS_ACT is not set -CONFIG_NET_CLS_POLICE=y - -# -# Network testing -# -# CONFIG_NET_PKTGEN is not set -CONFIG_NETPOLL=y -# CONFIG_NETPOLL_RX is not set -CONFIG_NETPOLL_TRAP=y -CONFIG_NET_POLL_CONTROLLER=y -# CONFIG_HAMRADIO is not set -CONFIG_IRDA=m - -# -# IrDA protocols -# -CONFIG_IRLAN=m -CONFIG_IRNET=m -CONFIG_IRCOMM=m -# CONFIG_IRDA_ULTRA is not set - -# -# IrDA 
options -# -CONFIG_IRDA_CACHE_LAST_LSAP=y -CONFIG_IRDA_FAST_RR=y -# CONFIG_IRDA_DEBUG is not set - -# -# Infrared-port device drivers -# - -# -# SIR device drivers -# -CONFIG_IRTTY_SIR=m - -# -# Dongle support -# -CONFIG_DONGLE=y -CONFIG_ESI_DONGLE=m -CONFIG_ACTISYS_DONGLE=m -CONFIG_TEKRAM_DONGLE=m -CONFIG_LITELINK_DONGLE=m -CONFIG_MA600_DONGLE=m -CONFIG_GIRBIL_DONGLE=m -CONFIG_MCP2120_DONGLE=m -CONFIG_OLD_BELKIN_DONGLE=m -CONFIG_ACT200L_DONGLE=m - -# -# Old SIR device drivers -# -# CONFIG_IRPORT_SIR is not set - -# -# Old Serial dongle support -# - -# -# FIR device drivers -# -CONFIG_BT=m -CONFIG_BT_L2CAP=m -CONFIG_BT_SCO=m -CONFIG_BT_RFCOMM=m -CONFIG_BT_RFCOMM_TTY=y -CONFIG_BT_BNEP=m -CONFIG_BT_BNEP_MC_FILTER=y -CONFIG_BT_BNEP_PROTO_FILTER=y -CONFIG_BT_HIDP=m - -# -# Bluetooth device drivers -# -CONFIG_BT_HCIUART=m -CONFIG_BT_HCIUART_H4=y -CONFIG_BT_HCIUART_BCSP=y -CONFIG_BT_HCIUART_BCSP_TXCRC=y -CONFIG_BT_HCIVHCI=m -CONFIG_NETDEVICES=y -CONFIG_DUMMY=m -CONFIG_BONDING=m -CONFIG_EQUALIZER=m -CONFIG_TUN=m -CONFIG_ETHERTAP=m - -# -# Ethernet (10 or 100Mbit) -# -CONFIG_NET_ETHERNET=y -CONFIG_MII=m - -# -# Ethernet (1000 Mbit) -# - -# -# Ethernet (10000 Mbit) -# - -# -# Token Ring devices -# - -# -# Wireless LAN (non-hamradio) -# -CONFIG_NET_RADIO=y - -# -# Obsolete Wireless cards support (pre-802.11) -# -# CONFIG_STRIP is not set -CONFIG_ATMEL=m - -# -# Wan interfaces -# -# CONFIG_WAN is not set - -# -# ATM drivers -# -CONFIG_ATM_TCP=m -CONFIG_PPP=m -CONFIG_PPP_MULTILINK=y -CONFIG_PPP_FILTER=y -CONFIG_PPP_ASYNC=m -CONFIG_PPP_SYNC_TTY=m -CONFIG_PPP_DEFLATE=m -# CONFIG_PPP_BSDCOMP is not set -CONFIG_PPPOE=m -CONFIG_PPPOATM=m -# CONFIG_SLIP is not set -# CONFIG_SHAPER is not set -CONFIG_NETCONSOLE=m -CONFIG_INPUT=m -CONFIG_UNIX98_PTYS=y -# CONFIG_LEGACY_PTYS is not set - -# -# Character devices -# - -# -# File systems -# -CONFIG_EXT2_FS=y -CONFIG_EXT2_FS_XATTR=y -CONFIG_EXT2_FS_POSIX_ACL=y -CONFIG_EXT2_FS_SECURITY=y -CONFIG_EXT3_FS=m -CONFIG_EXT3_FS_XATTR=y -CONFIG_EXT3_FS_POSIX_ACL=y -CONFIG_EXT3_FS_SECURITY=y -CONFIG_JBD=m -# CONFIG_JBD_DEBUG is not set -CONFIG_FS_MBCACHE=y -CONFIG_REISERFS_FS=m -# CONFIG_REISERFS_CHECK is not set -CONFIG_REISERFS_PROC_INFO=y -CONFIG_REISERFS_FS_XATTR=y -CONFIG_REISERFS_FS_POSIX_ACL=y -CONFIG_REISERFS_FS_SECURITY=y -CONFIG_JFS_FS=m -CONFIG_JFS_POSIX_ACL=y -# CONFIG_JFS_SECURITY is not set -# CONFIG_JFS_DEBUG is not set -# CONFIG_JFS_STATISTICS is not set -CONFIG_FS_POSIX_ACL=y - -# -# XFS support -# -CONFIG_XFS_FS=m -CONFIG_XFS_EXPORT=y -# CONFIG_XFS_RT is not set -CONFIG_XFS_QUOTA=y -CONFIG_XFS_SECURITY=y -CONFIG_XFS_POSIX_ACL=y -CONFIG_MINIX_FS=m -CONFIG_ROMFS_FS=m -CONFIG_QUOTA=y -# CONFIG_QFMT_V1 is not set -CONFIG_QFMT_V2=y -CONFIG_QUOTACTL=y -CONFIG_DNOTIFY=y -CONFIG_AUTOFS_FS=m -CONFIG_AUTOFS4_FS=m - -# -# CD-ROM/DVD Filesystems -# -CONFIG_ISO9660_FS=y -CONFIG_JOLIET=y -CONFIG_ZISOFS=y -CONFIG_ZISOFS_FS=y -CONFIG_UDF_FS=m -CONFIG_UDF_NLS=y - -# -# DOS/FAT/NT Filesystems -# -CONFIG_FAT_FS=m -CONFIG_MSDOS_FS=m -CONFIG_VFAT_FS=m -CONFIG_FAT_DEFAULT_CODEPAGE=437 -CONFIG_FAT_DEFAULT_IOCHARSET="ascii" -# CONFIG_NTFS_FS is not set - -# -# Pseudo filesystems -# -CONFIG_PROC_FS=y -CONFIG_PROC_KCORE=y -CONFIG_SYSFS=y -# CONFIG_DEVFS_FS is not set -CONFIG_DEVPTS_FS_XATTR=y -CONFIG_DEVPTS_FS_SECURITY=y -CONFIG_TMPFS=y -CONFIG_TMPFS_XATTR=y -CONFIG_TMPFS_SECURITY=y -# CONFIG_HUGETLBFS is not set -# CONFIG_HUGETLB_PAGE is not set -CONFIG_RAMFS=y - -# -# Miscellaneous filesystems -# -# CONFIG_ADFS_FS is not set -CONFIG_AFFS_FS=m -CONFIG_HFS_FS=m -CONFIG_HFSPLUS_FS=m 
-CONFIG_BEFS_FS=m -# CONFIG_BEFS_DEBUG is not set -CONFIG_BFS_FS=m -CONFIG_EFS_FS=m -CONFIG_CRAMFS=m -CONFIG_VXFS_FS=m -# CONFIG_HPFS_FS is not set -CONFIG_QNX4FS_FS=m -CONFIG_SYSV_FS=m -CONFIG_UFS_FS=m -# CONFIG_UFS_FS_WRITE is not set - -# -# Network File Systems -# -CONFIG_NFS_FS=m -CONFIG_NFS_V3=y -CONFIG_NFS_V4=y -CONFIG_NFS_DIRECTIO=y -CONFIG_NFSD=m -CONFIG_NFSD_V3=y -CONFIG_NFSD_V4=y -CONFIG_NFSD_TCP=y -CONFIG_LOCKD=m -CONFIG_LOCKD_V4=y -CONFIG_EXPORTFS=m -CONFIG_SUNRPC=m -CONFIG_SUNRPC_GSS=m -CONFIG_RPCSEC_GSS_KRB5=m -CONFIG_RPCSEC_GSS_SPKM3=m -CONFIG_SMB_FS=m -# CONFIG_SMB_NLS_DEFAULT is not set -CONFIG_CIFS=m -# CONFIG_CIFS_STATS is not set -CONFIG_CIFS_XATTR=y -CONFIG_CIFS_POSIX=y -# CONFIG_CIFS_EXPERIMENTAL is not set -CONFIG_NCP_FS=m -CONFIG_NCPFS_PACKET_SIGNING=y -CONFIG_NCPFS_IOCTL_LOCKING=y -CONFIG_NCPFS_STRONG=y -CONFIG_NCPFS_NFS_NS=y -CONFIG_NCPFS_OS2_NS=y -CONFIG_NCPFS_SMALLDOS=y -CONFIG_NCPFS_NLS=y -CONFIG_NCPFS_EXTRAS=y -# CONFIG_CODA_FS is not set -# CONFIG_AFS_FS is not set - -# -# Partition Types -# -CONFIG_PARTITION_ADVANCED=y -# CONFIG_ACORN_PARTITION is not set -CONFIG_OSF_PARTITION=y -# CONFIG_AMIGA_PARTITION is not set -# CONFIG_ATARI_PARTITION is not set -CONFIG_MAC_PARTITION=y -CONFIG_MSDOS_PARTITION=y -CONFIG_BSD_DISKLABEL=y -CONFIG_MINIX_SUBPARTITION=y -CONFIG_SOLARIS_X86_PARTITION=y -CONFIG_UNIXWARE_DISKLABEL=y -# CONFIG_LDM_PARTITION is not set -CONFIG_SGI_PARTITION=y -# CONFIG_ULTRIX_PARTITION is not set -CONFIG_SUN_PARTITION=y -CONFIG_EFI_PARTITION=y - -# -# Native Language Support -# -CONFIG_NLS=y -CONFIG_NLS_DEFAULT="utf8" -CONFIG_NLS_CODEPAGE_437=y -CONFIG_NLS_CODEPAGE_737=m -CONFIG_NLS_CODEPAGE_775=m -CONFIG_NLS_CODEPAGE_850=m -CONFIG_NLS_CODEPAGE_852=m -CONFIG_NLS_CODEPAGE_855=m -CONFIG_NLS_CODEPAGE_857=m -CONFIG_NLS_CODEPAGE_860=m -CONFIG_NLS_CODEPAGE_861=m -CONFIG_NLS_CODEPAGE_862=m -CONFIG_NLS_CODEPAGE_863=m -CONFIG_NLS_CODEPAGE_864=m -CONFIG_NLS_CODEPAGE_865=m -CONFIG_NLS_CODEPAGE_866=m -CONFIG_NLS_CODEPAGE_869=m -CONFIG_NLS_CODEPAGE_936=m -CONFIG_NLS_CODEPAGE_950=m -CONFIG_NLS_CODEPAGE_932=m -CONFIG_NLS_CODEPAGE_949=m -CONFIG_NLS_CODEPAGE_874=m -CONFIG_NLS_ISO8859_8=m -CONFIG_NLS_CODEPAGE_1250=m -CONFIG_NLS_CODEPAGE_1251=m -CONFIG_NLS_ASCII=y -CONFIG_NLS_ISO8859_1=m -CONFIG_NLS_ISO8859_2=m -CONFIG_NLS_ISO8859_3=m -CONFIG_NLS_ISO8859_4=m -CONFIG_NLS_ISO8859_5=m -CONFIG_NLS_ISO8859_6=m -CONFIG_NLS_ISO8859_7=m -CONFIG_NLS_ISO8859_9=m -CONFIG_NLS_ISO8859_13=m -CONFIG_NLS_ISO8859_14=m -CONFIG_NLS_ISO8859_15=m -CONFIG_NLS_KOI8_R=m -CONFIG_NLS_KOI8_U=m -CONFIG_NLS_UTF8=m - -# -# Security options -# -# CONFIG_KEYS is not set -CONFIG_SECURITY=y -CONFIG_SECURITY_NETWORK=y -CONFIG_SECURITY_CAPABILITIES=y -# CONFIG_SECURITY_SECLVL is not set -CONFIG_SECURITY_SELINUX=y -CONFIG_SECURITY_SELINUX_BOOTPARAM=y -CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE=1 -CONFIG_SECURITY_SELINUX_DISABLE=y -CONFIG_SECURITY_SELINUX_DEVELOP=y -CONFIG_SECURITY_SELINUX_AVC_STATS=y -# CONFIG_SECURITY_SELINUX_MLS is not set - -# -# Cryptographic options -# -CONFIG_CRYPTO=y -CONFIG_CRYPTO_HMAC=y -CONFIG_CRYPTO_NULL=m -CONFIG_CRYPTO_MD4=m -CONFIG_CRYPTO_MD5=m -CONFIG_CRYPTO_SHA1=y -CONFIG_CRYPTO_SHA256=m -CONFIG_CRYPTO_SHA512=m -CONFIG_CRYPTO_WP512=m -CONFIG_CRYPTO_DES=m -CONFIG_CRYPTO_BLOWFISH=m -CONFIG_CRYPTO_TWOFISH=m -CONFIG_CRYPTO_SERPENT=m -# CONFIG_CRYPTO_AES is not set -CONFIG_CRYPTO_CAST5=m -CONFIG_CRYPTO_CAST6=m -CONFIG_CRYPTO_TEA=m -CONFIG_CRYPTO_ARC4=m -CONFIG_CRYPTO_KHAZAD=m -# CONFIG_CRYPTO_ANUBIS is not set -CONFIG_CRYPTO_DEFLATE=m -CONFIG_CRYPTO_MICHAEL_MIC=m 
-CONFIG_CRYPTO_CRC32C=m -# CONFIG_CRYPTO_TEST is not set - -# -# Hardware crypto devices -# - -# -# Library routines -# -CONFIG_CRC_CCITT=m -CONFIG_CRC32=y -CONFIG_LIBCRC32C=m -CONFIG_ZLIB_INFLATE=y -CONFIG_ZLIB_DEFLATE=m diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/Kconfig --- a/linux-2.6.11-xen-sparse/arch/xen/i386/Kconfig Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,921 +0,0 @@ -# -# For a description of the syntax of this configuration file, -# see Documentation/kbuild/kconfig-language.txt. -# - -menu "X86 Processor Configuration" - -config XENARCH - string - default i386 - -config X86 - bool - default y - help - This is Linux's home port. Linux was originally native to the Intel - 386, and runs on all the later x86 processors including the Intel - 486, 586, Pentiums, and various instruction-set-compatible chips by - AMD, Cyrix, and others. - -config MMU - bool - default y - -config SBUS - bool - -config UID16 - bool - default y - -config GENERIC_ISA_DMA - bool - default y - -config GENERIC_IOMAP - bool - default y - -choice - prompt "Processor family" - default M686 - -config M386 - bool "386" - ---help--- - This is the processor type of your CPU. This information is used for - optimizing purposes. In order to compile a kernel that can run on - all x86 CPU types (albeit not optimally fast), you can specify - "386" here. - - The kernel will not necessarily run on earlier architectures than - the one you have chosen, e.g. a Pentium optimized kernel will run on - a PPro, but not necessarily on an i486. - - Here are the settings recommended for greatest speed: - - "386" for the AMD/Cyrix/Intel 386DX/DXL/SL/SLC/SX, Cyrix/TI - 486DLC/DLC2, UMC 486SX-S and NexGen Nx586. Only "386" kernels - will run on a 386 class machine. - - "486" for the AMD/Cyrix/IBM/Intel 486DX/DX2/DX4 or - SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or U5S. - - "586" for generic Pentium CPUs lacking the TSC - (time stamp counter) register. - - "Pentium-Classic" for the Intel Pentium. - - "Pentium-MMX" for the Intel Pentium MMX. - - "Pentium-Pro" for the Intel Pentium Pro. - - "Pentium-II" for the Intel Pentium II or pre-Coppermine Celeron. - - "Pentium-III" for the Intel Pentium III or Coppermine Celeron. - - "Pentium-4" for the Intel Pentium 4 or P4-based Celeron. - - "K6" for the AMD K6, K6-II and K6-III (aka K6-3D). - - "Athlon" for the AMD K7 family (Athlon/Duron/Thunderbird). - - "Crusoe" for the Transmeta Crusoe series. - - "Efficeon" for the Transmeta Efficeon series. - - "Winchip-C6" for original IDT Winchip. - - "Winchip-2" for IDT Winchip 2. - - "Winchip-2A" for IDT Winchips with 3dNow! capabilities. - - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3. - - "VIA C3-2" for VIA C3-2 "Nehemiah" (model 9 and above). - - If you don't know what to do, choose "386". - -config M486 - bool "486" - help - Select this for a 486 series processor, either Intel or one of the - compatible processors from AMD, Cyrix, IBM, or Intel. Includes DX, - DX2, and DX4 variants; also SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or - U5S. - -config M586 - bool "586/K5/5x86/6x86/6x86MX" - help - Select this for a 586 or 686 series processor such as the AMD K5, - the Cyrix 5x86, 6x86 and 6x86MX. This choice does not - assume the RDTSC (Read Time Stamp Counter) instruction. - -config M586TSC - bool "Pentium-Classic" - help - Select this for a Pentium Classic processor with the RDTSC (Read - Time Stamp Counter) instruction for benchmarking.
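(A note on the RDTSC instruction that the "586" and "Pentium-Classic" entries above hinge on: it can be demonstrated from user space in a few lines of C. The sketch below is illustrative only -- it is not part of the deleted tree -- and it assumes gcc-style inline assembly on an x86 CPU that actually has a TSC; on a pre-Pentium part it dies with SIGILL, which is exactly why the coarser processor-family options avoid the instruction.)

    #include <stdint.h>
    #include <stdio.h>

    /* RDTSC returns the 64-bit time stamp counter in EDX:EAX. */
    static inline uint64_t rdtsc(void)
    {
            uint32_t lo, hi;
            __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
            return ((uint64_t)hi << 32) | lo;
    }

    int main(void)
    {
            uint64_t t0 = rdtsc();
            uint64_t t1 = rdtsc();
            printf("back-to-back rdtsc delta: %llu cycles\n",
                   (unsigned long long)(t1 - t0));
            return 0;
    }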
- -config M586MMX - bool "Pentium-MMX" - help - Select this for a Pentium with the MMX graphics/multimedia - extended instructions. - -config M686 - bool "Pentium-Pro" - help - Select this for Intel Pentium Pro chips. This enables the use of - Pentium Pro extended instructions, and disables the init-time guard - against the f00f bug found in earlier Pentiums. - -config MPENTIUMII - bool "Pentium-II/Celeron(pre-Coppermine)" - help - Select this for Intel chips based on the Pentium-II and - pre-Coppermine Celeron core. This option enables an unaligned - copy optimization, compiles the kernel with optimization flags - tailored for the chip, and applies any applicable Pentium Pro - optimizations. - -config MPENTIUMIII - bool "Pentium-III/Celeron(Coppermine)/Pentium-III Xeon" - help - Select this for Intel chips based on the Pentium-III and - Celeron-Coppermine core. This option enables use of some - extended prefetch instructions in addition to the Pentium II - extensions. - -config MPENTIUMM - bool "Pentium M" - help - Select this for Intel Pentium M (not Pentium-4 M) - notebook chips. - -config MPENTIUM4 - bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/Xeon" - help - Select this for Intel Pentium 4 chips. This includes the - Pentium 4, P4-based Celeron and Xeon, and Pentium-4 M - (not Pentium M) chips. This option enables compile flags - optimized for the chip, uses the correct cache shift, and - applies any applicable Pentium III optimizations. - -config MK6 - bool "K6/K6-II/K6-III" - help - Select this for an AMD K6-family processor. Enables use of - some extended instructions, and passes appropriate optimization - flags to GCC. - -config MK7 - bool "Athlon/Duron/K7" - help - Select this for an AMD Athlon K7-family processor. Enables use of - some extended instructions, and passes appropriate optimization - flags to GCC. - -config MK8 - bool "Opteron/Athlon64/Hammer/K8" - help - Select this for an AMD Opteron or Athlon64 Hammer-family processor. Enables - use of some extended instructions, and passes appropriate optimization - flags to GCC. - -config MCRUSOE - bool "Crusoe" - help - Select this for a Transmeta Crusoe processor. Treats the processor - like a 586 with TSC, and sets some GCC optimization flags (like a - Pentium Pro with no alignment requirements). - -config MEFFICEON - bool "Efficeon" - help - Select this for a Transmeta Efficeon processor. - -config MWINCHIPC6 - bool "Winchip-C6" - help - Select this for an IDT Winchip C6 chip. Linux and GCC - treat this chip as a 586TSC with some extended instructions - and alignment requirements. - -config MWINCHIP2 - bool "Winchip-2" - help - Select this for an IDT Winchip-2. Linux and GCC - treat this chip as a 586TSC with some extended instructions - and alignment requirements. - -config MWINCHIP3D - bool "Winchip-2A/Winchip-3" - help - Select this for an IDT Winchip-2A or 3. Linux and GCC - treat this chip as a 586TSC with some extended instructions - and alignment requirements. Also enable out of order memory - stores for this CPU, which can increase performance of some - operations. - -config MCYRIXIII - bool "CyrixIII/VIA-C3" - help - Select this for a Cyrix III or C3 chip. Presently Linux and GCC - treat this chip as a generic 586. Whilst the CPU is 686 class, - it lacks the cmov extension which gcc assumes is present when - generating 686 code. - Note that Nehemiah (Model 9) and above will not boot with this - kernel due to them lacking the 3DNow! instructions used in earlier - incarnations of the CPU.
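(The cmov caveat in the MCYRIXIII help text above is easy to verify at run time: CPUID leaf 1 reports CMOV in EDX bit 15. The sketch below is illustrative, not from the deleted tree; it assumes gcc on x86, built as a plain non-PIC executable so EBX is free for the cpuid clobber. A pre-Nehemiah VIA C3 reports family 6, i.e. "686 class", with this bit clear, so 686-optimized code containing cmov faults on it.)

    #include <stdio.h>

    static void cpuid(unsigned leaf,
                      unsigned *a, unsigned *b, unsigned *c, unsigned *d)
    {
            __asm__ __volatile__("cpuid"
                                 : "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d)
                                 : "a"(leaf));
    }

    int main(void)
    {
            unsigned a, b, c, d;
            cpuid(1, &a, &b, &c, &d);
            /* EDX bit 15: CMOV (plus FCMOV when an FPU is present). */
            printf("family %u, cmov %s\n", (a >> 8) & 0xf,
                   (d & (1u << 15)) ? "present" : "absent");
            return 0;
    }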
- -config MVIAC3_2 - bool "VIA C3-2 (Nehemiah)" - help - Select this for a VIA C3 "Nehemiah". Selecting this enables usage - of SSE and tells gcc to treat the CPU as a 686. - Note, this kernel will not boot on older (pre model 9) C3s. - -endchoice - -config X86_GENERIC - bool "Generic x86 support" - help - Instead of just including optimizations for the selected - x86 variant (e.g. PII, Crusoe or Athlon), include some more - generic optimizations as well. This will make the kernel - perform better on x86 CPUs other than that selected. - - This is really intended for distributors who need more - generic optimizations. - -# -# Define implied options from the CPU selection here -# -config X86_CMPXCHG - bool - depends on !M386 - default y - -config X86_XADD - bool - depends on !M386 - default y - -config X86_L1_CACHE_SHIFT - int - default "7" if MPENTIUM4 || X86_GENERIC - default "4" if X86_ELAN || M486 || M386 - default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 - default "6" if MK7 || MK8 || MPENTIUMM - -config RWSEM_GENERIC_SPINLOCK - bool - depends on M386 - default y - -config RWSEM_XCHGADD_ALGORITHM - bool - depends on !M386 - default y - -config GENERIC_CALIBRATE_DELAY - bool - default y - -config X86_PPRO_FENCE - bool - depends on M686 || M586MMX || M586TSC || M586 || M486 || M386 - default y - -config X86_F00F_BUG - bool - depends on M586MMX || M586TSC || M586 || M486 || M386 - default y - -config X86_WP_WORKS_OK - bool - depends on !M386 - default y - -config X86_INVLPG - bool - depends on !M386 - default y - -config X86_BSWAP - bool - depends on !M386 - default y - -config X86_POPAD_OK - bool - depends on !M386 - default y - -config X86_ALIGNMENT_16 - bool - depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 - default y - -config X86_GOOD_APIC - bool - depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON - default y - -config X86_INTEL_USERCOPY - bool - depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON - default y - -config X86_USE_PPRO_CHECKSUM - bool - depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON - default y - -config X86_USE_3DNOW - bool - depends on MCYRIXIII || MK7 - default y - -config X86_OOSTORE - bool - depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR - default y - -config HPET_TIMER - bool - default n -#config HPET_TIMER -# bool "HPET Timer Support" -# help -# This enables the use of the HPET for the kernel's internal timer. -# HPET is the next generation timer replacing legacy 8254s. -# You can safely choose Y here. However, HPET will only be -# activated if the platform and the BIOS support this feature. -# Otherwise the 8254 will be used for timing services. -# -# Choose N to continue using the legacy 8254 timer. - -config HPET_EMULATE_RTC - def_bool HPET_TIMER && RTC=y - -config SMP - bool "Symmetric multi-processing support" - ---help--- - This enables support for systems with more than one CPU. If you have - a system with only one CPU, like most personal computers, say N. If - you have a system with more than one CPU, say Y. 
- - If you say N here, the kernel will run on single and multiprocessor - machines, but will use only one CPU of a multiprocessor machine. If - you say Y here, the kernel will run on many, but not all, - singleprocessor machines. On a singleprocessor machine, the kernel - will run faster if you say N here. - - Note that if you say Y here and choose architecture "586" or - "Pentium" under "Processor family", the kernel will not work on 486 - architectures. Similarly, multiprocessor kernels for the "PPro" - architecture may not work on all Pentium based boards. - - People using multiprocessor machines who say Y here should also say - Y to "Enhanced Real Time Clock Support", below. The "Advanced Power - Management" code will be disabled if you say Y here. - - See also the <file:Documentation/smp.txt>, - <file:Documentation/i386/IO-APIC.txt>, - <file:Documentation/nmi_watchdog.txt> and the SMP-HOWTO available at - <http://www.tldp.org/docs.html#howto>. - - If you don't know what to do here, say N. - -config SMP_ALTERNATIVES - bool "SMP alternatives support (EXPERIMENTAL)" - depends on SMP && EXPERIMENTAL - help - Try to reduce the overhead of running an SMP kernel on a uniprocessor - host slightly by replacing certain key instruction sequences - according to whether we currently have more than one CPU available. - This should provide a noticeable boost to performance when - running SMP kernels on UP machines, and have negligible impact - when running on a true SMP host. - - If unsure, say N. - -config NR_CPUS - int "Maximum number of CPUs (2-255)" - range 2 255 - depends on SMP - default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000 - default "8" - help - This allows you to specify the maximum number of CPUs which this - kernel will support. The maximum supported value is 255 and the - minimum value which makes sense is 2. - - This is purely to save memory - each supported CPU adds - approximately eight kilobytes to the kernel image. - -config SCHED_SMT - bool "SMT (Hyperthreading) scheduler support" - depends on SMP - default n - help - SMT scheduler support improves the CPU scheduler's decision making - when dealing with Intel Pentium 4 chips with HyperThreading at a - cost of slightly increased overhead in some places. If unsure say - N here. - -config PREEMPT - bool "Preemptible Kernel" - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. - - Say Y here if you are building a kernel for a desktop, embedded - or real-time system. Say N if you are unsure. - -config PREEMPT_BKL - bool "Preempt The Big Kernel Lock" - depends on PREEMPT - default y - help - This option reduces the latency of the kernel by making the - big kernel lock preemptible. - - Say Y here if you are building a kernel for a desktop system. - Say N if you are unsure. - -#config X86_TSC -# bool -# depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2) && !X86_NUMAQ -# default y - -#config X86_MCE -# bool "Machine Check Exception" -# depends on !X86_VOYAGER -# ---help--- -# Machine Check Exception support allows the processor to notify the -# kernel if it detects a problem (e.g. overheating, component failure).
-# The action the kernel takes depends on the severity of the problem, -# ranging from a warning message on the console, to halting the machine. -# Your processor must be a Pentium or newer to support this - check the -# flags in /proc/cpuinfo for mce. Note that some older Pentium systems -# have a design flaw which leads to false MCE events - hence MCE is -# disabled on all P5 processors, unless explicitly enabled with "mce" -# as a boot argument. Similarly, if MCE is built in and creates a -# problem on some new non-standard machine, you can boot with "nomce" -# to disable it. MCE support simply ignores non-MCE processors like -# the 386 and 486, so nearly everyone can say Y here. - -#config X86_MCE_NONFATAL -# tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" -# depends on X86_MCE -# help -# Enabling this feature starts a timer that triggers every 5 seconds which -# will look at the machine check registers to see if anything happened. -# Non-fatal problems automatically get corrected (but still logged). -# Disable this if you don't want to see these messages. -# Seeing the messages this option prints out may be indicative of dying hardware, -# or out-of-spec (ie, overclocked) hardware. -# This option only does something on certain CPUs. -# (AMD Athlon/Duron and Intel Pentium 4) - -#config X86_MCE_P4THERMAL -# bool "check for P4 thermal throttling interrupt." -# depends on X86_MCE && (X86_UP_APIC || SMP) -# help -# Enabling this feature will cause a message to be printed when the P4 -# enters thermal throttling. - -config MICROCODE - tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support" - depends on XEN_PRIVILEGED_GUEST - ---help--- - If you say Y here and also to "/dev file system support" in the - 'File systems' section, you will be able to update the microcode on - Intel processors in the IA32 family, e.g. Pentium Pro, Pentium II, - Pentium III, Pentium 4, Xeon etc. You will obviously need the - actual microcode binary data itself which is not shipped with the - Linux kernel. - - For latest news and information on obtaining all the required - ingredients for this driver, check: - <http://www.urbanmyth.org/microcode/>. - - To compile this driver as a module, choose M here: the - module will be called microcode. - -#config X86_MSR -# tristate "/dev/cpu/*/msr - Model-specific register support" -# help -# This device gives privileged processes access to the x86 -# Model-Specific Registers (MSRs). It is a character device with -# major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr. -# MSR accesses are directed to a specific CPU on multi-processor -# systems. - -config X86_CPUID - tristate "/dev/cpu/*/cpuid - CPU information support" - help - This device gives processes access to the x86 CPUID instruction to - be executed on a specific processor. It is a character device - with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to - /dev/cpu/31/cpuid. - -source "drivers/firmware/Kconfig" - -choice - prompt "High Memory Support" - default NOHIGHMEM - -config NOHIGHMEM - bool "off" - ---help--- - Linux can use up to 64 Gigabytes of physical memory on x86 systems. - However, the address space of 32-bit x86 processors is only 4 - Gigabytes large. That means that, if you have a large amount of - physical memory, not all of it can be "permanently mapped" by the - kernel. The physical memory that's not permanently mapped is called - "high memory". 
- - If you are compiling a kernel which will never run on a machine with - more than 1 Gigabyte total physical RAM, answer "off" here (default - choice and suitable for most users). This will result in a "3GB/1GB" - split: 3GB are mapped so that each process sees a 3GB virtual memory - space and the remaining part of the 4GB virtual memory space is used - by the kernel to permanently map as much physical memory as - possible. - - If the machine has between 1 and 4 Gigabytes physical RAM, then - answer "4GB" here. - - If more than 4 Gigabytes is used then answer "64GB" here. This - selection turns Intel PAE (Physical Address Extension) mode on. - PAE implements 3-level paging on IA32 processors. PAE is fully - supported by Linux, PAE mode is implemented on all recent Intel - processors (Pentium Pro and better). NOTE: If you say "64GB" here, - then the kernel will not boot on CPUs that don't support PAE! - - The actual amount of total physical memory will either be - auto detected or can be forced by using a kernel command line option - such as "mem=256M". (Try "man bootparam" or see the documentation of - your boot loader (lilo or loadlin) about how to pass options to the - kernel at boot time.) - - If unsure, say "off". - -config HIGHMEM4G - bool "4GB" - help - Select this if you have a 32-bit processor and between 1 and 4 - gigabytes of physical RAM. - -#config HIGHMEM64G -# bool "64GB" -# help -# Select this if you have a 32-bit processor and more than 4 -# gigabytes of physical RAM. - -endchoice - -config HIGHMEM - bool - depends on HIGHMEM64G || HIGHMEM4G - default y - -config X86_PAE - bool - depends on HIGHMEM64G - default y - -# Common NUMA Features -config NUMA - bool "Numa Memory Allocation and Scheduler Support" - depends on SMP && HIGHMEM64G && (X86_NUMAQ || X86_GENERICARCH || (X86_SUMMIT && ACPI)) - default n if X86_PC - default y if (X86_NUMAQ || X86_SUMMIT) - -# Need comments to help the hapless user trying to turn on NUMA support -comment "NUMA (NUMA-Q) requires SMP, 64GB highmem support" - depends on X86_NUMAQ && (!HIGHMEM64G || !SMP) - -comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" - depends on X86_SUMMIT && (!HIGHMEM64G || !ACPI) - -config DISCONTIGMEM - bool - depends on NUMA - default y - -config HAVE_ARCH_BOOTMEM_NODE - bool - depends on NUMA - default y - -#config HIGHPTE -# bool "Allocate 3rd-level pagetables from highmem" -# depends on HIGHMEM4G || HIGHMEM64G -# help -# The VM uses one page table entry for each page of physical memory. -# For systems with a lot of RAM, this can be wasteful of precious -# low memory. Setting this option will put user-space page table -# entries in high memory. - -config MTRR - bool - depends on XEN_PRIVILEGED_GUEST - default y - -#config MTRR -# bool "MTRR (Memory Type Range Register) support" -# ---help--- -# On Intel P6 family processors (Pentium Pro, Pentium II and later) -# the Memory Type Range Registers (MTRRs) may be used to control -# processor access to memory ranges. This is most useful if you have -# a video (VGA) card on a PCI or AGP bus. Enabling write-combining -# allows bus write transfers to be combined into a larger transfer -# before bursting over the PCI/AGP bus. This can increase performance -# of image write operations 2.5 times or more. Saying Y here creates a -# /proc/mtrr file which may be used to manipulate your processor's -# MTRRs. Typically the X server should use this. 
-# -# This code has a reasonably generic interface so that similar -# control registers on other processors can be easily supported -# as well: -# -# The Cyrix 6x86, 6x86MX and M II processors have Address Range -# Registers (ARRs) which provide a similar functionality to MTRRs. For -# these, the ARRs are used to emulate the MTRRs. -# The AMD K6-2 (stepping 8 and above) and K6-3 processors have two -# MTRRs. The Centaur C6 (WinChip) has 8 MCRs, allowing -# write-combining. All of these processors are supported by this code -# and it makes sense to say Y here if you have one of them. -# -# Saying Y here also fixes a problem with buggy SMP BIOSes which only -# set the MTRRs for the boot CPU and not for the secondary CPUs. This -# can lead to all sorts of problems, so it's good to say Y here. -# -# You can safely say Y even if your machine doesn't have MTRRs, you'll -# just add about 9 KB to your kernel. -# -# See <file:Documentation/mtrr.txt> for more information. - -config IRQBALANCE - bool "Enable kernel irq balancing" - depends on SMP && X86_IO_APIC && !XEN - default y - help - The default yes will allow the kernel to do irq load balancing. - Saying no will keep the kernel from doing irq load balancing. - -config HAVE_DEC_LOCK - bool - depends on (SMP || PREEMPT) && X86_CMPXCHG - default y - -# turning this on wastes a bunch of space. -# Summit needs it only when NUMA is on -config BOOT_IOREMAP - bool - depends on (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI)) - default y - -config REGPARM - bool "Use register arguments (EXPERIMENTAL)" - depends on EXPERIMENTAL - default n - help - Compile the kernel with -mregparm=3. This uses a different ABI - and passes the first three arguments of a function call in registers. - This will probably break binary only modules. - - This feature is only enabled for gcc-3.0 and later - earlier compilers - generate incorrect output with certain kernel constructs when - -mregparm=3 is used. - -config X86_LOCAL_APIC - bool - depends on !SMP && X86_UP_APIC - default y - -config X86_IO_APIC - bool - depends on !SMP && X86_UP_IOAPIC - default y - -config HOTPLUG_CPU - bool "Support for hot-pluggable CPUs (EXPERIMENTAL)" - depends on SMP && HOTPLUG && EXPERIMENTAL - ---help--- - Say Y here to experiment with turning CPUs off and on. CPUs - can be controlled through /sys/devices/system/cpu. - - Say N. - - -if XEN_PHYSDEV_ACCESS - -menu "Bus options (PCI, PCMCIA, EISA, MCA, ISA)" - -config X86_VISWS_APIC - bool - depends on X86_VISWS - default y - -config X86_LOCAL_APIC - bool - depends on (X86_VISWS || SMP) && !X86_VOYAGER - default y - -config X86_UP_APIC - bool "Local APIC support on uniprocessors" if !SMP - depends on !(X86_VISWS || X86_VOYAGER) - ---help--- - A local APIC (Advanced Programmable Interrupt Controller) is an - integrated interrupt controller in the CPU. If you have a single-CPU - system which has a processor with a local APIC, you can say Y here to - enable and use it. If you say Y here even though your machine doesn't - have a local APIC, then the kernel will still run with no slowdown at - all. The local APIC supports CPU-generated self-interrupts (timer, - performance counters), and the NMI watchdog which detects hard - lockups. - - If you have a system with several CPUs, you do not need to say Y - here: the local APIC will be used automatically. 
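(Background on the REGPARM option above: -mregparm=3 switches the 32-bit kernel to a calling convention in which the first three integer arguments travel in EAX, EDX and ECX instead of on the stack -- hence the warning about binary-only modules built for the stack-based ABI. The per-function attribute equivalent makes the difference easy to see; this sketch is illustrative, not from the deleted tree, and assumes a 32-bit gcc build, e.g. gcc -m32 -O2.)

    #include <stdio.h>

    /* Same body, two ABIs: regparm(3) takes a, b, c in registers;
     * regparm(0) is the traditional all-on-the-stack i386 ABI.
     * noinline keeps the calls visible in the disassembly. */
    static int __attribute__((regparm(3), noinline)) add_reg(int a, int b, int c)
    {
            return a + b + c;
    }

    static int __attribute__((regparm(0), noinline)) add_stack(int a, int b, int c)
    {
            return a + b + c;
    }

    int main(void)
    {
            /* Identical at the C level; objdump -d shows the different
             * argument set-up at the two call sites. */
            printf("%d %d\n", add_reg(1, 2, 3), add_stack(1, 2, 3));
            return 0;
    }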
- -config X86_UP_IOAPIC - bool "IO-APIC support on uniprocessors" - depends on !SMP && X86_UP_APIC - help - An IO-APIC (I/O Advanced Programmable Interrupt Controller) is an - SMP-capable replacement for PC-style interrupt controllers. Most - SMP systems and a small number of uniprocessor systems have one. - If you have a single-CPU system with an IO-APIC, you can say Y here - to use it. If you say Y here even though your machine doesn't have - an IO-APIC, then the kernel will still run with no slowdown at all. - - If you have a system with several CPUs, you do not need to say Y - here: the IO-APIC will be used automatically. - -config X86_IO_APIC - bool - depends on SMP && !(X86_VISWS || X86_VOYAGER) - default y - -config PCI - bool "PCI support" if !X86_VISWS - depends on !X86_VOYAGER - default y if X86_VISWS - help - Find out whether you have a PCI motherboard. PCI is the name of a - bus system, i.e. the way the CPU talks to the other stuff inside - your box. Other bus systems are ISA, EISA, MicroChannel (MCA) or - VESA. If you have PCI, say Y, otherwise N. - - The PCI-HOWTO, available from - <http://www.tldp.org/docs.html#howto>, contains valuable - information about which PCI hardware does work under Linux and which - doesn't. - -choice - prompt "PCI access mode" - depends on PCI && !X86_VISWS - default PCI_GOANY - ---help--- - On PCI systems, the BIOS can be used to detect the PCI devices and - determine their configuration. However, some old PCI motherboards - have BIOS bugs and may crash if this is done. Also, some embedded - PCI-based systems don't have any BIOS at all. Linux can also try to - detect the PCI hardware directly without using the BIOS. - - With this option, you can specify how Linux should detect the - PCI devices. If you choose "BIOS", the BIOS will be used, - if you choose "Direct", the BIOS won't be used, and if you - choose "MMConfig", then PCI Express MMCONFIG will be used. - If you choose "Any", the kernel will try MMCONFIG, then the - direct access method and fall back to the BIOS if that doesn't - work. If unsure, go with the default, which is "Any". - -config PCI_GOBIOS - bool "BIOS" - -config PCI_GOMMCONFIG - bool "MMConfig" - -config PCI_GODIRECT - bool "Direct" - -config PCI_GOANY - bool "Any" - -endchoice - -config PCI_BIOS - bool - depends on !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY) - default y - -config PCI_DIRECT - bool - depends on PCI && ((PCI_GODIRECT || PCI_GOANY) || X86_VISWS) - default y - -config PCI_MMCONFIG - bool - depends on PCI && (PCI_GOMMCONFIG || (PCI_GOANY && ACPI)) - select ACPI_BOOT - default y - -source "drivers/pci/pcie/Kconfig" - -source "drivers/pci/Kconfig" - -config ISA - bool "ISA support" - depends on !(X86_VOYAGER || X86_VISWS) - help - Find out whether you have ISA slots on your motherboard. ISA is the - name of a bus system, i.e. the way the CPU talks to the other stuff - inside your box. Other bus systems are PCI, EISA, MicroChannel - (MCA) or VESA. ISA is an older system, now being displaced by PCI; - newer boards don't support it. If you have ISA, say Y, otherwise N. - -config EISA - bool "EISA support" - depends on ISA - ---help--- - The Extended Industry Standard Architecture (EISA) bus was - developed as an open alternative to the IBM MicroChannel bus. - - The EISA bus provided some of the features of the IBM MicroChannel - bus while maintaining backward compatibility with cards made for - the older ISA bus. The EISA bus saw limited use between 1988 and - 1995 when it was made obsolete by the PCI bus.
- - Say Y here if you are building a kernel for an EISA-based machine. - - Otherwise, say N. - -source "drivers/eisa/Kconfig" - -config MCA - bool "MCA support" - depends on !(X86_VISWS || X86_VOYAGER) - help - MicroChannel Architecture is found in some IBM PS/2 machines and - laptops. It is a bus system similar to PCI or ISA. See - <file:Documentation/mca.txt> (and especially the web page given - there) before attempting to build an MCA bus kernel. - -config MCA - depends on X86_VOYAGER - default y if X86_VOYAGER - -source "drivers/mca/Kconfig" - -config SCx200 - tristate "NatSemi SCx200 support" - depends on !X86_VOYAGER - help - This provides basic support for the National Semiconductor SCx200 - processor. Right now this is just a driver for the GPIO pins. - - If you don't know what to do here, say N. - - This support is also available as a module. If compiled as a - module, it will be called scx200. - -source "drivers/pcmcia/Kconfig" - -source "drivers/pci/hotplug/Kconfig" - -endmenu - -endif - -source "arch/i386/Kconfig.debug" - -# -# Use the generic interrupt handling code in kernel/irq/: -# -config GENERIC_HARDIRQS - bool - default y - -config GENERIC_IRQ_PROBE - bool - default y - -config X86_SMP - bool - depends on SMP && !X86_VOYAGER - default y - -#config X86_HT -# bool -# depends on SMP && !(X86_VISWS || X86_VOYAGER) -# default y - -config X86_BIOS_REBOOT - bool - depends on !(X86_VISWS || X86_VOYAGER) - default y - -config X86_TRAMPOLINE - bool - depends on X86_SMP || (X86_VOYAGER && SMP) - default y - -config PC - bool - depends on X86 && !EMBEDDED - default y - -endmenu diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/Makefile --- a/linux-2.6.11-xen-sparse/arch/xen/i386/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,103 +0,0 @@ -# -# i386/Makefile -# -# This file is included by the global makefile so that you can add your own -# architecture-specific flags and dependencies. Remember to have actions -# for "archclean" cleaning up for this architecture. -# -# This file is subject to the terms and conditions of the GNU General Public -# License. See the file "COPYING" in the main directory of this archive -# for more details. -# -# Copyright (C) 1994 by Linus Torvalds -# -# 19990713 Artur Skawina <skawina@xxxxxxxxxxxxx> -# Added '-march' and '-mpreferred-stack-boundary' support -# - -XENARCH := $(subst ",,$(CONFIG_XENARCH)) - -LDFLAGS := -m elf_i386 -LDFLAGS_vmlinux := -CHECK := $(CHECK) -D__i386__=1 - -CFLAGS += -m32 -AFLAGS += -m32 - -CFLAGS += -pipe -msoft-float - -# prevent gcc from keeping the stack 16 byte aligned -CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2,) - -align := $(cc-option-align) -cflags-$(CONFIG_M386) += -march=i386 -cflags-$(CONFIG_M486) += -march=i486 -cflags-$(CONFIG_M586) += -march=i586 -cflags-$(CONFIG_M586TSC) += -march=i586 -cflags-$(CONFIG_M586MMX) += $(call cc-option,-march=pentium-mmx,-march=i586) -cflags-$(CONFIG_M686) += -march=i686 -cflags-$(CONFIG_MPENTIUMII) += -march=i686 $(call cc-option,-mtune=pentium2) -cflags-$(CONFIG_MPENTIUMIII) += -march=i686 $(call cc-option,-mtune=pentium3) -cflags-$(CONFIG_MPENTIUMM) += -march=i686 $(call cc-option,-mtune=pentium3) -cflags-$(CONFIG_MPENTIUM4) += -march=i686 $(call cc-option,-mtune=pentium4) -cflags-$(CONFIG_MK6) += -march=k6 -# Please note that patches that add -march=athlon-xp and friends are pointless. -# They make zero difference whatsoever to performance at this time.
-cflags-$(CONFIG_MK7) += $(call cc-option,-march=athlon,-march=i686 $(align)-functions=4) -cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,$(call cc-option,-march=athlon,-march=i686 $(align)-functions=4)) -cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 -cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call cc-option,-mtune=pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 -cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) -cflags-$(CONFIG_MWINCHIP2) += $(call cc-option,-march=winchip2,-march=i586) -cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) -cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 -cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) - -# AMD Elan support -cflags-$(CONFIG_X86_ELAN) += -march=i486 - -# -mregparm=3 works ok on gcc-3.0 and later -# -GCC_VERSION := $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-version.sh $(CC)) -cflags-$(CONFIG_REGPARM) += $(shell if [ $(GCC_VERSION) -ge 0300 ] ; then echo "-mregparm=3"; fi ;) - -# Disable unit-at-a-time mode, it makes gcc use a lot more stack -# due to the lack of sharing of stacklots. -CFLAGS += $(call cc-option,-fno-unit-at-a-time,) - -CFLAGS += $(cflags-y) - -head-y := arch/xen/i386/kernel/head.o arch/xen/i386/kernel/init_task.o - -libs-y += arch/i386/lib/ -core-y += arch/xen/i386/kernel/ \ - arch/xen/i386/mm/ \ - arch/xen/i386/mach-default/ \ - arch/i386/crypto/ -# \ -# arch/xen/$(mcore-y)/ -drivers-$(CONFIG_MATH_EMULATION) += arch/i386/math-emu/ -drivers-$(CONFIG_PCI) += arch/xen/i386/pci/ -# must be linked after kernel/ -drivers-$(CONFIG_OPROFILE) += arch/i386/oprofile/ -drivers-$(CONFIG_PM) += arch/i386/power/ - -# for clean -obj- += kernel/ mm/ pci/ -#obj- += ../../i386/lib/ ../../i386/mm/ -#../../i386/$(mcore-y)/ -#obj- += ../../i386/pci/ ../../i386/oprofile/ ../../i386/power/ - -xenflags-y += -Iinclude/asm-xen/asm-i386/mach-xen \ - -Iinclude/asm-i386/mach-default -CFLAGS += $(xenflags-y) -AFLAGS += $(xenflags-y) - -prepare: include/asm-$(XENARCH)/asm_offsets.h -CLEAN_FILES += include/asm-$(XENARCH)/asm_offsets.h - -arch/$(XENARCH)/kernel/asm-offsets.s: include/asm include/.asm-ignore \ - include/linux/version.h include/config/MARKER - -include/asm-$(XENARCH)/asm_offsets.h: arch/$(XENARCH)/kernel/asm-offsets.s - $(call filechk,gen-asm-offsets) diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/kernel/Makefile --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,96 +0,0 @@ -# -# Makefile for the linux kernel. 
-# - -XENARCH := $(subst ",,$(CONFIG_XENARCH)) - -CFLAGS += -Iarch/$(XENARCH)/kernel - -extra-y := head.o init_task.o - -obj-y := process.o signal.o entry.o traps.o \ - time.o ioport.o ldt.o setup.o \ - pci-dma.o i386_ksyms.o irq.o - -c-obj-y := semaphore.o vm86.o \ - ptrace.o sys_i386.o \ - i387.o dmi_scan.o bootflag.o \ - doublefault.o quirks.o -s-obj-y := - -obj-y += cpu/ -obj-y += timers/ -obj-$(CONFIG_ACPI_BOOT) += acpi/ -#c-obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o -c-obj-$(CONFIG_MCA) += mca.o -c-obj-$(CONFIG_X86_MSR) += msr.o -c-obj-$(CONFIG_X86_CPUID) += cpuid.o -obj-$(CONFIG_MICROCODE) += microcode.o -c-obj-$(CONFIG_APM) += apm.o -obj-$(CONFIG_X86_SMP) += smp.o smpboot.o -#obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o -obj-$(CONFIG_X86_MPPARSE) += mpparse.o -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o -c-obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o -obj-$(CONFIG_X86_IO_APIC) += io_apic.o -c-obj-$(CONFIG_X86_NUMAQ) += numaq.o -c-obj-$(CONFIG_X86_SUMMIT_NUMA) += summit.o -c-obj-$(CONFIG_MODULES) += module.o -c-obj-y += sysenter.o -obj-y += vsyscall.o -c-obj-$(CONFIG_ACPI_SRAT) += srat.o -c-obj-$(CONFIG_HPET_TIMER) += time_hpet.o -c-obj-$(CONFIG_EFI) += efi.o efi_stub.o -c-obj-$(CONFIG_EARLY_PRINTK) += early_printk.o -c-obj-$(CONFIG_SMP_ALTERNATIVES)+= smpalts.o - -EXTRA_AFLAGS := -traditional - -c-obj-$(CONFIG_SCx200) += scx200.o - -# vsyscall.o contains the vsyscall DSO images as __initdata. -# We must build both images before we can assemble it. -# Note: kbuild does not track this dependency due to usage of .incbin -$(obj)/vsyscall.o: $(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so -targets += $(foreach F,int80 sysenter,vsyscall-$F.o vsyscall-$F.so) -targets += vsyscall.lds - -# The DSO images are built using a special linker script. -quiet_cmd_syscall = SYSCALL $@ - cmd_syscall = $(CC) -nostdlib -m32 $(SYSCFLAGS_$(@F)) \ - -Wl,-T,$(filter-out FORCE,$^) -o $@ - -export CPPFLAGS_vsyscall.lds += -P -C -U$(ARCH) - -vsyscall-flags = -shared -s -Wl,-soname=linux-gate.so.1 -SYSCFLAGS_vsyscall-sysenter.so = $(vsyscall-flags) -SYSCFLAGS_vsyscall-int80.so = $(vsyscall-flags) - -$(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so: \ -$(obj)/vsyscall-%.so: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE - $(call if_changed,syscall) - -# We also create a special relocatable object that should mirror the symbol -# table and layout of the linked DSO. With ld -R we can then refer to -# these symbols in the kernel code rather than hand-coded addresses. 
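A note on the "ld -R" idiom the comment above describes: -R reads symbol
names and addresses from vsyscall-syms.o without linking its contents into
the output, so kernel code can use the DSO's link-time addresses as
ordinary externs. A rough C sketch, not from the deleted file (treating
SYSENTER_RETURN, the symbol the sysenter path later pushes, as a byte
array is purely illustrative):

    /* Illustrative sketch: a symbol whose address is fixed by the
     * vsyscall DSO's linker script can be referenced like any other
     * extern once ld -R makes it visible to the kernel link. */
    extern const char SYSENTER_RETURN[];

    static unsigned long vsyscall_return_address(void)
    {
            /* Resolved at link time from vsyscall-syms.o, not
             * computed at run time. */
            return (unsigned long)SYSENTER_RETURN;
    }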
-extra-y += vsyscall-syms.o -$(obj)/built-in.o: $(obj)/vsyscall-syms.o -$(obj)/built-in.o: ld_flags += -R $(obj)/vsyscall-syms.o - -SYSCFLAGS_vsyscall-syms.o = -r -$(obj)/vsyscall-syms.o: $(src)/vsyscall.lds $(obj)/vsyscall-sysenter.o FORCE - $(call if_changed,syscall) - -c-link := init_task.o -s-link := vsyscall-int80.o vsyscall-sysenter.o vsyscall-sigreturn.o vsyscall.lds.o - -$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-obj-m) $(c-link)) $(patsubst %.o,$(obj)/%.S,$(s-obj-y) $(s-link)): - @ln -fsn $(srctree)/arch/i386/kernel/$(notdir $@) $@ - -$(obj)/vsyscall-int80.S: $(obj)/vsyscall-sigreturn.S - -obj-y += $(c-obj-y) $(s-obj-y) -obj-m += $(c-obj-m) - -clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-m) $(c-obj-) $(c-link)) -clean-files += $(patsubst %.o,%.S,$(s-obj-y) $(s-obj-) $(s-link)) diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/kernel/acpi/Makefile --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/acpi/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,13 +0,0 @@ -obj-$(CONFIG_ACPI_BOOT) := boot.o -c-obj-$(CONFIG_X86_IO_APIC) += earlyquirk.o -c-obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o - -c-link := - -$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)): - @ln -fsn $(srctree)/arch/i386/kernel/acpi/$(notdir $@) $@ - -obj-y += $(c-obj-y) $(s-obj-y) - -clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link)) -clean-files += $(patsubst %.o,%.S,$(s-obj-y) $(s-obj-) $(s-link)) diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/kernel/acpi/boot.c --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/acpi/boot.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,906 +0,0 @@ -/* - * boot.c - Architecture-Specific Low-Level ACPI Boot Support - * - * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@xxxxxxxxx> - * Copyright (C) 2001 Jun Nakajima <jun.nakajima@xxxxxxxxx> - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - */ - -#include <linux/init.h> -#include <linux/config.h> -#include <linux/acpi.h> -#include <linux/efi.h> -#include <linux/irq.h> -#include <linux/module.h> - -#include <asm/pgtable.h> -#include <asm/io_apic.h> -#include <asm/apic.h> -#include <asm/io.h> -#include <asm/irq.h> -#include <asm/mpspec.h> -#ifdef CONFIG_XEN -#include <asm/fixmap.h> -#endif - -void (*pm_power_off)(void) = NULL; - -#ifdef CONFIG_X86_64 - -static inline void acpi_madt_oem_check(char *oem_id, char *oem_table_id) { } -extern void __init clustered_apic_check(void); -static inline int ioapic_setup_disabled(void) { return 0; } -#include <asm/proto.h> - -#else /* X86 */ - -#ifdef CONFIG_X86_LOCAL_APIC -#include <mach_apic.h> -#include <mach_mpparse.h> -#endif /* CONFIG_X86_LOCAL_APIC */ - -#endif /* X86 */ - -#define BAD_MADT_ENTRY(entry, end) ( \ - (!entry) || (unsigned long)entry + sizeof(*entry) > end || \ - ((acpi_table_entry_header *)entry)->length != sizeof(*entry)) - -#define PREFIX "ACPI: " - -#ifdef CONFIG_ACPI_PCI -int acpi_noirq __initdata; /* skip ACPI IRQ initialization */ -int acpi_pci_disabled __initdata; /* skip ACPI PCI scan and IRQ initialization */ -#else -int acpi_noirq __initdata = 1; -int acpi_pci_disabled __initdata = 1; -#endif -int acpi_ht __initdata = 1; /* enable HT */ - -int acpi_lapic; -int acpi_ioapic; -int acpi_strict; -EXPORT_SYMBOL(acpi_strict); - -acpi_interrupt_flags acpi_sci_flags __initdata; -int acpi_sci_override_gsi __initdata; -int acpi_skip_timer_override __initdata; - -#ifdef CONFIG_X86_LOCAL_APIC -static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; -#endif - -#ifndef __HAVE_ARCH_CMPXCHG -#warning ACPI uses CMPXCHG, i486 and later hardware -#endif - -#define MAX_MADT_ENTRIES 256 -u8 x86_acpiid_to_apicid[MAX_MADT_ENTRIES] = - { [0 ... MAX_MADT_ENTRIES-1] = 0xff }; -EXPORT_SYMBOL(x86_acpiid_to_apicid); - -/* -------------------------------------------------------------------------- - Boot-time Configuration - -------------------------------------------------------------------------- */ - -/* - * The default interrupt routing model is PIC (8259). This gets - * overriden if IOAPICs are enumerated (below). - */ -enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC; - -#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN) - -/* rely on all ACPI tables being in the direct mapping */ -char *__acpi_map_table(unsigned long phys_addr, unsigned long size) -{ - if (!phys_addr || !size) - return NULL; - - if (phys_addr < (end_pfn_map << PAGE_SHIFT)) - return __va(phys_addr); - - return NULL; -} - -#else - -/* - * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END, - * to map the target physical address. The problem is that set_fixmap() - * provides a single page, and it is possible that the page is not - * sufficient. - * By using this area, we can map up to MAX_IO_APICS pages temporarily, - * i.e. until the next __va_range() call. - * - * Important Safety Note: The fixed I/O APIC page numbers are *subtracted* - * from the fixed base. That's why we start at FIX_IO_APIC_BASE_END and - * count idx down while incrementing the phys address. 
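The countdown described here follows from the usual i386 fixmap
arithmetic, in which a larger index means a lower virtual address. A
sketch of the standard definitions (typical values, quoted from memory,
so treat as illustrative rather than authoritative):

    /* Illustrative: why idx counts down while phys counts up. */
    #define PAGE_SHIFT          12
    #define FIXADDR_TOP         0xfffff000UL  /* typical i386 value */
    #define __fix_to_virt(idx)  (FIXADDR_TOP - ((idx) << PAGE_SHIFT))

    /* __fix_to_virt(FIX_ACPI_END)     maps the first page;
     * __fix_to_virt(FIX_ACPI_END - 1) is exactly PAGE_SIZE higher,
     * so decrementing idx yields the next consecutive virtual page. */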
- */ -char *__acpi_map_table(unsigned long phys, unsigned long size) -{ - unsigned long base, offset, mapped_size; - int idx; - -#ifndef CONFIG_XEN - if (phys + size < 8*1024*1024) - return __va(phys); -#endif - - offset = phys & (PAGE_SIZE - 1); - mapped_size = PAGE_SIZE - offset; - set_fixmap(FIX_ACPI_END, phys); - base = fix_to_virt(FIX_ACPI_END); - - /* - * Most cases can be covered by the below. - */ - idx = FIX_ACPI_END; - while (mapped_size < size) { - if (--idx < FIX_ACPI_BEGIN) - return NULL; /* cannot handle this */ - phys += PAGE_SIZE; - set_fixmap(idx, phys); - mapped_size += PAGE_SIZE; - } - - return ((unsigned char *) base + offset); -} -#endif - -#ifdef CONFIG_PCI_MMCONFIG -static int __init acpi_parse_mcfg(unsigned long phys_addr, unsigned long size) -{ - struct acpi_table_mcfg *mcfg; - - if (!phys_addr || !size) - return -EINVAL; - - mcfg = (struct acpi_table_mcfg *) __acpi_map_table(phys_addr, size); - if (!mcfg) { - printk(KERN_WARNING PREFIX "Unable to map MCFG\n"); - return -ENODEV; - } - - if (mcfg->base_reserved) { - printk(KERN_ERR PREFIX "MMCONFIG not in low 4GB of memory\n"); - return -ENODEV; - } - - pci_mmcfg_base_addr = mcfg->base_address; - - return 0; -} -#else -#define acpi_parse_mcfg NULL -#endif /* !CONFIG_PCI_MMCONFIG */ - -#ifdef CONFIG_X86_LOCAL_APIC -static int __init -acpi_parse_madt ( - unsigned long phys_addr, - unsigned long size) -{ - struct acpi_table_madt *madt = NULL; - - if (!phys_addr || !size) - return -EINVAL; - - madt = (struct acpi_table_madt *) __acpi_map_table(phys_addr, size); - if (!madt) { - printk(KERN_WARNING PREFIX "Unable to map MADT\n"); - return -ENODEV; - } - - if (madt->lapic_address) { - acpi_lapic_addr = (u64) madt->lapic_address; - - printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n", - madt->lapic_address); - } - - acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id); - - return 0; -} - - -static int __init -acpi_parse_lapic ( - acpi_table_entry_header *header, const unsigned long end) -{ - struct acpi_table_lapic *processor = NULL; - - processor = (struct acpi_table_lapic*) header; - - if (BAD_MADT_ENTRY(processor, end)) - return -EINVAL; - - acpi_table_print_madt_entry(header); - - /* no utility in registering a disabled processor */ - if (processor->flags.enabled == 0) - return 0; - - x86_acpiid_to_apicid[processor->acpi_id] = processor->id; - - mp_register_lapic ( - processor->id, /* APIC ID */ - processor->flags.enabled); /* Enabled? 
*/ - - return 0; -} - -static int __init -acpi_parse_lapic_addr_ovr ( - acpi_table_entry_header *header, const unsigned long end) -{ - struct acpi_table_lapic_addr_ovr *lapic_addr_ovr = NULL; - - lapic_addr_ovr = (struct acpi_table_lapic_addr_ovr*) header; - - if (BAD_MADT_ENTRY(lapic_addr_ovr, end)) - return -EINVAL; - - acpi_lapic_addr = lapic_addr_ovr->address; - - return 0; -} - -static int __init -acpi_parse_lapic_nmi ( - acpi_table_entry_header *header, const unsigned long end) -{ - struct acpi_table_lapic_nmi *lapic_nmi = NULL; - - lapic_nmi = (struct acpi_table_lapic_nmi*) header; - - if (BAD_MADT_ENTRY(lapic_nmi, end)) - return -EINVAL; - - acpi_table_print_madt_entry(header); - - if (lapic_nmi->lint != 1) - printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n"); - - return 0; -} - - -#endif /*CONFIG_X86_LOCAL_APIC*/ - -#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_ACPI_INTERPRETER) - -static int __init -acpi_parse_ioapic ( - acpi_table_entry_header *header, const unsigned long end) -{ - struct acpi_table_ioapic *ioapic = NULL; - - ioapic = (struct acpi_table_ioapic*) header; - - if (BAD_MADT_ENTRY(ioapic, end)) - return -EINVAL; - - acpi_table_print_madt_entry(header); - - mp_register_ioapic ( - ioapic->id, - ioapic->address, - ioapic->global_irq_base); - - return 0; -} - -/* - * Parse Interrupt Source Override for the ACPI SCI - */ -static void -acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger) -{ - if (trigger == 0) /* compatible SCI trigger is level */ - trigger = 3; - - if (polarity == 0) /* compatible SCI polarity is low */ - polarity = 3; - - /* Command-line over-ride via acpi_sci= */ - if (acpi_sci_flags.trigger) - trigger = acpi_sci_flags.trigger; - - if (acpi_sci_flags.polarity) - polarity = acpi_sci_flags.polarity; - - /* - * mp_config_acpi_legacy_irqs() already setup IRQs < 16 - * If GSI is < 16, this will update its flags, - * else it will create a new mp_irqs[] entry. - */ - mp_override_legacy_irq(gsi, polarity, trigger, gsi); - - /* - * stash over-ride to indicate we've been here - * and for later update of acpi_fadt - */ - acpi_sci_override_gsi = gsi; - return; -} - -static int __init -acpi_parse_int_src_ovr ( - acpi_table_entry_header *header, const unsigned long end) -{ - struct acpi_table_int_src_ovr *intsrc = NULL; - - intsrc = (struct acpi_table_int_src_ovr*) header; - - if (BAD_MADT_ENTRY(intsrc, end)) - return -EINVAL; - - acpi_table_print_madt_entry(header); - - if (intsrc->bus_irq == acpi_fadt.sci_int) { - acpi_sci_ioapic_setup(intsrc->global_irq, - intsrc->flags.polarity, intsrc->flags.trigger); - return 0; - } - - if (acpi_skip_timer_override && - intsrc->bus_irq == 0 && intsrc->global_irq == 2) { - printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n"); - return 0; - } - - mp_override_legacy_irq ( - intsrc->bus_irq, - intsrc->flags.polarity, - intsrc->flags.trigger, - intsrc->global_irq); - - return 0; -} - - -static int __init -acpi_parse_nmi_src ( - acpi_table_entry_header *header, const unsigned long end) -{ - struct acpi_table_nmi_src *nmi_src = NULL; - - nmi_src = (struct acpi_table_nmi_src*) header; - - if (BAD_MADT_ENTRY(nmi_src, end)) - return -EINVAL; - - acpi_table_print_madt_entry(header); - - /* TBD: Support nimsrc entries? 
*/ - - return 0; -} - -#endif /* CONFIG_X86_IO_APIC */ - -#ifdef CONFIG_ACPI_BUS - -/* - * acpi_pic_sci_set_trigger() - * - * use ELCR to set PIC-mode trigger type for SCI - * - * If a PIC-mode SCI is not recognized or gives spurious IRQ7's - * it may require Edge Trigger -- use "acpi_sci=edge" - * - * Port 0x4d0-4d1 are ECLR1 and ECLR2, the Edge/Level Control Registers - * for the 8259 PIC. bit[n] = 1 means irq[n] is Level, otherwise Edge. - * ECLR1 is IRQ's 0-7 (IRQ 0, 1, 2 must be 0) - * ECLR2 is IRQ's 8-15 (IRQ 8, 13 must be 0) - */ - -void __init -acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) -{ - unsigned int mask = 1 << irq; - unsigned int old, new; - - /* Real old ELCR mask */ - old = inb(0x4d0) | (inb(0x4d1) << 8); - - /* - * If we use ACPI to set PCI irq's, then we should clear ELCR - * since we will set it correctly as we enable the PCI irq - * routing. - */ - new = acpi_noirq ? old : 0; - - /* - * Update SCI information in the ELCR, it isn't in the PCI - * routing tables.. - */ - switch (trigger) { - case 1: /* Edge - clear */ - new &= ~mask; - break; - case 3: /* Level - set */ - new |= mask; - break; - } - - if (old == new) - return; - - printk(PREFIX "setting ELCR to %04x (from %04x)\n", new, old); - outb(new, 0x4d0); - outb(new >> 8, 0x4d1); -} - - -#endif /* CONFIG_ACPI_BUS */ - -int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) -{ -#ifdef CONFIG_X86_IO_APIC - if (use_pci_vector() && !platform_legacy_irq(gsi)) - *irq = IO_APIC_VECTOR(gsi); - else -#endif - *irq = gsi; - return 0; -} - -unsigned int acpi_register_gsi(u32 gsi, int edge_level, int active_high_low) -{ - unsigned int irq; - unsigned int plat_gsi = gsi; - -#ifdef CONFIG_X86_IO_APIC - if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) { - plat_gsi = mp_register_gsi(gsi, edge_level, active_high_low); - } -#endif - acpi_gsi_to_irq(plat_gsi, &irq); - return irq; -} -EXPORT_SYMBOL(acpi_register_gsi); - -/* - * ACPI based hotplug support for CPU - */ -#ifdef CONFIG_ACPI_HOTPLUG_CPU -int -acpi_map_lsapic(acpi_handle handle, int *pcpu) -{ - /* TBD */ - return -EINVAL; -} -EXPORT_SYMBOL(acpi_map_lsapic); - - -int -acpi_unmap_lsapic(int cpu) -{ - /* TBD */ - return -EINVAL; -} -EXPORT_SYMBOL(acpi_unmap_lsapic); -#endif /* CONFIG_ACPI_HOTPLUG_CPU */ - -static unsigned long __init -acpi_scan_rsdp ( - unsigned long start, - unsigned long length) -{ - unsigned long offset = 0; - unsigned long sig_len = sizeof("RSD PTR ") - 1; - unsigned long vstart = (unsigned long)isa_bus_to_virt(start); - - /* - * Scan all 16-byte boundaries of the physical memory region for the - * RSDP signature. 
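One detail worth noting about the scan implemented just below: it matches
only the eight-byte signature. The ACPI spec additionally defines a v1.0
checksum over the first 20 bytes of the RSDP; a fuller probe would verify
it along these lines (hypothetical helper, not in the deleted file):

    #include <stdint.h>

    /* ACPI 1.0: the first 20 bytes of a valid RSDP sum to zero mod 256. */
    static int rsdp_checksum_ok(const uint8_t *rsdp)
    {
            uint8_t sum = 0;
            int i;

            for (i = 0; i < 20; i++)
                    sum += rsdp[i];
            return sum == 0;
    }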
- */ - for (offset = 0; offset < length; offset += 16) { - if (strncmp((char *) (vstart + offset), "RSD PTR ", sig_len)) - continue; - return (start + offset); - } - - return 0; -} - -static int __init acpi_parse_sbf(unsigned long phys_addr, unsigned long size) -{ - struct acpi_table_sbf *sb; - - if (!phys_addr || !size) - return -EINVAL; - - sb = (struct acpi_table_sbf *) __acpi_map_table(phys_addr, size); - if (!sb) { - printk(KERN_WARNING PREFIX "Unable to map SBF\n"); - return -ENODEV; - } - - sbf_port = sb->sbf_cmos; /* Save CMOS port */ - - return 0; -} - - -#ifdef CONFIG_HPET_TIMER - -static int __init acpi_parse_hpet(unsigned long phys, unsigned long size) -{ - struct acpi_table_hpet *hpet_tbl; - - if (!phys || !size) - return -EINVAL; - - hpet_tbl = (struct acpi_table_hpet *) __acpi_map_table(phys, size); - if (!hpet_tbl) { - printk(KERN_WARNING PREFIX "Unable to map HPET\n"); - return -ENODEV; - } - - if (hpet_tbl->addr.space_id != ACPI_SPACE_MEM) { - printk(KERN_WARNING PREFIX "HPET timers must be located in " - "memory.\n"); - return -1; - } - -#ifdef CONFIG_X86_64 - vxtime.hpet_address = hpet_tbl->addr.addrl | - ((long) hpet_tbl->addr.addrh << 32); - - printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", - hpet_tbl->id, vxtime.hpet_address); -#else /* X86 */ - { - extern unsigned long hpet_address; - - hpet_address = hpet_tbl->addr.addrl; - printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", - hpet_tbl->id, hpet_address); - } -#endif /* X86 */ - - return 0; -} -#else -#define acpi_parse_hpet NULL -#endif - -#ifdef CONFIG_X86_PM_TIMER -extern u32 pmtmr_ioport; -#endif - -static int __init acpi_parse_fadt(unsigned long phys, unsigned long size) -{ - struct fadt_descriptor_rev2 *fadt = NULL; - - fadt = (struct fadt_descriptor_rev2*) __acpi_map_table(phys,size); - if(!fadt) { - printk(KERN_WARNING PREFIX "Unable to map FADT\n"); - return 0; - } - -#ifdef CONFIG_ACPI_INTERPRETER - /* initialize sci_int early for INT_SRC_OVR MADT parsing */ - acpi_fadt.sci_int = fadt->sci_int; -#endif - -#ifdef CONFIG_X86_PM_TIMER - /* detect the location of the ACPI PM Timer */ - if (fadt->revision >= FADT2_REVISION_ID) { - /* FADT rev. 2 */ - if (fadt->xpm_tmr_blk.address_space_id != ACPI_ADR_SPACE_SYSTEM_IO) - return 0; - - pmtmr_ioport = fadt->xpm_tmr_blk.address; - } else { - /* FADT rev. 1 */ - pmtmr_ioport = fadt->V1_pm_tmr_blk; - } - if (pmtmr_ioport) - printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n", pmtmr_ioport); -#endif - return 0; -} - - -unsigned long __init -acpi_find_rsdp (void) -{ - unsigned long rsdp_phys = 0; - - if (efi_enabled) { - if (efi.acpi20) - return __pa(efi.acpi20); - else if (efi.acpi) - return __pa(efi.acpi); - } - /* - * Scan memory looking for the RSDP signature. First search EBDA (low - * memory) paragraphs and then search upper memory (E0000-FFFFF). - */ - rsdp_phys = acpi_scan_rsdp (0, 0x400); - if (!rsdp_phys) - rsdp_phys = acpi_scan_rsdp (0xE0000, 0x20000); - - set_fixmap(FIX_ACPI_RSDP_PAGE, rsdp_phys); - - return rsdp_phys; -} - -#ifdef CONFIG_X86_LOCAL_APIC -/* - * Parse LAPIC entries in MADT - * returns 0 on success, < 0 on error - */ -static int __init -acpi_parse_madt_lapic_entries(void) -{ - int count; - - /* - * Note that the LAPIC address is obtained from the MADT (32-bit value) - * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value). 
- */ - - count = acpi_table_parse_madt(ACPI_MADT_LAPIC_ADDR_OVR, acpi_parse_lapic_addr_ovr, 0); - if (count < 0) { - printk(KERN_ERR PREFIX "Error parsing LAPIC address override entry\n"); - return count; - } - - mp_register_lapic_address(acpi_lapic_addr); - - count = acpi_table_parse_madt(ACPI_MADT_LAPIC, acpi_parse_lapic, - MAX_APICS); - if (!count) { - printk(KERN_ERR PREFIX "No LAPIC entries present\n"); - /* TBD: Cleanup to allow fallback to MPS */ - return -ENODEV; - } - else if (count < 0) { - printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n"); - /* TBD: Cleanup to allow fallback to MPS */ - return count; - } - - count = acpi_table_parse_madt(ACPI_MADT_LAPIC_NMI, acpi_parse_lapic_nmi, 0); - if (count < 0) { - printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n"); - /* TBD: Cleanup to allow fallback to MPS */ - return count; - } - return 0; -} -#endif /* CONFIG_X86_LOCAL_APIC */ - -#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_ACPI_INTERPRETER) -/* - * Parse IOAPIC related entries in MADT - * returns 0 on success, < 0 on error - */ -static int __init -acpi_parse_madt_ioapic_entries(void) -{ - int count; - - /* - * ACPI interpreter is required to complete interrupt setup, - * so if it is off, don't enumerate the io-apics with ACPI. - * If MPS is present, it will handle them, - * otherwise the system will stay in PIC mode - */ - if (acpi_disabled || acpi_noirq) { - return -ENODEV; - } - - /* - * if "noapic" boot option, don't look for IO-APICs - */ - if (skip_ioapic_setup) { - printk(KERN_INFO PREFIX "Skipping IOAPIC probe " - "due to 'noapic' option.\n"); - return -ENODEV; - } - - count = acpi_table_parse_madt(ACPI_MADT_IOAPIC, acpi_parse_ioapic, MAX_IO_APICS); - if (!count) { - printk(KERN_ERR PREFIX "No IOAPIC entries present\n"); - return -ENODEV; - } - else if (count < 0) { - printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n"); - return count; - } - - count = acpi_table_parse_madt(ACPI_MADT_INT_SRC_OVR, acpi_parse_int_src_ovr, NR_IRQ_VECTORS); - if (count < 0) { - printk(KERN_ERR PREFIX "Error parsing interrupt source overrides entry\n"); - /* TBD: Cleanup to allow fallback to MPS */ - return count; - } - - /* - * If BIOS did not supply an INT_SRC_OVR for the SCI - * pretend we got one so we can set the SCI flags. - */ - if (!acpi_sci_override_gsi) - acpi_sci_ioapic_setup(acpi_fadt.sci_int, 0, 0); - - /* Fill in identity legacy mapings where no override */ - mp_config_acpi_legacy_irqs(); - - count = acpi_table_parse_madt(ACPI_MADT_NMI_SRC, acpi_parse_nmi_src, NR_IRQ_VECTORS); - if (count < 0) { - printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n"); - /* TBD: Cleanup to allow fallback to MPS */ - return count; - } - - return 0; -} -#else -static inline int acpi_parse_madt_ioapic_entries(void) -{ - return -1; -} -#endif /* !(CONFIG_X86_IO_APIC && CONFIG_ACPI_INTERPRETER) */ - - -static void __init -acpi_process_madt(void) -{ -#ifdef CONFIG_X86_LOCAL_APIC - int count, error; - - count = acpi_table_parse(ACPI_APIC, acpi_parse_madt); - if (count >= 1) { - - /* - * Parse MADT LAPIC entries - */ - error = acpi_parse_madt_lapic_entries(); - if (!error) { - acpi_lapic = 1; - - /* - * Parse MADT IO-APIC entries - */ - error = acpi_parse_madt_ioapic_entries(); - if (!error) { - acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC; - acpi_irq_balance_set(NULL); - acpi_ioapic = 1; - - smp_found_config = 1; - clustered_apic_check(); - } - } - if (error == -EINVAL) { - /* - * Dell Precision Workstation 410, 610 come here. 
- */ - printk(KERN_ERR PREFIX "Invalid BIOS MADT, disabling ACPI\n"); - disable_acpi(); - } - } -#endif - return; -} - -/* - * acpi_boot_table_init() and acpi_boot_init() - * called from setup_arch(), always. - * 1. checksums all tables - * 2. enumerates lapics - * 3. enumerates io-apics - * - * acpi_table_init() is separate to allow reading SRAT without - * other side effects. - * - * side effects of acpi_boot_init: - * acpi_lapic = 1 if LAPIC found - * acpi_ioapic = 1 if IOAPIC found - * if (acpi_lapic && acpi_ioapic) smp_found_config = 1; - * if acpi_blacklisted() acpi_disabled = 1; - * acpi_irq_model=... - * ... - * - * return value: (currently ignored) - * 0: success - * !0: failure - */ - -int __init -acpi_boot_table_init(void) -{ - int error; - - /* - * If acpi_disabled, bail out - * One exception: acpi=ht continues far enough to enumerate LAPICs - */ - if (acpi_disabled && !acpi_ht) - return 1; - - /* - * Initialize the ACPI boot-time table parser. - */ - error = acpi_table_init(); - if (error) { - disable_acpi(); - return error; - } - -#ifdef __i386__ - check_acpi_pci(); -#endif - - acpi_table_parse(ACPI_BOOT, acpi_parse_sbf); - - /* - * blacklist may disable ACPI entirely - */ - error = acpi_blacklisted(); - if (error) { - extern int acpi_force; - - if (acpi_force) { - printk(KERN_WARNING PREFIX "acpi=force override\n"); - } else { - printk(KERN_WARNING PREFIX "Disabling ACPI support\n"); - disable_acpi(); - return error; - } - } - - return 0; -} - - -int __init acpi_boot_init(void) -{ - /* - * If acpi_disabled, bail out - * One exception: acpi=ht continues far enough to enumerate LAPICs - */ - if (acpi_disabled && !acpi_ht) - return 1; - - acpi_table_parse(ACPI_BOOT, acpi_parse_sbf); - - /* - * set sci_int and PM timer address - */ - acpi_table_parse(ACPI_FADT, acpi_parse_fadt); - - /* - * Process the Multiple APIC Description Table (MADT), if present - */ - acpi_process_madt(); - - acpi_table_parse(ACPI_HPET, acpi_parse_hpet); - acpi_table_parse(ACPI_MCFG, acpi_parse_mcfg); - - return 0; -} - diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/kernel/apic.c --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/apic.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,83 +0,0 @@ -/* - * Local APIC handling, local APIC timers - * - * (c) 1999, 2000 Ingo Molnar <mingo@xxxxxxxxxx> - * - * Fixes - * Maciej W. Rozycki : Bits for genuine 82489DX APICs; - * thanks to Eric Gilmore - * and Rolf G. Tews - * for testing these extensively. - * Maciej W. Rozycki : Various updates and fixes. - * Mikael Pettersson : Power Management for UP-APIC. - * Pavel Machek and - * Mikael Pettersson : PM converted to driver model. - */ - -#include <linux/config.h> -#include <linux/init.h> - -#include <linux/mm.h> -#include <linux/irq.h> -#include <linux/delay.h> -#include <linux/bootmem.h> -#include <linux/smp_lock.h> -#include <linux/interrupt.h> -#include <linux/mc146818rtc.h> -#include <linux/kernel_stat.h> -#include <linux/sysdev.h> - -#include <asm/atomic.h> -#include <asm/smp.h> -#include <asm/mtrr.h> -#include <asm/mpspec.h> -#include <asm/desc.h> -#include <asm/arch_hooks.h> -#include <asm/hpet.h> - -#include <mach_apic.h> - -#include "io_ports.h" - -/* - * Debug level - */ -int apic_verbosity; - -int get_physical_broadcast(void) -{ - return 0xff; -} - -/* - * 'what should we do if we get a hw irq event on an illegal vector'. - * each architecture has to answer this themselves. 
- */ -void ack_bad_irq(unsigned int irq) -{ - printk("unexpected IRQ trap at vector %02x\n", irq); - /* - * Currently unexpected vectors happen only on SMP and APIC. - * We _must_ ack these because every local APIC has only N - * irq slots per priority level, and a 'hanging, unacked' IRQ - * holds up an irq slot - in excessive cases (when multiple - * unexpected vectors occur) that might lock up the APIC - * completely. - */ - ack_APIC_irq(); -} - -/* - * This initializes the IO-APIC and APIC hardware if this is - * a UP kernel. - */ -int __init APIC_init_uniprocessor (void) -{ -#ifdef CONFIG_X86_IO_APIC - if (smp_found_config) - if (!skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); -#endif - - return 0; -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/kernel/cpu/Makefile --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/cpu/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,31 +0,0 @@ -# -# Makefile for x86-compatible CPU details and quirks -# - -CFLAGS += -Iarch/i386/kernel/cpu - -obj-y := common.o -c-obj-y += proc.o - -c-obj-y += amd.o -c-obj-y += cyrix.o -c-obj-y += centaur.o -c-obj-y += transmeta.o -c-obj-y += intel.o intel_cacheinfo.o -c-obj-y += rise.o -c-obj-y += nexgen.o -c-obj-y += umc.o - -#obj-$(CONFIG_X86_MCE) += ../../../../i386/kernel/cpu/mcheck/ - -obj-$(CONFIG_MTRR) += mtrr/ -#obj-$(CONFIG_CPU_FREQ) += ../../../../i386/kernel/cpu/cpufreq/ - -c-link := - -$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)): - @ln -fsn $(srctree)/arch/i386/kernel/cpu/$(notdir $@) $@ - -obj-y += $(c-obj-y) - -clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link)) diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/kernel/cpu/common.c --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/cpu/common.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,642 +0,0 @@ -#include <linux/init.h> -#include <linux/string.h> -#include <linux/delay.h> -#include <linux/smp.h> -#include <linux/module.h> -#include <linux/percpu.h> -#include <asm/semaphore.h> -#include <asm/processor.h> -#include <asm/i387.h> -#include <asm/msr.h> -#include <asm/io.h> -#include <asm/mmu_context.h> -#ifdef CONFIG_X86_LOCAL_APIC -#include <asm/mpspec.h> -#include <asm/apic.h> -#include <mach_apic.h> -#endif -#include <asm-xen/hypervisor.h> - -#include "cpu.h" - -DEFINE_PER_CPU(struct desc_struct, cpu_gdt_table[GDT_ENTRIES]); -EXPORT_PER_CPU_SYMBOL(cpu_gdt_table); - -static int cachesize_override __initdata = -1; -static int disable_x86_fxsr __initdata = 0; -static int disable_x86_serial_nr __initdata = 1; - -struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {}; - -extern void mcheck_init(struct cpuinfo_x86 *c); - -extern void machine_specific_modify_cpu_capabilities(struct cpuinfo_x86 *c); - -extern int disable_pse; - -static void default_init(struct cpuinfo_x86 * c) -{ - /* Not much we can do here... */ - /* Check if at least it has cpuid */ - if (c->cpuid_level == -1) { - /* No cpuid. 
It must be an ancient CPU */ - if (c->x86 == 4) - strcpy(c->x86_model_id, "486"); - else if (c->x86 == 3) - strcpy(c->x86_model_id, "386"); - } -} - -static struct cpu_dev default_cpu = { - .c_init = default_init, -}; -static struct cpu_dev * this_cpu = &default_cpu; - -static int __init cachesize_setup(char *str) -{ - get_option (&str, &cachesize_override); - return 1; -} -__setup("cachesize=", cachesize_setup); - -int __init get_model_name(struct cpuinfo_x86 *c) -{ - unsigned int *v; - char *p, *q; - - if (cpuid_eax(0x80000000) < 0x80000004) - return 0; - - v = (unsigned int *) c->x86_model_id; - cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); - cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); - cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); - c->x86_model_id[48] = 0; - - /* Intel chips right-justify this string for some dumb reason; - undo that brain damage */ - p = q = &c->x86_model_id[0]; - while ( *p == ' ' ) - p++; - if ( p != q ) { - while ( *p ) - *q++ = *p++; - while ( q <= &c->x86_model_id[48] ) - *q++ = '\0'; /* Zero-pad the rest */ - } - - return 1; -} - - -void __init display_cacheinfo(struct cpuinfo_x86 *c) -{ - unsigned int n, dummy, ecx, edx, l2size; - - n = cpuid_eax(0x80000000); - - if (n >= 0x80000005) { - cpuid(0x80000005, &dummy, &dummy, &ecx, &edx); - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); - c->x86_cache_size=(ecx>>24)+(edx>>24); - } - - if (n < 0x80000006) /* Some chips just has a large L1. */ - return; - - ecx = cpuid_ecx(0x80000006); - l2size = ecx >> 16; - - /* do processor-specific cache resizing */ - if (this_cpu->c_size_cache) - l2size = this_cpu->c_size_cache(c,l2size); - - /* Allow user to override all this if necessary. */ - if (cachesize_override != -1) - l2size = cachesize_override; - - if ( l2size == 0 ) - return; /* Again, no L2 cache is possible */ - - c->x86_cache_size = l2size; - - printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", - l2size, ecx & 0xFF); -} - -/* Naming convention should be: <Name> [(<Codename>)] */ -/* This table only is used unless init_<vendor>() below doesn't set it; */ -/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */ - -/* Look up CPU names by table lookup. 
*/ -static char __init *table_lookup_model(struct cpuinfo_x86 *c) -{ - struct cpu_model_info *info; - - if ( c->x86_model >= 16 ) - return NULL; /* Range check */ - - if (!this_cpu) - return NULL; - - info = this_cpu->c_models; - - while (info && info->family) { - if (info->family == c->x86) - return info->model_names[c->x86_model]; - info++; - } - return NULL; /* Not found */ -} - - -void __init get_cpu_vendor(struct cpuinfo_x86 *c, int early) -{ - char *v = c->x86_vendor_id; - int i; - - for (i = 0; i < X86_VENDOR_NUM; i++) { - if (cpu_devs[i]) { - if (!strcmp(v,cpu_devs[i]->c_ident[0]) || - (cpu_devs[i]->c_ident[1] && - !strcmp(v,cpu_devs[i]->c_ident[1]))) { - c->x86_vendor = i; - if (!early) - this_cpu = cpu_devs[i]; - break; - } - } - } -} - - -static int __init x86_fxsr_setup(char * s) -{ - disable_x86_fxsr = 1; - return 1; -} -__setup("nofxsr", x86_fxsr_setup); - - -/* Standard macro to see if a specific flag is changeable */ -static inline int flag_is_changeable_p(u32 flag) -{ - u32 f1, f2; - - asm("pushfl\n\t" - "pushfl\n\t" - "popl %0\n\t" - "movl %0,%1\n\t" - "xorl %2,%0\n\t" - "pushl %0\n\t" - "popfl\n\t" - "pushfl\n\t" - "popl %0\n\t" - "popfl\n\t" - : "=&r" (f1), "=&r" (f2) - : "ir" (flag)); - - return ((f1^f2) & flag) != 0; -} - - -/* Probe for the CPUID instruction */ -int __init have_cpuid_p(void) -{ - return flag_is_changeable_p(X86_EFLAGS_ID); -} - -/* Do minimum CPU detection early. - Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment. - The others are not touched to avoid unwanted side effects. */ -void __init early_cpu_detect(void) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - - c->x86_cache_alignment = 32; - - if (!have_cpuid_p()) - return; - - /* Get vendor name */ - cpuid(0x00000000, &c->cpuid_level, - (int *)&c->x86_vendor_id[0], - (int *)&c->x86_vendor_id[8], - (int *)&c->x86_vendor_id[4]); - - get_cpu_vendor(c, 1); - - c->x86 = 4; - if (c->cpuid_level >= 0x00000001) { - u32 junk, tfms, cap0, misc; - cpuid(0x00000001, &tfms, &misc, &junk, &cap0); - c->x86 = (tfms >> 8) & 15; - c->x86_model = (tfms >> 4) & 15; - if (c->x86 == 0xf) { - c->x86 += (tfms >> 20) & 0xff; - c->x86_model += ((tfms >> 16) & 0xF) << 4; - } - c->x86_mask = tfms & 15; - if (cap0 & (1<<19)) - c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8; - } - - early_intel_workaround(c); -} - -void __init generic_identify(struct cpuinfo_x86 * c) -{ - u32 tfms, xlvl; - int junk; - - if (have_cpuid_p()) { - /* Get vendor name */ - cpuid(0x00000000, &c->cpuid_level, - (int *)&c->x86_vendor_id[0], - (int *)&c->x86_vendor_id[8], - (int *)&c->x86_vendor_id[4]); - - get_cpu_vendor(c, 0); - /* Initialize the standard set of capabilities */ - /* Note that the vendor-specific code below might override */ - - /* Intel-defined flags: level 0x00000001 */ - if ( c->cpuid_level >= 0x00000001 ) { - u32 capability, excap; - cpuid(0x00000001, &tfms, &junk, &excap, &capability); - c->x86_capability[0] = capability; - c->x86_capability[4] = excap; - c->x86 = (tfms >> 8) & 15; - c->x86_model = (tfms >> 4) & 15; - if (c->x86 == 0xf) { - c->x86 += (tfms >> 20) & 0xff; - c->x86_model += ((tfms >> 16) & 0xF) << 4; - } - c->x86_mask = tfms & 15; - } else { - /* Have CPUID level 0 only - unheard of */ - c->x86 = 4; - } - - /* AMD-defined flags: level 0x80000001 */ - xlvl = cpuid_eax(0x80000000); - if ( (xlvl & 0xffff0000) == 0x80000000 ) { - if ( xlvl >= 0x80000001 ) { - c->x86_capability[1] = cpuid_edx(0x80000001); - c->x86_capability[6] = cpuid_ecx(0x80000001); - } - if ( xlvl >= 0x80000004 ) - 
get_model_name(c); /* Default name */ - } - } -} - -static void __init squash_the_stupid_serial_number(struct cpuinfo_x86 *c) -{ - if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) { - /* Disable processor serial number */ - unsigned long lo,hi; - rdmsr(MSR_IA32_BBL_CR_CTL,lo,hi); - lo |= 0x200000; - wrmsr(MSR_IA32_BBL_CR_CTL,lo,hi); - printk(KERN_NOTICE "CPU serial number disabled.\n"); - clear_bit(X86_FEATURE_PN, c->x86_capability); - - /* Disabling the serial number may affect the cpuid level */ - c->cpuid_level = cpuid_eax(0); - } -} - -static int __init x86_serial_nr_setup(char *s) -{ - disable_x86_serial_nr = 0; - return 1; -} -__setup("serialnumber", x86_serial_nr_setup); - - - -/* - * This does the hard work of actually picking apart the CPU stuff... - */ -void __init identify_cpu(struct cpuinfo_x86 *c) -{ - int i; - - c->loops_per_jiffy = loops_per_jiffy; - c->x86_cache_size = -1; - c->x86_vendor = X86_VENDOR_UNKNOWN; - c->cpuid_level = -1; /* CPUID not detected */ - c->x86_model = c->x86_mask = 0; /* So far unknown... */ - c->x86_vendor_id[0] = '\0'; /* Unset */ - c->x86_model_id[0] = '\0'; /* Unset */ - c->x86_num_cores = 1; - memset(&c->x86_capability, 0, sizeof c->x86_capability); - - if (!have_cpuid_p()) { - /* First of all, decide if this is a 486 or higher */ - /* It's a 486 if we can modify the AC flag */ - if ( flag_is_changeable_p(X86_EFLAGS_AC) ) - c->x86 = 4; - else - c->x86 = 3; - } - - generic_identify(c); - - printk(KERN_DEBUG "CPU: After generic identify, caps:"); - for (i = 0; i < NCAPINTS; i++) - printk(" %08lx", c->x86_capability[i]); - printk("\n"); - - if (this_cpu->c_identify) { - this_cpu->c_identify(c); - - printk(KERN_DEBUG "CPU: After vendor identify, caps:"); - for (i = 0; i < NCAPINTS; i++) - printk(" %08lx", c->x86_capability[i]); - printk("\n"); - } - - /* - * Vendor-specific initialization. In this section we - * canonicalize the feature flags, meaning if there are - * features a certain CPU supports which CPUID doesn't - * tell us, CPUID claiming incorrect flags, or other bugs, - * we handle them here. - * - * At the end of this section, c->x86_capability better - * indicate the features this CPU genuinely supports! - */ - if (this_cpu->c_init) - this_cpu->c_init(c); - - /* Disable the PN if appropriate */ - squash_the_stupid_serial_number(c); - - /* - * The vendor-specific functions might have changed features. Now - * we do "generic changes." - */ - - /* TSC disabled? */ - if ( tsc_disable ) - clear_bit(X86_FEATURE_TSC, c->x86_capability); - - /* FXSR disabled? */ - if (disable_x86_fxsr) { - clear_bit(X86_FEATURE_FXSR, c->x86_capability); - clear_bit(X86_FEATURE_XMM, c->x86_capability); - } - - if (disable_pse) - clear_bit(X86_FEATURE_PSE, c->x86_capability); - - /* If the model name is still unset, do table lookup. */ - if ( !c->x86_model_id[0] ) { - char *p; - p = table_lookup_model(c); - if ( p ) - strcpy(c->x86_model_id, p); - else - /* Last resort... */ - sprintf(c->x86_model_id, "%02x/%02x", - c->x86_vendor, c->x86_model); - } - - machine_specific_modify_cpu_capabilities(c); - - /* Now the feature flags better reflect actual CPU features! */ - - printk(KERN_DEBUG "CPU: After all inits, caps:"); - for (i = 0; i < NCAPINTS; i++) - printk(" %08lx", c->x86_capability[i]); - printk("\n"); - - /* - * On SMP, boot_cpu_data holds the common feature set between - * all CPUs; so make sure that we indicate which features are - * common between the CPUs. The first time this routine gets - * executed, c == &boot_cpu_data. 
- */ - if ( c != &boot_cpu_data ) { - /* AND the already accumulated flags with these */ - for ( i = 0 ; i < NCAPINTS ; i++ ) - boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; - } - - /* Init Machine Check Exception if available. */ -#ifdef CONFIG_X86_MCE - mcheck_init(c); -#endif -} -/* - * Perform early boot up checks for a valid TSC. See arch/i386/kernel/time.c - */ - -void __init dodgy_tsc(void) -{ - if (( boot_cpu_data.x86_vendor == X86_VENDOR_CYRIX ) || - ( boot_cpu_data.x86_vendor == X86_VENDOR_NSC )) - cpu_devs[X86_VENDOR_CYRIX]->c_init(&boot_cpu_data); -} - -#ifdef CONFIG_X86_HT -void __init detect_ht(struct cpuinfo_x86 *c) -{ - u32 eax, ebx, ecx, edx; - int index_lsb, index_msb, tmp; - int cpu = smp_processor_id(); - - if (!cpu_has(c, X86_FEATURE_HT)) - return; - - cpuid(1, &eax, &ebx, &ecx, &edx); - smp_num_siblings = (ebx & 0xff0000) >> 16; - - if (smp_num_siblings == 1) { - printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); - } else if (smp_num_siblings > 1 ) { - index_lsb = 0; - index_msb = 31; - - if (smp_num_siblings > NR_CPUS) { - printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings); - smp_num_siblings = 1; - return; - } - tmp = smp_num_siblings; - while ((tmp & 1) == 0) { - tmp >>=1 ; - index_lsb++; - } - tmp = smp_num_siblings; - while ((tmp & 0x80000000 ) == 0) { - tmp <<=1 ; - index_msb--; - } - if (index_lsb != index_msb ) - index_msb++; - phys_proc_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb); - - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - phys_proc_id[cpu]); - } -} -#endif - -void __init print_cpu_info(struct cpuinfo_x86 *c) -{ - char *vendor = NULL; - - if (c->x86_vendor < X86_VENDOR_NUM) - vendor = this_cpu->c_vendor; - else if (c->cpuid_level >= 0) - vendor = c->x86_vendor_id; - - if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor))) - printk("%s ", vendor); - - if (!c->x86_model_id[0]) - printk("%d86", c->x86); - else - printk("%s", c->x86_model_id); - - if (c->x86_mask || c->cpuid_level >= 0) - printk(" stepping %02x\n", c->x86_mask); - else - printk("\n"); -} - -cpumask_t cpu_initialized __initdata = CPU_MASK_NONE; - -/* This is hacky. :) - * We're emulating future behavior. - * In the future, the cpu-specific init functions will be called implicitly - * via the magic of initcalls. - * They will insert themselves into the cpu_devs structure. - * Then, when cpu_init() is called, we can just iterate over that array. - */ - -extern int intel_cpu_init(void); -extern int cyrix_init_cpu(void); -extern int nsc_init_cpu(void); -extern int amd_init_cpu(void); -extern int centaur_init_cpu(void); -extern int transmeta_init_cpu(void); -extern int rise_init_cpu(void); -extern int nexgen_init_cpu(void); -extern int umc_init_cpu(void); -void early_cpu_detect(void); - -void __init early_cpu_init(void) -{ - intel_cpu_init(); - cyrix_init_cpu(); - nsc_init_cpu(); - amd_init_cpu(); - centaur_init_cpu(); - transmeta_init_cpu(); - rise_init_cpu(); - nexgen_init_cpu(); - umc_init_cpu(); - early_cpu_detect(); - -#ifdef CONFIG_DEBUG_PAGEALLOC - /* pse is not compatible with on-the-fly unmapping, - * disable it even if the cpus claim to support it. 
- */ - clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); - disable_pse = 1; -#endif -} - -void __init cpu_gdt_init(struct Xgt_desc_struct *gdt_descr) -{ - unsigned long frames[16]; - unsigned long va; - int f; - - for (va = gdt_descr->address, f = 0; - va < gdt_descr->address + gdt_descr->size; - va += PAGE_SIZE, f++) { - frames[f] = virt_to_machine(va) >> PAGE_SHIFT; - make_page_readonly((void *)va); - } - if (HYPERVISOR_set_gdt(frames, gdt_descr->size / 8)) - BUG(); - lgdt_finish(); -} - -/* - * cpu_init() initializes state that is per-CPU. Some data is already - * initialized (naturally) in the bootstrap process, such as the GDT - * and IDT. We reload them nevertheless, this function acts as a - * 'CPU state barrier', nothing should get across. - */ -void __init cpu_init (void) -{ - int cpu = smp_processor_id(); - struct tss_struct * t = &per_cpu(init_tss, cpu); - struct thread_struct *thread = ¤t->thread; - - if (cpu_test_and_set(cpu, cpu_initialized)) { - printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); - for (;;) local_irq_enable(); - } - printk(KERN_INFO "Initializing CPU#%d\n", cpu); - - if (cpu_has_vme || cpu_has_de) - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - if (tsc_disable && cpu_has_tsc) { - printk(KERN_NOTICE "Disabling TSC...\n"); - /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/ - clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability); - set_in_cr4(X86_CR4_TSD); - } - - /* - * Set up the per-thread TLS descriptor cache: - */ - memcpy(thread->tls_array, &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN], - GDT_ENTRY_TLS_ENTRIES * 8); - - cpu_gdt_init(&cpu_gdt_descr[cpu]); - - /* - * Delete NT - */ - __asm__("pushfl ; andl $0xffffbfff,(%esp) ; popfl"); - - /* - * Set up and load the per-CPU TSS and LDT - */ - atomic_inc(&init_mm.mm_count); - current->active_mm = &init_mm; - if (current->mm) - BUG(); - enter_lazy_tlb(&init_mm, current); - - load_esp0(t, thread); - - load_LDT(&init_mm.context); - - /* Clear %fs and %gs. 
*/ - asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs"); - - /* Clear all 6 debug registers: */ - -#define CD(register) HYPERVISOR_set_debugreg(register, 0) - - CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7); - -#undef CD - - /* - * Force FPU initialization: - */ - current_thread_info()->status = 0; - clear_used_math(); - mxcsr_feature_mask_init(); -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/kernel/cpu/mtrr/Makefile --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/cpu/mtrr/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,16 +0,0 @@ -obj-y := main.o -c-obj-y := if.o - -c-link := - -$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)): $(obj)/mtrr.h - @ln -fsn $(srctree)/arch/i386/kernel/cpu/mtrr/$(notdir $@) $@ - -$(patsubst %.o,$(obj)/%.c,$(obj-y)): $(obj)/mtrr.h - -$(obj)/mtrr.h: - @ln -fsn $(srctree)/arch/i386/kernel/cpu/mtrr/mtrr.h $@ - -obj-y += $(c-obj-y) - -clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link)) diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/kernel/cpu/mtrr/main.c --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/cpu/mtrr/main.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,165 +0,0 @@ -#include <linux/init.h> -#include <linux/proc_fs.h> -#include <linux/ctype.h> -#include <linux/module.h> -#include <linux/seq_file.h> -#include <asm/uaccess.h> - -#include <asm/mtrr.h> -#include "mtrr.h" - -void generic_get_mtrr(unsigned int reg, unsigned long *base, - unsigned int *size, mtrr_type * type) -{ - dom0_op_t op; - - op.cmd = DOM0_READ_MEMTYPE; - op.u.read_memtype.reg = reg; - (void)HYPERVISOR_dom0_op(&op); - - *size = op.u.read_memtype.nr_pfns; - *base = op.u.read_memtype.pfn; - *type = op.u.read_memtype.type; -} - -struct mtrr_ops generic_mtrr_ops = { - .use_intel_if = 1, - .get = generic_get_mtrr, -}; - -struct mtrr_ops *mtrr_if = &generic_mtrr_ops; -unsigned int num_var_ranges; -unsigned int *usage_table; - -void __init set_num_var_ranges(void) -{ - dom0_op_t op; - - for (num_var_ranges = 0; ; num_var_ranges++) { - op.cmd = DOM0_READ_MEMTYPE; - op.u.read_memtype.reg = num_var_ranges; - if (HYPERVISOR_dom0_op(&op) != 0) - break; - } -} - -static void __init init_table(void) -{ - int i, max; - - max = num_var_ranges; - if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL)) - == NULL) { - printk(KERN_ERR "mtrr: could not allocate\n"); - return; - } - for (i = 0; i < max; i++) - usage_table[i] = 0; -} - -int mtrr_add_page(unsigned long base, unsigned long size, - unsigned int type, char increment) -{ - int error; - dom0_op_t op; - - op.cmd = DOM0_ADD_MEMTYPE; - op.u.add_memtype.pfn = base; - op.u.add_memtype.nr_pfns = size; - op.u.add_memtype.type = type; - if ((error = HYPERVISOR_dom0_op(&op))) - return error; - - if (increment) - ++usage_table[op.u.add_memtype.reg]; - - return op.u.add_memtype.reg; -} - -int -mtrr_add(unsigned long base, unsigned long size, unsigned int type, - char increment) -{ - if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { - printk(KERN_WARNING "mtrr: size and base must be multiples of 4 kiB\n"); - printk(KERN_DEBUG "mtrr: size: 0x%lx base: 0x%lx\n", size, base); - return -EINVAL; - } - return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, - increment); -} - -int mtrr_del_page(int reg, unsigned long base, unsigned long size) -{ - int i, max; - mtrr_type ltype; - unsigned long lbase; - unsigned int lsize; - int error = -EINVAL; - dom0_op_t op; - - max = 
num_var_ranges; - if (reg < 0) { - /* Search for existing MTRR */ - for (i = 0; i < max; ++i) { - mtrr_if->get(i, &lbase, &lsize, <ype); - if (lbase == base && lsize == size) { - reg = i; - break; - } - } - if (reg < 0) { - printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base, - size); - goto out; - } - } - if (usage_table[reg] < 1) { - printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg); - goto out; - } - if (--usage_table[reg] < 1) { - op.cmd = DOM0_DEL_MEMTYPE; - op.u.del_memtype.handle = 0; - op.u.add_memtype.reg = reg; - (void)HYPERVISOR_dom0_op(&op); - } - error = reg; - out: - return error; -} - -int -mtrr_del(int reg, unsigned long base, unsigned long size) -{ - if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { - printk(KERN_INFO "mtrr: size and base must be multiples of 4 kiB\n"); - printk(KERN_DEBUG "mtrr: size: 0x%lx base: 0x%lx\n", size, base); - return -EINVAL; - } - return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); -} - -EXPORT_SYMBOL(mtrr_add); -EXPORT_SYMBOL(mtrr_del); - -static int __init mtrr_init(void) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - - if (!(xen_start_info.flags & SIF_PRIVILEGED)) - return -ENODEV; - - if ((!cpu_has(c, X86_FEATURE_MTRR)) && - (!cpu_has(c, X86_FEATURE_K6_MTRR)) && - (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) && - (!cpu_has(c, X86_FEATURE_CENTAUR_MCR))) - return -ENODEV; - - set_num_var_ranges(); - init_table(); - - return 0; -} - -subsys_initcall(mtrr_init); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/kernel/entry.S --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/entry.S Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,1047 +0,0 @@ -/* - * linux/arch/i386/entry.S - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -/* - * entry.S contains the system-call and fault low-level handling routines. - * This also contains the timer-interrupt handler, as well as all interrupts - * and faults that can result in a task-switch. - * - * NOTE: This code handles signal-recognition, which happens every time - * after a timer-interrupt and after each system call. - * - * I changed all the .align's to 4 (16 byte alignment), as that's faster - * on a 486. - * - * Stack layout in 'ret_from_system_call': - * ptrace needs to have all regs on the stack. - * if the order here is changed, it needs to be - * updated in fork.c:copy_process, signal.c:do_signal, - * ptrace.c and ptrace.h - * - * 0(%esp) - %ebx - * 4(%esp) - %ecx - * 8(%esp) - %edx - * C(%esp) - %esi - * 10(%esp) - %edi - * 14(%esp) - %ebp - * 18(%esp) - %eax - * 1C(%esp) - %ds - * 20(%esp) - %es - * 24(%esp) - orig_eax - * 28(%esp) - %eip - * 2C(%esp) - %cs - * 30(%esp) - %eflags - * 34(%esp) - %oldesp - * 38(%esp) - %oldss - * - * "current" is in register %ebx during any slow entries. - */ - -#include <linux/config.h> -#include <linux/linkage.h> -#include <asm/thread_info.h> -#include <asm/errno.h> -#include <asm/segment.h> -#include <asm/smp.h> -#include <asm/page.h> -#include "irq_vectors.h" -#include <asm-xen/xen-public/xen.h> - -#define nr_syscalls ((syscall_table_size)/4) - -EBX = 0x00 -ECX = 0x04 -EDX = 0x08 -ESI = 0x0C -EDI = 0x10 -EBP = 0x14 -EAX = 0x18 -DS = 0x1C -ES = 0x20 -ORIG_EAX = 0x24 -EIP = 0x28 -CS = 0x2C -EVENT_MASK = 0x2E -EFLAGS = 0x30 -OLDESP = 0x34 -OLDSS = 0x38 - -CF_MASK = 0x00000001 -TF_MASK = 0x00000100 -IF_MASK = 0x00000200 -DF_MASK = 0x00000400 -NT_MASK = 0x00004000 -VM_MASK = 0x00020000 - -/* Offsets into shared_info_t. 
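The bare offsets defined next correspond to the per-vcpu event-channel
fields of the Xen shared info page. In C terms, the layout being assumed
looks roughly like this (field names per the Xen public headers of this
era; the padding is inferred from sizeof_vcpu_shift = 3, i.e. an 8-byte
per-vcpu stride, so treat this as a sketch):

    #include <stdint.h>

    /* Sketch of the layout the asm offsets below assume. */
    struct vcpu_info_sketch {
            uint8_t evtchn_upcall_pending;  /* offset 0 */
            uint8_t evtchn_upcall_mask;     /* offset 1 */
            uint8_t pad[6];                 /* pad to the 8-byte stride */
    };

    /* C equivalent of XEN_LOCKED_BLOCK_EVENTS below: masking upcalls
     * is the paravirtualized guest's stand-in for "cli". */
    static inline void block_events(volatile struct vcpu_info_sketch *v)
    {
            v->evtchn_upcall_mask = 1;
    }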
*/ -#define evtchn_upcall_pending /* 0 */ -#define evtchn_upcall_mask 1 - -#define sizeof_vcpu_shift 3 - -#ifdef CONFIG_SMP -#define preempt_disable(reg) incl TI_preempt_count(reg) -#define preempt_enable(reg) decl TI_preempt_count(reg) -#define XEN_GET_VCPU_INFO(reg) preempt_disable(%ebp) ; \ - movl TI_cpu(%ebp),reg ; \ - shl $sizeof_vcpu_shift,reg ; \ - addl HYPERVISOR_shared_info,reg -#define XEN_PUT_VCPU_INFO(reg) preempt_enable(%ebp) -#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff -#else -#define XEN_GET_VCPU_INFO(reg) movl HYPERVISOR_shared_info,reg -#define XEN_PUT_VCPU_INFO(reg) -#define XEN_PUT_VCPU_INFO_fixup -#endif - -#define XEN_LOCKED_BLOCK_EVENTS(reg) movb $1,evtchn_upcall_mask(reg) -#define XEN_LOCKED_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg) -#define XEN_BLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \ - XEN_LOCKED_BLOCK_EVENTS(reg) ; \ - XEN_PUT_VCPU_INFO(reg) -#define XEN_UNBLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \ - XEN_LOCKED_UNBLOCK_EVENTS(reg) ; \ - XEN_PUT_VCPU_INFO(reg) -#define XEN_TEST_PENDING(reg) testb $0xFF,evtchn_upcall_pending(reg) - -#ifdef CONFIG_PREEMPT -#define preempt_stop GET_THREAD_INFO(%ebp) ; \ - XEN_BLOCK_EVENTS(%esi) -#else -#define preempt_stop -#define resume_kernel restore_all -#endif - -#define SAVE_ALL \ - cld; \ - pushl %es; \ - pushl %ds; \ - pushl %eax; \ - pushl %ebp; \ - pushl %edi; \ - pushl %esi; \ - pushl %edx; \ - pushl %ecx; \ - pushl %ebx; \ - movl $(__USER_DS), %edx; \ - movl %edx, %ds; \ - movl %edx, %es; - -#define RESTORE_INT_REGS \ - popl %ebx; \ - popl %ecx; \ - popl %edx; \ - popl %esi; \ - popl %edi; \ - popl %ebp; \ - popl %eax - -#define RESTORE_REGS \ - RESTORE_INT_REGS; \ -1: popl %ds; \ -2: popl %es; \ -.section .fixup,"ax"; \ -3: movl $0,(%esp); \ - jmp 1b; \ -4: movl $0,(%esp); \ - jmp 2b; \ -.previous; \ -.section __ex_table,"a";\ - .align 4; \ - .long 1b,3b; \ - .long 2b,4b; \ -.previous - - -#define RESTORE_ALL \ - RESTORE_REGS \ - addl $4, %esp; \ -1: iret; \ -.section .fixup,"ax"; \ -2: movl $(__USER_DS), %edx; \ - movl %edx, %ds; \ - movl %edx, %es; \ - movl $11,%eax; \ - call do_exit; \ -.previous; \ -.section __ex_table,"a";\ - .align 4; \ - .long 1b,2b; \ -.previous - - -ENTRY(ret_from_fork) - pushl %eax - call schedule_tail - GET_THREAD_INFO(%ebp) - popl %eax - jmp syscall_exit - -/* - * Return to user mode is not as complex as all this looks, - * but we want the default path for a system call return to - * go as quickly as possible which is why some of this is - * less clear than it otherwise should be. - */ - - # userspace resumption stub bypassing syscall exit tracing - ALIGN -ret_from_exception: - preempt_stop -ret_from_intr: - GET_THREAD_INFO(%ebp) - movl EFLAGS(%esp), %eax # mix EFLAGS and CS - movb CS(%esp), %al - testl $(VM_MASK | 2), %eax - jz resume_kernel # returning to kernel or vm86-space -ENTRY(resume_userspace) - XEN_BLOCK_EVENTS(%esi) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret - movl TI_flags(%ebp), %ecx - andl $_TIF_WORK_MASK, %ecx # is there any work to be done on - # int/exception return? - jne work_pending - jmp restore_all - -#ifdef CONFIG_PREEMPT -ENTRY(resume_kernel) - XEN_BLOCK_EVENTS(%esi) - cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? - jnz restore_all -need_resched: - movl TI_flags(%ebp), %ecx # need_resched set ? - testb $_TIF_NEED_RESCHED, %cl - jz restore_all - testb $0xFF,EVENT_MASK(%esp) # interrupts off (exception path) ? 
- jnz restore_all - call preempt_schedule_irq - jmp need_resched -#endif - -/* SYSENTER_RETURN points to after the "sysenter" instruction in - the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ - - # sysenter call handler stub -ENTRY(sysenter_entry) - movl TSS_sysenter_esp0(%esp),%esp -sysenter_past_esp: - sti - pushl $(__USER_DS) - pushl %ebp - pushfl - pushl $(__USER_CS) - pushl $SYSENTER_RETURN - -/* - * Load the potential sixth argument from user stack. - * Careful about security. - */ - cmpl $__PAGE_OFFSET-3,%ebp - jae syscall_fault -1: movl (%ebp),%ebp -.section __ex_table,"a" - .align 4 - .long 1b,syscall_fault -.previous - - pushl %eax - SAVE_ALL - GET_THREAD_INFO(%ebp) - - testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) - jnz syscall_trace_entry - cmpl $(nr_syscalls), %eax - jae syscall_badsys - call *sys_call_table(,%eax,4) - movl %eax,EAX(%esp) - cli - movl TI_flags(%ebp), %ecx - testw $_TIF_ALLWORK_MASK, %cx - jne syscall_exit_work -/* if something modifies registers it must also disable sysexit */ - movl EIP(%esp), %edx - movl OLDESP(%esp), %ecx - xorl %ebp,%ebp - sti - sysexit - - - # system call handler stub -ENTRY(system_call) - pushl %eax # save orig_eax - SAVE_ALL - GET_THREAD_INFO(%ebp) - # system call tracing in operation - testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) - jnz syscall_trace_entry - cmpl $(nr_syscalls), %eax - jae syscall_badsys -syscall_call: - call *sys_call_table(,%eax,4) - movl %eax,EAX(%esp) # store the return value -syscall_exit: - XEN_BLOCK_EVENTS(%esi) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret - movl TI_flags(%ebp), %ecx - testw $_TIF_ALLWORK_MASK, %cx # current->work - jne syscall_exit_work -restore_all: - testl $VM_MASK, EFLAGS(%esp) - jnz resume_vm86 - movb EVENT_MASK(%esp), %al - notb %al # %al == ~saved_mask - XEN_GET_VCPU_INFO(%esi) - andb evtchn_upcall_mask(%esi),%al - andb $1,%al # %al == mask & ~saved_mask - jnz restore_all_enable_events # != 0 => reenable event delivery - XEN_PUT_VCPU_INFO(%esi) - RESTORE_ALL - -resume_vm86: - XEN_UNBLOCK_EVENTS(%esi) - RESTORE_REGS - movl %eax,(%esp) - movl $__HYPERVISOR_switch_vm86,%eax - int $0x82 - ud2 - - # perform work that needs to be done immediately before resumption - ALIGN -work_pending: - testb $_TIF_NEED_RESCHED, %cl - jz work_notifysig -work_resched: - call schedule - XEN_BLOCK_EVENTS(%esi) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret - movl TI_flags(%ebp), %ecx - andl $_TIF_WORK_MASK, %ecx # is there any work to be done other - # than syscall tracing? 
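The restore_all mask test a few lines up is easier to read in C: event
delivery is re-enabled only when events are masked now but the frame
being restored was saved with them unmasked. A sketch with a hypothetical
helper name:

    #include <stdint.h>

    /* mask & ~saved_mask, as computed in %al above: nonzero means we
     * must take the restore_all_enable_events path. */
    static int must_reenable_events(uint8_t saved_mask, uint8_t cur_mask)
    {
            return (~saved_mask & cur_mask & 1) != 0;
    }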
- jz restore_all - testb $_TIF_NEED_RESCHED, %cl - jnz work_resched - -work_notifysig: # deal with pending signals and - # notify-resume requests - testl $VM_MASK, EFLAGS(%esp) - movl %esp, %eax - jne work_notifysig_v86 # returning to kernel-space or - # vm86-space - xorl %edx, %edx - call do_notify_resume - jmp restore_all - - ALIGN -work_notifysig_v86: - pushl %ecx # save ti_flags for do_notify_resume - call save_v86_state # %eax contains pt_regs pointer - popl %ecx - movl %eax, %esp - xorl %edx, %edx - call do_notify_resume - jmp restore_all - - # perform syscall entry tracing - ALIGN -syscall_trace_entry: - movl $-ENOSYS,EAX(%esp) - movl %esp, %eax - xorl %edx,%edx - call do_syscall_trace - movl ORIG_EAX(%esp), %eax - cmpl $(nr_syscalls), %eax - jnae syscall_call - jmp syscall_exit - - # perform syscall exit tracing - ALIGN -syscall_exit_work: - testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl - jz work_pending - XEN_UNBLOCK_EVENTS(%esi) # could let do_syscall_trace() call - # schedule() instead - movl %esp, %eax - movl $1, %edx - call do_syscall_trace - jmp resume_userspace - - ALIGN -syscall_fault: - pushl %eax # save orig_eax - SAVE_ALL - GET_THREAD_INFO(%ebp) - movl $-EFAULT,EAX(%esp) - jmp resume_userspace - - ALIGN -syscall_badsys: - movl $-ENOSYS,EAX(%esp) - jmp resume_userspace - -#if 0 /* XEN */ -/* - * Build the entry stubs and pointer table with - * some assembler magic. - */ -.data -ENTRY(interrupt) -.text - -vector=0 -ENTRY(irq_entries_start) -.rept NR_IRQS - ALIGN -1: pushl $vector-256 - jmp common_interrupt -.data - .long 1b -.text -vector=vector+1 -.endr - - ALIGN -common_interrupt: - SAVE_ALL - movl %esp,%eax - call do_IRQ - jmp ret_from_intr - -#define BUILD_INTERRUPT(name, nr) \ -ENTRY(name) \ - pushl $nr-256; \ - SAVE_ALL \ - movl %esp,%eax; \ - call smp_/**/name; \ - jmp ret_from_intr; - -/* The include is where all of the SMP etc. interrupts come from */ -#include "entry_arch.h" -#endif /* XEN */ - -ENTRY(divide_error) - pushl $0 # no error code - pushl $do_divide_error - ALIGN -error_code: - pushl %ds - pushl %eax - xorl %eax, %eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - decl %eax # eax = -1 - pushl %ecx - pushl %ebx - cld - movl %es, %ecx - movl ES(%esp), %edi # get the function address - movl ORIG_EAX(%esp), %edx # get the error code - movl %eax, ORIG_EAX(%esp) - movl %ecx, ES(%esp) - movl $(__USER_DS), %ecx - movl %ecx, %ds - movl %ecx, %es - movl %esp,%eax # pt_regs pointer - call *%edi - jmp ret_from_exception - -# A note on the "critical region" in our callback handler. -# We want to avoid stacking callback handlers due to events occurring -# during handling of the last event. To do this, we keep events disabled -# until we've done all processing. HOWEVER, we must enable events before -# popping the stack frame (can't be done atomically) and so it would still -# be possible to get enough handler activations to overflow the stack. -# Although unlikely, bugs of that kind are hard to track down, so we'd -# like to avoid the possibility. -# So, on entry to the handler we detect whether we interrupted an -# existing activation in its critical region -- if so, we pop the current -# activation and restart the handler using the previous one.
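
As a reading aid for the note above: the handler below exports the region bounds as the labels scrit/ecrit and consults critical_fixup_table when it detects a nested upcall. A minimal C sketch of that entry-time check follows; the helper names (in_critical_region, bytes_already_popped) are hypothetical, purely illustrative, and appear nowhere in the deleted file.

/* Illustrative sketch only -- hypothetical helpers, not part of this patch.
 * Models the decision taken at the top of hypervisor_callback: if the
 * interrupted EIP lies inside [scrit, ecrit), the partially-popped frame
 * is merged with the new one (the table records how many bytes the
 * interrupted activation had already popped); otherwise events are
 * processed normally.
 */
extern char scrit[], ecrit[];                /* critical region bounds (asm labels) */
extern unsigned char critical_fixup_table[]; /* bytes already popped, per offset */

static inline int in_critical_region(unsigned long eip)
{
	return eip >= (unsigned long)scrit && eip < (unsigned long)ecrit;
}

static inline unsigned long bytes_already_popped(unsigned long eip)
{
	/* Indexed by byte offset into the critical region; 0xff entries
	 * flag the vcpu_info sub-region, which is aborted rather than
	 * merged (see critical_region_fixup below). */
	return critical_fixup_table[eip - (unsigned long)scrit];
}

The assembly's critical_region_fixup performs the same lookup and then copies the already-saved words down the stack so the two frames merge into a single activation.
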
-ENTRY(hypervisor_callback) - pushl %eax - SAVE_ALL - movl EIP(%esp),%eax - cmpl $scrit,%eax - jb 11f - cmpl $ecrit,%eax - jb critical_region_fixup -11: push %esp - call evtchn_do_upcall - add $4,%esp - jmp ret_from_intr - - ALIGN -restore_all_enable_events: - XEN_LOCKED_UNBLOCK_EVENTS(%esi) -scrit: /**** START OF CRITICAL REGION ****/ - XEN_TEST_PENDING(%esi) - jnz 14f # process more events if necessary... - XEN_PUT_VCPU_INFO(%esi) - RESTORE_ALL -14: XEN_LOCKED_BLOCK_EVENTS(%esi) - XEN_PUT_VCPU_INFO(%esi) - jmp 11b -ecrit: /**** END OF CRITICAL REGION ****/ -# [How we do the fixup]. We want to merge the current stack frame with the -# just-interrupted frame. How we do this depends on where in the critical -# region the interrupted handler was executing, and so how many saved -# registers are in each frame. We do this quickly using the lookup table -# 'critical_fixup_table'. For each byte offset in the critical region, it -# provides the number of bytes which have already been popped from the -# interrupted stack frame. -critical_region_fixup: - addl $critical_fixup_table-scrit,%eax - movzbl (%eax),%eax # %eax contains num bytes popped - cmpb $0xff,%al # 0xff => vcpu_info critical region - jne 15f - GET_THREAD_INFO(%ebp) - XEN_PUT_VCPU_INFO(%esi) # abort vcpu_info critical region - xorl %eax,%eax -15: mov %esp,%esi - add %eax,%esi # %esi points at end of src region - mov %esp,%edi - add $0x34,%edi # %edi points at end of dst region - mov %eax,%ecx - shr $2,%ecx # convert bytes to words - je 17f # skip loop if nothing to copy -16: subl $4,%esi # pre-decrementing copy loop - subl $4,%edi - movl (%esi),%eax - movl %eax,(%edi) - loop 16b -17: movl %edi,%esp # final %edi is top of merged stack - jmp 11b - -critical_fixup_table: - .byte 0xff,0xff,0xff # testb $0xff,(%esi) = XEN_TEST_PENDING - .byte 0xff,0xff # jnz 14f - XEN_PUT_VCPU_INFO_fixup - .byte 0x00 # pop %ebx - .byte 0x04 # pop %ecx - .byte 0x08 # pop %edx - .byte 0x0c # pop %esi - .byte 0x10 # pop %edi - .byte 0x14 # pop %ebp - .byte 0x18 # pop %eax - .byte 0x1c # pop %ds - .byte 0x20 # pop %es - .byte 0x24,0x24,0x24 # add $4,%esp - .byte 0x28 # iret - .byte 0xff,0xff,0xff,0xff # movb $1,1(%esi) - XEN_PUT_VCPU_INFO_fixup - .byte 0x00,0x00 # jmp 11b - -# Hypervisor uses this for application faults while it executes. -ENTRY(failsafe_callback) -1: popl %ds -2: popl %es -3: popl %fs -4: popl %gs - subl $4,%esp - SAVE_ALL - jmp ret_from_exception -.section .fixup,"ax"; \ -6: movl $0,(%esp); \ - jmp 1b; \ -7: movl $0,(%esp); \ - jmp 2b; \ -8: movl $0,(%esp); \ - jmp 3b; \ -9: movl $0,(%esp); \ - jmp 4b; \ -.previous; \ -.section __ex_table,"a";\ - .align 4; \ - .long 1b,6b; \ - .long 2b,7b; \ - .long 3b,8b; \ - .long 4b,9b; \ -.previous - -ENTRY(coprocessor_error) - pushl $0 - pushl $do_coprocessor_error - jmp error_code - -ENTRY(simd_coprocessor_error) - pushl $0 - pushl $do_simd_coprocessor_error - jmp error_code - -ENTRY(device_not_available) - pushl $-1 # mark this as an int - SAVE_ALL - preempt_stop - call math_state_restore - jmp ret_from_exception - -/* - * Debug traps and NMI can happen at the one SYSENTER instruction - * that sets up the real kernel stack. Check here, since we can't - * allow the wrong stack to be used. - * - * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have - * already pushed 3 words if it hits on the sysenter instruction: - * eflags, cs and eip.
- * - * We just load the right stack, and push the three (known) values - * by hand onto the new stack - while updating the return eip past - * the instruction that would have done it for sysenter. - */ -#define FIX_STACK(offset, ok, label) \ - cmpw $__KERNEL_CS,4(%esp); \ - jne ok; \ -label: \ - movl TSS_sysenter_esp0+offset(%esp),%esp; \ - pushfl; \ - pushl $__KERNEL_CS; \ - pushl $sysenter_past_esp - -ENTRY(debug) - cmpl $sysenter_entry,(%esp) - jne debug_stack_correct - FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) -debug_stack_correct: - pushl $-1 # mark this as an int - SAVE_ALL - xorl %edx,%edx # error code 0 - movl %esp,%eax # pt_regs pointer - call do_debug - testl %eax,%eax - jnz restore_all - jmp ret_from_exception - -#if 0 /* XEN */ -/* - * NMI is doubly nasty. It can happen _while_ we're handling - * a debug fault, and the debug fault hasn't yet been able to - * clear up the stack. So we first check whether we got an - * NMI on the sysenter entry path, but after that we need to - * check whether we got an NMI on the debug path where the debug - * fault happened on the sysenter path. - */ -ENTRY(nmi) - cmpl $sysenter_entry,(%esp) - je nmi_stack_fixup - pushl %eax - movl %esp,%eax - /* Do not access memory above the end of our stack page, - * it might not exist. - */ - andl $(THREAD_SIZE-1),%eax - cmpl $(THREAD_SIZE-20),%eax - popl %eax - jae nmi_stack_correct - cmpl $sysenter_entry,12(%esp) - je nmi_debug_stack_check -nmi_stack_correct: - pushl %eax - SAVE_ALL - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_nmi - RESTORE_ALL - -nmi_stack_fixup: - FIX_STACK(12,nmi_stack_correct, 1) - jmp nmi_stack_correct -nmi_debug_stack_check: - cmpw $__KERNEL_CS,16(%esp) - jne nmi_stack_correct - cmpl $debug - 1,(%esp) - jle nmi_stack_correct - cmpl $debug_esp_fix_insn,(%esp) - jle nmi_debug_stack_fixup -nmi_debug_stack_fixup: - FIX_STACK(24,nmi_stack_correct, 1) - jmp nmi_stack_correct -#endif /* XEN */ - -ENTRY(int3) - pushl $-1 # mark this as an int - SAVE_ALL - xorl %edx,%edx # zero error code - movl %esp,%eax # pt_regs pointer - call do_int3 - testl %eax,%eax - jnz restore_all - jmp ret_from_exception - -ENTRY(overflow) - pushl $0 - pushl $do_overflow - jmp error_code - -ENTRY(bounds) - pushl $0 - pushl $do_bounds - jmp error_code - -ENTRY(invalid_op) - pushl $0 - pushl $do_invalid_op - jmp error_code - -ENTRY(coprocessor_segment_overrun) - pushl $0 - pushl $do_coprocessor_segment_overrun - jmp error_code - -ENTRY(invalid_TSS) - pushl $do_invalid_TSS - jmp error_code - -ENTRY(segment_not_present) - pushl $do_segment_not_present - jmp error_code - -ENTRY(stack_segment) - pushl $do_stack_segment - jmp error_code - -ENTRY(general_protection) - pushl $do_general_protection - jmp error_code - -ENTRY(alignment_check) - pushl $do_alignment_check - jmp error_code - -# This handler is special, because it gets an extra value on its stack, -# which is the linear faulting address. 
-# fastcall register usage: %eax = pt_regs, %edx = error code, -# %ecx = fault address -ENTRY(page_fault) - pushl %ds - pushl %eax - xorl %eax, %eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - decl %eax /* eax = -1 */ - pushl %ecx - pushl %ebx - cld - movl %es,%edi - movl ES(%esp), %ecx /* get the faulting address */ - movl ORIG_EAX(%esp), %edx /* get the error code */ - movl %eax, ORIG_EAX(%esp) - movl %edi, ES(%esp) - movl $(__KERNEL_DS),%eax - movl %eax, %ds - movl %eax, %es - movl %esp,%eax /* pt_regs pointer */ - call do_page_fault - jmp ret_from_exception - -#ifdef CONFIG_X86_MCE -ENTRY(machine_check) - pushl $0 - pushl machine_check_vector - jmp error_code -#endif - -ENTRY(fixup_4gb_segment) - pushl $do_fixup_4gb_segment - jmp error_code - -.data -ENTRY(sys_call_table) - .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ - .long sys_exit - .long sys_fork - .long sys_read - .long sys_write - .long sys_open /* 5 */ - .long sys_close - .long sys_waitpid - .long sys_creat - .long sys_link - .long sys_unlink /* 10 */ - .long sys_execve - .long sys_chdir - .long sys_time - .long sys_mknod - .long sys_chmod /* 15 */ - .long sys_lchown16 - .long sys_ni_syscall /* old break syscall holder */ - .long sys_stat - .long sys_lseek - .long sys_getpid /* 20 */ - .long sys_mount - .long sys_oldumount - .long sys_setuid16 - .long sys_getuid16 - .long sys_stime /* 25 */ - .long sys_ptrace - .long sys_alarm - .long sys_fstat - .long sys_pause - .long sys_utime /* 30 */ - .long sys_ni_syscall /* old stty syscall holder */ - .long sys_ni_syscall /* old gtty syscall holder */ - .long sys_access - .long sys_nice - .long sys_ni_syscall /* 35 - old ftime syscall holder */ - .long sys_sync - .long sys_kill - .long sys_rename - .long sys_mkdir - .long sys_rmdir /* 40 */ - .long sys_dup - .long sys_pipe - .long sys_times - .long sys_ni_syscall /* old prof syscall holder */ - .long sys_brk /* 45 */ - .long sys_setgid16 - .long sys_getgid16 - .long sys_signal - .long sys_geteuid16 - .long sys_getegid16 /* 50 */ - .long sys_acct - .long sys_umount /* recycled never used phys() */ - .long sys_ni_syscall /* old lock syscall holder */ - .long sys_ioctl - .long sys_fcntl /* 55 */ - .long sys_ni_syscall /* old mpx syscall holder */ - .long sys_setpgid - .long sys_ni_syscall /* old ulimit syscall holder */ - .long sys_olduname - .long sys_umask /* 60 */ - .long sys_chroot - .long sys_ustat - .long sys_dup2 - .long sys_getppid - .long sys_getpgrp /* 65 */ - .long sys_setsid - .long sys_sigaction - .long sys_sgetmask - .long sys_ssetmask - .long sys_setreuid16 /* 70 */ - .long sys_setregid16 - .long sys_sigsuspend - .long sys_sigpending - .long sys_sethostname - .long sys_setrlimit /* 75 */ - .long sys_old_getrlimit - .long sys_getrusage - .long sys_gettimeofday - .long sys_settimeofday - .long sys_getgroups16 /* 80 */ - .long sys_setgroups16 - .long old_select - .long sys_symlink - .long sys_lstat - .long sys_readlink /* 85 */ - .long sys_uselib - .long sys_swapon - .long sys_reboot - .long old_readdir - .long old_mmap /* 90 */ - .long sys_munmap - .long sys_truncate - .long sys_ftruncate - .long sys_fchmod - .long sys_fchown16 /* 95 */ - .long sys_getpriority - .long sys_setpriority - .long sys_ni_syscall /* old profil syscall holder */ - .long sys_statfs - .long sys_fstatfs /* 100 */ - .long sys_ioperm - .long sys_socketcall - .long sys_syslog - .long sys_setitimer - .long sys_getitimer /* 105 */ - .long sys_newstat - .long sys_newlstat - .long sys_newfstat - .long sys_uname - 
.long sys_iopl /* 110 */ - .long sys_vhangup - .long sys_ni_syscall /* old "idle" system call */ - .long sys_vm86old - .long sys_wait4 - .long sys_swapoff /* 115 */ - .long sys_sysinfo - .long sys_ipc - .long sys_fsync - .long sys_sigreturn - .long sys_clone /* 120 */ - .long sys_setdomainname - .long sys_newuname - .long sys_modify_ldt - .long sys_adjtimex - .long sys_mprotect /* 125 */ - .long sys_sigprocmask - .long sys_ni_syscall /* old "create_module" */ - .long sys_init_module - .long sys_delete_module - .long sys_ni_syscall /* 130: old "get_kernel_syms" */ - .long sys_quotactl - .long sys_getpgid - .long sys_fchdir - .long sys_bdflush - .long sys_sysfs /* 135 */ - .long sys_personality - .long sys_ni_syscall /* reserved for afs_syscall */ - .long sys_setfsuid16 - .long sys_setfsgid16 - .long sys_llseek /* 140 */ - .long sys_getdents - .long sys_select - .long sys_flock - .long sys_msync - .long sys_readv /* 145 */ - .long sys_writev - .long sys_getsid - .long sys_fdatasync - .long sys_sysctl - .long sys_mlock /* 150 */ - .long sys_munlock - .long sys_mlockall - .long sys_munlockall - .long sys_sched_setparam - .long sys_sched_getparam /* 155 */ - .long sys_sched_setscheduler - .long sys_sched_getscheduler - .long sys_sched_yield - .long sys_sched_get_priority_max - .long sys_sched_get_priority_min /* 160 */ - .long sys_sched_rr_get_interval - .long sys_nanosleep - .long sys_mremap - .long sys_setresuid16 - .long sys_getresuid16 /* 165 */ - .long sys_vm86 - .long sys_ni_syscall /* Old sys_query_module */ - .long sys_poll - .long sys_nfsservctl - .long sys_setresgid16 /* 170 */ - .long sys_getresgid16 - .long sys_prctl - .long sys_rt_sigreturn - .long sys_rt_sigaction - .long sys_rt_sigprocmask /* 175 */ - .long sys_rt_sigpending - .long sys_rt_sigtimedwait - .long sys_rt_sigqueueinfo - .long sys_rt_sigsuspend - .long sys_pread64 /* 180 */ - .long sys_pwrite64 - .long sys_chown16 - .long sys_getcwd - .long sys_capget - .long sys_capset /* 185 */ - .long sys_sigaltstack - .long sys_sendfile - .long sys_ni_syscall /* reserved for streams1 */ - .long sys_ni_syscall /* reserved for streams2 */ - .long sys_vfork /* 190 */ - .long sys_getrlimit - .long sys_mmap2 - .long sys_truncate64 - .long sys_ftruncate64 - .long sys_stat64 /* 195 */ - .long sys_lstat64 - .long sys_fstat64 - .long sys_lchown - .long sys_getuid - .long sys_getgid /* 200 */ - .long sys_geteuid - .long sys_getegid - .long sys_setreuid - .long sys_setregid - .long sys_getgroups /* 205 */ - .long sys_setgroups - .long sys_fchown - .long sys_setresuid - .long sys_getresuid - .long sys_setresgid /* 210 */ - .long sys_getresgid - .long sys_chown - .long sys_setuid - .long sys_setgid - .long sys_setfsuid /* 215 */ - .long sys_setfsgid - .long sys_pivot_root - .long sys_mincore - .long sys_madvise - .long sys_getdents64 /* 220 */ - .long sys_fcntl64 - .long sys_ni_syscall /* reserved for TUX */ - .long sys_ni_syscall - .long sys_gettid - .long sys_readahead /* 225 */ - .long sys_setxattr - .long sys_lsetxattr - .long sys_fsetxattr - .long sys_getxattr - .long sys_lgetxattr /* 230 */ - .long sys_fgetxattr - .long sys_listxattr - .long sys_llistxattr - .long sys_flistxattr - .long sys_removexattr /* 235 */ - .long sys_lremovexattr - .long sys_fremovexattr - .long sys_tkill - .long sys_sendfile64 - .long sys_futex /* 240 */ - .long sys_sched_setaffinity - .long sys_sched_getaffinity - .long sys_set_thread_area - .long sys_get_thread_area - .long sys_io_setup /* 245 */ - .long sys_io_destroy - .long sys_io_getevents - .long 
sys_io_submit - .long sys_io_cancel - .long sys_fadvise64 /* 250 */ - .long sys_ni_syscall - .long sys_exit_group - .long sys_lookup_dcookie - .long sys_epoll_create - .long sys_epoll_ctl /* 255 */ - .long sys_epoll_wait - .long sys_remap_file_pages - .long sys_set_tid_address - .long sys_timer_create - .long sys_timer_settime /* 260 */ - .long sys_timer_gettime - .long sys_timer_getoverrun - .long sys_timer_delete - .long sys_clock_settime - .long sys_clock_gettime /* 265 */ - .long sys_clock_getres - .long sys_clock_nanosleep - .long sys_statfs64 - .long sys_fstatfs64 - .long sys_tgkill /* 270 */ - .long sys_utimes - .long sys_fadvise64_64 - .long sys_ni_syscall /* sys_vserver */ - .long sys_mbind - .long sys_get_mempolicy - .long sys_set_mempolicy - .long sys_mq_open - .long sys_mq_unlink - .long sys_mq_timedsend - .long sys_mq_timedreceive /* 280 */ - .long sys_mq_notify - .long sys_mq_getsetattr - .long sys_ni_syscall /* reserved for kexec */ - .long sys_waitid - .long sys_ni_syscall /* 285 */ /* available */ - .long sys_add_key - .long sys_request_key - .long sys_keyctl - -syscall_table_size=(.-sys_call_table) diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/kernel/head.S --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/head.S Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,186 +0,0 @@ - -#include <linux/config.h> - -.section __xen_guest - .ascii "GUEST_OS=linux,GUEST_VER=2.6" - .ascii ",XEN_VER=3.0" - .ascii ",VIRT_BASE=0xC0000000" - .ascii ",LOADER=generic" - .byte 0 - -.text -#include <linux/threads.h> -#include <linux/linkage.h> -#include <asm/segment.h> -#include <asm/thread_info.h> -#include <asm/asm_offsets.h> -#include <asm-xen/xen-public/arch-x86_32.h> - -/* - * References to members of the new_cpu_data structure. - */ - -#define X86 new_cpu_data+CPUINFO_x86 -#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor -#define X86_MODEL new_cpu_data+CPUINFO_x86_model -#define X86_MASK new_cpu_data+CPUINFO_x86_mask -#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math -#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level -#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability -#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id - -ENTRY(startup_32) - cld - - /* Copy the necessary stuff from xen_start_info structure. 
*/ - mov $xen_start_info_union,%edi - mov $512,%ecx - rep movsl - -#ifdef CONFIG_SMP -ENTRY(startup_32_smp) - cld -#endif /* CONFIG_SMP */ - - /* Set up the stack pointer */ - lss stack_start,%esp - -checkCPUtype: - - /* get vendor info */ - xorl %eax,%eax # call CPUID with 0 -> return vendor ID - cpuid - movl %eax,X86_CPUID # save CPUID level - movl %ebx,X86_VENDOR_ID # lo 4 chars - movl %edx,X86_VENDOR_ID+4 # next 4 chars - movl %ecx,X86_VENDOR_ID+8 # last 4 chars - - movl $1,%eax # Use the CPUID instruction to get CPU type - cpuid - movb %al,%cl # save reg for future use - andb $0x0f,%ah # mask processor family - movb %ah,X86 - andb $0xf0,%al # mask model - shrb $4,%al - movb %al,X86_MODEL - andb $0x0f,%cl # mask mask revision - movb %cl,X86_MASK - movl %edx,X86_CAPABILITY - - incb ready - - xorl %eax,%eax # Clear FS/GS and LDT - movl %eax,%fs - movl %eax,%gs - cld # gcc2 wants the direction flag cleared at all times - -#ifdef CONFIG_SMP - movb ready, %cl - cmpb $1,%cl - je 1f # the first CPU calls start_kernel - # all other CPUs call initialize_secondary - call initialize_secondary - jmp L6 -1: -#endif /* CONFIG_SMP */ - call start_kernel -L6: - jmp L6 # main should never return here, but - # just in case, we know what happens. - -ENTRY(lgdt_finish) - movl $(__KERNEL_DS),%eax # reload all the segment registers - movw %ax,%ss # after changing gdt. - - movl $(__USER_DS),%eax # DS/ES contains default USER segment - movw %ax,%ds - movw %ax,%es - - popl %eax # reload CS by intersegment return - pushl $(__KERNEL_CS) - pushl %eax - lret - -ENTRY(stack_start) - .long init_thread_union+THREAD_SIZE - .long __BOOT_DS - -ready: .byte 0 - -.globl idt_descr -.globl cpu_gdt_descr - - ALIGN - .word 0 # 32-bit align idt_desc.address -idt_descr: - .word IDT_ENTRIES*8-1 # idt contains 256 entries - .long idt_table - -# boot GDT descriptor (later on used by CPU#0): - .word 0 # 32 bit align gdt_desc.address -cpu_gdt_descr: - .word GDT_SIZE - .long cpu_gdt_table - - .fill NR_CPUS-1,8,0 # space for the other GDT descriptors - -.org 0x1000 -ENTRY(empty_zero_page) - -.org 0x2000 -ENTRY(swapper_pg_dir) - -.org 0x3000 -ENTRY(cpu_gdt_table) - .quad 0x0000000000000000 /* NULL descriptor */ - .quad 0x0000000000000000 /* 0x0b reserved */ - .quad 0x0000000000000000 /* 0x13 reserved */ - .quad 0x0000000000000000 /* 0x1b reserved */ - .quad 0x0000000000000000 /* 0x20 unused */ - .quad 0x0000000000000000 /* 0x28 unused */ - .quad 0x0000000000000000 /* 0x33 TLS entry 1 */ - .quad 0x0000000000000000 /* 0x3b TLS entry 2 */ - .quad 0x0000000000000000 /* 0x43 TLS entry 3 */ - .quad 0x0000000000000000 /* 0x4b reserved */ - .quad 0x0000000000000000 /* 0x53 reserved */ - .quad 0x0000000000000000 /* 0x5b reserved */ - - .quad 0x00cfbb000000c3ff /* 0x60 kernel 4GB code at 0x00000000 */ - .quad 0x00cfb3000000c3ff /* 0x68 kernel 4GB data at 0x00000000 */ - .quad 0x00cffb000000c3ff /* 0x73 user 4GB code at 0x00000000 */ - .quad 0x00cff3000000c3ff /* 0x7b user 4GB data at 0x00000000 */ - - .quad 0x0000000000000000 /* 0x80 TSS descriptor */ - .quad 0x0000000000000000 /* 0x88 LDT descriptor */ - - /* Segments used for calling PnP BIOS */ - .quad 0x0000000000000000 /* 0x90 32-bit code */ - .quad 0x0000000000000000 /* 0x98 16-bit code */ - .quad 0x0000000000000000 /* 0xa0 16-bit data */ - .quad 0x0000000000000000 /* 0xa8 16-bit data */ - .quad 0x0000000000000000 /* 0xb0 16-bit data */ - /* - * The APM segments have byte granularity and their bases - * and limits are set at run time. 
- */ - .quad 0x0000000000000000 /* 0xb8 APM CS code */ - .quad 0x0000000000000000 /* 0xc0 APM CS 16 code (16 bit) */ - .quad 0x0000000000000000 /* 0xc8 APM DS data */ - - .quad 0x0000000000000000 /* 0xd0 - unused */ - .quad 0x0000000000000000 /* 0xd8 - unused */ - .quad 0x0000000000000000 /* 0xe0 - unused */ - .quad 0x0000000000000000 /* 0xe8 - unused */ - .quad 0x0000000000000000 /* 0xf0 - unused */ - .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */ - .fill GDT_ENTRIES-32,8,0 - -.org 0x4000 -ENTRY(default_ldt) - -.org 0x5000 -/* - * Real beginning of normal "text" segment - */ -ENTRY(stext) -ENTRY(_stext) diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/kernel/i386_ksyms.c --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/i386_ksyms.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,194 +0,0 @@ -#include <linux/config.h> -#include <linux/module.h> -#include <linux/smp.h> -#include <linux/user.h> -#include <linux/elfcore.h> -#include <linux/mca.h> -#include <linux/sched.h> -#include <linux/in6.h> -#include <linux/interrupt.h> -#include <linux/smp_lock.h> -#include <linux/pm.h> -#include <linux/pci.h> -#include <linux/apm_bios.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/tty.h> -#include <linux/highmem.h> -#include <linux/time.h> - -#include <asm/semaphore.h> -#include <asm/processor.h> -#include <asm/i387.h> -#include <asm/uaccess.h> -#include <asm/checksum.h> -#include <asm/io.h> -#include <asm/delay.h> -#include <asm/irq.h> -#include <asm/mmx.h> -#include <asm/desc.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> -#include <asm/nmi.h> -#include <asm/ist.h> -#include <asm/kdebug.h> - -extern void dump_thread(struct pt_regs *, struct user *); -extern spinlock_t rtc_lock; - -/* This is definitely a GPL-only symbol */ -EXPORT_SYMBOL_GPL(cpu_gdt_table); - -#if defined(CONFIG_APM_MODULE) -extern void machine_real_restart(unsigned char *, int); -EXPORT_SYMBOL(machine_real_restart); -extern void default_idle(void); -EXPORT_SYMBOL(default_idle); -#endif - -#ifdef CONFIG_SMP -extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); -extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); -#endif - -#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE) -extern struct drive_info_struct drive_info; -EXPORT_SYMBOL(drive_info); -#endif - -extern unsigned long cpu_khz; -extern unsigned long get_cmos_time(void); - -/* platform dependent support */ -EXPORT_SYMBOL(boot_cpu_data); -#ifdef CONFIG_DISCONTIGMEM -EXPORT_SYMBOL(node_data); -EXPORT_SYMBOL(physnode_map); -#endif -#ifdef CONFIG_X86_NUMAQ -EXPORT_SYMBOL(xquad_portio); -#endif -EXPORT_SYMBOL(dump_thread); -EXPORT_SYMBOL(dump_fpu); -EXPORT_SYMBOL_GPL(kernel_fpu_begin); -EXPORT_SYMBOL(__ioremap); -EXPORT_SYMBOL(ioremap_nocache); -EXPORT_SYMBOL(iounmap); -EXPORT_SYMBOL(kernel_thread); -EXPORT_SYMBOL(pm_idle); -#ifdef CONFIG_ACPI_BOOT -EXPORT_SYMBOL(pm_power_off); -#endif -EXPORT_SYMBOL(get_cmos_time); -EXPORT_SYMBOL(cpu_khz); -EXPORT_SYMBOL(apm_info); - -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); -/* Networking helper routines. 
*/ -EXPORT_SYMBOL(csum_partial_copy_generic); -/* Delay loops */ -EXPORT_SYMBOL(__ndelay); -EXPORT_SYMBOL(__udelay); -EXPORT_SYMBOL(__delay); -EXPORT_SYMBOL(__const_udelay); - -EXPORT_SYMBOL(__get_user_1); -EXPORT_SYMBOL(__get_user_2); -EXPORT_SYMBOL(__get_user_4); - -EXPORT_SYMBOL(strpbrk); -EXPORT_SYMBOL(strstr); - -EXPORT_SYMBOL(strncpy_from_user); -EXPORT_SYMBOL(__strncpy_from_user); -EXPORT_SYMBOL(clear_user); -EXPORT_SYMBOL(__clear_user); -EXPORT_SYMBOL(__copy_from_user_ll); -EXPORT_SYMBOL(__copy_to_user_ll); -EXPORT_SYMBOL(strnlen_user); - -EXPORT_SYMBOL(dma_alloc_coherent); -EXPORT_SYMBOL(dma_free_coherent); - -#ifdef CONFIG_PCI -EXPORT_SYMBOL(pcibios_penalize_isa_irq); -EXPORT_SYMBOL(pci_mem_start); -#endif - -#ifdef CONFIG_PCI_BIOS -EXPORT_SYMBOL(pcibios_set_irq_routing); -EXPORT_SYMBOL(pcibios_get_irq_routing_table); -#endif - -#ifdef CONFIG_X86_USE_3DNOW -EXPORT_SYMBOL(_mmx_memcpy); -EXPORT_SYMBOL(mmx_clear_page); -EXPORT_SYMBOL(mmx_copy_page); -#endif - -#ifdef CONFIG_X86_HT -EXPORT_SYMBOL(smp_num_siblings); -EXPORT_SYMBOL(cpu_sibling_map); -#endif - -#ifdef CONFIG_SMP -EXPORT_SYMBOL(cpu_data); -EXPORT_SYMBOL(cpu_online_map); -EXPORT_SYMBOL(cpu_callout_map); -EXPORT_SYMBOL(__write_lock_failed); -EXPORT_SYMBOL(__read_lock_failed); - -/* Global SMP stuff */ -EXPORT_SYMBOL(smp_call_function); - -/* TLB flushing */ -EXPORT_SYMBOL(flush_tlb_page); -EXPORT_SYMBOL_GPL(flush_tlb_all); -#endif - -#ifdef CONFIG_X86_IO_APIC -EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); -#endif - -#ifdef CONFIG_MCA -EXPORT_SYMBOL(machine_id); -#endif - -#ifdef CONFIG_VT -EXPORT_SYMBOL(screen_info); -#endif - -EXPORT_SYMBOL(get_wchan); - -EXPORT_SYMBOL(rtc_lock); - -EXPORT_SYMBOL_GPL(set_nmi_callback); -EXPORT_SYMBOL_GPL(unset_nmi_callback); - -#undef memcmp -extern int memcmp(const void *,const void *,__kernel_size_t); -EXPORT_SYMBOL(memcmp); - -EXPORT_SYMBOL(register_die_notifier); -#ifdef CONFIG_HAVE_DEC_LOCK -EXPORT_SYMBOL(_atomic_dec_and_lock); -#endif - -EXPORT_SYMBOL(__PAGE_KERNEL); - -#ifdef CONFIG_HIGHMEM -EXPORT_SYMBOL(kmap); -EXPORT_SYMBOL(kunmap); -EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic); -EXPORT_SYMBOL(kmap_atomic_to_page); -#endif - -#if defined(CONFIG_X86_SPEEDSTEP_SMI) || defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) -EXPORT_SYMBOL(ist_info); -#endif - -EXPORT_SYMBOL(csum_partial); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/kernel/io_apic.c --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/io_apic.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,2611 +0,0 @@ -/* - * Intel IO-APIC support for multi-Pentium hosts. - * - * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo - * - * Many thanks to Stig Venaas for trying out countless experimental - * patches and reporting/debugging problems patiently! - * - * (c) 1999, Multiple IO-APIC support, developed by - * Ken-ichi Yaku <yaku@xxxxxxxxxxxxxxxxxxxx> and - * Hidemi Kishimoto <kisimoto@xxxxxxxxxxxxxxxxxxxx>, - * further tested and cleaned up by Zach Brown <zab@xxxxxxxxxx> - * and Ingo Molnar <mingo@xxxxxxxxxx> - * - * Fixes - * Maciej W. Rozycki : Bits for genuine 82489DX APICs; - * thanks to Eric Gilmore - * and Rolf G. 
Tews - * for testing these extensively - * Paul Diefenbaugh : Added full ACPI support - */ - -#include <linux/mm.h> -#include <linux/irq.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/delay.h> -#include <linux/sched.h> -#include <linux/config.h> -#include <linux/smp_lock.h> -#include <linux/mc146818rtc.h> -#include <linux/compiler.h> -#include <linux/acpi.h> - -#include <linux/sysdev.h> -#include <asm/io.h> -#include <asm/smp.h> -#include <asm/desc.h> -#include <asm/timer.h> - -#include <mach_apic.h> - -#include "io_ports.h" - -#ifdef CONFIG_XEN - -#include <asm-xen/xen-public/xen.h> -#include <asm-xen/xen-public/physdev.h> - -/* Fake i8259 */ -#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq))) -#define disable_8259A_irq(_irq) ((void)0) -#define i8259A_irq_pending(_irq) (0) - -unsigned long io_apic_irqs; - -static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg) -{ - physdev_op_t op; - int ret; - - op.cmd = PHYSDEVOP_APIC_READ; - op.u.apic_op.apic = mp_ioapics[apic].mpc_apicid; - op.u.apic_op.offset = reg; - ret = HYPERVISOR_physdev_op(&op); - if (ret) - return ret; - return op.u.apic_op.value; -} - -static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) -{ - physdev_op_t op; - - op.cmd = PHYSDEVOP_APIC_WRITE; - op.u.apic_op.apic = mp_ioapics[apic].mpc_apicid; - op.u.apic_op.offset = reg; - op.u.apic_op.value = value; - HYPERVISOR_physdev_op(&op); -} - -#define io_apic_read(a,r) xen_io_apic_read(a,r) -#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v) - -#endif /* CONFIG_XEN */ - -int (*ioapic_renumber_irq)(int ioapic, int irq); -atomic_t irq_mis_count; - -static DEFINE_SPINLOCK(ioapic_lock); - -/* - * Is the SiS APIC rmw bug present ? - * -1 = don't know, 0 = no, 1 = yes - */ -int sis_apic_bug = -1; - -/* - * # of IRQ routing registers - */ -int nr_ioapic_registers[MAX_IO_APICS]; - -/* - * Rough estimation of how many shared IRQs there are, can - * be changed anytime. - */ -#define MAX_PLUS_SHARED_IRQS NR_IRQS -#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) - -/* - * This is performance-critical, we want to do it O(1) - * - * the indexing order of this array favors 1:1 mappings - * between pins and IRQs. - */ - -static struct irq_pin_list { - int apic, pin, next; -} irq_2_pin[PIN_MAP_SIZE]; - -int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1}; -#ifdef CONFIG_PCI_MSI -#define vector_to_irq(vector) \ - (platform_legacy_irq(vector) ? vector : vector_irq[vector]) -#else -#define vector_to_irq(vector) (vector) -#endif - -/* - * The common case is 1:1 IRQ<->pin mappings. Sometimes there are - * shared ISA-space IRQs, so we have to support them. We are super - * fast in the common case, and fast for shared ISA-space IRQs. - */ -static void add_pin_to_irq(unsigned int irq, int apic, int pin) -{ - static int first_free_entry = NR_IRQS; - struct irq_pin_list *entry = irq_2_pin + irq; - - while (entry->next) - entry = irq_2_pin + entry->next; - - if (entry->pin != -1) { - entry->next = first_free_entry; - entry = irq_2_pin + entry->next; - if (++first_free_entry >= PIN_MAP_SIZE) - panic("io_apic.c: whoops"); - } - entry->apic = apic; - entry->pin = pin; -} - -#ifndef CONFIG_XEN -/* - * Reroute an IRQ to a different pin. 
- */ -static void __init replace_pin_at_irq(unsigned int irq, - int oldapic, int oldpin, - int newapic, int newpin) -{ - struct irq_pin_list *entry = irq_2_pin + irq; - - while (1) { - if (entry->apic == oldapic && entry->pin == oldpin) { - entry->apic = newapic; - entry->pin = newpin; - } - if (!entry->next) - break; - entry = irq_2_pin + entry->next; - } -} - -static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable) -{ - struct irq_pin_list *entry = irq_2_pin + irq; - unsigned int pin, reg; - - for (;;) { - pin = entry->pin; - if (pin == -1) - break; - reg = io_apic_read(entry->apic, 0x10 + pin*2); - reg &= ~disable; - reg |= enable; - io_apic_modify(entry->apic, 0x10 + pin*2, reg); - if (!entry->next) - break; - entry = irq_2_pin + entry->next; - } -} - -/* mask = 1 */ -static void __mask_IO_APIC_irq (unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0x00010000, 0); -} - -/* mask = 0 */ -static void __unmask_IO_APIC_irq (unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0, 0x00010000); -} - -/* mask = 1, trigger = 0 */ -static void __mask_and_edge_IO_APIC_irq (unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); -} - -/* mask = 0, trigger = 1 */ -static void __unmask_and_level_IO_APIC_irq (unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); -} - -static void mask_IO_APIC_irq (unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - __mask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void unmask_IO_APIC_irq (unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - __unmask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) -{ - struct IO_APIC_route_entry entry; - unsigned long flags; - - /* Check delivery_mode to be sure we're not clearing an SMI pin */ - spin_lock_irqsave(&ioapic_lock, flags); - *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); - if (entry.delivery_mode == dest_SMI) - return; - - /* - * Disable it in the IO-APIC irq-routing table: - */ - memset(&entry, 0, sizeof(entry)); - entry.mask = 1; - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void clear_IO_APIC (void) -{ - int apic, pin; - - for (apic = 0; apic < nr_ioapics; apic++) - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) - clear_IO_APIC_pin(apic, pin); -} - -static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) -{ - unsigned long flags; - int pin; - struct irq_pin_list *entry = irq_2_pin + irq; - unsigned int apicid_value; - - apicid_value = cpu_mask_to_apicid(cpumask); - /* Prepare to do the io_apic_write */ - apicid_value = apicid_value << 24; - spin_lock_irqsave(&ioapic_lock, flags); - for (;;) { - pin = entry->pin; - if (pin == -1) - break; - io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value); - if (!entry->next) - break; - entry = irq_2_pin + entry->next; - } - spin_unlock_irqrestore(&ioapic_lock, flags); -} -#else -#define clear_IO_APIC() ((void)0) -#endif - -#if defined(CONFIG_IRQBALANCE) -# include <asm/processor.h> /* kernel_thread() */ -# include <linux/kernel_stat.h> /* kstat */ -# include <linux/slab.h> /* kmalloc() */ -# 
include <linux/timer.h> /* time_after() */ - -# ifdef CONFIG_BALANCED_IRQ_DEBUG -# define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0) -# define Dprintk(x...) do { TDprintk(x); } while (0) -# else -# define TDprintk(x...) -# define Dprintk(x...) -# endif - -cpumask_t __cacheline_aligned pending_irq_balance_cpumask[NR_IRQS]; - -#define IRQBALANCE_CHECK_ARCH -999 -static int irqbalance_disabled = IRQBALANCE_CHECK_ARCH; -static int physical_balance = 0; - -struct irq_cpu_info { - unsigned long * last_irq; - unsigned long * irq_delta; - unsigned long irq; -} irq_cpu_data[NR_CPUS]; - -#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq) -#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq]) -#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq]) - -#define IDLE_ENOUGH(cpu,now) \ - (idle_cpu(cpu) && ((now) - irq_stat[(cpu)].idle_timestamp > 1)) - -#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask) - -#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i])) - -#define MAX_BALANCED_IRQ_INTERVAL (5*HZ) -#define MIN_BALANCED_IRQ_INTERVAL (HZ/2) -#define BALANCED_IRQ_MORE_DELTA (HZ/10) -#define BALANCED_IRQ_LESS_DELTA (HZ) - -long balanced_irq_interval = MAX_BALANCED_IRQ_INTERVAL; - -static unsigned long move(int curr_cpu, cpumask_t allowed_mask, - unsigned long now, int direction) -{ - int search_idle = 1; - int cpu = curr_cpu; - - goto inside; - - do { - if (unlikely(cpu == curr_cpu)) - search_idle = 0; -inside: - if (direction == 1) { - cpu++; - if (cpu >= NR_CPUS) - cpu = 0; - } else { - cpu--; - if (cpu == -1) - cpu = NR_CPUS-1; - } - } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) || - (search_idle && !IDLE_ENOUGH(cpu,now))); - - return cpu; -} - -static inline void balance_irq(int cpu, int irq) -{ - unsigned long now = jiffies; - cpumask_t allowed_mask; - unsigned int new_cpu; - - if (irqbalance_disabled) - return; - - cpus_and(allowed_mask, cpu_online_map, irq_affinity[irq]); - new_cpu = move(cpu, allowed_mask, now, 1); - if (cpu != new_cpu) { - irq_desc_t *desc = irq_desc + irq; - unsigned long flags; - - spin_lock_irqsave(&desc->lock, flags); - pending_irq_balance_cpumask[irq] = cpumask_of_cpu(new_cpu); - spin_unlock_irqrestore(&desc->lock, flags); - } -} - -static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) -{ - int i, j; - Dprintk("Rotating IRQs among CPUs.\n"); - for (i = 0; i < NR_CPUS; i++) { - for (j = 0; cpu_online(i) && (j < NR_IRQS); j++) { - if (!irq_desc[j].action) - continue; - /* Is it a significant load ? */ - if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) < - useful_load_threshold) - continue; - balance_irq(i, j); - } - } - balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, - balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); - return; -} - -static void do_irq_balance(void) -{ - int i, j; - unsigned long max_cpu_irq = 0, min_cpu_irq = (~0); - unsigned long move_this_load = 0; - int max_loaded = 0, min_loaded = 0; - int load; - unsigned long useful_load_threshold = balanced_irq_interval + 10; - int selected_irq; - int tmp_loaded, first_attempt = 1; - unsigned long tmp_cpu_irq; - unsigned long imbalance = 0; - cpumask_t allowed_mask, target_cpu_mask, tmp; - - for (i = 0; i < NR_CPUS; i++) { - int package_index; - CPU_IRQ(i) = 0; - if (!cpu_online(i)) - continue; - package_index = CPU_TO_PACKAGEINDEX(i); - for (j = 0; j < NR_IRQS; j++) { - unsigned long value_now, delta; - /* Is this an active IRQ? 
*/ - if (!irq_desc[j].action) - continue; - if ( package_index == i ) - IRQ_DELTA(package_index,j) = 0; - /* Determine the total count per processor per IRQ */ - value_now = (unsigned long) kstat_cpu(i).irqs[j]; - - /* Determine the activity per processor per IRQ */ - delta = value_now - LAST_CPU_IRQ(i,j); - - /* Update last_cpu_irq[][] for the next time */ - LAST_CPU_IRQ(i,j) = value_now; - - /* Ignore IRQs whose rate is less than the clock */ - if (delta < useful_load_threshold) - continue; - /* update the load for the processor or package total */ - IRQ_DELTA(package_index,j) += delta; - - /* Keep track of the higher numbered sibling as well */ - if (i != package_index) - CPU_IRQ(i) += delta; - /* - * We have sibling A and sibling B in the package - * - * cpu_irq[A] = load for cpu A + load for cpu B - * cpu_irq[B] = load for cpu B - */ - CPU_IRQ(package_index) += delta; - } - } - /* Find the least loaded processor package */ - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_online(i)) - continue; - if (i != CPU_TO_PACKAGEINDEX(i)) - continue; - if (min_cpu_irq > CPU_IRQ(i)) { - min_cpu_irq = CPU_IRQ(i); - min_loaded = i; - } - } - max_cpu_irq = ULONG_MAX; - -tryanothercpu: - /* Look for the heaviest loaded processor. - * We may come back to get the next heaviest loaded processor. - * Skip processors with trivial loads. - */ - tmp_cpu_irq = 0; - tmp_loaded = -1; - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_online(i)) - continue; - if (i != CPU_TO_PACKAGEINDEX(i)) - continue; - if (max_cpu_irq <= CPU_IRQ(i)) - continue; - if (tmp_cpu_irq < CPU_IRQ(i)) { - tmp_cpu_irq = CPU_IRQ(i); - tmp_loaded = i; - } - } - - if (tmp_loaded == -1) { - /* In the case of a small number of heavy interrupt sources, - * we may end up loading some of the cpus too much. We use Ingo's - * original approach to rotate them around. - */ - if (!first_attempt && imbalance >= useful_load_threshold) { - rotate_irqs_among_cpus(useful_load_threshold); - return; - } - goto not_worth_the_effort; - } - - first_attempt = 0; /* heaviest search */ - max_cpu_irq = tmp_cpu_irq; /* load */ - max_loaded = tmp_loaded; /* processor */ - imbalance = (max_cpu_irq - min_cpu_irq) / 2; - - Dprintk("max_loaded cpu = %d\n", max_loaded); - Dprintk("min_loaded cpu = %d\n", min_loaded); - Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq); - Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq); - Dprintk("load imbalance = %lu\n", imbalance); - - /* if imbalance is less than approx 1/8 (12%) of max load, then - * we are seeing diminishing returns, so quit - */ - if (imbalance < (max_cpu_irq >> 3)) { - Dprintk("Imbalance too trivial\n"); - goto not_worth_the_effort; - } - -tryanotherirq: - /* if we select an IRQ to move that can't go where we want, then - * see if there is another one to try. - */ - move_this_load = 0; - selected_irq = -1; - for (j = 0; j < NR_IRQS; j++) { - /* Is this an active IRQ? */ - if (!irq_desc[j].action) - continue; - if (imbalance <= IRQ_DELTA(max_loaded,j)) - continue; - /* Try to find the IRQ that is closest to the imbalance - * without going over. - */ - if (move_this_load < IRQ_DELTA(max_loaded,j)) { - move_this_load = IRQ_DELTA(max_loaded,j); - selected_irq = j; - } - } - if (selected_irq == -1) { - goto tryanothercpu; - } - - imbalance = move_this_load; - - /* For the physical_balance case, we accumulated both load - * values in one of the siblings' cpu_irq[], - * to use the same code for physical and logical processors - * as much as possible. - * - * NOTE: the cpu_irq[] array holds the sum of the load for - * sibling A and sibling B in the slot for the lowest numbered - * sibling (A), _AND_ the load for sibling B in the slot for - * the higher numbered sibling. - * - * We seek the least loaded sibling by making the comparison - * (A+B)/2 vs B - */
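
A worked instance of the accounting in the NOTE above may help; the interrupt counts are invented purely for illustration:

/* Hypothetical numbers: sibling A handled 700 interrupts, sibling B 300.
 * With the accounting above, the per-cpu array then holds
 *
 *	CPU_IRQ(A) == 700 + 300 == 1000	  (package total, lowest-numbered sibling)
 *	CPU_IRQ(B) == 300		  (B's own load only)
 *
 * The sibling scan below therefore starts from
 *	load = CPU_IRQ(A) >> 1 == 500	  (i.e. (A+B)/2)
 * and, since CPU_IRQ(B) == 300 < 500, selects sibling B as min_loaded.
 */
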
- load = CPU_IRQ(min_loaded) >> 1; - for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) { - if (load > CPU_IRQ(j)) { - /* This won't change cpu_sibling_map[min_loaded] */ - load = CPU_IRQ(j); - min_loaded = j; - } - } - - cpus_and(allowed_mask, cpu_online_map, irq_affinity[selected_irq]); - target_cpu_mask = cpumask_of_cpu(min_loaded); - cpus_and(tmp, target_cpu_mask, allowed_mask); - - if (!cpus_empty(tmp)) { - irq_desc_t *desc = irq_desc + selected_irq; - unsigned long flags; - - Dprintk("irq = %d moved to cpu = %d\n", - selected_irq, min_loaded); - /* mark for destination change */ - spin_lock_irqsave(&desc->lock, flags); - pending_irq_balance_cpumask[selected_irq] = - cpumask_of_cpu(min_loaded); - spin_unlock_irqrestore(&desc->lock, flags); - /* Since we made a change, come back sooner to - * check for more variation. - */ - balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, - balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); - return; - } - goto tryanotherirq; - -not_worth_the_effort: - /* - * if we did not find an IRQ to move, then adjust the time interval - * upward - */ - balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL, - balanced_irq_interval + BALANCED_IRQ_MORE_DELTA); - Dprintk("IRQ worth rotating not found\n"); - return; -} - -static int balanced_irq(void *unused) -{ - int i; - unsigned long prev_balance_time = jiffies; - long time_remaining = balanced_irq_interval; - - daemonize("kirqd"); - - /* push everything to CPU 0 to give us a starting point. */ - for (i = 0 ; i < NR_IRQS ; i++) { - pending_irq_balance_cpumask[i] = cpumask_of_cpu(0); - } - - for ( ; ; ) { - set_current_state(TASK_INTERRUPTIBLE); - time_remaining = schedule_timeout(time_remaining); - try_to_freeze(PF_FREEZE); - if (time_after(jiffies, - prev_balance_time+balanced_irq_interval)) { - do_irq_balance(); - prev_balance_time = jiffies; - time_remaining = balanced_irq_interval; - } - } - return 0; -} - -static int __init balanced_irq_init(void) -{ - int i; - struct cpuinfo_x86 *c; - cpumask_t tmp; - - cpus_shift_right(tmp, cpu_online_map, 2); - c = &boot_cpu_data; - /* When not overridden on the command line, ask the subarchitecture.
*/ - if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH) - irqbalance_disabled = NO_BALANCE_IRQ; - if (irqbalance_disabled) - return 0; - - /* disable irqbalance completely if there is only one processor online */ - if (num_online_cpus() < 2) { - irqbalance_disabled = 1; - return 0; - } - /* - * Enable physical balance only if more than 1 physical processor - * is present - */ - if (smp_num_siblings > 1 && !cpus_empty(tmp)) - physical_balance = 1; - - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_online(i)) - continue; - irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); - irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); - if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) { - printk(KERN_ERR "balanced_irq_init: out of memory\n"); - goto failed; - } - memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS); - memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS); - } - - printk(KERN_INFO "Starting balanced_irq\n"); - if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0) - return 0; - else - printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq\n"); -failed: - for (i = 0; i < NR_CPUS; i++) { - if(irq_cpu_data[i].irq_delta) - kfree(irq_cpu_data[i].irq_delta); - if(irq_cpu_data[i].last_irq) - kfree(irq_cpu_data[i].last_irq); - } - return 0; -} - -int __init irqbalance_disable(char *str) -{ - irqbalance_disabled = 1; - return 1; -} - -__setup("noirqbalance", irqbalance_disable); - -static inline void move_irq(int irq) -{ - /* note - we hold the desc->lock */ - if (unlikely(!cpus_empty(pending_irq_balance_cpumask[irq]))) { - set_ioapic_affinity_irq(irq, pending_irq_balance_cpumask[irq]); - cpus_clear(pending_irq_balance_cpumask[irq]); - } -} - -late_initcall(balanced_irq_init); - -#else /* !CONFIG_IRQBALANCE */ -static inline void move_irq(int irq) { } -#endif /* CONFIG_IRQBALANCE */ - -#ifndef CONFIG_SMP -void fastcall send_IPI_self(int vector) -{ -#ifndef CONFIG_XEN - unsigned int cfg; - - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - apic_write_around(APIC_ICR, cfg); -#endif -} -#endif /* !CONFIG_SMP */ - - -/* - * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to - * specific CPU-side IRQs. - */ - -#define MAX_PIRQS 8 -int pirq_entries [MAX_PIRQS]; -int pirqs_enabled; -int skip_ioapic_setup; - -static int __init ioapic_setup(char *str) -{ - skip_ioapic_setup = 1; - return 1; -} - -__setup("noapic", ioapic_setup); - -static int __init ioapic_pirq_setup(char *str) -{ - int i, max; - int ints[MAX_PIRQS+1]; - - get_options(str, ARRAY_SIZE(ints), ints); - - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; - - pirqs_enabled = 1; - apic_printk(APIC_VERBOSE, KERN_INFO - "PIRQ redirection, working around broken MP-BIOS.\n"); - max = MAX_PIRQS; - if (ints[0] < MAX_PIRQS) - max = ints[0]; - - for (i = 0; i < max; i++) { - apic_printk(APIC_VERBOSE, KERN_DEBUG - "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); - /* - * PIRQs are mapped upside down, usually. - */ - pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; - } - return 1; -} - -__setup("pirq=", ioapic_pirq_setup);
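
To make the "upside down" PIRQ mapping above concrete, here is a worked example; the option string and IRQ numbers are invented for illustration:

/* Hypothetical boot option "pirq=5,11": get_options() yields
 * ints[] = { 2, 5, 11 }, so the loop above stores
 *
 *	pirq_entries[7] = 5;	(i == 0: MAX_PIRQS-0-1 == 7)
 *	pirq_entries[6] = 11;	(i == 1: MAX_PIRQS-1-1 == 6)
 *
 * pin_2_irq() later in this file reads pirq_entries[pin-16], so
 * IO-APIC pins 23 and 22 end up hand-routed to IRQs 5 and 11.
 */
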
- -/* - * Find the IRQ entry number of a certain pin. - */ -static int find_irq_entry(int apic, int pin, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mpc_irqtype == type && - (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || - mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && - mp_irqs[i].mpc_dstirq == pin) - return i; - - return -1; -} - -#ifndef CONFIG_XEN -/* - * Find the pin to which IRQ[irq] (ISA) is connected - */ -static int find_isa_irq_pin(int irq, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mpc_srcbus; - - if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || - mp_bus_id_to_type[lbus] == MP_BUS_EISA || - mp_bus_id_to_type[lbus] == MP_BUS_MCA || - mp_bus_id_to_type[lbus] == MP_BUS_NEC98 - ) && - (mp_irqs[i].mpc_irqtype == type) && - (mp_irqs[i].mpc_srcbusirq == irq)) - - return mp_irqs[i].mpc_dstirq; - } - return -1; -} -#endif - -/* - * Find a specific PCI IRQ entry. - * Not an __init, possibly needed by modules - */ -static int pin_2_irq(int idx, int apic, int pin); - -int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) -{ - int apic, i, best_guess = -1; - - apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, " - "slot:%d, pin:%d.\n", bus, slot, pin); - if (mp_bus_id_to_pci_bus[bus] == -1) { - printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus); - return -1; - } - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mpc_srcbus; - - for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || - mp_irqs[i].mpc_dstapic == MP_APIC_ALL) - break; - - if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) && - !mp_irqs[i].mpc_irqtype && - (bus == lbus) && - (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); - - if (!(apic || IO_APIC_IRQ(irq))) - continue; - - if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) - return irq; - /* - * Use the first all-but-pin matching entry as a - * best-guess fuzzy result for broken mptables. - */ - if (best_guess < 0) - best_guess = irq; - } - } - return best_guess; -} - -#ifndef CONFIG_XEN -/* - * This function is currently only a helper for the i386 SMP boot process, - * where we need to reprogram the ioredtbls to cater for the cpus which - * have come online, so the mask in all cases should simply be TARGET_CPUS. - */ -void __init setup_ioapic_dest(void) -{ - int pin, ioapic, irq, irq_entry; - - if (skip_ioapic_setup == 1) - return; - - for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { - for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { - irq_entry = find_irq_entry(ioapic, pin, mp_INT); - if (irq_entry == -1) - continue; - irq = pin_2_irq(irq_entry, ioapic, pin); - set_ioapic_affinity_irq(irq, TARGET_CPUS); - } - - } -} -#endif /* !CONFIG_XEN */ - -/* - * EISA Edge/Level control register, ELCR - */ -static int EISA_ELCR(unsigned int irq) -{ - if (irq < 16) { - unsigned int port = 0x4d0 + (irq >> 3); - return (inb(port) >> (irq & 7)) & 1; - } - apic_printk(APIC_VERBOSE, KERN_INFO - "Broken MPtable reports ISA irq %d\n", irq); - return 0; -} - -/* EISA interrupts are always polarity zero and can be edge or level - * triggered depending on the ELCR value. If an interrupt is listed as - * EISA conforming in the MP table, that means its trigger type must - * be read in from the ELCR */ - -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) -#define default_EISA_polarity(idx) (0) - -/* ISA interrupts are always polarity zero edge triggered, - * when listed as conforming in the MP table.
*/ - -#define default_ISA_trigger(idx) (0) -#define default_ISA_polarity(idx) (0) - -/* PCI interrupts are always polarity one level triggered, - * when listed as conforming in the MP table. */ - -#define default_PCI_trigger(idx) (1) -#define default_PCI_polarity(idx) (1) - -/* MCA interrupts are always polarity zero level triggered, - * when listed as conforming in the MP table. */ - -#define default_MCA_trigger(idx) (1) -#define default_MCA_polarity(idx) (0) - -/* NEC98 interrupts are always polarity zero edge triggered, - * when listed as conforming in the MP table. */ - -#define default_NEC98_trigger(idx) (0) -#define default_NEC98_polarity(idx) (0) - -static int __init MPBIOS_polarity(int idx) -{ - int bus = mp_irqs[idx].mpc_srcbus; - int polarity; - - /* - * Determine IRQ line polarity (high active or low active): - */ - switch (mp_irqs[idx].mpc_irqflag & 3) - { - case 0: /* conforms, ie. bus-type dependent polarity */ - { - switch (mp_bus_id_to_type[bus]) - { - case MP_BUS_ISA: /* ISA pin */ - { - polarity = default_ISA_polarity(idx); - break; - } - case MP_BUS_EISA: /* EISA pin */ - { - polarity = default_EISA_polarity(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - polarity = default_PCI_polarity(idx); - break; - } - case MP_BUS_MCA: /* MCA pin */ - { - polarity = default_MCA_polarity(idx); - break; - } - case MP_BUS_NEC98: /* NEC 98 pin */ - { - polarity = default_NEC98_polarity(idx); - break; - } - default: - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - } - break; - } - case 1: /* high active */ - { - polarity = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - case 3: /* low active */ - { - polarity = 1; - break; - } - default: /* invalid */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - } - return polarity; -} - -static int MPBIOS_trigger(int idx) -{ - int bus = mp_irqs[idx].mpc_srcbus; - int trigger; - - /* - * Determine IRQ trigger mode (edge or level sensitive): - */ - switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) - { - case 0: /* conforms, ie. bus-type dependent */ - { - switch (mp_bus_id_to_type[bus]) - { - case MP_BUS_ISA: /* ISA pin */ - { - trigger = default_ISA_trigger(idx); - break; - } - case MP_BUS_EISA: /* EISA pin */ - { - trigger = default_EISA_trigger(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - trigger = default_PCI_trigger(idx); - break; - } - case MP_BUS_MCA: /* MCA pin */ - { - trigger = default_MCA_trigger(idx); - break; - } - case MP_BUS_NEC98: /* NEC 98 pin */ - { - trigger = default_NEC98_trigger(idx); - break; - } - default: - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - } - break; - } - case 1: /* edge */ - { - trigger = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - case 3: /* level */ - { - trigger = 1; - break; - } - default: /* invalid */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 0; - break; - } - } - return trigger; -} - -static inline int irq_polarity(int idx) -{ - return MPBIOS_polarity(idx); -} - -static inline int irq_trigger(int idx) -{ - return MPBIOS_trigger(idx); -} - -static int pin_2_irq(int idx, int apic, int pin) -{ - int irq, i; - int bus = mp_irqs[idx].mpc_srcbus; - - /* - * Debugging check, we are in big trouble if this message pops up! 
- */ - if (mp_irqs[idx].mpc_dstirq != pin) - printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); - - switch (mp_bus_id_to_type[bus]) - { - case MP_BUS_ISA: /* ISA pin */ - case MP_BUS_EISA: - case MP_BUS_MCA: - case MP_BUS_NEC98: - { - irq = mp_irqs[idx].mpc_srcbusirq; - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - /* - * PCI IRQs are mapped in order - */ - i = irq = 0; - while (i < apic) - irq += nr_ioapic_registers[i++]; - irq += pin; - - /* - * For MPS mode, so far only needed by ES7000 platform - */ - if (ioapic_renumber_irq) - irq = ioapic_renumber_irq(apic, irq); - - break; - } - default: - { - printk(KERN_ERR "unknown bus type %d.\n",bus); - irq = 0; - break; - } - } - - /* - * PCI IRQ command line redirection. Yes, limits are hardcoded. - */ - if ((pin >= 16) && (pin <= 23)) { - if (pirq_entries[pin-16] != -1) { - if (!pirq_entries[pin-16]) { - apic_printk(APIC_VERBOSE, KERN_DEBUG - "disabling PIRQ%d\n", pin-16); - } else { - irq = pirq_entries[pin-16]; - apic_printk(APIC_VERBOSE, KERN_DEBUG - "using PIRQ%d -> IRQ %d\n", - pin-16, irq); - } - } - } - return irq; -} - -static inline int IO_APIC_irq_trigger(int irq) -{ - int apic, idx, pin; - - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - idx = find_irq_entry(apic,pin,mp_INT); - if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin))) - return irq_trigger(idx); - } - } - /* - * nonexistent IRQs are edge default - */ - return 0; -} - -/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ -u8 irq_vector[NR_IRQ_VECTORS]; /* = { FIRST_DEVICE_VECTOR , 0 }; */ - -int assign_irq_vector(int irq) -{ - static int current_vector = FIRST_DEVICE_VECTOR; - physdev_op_t op; - - BUG_ON(irq >= NR_IRQ_VECTORS); - if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) - return IO_APIC_VECTOR(irq); - - op.cmd = PHYSDEVOP_ASSIGN_VECTOR; - op.u.irq_op.irq = irq; - if (HYPERVISOR_physdev_op(&op)) - return -ENOSPC; - current_vector = op.u.irq_op.vector; - - vector_irq[current_vector] = irq; - if (irq != AUTO_ASSIGN) - IO_APIC_VECTOR(irq) = current_vector; - - return current_vector; -} - -#ifndef CONFIG_XEN -static struct hw_interrupt_type ioapic_level_type; -static struct hw_interrupt_type ioapic_edge_type; - -#define IOAPIC_AUTO -1 -#define IOAPIC_EDGE 0 -#define IOAPIC_LEVEL 1 - -static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger) -{ - if (use_pci_vector() && !platform_legacy_irq(irq)) { - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - irq_desc[vector].handler = &ioapic_level_type; - else - irq_desc[vector].handler = &ioapic_edge_type; - set_intr_gate(vector, interrupt[vector]); - } else { - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - irq_desc[irq].handler = &ioapic_level_type; - else - irq_desc[irq].handler = &ioapic_edge_type; - set_intr_gate(vector, interrupt[irq]); - } -} -#else -#define ioapic_register_intr(_irq,_vector,_trigger) ((void)0) -#endif - -void __init setup_IO_APIC_irqs(void) -{ - struct IO_APIC_route_entry entry; - int apic, pin, idx, irq, first_notcon = 1, vector; - unsigned long flags; - - apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); - - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - - /* - * add it to the IO-APIC irq-routing table: - */ - memset(&entry,0,sizeof(entry)); - - entry.delivery_mode = INT_DELIVERY_MODE; - entry.dest_mode = INT_DEST_MODE; - entry.mask = 0; 
/* enable IRQ */ - entry.dest.logical.logical_dest = - cpu_mask_to_apicid(TARGET_CPUS); - - idx = find_irq_entry(apic,pin,mp_INT); - if (idx == -1) { - if (first_notcon) { - apic_printk(APIC_VERBOSE, KERN_DEBUG - " IO-APIC (apicid-pin) %d-%d", - mp_ioapics[apic].mpc_apicid, - pin); - first_notcon = 0; - } else - apic_printk(APIC_VERBOSE, ", %d-%d", - mp_ioapics[apic].mpc_apicid, pin); - continue; - } - - entry.trigger = irq_trigger(idx); - entry.polarity = irq_polarity(idx); - - if (irq_trigger(idx)) { - entry.trigger = 1; - entry.mask = 1; - } - - irq = pin_2_irq(idx, apic, pin); - /* - * skip adding the timer int on secondary nodes, which causes - * a small but painful rift in the time-space continuum - */ - if (multi_timer_check(apic, irq)) - continue; - else - add_pin_to_irq(irq, apic, pin); - - if (/*!apic &&*/ !IO_APIC_IRQ(irq)) - continue; - - if (IO_APIC_IRQ(irq)) { - vector = assign_irq_vector(irq); - entry.vector = vector; - ioapic_register_intr(irq, vector, IOAPIC_AUTO); - - if (!apic && (irq < 16)) - disable_8259A_irq(irq); - } - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); - spin_unlock_irqrestore(&ioapic_lock, flags); - } - } - - if (!first_notcon) - apic_printk(APIC_VERBOSE, " not connected.\n"); -} - -/* - * Set up the 8259A-master output pin: - */ -#ifndef CONFIG_XEN -void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector) -{ - struct IO_APIC_route_entry entry; - unsigned long flags; - - memset(&entry,0,sizeof(entry)); - - disable_8259A_irq(0); - - /* mask LVT0 */ - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); - - /* - * We use logical delivery to get the timer IRQ - * to the first CPU. - */ - entry.dest_mode = INT_DEST_MODE; - entry.mask = 0; /* unmask IRQ now */ - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); - entry.delivery_mode = INT_DELIVERY_MODE; - entry.polarity = 0; - entry.trigger = 0; - entry.vector = vector; - - /* - * The timer IRQ doesn't have to know that behind the - * scene we have a 8259A-master in AEOI mode ... - */ - irq_desc[0].handler = &ioapic_edge_type; - - /* - * Add it to the IO-APIC irq-routing table: - */ - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0)); - spin_unlock_irqrestore(&ioapic_lock, flags); - - enable_8259A_irq(0); -} - -static inline void UNEXPECTED_IO_APIC(void) -{ -} - -void __init print_IO_APIC(void) -{ - int apic, i; - union IO_APIC_reg_00 reg_00; - union IO_APIC_reg_01 reg_01; - union IO_APIC_reg_02 reg_02; - union IO_APIC_reg_03 reg_03; - unsigned long flags; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); - for (i = 0; i < nr_ioapics; i++) - printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", - mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); - - /* - * We are a bit conservative about what we expect. We have to - * know about every hardware change ASAP. 
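[Editorial note: setup_IO_APIC_irqs() above programs each redirection entry through two 32-bit register slots, and the same two-write idiom recurs throughout this file. A minimal sketch of the pattern — a hypothetical wrapper over the file's own io_apic_write(), with the ioapic_lock handling elided:

	/* Entry N lives at register indexes 0x10+2N (low word: vector, mask,
	 * trigger) and 0x11+2N (high word: destination). The deleted code
	 * writes the high word first, presumably so the destination is in
	 * place before the low word can unmask the entry. */
	static void write_route_entry(int apic, int pin,
				      struct IO_APIC_route_entry *e)
	{
		io_apic_write(apic, 0x11 + 2 * pin, *(((int *)e) + 1));
		io_apic_write(apic, 0x10 + 2 * pin, *(((int *)e) + 0));
	}
]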
- */ - printk(KERN_INFO "testing the IO APIC.......................\n"); - - for (apic = 0; apic < nr_ioapics; apic++) { - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - reg_01.raw = io_apic_read(apic, 1); - if (reg_01.bits.version >= 0x10) - reg_02.raw = io_apic_read(apic, 2); - if (reg_01.bits.version >= 0x20) - reg_03.raw = io_apic_read(apic, 3); - spin_unlock_irqrestore(&ioapic_lock, flags); - - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); - printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); - printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); - printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); - printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); - if (reg_00.bits.ID >= get_physical_broadcast()) - UNEXPECTED_IO_APIC(); - if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) - UNEXPECTED_IO_APIC(); - - printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw); - printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); - if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ - (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ - (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ - (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ - (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ - (reg_01.bits.entries != 0x2E) && - (reg_01.bits.entries != 0x3F) - ) - UNEXPECTED_IO_APIC(); - - printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); - printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); - if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ - (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ - (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ - (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ - (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ - ) - UNEXPECTED_IO_APIC(); - if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) - UNEXPECTED_IO_APIC(); - - /* - * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, - * but the value of reg_02 is read as the previous read register - * value, so ignore it if reg_02 == reg_01. - */ - if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { - printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); - printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); - if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) - UNEXPECTED_IO_APIC(); - } - - /* - * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 - * or reg_03, but the value of reg_0[23] is read as the previous read - * register value, so ignore it if reg_03 == reg_0[12]. - */ - if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && - reg_03.raw != reg_01.raw) { - printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); - printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); - if (reg_03.bits.__reserved_1) - UNEXPECTED_IO_APIC(); - } - - printk(KERN_DEBUG ".... 
IRQ redirection table:\n"); - - printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" - " Stat Dest Deli Vect: \n"); - - for (i = 0; i <= reg_01.bits.entries; i++) { - struct IO_APIC_route_entry entry; - - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); - *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); - spin_unlock_irqrestore(&ioapic_lock, flags); - - printk(KERN_DEBUG " %02x %03X %02X ", - i, - entry.dest.logical.logical_dest, - entry.dest.physical.physical_dest - ); - - printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", - entry.mask, - entry.trigger, - entry.irr, - entry.polarity, - entry.delivery_status, - entry.dest_mode, - entry.delivery_mode, - entry.vector - ); - } - } - if (use_pci_vector()) - printk(KERN_INFO "Using vector-based indexing\n"); - printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for (i = 0; i < NR_IRQS; i++) { - struct irq_pin_list *entry = irq_2_pin + i; - if (entry->pin < 0) - continue; - if (use_pci_vector() && !platform_legacy_irq(i)) - printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i)); - else - printk(KERN_DEBUG "IRQ%d ", i); - for (;;) { - printk("-> %d:%d", entry->apic, entry->pin); - if (!entry->next) - break; - entry = irq_2_pin + entry->next; - } - printk("\n"); - } - - printk(KERN_INFO ".................................... done.\n"); - - return; -} - -static void print_APIC_bitfield (int base) -{ - unsigned int v; - int i, j; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); - for (i = 0; i < 8; i++) { - v = apic_read(base + i*0x10); - for (j = 0; j < 32; j++) { - if (v & (1<<j)) - printk("1"); - else - printk("0"); - } - printk("\n"); - } -} - -void /*__init*/ print_local_APIC(void * dummy) -{ - unsigned int v, ver, maxlvt; - - if (apic_verbosity == APIC_QUIET) - return; - - printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", - smp_processor_id(), hard_smp_processor_id()); - v = apic_read(APIC_ID); - printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v)); - v = apic_read(APIC_LVR); - printk(KERN_INFO "... APIC VERSION: %08x\n", v); - ver = GET_APIC_VERSION(v); - maxlvt = get_maxlvt(); - - v = apic_read(APIC_TASKPRI); - printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); - - if (APIC_INTEGRATED(ver)) { /* !82489DX */ - v = apic_read(APIC_ARBPRI); - printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, - v & APIC_ARBPRI_MASK); - v = apic_read(APIC_PROCPRI); - printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); - } - - v = apic_read(APIC_EOI); - printk(KERN_DEBUG "... APIC EOI: %08x\n", v); - v = apic_read(APIC_RRR); - printk(KERN_DEBUG "... APIC RRR: %08x\n", v); - v = apic_read(APIC_LDR); - printk(KERN_DEBUG "... APIC LDR: %08x\n", v); - v = apic_read(APIC_DFR); - printk(KERN_DEBUG "... APIC DFR: %08x\n", v); - v = apic_read(APIC_SPIV); - printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); - - printk(KERN_DEBUG "... APIC ISR field:\n"); - print_APIC_bitfield(APIC_ISR); - printk(KERN_DEBUG "... APIC TMR field:\n"); - print_APIC_bitfield(APIC_TMR); - printk(KERN_DEBUG "... APIC IRR field:\n"); - print_APIC_bitfield(APIC_IRR); - - if (APIC_INTEGRATED(ver)) { /* !82489DX */ - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - v = apic_read(APIC_ESR); - printk(KERN_DEBUG "... APIC ESR: %08x\n", v); - } - - v = apic_read(APIC_ICR); - printk(KERN_DEBUG "... APIC ICR: %08x\n", v); - v = apic_read(APIC_ICR2); - printk(KERN_DEBUG "... 
APIC ICR2: %08x\n", v); - - v = apic_read(APIC_LVTT); - printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); - - if (maxlvt > 3) { /* PC is LVT#4. */ - v = apic_read(APIC_LVTPC); - printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); - } - v = apic_read(APIC_LVT0); - printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); - v = apic_read(APIC_LVT1); - printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); - - if (maxlvt > 2) { /* ERR is LVT#3. */ - v = apic_read(APIC_LVTERR); - printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); - } - - v = apic_read(APIC_TMICT); - printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); - v = apic_read(APIC_TMCCT); - printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); - v = apic_read(APIC_TDCR); - printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); - printk("\n"); -} - -void print_all_local_APICs (void) -{ - on_each_cpu(print_local_APIC, NULL, 1, 1); -} - -void /*__init*/ print_PIC(void) -{ - extern spinlock_t i8259A_lock; - unsigned int v; - unsigned long flags; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "\nprinting PIC contents\n"); - - spin_lock_irqsave(&i8259A_lock, flags); - - v = inb(0xa1) << 8 | inb(0x21); - printk(KERN_DEBUG "... PIC IMR: %04x\n", v); - - v = inb(0xa0) << 8 | inb(0x20); - printk(KERN_DEBUG "... PIC IRR: %04x\n", v); - - outb(0x0b,0xa0); - outb(0x0b,0x20); - v = inb(0xa0) << 8 | inb(0x20); - outb(0x0a,0xa0); - outb(0x0a,0x20); - - spin_unlock_irqrestore(&i8259A_lock, flags); - - printk(KERN_DEBUG "... PIC ISR: %04x\n", v); - - v = inb(0x4d1) << 8 | inb(0x4d0); - printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); -} -#else -void __init print_IO_APIC(void) { } -#endif /* !CONFIG_XEN */ - -static void __init enable_IO_APIC(void) -{ - union IO_APIC_reg_01 reg_01; - int i; - unsigned long flags; - - for (i = 0; i < PIN_MAP_SIZE; i++) { - irq_2_pin[i].pin = -1; - irq_2_pin[i].next = 0; - } - if (!pirqs_enabled) - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; - - /* - * The number of IO-APIC IRQ registers (== #pins): - */ - for (i = 0; i < nr_ioapics; i++) { - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(i, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - nr_ioapic_registers[i] = reg_01.bits.entries+1; - } - - /* - * Do not trust the IO-APIC being empty at bootup - */ - clear_IO_APIC(); -} - -/* - * Not an __init, needed by the reboot code - */ -void disable_IO_APIC(void) -{ - /* - * Clear the IO-APIC before rebooting: - */ - clear_IO_APIC(); - -#ifndef CONFIG_XEN - disconnect_bsp_APIC(); -#endif -} - -/* - * function to set the IO-APIC physical IDs based on the - * values stored in the MPC table. - * - * by Matt Domsch <Matt_Domsch@xxxxxxxx> Tue Dec 21 12:25:05 CST 1999 - */ - -#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ) -static void __init setup_ioapic_ids_from_mpc(void) -{ - union IO_APIC_reg_00 reg_00; - physid_mask_t phys_id_present_map; - int apic; - int i; - unsigned char old_id; - unsigned long flags; - - /* - * This is broken; anything with a real cpu count has to - * circumvent this idiocy regardless. - */ - phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); - - /* - * Set the IOAPIC ID to the value stored in the MPC table. 
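[Editorial note: print_PIC() above samples the 8259A in-service register using OCW3 commands on the command ports. The protocol in isolation — a hypothetical helper; real callers hold i8259A_lock, as the deleted code does:

	static unsigned int pic_read_isr(void)
	{
		unsigned int v;

		outb(0x0b, 0xa0);	/* OCW3: next slave read returns ISR */
		outb(0x0b, 0x20);	/* OCW3: next master read returns ISR */
		v = inb(0xa0) << 8 | inb(0x20);
		outb(0x0a, 0xa0);	/* restore default IRR read-back */
		outb(0x0a, 0x20);
		return v;
	}
]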
- */ - for (apic = 0; apic < nr_ioapics; apic++) { - - /* Read the register 0 value */ - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - old_id = mp_ioapics[apic].mpc_apicid; - - if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) { - printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", - apic, mp_ioapics[apic].mpc_apicid); - printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", - reg_00.bits.ID); - mp_ioapics[apic].mpc_apicid = reg_00.bits.ID; - } - - /* Don't check I/O APIC IDs for some xAPIC systems. They have - * no meaning without the serial APIC bus. */ - if (NO_IOAPIC_CHECK) - continue; - /* - * Sanity check, is the ID really free? Every APIC in a - * system must have a unique ID or we get lots of nice - * 'stuck on smp_invalidate_needed IPI wait' messages. - */ - if (check_apicid_used(phys_id_present_map, - mp_ioapics[apic].mpc_apicid)) { - printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", - apic, mp_ioapics[apic].mpc_apicid); - for (i = 0; i < get_physical_broadcast(); i++) - if (!physid_isset(i, phys_id_present_map)) - break; - if (i >= get_physical_broadcast()) - panic("Max APIC ID exceeded!\n"); - printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", - i); - physid_set(i, phys_id_present_map); - mp_ioapics[apic].mpc_apicid = i; - } else { - physid_mask_t tmp; - tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid); - apic_printk(APIC_VERBOSE, "Setting %d in the " - "phys_id_present_map\n", - mp_ioapics[apic].mpc_apicid); - physids_or(phys_id_present_map, phys_id_present_map, tmp); - } - - - /* - * We need to adjust the IRQ routing table - * if the ID changed. - */ - if (old_id != mp_ioapics[apic].mpc_apicid) - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mpc_dstapic == old_id) - mp_irqs[i].mpc_dstapic - = mp_ioapics[apic].mpc_apicid; - - /* - * Read the right value from the MPC table and - * write it into the ID register. - */ - apic_printk(APIC_VERBOSE, KERN_INFO - "...changing IO-APIC physical APIC ID to %d ...", - mp_ioapics[apic].mpc_apicid); - - reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0, reg_00.raw); - spin_unlock_irqrestore(&ioapic_lock, flags); - - /* - * Sanity check - */ - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) - printk("could not set ID!\n"); - else - apic_printk(APIC_VERBOSE, " ok.\n"); - } -} -#else -static void __init setup_ioapic_ids_from_mpc(void) { } -#endif - -#ifndef CONFIG_XEN -/* - * There is a nasty bug in some older SMP boards, their mptable lies - * about the timer IRQ. We do the following to work around the situation: - * - * - timer IRQ defaults to IO-APIC IRQ - * - if this function detects that timer IRQs are defunct, then we fall - * back to ISA timer IRQs - */ -static int __init timer_irq_works(void) -{ - unsigned long t1 = jiffies; - - local_irq_enable(); - /* Let ten ticks pass... */ - mdelay((10 * 1000) / HZ); - - /* - * Expect a few ticks at least, to be sure some possible - * glue logic does not lock up after one or two first - * ticks in a non-ExtINT mode. Also the local APIC - * might have cached one ExtINT interrupt. Finally, at - * least one tick may be lost due to delays. 
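[Editorial note: setup_ioapic_ids_from_mpc() above retargets each I/O APIC ID with a read-modify-write of register 0 followed by a read-back to confirm the chip latched it. The sequence condensed — a hypothetical helper; the deleted code takes ioapic_lock around each register access:

	static int ioapic_set_id(int apic, int id)
	{
		union IO_APIC_reg_00 reg_00;

		reg_00.raw = io_apic_read(apic, 0);
		reg_00.bits.ID = id;
		io_apic_write(apic, 0, reg_00.raw);

		reg_00.raw = io_apic_read(apic, 0);	/* sanity check */
		return (reg_00.bits.ID == id) ? 0 : -1;
	}
]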
- */ - if (jiffies - t1 > 4) - return 1; - - return 0; -} - -/* - * In the SMP+IOAPIC case it might happen that there are an unspecified - * number of pending IRQ events unhandled. These cases are very rare, - * so we 'resend' these IRQs via IPIs, to the same CPU. It's much - * better to do it this way as thus we do not have to be aware of - * 'pending' interrupts in the IRQ path, except at this point. - */ -/* - * Edge triggered needs to resend any interrupt - * that was delayed but this is now handled in the device - * independent code. - */ - -/* - * Starting up a edge-triggered IO-APIC interrupt is - * nasty - we need to make sure that we get the edge. - * If it is already asserted for some reason, we need - * return 1 to indicate that is was pending. - * - * This is not complete - we should be able to fake - * an edge even if it isn't on the 8259A... - */ -static unsigned int startup_edge_ioapic_irq(unsigned int irq) -{ - int was_pending = 0; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - if (irq < 16) { - disable_8259A_irq(irq); - if (i8259A_irq_pending(irq)) - was_pending = 1; - } - __unmask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return was_pending; -} - -/* - * Once we have recorded IRQ_PENDING already, we can mask the - * interrupt for real. This prevents IRQ storms from unhandled - * devices. - */ -static void ack_edge_ioapic_irq(unsigned int irq) -{ - move_irq(irq); - if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED)) - == (IRQ_PENDING | IRQ_DISABLED)) - mask_IO_APIC_irq(irq); - ack_APIC_irq(); -} - -/* - * Level triggered interrupts can just be masked, - * and shutting down and starting up the interrupt - * is the same as enabling and disabling them -- except - * with a startup need to return a "was pending" value. - * - * Level triggered interrupts are special because we - * do not touch any IO-APIC register while handling - * them. We ack the APIC in the end-IRQ handler, not - * in the start-IRQ-handler. Protection against reentrance - * from the same interrupt is still provided, both by the - * generic IRQ layer and by the fact that an unacked local - * APIC does not accept IRQs. - */ -static unsigned int startup_level_ioapic_irq (unsigned int irq) -{ - unmask_IO_APIC_irq(irq); - - return 0; /* don't check for pending */ -} - -static void end_level_ioapic_irq (unsigned int irq) -{ - unsigned long v; - int i; - - move_irq(irq); -/* - * It appears there is an erratum which affects at least version 0x11 - * of I/O APIC (that's the 82093AA and cores integrated into various - * chipsets). Under certain conditions a level-triggered interrupt is - * erroneously delivered as edge-triggered one but the respective IRR - * bit gets set nevertheless. As a result the I/O unit expects an EOI - * message but it will never arrive and further interrupts are blocked - * from the source. The exact reason is so far unknown, but the - * phenomenon was observed when two consecutive interrupt requests - * from a given source get delivered to the same CPU and the source is - * temporarily disabled in between. - * - * A workaround is to simulate an EOI message manually. We achieve it - * by setting the trigger mode to edge and then to level when the edge - * trigger mode gets detected in the TMR of a local APIC for a - * level-triggered interrupt. We mask the source for the time of the - * operation to prevent an edge-triggered interrupt escaping meanwhile. - * The idea is from Manfred Spraul. 
--macro - */ - i = IO_APIC_VECTOR(irq); - - v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); - - ack_APIC_irq(); - - if (!(v & (1 << (i & 0x1f)))) { - atomic_inc(&irq_mis_count); - spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); - spin_unlock(&ioapic_lock); - } -} - -#ifdef CONFIG_PCI_MSI -static unsigned int startup_edge_ioapic_vector(unsigned int vector) -{ - int irq = vector_to_irq(vector); - - return startup_edge_ioapic_irq(irq); -} - -static void ack_edge_ioapic_vector(unsigned int vector) -{ - int irq = vector_to_irq(vector); - - ack_edge_ioapic_irq(irq); -} - -static unsigned int startup_level_ioapic_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - return startup_level_ioapic_irq (irq); -} - -static void end_level_ioapic_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - end_level_ioapic_irq(irq); -} - -static void mask_IO_APIC_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - mask_IO_APIC_irq(irq); -} - -static void unmask_IO_APIC_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - unmask_IO_APIC_irq(irq); -} - -static void set_ioapic_affinity_vector (unsigned int vector, - cpumask_t cpu_mask) -{ - int irq = vector_to_irq(vector); - - set_ioapic_affinity_irq(irq, cpu_mask); -} -#endif - -/* - * Level and edge triggered IO-APIC interrupts need different handling, - * so we use two separate IRQ descriptors. Edge triggered IRQs can be - * handled with the level-triggered descriptor, but that one has slightly - * more overhead. Level-triggered interrupts cannot be handled with the - * edge-triggered handler, without risking IRQ storms and other ugly - * races. - */ -static struct hw_interrupt_type ioapic_edge_type = { - .typename = "IO-APIC-edge", - .startup = startup_edge_ioapic, - .shutdown = shutdown_edge_ioapic, - .enable = enable_edge_ioapic, - .disable = disable_edge_ioapic, - .ack = ack_edge_ioapic, - .end = end_edge_ioapic, - .set_affinity = set_ioapic_affinity, -}; - -static struct hw_interrupt_type ioapic_level_type = { - .typename = "IO-APIC-level", - .startup = startup_level_ioapic, - .shutdown = shutdown_level_ioapic, - .enable = enable_level_ioapic, - .disable = disable_level_ioapic, - .ack = mask_and_ack_level_ioapic, - .end = end_level_ioapic, - .set_affinity = set_ioapic_affinity, -}; -#endif /* !CONFIG_XEN */ - -static inline void init_IO_APIC_traps(void) -{ - int irq; - - /* - * NOTE! The local APIC isn't very good at handling - * multiple interrupts at the same interrupt level. - * As the interrupt level is determined by taking the - * vector number and shifting that right by 4, we - * want to spread these out a bit so that they don't - * all fall in the same interrupt level. - * - * Also, we've got to be careful not to trash gate - * 0x80, because int 0x80 is hm, kind of importantish. ;) - */ - for (irq = 0; irq < NR_IRQS ; irq++) { - int tmp = irq; - if (use_pci_vector()) { - if (!platform_legacy_irq(tmp)) - if ((tmp = vector_to_irq(tmp)) == -1) - continue; - } - if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) { - /* - * Hmm.. We don't have an entry for this, - * so default to an old-fashioned 8259 - * interrupt if we can.. - */ - if (irq < 16) - make_8259A_irq(irq); -#ifndef CONFIG_XEN - else - /* Strange. Oh, well.. 
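[Editorial note: the erratum workaround above hinges on testing the local APIC's TMR bit for a vector. The TMR is a block of eight 32-bit registers spaced 0x10 apart, so vector i maps to register offset (i & ~0x1f) >> 1 and bit i & 0x1f. The test in isolation — a hypothetical helper:

	static int vector_in_tmr(int vector)
	{
		/* (vector / 32) * 0x10 == (vector & ~0x1f) >> 1 */
		unsigned long v = apic_read(APIC_TMR + ((vector & ~0x1f) >> 1));

		return !!(v & (1 << (vector & 0x1f)));
	}
]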
*/ - irq_desc[irq].handler = &no_irq_type; -#endif - } - } -} - -#ifndef CONFIG_XEN -static void enable_lapic_irq (unsigned int irq) -{ - unsigned long v; - - v = apic_read(APIC_LVT0); - apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); -} - -static void disable_lapic_irq (unsigned int irq) -{ - unsigned long v; - - v = apic_read(APIC_LVT0); - apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); -} - -static void ack_lapic_irq (unsigned int irq) -{ - ack_APIC_irq(); -} - -static void end_lapic_irq (unsigned int i) { /* nothing */ } - -static struct hw_interrupt_type lapic_irq_type = { - .typename = "local-APIC-edge", - .startup = NULL, /* startup_irq() not used for IRQ0 */ - .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ - .enable = enable_lapic_irq, - .disable = disable_lapic_irq, - .ack = ack_lapic_irq, - .end = end_lapic_irq -}; - -static void setup_nmi (void) -{ - /* - * Dirty trick to enable the NMI watchdog ... - * We put the 8259A master into AEOI mode and - * unmask on all local APICs LVT0 as NMI. - * - * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') - * is from Maciej W. Rozycki - so we do not have to EOI from - * the NMI handler or the timer interrupt. - */ - apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); - - on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1); - - apic_printk(APIC_VERBOSE, " done.\n"); -} - -/* - * This looks a bit hackish but it's about the only one way of sending - * a few INTA cycles to 8259As and any associated glue logic. ICR does - * not support the ExtINT mode, unfortunately. We need to send these - * cycles as some i82489DX-based boards have glue logic that keeps the - * 8259A interrupt line asserted until INTA. --macro - */ -static inline void unlock_ExtINT_logic(void) -{ - int pin, i; - struct IO_APIC_route_entry entry0, entry1; - unsigned char save_control, save_freq_select; - unsigned long flags; - - pin = find_isa_irq_pin(8, mp_INT); - if (pin == -1) - return; - - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry0) + 1) = io_apic_read(0, 0x11 + 2 * pin); - *(((int *)&entry0) + 0) = io_apic_read(0, 0x10 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); - clear_IO_APIC_pin(0, pin); - - memset(&entry1, 0, sizeof(entry1)); - - entry1.dest_mode = 0; /* physical delivery */ - entry1.mask = 0; /* unmask IRQ now */ - entry1.dest.physical.physical_dest = hard_smp_processor_id(); - entry1.delivery_mode = dest_ExtINT; - entry1.polarity = entry0.polarity; - entry1.trigger = 0; - entry1.vector = 0; - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry1) + 1)); - io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry1) + 0)); - spin_unlock_irqrestore(&ioapic_lock, flags); - - save_control = CMOS_READ(RTC_CONTROL); - save_freq_select = CMOS_READ(RTC_FREQ_SELECT); - CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, - RTC_FREQ_SELECT); - CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); - - i = 100; - while (i-- > 0) { - mdelay(10); - if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) - i -= 10; - } - - CMOS_WRITE(save_control, RTC_CONTROL); - CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); - clear_IO_APIC_pin(0, pin); - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry0) + 1)); - io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry0) + 0)); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -/* - * This code may look a bit paranoid, but it's supposed to cooperate with - * a wide range of boards and BIOS bugs. 
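[Editorial note: unlock_ExtINT_logic() above borrows the RTC as a stand-in interrupt source: it programs a periodic rate, enables RTC_PIE, and polls RTC_INTR_FLAGS for RTC_PF so that genuine INTA cycles reach the 8259A. The RTC half of that trick, simplified — a hypothetical helper with an unbounded wait, where the deleted code bounds its loop:

	static void rtc_pulse_periodic(void)
	{
		unsigned char ctrl = CMOS_READ(RTC_CONTROL);
		unsigned char freq = CMOS_READ(RTC_FREQ_SELECT);

		CMOS_WRITE((freq & ~RTC_RATE_SELECT) | 0x6, RTC_FREQ_SELECT);
		CMOS_WRITE(ctrl | RTC_PIE, RTC_CONTROL);	/* periodic IRQ on */

		while (!(CMOS_READ(RTC_INTR_FLAGS) & RTC_PF))
			mdelay(10);				/* wait for a tick */

		CMOS_WRITE(ctrl, RTC_CONTROL);			/* restore state */
		CMOS_WRITE(freq, RTC_FREQ_SELECT);
	}
]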
Fortunately only the timer IRQ - * is so screwy. Thanks to Brian Perkins for testing/hacking this beast - * fanatically on his truly buggy board. - */ -static inline void check_timer(void) -{ - int pin1, pin2; - int vector; - - /* - * get/set the timer IRQ vector: - */ - disable_8259A_irq(0); - vector = assign_irq_vector(0); - set_intr_gate(vector, interrupt[0]); - - /* - * Subtle, code in do_timer_interrupt() expects an AEOI - * mode for the 8259A whenever interrupts are routed - * through I/O APICs. Also IRQ0 has to be enabled in - * the 8259A which implies the virtual wire has to be - * disabled in the local APIC. - */ - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); - init_8259A(1); - timer_ack = 1; - enable_8259A_irq(0); - - pin1 = find_isa_irq_pin(0, mp_INT); - pin2 = find_isa_irq_pin(0, mp_ExtINT); - - printk(KERN_INFO "..TIMER: vector=0x%02X pin1=%d pin2=%d\n", vector, pin1, pin2); - - if (pin1 != -1) { - /* - * Ok, does IRQ0 through the IOAPIC work? - */ - unmask_IO_APIC_irq(0); - if (timer_irq_works()) { - if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); - setup_nmi(); - enable_8259A_irq(0); - check_nmi_watchdog(); - } - return; - } - clear_IO_APIC_pin(0, pin1); - printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to IO-APIC\n"); - } - - printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... "); - if (pin2 != -1) { - printk("\n..... (found pin %d) ...", pin2); - /* - * legacy devices should be connected to IO APIC #0 - */ - setup_ExtINT_IRQ0_pin(pin2, vector); - if (timer_irq_works()) { - printk("works.\n"); - if (pin1 != -1) - replace_pin_at_irq(0, 0, pin1, 0, pin2); - else - add_pin_to_irq(0, 0, pin2); - if (nmi_watchdog == NMI_IO_APIC) { - setup_nmi(); - check_nmi_watchdog(); - } - return; - } - /* - * Cleanup, just in case ... - */ - clear_IO_APIC_pin(0, pin2); - } - printk(" failed.\n"); - - if (nmi_watchdog == NMI_IO_APIC) { - printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); - nmi_watchdog = 0; - } - - printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); - - disable_8259A_irq(0); - irq_desc[0].handler = &lapic_irq_type; - apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ - enable_8259A_irq(0); - - if (timer_irq_works()) { - printk(" works.\n"); - return; - } - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); - printk(" failed.\n"); - - printk(KERN_INFO "...trying to set up timer as ExtINT IRQ..."); - - timer_ack = 0; - init_8259A(0); - make_8259A_irq(0); - apic_write_around(APIC_LVT0, APIC_DM_EXTINT); - - unlock_ExtINT_logic(); - - if (timer_irq_works()) { - printk(" works.\n"); - return; - } - printk(" failed :(.\n"); - panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " - "report. Then try booting with the 'noapic' option"); -} -#else -#define check_timer() ((void)0) -#endif - -/* - * - * IRQ's that are handled by the PIC in the MPS IOAPIC case. - * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. - * Linux doesn't really care, as it's not actually used - * for any interrupt handling anyway. - */ -#define PIC_IRQS (1 << PIC_CASCADE_IR) - -void __init setup_IO_APIC(void) -{ - enable_IO_APIC(); - - if (acpi_ioapic) - io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ - else - io_apic_irqs = ~PIC_IRQS; - - printk("ENABLING IO-APIC IRQs\n"); - - /* - * Set up IO-APIC IRQ routing. 
- */ - if (!acpi_ioapic) - setup_ioapic_ids_from_mpc(); -#ifndef CONFIG_XEN - sync_Arb_IDs(); -#endif - setup_IO_APIC_irqs(); - init_IO_APIC_traps(); - check_timer(); - if (!acpi_ioapic) - print_IO_APIC(); -} - -/* - * Called after all the initialization is done. If we didnt find any - * APIC bugs then we can allow the modify fast path - */ - -static int __init io_apic_bug_finalize(void) -{ - if(sis_apic_bug == -1) - sis_apic_bug = 0; - return 0; -} - -late_initcall(io_apic_bug_finalize); - -struct sysfs_ioapic_data { - struct sys_device dev; - struct IO_APIC_route_entry entry[0]; -}; -static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; - -static int ioapic_suspend(struct sys_device *dev, u32 state) -{ - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; - unsigned long flags; - int i; - - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; - spin_lock_irqsave(&ioapic_lock, flags); - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { - *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i); - *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i); - } - spin_unlock_irqrestore(&ioapic_lock, flags); - - return 0; -} - -static int ioapic_resume(struct sys_device *dev) -{ - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; - unsigned long flags; - union IO_APIC_reg_00 reg_00; - int i; - - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(dev->id, 0); - if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { - reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; - io_apic_write(dev->id, 0, reg_00.raw); - } - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { - io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1)); - io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0)); - } - spin_unlock_irqrestore(&ioapic_lock, flags); - - return 0; -} - -static struct sysdev_class ioapic_sysdev_class = { - set_kset_name("ioapic"), - .suspend = ioapic_suspend, - .resume = ioapic_resume, -}; - -static int __init ioapic_init_sysfs(void) -{ - struct sys_device * dev; - int i, size, error = 0; - - error = sysdev_class_register(&ioapic_sysdev_class); - if (error) - return error; - - for (i = 0; i < nr_ioapics; i++ ) { - size = sizeof(struct sys_device) + nr_ioapic_registers[i] - * sizeof(struct IO_APIC_route_entry); - mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); - if (!mp_ioapic_data[i]) { - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - memset(mp_ioapic_data[i], 0, size); - dev = &mp_ioapic_data[i]->dev; - dev->id = i; - dev->cls = &ioapic_sysdev_class; - error = sysdev_register(dev); - if (error) { - kfree(mp_ioapic_data[i]); - mp_ioapic_data[i] = NULL; - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - } - - return 0; -} - -device_initcall(ioapic_init_sysfs); - -/* -------------------------------------------------------------------------- - ACPI-based IOAPIC Configuration - -------------------------------------------------------------------------- */ - -#ifdef CONFIG_ACPI_BOOT - -int __init io_apic_get_unique_id (int ioapic, int apic_id) -{ -#ifndef CONFIG_XEN - union IO_APIC_reg_00 reg_00; - static physid_mask_t apic_id_map = PHYSID_MASK_NONE; - physid_mask_t tmp; - unsigned long flags; - int i = 0; - - /* - * The P4 platform supports up to 256 APIC IDs on two separate APIC - * buses (one for LAPICs, one for IOAPICs), where predecessors only - * 
supports up to 16 on one shared APIC bus. - * - * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full - * advantage of new APIC bus architecture. - */ - - if (physids_empty(apic_id_map)) - apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - if (apic_id >= get_physical_broadcast()) { - printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " - "%d\n", ioapic, apic_id, reg_00.bits.ID); - apic_id = reg_00.bits.ID; - } - - /* - * Every APIC in a system must have a unique ID or we get lots of nice - * 'stuck on smp_invalidate_needed IPI wait' messages. - */ - if (check_apicid_used(apic_id_map, apic_id)) { - - for (i = 0; i < get_physical_broadcast(); i++) { - if (!check_apicid_used(apic_id_map, i)) - break; - } - - if (i == get_physical_broadcast()) - panic("Max apic_id exceeded!\n"); - - printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " - "trying %d\n", ioapic, apic_id, i); - - apic_id = i; - } - - tmp = apicid_to_cpu_present(apic_id); - physids_or(apic_id_map, apic_id_map, tmp); - - if (reg_00.bits.ID != apic_id) { - reg_00.bits.ID = apic_id; - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic, 0, reg_00.raw); - reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - /* Sanity check */ - if (reg_00.bits.ID != apic_id) - panic("IOAPIC[%d]: Unable change apic_id!\n", ioapic); - } - - apic_printk(APIC_VERBOSE, KERN_INFO - "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); -#endif /* !CONFIG_XEN */ - - return apic_id; -} - - -int __init io_apic_get_version (int ioapic) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return reg_01.bits.version; -} - - -int __init io_apic_get_redir_entries (int ioapic) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return reg_01.bits.entries; -} - - -int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low) -{ - struct IO_APIC_route_entry entry; - unsigned long flags; - - if (!IO_APIC_IRQ(irq)) { - printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", - ioapic); - return -EINVAL; - } - - /* - * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. - * Note that we mask (disable) IRQs now -- these get enabled when the - * corresponding device driver registers for this IRQ. 
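[Editorial note: io_apic_get_unique_id() above resolves ID collisions by scanning for the first physical ID below the broadcast value that is still free in the accumulated map. The scan as a stand-alone helper — hypothetical; the deleted code panics when no ID is found:

	static int find_free_apic_id(physid_mask_t map)
	{
		int i;

		for (i = 0; i < get_physical_broadcast(); i++)
			if (!check_apicid_used(map, i))
				return i;
		return -1;	/* all IDs in use */
	}
]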
- */ - - memset(&entry,0,sizeof(entry)); - - entry.delivery_mode = INT_DELIVERY_MODE; - entry.dest_mode = INT_DEST_MODE; - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); - entry.trigger = edge_level; - entry.polarity = active_high_low; - entry.mask = 1; - - /* - * IRQs < 16 are already in the irq_2_pin[] map - */ - if (irq >= 16) - add_pin_to_irq(irq, ioapic, pin); - - entry.vector = assign_irq_vector(irq); - - apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry " - "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic, - mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, - edge_level, active_high_low); - - ioapic_register_intr(irq, entry.vector, edge_level); - - if (!ioapic && (irq < 16)) - disable_8259A_irq(irq); - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return 0; -} - -#endif /*CONFIG_ACPI_BOOT*/ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/kernel/ioport.c --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/ioport.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,129 +0,0 @@ -/* - * linux/arch/i386/kernel/ioport.c - * - * This contains the io-permission bitmap code - written by obz, with changes - * by Linus. - */ - -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/ioport.h> -#include <linux/smp.h> -#include <linux/smp_lock.h> -#include <linux/stddef.h> -#include <linux/slab.h> -#include <linux/thread_info.h> -#include <asm-xen/xen-public/physdev.h> - -/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ -static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) -{ - unsigned long mask; - unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG); - unsigned int low_index = base & (BITS_PER_LONG-1); - int length = low_index + extent; - - if (low_index != 0) { - mask = (~0UL << low_index); - if (length < BITS_PER_LONG) - mask &= ~(~0UL << length); - if (new_value) - *bitmap_base++ |= mask; - else - *bitmap_base++ &= ~mask; - length -= BITS_PER_LONG; - } - - mask = (new_value ? ~0UL : 0UL); - while (length >= BITS_PER_LONG) { - *bitmap_base++ = mask; - length -= BITS_PER_LONG; - } - - if (length > 0) { - mask = ~(~0UL << length); - if (new_value) - *bitmap_base++ |= mask; - else - *bitmap_base++ &= ~mask; - } -} - - -/* - * this changes the io permissions bitmap in the current task. - */ -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) -{ - struct thread_struct * t = ¤t->thread; - unsigned long *bitmap; - physdev_op_t op; - - if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) - return -EINVAL; - if (turn_on && !capable(CAP_SYS_RAWIO)) - return -EPERM; - - /* - * If it's the first ioperm() call in this thread's lifetime, set the - * IO bitmap up. 
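[Editorial note: set_bitmap() in the ioport.c hunk above splits the requested range into a partial leading word, a run of whole words, and a partial trailing word. A worked example with hypothetical values, assuming 32-bit longs: base=60, extent=10 gives low_index=28, so the first affected word gets mask ~0UL << 28 (bits 28-31), and the remaining length of 6 sets bits 0-5 of the next word via ~(~0UL << 6):

	unsigned long map[3] = { 0, 0, 0 };

	/* Sets bits 60..69 overall: map[1] becomes 0xf0000000 and
	 * map[2] becomes 0x3f (with BITS_PER_LONG == 32). */
	set_bitmap(map, 60, 10, 1);
]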
ioperm() is much less timing critical than clone(), - * this is why we delay this operation until now: - */ - if (!t->io_bitmap_ptr) { - bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); - if (!bitmap) - return -ENOMEM; - - memset(bitmap, 0xff, IO_BITMAP_BYTES); - t->io_bitmap_ptr = bitmap; - - op.cmd = PHYSDEVOP_SET_IOBITMAP; - op.u.set_iobitmap.bitmap = (unsigned long)bitmap; - op.u.set_iobitmap.nr_ports = IO_BITMAP_BITS; - HYPERVISOR_physdev_op(&op); - } - - set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); - - return 0; -} - -/* - * sys_iopl has to be used when you want to access the IO ports - * beyond the 0x3ff range: to get the full 65536 ports bitmapped - * you'd need 8kB of bitmaps/process, which is a bit excessive. - * - * Here we just change the eflags value on the stack: we allow - * only the super-user to do it. This depends on the stack-layout - * on system-call entry - see also fork() and the signal handling - * code. - */ - -asmlinkage long sys_iopl(unsigned int new_io_pl) -{ - unsigned int old_io_pl = current->thread.io_pl; - physdev_op_t op; - - if (new_io_pl > 3) - return -EINVAL; - - /* Need "raw I/O" privileges for direct port access. */ - if ((new_io_pl > old_io_pl) && !capable(CAP_SYS_RAWIO)) - return -EPERM; - - /* Maintain OS privileges even if user attempts to relinquish them. */ - if (new_io_pl == 0) - new_io_pl = 1; - - /* Change our version of the privilege levels. */ - current->thread.io_pl = new_io_pl; - - /* Force the change at ring 0. */ - op.cmd = PHYSDEVOP_SET_IOPL; - op.u.set_iopl.iopl = new_io_pl; - HYPERVISOR_physdev_op(&op); - - return 0; -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/kernel/ldt.c --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/ldt.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,275 +0,0 @@ -/* - * linux/kernel/ldt.c - * - * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds - * Copyright (C) 1999 Ingo Molnar <mingo@xxxxxxxxxx> - */ - -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/smp_lock.h> -#include <linux/vmalloc.h> -#include <linux/slab.h> - -#include <asm/uaccess.h> -#include <asm/system.h> -#include <asm/ldt.h> -#include <asm/desc.h> -#include <asm/mmu_context.h> - -#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ -static void flush_ldt(void *null) -{ - if (current->active_mm) - load_LDT(¤t->active_mm->context); -} -#endif - -static int alloc_ldt(mm_context_t *pc, int mincount, int reload) -{ - void *oldldt; - void *newldt; - int oldsize; - - if (mincount <= pc->size) - return 0; - oldsize = pc->size; - mincount = (mincount+511)&(~511); - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); - else - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); - - if (!newldt) - return -ENOMEM; - - if (oldsize) - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); - oldldt = pc->ldt; - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); - pc->ldt = newldt; - wmb(); - pc->size = mincount; - wmb(); - - if (reload) { -#ifdef CONFIG_SMP - cpumask_t mask; - preempt_disable(); -#endif - make_pages_readonly(pc->ldt, (pc->size * LDT_ENTRY_SIZE) / - PAGE_SIZE); - load_LDT(pc); -#ifdef CONFIG_SMP - mask = cpumask_of_cpu(smp_processor_id()); - if (!cpus_equal(current->mm->cpu_vm_mask, mask)) - smp_call_function(flush_ldt, NULL, 1, 1); - preempt_enable(); -#endif - } - if (oldsize) { - make_pages_writable(oldldt, 
(oldsize * LDT_ENTRY_SIZE) / - PAGE_SIZE); - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(oldldt); - else - kfree(oldldt); - } - return 0; -} - -static inline int copy_ldt(mm_context_t *new, mm_context_t *old) -{ - int err = alloc_ldt(new, old->size, 0); - if (err < 0) - return err; - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); - make_pages_readonly(new->ldt, (new->size * LDT_ENTRY_SIZE) / - PAGE_SIZE); - return 0; -} - -/* - * we do not have to muck with descriptors here, that is - * done in switch_mm() as needed. - */ -int init_new_context(struct task_struct *tsk, struct mm_struct *mm) -{ - struct mm_struct * old_mm; - int retval = 0; - - memset(&mm->context, 0, sizeof(mm->context)); - init_MUTEX(&mm->context.sem); - old_mm = current->mm; - if (old_mm && old_mm->context.size > 0) { - down(&old_mm->context.sem); - retval = copy_ldt(&mm->context, &old_mm->context); - up(&old_mm->context.sem); - } - if (retval == 0) { - spin_lock(&mm_unpinned_lock); - list_add(&mm->context.unpinned, &mm_unpinned); - spin_unlock(&mm_unpinned_lock); - } - return retval; -} - -/* - * No need to lock the MM as we are the last user - */ -void destroy_context(struct mm_struct *mm) -{ - if (mm->context.size) { - if (mm == current->active_mm) - clear_LDT(); - make_pages_writable(mm->context.ldt, - (mm->context.size * LDT_ENTRY_SIZE) / - PAGE_SIZE); - if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(mm->context.ldt); - else - kfree(mm->context.ldt); - mm->context.size = 0; - } - if (!mm->context.pinned) { - spin_lock(&mm_unpinned_lock); - list_del(&mm->context.unpinned); - spin_unlock(&mm_unpinned_lock); - } -} - -static int read_ldt(void __user * ptr, unsigned long bytecount) -{ - int err; - unsigned long size; - struct mm_struct * mm = current->mm; - - if (!mm->context.size) - return 0; - if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) - bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; - - down(&mm->context.sem); - size = mm->context.size*LDT_ENTRY_SIZE; - if (size > bytecount) - size = bytecount; - - err = 0; - if (copy_to_user(ptr, mm->context.ldt, size)) - err = -EFAULT; - up(&mm->context.sem); - if (err < 0) - goto error_return; - if (size != bytecount) { - /* zero-fill the rest */ - if (clear_user(ptr+size, bytecount-size) != 0) { - err = -EFAULT; - goto error_return; - } - } - return bytecount; -error_return: - return err; -} - -static int read_default_ldt(void __user * ptr, unsigned long bytecount) -{ - int err; - unsigned long size; - void *address; - - err = 0; - address = &default_ldt[0]; - size = 5*sizeof(struct desc_struct); - if (size > bytecount) - size = bytecount; - - err = size; - if (copy_to_user(ptr, address, size)) - err = -EFAULT; - - return err; -} - -static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) -{ - struct mm_struct * mm = current->mm; - __u32 entry_1, entry_2, *lp; - unsigned long mach_lp; - int error; - struct user_desc ldt_info; - - error = -EINVAL; - if (bytecount != sizeof(ldt_info)) - goto out; - error = -EFAULT; - if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) - goto out; - - error = -EINVAL; - if (ldt_info.entry_number >= LDT_ENTRIES) - goto out; - if (ldt_info.contents == 3) { - if (oldmode) - goto out; - if (ldt_info.seg_not_present == 0) - goto out; - } - - down(&mm->context.sem); - if (ldt_info.entry_number >= mm->context.size) { - error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); - if (error < 0) - goto out_unlock; - } - - lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); - mach_lp = 
arbitrary_virt_to_machine(lp); - - /* Allow LDTs to be cleared by the user. */ - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { - if (oldmode || LDT_empty(&ldt_info)) { - entry_1 = 0; - entry_2 = 0; - goto install; - } - } - - entry_1 = LDT_entry_a(&ldt_info); - entry_2 = LDT_entry_b(&ldt_info); - if (oldmode) - entry_2 &= ~(1 << 20); - - /* Install the new entry ... */ -install: - error = HYPERVISOR_update_descriptor(mach_lp, entry_1, entry_2); - -out_unlock: - up(&mm->context.sem); -out: - return error; -} - -asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) -{ - int ret = -ENOSYS; - - switch (func) { - case 0: - ret = read_ldt(ptr, bytecount); - break; - case 1: - ret = write_ldt(ptr, bytecount, 1); - break; - case 2: - ret = read_default_ldt(ptr, bytecount); - break; - case 0x11: - ret = write_ldt(ptr, bytecount, 0); - break; - } - return ret; -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/kernel/microcode.c --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/microcode.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,163 +0,0 @@ -/* - * Intel CPU Microcode Update Driver for Linux - * - * Copyright (C) 2000-2004 Tigran Aivazian - * - * This driver allows to upgrade microcode on Intel processors - * belonging to IA-32 family - PentiumPro, Pentium II, - * Pentium III, Xeon, Pentium 4, etc. - * - * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual, - * Order Number 245472 or free download from: - * - * http://developer.intel.com/design/pentium4/manuals/245472.htm - * - * For more information, go to http://www.urbanmyth.org/microcode - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -//#define DEBUG /* pr_debug */ -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/sched.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/miscdevice.h> -#include <linux/spinlock.h> -#include <linux/mm.h> -#include <linux/syscalls.h> - -#include <asm/msr.h> -#include <asm/uaccess.h> -#include <asm/processor.h> - -MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver"); -MODULE_AUTHOR("Tigran Aivazian <tigran@xxxxxxxxxxx>"); -MODULE_LICENSE("GPL"); - -#define MICROCODE_VERSION "1.14-xen" - -#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */ -#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */ -#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */ - -/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ -static DECLARE_MUTEX(microcode_sem); - -static void __user *user_buffer; /* user area microcode data buffer */ -static unsigned int user_buffer_size; /* it's size */ - -static int microcode_open (struct inode *unused1, struct file *unused2) -{ - return capable(CAP_SYS_RAWIO) ? 
0 : -EPERM; -} - - -static int do_microcode_update (void) -{ - int err; - dom0_op_t op; - - err = sys_mlock((unsigned long)user_buffer, user_buffer_size); - if (err != 0) - return err; - - op.cmd = DOM0_MICROCODE; - op.u.microcode.data = user_buffer; - op.u.microcode.length = user_buffer_size; - err = HYPERVISOR_dom0_op(&op); - - (void)sys_munlock((unsigned long)user_buffer, user_buffer_size); - - return err; -} - -static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos) -{ - ssize_t ret; - - if (len < DEFAULT_UCODE_TOTALSIZE) { - printk(KERN_ERR "microcode: not enough data\n"); - return -EINVAL; - } - - if ((len >> PAGE_SHIFT) > num_physpages) { - printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages); - return -EINVAL; - } - - down(µcode_sem); - - user_buffer = (void __user *) buf; - user_buffer_size = (int) len; - - ret = do_microcode_update(); - if (!ret) - ret = (ssize_t)len; - - up(µcode_sem); - - return ret; -} - -static int microcode_ioctl (struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg) -{ - switch (cmd) { - /* - * XXX: will be removed after microcode_ctl - * is updated to ignore failure of this ioctl() - */ - case MICROCODE_IOCFREE: - return 0; - default: - return -EINVAL; - } - return -EINVAL; -} - -static struct file_operations microcode_fops = { - .owner = THIS_MODULE, - .write = microcode_write, - .ioctl = microcode_ioctl, - .open = microcode_open, -}; - -static struct miscdevice microcode_dev = { - .minor = MICROCODE_MINOR, - .name = "microcode", - .devfs_name = "cpu/microcode", - .fops = µcode_fops, -}; - -static int __init microcode_init (void) -{ - int error; - - error = misc_register(µcode_dev); - if (error) { - printk(KERN_ERR - "microcode: can't misc_register on minor=%d\n", - MICROCODE_MINOR); - return error; - } - - printk(KERN_INFO - "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@xxxxxxxxxxx>\n"); - return 0; -} - -static void __exit microcode_exit (void) -{ - misc_deregister(µcode_dev); - printk(KERN_INFO "IA-32 Microcode Update Driver v" MICROCODE_VERSION " unregistered\n"); -} - -module_init(microcode_init) -module_exit(microcode_exit) -MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/kernel/mpparse.c --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/mpparse.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,1115 +0,0 @@ -/* - * Intel Multiprocessor Specification 1.1 and 1.4 - * compliant MP-table parsing routines. - * - * (c) 1995 Alan Cox, Building #3 <alan@xxxxxxxxxx> - * (c) 1998, 1999, 2000 Ingo Molnar <mingo@xxxxxxxxxx> - * - * Fixes - * Erich Boleyn : MP v1.4 and additional changes. - * Alan Cox : Added EBDA scanning - * Ingo Molnar : various cleanups and rewrites - * Maciej W. 
Rozycki: Bits for default MP configurations - * Paul Diefenbaugh: Added full ACPI support - */ - -#include <linux/mm.h> -#include <linux/irq.h> -#include <linux/init.h> -#include <linux/acpi.h> -#include <linux/delay.h> -#include <linux/config.h> -#include <linux/bootmem.h> -#include <linux/smp_lock.h> -#include <linux/kernel_stat.h> -#include <linux/mc146818rtc.h> -#include <linux/bitops.h> - -#include <asm/smp.h> -#include <asm/acpi.h> -#include <asm/mtrr.h> -#include <asm/mpspec.h> -#include <asm/io_apic.h> - -#include <mach_apic.h> -#include <mach_mpparse.h> -#include <bios_ebda.h> - -/* Have we found an MP table */ -int smp_found_config; -unsigned int __initdata maxcpus = NR_CPUS; - -/* - * Various Linux-internal data structures created from the - * MP-table. - */ -int apic_version [MAX_APICS]; -int mp_bus_id_to_type [MAX_MP_BUSSES]; -int mp_bus_id_to_node [MAX_MP_BUSSES]; -int mp_bus_id_to_local [MAX_MP_BUSSES]; -int quad_local_to_mp_bus_id [NR_CPUS/4][4]; -int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; -int mp_current_pci_id; - -/* I/O APIC entries */ -struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; - -/* # of MP IRQ source entries */ -struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; - -/* MP IRQ source entries */ -int mp_irq_entries; - -int nr_ioapics; - -int pic_mode; -unsigned long mp_lapic_addr; - -/* Processor that is doing the boot up */ -unsigned int boot_cpu_physical_apicid = -1U; -unsigned int boot_cpu_logical_apicid = -1U; -/* Internal processor count */ -static unsigned int __initdata num_processors; - -/* Bitmask of physically existing CPUs */ -physid_mask_t phys_cpu_present_map; - -u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; - -/* - * Intel MP BIOS table parsing routines: - */ - - -/* - * Checksum an MP configuration block. - */ - -static int __init mpf_checksum(unsigned char *mp, int len) -{ - int sum = 0; - - while (len--) - sum += *mp++; - - return sum & 0xFF; -} - -/* - * Have to match translation table entries to main table entries by counter - * hence the mpc_record variable .... can't see a less disgusting way of - * doing this .... 
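[Editorial note: mpf_checksum() above reduces a table to its bytewise sum modulo 256; the MP specification arranges the stored checksum byte so that a valid table sums to zero. Usage sketch — a hypothetical wrapper:

	/* A table is intact iff all its bytes, checksum included, sum to 0. */
	static int __init mp_table_valid(unsigned char *table, int len)
	{
		return mpf_checksum(table, len) == 0;
	}
]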
- */ - -static int mpc_record; -static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __initdata; - -#ifdef CONFIG_X86_NUMAQ -static int MP_valid_apicid(int apicid, int version) -{ - return hweight_long(apicid & 0xf) == 1 && (apicid >> 4) != 0xf; -} -#elif !defined(CONFIG_XEN) -static int MP_valid_apicid(int apicid, int version) -{ - if (version >= 0x14) - return apicid < 0xff; - else - return apicid < 0xf; -} -#endif - -#ifndef CONFIG_XEN -void __init MP_processor_info (struct mpc_config_processor *m) -{ - int ver, apicid; - physid_mask_t tmp; - - if (!(m->mpc_cpuflag & CPU_ENABLED)) - return; - - apicid = mpc_apic_id(m, translation_table[mpc_record]); - - if (m->mpc_featureflag&(1<<0)) - Dprintk(" Floating point unit present.\n"); - if (m->mpc_featureflag&(1<<7)) - Dprintk(" Machine Exception supported.\n"); - if (m->mpc_featureflag&(1<<8)) - Dprintk(" 64 bit compare & exchange supported.\n"); - if (m->mpc_featureflag&(1<<9)) - Dprintk(" Internal APIC present.\n"); - if (m->mpc_featureflag&(1<<11)) - Dprintk(" SEP present.\n"); - if (m->mpc_featureflag&(1<<12)) - Dprintk(" MTRR present.\n"); - if (m->mpc_featureflag&(1<<13)) - Dprintk(" PGE present.\n"); - if (m->mpc_featureflag&(1<<14)) - Dprintk(" MCA present.\n"); - if (m->mpc_featureflag&(1<<15)) - Dprintk(" CMOV present.\n"); - if (m->mpc_featureflag&(1<<16)) - Dprintk(" PAT present.\n"); - if (m->mpc_featureflag&(1<<17)) - Dprintk(" PSE present.\n"); - if (m->mpc_featureflag&(1<<18)) - Dprintk(" PSN present.\n"); - if (m->mpc_featureflag&(1<<19)) - Dprintk(" Cache Line Flush Instruction present.\n"); - /* 20 Reserved */ - if (m->mpc_featureflag&(1<<21)) - Dprintk(" Debug Trace and EMON Store present.\n"); - if (m->mpc_featureflag&(1<<22)) - Dprintk(" ACPI Thermal Throttle Registers present.\n"); - if (m->mpc_featureflag&(1<<23)) - Dprintk(" MMX present.\n"); - if (m->mpc_featureflag&(1<<24)) - Dprintk(" FXSR present.\n"); - if (m->mpc_featureflag&(1<<25)) - Dprintk(" XMM present.\n"); - if (m->mpc_featureflag&(1<<26)) - Dprintk(" Willamette New Instructions present.\n"); - if (m->mpc_featureflag&(1<<27)) - Dprintk(" Self Snoop present.\n"); - if (m->mpc_featureflag&(1<<28)) - Dprintk(" HT present.\n"); - if (m->mpc_featureflag&(1<<29)) - Dprintk(" Thermal Monitor present.\n"); - /* 30, 31 Reserved */ - - - if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { - Dprintk(" Bootup CPU\n"); - boot_cpu_physical_apicid = m->mpc_apicid; - boot_cpu_logical_apicid = apicid; - } - - if (num_processors >= NR_CPUS) { - printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." - " Processor ignored.\n", NR_CPUS); - return; - } - - if (num_processors >= maxcpus) { - printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." - " Processor ignored.\n", maxcpus); - return; - } - num_processors++; - ver = m->mpc_apicver; - - if (!MP_valid_apicid(apicid, ver)) { - printk(KERN_WARNING "Processor #%d INVALID. (Max ID: %d).\n", - m->mpc_apicid, MAX_APICS); - --num_processors; - return; - } - - tmp = apicid_to_cpu_present(apicid); - physids_or(phys_cpu_present_map, phys_cpu_present_map, tmp); - - /* - * Validate version - */ - if (ver == 0x0) { - printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. 
(tell your hw vendor)\n", m->mpc_apicid);
- ver = 0x10;
- }
- apic_version[m->mpc_apicid] = ver;
- bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
-}
-#else
-void __init MP_processor_info (struct mpc_config_processor *m)
-{
- num_processors++;
-}
-#endif /* CONFIG_XEN */
-
-static void __init MP_bus_info (struct mpc_config_bus *m)
-{
- char str[7];
-
- memcpy(str, m->mpc_bustype, 6);
- str[6] = 0;
-
- mpc_oem_bus_info(m, str, translation_table[mpc_record]);
-
- if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
- } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) {
- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
- } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) {
- mpc_oem_pci_bus(m, translation_table[mpc_record]);
- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
- mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
- mp_current_pci_id++;
- } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
- } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) {
- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98;
- } else {
- printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
- }
-}
-
-static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
-{
- if (!(m->mpc_flags & MPC_APIC_USABLE))
- return;
-
- printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
- m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
- if (nr_ioapics >= MAX_IO_APICS) {
- printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
- MAX_IO_APICS, nr_ioapics);
- panic("Recompile kernel with bigger MAX_IO_APICS!\n");
- }
- if (!m->mpc_apicaddr) {
- printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
- " found in MP table, skipping!\n");
- return;
- }
- mp_ioapics[nr_ioapics] = *m;
- nr_ioapics++;
-}
-
-static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
-{
- mp_irqs [mp_irq_entries] = *m;
- Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
- " IRQ %02x, APIC ID %x, APIC INT %02x\n",
- m->mpc_irqtype, m->mpc_irqflag & 3,
- (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
- m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
- if (++mp_irq_entries == MAX_IRQ_SOURCES)
- panic("Max # of irq sources exceeded!!\n");
-}
-
-static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
-{
- Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
- " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
- m->mpc_irqtype, m->mpc_irqflag & 3,
- (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
- m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
- /*
- * Well it seems all SMP boards in existence
- * use ExtINT/LVT1 == LINT0 and
- * NMI/LVT2 == LINT1 - the following check
- * will show us if this assumption is false.
- * Until then we do not have to add baggage.
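- *
- * (Editor's note, not part of the original file: in the Dprintk()
- * above, mpc_irqflag & 3 is the MP-spec polarity field and
- * (mpc_irqflag >> 2) & 3 the trigger field, with 0 meaning
- * "conforms to the bus default".)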
- */ - if ((m->mpc_irqtype == mp_ExtINT) && - (m->mpc_destapiclint != 0)) - BUG(); - if ((m->mpc_irqtype == mp_NMI) && - (m->mpc_destapiclint != 1)) - BUG(); -} - -#ifdef CONFIG_X86_NUMAQ -static void __init MP_translation_info (struct mpc_config_translation *m) -{ - printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local); - - if (mpc_record >= MAX_MPC_ENTRY) - printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); - else - translation_table[mpc_record] = m; /* stash this for later */ - if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) - node_set_online(m->trans_quad); -} - -/* - * Read/parse the MPC oem tables - */ - -static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \ - unsigned short oemsize) -{ - int count = sizeof (*oemtable); /* the header size */ - unsigned char *oemptr = ((unsigned char *)oemtable)+count; - - mpc_record = 0; - printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable); - if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4)) - { - printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", - oemtable->oem_signature[0], - oemtable->oem_signature[1], - oemtable->oem_signature[2], - oemtable->oem_signature[3]); - return; - } - if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length)) - { - printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); - return; - } - while (count < oemtable->oem_length) { - switch (*oemptr) { - case MP_TRANSLATION: - { - struct mpc_config_translation *m= - (struct mpc_config_translation *)oemptr; - MP_translation_info(m); - oemptr += sizeof(*m); - count += sizeof(*m); - ++mpc_record; - break; - } - default: - { - printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr); - return; - } - } - } -} - -static inline void mps_oem_check(struct mp_config_table *mpc, char *oem, - char *productid) -{ - if (strncmp(oem, "IBM NUMA", 8)) - printk("Warning! May not be a NUMA-Q system!\n"); - if (mpc->mpc_oemptr) - smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr, - mpc->mpc_oemsize); -} -#endif /* CONFIG_X86_NUMAQ */ - -/* - * Read/parse the MPC - */ - -static int __init smp_read_mpc(struct mp_config_table *mpc) -{ - char str[16]; - char oem[10]; - int count=sizeof(*mpc); - unsigned char *mpt=((unsigned char *)mpc)+count; - - if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { - printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n", - *(u32 *)mpc->mpc_signature); - return 0; - } - if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { - printk(KERN_ERR "SMP mptable: checksum error!\n"); - return 0; - } - if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { - printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n", - mpc->mpc_spec); - return 0; - } - if (!mpc->mpc_lapic) { - printk(KERN_ERR "SMP mptable: null local APIC address!\n"); - return 0; - } - memcpy(oem,mpc->mpc_oem,8); - oem[8]=0; - printk(KERN_INFO "OEM ID: %s ",oem); - - memcpy(str,mpc->mpc_productid,12); - str[12]=0; - printk("Product ID: %s ",str); - - mps_oem_check(mpc, oem, str); - - printk("APIC at: 0x%lX\n",mpc->mpc_lapic); - - /* - * Save the local APIC address (it might be non-default) -- but only - * if we're not using ACPI. - */ - if (!acpi_lapic) - mp_lapic_addr = mpc->mpc_lapic; - - /* - * Now process the configuration blocks. 
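- *
- * (Editor's note, not part of the original file: per the Intel MP 1.4
- * spec, processor entries are 20 bytes and the bus/ioapic/intsrc/
- * lintsrc entries 8 bytes each, which is why the walk below advances
- * mpt by sizeof the per-type struct rather than by a fixed stride.)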
- */ - mpc_record = 0; - while (count < mpc->mpc_length) { - switch(*mpt) { - case MP_PROCESSOR: - { - struct mpc_config_processor *m= - (struct mpc_config_processor *)mpt; - /* ACPI may have already provided this data */ - if (!acpi_lapic) - MP_processor_info(m); - mpt += sizeof(*m); - count += sizeof(*m); - break; - } - case MP_BUS: - { - struct mpc_config_bus *m= - (struct mpc_config_bus *)mpt; - MP_bus_info(m); - mpt += sizeof(*m); - count += sizeof(*m); - break; - } - case MP_IOAPIC: - { - struct mpc_config_ioapic *m= - (struct mpc_config_ioapic *)mpt; - MP_ioapic_info(m); - mpt+=sizeof(*m); - count+=sizeof(*m); - break; - } - case MP_INTSRC: - { - struct mpc_config_intsrc *m= - (struct mpc_config_intsrc *)mpt; - - MP_intsrc_info(m); - mpt+=sizeof(*m); - count+=sizeof(*m); - break; - } - case MP_LINTSRC: - { - struct mpc_config_lintsrc *m= - (struct mpc_config_lintsrc *)mpt; - MP_lintsrc_info(m); - mpt+=sizeof(*m); - count+=sizeof(*m); - break; - } - default: - { - count = mpc->mpc_length; - break; - } - } - ++mpc_record; - } - clustered_apic_check(); - if (!num_processors) - printk(KERN_ERR "SMP mptable: no processors registered!\n"); - return num_processors; -} - -static int __init ELCR_trigger(unsigned int irq) -{ - unsigned int port; - - port = 0x4d0 + (irq >> 3); - return (inb(port) >> (irq & 7)) & 1; -} - -static void __init construct_default_ioirq_mptable(int mpc_default_type) -{ - struct mpc_config_intsrc intsrc; - int i; - int ELCR_fallback = 0; - - intsrc.mpc_type = MP_INTSRC; - intsrc.mpc_irqflag = 0; /* conforming */ - intsrc.mpc_srcbus = 0; - intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; - - intsrc.mpc_irqtype = mp_INT; - - /* - * If true, we have an ISA/PCI system with no IRQ entries - * in the MP table. To prevent the PCI interrupts from being set up - * incorrectly, we try to use the ELCR. The sanity check to see if - * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can - * never be level sensitive, so we simply see if the ELCR agrees. - * If it does, we assume it's valid. - */ - if (mpc_default_type == 5) { - printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n"); - - if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13)) - printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n"); - else { - printk(KERN_INFO "Using ELCR to identify PCI interrupts\n"); - ELCR_fallback = 1; - } - } - - for (i = 0; i < 16; i++) { - switch (mpc_default_type) { - case 2: - if (i == 0 || i == 13) - continue; /* IRQ0 & IRQ13 not connected */ - /* fall through */ - default: - if (i == 2) - continue; /* IRQ2 is never connected */ - } - - if (ELCR_fallback) { - /* - * If the ELCR indicates a level-sensitive interrupt, we - * copy that information over to the MP table in the - * irqflag field (level sensitive, active high polarity). - */ - if (ELCR_trigger(i)) - intsrc.mpc_irqflag = 13; - else - intsrc.mpc_irqflag = 0; - } - - intsrc.mpc_srcbusirq = i; - intsrc.mpc_dstirq = i ? 
i : 2; /* IRQ0 to INTIN2 */ - MP_intsrc_info(&intsrc); - } - - intsrc.mpc_irqtype = mp_ExtINT; - intsrc.mpc_srcbusirq = 0; - intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */ - MP_intsrc_info(&intsrc); -} - -static inline void __init construct_default_ISA_mptable(int mpc_default_type) -{ - struct mpc_config_processor processor; - struct mpc_config_bus bus; - struct mpc_config_ioapic ioapic; - struct mpc_config_lintsrc lintsrc; - int linttypes[2] = { mp_ExtINT, mp_NMI }; - int i; - - /* - * local APIC has default address - */ - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - - /* - * 2 CPUs, numbered 0 & 1. - */ - processor.mpc_type = MP_PROCESSOR; - /* Either an integrated APIC or a discrete 82489DX. */ - processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; - processor.mpc_cpuflag = CPU_ENABLED; - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | - (boot_cpu_data.x86_model << 4) | - boot_cpu_data.x86_mask; - processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; - processor.mpc_reserved[0] = 0; - processor.mpc_reserved[1] = 0; - for (i = 0; i < 2; i++) { - processor.mpc_apicid = i; - MP_processor_info(&processor); - } - - bus.mpc_type = MP_BUS; - bus.mpc_busid = 0; - switch (mpc_default_type) { - default: - printk("???\n"); - printk(KERN_ERR "Unknown standard configuration %d\n", - mpc_default_type); - /* fall through */ - case 1: - case 5: - memcpy(bus.mpc_bustype, "ISA ", 6); - break; - case 2: - case 6: - case 3: - memcpy(bus.mpc_bustype, "EISA ", 6); - break; - case 4: - case 7: - memcpy(bus.mpc_bustype, "MCA ", 6); - } - MP_bus_info(&bus); - if (mpc_default_type > 4) { - bus.mpc_busid = 1; - memcpy(bus.mpc_bustype, "PCI ", 6); - MP_bus_info(&bus); - } - - ioapic.mpc_type = MP_IOAPIC; - ioapic.mpc_apicid = 2; - ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; - ioapic.mpc_flags = MPC_APIC_USABLE; - ioapic.mpc_apicaddr = 0xFEC00000; - MP_ioapic_info(&ioapic); - - /* - * We set up most of the low 16 IO-APIC pins according to MPS rules. - */ - construct_default_ioirq_mptable(mpc_default_type); - - lintsrc.mpc_type = MP_LINTSRC; - lintsrc.mpc_irqflag = 0; /* conforming */ - lintsrc.mpc_srcbusid = 0; - lintsrc.mpc_srcbusirq = 0; - lintsrc.mpc_destapic = MP_APIC_ALL; - for (i = 0; i < 2; i++) { - lintsrc.mpc_irqtype = linttypes[i]; - lintsrc.mpc_destapiclint = i; - MP_lintsrc_info(&lintsrc); - } -} - -static struct intel_mp_floating *mpf_found; - -/* - * Scan the memory blocks for an SMP configuration block. - */ -void __init get_smp_config (void) -{ - struct intel_mp_floating *mpf = mpf_found; - - /* - * ACPI may be used to obtain the entire SMP configuration or just to - * enumerate/configure processors (CONFIG_ACPI_BOOT). Note that - * ACPI supports both logical (e.g. Hyper-Threading) and physical - * processors, where MPS only supports physical. - */ - if (acpi_lapic && acpi_ioapic) { - printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n"); - return; - } - else if (acpi_lapic) - printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); - - printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); - if (mpf->mpf_feature2 & (1<<7)) { - printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); - pic_mode = 1; - } else { - printk(KERN_INFO " Virtual Wire compatibility mode.\n"); - pic_mode = 0; - } - - /* - * Now see if we need to read further. 
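- *
- * (Editor's note, not part of the original file: mpf_feature1 != 0
- * names one of the MP-spec default configurations, synthesized by
- * construct_default_ISA_mptable() below; mpf_feature1 == 0 means a
- * full MPC table must be read from the address in mpf_physptr.)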
- */
- if (mpf->mpf_feature1 != 0) {
-
- printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
- construct_default_ISA_mptable(mpf->mpf_feature1);
-
- } else if (mpf->mpf_physptr) {
-
- /*
- * Read the physical hardware table. Anything here will
- * override the defaults.
- */
- if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
- smp_found_config = 0;
- printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
- printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
- return;
- }
- /*
- * If there are no explicit MP IRQ entries, then we are
- * broken. We set up most of the low 16 IO-APIC pins to
- * ISA defaults and hope it will work.
- */
- if (!mp_irq_entries) {
- struct mpc_config_bus bus;
-
- printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
-
- bus.mpc_type = MP_BUS;
- bus.mpc_busid = 0;
- memcpy(bus.mpc_bustype, "ISA ", 6);
- MP_bus_info(&bus);
-
- construct_default_ioirq_mptable(0);
- }
-
- } else
- BUG();
-
- printk(KERN_INFO "Processors: %d\n", num_processors);
- /*
- * Only use the first configuration found.
- */
-}
-
-static int __init smp_scan_config (unsigned long base, unsigned long length)
-{
- unsigned long *bp = isa_bus_to_virt(base);
- struct intel_mp_floating *mpf;
-
- Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
- if (sizeof(*mpf) != 16)
- printk("Error: MPF size\n");
-
- while (length > 0) {
- mpf = (struct intel_mp_floating *)bp;
- if ((*bp == SMP_MAGIC_IDENT) &&
- (mpf->mpf_length == 1) &&
- !mpf_checksum((unsigned char *)bp, 16) &&
- ((mpf->mpf_specification == 1)
- || (mpf->mpf_specification == 4)) ) {
-
- smp_found_config = 1;
- printk(KERN_INFO "found SMP MP-table at %08lx\n",
- virt_to_phys(mpf));
- if (mpf->mpf_physptr) {
- /*
- * We cannot access the MPC table to compute
- * its size yet, as only a few megabytes from
- * the bottom are mapped now.
- * The PC-9800's MPC table sits at the very end
- * of physical memory, so simply reserving
- * PAGE_SIZE from mpf->mpf_physptr would trigger
- * BUG() in reserve_bootmem.
- */
- unsigned long size = PAGE_SIZE;
- unsigned long end = max_low_pfn * PAGE_SIZE;
- if (mpf->mpf_physptr + size > end)
- size = end - mpf->mpf_physptr;
- reserve_bootmem(mpf->mpf_physptr, size);
- }
-
- mpf_found = mpf;
- return 1;
- }
- bp += 4;
- length -= 16;
- }
- return 0;
-}
-
-void __init find_smp_config (void)
-{
- unsigned int address;
-
- /*
- * FIXME: Linux assumes you have 640K of base RAM..
- * this continues the error...
- *
- * 1) Scan the bottom 1K for a signature
- * 2) Scan the top 1K of base RAM
- * 3) Scan the 64K of bios
- */
- if (smp_scan_config(0x0,0x400) ||
- smp_scan_config(639*0x400,0x400) ||
- smp_scan_config(0xF0000,0x10000))
- return;
- /*
- * If it is an SMP machine we should know now, unless the
- * configuration is in an EISA/MCA bus machine with an
- * extended bios data area.
- *
- * There is a real-mode segmented pointer pointing to the
- * 4K EBDA area at 0x40E; calculate and scan it here.
- *
- * NOTE! There are Linux loaders that will corrupt the EBDA
- * area, and as such this kind of SMP config may be less
- * trustworthy, simply because the SMP table may have been
- * stomped on during early boot. These loaders are buggy and
- * should be fixed.
- *
- * MP1.4 SPEC states to only scan the first 1K of the 4K EBDA.
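- *
- * (Editor's note, a sketch and not part of the original file: the
- * real-mode word at physical 0x40E holds the EBDA segment, so the
- * get_bios_ebda() helper used below boils down to something like
- *
- *     address = *(unsigned short *)phys_to_virt(0x40E);
- *     address <<= 4;    segment -> physical address
- *
- * assuming the usual bios_ebda.h implementation.)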
- */ - - address = get_bios_ebda(); - if (address) - smp_scan_config(address, 0x400); -} - -/* -------------------------------------------------------------------------- - ACPI-based MP Configuration - -------------------------------------------------------------------------- */ - -#ifdef CONFIG_ACPI_BOOT - -void __init mp_register_lapic_address ( - u64 address) -{ -#ifndef CONFIG_XEN - mp_lapic_addr = (unsigned long) address; - - if (boot_cpu_physical_apicid == -1U) - boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); - - Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); -#endif -} - - -void __init mp_register_lapic ( - u8 id, - u8 enabled) -{ - struct mpc_config_processor processor; - int boot_cpu = 0; - - if (MAX_APICS - id <= 0) { - printk(KERN_WARNING "Processor #%d invalid (max %d)\n", - id, MAX_APICS); - return; - } - - if (id == boot_cpu_physical_apicid) - boot_cpu = 1; - -#ifndef CONFIG_XEN - processor.mpc_type = MP_PROCESSOR; - processor.mpc_apicid = id; - processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR)); - processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); - processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | - (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; - processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; - processor.mpc_reserved[0] = 0; - processor.mpc_reserved[1] = 0; -#endif - - MP_processor_info(&processor); -} - -#if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_ACPI_INTERPRETER) || defined(CONFIG_ACPI_BOOT)) - -#define MP_ISA_BUS 0 -#define MP_MAX_IOAPIC_PIN 127 - -struct mp_ioapic_routing { - int apic_id; - int gsi_base; - int gsi_end; - u32 pin_programmed[4]; -} mp_ioapic_routing[MAX_IO_APICS]; - - -static int mp_find_ioapic ( - int gsi) -{ - int i = 0; - - /* Find the IOAPIC that manages this GSI. */ - for (i = 0; i < nr_ioapics; i++) { - if ((gsi >= mp_ioapic_routing[i].gsi_base) - && (gsi <= mp_ioapic_routing[i].gsi_end)) - return i; - } - - printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); - - return -1; -} - - -void __init mp_register_ioapic ( - u8 id, - u32 address, - u32 gsi_base) -{ - int idx = 0; - - if (nr_ioapics >= MAX_IO_APICS) { - printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " - "(found %d)\n", MAX_IO_APICS, nr_ioapics); - panic("Recompile kernel with bigger MAX_IO_APICS!\n"); - } - if (!address) { - printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" - " found in MADT table, skipping!\n"); - return; - } - - idx = nr_ioapics++; - - mp_ioapics[idx].mpc_type = MP_IOAPIC; - mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; - mp_ioapics[idx].mpc_apicaddr = address; - - mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id); - mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); - - /* - * Build basic GSI lookup table to facilitate gsi->io_apic lookups - * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 
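- *
- * (Editor's note, not part of the original file: with gsi_base and
- * gsi_end recorded, a GSI maps back to an I/O APIC pin as
- *
- *     ioapic = mp_find_ioapic(gsi);
- *     pin    = gsi - mp_ioapic_routing[ioapic].gsi_base;
- *
- * which is exactly the lookup mp_override_legacy_irq() and
- * mp_register_gsi() perform below.)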
- */ - mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; - mp_ioapic_routing[idx].gsi_base = gsi_base; - mp_ioapic_routing[idx].gsi_end = gsi_base + - io_apic_get_redir_entries(idx); - - printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, " - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, - mp_ioapic_routing[idx].gsi_base, - mp_ioapic_routing[idx].gsi_end); - - return; -} - - -void __init mp_override_legacy_irq ( - u8 bus_irq, - u8 polarity, - u8 trigger, - u32 gsi) -{ - struct mpc_config_intsrc intsrc; - int ioapic = -1; - int pin = -1; - - /* - * Convert 'gsi' to 'ioapic.pin'. - */ - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) - return; - pin = gsi - mp_ioapic_routing[ioapic].gsi_base; - - /* - * TBD: This check is for faulty timer entries, where the override - * erroneously sets the trigger to level, resulting in a HUGE - * increase of timer interrupts! - */ - if ((bus_irq == 0) && (trigger == 3)) - trigger = 1; - - intsrc.mpc_type = MP_INTSRC; - intsrc.mpc_irqtype = mp_INT; - intsrc.mpc_irqflag = (trigger << 2) | polarity; - intsrc.mpc_srcbus = MP_ISA_BUS; - intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ - intsrc.mpc_dstirq = pin; /* INTIN# */ - - Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", - intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, - (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, - intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq); - - mp_irqs[mp_irq_entries] = intsrc; - if (++mp_irq_entries == MAX_IRQ_SOURCES) - panic("Max # of irq sources exceeded!\n"); - - return; -} - - -void __init mp_config_acpi_legacy_irqs (void) -{ - struct mpc_config_intsrc intsrc; - int i = 0; - int ioapic = -1; - - /* - * Fabricate the legacy ISA bus (bus #31). - */ - mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; - Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); - - /* - * ES7000 has no legacy identity mappings - */ - if (es7000_plat) - return; - - /* - * Locate the IOAPIC that manages the ISA IRQs (0-15). - */ - ioapic = mp_find_ioapic(0); - if (ioapic < 0) - return; - - intsrc.mpc_type = MP_INTSRC; - intsrc.mpc_irqflag = 0; /* Conforming */ - intsrc.mpc_srcbus = MP_ISA_BUS; - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; - - /* - * Use the default configuration for the IRQs 0-15. Unless - * overriden by (MADT) interrupt source override entries. - */ - for (i = 0; i < 16; i++) { - int idx; - - for (idx = 0; idx < mp_irq_entries; idx++) { - struct mpc_config_intsrc *irq = mp_irqs + idx; - - /* Do we already have a mapping for this ISA IRQ? 
*/ - if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i) - break; - - /* Do we already have a mapping for this IOAPIC pin */ - if ((irq->mpc_dstapic == intsrc.mpc_dstapic) && - (irq->mpc_dstirq == i)) - break; - } - - if (idx != mp_irq_entries) { - printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); - continue; /* IRQ already used */ - } - - intsrc.mpc_irqtype = mp_INT; - intsrc.mpc_srcbusirq = i; /* Identity mapped */ - intsrc.mpc_dstirq = i; - - Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, " - "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, - (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, - intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, - intsrc.mpc_dstirq); - - mp_irqs[mp_irq_entries] = intsrc; - if (++mp_irq_entries == MAX_IRQ_SOURCES) - panic("Max # of irq sources exceeded!\n"); - } -} - -int mp_register_gsi (u32 gsi, int edge_level, int active_high_low) -{ - int ioapic = -1; - int ioapic_pin = 0; - int idx, bit = 0; - -#ifdef CONFIG_ACPI_BUS - /* Don't set up the ACPI SCI because it's already set up */ - if (acpi_fadt.sci_int == gsi) - return gsi; -#endif - - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) { - printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); - return gsi; - } - - ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base; - - if (ioapic_renumber_irq) - gsi = ioapic_renumber_irq(ioapic, gsi); - - /* - * Avoid pin reprogramming. PRTs typically include entries - * with redundant pin->gsi mappings (but unique PCI devices); - * we only program the IOAPIC on the first. - */ - bit = ioapic_pin % 32; - idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32); - if (idx > 3) { - printk(KERN_ERR "Invalid reference to IOAPIC pin " - "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, - ioapic_pin); - return gsi; - } - if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { - Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", - mp_ioapic_routing[ioapic].apic_id, ioapic_pin); - return gsi; - } - - mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); - - io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, - edge_level == ACPI_EDGE_SENSITIVE ? 0 : 1, - active_high_low == ACPI_ACTIVE_HIGH ? 0 : 1); - return gsi; -} - -#endif /*CONFIG_X86_IO_APIC && (CONFIG_ACPI_INTERPRETER || CONFIG_ACPI_BOOT)*/ -#endif /*CONFIG_ACPI_BOOT*/ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/kernel/pci-dma.c --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/pci-dma.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,154 +0,0 @@ -/* - * Dynamic DMA mapping support. - * - * On i386 there is no hardware dynamic DMA address translation, - * so consistent alloc/free are merely page allocation/freeing. - * The rest of the dynamic DMA mapping interface is implemented - * in asm/pci.h. - */ - -#include <linux/types.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/pci.h> -#include <linux/version.h> -#include <asm/io.h> -#include <asm-xen/balloon.h> -#include <asm/tlbflush.h> - -struct dma_coherent_mem { - void *virt_base; - u32 device_base; - int size; - int flags; - unsigned long *bitmap; -}; - -void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, int gfp) -{ - void *ret; - struct dma_coherent_mem *mem = dev ? 
dev->dma_mem : NULL; - unsigned int order = get_order(size); - unsigned long vstart; - /* ignore region specifiers */ - gfp &= ~(__GFP_DMA | __GFP_HIGHMEM); - - if (mem) { - int page = bitmap_find_free_region(mem->bitmap, mem->size, - order); - if (page >= 0) { - *dma_handle = mem->device_base + (page << PAGE_SHIFT); - ret = mem->virt_base + (page << PAGE_SHIFT); - memset(ret, 0, size); - return ret; - } - if (mem->flags & DMA_MEMORY_EXCLUSIVE) - return NULL; - } - - if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff)) - gfp |= GFP_DMA; - - vstart = __get_free_pages(gfp, order); - ret = (void *)vstart; - - if (ret != NULL) { - xen_contig_memory(vstart, order); - - memset(ret, 0, size); - *dma_handle = virt_to_bus(ret); - } - return ret; -} - -void dma_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle) -{ - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; - int order = get_order(size); - - if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) { - int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; - - bitmap_release_region(mem->bitmap, page, order); - } else - free_pages((unsigned long)vaddr, order); -} - -int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, - dma_addr_t device_addr, size_t size, int flags) -{ - void __iomem *mem_base; - int pages = size >> PAGE_SHIFT; - int bitmap_size = (pages + 31)/32; - - if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) - goto out; - if (!size) - goto out; - if (dev->dma_mem) - goto out; - - /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ - - mem_base = ioremap(bus_addr, size); - if (!mem_base) - goto out; - - dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); - if (!dev->dma_mem) - goto out; - memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem)); - dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL); - if (!dev->dma_mem->bitmap) - goto free1_out; - memset(dev->dma_mem->bitmap, 0, bitmap_size); - - dev->dma_mem->virt_base = mem_base; - dev->dma_mem->device_base = device_addr; - dev->dma_mem->size = pages; - dev->dma_mem->flags = flags; - - if (flags & DMA_MEMORY_MAP) - return DMA_MEMORY_MAP; - - return DMA_MEMORY_IO; - - free1_out: - kfree(dev->dma_mem->bitmap); - out: - return 0; -} -EXPORT_SYMBOL(dma_declare_coherent_memory); - -void dma_release_declared_memory(struct device *dev) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - - if(!mem) - return; - dev->dma_mem = NULL; - iounmap(mem->virt_base); - kfree(mem->bitmap); - kfree(mem); -} -EXPORT_SYMBOL(dma_release_declared_memory); - -void *dma_mark_declared_memory_occupied(struct device *dev, - dma_addr_t device_addr, size_t size) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT; - int pos, err; - - if (!mem) - return ERR_PTR(-EINVAL); - - pos = (device_addr - mem->device_base) >> PAGE_SHIFT; - err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages)); - if (err != 0) - return ERR_PTR(err); - return mem->virt_base + (pos << PAGE_SHIFT); -} -EXPORT_SYMBOL(dma_mark_declared_memory_occupied); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/kernel/process.c --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/process.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,769 +0,0 @@ -/* - * linux/arch/i386/kernel/process.c - * - * Copyright (C) 1995 Linus Torvalds - * - * Pentium III FXSR, SSE support - * Gareth Hughes 
<gareth@xxxxxxxxxxx>, May 2000 - */ - -/* - * This file handles the architecture-dependent parts of process handling.. - */ - -#include <stdarg.h> - -#include <linux/cpu.h> -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/fs.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/elfcore.h> -#include <linux/smp.h> -#include <linux/smp_lock.h> -#include <linux/stddef.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/user.h> -#include <linux/a.out.h> -#include <linux/interrupt.h> -#include <linux/config.h> -#include <linux/utsname.h> -#include <linux/delay.h> -#include <linux/reboot.h> -#include <linux/init.h> -#include <linux/mc146818rtc.h> -#include <linux/module.h> -#include <linux/kallsyms.h> -#include <linux/ptrace.h> - -#include <asm/uaccess.h> -#include <asm/pgtable.h> -#include <asm/system.h> -#include <asm/io.h> -#include <asm/ldt.h> -#include <asm/processor.h> -#include <asm/i387.h> -#include <asm/irq.h> -#include <asm/desc.h> -#include <asm-xen/xen-public/physdev.h> -#ifdef CONFIG_MATH_EMULATION -#include <asm/math_emu.h> -#endif - -#include <linux/irq.h> -#include <linux/err.h> - -#include <asm/tlbflush.h> -#include <asm/cpu.h> - -asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); - -int hlt_counter; - -unsigned long boot_option_idle_override = 0; -EXPORT_SYMBOL(boot_option_idle_override); - -/* - * Return saved PC of a blocked thread. - */ -unsigned long thread_saved_pc(struct task_struct *tsk) -{ - return ((unsigned long *)tsk->thread.esp)[3]; -} - -/* - * Powermanagement idle function, if any.. - */ -void (*pm_idle)(void); -static cpumask_t cpu_idle_map; - -void disable_hlt(void) -{ - hlt_counter++; -} - -EXPORT_SYMBOL(disable_hlt); - -void enable_hlt(void) -{ - hlt_counter--; -} - -EXPORT_SYMBOL(enable_hlt); - -/* XXX XEN doesn't use default_idle(), poll_idle(). Use xen_idle() instead. */ -extern void stop_hz_timer(void); -extern void start_hz_timer(void); -void xen_idle(void) -{ - local_irq_disable(); - - if (need_resched()) { - local_irq_enable(); - } else { - stop_hz_timer(); - HYPERVISOR_block(); /* implicit local_irq_enable() */ - start_hz_timer(); - } -} - -#ifdef CONFIG_HOTPLUG_CPU -#include <asm/nmi.h> -/* We don't actually take CPU down, just spin without interrupts. */ -static inline void play_dead(void) -{ - /* Ack it */ - __get_cpu_var(cpu_state) = CPU_DEAD; - - /* We shouldn't have to disable interrupts while dead, but - * some interrupts just don't seem to go away, and this makes - * it "work" for testing purposes. */ - /* Death loop */ - while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE) - HYPERVISOR_yield(); - - local_irq_disable(); - __flush_tlb_all(); - cpu_set(smp_processor_id(), cpu_online_map); - local_irq_enable(); -} -#else -static inline void play_dead(void) -{ - BUG(); -} -#endif /* CONFIG_HOTPLUG_CPU */ - -/* - * The idle thread. There's no useful work to be - * done, so just try to conserve power and have a - * low exit latency (ie sit in a loop waiting for - * somebody to say that they'd like to reschedule) - */ -void cpu_idle (void) -{ - int cpu = _smp_processor_id(); - - /* endless idle loop with no priority at all */ - while (1) { - while (!need_resched()) { - - if (cpu_isset(cpu, cpu_idle_map)) - cpu_clear(cpu, cpu_idle_map); - rmb(); - - if (cpu_is_offline(cpu)) { -#if defined(CONFIG_XEN) && defined(CONFIG_HOTPLUG_CPU) - /* Tell hypervisor to take vcpu down. 
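- * (Editor's note, not part of the original file: HYPERVISOR_vcpu_down
- * asks Xen to stop scheduling this vcpu; play_dead() then spins,
- * yielding, until cpu_state flips back to CPU_UP_PREPARE on replug.)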
*/ - HYPERVISOR_vcpu_down(cpu); -#endif - play_dead(); - } - - irq_stat[cpu].idle_timestamp = jiffies; - xen_idle(); - } - schedule(); - } -} - -void cpu_idle_wait(void) -{ - int cpu; - cpumask_t map; - - for_each_online_cpu(cpu) - cpu_set(cpu, cpu_idle_map); - - wmb(); - do { - ssleep(1); - cpus_and(map, cpu_idle_map, cpu_online_map); - } while (!cpus_empty(map)); -} -EXPORT_SYMBOL_GPL(cpu_idle_wait); - -/* XXX XEN doesn't use mwait_idle(), select_idle_routine(), idle_setup(). */ -/* Always use xen_idle() instead. */ -void __init select_idle_routine(const struct cpuinfo_x86 *c) {} - -void show_regs(struct pt_regs * regs) -{ - printk("\n"); - printk("Pid: %d, comm: %20s\n", current->pid, current->comm); - printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id()); - print_symbol("EIP is at %s\n", regs->eip); - - if (regs->xcs & 2) - printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); - printk(" EFLAGS: %08lx %s (%s)\n", - regs->eflags, print_tainted(), system_utsname.release); - printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", - regs->eax,regs->ebx,regs->ecx,regs->edx); - printk("ESI: %08lx EDI: %08lx EBP: %08lx", - regs->esi, regs->edi, regs->ebp); - printk(" DS: %04x ES: %04x\n", - 0xffff & regs->xds,0xffff & regs->xes); - - show_trace(NULL, ®s->esp); -} - -/* - * This gets run with %ebx containing the - * function to call, and %edx containing - * the "args". - */ -extern void kernel_thread_helper(void); -__asm__(".section .text\n" - ".align 4\n" - "kernel_thread_helper:\n\t" - "movl %edx,%eax\n\t" - "pushl %edx\n\t" - "call *%ebx\n\t" - "pushl %eax\n\t" - "call do_exit\n" - ".previous"); - -/* - * Create a kernel thread - */ -int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) -{ - struct pt_regs regs; - - memset(®s, 0, sizeof(regs)); - - regs.ebx = (unsigned long) fn; - regs.edx = (unsigned long) arg; - - regs.xds = __USER_DS; - regs.xes = __USER_DS; - regs.orig_eax = -1; - regs.eip = (unsigned long) kernel_thread_helper; - regs.xcs = __KERNEL_CS; - regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; - - /* Ok, create the new process.. */ - return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); -} - -/* - * Free current thread data structures etc.. - */ -void exit_thread(void) -{ - struct task_struct *tsk = current; - struct thread_struct *t = &tsk->thread; - - /* The process may have allocated an io port bitmap... nuke it. */ - if (unlikely(NULL != t->io_bitmap_ptr)) { - physdev_op_t op = { 0 }; - op.cmd = PHYSDEVOP_SET_IOBITMAP; - HYPERVISOR_physdev_op(&op); - kfree(t->io_bitmap_ptr); - t->io_bitmap_ptr = NULL; - } -} - -void flush_thread(void) -{ - struct task_struct *tsk = current; - - memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); - memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - /* - * Forget coprocessor state.. - */ - clear_fpu(tsk); - clear_used_math(); -} - -void release_thread(struct task_struct *dead_task) -{ - if (dead_task->mm) { - // temporary debugging check - if (dead_task->mm->context.size) { - printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", - dead_task->comm, - dead_task->mm->context.ldt, - dead_task->mm->context.size); - BUG(); - } - } - - release_vm86_irqs(dead_task); -} - -/* - * This gets called before we allocate a new thread and copy - * the current task into it. 
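- *
- * (Editor's note, not part of the original file: during fork the core
- * kernel calls roughly dup_task_struct() -> prepare_to_copy() and
- * later copy_thread(); the unlazy_fpu() below flushes lazy FPU state
- * into current's thread struct so the child copies a consistent
- * image.)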
- */ -void prepare_to_copy(struct task_struct *tsk) -{ - unlazy_fpu(tsk); -} - -int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, - unsigned long unused, - struct task_struct * p, struct pt_regs * regs) -{ - struct pt_regs * childregs; - struct task_struct *tsk; - int err; - - childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1; - *childregs = *regs; - childregs->eax = 0; - childregs->esp = esp; - - p->thread.esp = (unsigned long) childregs; - p->thread.esp0 = (unsigned long) (childregs+1); - - p->thread.eip = (unsigned long) ret_from_fork; - - savesegment(fs,p->thread.fs); - savesegment(gs,p->thread.gs); - - tsk = current; - if (unlikely(NULL != tsk->thread.io_bitmap_ptr)) { - p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); - if (!p->thread.io_bitmap_ptr) { - p->thread.io_bitmap_max = 0; - return -ENOMEM; - } - memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr, - IO_BITMAP_BYTES); - } - - /* - * Set a new TLS for the child thread? - */ - if (clone_flags & CLONE_SETTLS) { - struct desc_struct *desc; - struct user_desc info; - int idx; - - err = -EFAULT; - if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info))) - goto out; - err = -EINVAL; - if (LDT_empty(&info)) - goto out; - - idx = info.entry_number; - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - goto out; - - desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; - desc->a = LDT_entry_a(&info); - desc->b = LDT_entry_b(&info); - } - - p->thread.io_pl = current->thread.io_pl; - - err = 0; - out: - if (err && p->thread.io_bitmap_ptr) { - kfree(p->thread.io_bitmap_ptr); - p->thread.io_bitmap_max = 0; - } - return err; -} - -/* - * fill in the user structure for a core dump.. - */ -void dump_thread(struct pt_regs * regs, struct user * dump) -{ - int i; - -/* changed the size calculations - should hopefully work better. 
lbt */ - dump->magic = CMAGIC; - dump->start_code = 0; - dump->start_stack = regs->esp & ~(PAGE_SIZE - 1); - dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; - dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; - dump->u_dsize -= dump->u_tsize; - dump->u_ssize = 0; - for (i = 0; i < 8; i++) - dump->u_debugreg[i] = current->thread.debugreg[i]; - - if (dump->start_stack < TASK_SIZE) - dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; - - dump->regs.ebx = regs->ebx; - dump->regs.ecx = regs->ecx; - dump->regs.edx = regs->edx; - dump->regs.esi = regs->esi; - dump->regs.edi = regs->edi; - dump->regs.ebp = regs->ebp; - dump->regs.eax = regs->eax; - dump->regs.ds = regs->xds; - dump->regs.es = regs->xes; - savesegment(fs,dump->regs.fs); - savesegment(gs,dump->regs.gs); - dump->regs.orig_eax = regs->orig_eax; - dump->regs.eip = regs->eip; - dump->regs.cs = regs->xcs; - dump->regs.eflags = regs->eflags; - dump->regs.esp = regs->esp; - dump->regs.ss = regs->xss; - - dump->u_fpvalid = dump_fpu (regs, &dump->i387); -} - -/* - * Capture the user space registers if the task is not running (in user space) - */ -int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) -{ - struct pt_regs ptregs; - - ptregs = *(struct pt_regs *) - ((unsigned long)tsk->thread_info+THREAD_SIZE - sizeof(ptregs)); - ptregs.xcs &= 0xffff; - ptregs.xds &= 0xffff; - ptregs.xes &= 0xffff; - ptregs.xss &= 0xffff; - - elf_core_copy_regs(regs, &ptregs); - - boot_option_idle_override = 1; - return 1; -} - -/* - * This special macro can be used to load a debugging register - */ -#define loaddebug(thread,register) \ - HYPERVISOR_set_debugreg((register), \ - (thread->debugreg[register])) - -/* - * switch_to(x,yn) should switch tasks from x to y. - * - * We fsave/fwait so that an exception goes off at the right time - * (as a call from the fsave or fwait in effect) rather than to - * the wrong process. Lazy FP saving no longer makes any sense - * with modern CPU's, and this simplifies a lot of things (SMP - * and UP become the same). - * - * NOTE! We used to use the x86 hardware context switching. The - * reason for not using it any more becomes apparent when you - * try to recover gracefully from saved state that is no longer - * valid (stale segment register values in particular). With the - * hardware task-switch, there is no way to fix up bad state in - * a reasonable manner. - * - * The fact that Intel documents the hardware task-switching to - * be slow is a fairly red herring - this code is not noticeably - * faster. However, there _is_ some room for improvement here, - * so the performance issues may eventually be a valid point. - * More important, however, is the fact that this allows us much - * more flexibility. - * - * The return value (in %eax) will be the "prev" task after - * the task-switch, and shows up in ret_from_fork in entry.S, - * for example. - */ -struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) -{ - struct thread_struct *prev = &prev_p->thread, - *next = &next_p->thread; - int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); - physdev_op_t iopl_op, iobmp_op; - multicall_entry_t _mcl[8], *mcl = _mcl; - - /* XEN NOTE: FS/GS saved in switch_mm(), not here. */ - - /* - * This is basically '__unlazy_fpu', except that we queue a - * multicall to indicate FPU task switch, rather than - * synchronously trapping to Xen. 
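- *
- * (Editor's note, not part of the original file: every mcl++ below
- * queues one multicall_entry_t into the on-stack _mcl[8] array, and
- * the single HYPERVISOR_multicall(_mcl, mcl - _mcl) call at the end
- * replays the batch inside Xen, so a context switch costs one
- * hypercall rather than up to eight.)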
- */ - if (prev_p->thread_info->status & TS_USEDFPU) { - __save_init_fpu(prev_p); /* _not_ save_init_fpu() */ - mcl->op = __HYPERVISOR_fpu_taskswitch; - mcl->args[0] = 1; - mcl++; - } - - /* - * Reload esp0, LDT and the page table pointer: - * This is load_esp0(tss, next) with a multicall. - */ - tss->esp0 = next->esp0; - mcl->op = __HYPERVISOR_stack_switch; - mcl->args[0] = tss->ss0; - mcl->args[1] = tss->esp0; - mcl++; - - /* - * Load the per-thread Thread-Local Storage descriptor. - * This is load_TLS(next, cpu) with multicalls. - */ -#define C(i) do { \ - if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \ - next->tls_array[i].b != prev->tls_array[i].b)) { \ - mcl->op = __HYPERVISOR_update_descriptor; \ - mcl->args[0] = virt_to_machine(&get_cpu_gdt_table(cpu) \ - [GDT_ENTRY_TLS_MIN + i]); \ - mcl->args[1] = ((u32 *)&next->tls_array[i])[0]; \ - mcl->args[2] = ((u32 *)&next->tls_array[i])[1]; \ - mcl++; \ - } \ -} while (0) - C(0); C(1); C(2); -#undef C - - if (unlikely(prev->io_pl != next->io_pl)) { - iopl_op.cmd = PHYSDEVOP_SET_IOPL; - iopl_op.u.set_iopl.iopl = next->io_pl; - mcl->op = __HYPERVISOR_physdev_op; - mcl->args[0] = (unsigned long)&iopl_op; - mcl++; - } - - if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { - iobmp_op.cmd = - PHYSDEVOP_SET_IOBITMAP; - iobmp_op.u.set_iobitmap.bitmap = - (unsigned long)next->io_bitmap_ptr; - iobmp_op.u.set_iobitmap.nr_ports = - next->io_bitmap_ptr ? IO_BITMAP_BITS : 0; - mcl->op = __HYPERVISOR_physdev_op; - mcl->args[0] = (unsigned long)&iobmp_op; - mcl++; - } - - (void)HYPERVISOR_multicall(_mcl, mcl - _mcl); - - /* - * Restore %fs and %gs if needed. - */ - if (unlikely(next->fs | next->gs)) { - loadsegment(fs, next->fs); - loadsegment(gs, next->gs); - } - - /* - * Now maybe reload the debug registers - */ - if (unlikely(next->debugreg[7])) { - loaddebug(next, 0); - loaddebug(next, 1); - loaddebug(next, 2); - loaddebug(next, 3); - /* no 4 and 5 */ - loaddebug(next, 6); - loaddebug(next, 7); - } - - return prev_p; -} - -asmlinkage int sys_fork(struct pt_regs regs) -{ - return do_fork(SIGCHLD, regs.esp, ®s, 0, NULL, NULL); -} - -asmlinkage int sys_clone(struct pt_regs regs) -{ - unsigned long clone_flags; - unsigned long newsp; - int __user *parent_tidptr, *child_tidptr; - - clone_flags = regs.ebx; - newsp = regs.ecx; - parent_tidptr = (int __user *)regs.edx; - child_tidptr = (int __user *)regs.edi; - if (!newsp) - newsp = regs.esp; - return do_fork(clone_flags, newsp, ®s, 0, parent_tidptr, child_tidptr); -} - -/* - * This is trivial, and on the face of it looks like it - * could equally well be done in user mode. - * - * Not so, for quite unobvious reasons - register pressure. - * In user mode vfork() cannot have a stack frame, and if - * done by calling the "clone()" system call directly, you - * do not have enough call-clobbered registers to hold all - * the information you need. - */ -asmlinkage int sys_vfork(struct pt_regs regs) -{ - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, ®s, 0, NULL, NULL); -} - -/* - * sys_execve() executes a new program. - */ -asmlinkage int sys_execve(struct pt_regs regs) -{ - int error; - char * filename; - - filename = getname((char __user *) regs.ebx); - error = PTR_ERR(filename); - if (IS_ERR(filename)) - goto out; - error = do_execve(filename, - (char __user * __user *) regs.ecx, - (char __user * __user *) regs.edx, - ®s); - if (error == 0) { - task_lock(current); - current->ptrace &= ~PT_DTRACE; - task_unlock(current); - /* Make sure we don't return using sysenter.. 
*/ - set_thread_flag(TIF_IRET); - } - putname(filename); -out: - return error; -} - -#define top_esp (THREAD_SIZE - sizeof(unsigned long)) -#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) - -unsigned long get_wchan(struct task_struct *p) -{ - unsigned long ebp, esp, eip; - unsigned long stack_page; - int count = 0; - if (!p || p == current || p->state == TASK_RUNNING) - return 0; - stack_page = (unsigned long)p->thread_info; - esp = p->thread.esp; - if (!stack_page || esp < stack_page || esp > top_esp+stack_page) - return 0; - /* include/asm-i386/system.h:switch_to() pushes ebp last. */ - ebp = *(unsigned long *) esp; - do { - if (ebp < stack_page || ebp > top_ebp+stack_page) - return 0; - eip = *(unsigned long *) (ebp+4); - if (!in_sched_functions(eip)) - return eip; - ebp = *(unsigned long *) ebp; - } while (count++ < 16); - return 0; -} - -/* - * sys_alloc_thread_area: get a yet unused TLS descriptor index. - */ -static int get_free_idx(void) -{ - struct thread_struct *t = ¤t->thread; - int idx; - - for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) - if (desc_empty(t->tls_array + idx)) - return idx + GDT_ENTRY_TLS_MIN; - return -ESRCH; -} - -/* - * Set a given TLS descriptor: - */ -asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) -{ - struct thread_struct *t = ¤t->thread; - struct user_desc info; - struct desc_struct *desc; - int cpu, idx; - - if (copy_from_user(&info, u_info, sizeof(info))) - return -EFAULT; - idx = info.entry_number; - - /* - * index -1 means the kernel should try to find and - * allocate an empty descriptor: - */ - if (idx == -1) { - idx = get_free_idx(); - if (idx < 0) - return idx; - if (put_user(idx, &u_info->entry_number)) - return -EFAULT; - } - - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return -EINVAL; - - desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN; - - /* - * We must not get preempted while modifying the TLS. 
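- *
- * (Editor's note, not part of the original file: get_cpu() below
- * disables preemption and returns the current CPU id, and put_cpu()
- * re-enables it, guaranteeing load_TLS(t, cpu) runs on the same CPU
- * whose GDT entries were just rewritten.)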
- */ - cpu = get_cpu(); - - if (LDT_empty(&info)) { - desc->a = 0; - desc->b = 0; - } else { - desc->a = LDT_entry_a(&info); - desc->b = LDT_entry_b(&info); - } - load_TLS(t, cpu); - - put_cpu(); - - return 0; -} - -/* - * Get the current Thread-Local Storage area: - */ - -#define GET_BASE(desc) ( \ - (((desc)->a >> 16) & 0x0000ffff) | \ - (((desc)->b << 16) & 0x00ff0000) | \ - ( (desc)->b & 0xff000000) ) - -#define GET_LIMIT(desc) ( \ - ((desc)->a & 0x0ffff) | \ - ((desc)->b & 0xf0000) ) - -#define GET_32BIT(desc) (((desc)->b >> 22) & 1) -#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) -#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) -#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) -#define GET_PRESENT(desc) (((desc)->b >> 15) & 1) -#define GET_USEABLE(desc) (((desc)->b >> 20) & 1) - -asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) -{ - struct user_desc info; - struct desc_struct *desc; - int idx; - - if (get_user(idx, &u_info->entry_number)) - return -EFAULT; - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return -EINVAL; - - desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; - - info.entry_number = idx; - info.base_addr = GET_BASE(desc); - info.limit = GET_LIMIT(desc); - info.seg_32bit = GET_32BIT(desc); - info.contents = GET_CONTENTS(desc); - info.read_exec_only = !GET_WRITABLE(desc); - info.limit_in_pages = GET_LIMIT_PAGES(desc); - info.seg_not_present = !GET_PRESENT(desc); - info.useable = GET_USEABLE(desc); - - if (copy_to_user(u_info, &info, sizeof(info))) - return -EFAULT; - return 0; -} - diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/kernel/setup.c --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/setup.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,1649 +0,0 @@ -/* - * linux/arch/i386/kernel/setup.c - * - * Copyright (C) 1995 Linus Torvalds - * - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 - * - * Memory region support - * David Parsons <orc@xxxxxxxxxxxxxx>, July-August 1999 - * - * Added E820 sanitization routine (removes overlapping memory regions); - * Brian Moyle <bmoyle@xxxxxxxxxx>, February 2001 - * - * Moved CPU detection code to cpu/${cpu}.c - * Patrick Mochel <mochel@xxxxxxxx>, March 2002 - * - * Provisions for empty E820 memory regions (reported by certain BIOSes). - * Alex Achenbach <xela@xxxxxxx>, December 2002. 
- * - */ - -/* - * This file handles the architecture-dependent parts of initialization - */ - -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/tty.h> -#include <linux/ioport.h> -#include <linux/acpi.h> -#include <linux/apm_bios.h> -#include <linux/initrd.h> -#include <linux/bootmem.h> -#include <linux/seq_file.h> -#include <linux/console.h> -#include <linux/mca.h> -#include <linux/root_dev.h> -#include <linux/highmem.h> -#include <linux/module.h> -#include <linux/efi.h> -#include <linux/init.h> -#include <linux/edd.h> -#include <linux/kernel.h> -#include <linux/percpu.h> -#include <linux/notifier.h> -#include <video/edid.h> -#include <asm/e820.h> -#include <asm/mpspec.h> -#include <asm/setup.h> -#include <asm/arch_hooks.h> -#include <asm/sections.h> -#include <asm/io_apic.h> -#include <asm/ist.h> -#include <asm/io.h> -#include <asm-xen/hypervisor.h> -#include <asm-xen/xen-public/physdev.h> -#include "setup_arch_pre.h" -#include <bios_ebda.h> - -/* Allows setting of maximum possible memory size */ -static unsigned long xen_override_max_pfn; - -static int xen_panic_event(struct notifier_block *, unsigned long, void *); -static struct notifier_block xen_panic_block = { - xen_panic_event, NULL, 0 /* try to go last */ -}; - -int disable_pse __initdata = 0; - -/* - * Machine setup.. - */ - -#ifdef CONFIG_EFI -int efi_enabled = 0; -EXPORT_SYMBOL(efi_enabled); -#endif - -/* cpu data as detected by the assembly code in head.S */ -struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 0, 1, 0, -1 }; -/* common cpu data for all cpus */ -struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 0, 1, 0, -1 }; - -unsigned long mmu_cr4_features; -EXPORT_SYMBOL_GPL(mmu_cr4_features); - -#ifdef CONFIG_ACPI_INTERPRETER - int acpi_disabled = 0; -#else - int acpi_disabled = 1; -#endif -EXPORT_SYMBOL(acpi_disabled); - -#ifdef CONFIG_ACPI_BOOT -int __initdata acpi_force = 0; -extern acpi_interrupt_flags acpi_sci_flags; -#endif - -/* for MCA, but anyone else can use it if they want */ -unsigned int machine_id; -unsigned int machine_submodel_id; -unsigned int BIOS_revision; -unsigned int mca_pentium_flag; - -/* For PCI or other memory-mapped resources */ -unsigned long pci_mem_start = 0x10000000; - -/* Boot loader ID as an integer, for the benefit of proc_dointvec */ -int bootloader_type; - -/* user-defined highmem size */ -static unsigned int highmem_pages = -1; - -/* - * Setup options - */ -struct drive_info_struct { char dummy[32]; } drive_info; -struct screen_info screen_info; -struct apm_info apm_info; -struct sys_desc_table_struct { - unsigned short length; - unsigned char table[0]; -}; -struct edid_info edid_info; -struct ist_info ist_info; -struct e820map e820; - -unsigned char aux_device_present; - -extern void early_cpu_init(void); -extern void dmi_scan_machine(void); -extern void generic_apic_probe(char *); -extern int root_mountflags; - -unsigned long saved_videomode; - -#define RAMDISK_IMAGE_START_MASK 0x07FF -#define RAMDISK_PROMPT_FLAG 0x8000 -#define RAMDISK_LOAD_FLAG 0x4000 - -static char command_line[COMMAND_LINE_SIZE]; - -unsigned char __initdata boot_params[PARAM_SIZE]; - -static struct resource data_resource = { - .name = "Kernel data", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -static struct resource code_resource = { - .name = "Kernel code", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -#ifdef CONFIG_XEN_PRIVILEGED_GUEST -static struct resource system_rom_resource = { - .name = "System ROM", - .start = 
0xf0000, - .end = 0xfffff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -static struct resource extension_rom_resource = { - .name = "Extension ROM", - .start = 0xe0000, - .end = 0xeffff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -static struct resource adapter_rom_resources[] = { { - .name = "Adapter ROM", - .start = 0xc8000, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -} }; - -#define ADAPTER_ROM_RESOURCES \ - (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0]) - -static struct resource video_rom_resource = { - .name = "Video ROM", - .start = 0xc0000, - .end = 0xc7fff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; -#endif - -static struct resource video_ram_resource = { - .name = "Video RAM area", - .start = 0xa0000, - .end = 0xbffff, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -static struct resource standard_io_resources[] = { { - .name = "dma1", - .start = 0x0000, - .end = 0x001f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "pic1", - .start = 0x0020, - .end = 0x0021, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "timer0", - .start = 0x0040, - .end = 0x0043, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "timer1", - .start = 0x0050, - .end = 0x0053, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "keyboard", - .start = 0x0060, - .end = 0x006f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "dma page reg", - .start = 0x0080, - .end = 0x008f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "pic2", - .start = 0x00a0, - .end = 0x00a1, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "dma2", - .start = 0x00c0, - .end = 0x00df, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "fpu", - .start = 0x00f0, - .end = 0x00ff, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -} }; - -#define STANDARD_IO_RESOURCES \ - (sizeof standard_io_resources / sizeof standard_io_resources[0]) - -#ifdef CONFIG_XEN_PRIVILEGED_GUEST -#define romsignature(x) (*(unsigned short *)(x) == 0xaa55) - -static int __init romchecksum(unsigned char *rom, unsigned long length) -{ - unsigned char *p, sum = 0; - - for (p = rom; p < rom + length; p++) - sum += *p; - return sum == 0; -} - -static void __init probe_roms(void) -{ - unsigned long start, length, upper; - unsigned char *rom; - int i; - - /* Nothing to do if not running in dom0. 
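- * (Editor's note, not part of the original file: only the initial
- * domain, flagged by SIF_INITDOMAIN in xen_start_info, sees the
- * machine's real ISA ROM space; unprivileged guests have nothing at
- * 0xc0000-0xfffff worth probing.)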
*/ - if (!(xen_start_info.flags & SIF_INITDOMAIN)) - return; - - /* video rom */ - upper = adapter_rom_resources[0].start; - for (start = video_rom_resource.start; start < upper; start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - video_rom_resource.start = start; - - /* 0 < length <= 0x7f * 512, historically */ - length = rom[2] * 512; - - /* if checksum okay, trust length byte */ - if (length && romchecksum(rom, length)) - video_rom_resource.end = start + length - 1; - - request_resource(&iomem_resource, &video_rom_resource); - break; - } - - start = (video_rom_resource.end + 1 + 2047) & ~2047UL; - if (start < upper) - start = upper; - - /* system rom */ - request_resource(&iomem_resource, &system_rom_resource); - upper = system_rom_resource.start; - - /* check for extension rom (ignore length byte!) */ - rom = isa_bus_to_virt(extension_rom_resource.start); - if (romsignature(rom)) { - length = extension_rom_resource.end - extension_rom_resource.start + 1; - if (romchecksum(rom, length)) { - request_resource(&iomem_resource, &extension_rom_resource); - upper = extension_rom_resource.start; - } - } - - /* check for adapter roms on 2k boundaries */ - for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - /* 0 < length <= 0x7f * 512, historically */ - length = rom[2] * 512; - - /* but accept any length that fits if checksum okay */ - if (!length || start + length > upper || !romchecksum(rom, length)) - continue; - - adapter_rom_resources[i].start = start; - adapter_rom_resources[i].end = start + length - 1; - request_resource(&iomem_resource, &adapter_rom_resources[i]); - - start = adapter_rom_resources[i++].end & ~2047UL; - } -} -#endif - -/* - * Point at the empty zero page to start with. We map the real shared_info - * page as soon as fixmap is up and running. - */ -shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page; -EXPORT_SYMBOL(HYPERVISOR_shared_info); - -unsigned int *phys_to_machine_mapping, *pfn_to_mfn_frame_list; -EXPORT_SYMBOL(phys_to_machine_mapping); - -/* Raw start-of-day parameters from the hypervisor. */ -union xen_start_info_union xen_start_info_union; - -static void __init limit_regions(unsigned long long size) -{ - unsigned long long current_addr = 0; - int i; - - if (efi_enabled) { - for (i = 0; i < memmap.nr_map; i++) { - current_addr = memmap.map[i].phys_addr + - (memmap.map[i].num_pages << 12); - if (memmap.map[i].type == EFI_CONVENTIONAL_MEMORY) { - if (current_addr >= size) { - memmap.map[i].num_pages -= - (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT); - memmap.nr_map = i + 1; - return; - } - } - } - } - for (i = 0; i < e820.nr_map; i++) { - if (e820.map[i].type == E820_RAM) { - current_addr = e820.map[i].addr + e820.map[i].size; - if (current_addr >= size) { - e820.map[i].size -= current_addr-size; - e820.nr_map = i + 1; - return; - } - } - } -} - -static void __init add_memory_region(unsigned long long start, - unsigned long long size, int type) -{ - int x; - - if (!efi_enabled) { - x = e820.nr_map; - - if (x == E820MAX) { - printk(KERN_ERR "Ooops! 
Too many entries in the memory map!\n"); - return; - } - - e820.map[x].addr = start; - e820.map[x].size = size; - e820.map[x].type = type; - e820.nr_map++; - } -} /* add_memory_region */ - -#define E820_DEBUG 1 - -static void __init print_memory_map(char *who) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - printk(" %s: %016Lx - %016Lx ", who, - e820.map[i].addr, - e820.map[i].addr + e820.map[i].size); - switch (e820.map[i].type) { - case E820_RAM: printk("(usable)\n"); - break; - case E820_RESERVED: - printk("(reserved)\n"); - break; - case E820_ACPI: - printk("(ACPI data)\n"); - break; - case E820_NVS: - printk("(ACPI NVS)\n"); - break; - default: printk("type %lu\n", e820.map[i].type); - break; - } - } -} - -#if 0 -/* - * Sanitize the BIOS e820 map. - * - * Some e820 responses include overlapping entries. The following - * replaces the original e820 map with a new one, removing overlaps. - * - */ -struct change_member { - struct e820entry *pbios; /* pointer to original bios entry */ - unsigned long long addr; /* address for this change point */ -}; -struct change_member change_point_list[2*E820MAX] __initdata; -struct change_member *change_point[2*E820MAX] __initdata; -struct e820entry *overlap_list[E820MAX] __initdata; -struct e820entry new_bios[E820MAX] __initdata; - -static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) -{ - struct change_member *change_tmp; - unsigned long current_type, last_type; - unsigned long long last_addr; - int chgidx, still_changing; - int overlap_entries; - int new_bios_entry; - int old_nr, new_nr, chg_nr; - int i; - - /* - Visually we're performing the following (1,2,3,4 = memory types)... - - Sample memory map (w/overlaps): - ____22__________________ - ______________________4_ - ____1111________________ - _44_____________________ - 11111111________________ - ____________________33__ - ___________44___________ - __________33333_________ - ______________22________ - ___________________2222_ - _________111111111______ - _____________________11_ - _________________4______ - - Sanitized equivalent (no overlap): - 1_______________________ - _44_____________________ - ___1____________________ - ____22__________________ - ______11________________ - _________1______________ - __________3_____________ - ___________44___________ - _____________33_________ - _______________2________ - ________________1_______ - _________________4______ - ___________________2____ - ____________________33__ - ______________________4_ - */ - - /* if there's only one memory region, don't bother */ - if (*pnr_map < 2) - return -1; - - old_nr = *pnr_map; - - /* bail out if we find any unreasonable addresses in bios map */ - for (i=0; i<old_nr; i++) - if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) - return -1; - - /* create pointers for initial change-point information (for sorting) */ - for (i=0; i < 2*old_nr; i++) - change_point[i] = &change_point_list[i]; - - /* record all known change-points (starting and ending addresses), - omitting those that are for empty memory regions */ - chgidx = 0; - for (i=0; i < old_nr; i++) { - if (biosmap[i].size != 0) { - change_point[chgidx]->addr = biosmap[i].addr; - change_point[chgidx++]->pbios = &biosmap[i]; - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; - change_point[chgidx++]->pbios = &biosmap[i]; - } - } - chg_nr = chgidx; /* true number of change-points */ - - /* sort change-point list by memory addresses (low -> high) */ - still_changing = 1; - while (still_changing) { - still_changing 
= 0; - for (i=1; i < chg_nr; i++) { - /* if <current_addr> > <last_addr>, swap */ - /* or, if current=<start_addr> & last=<end_addr>, swap */ - if ((change_point[i]->addr < change_point[i-1]->addr) || - ((change_point[i]->addr == change_point[i-1]->addr) && - (change_point[i]->addr == change_point[i]->pbios->addr) && - (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) - ) - { - change_tmp = change_point[i]; - change_point[i] = change_point[i-1]; - change_point[i-1] = change_tmp; - still_changing=1; - } - } - } - - /* create a new bios memory map, removing overlaps */ - overlap_entries=0; /* number of entries in the overlap table */ - new_bios_entry=0; /* index for creating new bios map entries */ - last_type = 0; /* start with undefined memory type */ - last_addr = 0; /* start with 0 as last starting address */ - /* loop through change-points, determining affect on the new bios map */ - for (chgidx=0; chgidx < chg_nr; chgidx++) - { - /* keep track of all overlapping bios entries */ - if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) - { - /* add map entry to overlap list (> 1 entry implies an overlap) */ - overlap_list[overlap_entries++]=change_point[chgidx]->pbios; - } - else - { - /* remove entry from list (order independent, so swap with last) */ - for (i=0; i<overlap_entries; i++) - { - if (overlap_list[i] == change_point[chgidx]->pbios) - overlap_list[i] = overlap_list[overlap_entries-1]; - } - overlap_entries--; - } - /* if there are overlapping entries, decide which "type" to use */ - /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ - current_type = 0; - for (i=0; i<overlap_entries; i++) - if (overlap_list[i]->type > current_type) - current_type = overlap_list[i]->type; - /* continue building up new bios map based on this information */ - if (current_type != last_type) { - if (last_type != 0) { - new_bios[new_bios_entry].size = - change_point[chgidx]->addr - last_addr; - /* move forward only if the new size was non-zero */ - if (new_bios[new_bios_entry].size != 0) - if (++new_bios_entry >= E820MAX) - break; /* no more space left for new bios entries */ - } - if (current_type != 0) { - new_bios[new_bios_entry].addr = change_point[chgidx]->addr; - new_bios[new_bios_entry].type = current_type; - last_addr=change_point[chgidx]->addr; - } - last_type = current_type; - } - } - new_nr = new_bios_entry; /* retain count for new bios entries */ - - /* copy new bios mapping into original location */ - memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); - *pnr_map = new_nr; - - return 0; -} - -/* - * Copy the BIOS e820 map into a safe place. - * - * Sanity-check it while we're at it.. - * - * If we're lucky and live on a modern system, the setup code - * will have given us a memory map that we can use to properly - * set up memory. If we aren't, we'll fake a memory map. - * - * We check to see that the memory map contains at least 2 elements - * before we'll use it, because the detection code in setup.S may - * not be perfect and most every PC known to man has two memory - * regions: one from 0 to 640k, and one from 1mb up. (The IBM - * thinkpad 560x, for example, does not cooperate with the memory - * detection code.) - */ -static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) -{ - /* Only one memory region (or negative)? 
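(nr_map is a signed int, so broken setup code can hand over a negative count.)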
Ignore it */ - if (nr_map < 2) - return -1; - - do { - unsigned long long start = biosmap->addr; - unsigned long long size = biosmap->size; - unsigned long long end = start + size; - unsigned long type = biosmap->type; - - /* Overflow in 64 bits? Ignore the memory map. */ - if (start > end) - return -1; - - /* - * Some BIOSes claim RAM in the 640k - 1M region. - * Not right. Fix it up. - */ - if (type == E820_RAM) { - if (start < 0x100000ULL && end > 0xA0000ULL) { - if (start < 0xA0000ULL) - add_memory_region(start, 0xA0000ULL-start, type); - if (end <= 0x100000ULL) - continue; - start = 0x100000ULL; - size = end - start; - } - } - add_memory_region(start, size, type); - } while (biosmap++,--nr_map); - return 0; -} -#endif - -#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) -struct edd edd; -#ifdef CONFIG_EDD_MODULE -EXPORT_SYMBOL(edd); -#endif -/** - * copy_edd() - Copy the BIOS EDD information - * from boot_params into a safe place. - * - */ -static inline void copy_edd(void) -{ - memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature)); - memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info)); - edd.mbr_signature_nr = EDD_MBR_SIG_NR; - edd.edd_info_nr = EDD_NR; -} -#else -static inline void copy_edd(void) -{ -} -#endif - -/* - * Do NOT EVER look at the BIOS memory size location. - * It does not work on many machines. - */ -#define LOWMEMSIZE() (0x9f000) - -static void __init parse_cmdline_early (char ** cmdline_p) -{ - char c = ' ', *to = command_line, *from = saved_command_line; - int len = 0, max_cmdline; - int userdef = 0; - - if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE) - max_cmdline = COMMAND_LINE_SIZE; - memcpy(saved_command_line, xen_start_info.cmd_line, max_cmdline); - /* Save unparsed command line copy for /proc/cmdline */ - saved_command_line[max_cmdline-1] = '\0'; - - for (;;) { - if (c != ' ') - goto next_char; - /* - * "mem=nopentium" disables the 4MB page tables. - * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM - * to <mem>, overriding the bios size. - * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from - * <start> to <start>+<mem>, overriding the bios size. - * - * HPA tells me bootloaders need to parse mem=, so no new - * option should be mem= [also see Documentation/i386/boot.txt] - */ - if (!memcmp(from, "mem=", 4)) { - if (to != command_line) - to--; - if (!memcmp(from+4, "nopentium", 9)) { - from += 9+4; - clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); - disable_pse = 1; - } else { - /* If the user specifies memory size, we - * limit the BIOS-provided memory map to - * that size. exactmap can be used to specify - * the exact map. mem=number can be used to - * trim the existing memory map. - */ - unsigned long long mem_size; - - mem_size = memparse(from+4, &from); -#if 0 - limit_regions(mem_size); - userdef=1; -#else - xen_override_max_pfn = - (unsigned long)(mem_size>>PAGE_SHIFT); -#endif - } - } - - else if (!memcmp(from, "memmap=", 7)) { - if (to != command_line) - to--; - if (!memcmp(from+7, "exactmap", 8)) { - from += 8+7; - e820.nr_map = 0; - userdef = 1; - } else { - /* If the user specifies memory size, we - * limit the BIOS-provided memory map to - * that size. exactmap can be used to specify - * the exact map. mem=number can be used to - * trim the existing memory map. 
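- * For example, "memmap=64M@16M" adds a 64MB E820_RAM region at 16MB
- * (memparse accepts K/M/G suffixes), while plain "mem=512M" caps usable
- * memory; note that in this Xen tree mem= sets xen_override_max_pfn
- * above rather than calling limit_regions().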
- */ - unsigned long long start_at, mem_size; - - mem_size = memparse(from+7, &from); - if (*from == '@') { - start_at = memparse(from+1, &from); - add_memory_region(start_at, mem_size, E820_RAM); - } else if (*from == '#') { - start_at = memparse(from+1, &from); - add_memory_region(start_at, mem_size, E820_ACPI); - } else if (*from == '$') { - start_at = memparse(from+1, &from); - add_memory_region(start_at, mem_size, E820_RESERVED); - } else { - limit_regions(mem_size); - userdef=1; - } - } - } - - else if (!memcmp(from, "noexec=", 7)) - noexec_setup(from + 7); - - -#ifdef CONFIG_X86_MPPARSE - /* - * If the BIOS enumerates physical processors before logical, - * maxcpus=N at enumeration-time can be used to disable HT. - */ - else if (!memcmp(from, "maxcpus=", 8)) { - extern unsigned int maxcpus; - - maxcpus = simple_strtoul(from + 8, NULL, 0); - } -#endif - -#ifdef CONFIG_ACPI_BOOT - /* "acpi=off" disables both ACPI table parsing and interpreter */ - else if (!memcmp(from, "acpi=off", 8)) { - disable_acpi(); - } - - /* acpi=force to over-ride black-list */ - else if (!memcmp(from, "acpi=force", 10)) { - acpi_force = 1; - acpi_ht = 1; - acpi_disabled = 0; - } - - /* acpi=strict disables out-of-spec workarounds */ - else if (!memcmp(from, "acpi=strict", 11)) { - acpi_strict = 1; - } - - /* Limit ACPI just to boot-time to enable HT */ - else if (!memcmp(from, "acpi=ht", 7)) { - if (!acpi_force) - disable_acpi(); - acpi_ht = 1; - } - - /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */ - else if (!memcmp(from, "pci=noacpi", 10)) { - acpi_disable_pci(); - } - /* "acpi=noirq" disables ACPI interrupt routing */ - else if (!memcmp(from, "acpi=noirq", 10)) { - acpi_noirq_set(); - } - - else if (!memcmp(from, "acpi_sci=edge", 13)) - acpi_sci_flags.trigger = 1; - - else if (!memcmp(from, "acpi_sci=level", 14)) - acpi_sci_flags.trigger = 3; - - else if (!memcmp(from, "acpi_sci=high", 13)) - acpi_sci_flags.polarity = 1; - - else if (!memcmp(from, "acpi_sci=low", 12)) - acpi_sci_flags.polarity = 3; - -#ifdef CONFIG_X86_IO_APIC - else if (!memcmp(from, "acpi_skip_timer_override", 24)) - acpi_skip_timer_override = 1; -#endif - -#ifdef CONFIG_X86_LOCAL_APIC - /* disable IO-APIC */ - else if (!memcmp(from, "noapic", 6)) - disable_ioapic_setup(); -#endif /* CONFIG_X86_LOCAL_APIC */ -#endif /* CONFIG_ACPI_BOOT */ - - /* - * highmem=size forces highmem to be exactly 'size' bytes. - * This works even on boxes that have no highmem otherwise. - * This also works to reduce highmem size on bigger boxes. - */ - else if (!memcmp(from, "highmem=", 8)) - highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT; - - /* - * vmalloc=size forces the vmalloc area to be exactly 'size' - * bytes. This can be used to increase (or decrease) the - * vmalloc area - the default is 128m. - */ - else if (!memcmp(from, "vmalloc=", 8)) - __VMALLOC_RESERVE = memparse(from+8, &from); - - next_char: - c = *(from++); - if (!c) - break; - if (COMMAND_LINE_SIZE <= ++len) - break; - *(to++) = c; - } - *to = '\0'; - *cmdline_p = command_line; - if (userdef) { - printk(KERN_INFO "user-defined physical RAM map:\n"); - print_memory_map("user"); - } -} - -#if 0 /* !XEN */ -/* - * Callback for efi_memory_walk. 
- */ -static int __init -efi_find_max_pfn(unsigned long start, unsigned long end, void *arg) -{ - unsigned long *max_pfn = arg, pfn; - - if (start < end) { - pfn = PFN_UP(end -1); - if (pfn > *max_pfn) - *max_pfn = pfn; - } - return 0; -} - - -/* - * Find the highest page frame number we have available - */ -void __init find_max_pfn(void) -{ - int i; - - max_pfn = 0; - if (efi_enabled) { - efi_memmap_walk(efi_find_max_pfn, &max_pfn); - return; - } - - for (i = 0; i < e820.nr_map; i++) { - unsigned long start, end; - /* RAM? */ - if (e820.map[i].type != E820_RAM) - continue; - start = PFN_UP(e820.map[i].addr); - end = PFN_DOWN(e820.map[i].addr + e820.map[i].size); - if (start >= end) - continue; - if (end > max_pfn) - max_pfn = end; - } -} -#else -/* We don't use the fake e820 because we need to respond to user override. */ -void __init find_max_pfn(void) -{ - if ( xen_override_max_pfn < xen_start_info.nr_pages ) - xen_override_max_pfn = xen_start_info.nr_pages; - max_pfn = xen_override_max_pfn; -} -#endif /* XEN */ - -/* - * Determine low and high memory ranges: - */ -unsigned long __init find_max_low_pfn(void) -{ - unsigned long max_low_pfn; - - max_low_pfn = max_pfn; - if (max_low_pfn > MAXMEM_PFN) { - if (highmem_pages == -1) - highmem_pages = max_pfn - MAXMEM_PFN; - if (highmem_pages + MAXMEM_PFN < max_pfn) - max_pfn = MAXMEM_PFN + highmem_pages; - if (highmem_pages + MAXMEM_PFN > max_pfn) { - printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages)); - highmem_pages = 0; - } - max_low_pfn = MAXMEM_PFN; -#ifndef CONFIG_HIGHMEM - /* Maximum memory usable is what is directly addressable */ - printk(KERN_WARNING "Warning only %ldMB will be used.\n", - MAXMEM>>20); - if (max_pfn > MAX_NONPAE_PFN) - printk(KERN_WARNING "Use a PAE enabled kernel.\n"); - else - printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); - max_pfn = MAXMEM_PFN; -#else /* !CONFIG_HIGHMEM */ -#ifndef CONFIG_X86_PAE - if (max_pfn > MAX_NONPAE_PFN) { - max_pfn = MAX_NONPAE_PFN; - printk(KERN_WARNING "Warning only 4GB will be used.\n"); - printk(KERN_WARNING "Use a PAE enabled kernel.\n"); - } -#endif /* !CONFIG_X86_PAE */ -#endif /* !CONFIG_HIGHMEM */ - } else { - if (highmem_pages == -1) - highmem_pages = 0; -#ifdef CONFIG_HIGHMEM - if (highmem_pages >= max_pfn) { - printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn)); - highmem_pages = 0; - } - if (highmem_pages) { - if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){ - printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages)); - highmem_pages = 0; - } - max_low_pfn -= highmem_pages; - } -#else - if (highmem_pages) - printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n"); -#endif - } - return max_low_pfn; -} - -#ifndef CONFIG_DISCONTIGMEM - -/* - * Free all available memory for boot time allocation. Used - * as a callback function by efi_memory_walk() - */ - -static int __init -free_available_memory(unsigned long start, unsigned long end, void *arg) -{ - /* check max_low_pfn */ - if (start >= ((max_low_pfn + 1) << PAGE_SHIFT)) - return 0; - if (end >= ((max_low_pfn + 1) << PAGE_SHIFT)) - end = (max_low_pfn + 1) << PAGE_SHIFT; - if (start < end) - free_bootmem(start, end - start); - - return 0; -} -/* - * Register fully available low RAM pages with the bootmem allocator. 
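- * (Each E820_RAM range is trimmed inward to whole pages, PFN_UP at the
- * start and PFN_DOWN at the end, then clipped to max_low_pfn; anything
- * above that is left to the highmem code.)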
- */ -static void __init register_bootmem_low_pages(unsigned long max_low_pfn) -{ - int i; - - if (efi_enabled) { - efi_memmap_walk(free_available_memory, NULL); - return; - } - for (i = 0; i < e820.nr_map; i++) { - unsigned long curr_pfn, last_pfn, size; - /* - * Reserve usable low memory - */ - if (e820.map[i].type != E820_RAM) - continue; - /* - * We are rounding up the start address of usable memory: - */ - curr_pfn = PFN_UP(e820.map[i].addr); - if (curr_pfn >= max_low_pfn) - continue; - /* - * ... and at the end of the usable range downwards: - */ - last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size); - - if (last_pfn > max_low_pfn) - last_pfn = max_low_pfn; - - /* - * .. finally, did all the rounding and playing - * around just make the area go away? - */ - if (last_pfn <= curr_pfn) - continue; - - size = last_pfn - curr_pfn; - free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size)); - } -} - -/* - * workaround for Dell systems that neglect to reserve EBDA - */ -static void __init reserve_ebda_region(void) -{ - unsigned int addr; - addr = get_bios_ebda(); - if (addr) - reserve_bootmem(addr, PAGE_SIZE); -} - -static unsigned long __init setup_memory(void) -{ - unsigned long bootmap_size, start_pfn, max_low_pfn; - - /* - * partially used pages are not usable - thus - * we are rounding upwards: - */ - start_pfn = PFN_UP(__pa(xen_start_info.pt_base)) + xen_start_info.nr_pt_frames; - - find_max_pfn(); - - max_low_pfn = find_max_low_pfn(); - -#ifdef CONFIG_HIGHMEM - highstart_pfn = highend_pfn = max_pfn; - if (max_pfn > max_low_pfn) { - highstart_pfn = max_low_pfn; - } - printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", - pages_to_mb(highend_pfn - highstart_pfn)); -#endif - printk(KERN_NOTICE "%ldMB LOWMEM available.\n", - pages_to_mb(max_low_pfn)); - /* - * Initialize the boot-time allocator (with low memory only): - */ - bootmap_size = init_bootmem(start_pfn, max_low_pfn); - - register_bootmem_low_pages(max_low_pfn); - - /* - * Reserve the bootmem bitmap itself as well. We do this in two - * steps (first step was init_bootmem()) because this catches - * the (very unlikely) case of us accidentally initializing the - * bootmem allocator with an invalid RAM area. - */ - reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) + - bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY)); - - /* reserve EBDA region, it's a 4K region */ - reserve_ebda_region(); - - /* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent - PCI prefetch into it (errata #56). Usually the page is reserved anyways, - unless you have no PS/2 mouse plugged in. */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 == 6) - reserve_bootmem(0xa0000 - 4096, 4096); - -#ifdef CONFIG_SMP - /* - * But first pinch a few for the stack/trampoline stuff - * FIXME: Don't need the extra page at 4K, but need to fix - * trampoline before removing it. (see the GDT stuff) - */ - reserve_bootmem(PAGE_SIZE, PAGE_SIZE); -#endif -#ifdef CONFIG_ACPI_SLEEP - /* - * Reserve low memory region for sleep support. 
- */ - acpi_reserve_bootmem(); -#endif - -#ifdef CONFIG_BLK_DEV_INITRD - if (xen_start_info.mod_start) { - if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) { - /*reserve_bootmem(INITRD_START, INITRD_SIZE);*/ - initrd_start = INITRD_START + PAGE_OFFSET; - initrd_end = initrd_start+INITRD_SIZE; - initrd_below_start_ok = 1; - } - else { - printk(KERN_ERR "initrd extends beyond end of memory " - "(0x%08lx > 0x%08lx)\ndisabling initrd\n", - INITRD_START + INITRD_SIZE, - max_low_pfn << PAGE_SHIFT); - initrd_start = 0; - } - } -#endif - - phys_to_machine_mapping = (unsigned int *)xen_start_info.mfn_list; - - return max_low_pfn; -} -#else -extern unsigned long setup_memory(void); -#endif /* !CONFIG_DISCONTIGMEM */ - -/* - * Request address space for all standard RAM and ROM resources - * and also for regions reported as reserved by the e820. - */ -static void __init -legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource) -{ - int i; - -#ifdef CONFIG_XEN_PRIVILEGED_GUEST - probe_roms(); -#endif - for (i = 0; i < e820.nr_map; i++) { - struct resource *res; - if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL) - continue; - res = alloc_bootmem_low(sizeof(struct resource)); - switch (e820.map[i].type) { - case E820_RAM: res->name = "System RAM"; break; - case E820_ACPI: res->name = "ACPI Tables"; break; - case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; - default: res->name = "reserved"; - } - res->start = e820.map[i].addr; - res->end = res->start + e820.map[i].size - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; - request_resource(&iomem_resource, res); - if (e820.map[i].type == E820_RAM) { - /* - * We don't know which RAM region contains kernel data, - * so we try it repeatedly and let the resource manager - * test it. - */ - request_resource(res, code_resource); - request_resource(res, data_resource); - } - } -} - -/* - * Request address space for all standard resources - */ -static void __init register_memory(void) -{ - unsigned long gapstart, gapsize; - unsigned long long last; - int i; - - if (efi_enabled) - efi_initialize_iomem_resources(&code_resource, &data_resource); - else - legacy_init_iomem_resources(&code_resource, &data_resource); - - if (xen_start_info.flags & SIF_INITDOMAIN) - /* EFI systems may still have VGA */ - request_resource(&iomem_resource, &video_ram_resource); - - /* request I/O space for devices used on all i[345]86 PCs */ - for (i = 0; i < STANDARD_IO_RESOURCES; i++) - request_resource(&ioport_resource, &standard_io_resources[i]); - - /* - * Search for the bigest gap in the low 32 bits of the e820 - * memory space. - */ - last = 0x100000000ull; - gapstart = 0x10000000; - gapsize = 0x400000; - i = e820.nr_map; - while (--i >= 0) { - unsigned long long start = e820.map[i].addr; - unsigned long long end = start + e820.map[i].size; - - /* - * Since "last" is at most 4GB, we know we'll - * fit in 32 bits if this condition is true - */ - if (last > end) { - unsigned long gap = last - end; - - if (gap > gapsize) { - gapsize = gap; - gapstart = end; - } - } - if (start < last) - last = start; - } - - /* - * Start allocating dynamic PCI memory a bit into the gap, - * aligned up to the nearest megabyte. - * - * Question: should we try to pad it up a bit (do something - * like " + (gapsize >> 3)" in there too?). We now have the - * technology. 
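- * (The expression below, (gapstart + 0xfffff) & ~0xfffff, rounds the
- * gap start up to the next 1MB boundary.)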
- */ - pci_mem_start = (gapstart + 0xfffff) & ~0xfffff; - - printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n", - pci_mem_start, gapstart, gapsize); -} - -/* Use inline assembly to define this because the nops are defined - as inline assembly strings in the include files and we cannot - get them easily into strings. */ -asm("\t.data\nintelnops: " - GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 - GENERIC_NOP7 GENERIC_NOP8); -asm("\t.data\nk8nops: " - K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 - K8_NOP7 K8_NOP8); -asm("\t.data\nk7nops: " - K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 - K7_NOP7 K7_NOP8); - -extern unsigned char intelnops[], k8nops[], k7nops[]; -static unsigned char *intel_nops[ASM_NOP_MAX+1] = { - NULL, - intelnops, - intelnops + 1, - intelnops + 1 + 2, - intelnops + 1 + 2 + 3, - intelnops + 1 + 2 + 3 + 4, - intelnops + 1 + 2 + 3 + 4 + 5, - intelnops + 1 + 2 + 3 + 4 + 5 + 6, - intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7, -}; -static unsigned char *k8_nops[ASM_NOP_MAX+1] = { - NULL, - k8nops, - k8nops + 1, - k8nops + 1 + 2, - k8nops + 1 + 2 + 3, - k8nops + 1 + 2 + 3 + 4, - k8nops + 1 + 2 + 3 + 4 + 5, - k8nops + 1 + 2 + 3 + 4 + 5 + 6, - k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, -}; -static unsigned char *k7_nops[ASM_NOP_MAX+1] = { - NULL, - k7nops, - k7nops + 1, - k7nops + 1 + 2, - k7nops + 1 + 2 + 3, - k7nops + 1 + 2 + 3 + 4, - k7nops + 1 + 2 + 3 + 4 + 5, - k7nops + 1 + 2 + 3 + 4 + 5 + 6, - k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, -}; -static struct nop { - int cpuid; - unsigned char **noptable; -} noptypes[] = { - { X86_FEATURE_K8, k8_nops }, - { X86_FEATURE_K7, k7_nops }, - { -1, NULL } -}; - -/* Replace instructions with better alternatives for this CPU type. - - This runs before SMP is initialized to avoid SMP problems with - self modifying code. This implies that assymetric systems where - APs have less capabilities than the boot processor are not handled. - In this case boot with "noreplacement". */ -void apply_alternatives(void *start, void *end) -{ - struct alt_instr *a; - int diff, i, k; - unsigned char **noptable = intel_nops; - for (i = 0; noptypes[i].cpuid >= 0; i++) { - if (boot_cpu_has(noptypes[i].cpuid)) { - noptable = noptypes[i].noptable; - break; - } - } - for (a = start; (void *)a < end; a++) { - if (!boot_cpu_has(a->cpuid)) - continue; - BUG_ON(a->replacementlen > a->instrlen); - memcpy(a->instr, a->replacement, a->replacementlen); - diff = a->instrlen - a->replacementlen; - /* Pad the rest with nops */ - for (i = a->replacementlen; diff > 0; diff -= k, i += k) { - k = diff; - if (k > ASM_NOP_MAX) - k = ASM_NOP_MAX; - memcpy(a->instr + i, noptable[k], k); - } - } -} - -static int no_replacement __initdata = 0; - -void __init alternative_instructions(void) -{ - extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; - if (no_replacement) - return; - apply_alternatives(__alt_instructions, __alt_instructions_end); -} - -static int __init noreplacement_setup(char *s) -{ - no_replacement = 1; - return 0; -} - -__setup("noreplacement", noreplacement_setup); - -static char * __init machine_specific_memory_setup(void); - -#ifdef CONFIG_MCA -static void set_mca_bus(int x) -{ - MCA_bus = x; -} -#else -static void set_mca_bus(int x) { } -#endif - -/* - * Determine if we were loaded by an EFI loader. If so, then we have also been - * passed the efi memmap, systab, etc., so we should use these data structures - * for initialization. Note, the efi init code path is determined by the - * global efi_enabled. 
This allows the same kernel image to be used on existing
- * systems (with a traditional BIOS) as well as on EFI systems.
- */
-void __init setup_arch(char **cmdline_p)
-{
- int i, j;
- physdev_op_t op;
- unsigned long max_low_pfn;
-
- /* Force a quick death if the kernel panics. */
- extern int panic_timeout;
- if (panic_timeout == 0)
- panic_timeout = 1;
-
- /* Register a call for panic conditions. */
- notifier_chain_register(&panic_notifier_list, &xen_panic_block);
-
- HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
- HYPERVISOR_vm_assist(VMASST_CMD_enable,
- VMASST_TYPE_writable_pagetables);
-
- memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
- early_cpu_init();
-
- /*
- * FIXME: This isn't an official loader_type right
- * now but does currently work with elilo.
- * If we were configured as an EFI kernel, check to make
- * sure that we were loaded correctly from elilo and that
- * the system table is valid. If not, then initialize normally.
- */
-#ifdef CONFIG_EFI
- if ((LOADER_TYPE == 0x50) && EFI_SYSTAB)
- efi_enabled = 1;
-#endif
-
- /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
- properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
- */
- ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
- drive_info = DRIVE_INFO;
- screen_info = SCREEN_INFO;
- edid_info = EDID_INFO;
- apm_info.bios = APM_BIOS_INFO;
- ist_info = IST_INFO;
- saved_videomode = VIDEO_MODE;
- if( SYS_DESC_TABLE.length != 0 ) {
- set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2);
- machine_id = SYS_DESC_TABLE.table[0];
- machine_submodel_id = SYS_DESC_TABLE.table[1];
- BIOS_revision = SYS_DESC_TABLE.table[2];
- }
- aux_device_present = AUX_DEVICE_INFO;
- bootloader_type = LOADER_TYPE;
-
-#ifdef CONFIG_XEN_PHYSDEV_ACCESS
- /* This is drawn from a dump from vgacon:startup in standard Linux. */
- screen_info.orig_video_mode = 3;
- screen_info.orig_video_isVGA = 1;
- screen_info.orig_video_lines = 25;
- screen_info.orig_video_cols = 80;
- screen_info.orig_video_ega_bx = 3;
- screen_info.orig_video_points = 16;
-#endif
-
-#ifdef CONFIG_BLK_DEV_RAM
- rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
- rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
- rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
-#endif
- ARCH_SETUP
- if (efi_enabled)
- efi_init();
- else {
- printk(KERN_INFO "BIOS-provided physical RAM map:\n");
- print_memory_map(machine_specific_memory_setup());
- }
-
- copy_edd();
-
- if (!MOUNT_ROOT_RDONLY)
- root_mountflags &= ~MS_RDONLY;
- init_mm.start_code = (unsigned long) _text;
- init_mm.end_code = (unsigned long) _etext;
- init_mm.end_data = (unsigned long) _edata;
- init_mm.brk = (PFN_UP(__pa(xen_start_info.pt_base)) +
- xen_start_info.nr_pt_frames) << PAGE_SHIFT;
-
- /* XEN: This is nonsense: kernel may not even be contiguous in RAM. */
- /*code_resource.start = virt_to_phys(_text);*/
- /*code_resource.end = virt_to_phys(_etext)-1;*/
- /*data_resource.start = virt_to_phys(_etext);*/
- /*data_resource.end = virt_to_phys(_edata)-1;*/
-
- parse_cmdline_early(cmdline_p);
-
- max_low_pfn = setup_memory();
-
- /*
- * NOTE: before this point _nobody_ is allowed to allocate
- * any memory using the bootmem allocator. Although the
- * allocator is now initialised, only the first 8MB of the kernel
- * virtual address space has been mapped. All allocations before
- * paging_init() has completed must use the alloc_bootmem_low_pages()
- * variant (which allocates DMA'able memory) and care must be taken
- * not to exceed the 8MB limit. 
- */ - -#ifdef CONFIG_SMP - smp_alloc_memory(); /* AP processor realmode stacks in low memory*/ -#endif - paging_init(); - -#ifdef CONFIG_X86_FIND_SMP_CONFIG - /* - * Find and reserve possible boot-time SMP configuration: - */ - find_smp_config(); -#endif - - /* Make sure we have a correctly sized P->M table. */ - if (max_pfn != xen_start_info.nr_pages) { - phys_to_machine_mapping = alloc_bootmem_low_pages( - max_pfn * sizeof(unsigned long)); - - if (max_pfn > xen_start_info.nr_pages) { - /* set to INVALID_P2M_ENTRY */ - memset(phys_to_machine_mapping, ~0, - max_pfn * sizeof(unsigned long)); - memcpy(phys_to_machine_mapping, - (unsigned long *)xen_start_info.mfn_list, - xen_start_info.nr_pages * sizeof(unsigned long)); - } else { - memcpy(phys_to_machine_mapping, - (unsigned long *)xen_start_info.mfn_list, - max_pfn * sizeof(unsigned long)); - if (HYPERVISOR_dom_mem_op( - MEMOP_decrease_reservation, - (unsigned long *)xen_start_info.mfn_list + max_pfn, - xen_start_info.nr_pages - max_pfn, 0) != - (xen_start_info.nr_pages - max_pfn)) BUG(); - } - free_bootmem( - __pa(xen_start_info.mfn_list), - PFN_PHYS(PFN_UP(xen_start_info.nr_pages * - sizeof(unsigned long)))); - } - - pfn_to_mfn_frame_list = alloc_bootmem_low_pages(PAGE_SIZE); - for ( i=0, j=0; i < max_pfn; i+=(PAGE_SIZE/sizeof(unsigned long)), j++ ) - { - pfn_to_mfn_frame_list[j] = - virt_to_machine(&phys_to_machine_mapping[i]) >> PAGE_SHIFT; - } - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list = - virt_to_machine(pfn_to_mfn_frame_list) >> PAGE_SHIFT; - - - /* - * NOTE: at this point the bootmem allocator is fully available. - */ - -#ifdef CONFIG_EARLY_PRINTK - { - char *s = strstr(*cmdline_p, "earlyprintk="); - if (s) { - extern void setup_early_printk(char *); - - setup_early_printk(s); - printk("early console enabled\n"); - } - } -#endif - - - dmi_scan_machine(); - -#ifdef CONFIG_X86_GENERICARCH - generic_apic_probe(*cmdline_p); -#endif - if (efi_enabled) - efi_map_memmap(); - - op.cmd = PHYSDEVOP_SET_IOPL; - op.u.set_iopl.iopl = current->thread.io_pl = 1; - HYPERVISOR_physdev_op(&op); - -#ifdef CONFIG_ACPI_BOOT - if (!(xen_start_info.flags & SIF_INITDOMAIN)) { - printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); - acpi_disabled = 1; - acpi_ht = 0; - } -#endif - - /* - * Parse the ACPI tables for possible boot-time SMP configuration. - */ - acpi_boot_table_init(); - acpi_boot_init(); - -#ifdef CONFIG_X86_LOCAL_APIC - if (smp_found_config) - get_smp_config(); -#endif - - /* XXX Disable irqdebug until we have a way to avoid interrupt - * conflicts. */ - noirqdebug_setup(""); - - register_memory(); - - if (xen_start_info.flags & SIF_INITDOMAIN) { - if (!(xen_start_info.flags & SIF_PRIVILEGED)) - panic("Xen granted us console access " - "but not privileged status"); - -#ifdef CONFIG_VT -#if defined(CONFIG_VGA_CONSOLE) - if (!efi_enabled || - (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) - conswitchp = &vga_con; -#elif defined(CONFIG_DUMMY_CONSOLE) - conswitchp = &dummy_con; -#endif -#endif - } else { -#ifdef CONFIG_XEN_PRIVILEGED_GUEST - extern const struct consw xennull_con; - extern int console_use_vt; -#if defined(CONFIG_VGA_CONSOLE) - /* disable VGA driver */ - ORIG_VIDEO_ISVGA = VIDEO_TYPE_VLFB; -#endif - conswitchp = &xennull_con; - console_use_vt = 0; -#endif - } -} - -static int -xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) -{ - HYPERVISOR_crash(); - /* we're never actually going to get here... 
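(HYPERVISOR_crash() does not return; the NOTIFY_DONE below merely satisfies the notifier prototype.)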
*/ - return NOTIFY_DONE; -} - -#include "setup_arch_post.h" -/* - * Local Variables: - * mode:c - * c-file-style:"k&r" - * c-basic-offset:8 - * End: - */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/kernel/signal.c --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/signal.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,656 +0,0 @@ -/* - * linux/arch/i386/kernel/signal.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson - * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes - */ - -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/smp_lock.h> -#include <linux/kernel.h> -#include <linux/signal.h> -#include <linux/errno.h> -#include <linux/wait.h> -#include <linux/unistd.h> -#include <linux/stddef.h> -#include <linux/personality.h> -#include <linux/suspend.h> -#include <linux/ptrace.h> -#include <linux/elf.h> -#include <asm/processor.h> -#include <asm/ucontext.h> -#include <asm/uaccess.h> -#include <asm/i387.h> -#include "sigframe.h" - -#define DEBUG_SIG 0 - -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - -/* - * Atomically swap in the new signal mask, and wait for a signal. - */ -asmlinkage int -sys_sigsuspend(int history0, int history1, old_sigset_t mask) -{ - struct pt_regs * regs = (struct pt_regs *) &history0; - sigset_t saveset; - - mask &= _BLOCKABLE; - spin_lock_irq(¤t->sighand->siglock); - saveset = current->blocked; - siginitset(¤t->blocked, mask); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - regs->eax = -EINTR; - while (1) { - current->state = TASK_INTERRUPTIBLE; - schedule(); - if (do_signal(regs, &saveset)) - return -EINTR; - } -} - -asmlinkage int -sys_rt_sigsuspend(struct pt_regs regs) -{ - sigset_t saveset, newset; - - /* XXX: Don't preclude handling different sized sigset_t's. */ - if (regs.ecx != sizeof(sigset_t)) - return -EINVAL; - - if (copy_from_user(&newset, (sigset_t __user *)regs.ebx, sizeof(newset))) - return -EFAULT; - sigdelsetmask(&newset, ~_BLOCKABLE); - - spin_lock_irq(¤t->sighand->siglock); - saveset = current->blocked; - current->blocked = newset; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - regs.eax = -EINTR; - while (1) { - current->state = TASK_INTERRUPTIBLE; - schedule(); - if (do_signal(®s, &saveset)) - return -EINTR; - } -} - -asmlinkage int -sys_sigaction(int sig, const struct old_sigaction __user *act, - struct old_sigaction __user *oact) -{ - struct k_sigaction new_ka, old_ka; - int ret; - - if (act) { - old_sigset_t mask; - if (verify_area(VERIFY_READ, act, sizeof(*act)) || - __get_user(new_ka.sa.sa_handler, &act->sa_handler) || - __get_user(new_ka.sa.sa_restorer, &act->sa_restorer)) - return -EFAULT; - __get_user(new_ka.sa.sa_flags, &act->sa_flags); - __get_user(mask, &act->sa_mask); - siginitset(&new_ka.sa.sa_mask, mask); - } - - ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? 
&old_ka : NULL); - - if (!ret && oact) { - if (verify_area(VERIFY_WRITE, oact, sizeof(*oact)) || - __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || - __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer)) - return -EFAULT; - __put_user(old_ka.sa.sa_flags, &oact->sa_flags); - __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); - } - - return ret; -} - -asmlinkage int -sys_sigaltstack(unsigned long ebx) -{ - /* This is needed to make gcc realize it doesn't own the "struct pt_regs" */ - struct pt_regs *regs = (struct pt_regs *)&ebx; - const stack_t __user *uss = (const stack_t __user *)ebx; - stack_t __user *uoss = (stack_t __user *)regs->ecx; - - return do_sigaltstack(uss, uoss, regs->esp); -} - - -/* - * Do a signal return; undo the signal stack. - */ - -static int -restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax) -{ - unsigned int err = 0; - - /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; - -#define COPY(x) err |= __get_user(regs->x, &sc->x) - -#define COPY_SEG(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - regs->x##seg = tmp; } - -#define COPY_SEG_STRICT(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - regs->x##seg = tmp|3; } - -#define GET_SEG(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - loadsegment(seg,tmp); } - -#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | X86_EFLAGS_DF | \ - X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \ - X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF) - - GET_SEG(gs); - GET_SEG(fs); - COPY_SEG(es); - COPY_SEG(ds); - COPY(edi); - COPY(esi); - COPY(ebp); - COPY(esp); - COPY(ebx); - COPY(edx); - COPY(ecx); - COPY(eip); - COPY_SEG_STRICT(cs); - COPY_SEG_STRICT(ss); - - { - unsigned int tmpflags; - err |= __get_user(tmpflags, &sc->eflags); - regs->eflags = (regs->eflags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); - regs->orig_eax = -1; /* disable syscall checks */ - } - - { - struct _fpstate __user * buf; - err |= __get_user(buf, &sc->fpstate); - if (buf) { - if (verify_area(VERIFY_READ, buf, sizeof(*buf))) - goto badframe; - err |= restore_i387(buf); - } else { - struct task_struct *me = current; - if (used_math()) { - clear_fpu(me); - clear_used_math(); - } - } - } - - err |= __get_user(*peax, &sc->eax); - return err; - -badframe: - return 1; -} - -asmlinkage int sys_sigreturn(unsigned long __unused) -{ - struct pt_regs *regs = (struct pt_regs *) &__unused; - struct sigframe __user *frame = (struct sigframe __user *)(regs->esp - 8); - sigset_t set; - int eax; - - if (verify_area(VERIFY_READ, frame, sizeof(*frame))) - goto badframe; - if (__get_user(set.sig[0], &frame->sc.oldmask) - || (_NSIG_WORDS > 1 - && __copy_from_user(&set.sig[1], &frame->extramask, - sizeof(frame->extramask)))) - goto badframe; - - sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(¤t->sighand->siglock); - current->blocked = set; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - if (restore_sigcontext(regs, &frame->sc, &eax)) - goto badframe; - return eax; - -badframe: - force_sig(SIGSEGV, current); - return 0; -} - -asmlinkage int sys_rt_sigreturn(unsigned long __unused) -{ - struct pt_regs *regs = (struct pt_regs *) &__unused; - struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->esp - 4); - sigset_t set; - int eax; - - if (verify_area(VERIFY_READ, frame, sizeof(*frame))) - goto badframe; - if (__copy_from_user(&set, &frame->uc.uc_sigmask, 
sizeof(set))) - goto badframe; - - sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(¤t->sighand->siglock); - current->blocked = set; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) - goto badframe; - - if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->esp) == -EFAULT) - goto badframe; - - return eax; - -badframe: - force_sig(SIGSEGV, current); - return 0; -} - -/* - * Set up a signal frame. - */ - -static int -setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, - struct pt_regs *regs, unsigned long mask) -{ - int tmp, err = 0; - - tmp = 0; - __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp)); - err |= __put_user(tmp, (unsigned int __user *)&sc->gs); - __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp)); - err |= __put_user(tmp, (unsigned int __user *)&sc->fs); - - err |= __put_user(regs->xes, (unsigned int __user *)&sc->es); - err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds); - err |= __put_user(regs->edi, &sc->edi); - err |= __put_user(regs->esi, &sc->esi); - err |= __put_user(regs->ebp, &sc->ebp); - err |= __put_user(regs->esp, &sc->esp); - err |= __put_user(regs->ebx, &sc->ebx); - err |= __put_user(regs->edx, &sc->edx); - err |= __put_user(regs->ecx, &sc->ecx); - err |= __put_user(regs->eax, &sc->eax); - err |= __put_user(current->thread.trap_no, &sc->trapno); - err |= __put_user(current->thread.error_code, &sc->err); - err |= __put_user(regs->eip, &sc->eip); - err |= __put_user(regs->xcs, (unsigned int __user *)&sc->cs); - err |= __put_user(regs->eflags, &sc->eflags); - err |= __put_user(regs->esp, &sc->esp_at_signal); - err |= __put_user(regs->xss, (unsigned int __user *)&sc->ss); - - tmp = save_i387(fpstate); - if (tmp < 0) - err = 1; - else - err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate); - - /* non-iBCS2 extensions.. */ - err |= __put_user(mask, &sc->oldmask); - err |= __put_user(current->thread.cr2, &sc->cr2); - - return err; -} - -/* - * Determine which stack to use.. - */ -static inline void __user * -get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) -{ - unsigned long esp; - - /* Default to using normal stack */ - esp = regs->esp; - - /* This is the X/Open sanctioned signal stack switching. */ - if (ka->sa.sa_flags & SA_ONSTACK) { - if (sas_ss_flags(esp) == 0) - esp = current->sas_ss_sp + current->sas_ss_size; - } - - /* This is the legacy signal stack switching. */ - else if ((regs->xss & 0xffff) != __USER_DS && - !(ka->sa.sa_flags & SA_RESTORER) && - ka->sa.sa_restorer) { - esp = (unsigned long) ka->sa.sa_restorer; - } - - return (void __user *)((esp - frame_size) & -8ul); -} - -/* These symbols are defined with the addresses in the vsyscall page. - See vsyscall-sigreturn.S. */ -extern void __user __kernel_sigreturn; -extern void __user __kernel_rt_sigreturn; - -static void setup_frame(int sig, struct k_sigaction *ka, - sigset_t *set, struct pt_regs * regs) -{ - void __user *restorer; - struct sigframe __user *frame; - int err = 0; - int usig; - - frame = get_sigframe(ka, regs, sizeof(*frame)); - - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; - - usig = current_thread_info()->exec_domain - && current_thread_info()->exec_domain->signal_invmap - && sig < 32 - ? 
current_thread_info()->exec_domain->signal_invmap[sig] - : sig; - - err = __put_user(usig, &frame->sig); - if (err) - goto give_sigsegv; - - err = setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]); - if (err) - goto give_sigsegv; - - if (_NSIG_WORDS > 1) { - err = __copy_to_user(&frame->extramask, &set->sig[1], - sizeof(frame->extramask)); - if (err) - goto give_sigsegv; - } - - restorer = &__kernel_sigreturn; - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; - - /* Set up to return from userspace. */ - err |= __put_user(restorer, &frame->pretcode); - - /* - * This is popl %eax ; movl $,%eax ; int $0x80 - * - * WE DO NOT USE IT ANY MORE! It's only left here for historical - * reasons and because gdb uses it as a signature to notice - * signal handler stack frames. - */ - err |= __put_user(0xb858, (short __user *)(frame->retcode+0)); - err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2)); - err |= __put_user(0x80cd, (short __user *)(frame->retcode+6)); - - if (err) - goto give_sigsegv; - - /* Set up registers for signal handler */ - regs->esp = (unsigned long) frame; - regs->eip = (unsigned long) ka->sa.sa_handler; - regs->eax = (unsigned long) sig; - regs->edx = (unsigned long) 0; - regs->ecx = (unsigned long) 0; - - set_fs(USER_DS); - regs->xds = __USER_DS; - regs->xes = __USER_DS; - regs->xss = __USER_DS; - regs->xcs = __USER_CS; - - /* - * Clear TF when entering the signal handler, but - * notify any tracer that was single-stepping it. - * The tracer may want to single-step inside the - * handler too. - */ - regs->eflags &= ~TF_MASK; - if (test_thread_flag(TIF_SINGLESTEP)) - ptrace_notify(SIGTRAP); - -#if DEBUG_SIG - printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", - current->comm, current->pid, frame, regs->eip, frame->pretcode); -#endif - - return; - -give_sigsegv: - force_sigsegv(sig, current); -} - -static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs * regs) -{ - void __user *restorer; - struct rt_sigframe __user *frame; - int err = 0; - int usig; - - frame = get_sigframe(ka, regs, sizeof(*frame)); - - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; - - usig = current_thread_info()->exec_domain - && current_thread_info()->exec_domain->signal_invmap - && sig < 32 - ? current_thread_info()->exec_domain->signal_invmap[sig] - : sig; - - err |= __put_user(usig, &frame->sig); - err |= __put_user(&frame->info, &frame->pinfo); - err |= __put_user(&frame->uc, &frame->puc); - err |= copy_siginfo_to_user(&frame->info, info); - if (err) - goto give_sigsegv; - - /* Create the ucontext. */ - err |= __put_user(0, &frame->uc.uc_flags); - err |= __put_user(0, &frame->uc.uc_link); - err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(regs->esp), - &frame->uc.uc_stack.ss_flags); - err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, - regs, set->sig[0]); - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - if (err) - goto give_sigsegv; - - /* Set up to return from userspace. */ - restorer = &__kernel_rt_sigreturn; - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; - err |= __put_user(restorer, &frame->pretcode); - - /* - * This is movl $,%eax ; int $0x80 - * - * WE DO NOT USE IT ANY MORE! 
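- * (Decoded little-endian, the stores below are 0xb8, i.e. movl
- * $imm32,%eax, the four-byte __NR_rt_sigreturn, then 0xcd 0x80,
- * i.e. int $0x80.)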
It's only left here for historical - * reasons and because gdb uses it as a signature to notice - * signal handler stack frames. - */ - err |= __put_user(0xb8, (char __user *)(frame->retcode+0)); - err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1)); - err |= __put_user(0x80cd, (short __user *)(frame->retcode+5)); - - if (err) - goto give_sigsegv; - - /* Set up registers for signal handler */ - regs->esp = (unsigned long) frame; - regs->eip = (unsigned long) ka->sa.sa_handler; - regs->eax = (unsigned long) usig; - regs->edx = (unsigned long) &frame->info; - regs->ecx = (unsigned long) &frame->uc; - - set_fs(USER_DS); - regs->xds = __USER_DS; - regs->xes = __USER_DS; - regs->xss = __USER_DS; - regs->xcs = __USER_CS; - - /* - * Clear TF when entering the signal handler, but - * notify any tracer that was single-stepping it. - * The tracer may want to single-step inside the - * handler too. - */ - regs->eflags &= ~TF_MASK; - if (test_thread_flag(TIF_SINGLESTEP)) - ptrace_notify(SIGTRAP); - -#if DEBUG_SIG - printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", - current->comm, current->pid, frame, regs->eip, frame->pretcode); -#endif - - return; - -give_sigsegv: - force_sigsegv(sig, current); -} - -/* - * OK, we're invoking a handler - */ - -static void -handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, - sigset_t *oldset, struct pt_regs * regs) -{ - /* Are we from a system call? */ - if (regs->orig_eax >= 0) { - /* If so, check system call restarting.. */ - switch (regs->eax) { - case -ERESTART_RESTARTBLOCK: - case -ERESTARTNOHAND: - regs->eax = -EINTR; - break; - - case -ERESTARTSYS: - if (!(ka->sa.sa_flags & SA_RESTART)) { - regs->eax = -EINTR; - break; - } - /* fallthrough */ - case -ERESTARTNOINTR: - regs->eax = regs->orig_eax; - regs->eip -= 2; - } - } - - /* Set up the stack frame */ - if (ka->sa.sa_flags & SA_SIGINFO) - setup_rt_frame(sig, ka, info, oldset, regs); - else - setup_frame(sig, ka, oldset, regs); - - if (!(ka->sa.sa_flags & SA_NODEFER)) { - spin_lock_irq(¤t->sighand->siglock); - sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); - sigaddset(¤t->blocked,sig); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - } -} - -/* - * Note that 'init' is a special process: it doesn't get signals it doesn't - * want to handle. Thus you cannot kill init even with a SIGKILL even by - * mistake. - */ -int fastcall do_signal(struct pt_regs *regs, sigset_t *oldset) -{ - siginfo_t info; - int signr; - struct k_sigaction ka; - - /* - * We want the common case to go fast, which - * is why we may in certain cases get here from - * kernel mode. Just return without doing anything - * if so. - */ - if ((regs->xcs & 2) != 2) - return 1; - - if (current->flags & PF_FREEZE) { - refrigerator(0); - goto no_signal; - } - - if (!oldset) - oldset = ¤t->blocked; - - signr = get_signal_to_deliver(&info, &ka, regs, NULL); - if (signr > 0) { - /* Reenable any watchpoints before delivering the - * signal to user space. The processor register will - * have been cleared if the watchpoint triggered - * inside the kernel. - */ - if (unlikely(current->thread.debugreg[7])) { - HYPERVISOR_set_debugreg(7, - current->thread.debugreg[7]); - } - - /* Whee! Actually deliver the signal. */ - handle_signal(signr, &info, &ka, oldset, regs); - return 1; - } - - no_signal: - /* Did we come from a system call? 
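(On i386, orig_eax holds the syscall number on syscall entry and -1 otherwise; eip -= 2 backs up over the two-byte int $0x80 so the syscall is re-issued.)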
*/ - if (regs->orig_eax >= 0) { - /* Restart the system call - no handlers present */ - if (regs->eax == -ERESTARTNOHAND || - regs->eax == -ERESTARTSYS || - regs->eax == -ERESTARTNOINTR) { - regs->eax = regs->orig_eax; - regs->eip -= 2; - } - if (regs->eax == -ERESTART_RESTARTBLOCK){ - regs->eax = __NR_restart_syscall; - regs->eip -= 2; - } - } - return 0; -} - -/* - * notification of userspace execution resumption - * - triggered by current->work.notify_resume - */ -__attribute__((regparm(3))) -void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, - __u32 thread_info_flags) -{ - /* Pending single-step? */ - if (thread_info_flags & _TIF_SINGLESTEP) { - regs->eflags |= TF_MASK; - clear_thread_flag(TIF_SINGLESTEP); - } - /* deal with pending signal delivery */ - if (thread_info_flags & _TIF_SIGPENDING) - do_signal(regs,oldset); - - clear_thread_flag(TIF_IRET); -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/kernel/smp.c --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/smp.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,624 +0,0 @@ -/* - * Intel SMP support routines. - * - * (c) 1995 Alan Cox, Building #3 <alan@xxxxxxxxxx> - * (c) 1998-99, 2000 Ingo Molnar <mingo@xxxxxxxxxx> - * - * This code is released under the GNU General Public License version 2 or - * later. - */ - -#include <linux/init.h> - -#include <linux/mm.h> -#include <linux/irq.h> -#include <linux/delay.h> -#include <linux/spinlock.h> -#include <linux/smp_lock.h> -#include <linux/kernel_stat.h> -#include <linux/mc146818rtc.h> -#include <linux/cache.h> -#include <linux/interrupt.h> -#include <linux/cpu.h> - -#include <asm/mtrr.h> -#include <asm/tlbflush.h> -#if 0 -#include <mach_apic.h> -#endif -#include <asm-xen/evtchn.h> - -#define xxprint(msg) HYPERVISOR_console_io(CONSOLEIO_write, strlen(msg), msg) - -/* - * Some notes on x86 processor bugs affecting SMP operation: - * - * Pentium, Pentium Pro, II, III (and all CPUs) have bugs. - * The Linux implications for SMP are handled as follows: - * - * Pentium III / [Xeon] - * None of the E1AP-E3AP errata are visible to the user. - * - * E1AP. see PII A1AP - * E2AP. see PII A2AP - * E3AP. see PII A3AP - * - * Pentium II / [Xeon] - * None of the A1AP-A3AP errata are visible to the user. - * - * A1AP. see PPro 1AP - * A2AP. see PPro 2AP - * A3AP. see PPro 7AP - * - * Pentium Pro - * None of 1AP-9AP errata are visible to the normal user, - * except occasional delivery of 'spurious interrupt' as trap #15. - * This is very rare and a non-problem. - * - * 1AP. Linux maps APIC as non-cacheable - * 2AP. worked around in hardware - * 3AP. fixed in C0 and above steppings microcode update. - * Linux does not use excessive STARTUP_IPIs. - * 4AP. worked around in hardware - * 5AP. symmetric IO mode (normal Linux operation) not affected. - * 'noapic' mode has vector 0xf filled out properly. - * 6AP. 'noapic' mode might be affected - fixed in later steppings - * 7AP. We do not assume writes to the LVT deassering IRQs - * 8AP. We do not enable low power mode (deep sleep) during MP bootup - * 9AP. We do not use mixed mode - * - * Pentium - * There is a marginal case where REP MOVS on 100MHz SMP - * machines with B stepping processors can fail. XXX should provide - * an L1cache=Writethrough or L1cache=off option. - * - * B stepping CPUs may hang. There are hardware work arounds - * for this. We warn about it in case your board doesn't have the work - * arounds. 
Basically thats so I can tell anyone with a B stepping - * CPU and SMP problems "tough". - * - * Specific items [From Pentium Processor Specification Update] - * - * 1AP. Linux doesn't use remote read - * 2AP. Linux doesn't trust APIC errors - * 3AP. We work around this - * 4AP. Linux never generated 3 interrupts of the same priority - * to cause a lost local interrupt. - * 5AP. Remote read is never used - * 6AP. not affected - worked around in hardware - * 7AP. not affected - worked around in hardware - * 8AP. worked around in hardware - we get explicit CS errors if not - * 9AP. only 'noapic' mode affected. Might generate spurious - * interrupts, we log only the first one and count the - * rest silently. - * 10AP. not affected - worked around in hardware - * 11AP. Linux reads the APIC between writes to avoid this, as per - * the documentation. Make sure you preserve this as it affects - * the C stepping chips too. - * 12AP. not affected - worked around in hardware - * 13AP. not affected - worked around in hardware - * 14AP. we always deassert INIT during bootup - * 15AP. not affected - worked around in hardware - * 16AP. not affected - worked around in hardware - * 17AP. not affected - worked around in hardware - * 18AP. not affected - worked around in hardware - * 19AP. not affected - worked around in BIOS - * - * If this sounds worrying believe me these bugs are either ___RARE___, - * or are signal timing bugs worked around in hardware and there's - * about nothing of note with C stepping upwards. - */ - -DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, }; - -/* - * the following functions deal with sending IPIs between CPUs. - * - * We use 'broadcast', CPU->CPU IPIs and self-IPIs too. - */ - -static inline int __prepare_ICR (unsigned int shortcut, int vector) -{ - return APIC_DM_FIXED | shortcut | vector | APIC_DEST_LOGICAL; -} - -static inline int __prepare_ICR2 (unsigned int mask) -{ - return SET_APIC_DEST_FIELD(mask); -} - -DECLARE_PER_CPU(int, ipi_to_evtchn[NR_IPIS]); - -static inline void __send_IPI_one(unsigned int cpu, int vector) -{ - unsigned int evtchn; - - evtchn = per_cpu(ipi_to_evtchn, cpu)[vector]; - // printk("send_IPI_mask_bitmask cpu %d vector %d evtchn %d\n", cpu, vector, evtchn); - if (evtchn) { -#if 0 - shared_info_t *s = HYPERVISOR_shared_info; - while (synch_test_bit(evtchn, &s->evtchn_pending[0]) || - synch_test_bit(evtchn, &s->evtchn_mask[0])) - ; -#endif - notify_via_evtchn(evtchn); - } else - printk("send_IPI to unbound port %d/%d", - cpu, vector); -} - -void __send_IPI_shortcut(unsigned int shortcut, int vector) -{ - int cpu; - - switch (shortcut) { - case APIC_DEST_SELF: - __send_IPI_one(smp_processor_id(), vector); - break; - case APIC_DEST_ALLBUT: - for (cpu = 0; cpu < NR_CPUS; ++cpu) { - if (cpu == smp_processor_id()) - continue; - if (cpu_isset(cpu, cpu_online_map)) { - __send_IPI_one(cpu, vector); - } - } - break; - default: - printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut, - vector); - break; - } -} - -void fastcall send_IPI_self(int vector) -{ - __send_IPI_shortcut(APIC_DEST_SELF, vector); -} - -/* - * This is only used on smaller machines. 
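- * (There is no APIC to program under Xen: __send_IPI_one() above kicks
- * the per-CPU event channel bound to the vector, the event channel
- * being Xen's stand-in for a hardware IPI.)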
- */ -void send_IPI_mask_bitmask(cpumask_t mask, int vector) -{ - unsigned long flags; - unsigned int cpu; - - local_irq_save(flags); - WARN_ON(cpus_addr(mask)[0] & ~cpus_addr(cpu_online_map)[0]); - - for (cpu = 0; cpu < NR_CPUS; ++cpu) { - if (cpu_isset(cpu, mask)) { - __send_IPI_one(cpu, vector); - } - } - - local_irq_restore(flags); -} - -inline void send_IPI_mask_sequence(cpumask_t mask, int vector) -{ - - send_IPI_mask_bitmask(mask, vector); -} - -#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */ - -#if 0 /* XEN */ -/* - * Smarter SMP flushing macros. - * c/o Linus Torvalds. - * - * These mean you can really definitely utterly forget about - * writing to user space from interrupts. (Its not allowed anyway). - * - * Optimizations Manfred Spraul <manfred@xxxxxxxxxxxxxxxx> - */ - -static cpumask_t flush_cpumask; -static struct mm_struct * flush_mm; -static unsigned long flush_va; -static DEFINE_SPINLOCK(tlbstate_lock); -#define FLUSH_ALL 0xffffffff - -/* - * We cannot call mmdrop() because we are in interrupt context, - * instead update mm->cpu_vm_mask. - * - * We need to reload %cr3 since the page tables may be going - * away from under us.. - */ -static inline void leave_mm (unsigned long cpu) -{ - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) - BUG(); - cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask); - load_cr3(swapper_pg_dir); -} - -/* - * - * The flush IPI assumes that a thread switch happens in this order: - * [cpu0: the cpu that switches] - * 1) switch_mm() either 1a) or 1b) - * 1a) thread switch to a different mm - * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); - * Stop ipi delivery for the old mm. This is not synchronized with - * the other cpus, but smp_invalidate_interrupt ignore flush ipis - * for the wrong mm, and in the worst case we perform a superflous - * tlb flush. - * 1a2) set cpu_tlbstate to TLBSTATE_OK - * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 - * was in lazy tlb mode. - * 1a3) update cpu_tlbstate[].active_mm - * Now cpu0 accepts tlb flushes for the new mm. - * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); - * Now the other cpus will send tlb flush ipis. - * 1a4) change cr3. - * 1b) thread switch without mm change - * cpu_tlbstate[].active_mm is correct, cpu0 already handles - * flush ipis. - * 1b1) set cpu_tlbstate to TLBSTATE_OK - * 1b2) test_and_set the cpu bit in cpu_vm_mask. - * Atomically set the bit [other cpus will start sending flush ipis], - * and test the bit. - * 1b3) if the bit was 0: leave_mm was called, flush the tlb. - * 2) switch %%esp, ie current - * - * The interrupt must handle 2 special cases: - * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. - * - the cpu performs speculative tlb reads, i.e. even if the cpu only - * runs in kernel space, the cpu could load tlb entries for user space - * pages. - * - * The good news is that cpu_tlbstate is local to each cpu, no - * write/read ordering problems. - */ - -/* - * TLB flush IPI: - * - * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. - * 2) Leave the mm if we are in the lazy tlb mode. 
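Step 1b2/1b3 in the ordering argument above is the subtle one: a CPU that switches threads without switching mm re-joins the flush set with a single atomic test-and-set, and flushes locally only if its bit had been cleared in the meantime. A minimal C11 sketch of just that idiom, with cpu_vm_mask modelled as a plain bit mask and local_flush() invented for the demonstration:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long cpu_vm_mask;	/* one bit per CPU */

static void local_flush(int cpu)
{
	printf("cpu%d: flush local TLB\n", cpu);
}

static void rejoin_mm(int cpu)
{
	unsigned long bit = 1UL << cpu;
	unsigned long old = atomic_fetch_or(&cpu_vm_mask, bit);

	/* bit was clear: leave_mm() ran, remote flush IPIs may have been
	 * skipped, so the local TLB must be assumed stale */
	if (!(old & bit))
		local_flush(cpu);
}

int main(void)
{
	rejoin_mm(2);	/* first re-join: bit was clear, flushes */
	rejoin_mm(2);	/* bit already set: nothing to do */
	return 0;
}

The atomicity is what closes the race: once the bit is visible, other CPUs start sending flush IPIs, and the returned old value tells this CPU whether it missed any before that point.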
- */ - -irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id, - struct pt_regs *regs) -{ - unsigned long cpu; - - cpu = get_cpu(); - - if (!cpu_isset(cpu, flush_cpumask)) - goto out; - /* - * This was a BUG() but until someone can quote me the - * line from the intel manual that guarantees an IPI to - * multiple CPUs is retried _only_ on the erroring CPUs - * its staying as a return - * - * BUG(); - */ - - if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { - if (flush_va == FLUSH_ALL) - local_flush_tlb(); - else - __flush_tlb_one(flush_va); - } else - leave_mm(cpu); - } - smp_mb__before_clear_bit(); - cpu_clear(cpu, flush_cpumask); - smp_mb__after_clear_bit(); -out: - put_cpu_no_resched(); - - return IRQ_HANDLED; -} - -static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, - unsigned long va) -{ - /* - * A couple of (to be removed) sanity checks: - * - * - current CPU must not be in mask - * - mask must exist :) - */ - BUG_ON(cpus_empty(cpumask)); - BUG_ON(cpu_isset(smp_processor_id(), cpumask)); - BUG_ON(!mm); - - /* If a CPU which we ran on has gone down, OK. */ - cpus_and(cpumask, cpumask, cpu_online_map); - if (cpus_empty(cpumask)) - return; - - /* - * i'm not happy about this global shared spinlock in the - * MM hot path, but we'll see how contended it is. - * Temporarily this turns IRQs off, so that lockups are - * detected by the NMI watchdog. - */ - spin_lock(&tlbstate_lock); - - flush_mm = mm; - flush_va = va; -#if NR_CPUS <= BITS_PER_LONG - atomic_set_mask(cpumask, &flush_cpumask); -#else - { - int k; - unsigned long *flush_mask = (unsigned long *)&flush_cpumask; - unsigned long *cpu_mask = (unsigned long *)&cpumask; - for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k) - atomic_set_mask(cpu_mask[k], &flush_mask[k]); - } -#endif - /* - * We have to send the IPI only to - * CPUs affected. - */ - send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR); - - while (!cpus_empty(flush_cpumask)) - /* nothing. 
lockup detection does not belong here */ - mb(); - - flush_mm = NULL; - flush_va = 0; - spin_unlock(&tlbstate_lock); -} - -void flush_tlb_current_task(void) -{ - struct mm_struct *mm = current->mm; - cpumask_t cpu_mask; - - preempt_disable(); - cpu_mask = mm->cpu_vm_mask; - cpu_clear(smp_processor_id(), cpu_mask); - - local_flush_tlb(); - if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); - preempt_enable(); -} - -void flush_tlb_mm (struct mm_struct * mm) -{ - cpumask_t cpu_mask; - - preempt_disable(); - cpu_mask = mm->cpu_vm_mask; - cpu_clear(smp_processor_id(), cpu_mask); - - if (current->active_mm == mm) { - if (current->mm) - local_flush_tlb(); - else - leave_mm(smp_processor_id()); - } - if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); - - preempt_enable(); -} - -void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) -{ - struct mm_struct *mm = vma->vm_mm; - cpumask_t cpu_mask; - - preempt_disable(); - cpu_mask = mm->cpu_vm_mask; - cpu_clear(smp_processor_id(), cpu_mask); - - if (current->active_mm == mm) { - if(current->mm) - __flush_tlb_one(va); - else - leave_mm(smp_processor_id()); - } - - if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, va); - - preempt_enable(); -} - -static void do_flush_tlb_all(void* info) -{ - unsigned long cpu = smp_processor_id(); - - __flush_tlb_all(); - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY) - leave_mm(cpu); -} - -void flush_tlb_all(void) -{ - on_each_cpu(do_flush_tlb_all, NULL, 1, 1); -} - -#else - -irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id, - struct pt_regs *regs) -{ return 0; } -void flush_tlb_current_task(void) -{ xen_tlb_flush_mask(&current->mm->cpu_vm_mask); } -void flush_tlb_mm(struct mm_struct * mm) -{ xen_tlb_flush_mask(&mm->cpu_vm_mask); } -void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) -{ xen_invlpg_mask(&vma->vm_mm->cpu_vm_mask, va); } -void flush_tlb_all(void) -{ xen_tlb_flush_all(); } - -#endif /* XEN */ - -/* - * this function sends a 'reschedule' IPI to another CPU. - * it goes straight through and wastes no time serializing - * anything. Worst case is that we lose a reschedule ... - */ -void smp_send_reschedule(int cpu) -{ - WARN_ON(cpu_is_offline(cpu)); - send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); -} - -/* - * Structure and data for smp_call_function(). This is designed to minimise - * static memory requirements. It also looks cleaner. - */ -static DEFINE_SPINLOCK(call_lock); - -struct call_data_struct { - void (*func) (void *info); - void *info; - atomic_t started; - atomic_t finished; - int wait; -}; - -static struct call_data_struct * call_data; - -/* - * this function sends a 'generic call function' IPI to all other CPUs - * in the system. - */ - -int smp_call_function (void (*func) (void *info), void *info, int nonatomic, - int wait) -/* - * [SUMMARY] Run a function on all other CPUs. - * <func> The function to run. This must be fast and non-blocking. - * <info> An arbitrary pointer to pass to the function. - * <nonatomic> currently unused. - * <wait> If true, wait (atomically) until function has completed on other CPUs. - * [RETURNS] 0 on success, else a negative status code. Does not return until - * remote CPUs are nearly ready to execute <<func>> or are or have executed. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler.
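A hypothetical caller honouring the contract spelled out above: process context, interrupts enabled, and a fast, non-blocking callback. This fragment assumes a kernel compilation context with the usual atomic headers; bump_counter() and poke_all_cpus() are invented names, not part of the patch:

static void bump_counter(void *info)
{
	atomic_inc((atomic_t *)info);	/* runs on every other online CPU */
}

static int poke_all_cpus(void)
{
	static atomic_t hits = ATOMIC_INIT(0);

	/* nonatomic is unused; wait=1 blocks until all callbacks finish */
	return smp_call_function(bump_counter, &hits, 0, 1);
}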
- */ -{ - struct call_data_struct data; - int cpus; - - /* Holding any lock stops cpus from going down. */ - spin_lock(&call_lock); - cpus = num_online_cpus()-1; - - if (!cpus) { - spin_unlock(&call_lock); - return 0; - } - - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - - data.func = func; - data.info = info; - atomic_set(&data.started, 0); - data.wait = wait; - if (wait) - atomic_set(&data.finished, 0); - - call_data = &data; - mb(); - - /* Send a message to all other CPUs and wait for them to respond */ - send_IPI_allbutself(CALL_FUNCTION_VECTOR); - - /* Wait for response */ - while (atomic_read(&data.started) != cpus) - barrier(); - - if (wait) - while (atomic_read(&data.finished) != cpus) - barrier(); - spin_unlock(&call_lock); - - return 0; -} - -static void stop_this_cpu (void * dummy) -{ - /* - * Remove this CPU: - */ - cpu_clear(smp_processor_id(), cpu_online_map); - local_irq_disable(); -#if 1 - xxprint("stop_this_cpu disable_local_APIC\n"); -#else - disable_local_APIC(); -#endif - if (cpu_data[smp_processor_id()].hlt_works_ok) - for(;;) __asm__("hlt"); - for (;;); -} - -/* - * this function calls the 'stop' function on all other CPUs in the system. - */ - -void smp_send_stop(void) -{ - smp_call_function(stop_this_cpu, NULL, 1, 0); - - local_irq_disable(); -#if 1 - xxprint("smp_send_stop disable_local_APIC\n"); -#else - disable_local_APIC(); -#endif - local_irq_enable(); -} - -/* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. - */ -irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id, - struct pt_regs *regs) -{ - - return IRQ_HANDLED; -} - -#include <linux/kallsyms.h> -irqreturn_t smp_call_function_interrupt(int irq, void *dev_id, - struct pt_regs *regs) -{ - void (*func) (void *info) = call_data->func; - void *info = call_data->info; - int wait = call_data->wait; - - /* - * Notify initiating CPU that I've grabbed the data and am - * about to execute the function - */ - mb(); - atomic_inc(&call_data->started); - /* - * At this point the info structure may be out of scope unless wait==1 - */ - irq_enter(); - (*func)(info); - irq_exit(); - - if (wait) { - mb(); - atomic_inc(&call_data->finished); - } - - return IRQ_HANDLED; -} - diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/kernel/time.c --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/time.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,801 +0,0 @@ -/* - * linux/arch/i386/kernel/time.c - * - * Copyright (C) 1991, 1992, 1995 Linus Torvalds - * - * This file contains the PC-specific time handling details: - * reading the RTC at bootup, etc.. - * 1994-07-02 Alan Modra - * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime - * 1995-03-26 Markus Kuhn - * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887 - * precision CMOS clock update - * 1996-05-03 Ingo Molnar - * fixed time warps in do_[slow|fast]_gettimeoffset() - * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 - * "A Kernel Model for Precision Timekeeping" by Dave Mills - * 1998-09-05 (Various) - * More robust do_fast_gettimeoffset() algorithm implemented - * (works with APM, Cyrix 6x86MX and Centaur C6), - * monotonic gettimeofday() with fast_get_timeoffset(), - * drift-proof precision TSC calibration on boot - * (C. Scott Ananian <cananian@xxxxxxxxxxxxxxxxxxxx>, Andrew D. 
- * Balsa <andrebalsa@xxxxxxxxxx>, Philip Gladstone <philip@xxxxxxxxxx>; - * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@xxxxxxxxxxxxx>). - * 1998-12-16 Andrea Arcangeli - * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy - * because was not accounting lost_ticks. - * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli - * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to - * serialize accesses to xtime/lost_ticks). - */ - -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/param.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/interrupt.h> -#include <linux/time.h> -#include <linux/delay.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/module.h> -#include <linux/sysdev.h> -#include <linux/bcd.h> -#include <linux/efi.h> -#include <linux/mca.h> -#include <linux/sysctl.h> -#include <linux/percpu.h> - -#include <asm/io.h> -#include <asm/smp.h> -#include <asm/irq.h> -#include <asm/msr.h> -#include <asm/delay.h> -#include <asm/mpspec.h> -#include <asm/uaccess.h> -#include <asm/processor.h> -#include <asm/timer.h> - -#include "mach_time.h" - -#include <linux/timex.h> -#include <linux/config.h> - -#include <asm/hpet.h> - -#include <asm/arch_hooks.h> - -#include "io_ports.h" - -extern spinlock_t i8259A_lock; -int pit_latch_buggy; /* extern */ - -u64 jiffies_64 = INITIAL_JIFFIES; - -EXPORT_SYMBOL(jiffies_64); - -#if defined(__x86_64__) -unsigned long vxtime_hz = PIT_TICK_RATE; -struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */ -volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; -unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES; -struct timespec __xtime __section_xtime; -struct timezone __sys_tz __section_sys_tz; -#endif - -#if defined(__x86_64__) -unsigned int cpu_khz; /* Detected as we calibrate the TSC */ -#else -unsigned long cpu_khz; /* Detected as we calibrate the TSC */ -#endif - -extern unsigned long wall_jiffies; - -DEFINE_SPINLOCK(rtc_lock); - -DEFINE_SPINLOCK(i8253_lock); -EXPORT_SYMBOL(i8253_lock); - -extern struct init_timer_opts timer_tsc_init; -extern struct timer_opts timer_tsc; -struct timer_opts *cur_timer = &timer_tsc; - -/* These are peridically updated in shared_info, and then copied here. */ -u32 shadow_tsc_stamp; -u64 shadow_system_time; -static u32 shadow_time_version; -static struct timeval shadow_tv; - -/* - * We use this to ensure that gettimeofday() is monotonically increasing. We - * only break this guarantee if the wall clock jumps backwards "a long way". - */ -static struct timeval last_seen_tv = {0,0}; - -#ifdef CONFIG_XEN_PRIVILEGED_GUEST -/* Periodically propagate synchronised time base to the RTC and to Xen. */ -static long last_rtc_update, last_update_to_xen; -#endif - -/* Periodically take synchronised time base from Xen, if we need it. */ -static long last_update_from_xen; /* UTC seconds when last read Xen clock. */ - -/* Keep track of last time we did processing/updating of jiffies and xtime. */ -static u64 processed_system_time; /* System time (ns) at last processing. 
*/ -static DEFINE_PER_CPU(u64, processed_system_time); - -#define NS_PER_TICK (1000000000ULL/HZ) - -#define HANDLE_USEC_UNDERFLOW(_tv) do { \ - while ((_tv).tv_usec < 0) { \ - (_tv).tv_usec += USEC_PER_SEC; \ - (_tv).tv_sec--; \ - } \ -} while (0) -#define HANDLE_USEC_OVERFLOW(_tv) do { \ - while ((_tv).tv_usec >= USEC_PER_SEC) { \ - (_tv).tv_usec -= USEC_PER_SEC; \ - (_tv).tv_sec++; \ - } \ -} while (0) -static inline void __normalize_time(time_t *sec, s64 *nsec) -{ - while (*nsec >= NSEC_PER_SEC) { - (*nsec) -= NSEC_PER_SEC; - (*sec)++; - } - while (*nsec < 0) { - (*nsec) += NSEC_PER_SEC; - (*sec)--; - } -} - -/* Does this guest OS track Xen time, or set its wall clock independently? */ -static int independent_wallclock = 0; -static int __init __independent_wallclock(char *str) -{ - independent_wallclock = 1; - return 1; -} -__setup("independent_wallclock", __independent_wallclock); -#define INDEPENDENT_WALLCLOCK() \ - (independent_wallclock || (xen_start_info.flags & SIF_INITDOMAIN)) - -/* - * Reads a consistent set of time-base values from Xen, into a shadow data - * area. Must be called with the xtime_lock held for writing. - */ -static void __get_time_values_from_xen(void) -{ - shared_info_t *s = HYPERVISOR_shared_info; - - do { - shadow_time_version = s->time_version2; - rmb(); - shadow_tv.tv_sec = s->wc_sec; - shadow_tv.tv_usec = s->wc_usec; - shadow_tsc_stamp = (u32)s->tsc_timestamp; - shadow_system_time = s->system_time; - rmb(); - } - while (shadow_time_version != s->time_version1); - - cur_timer->mark_offset(); -} - -#define TIME_VALUES_UP_TO_DATE \ - ({ rmb(); (shadow_time_version == HYPERVISOR_shared_info->time_version2); }) - -/* - * This version of gettimeofday has microsecond resolution - * and better than microsecond precision on fast x86 machines with TSC. - */ -void do_gettimeofday(struct timeval *tv) -{ - unsigned long seq; - unsigned long usec, sec; - unsigned long max_ntp_tick; - unsigned long flags; - s64 nsec; - - do { - unsigned long lost; - - seq = read_seqbegin(&xtime_lock); - - usec = cur_timer->get_offset(); - lost = jiffies - wall_jiffies; - - /* - * If time_adjust is negative then NTP is slowing the clock - * so make sure not to go into next possible interval. - * Better to lose some accuracy than have time go backwards.. - */ - if (unlikely(time_adjust < 0)) { - max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj; - usec = min(usec, max_ntp_tick); - - if (lost) - usec += lost * max_ntp_tick; - } - else if (unlikely(lost)) - usec += lost * (USEC_PER_SEC / HZ); - - sec = xtime.tv_sec; - usec += (xtime.tv_nsec / NSEC_PER_USEC); - - nsec = shadow_system_time - processed_system_time; - __normalize_time(&sec, &nsec); - usec += (long)nsec / NSEC_PER_USEC; - - if (unlikely(!TIME_VALUES_UP_TO_DATE)) { - /* - * We may have blocked for a long time, - * rendering our calculations invalid - * (e.g. the time delta may have - * overflowed). Detect that and recalculate - * with fresh values. - */ - write_seqlock_irqsave(&xtime_lock, flags); - __get_time_values_from_xen(); - write_sequnlock_irqrestore(&xtime_lock, flags); - continue; - } - } while (read_seqretry(&xtime_lock, seq)); - - while (usec >= USEC_PER_SEC) { - usec -= USEC_PER_SEC; - sec++; - } - - /* Ensure that time-of-day is monotonically increasing. 
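A little further up, __get_time_values_from_xen() reads the shared-info time fields with a lock-free snapshot: Xen bumps one version counter before publishing new values and a second one after, so a reader that sees the two counters disagree, or sees the version change underfoot, simply retries. A toy version of that handshake, with invented field names:

#include <stdatomic.h>
#include <stdio.h>

struct shared_time {
	_Atomic unsigned version1, version2;
	unsigned long sec, usec;
};

static void writer_update(struct shared_time *s, unsigned long sec,
			  unsigned long usec)
{
	atomic_store(&s->version1, atomic_load(&s->version1) + 1);
	s->sec = sec;			/* a torn read is detectable ... */
	s->usec = usec;
	atomic_store(&s->version2, atomic_load(&s->version2) + 1);
}

static void reader_snapshot(struct shared_time *s, unsigned long *sec,
			    unsigned long *usec)
{
	unsigned v;

	do {				/* ... because the versions differ here */
		v = atomic_load(&s->version2);
		*sec = s->sec;
		*usec = s->usec;
	} while (v != atomic_load(&s->version1));
}

int main(void)
{
	struct shared_time s = {0};
	unsigned long sec, usec;

	writer_update(&s, 100, 250000);
	reader_snapshot(&s, &sec, &usec);
	printf("%lu.%06lu\n", sec, usec);
	return 0;
}

The real code uses explicit rmb() barriers between the version reads and the field reads; sequentially consistent atomics stand in for those in this sketch.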
*/ - if ((sec < last_seen_tv.tv_sec) || - ((sec == last_seen_tv.tv_sec) && (usec < last_seen_tv.tv_usec))) { - sec = last_seen_tv.tv_sec; - usec = last_seen_tv.tv_usec; - } else { - last_seen_tv.tv_sec = sec; - last_seen_tv.tv_usec = usec; - } - - tv->tv_sec = sec; - tv->tv_usec = usec; -} - -EXPORT_SYMBOL(do_gettimeofday); - -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec; - s64 nsec; - struct timespec xentime; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - if (!INDEPENDENT_WALLCLOCK()) - return 0; /* Silent failure? */ - - write_seqlock_irq(&xtime_lock); - - /* - * Ensure we don't get blocked for a long time so that our time delta - * overflows. If that were to happen then our shadow time values would - * be stale, so we can retry with fresh ones. - */ - again: - nsec = (s64)tv->tv_nsec - - ((s64)cur_timer->get_offset() * (s64)NSEC_PER_USEC); - if (unlikely(!TIME_VALUES_UP_TO_DATE)) { - __get_time_values_from_xen(); - goto again; - } - - __normalize_time(&sec, &nsec); - set_normalized_timespec(&xentime, sec, nsec); - - /* - * This is revolting. We need to set "xtime" correctly. However, the - * value in this location is the value at the most recent update of - * wall time. Discover what correction gettimeofday() would have - * made, and then undo it! - */ - nsec -= (jiffies - wall_jiffies) * TICK_NSEC; - - nsec -= (shadow_system_time - processed_system_time); - - __normalize_time(&sec, &nsec); - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; - - /* Reset all our running time counts. They make no sense now. */ - last_seen_tv.tv_sec = 0; - last_update_from_xen = 0; - -#ifdef CONFIG_XEN_PRIVILEGED_GUEST - if (xen_start_info.flags & SIF_INITDOMAIN) { - dom0_op_t op; - last_rtc_update = last_update_to_xen = 0; - op.cmd = DOM0_SETTIME; - op.u.settime.secs = xentime.tv_sec; - op.u.settime.usecs = xentime.tv_nsec / NSEC_PER_USEC; - op.u.settime.system_time = shadow_system_time; - write_sequnlock_irq(&xtime_lock); - HYPERVISOR_dom0_op(&op); - } else -#endif - write_sequnlock_irq(&xtime_lock); - - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - -#ifdef CONFIG_XEN_PRIVILEGED_GUEST -static int set_rtc_mmss(unsigned long nowtime) -{ - int retval; - - /* gets recalled with irq locally disabled */ - spin_lock(&rtc_lock); - if (efi_enabled) - retval = efi_set_rtc_mmss(nowtime); - else - retval = mach_set_rtc_mmss(nowtime); - spin_unlock(&rtc_lock); - - return retval; -} -#endif - -/* monotonic_clock(): returns # of nanoseconds passed since time_init() - * Note: This function is required to return accurate - * time even in the absence of multiple timer ticks. 
- */ -unsigned long long monotonic_clock(void) -{ - return cur_timer->monotonic_clock(); -} -EXPORT_SYMBOL(monotonic_clock); - -#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) -unsigned long profile_pc(struct pt_regs *regs) -{ - unsigned long pc = instruction_pointer(regs); - - if (in_lock_functions(pc)) - return *(unsigned long *)(regs->ebp + 4); - - return pc; -} -EXPORT_SYMBOL(profile_pc); -#endif - -/* - * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick - */ -static inline void do_timer_interrupt(int irq, void *dev_id, - struct pt_regs *regs) -{ - time_t wtm_sec, sec; - s64 delta, delta_cpu, nsec; - long sec_diff, wtm_nsec; - int cpu = smp_processor_id(); - - do { - __get_time_values_from_xen(); - - delta = delta_cpu = (s64)shadow_system_time + - ((s64)cur_timer->get_offset() * (s64)NSEC_PER_USEC); - delta -= processed_system_time; - delta_cpu -= per_cpu(processed_system_time, cpu); - } - while (!TIME_VALUES_UP_TO_DATE); - - if (unlikely(delta < 0) || unlikely(delta_cpu < 0)) { - printk("Timer ISR/%d: Time went backwards: " - "delta=%lld cpu_delta=%lld shadow=%lld " - "off=%lld processed=%lld cpu_processed=%lld\n", - cpu, delta, delta_cpu, shadow_system_time, - ((s64)cur_timer->get_offset() * (s64)NSEC_PER_USEC), - processed_system_time, - per_cpu(processed_system_time, cpu)); - for (cpu = 0; cpu < num_online_cpus(); cpu++) - printk(" %d: %lld\n", cpu, - per_cpu(processed_system_time, cpu)); - return; - } - - /* System-wide jiffy work. */ - while (delta >= NS_PER_TICK) { - delta -= NS_PER_TICK; - processed_system_time += NS_PER_TICK; - do_timer(regs); - } - - /* Local CPU jiffy work. */ - while (delta_cpu >= NS_PER_TICK) { - delta_cpu -= NS_PER_TICK; - per_cpu(processed_system_time, cpu) += NS_PER_TICK; - update_process_times(user_mode(regs)); - profile_tick(CPU_PROFILING, regs); - } - - if (cpu != 0) - return; - - /* - * Take synchronised time from Xen once a minute if we're not - * synchronised ourselves, and we haven't chosen to keep an independent - * time base. - */ - if (!INDEPENDENT_WALLCLOCK() && - ((time_status & STA_UNSYNC) != 0) && - (xtime.tv_sec > (last_update_from_xen + 60))) { - /* Adjust shadow for jiffies that haven't updated xtime yet. */ - shadow_tv.tv_usec -= - (jiffies - wall_jiffies) * (USEC_PER_SEC / HZ); - HANDLE_USEC_UNDERFLOW(shadow_tv); - - /* - * Reset our running time counts if they are invalidated by - * a warp backwards of more than 500ms. - */ - sec_diff = xtime.tv_sec - shadow_tv.tv_sec; - if (unlikely(abs(sec_diff) > 1) || - unlikely(((sec_diff * USEC_PER_SEC) + - (xtime.tv_nsec / NSEC_PER_USEC) - - shadow_tv.tv_usec) > 500000)) { -#ifdef CONFIG_XEN_PRIVILEGED_GUEST - last_rtc_update = last_update_to_xen = 0; -#endif - last_seen_tv.tv_sec = 0; - } - - /* Update our unsynchronised xtime appropriately. */ - sec = shadow_tv.tv_sec; - nsec = shadow_tv.tv_usec * NSEC_PER_USEC; - - __normalize_time(&sec, &nsec); - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - last_update_from_xen = sec; - } - -#ifdef CONFIG_XEN_PRIVILEGED_GUEST - if (!(xen_start_info.flags & SIF_INITDOMAIN)) - return; - - /* Send synchronised time to Xen approximately every minute. 
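The jiffy accounting in do_timer_interrupt() above never reads a tick counter: it credits one jiffy for every whole NS_PER_TICK by which Xen's system time has advanced past what was already processed, and carries the remainder forward to the next interrupt. A standalone sketch of that loop, with jiffies_sketch standing in for do_timer():

#include <stdio.h>

#define HZ 100
#define NS_PER_TICK (1000000000ULL / HZ)

static unsigned long long processed_ns;
static unsigned long jiffies_sketch;

static void account_ticks(unsigned long long now_ns)
{
	while (now_ns - processed_ns >= NS_PER_TICK) {
		processed_ns += NS_PER_TICK;
		jiffies_sketch++;	/* one do_timer(regs) per tick */
	}
}

int main(void)
{
	account_ticks(35000000ULL);	/* 35 ms at HZ=100 */
	printf("jiffies=%lu remainder=%llu ns\n", jiffies_sketch,
	       35000000ULL - processed_ns);	/* 3 ticks, 5 ms left over */
	return 0;
}

This is what lets the same code run both per-system (delta) and per-CPU (delta_cpu) bookkeeping from a single pair of timestamps.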
*/ - if (((time_status & STA_UNSYNC) == 0) && - (xtime.tv_sec > (last_update_to_xen + 60))) { - dom0_op_t op; - struct timeval tv; - - tv.tv_sec = xtime.tv_sec; - tv.tv_usec = xtime.tv_nsec / NSEC_PER_USEC; - tv.tv_usec += (jiffies - wall_jiffies) * (USEC_PER_SEC/HZ); - HANDLE_USEC_OVERFLOW(tv); - - op.cmd = DOM0_SETTIME; - op.u.settime.secs = tv.tv_sec; - op.u.settime.usecs = tv.tv_usec; - op.u.settime.system_time = shadow_system_time; - HYPERVISOR_dom0_op(&op); - - last_update_to_xen = xtime.tv_sec; - } - - /* - * If we have an externally synchronized Linux clock, then update - * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be - * called as close as possible to 500 ms before the new second starts. - */ - if ((time_status & STA_UNSYNC) == 0 && - xtime.tv_sec > last_rtc_update + 660 && - (xtime.tv_nsec / 1000) - >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 && - (xtime.tv_nsec / 1000) - <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) { - /* horrible...FIXME */ - if (efi_enabled) { - if (efi_set_rtc_mmss(xtime.tv_sec) == 0) - last_rtc_update = xtime.tv_sec; - else - last_rtc_update = xtime.tv_sec - 600; - } else if (set_rtc_mmss(xtime.tv_sec) == 0) - last_rtc_update = xtime.tv_sec; - else - last_rtc_update = xtime.tv_sec - 600; /* do it again in 60 s */ - } -#endif -} - -/* - * This is the same as the above, except we _also_ save the current - * Time Stamp Counter value at the time of the timer interrupt, so that - * we later on can estimate the time of day more exactly. - */ -irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) -{ - /* - * Here we are in the timer irq handler. We just have irqs locally - * disabled but we don't know if the timer_bh is running on the other - * CPU. We need to avoid to SMP race with it. NOTE: we don' t need - * the irq version of write_lock because as just said we have irq - * locally disabled. 
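The RTC write above is deliberately fussy about timing: the CMOS clock latches a new second roughly 500 ms after being set, so the update is only attempted in a narrow window around the half-second mark, at most every 660 s, with a 600 s back-off on failure. A sketch of the window test under an assumed HZ=100 tick; the helper name and the hard-coded constants are chosen for the illustration, not taken from the kernel:

#include <stdio.h>

#define USEC_PER_SEC 1000000L
#define TICK_USEC (USEC_PER_SEC / 100)	/* one 10 ms tick */

static int in_rtc_update_window(long usec_into_second)
{
	long lo = USEC_PER_SEC / 2 - TICK_USEC / 2;	/* 495 ms */
	long hi = USEC_PER_SEC / 2 + TICK_USEC / 2;	/* 505 ms */

	return usec_into_second >= lo && usec_into_second <= hi;
}

int main(void)
{
	printf("%d\n", in_rtc_update_window(480000));	/* 0: too early */
	printf("%d\n", in_rtc_update_window(500000));	/* 1: in the window */
	return 0;
}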
-arca - */ - write_seqlock(&xtime_lock); - do_timer_interrupt(irq, NULL, regs); - write_sequnlock(&xtime_lock); - return IRQ_HANDLED; -} - -/* not static: needed by APM */ -unsigned long get_cmos_time(void) -{ - unsigned long retval; - - spin_lock(&rtc_lock); - - if (efi_enabled) - retval = efi_get_time(); - else - retval = mach_get_cmos_time(); - - spin_unlock(&rtc_lock); - - return retval; -} - -static long clock_cmos_diff, sleep_start; - -static int timer_suspend(struct sys_device *dev, u32 state) -{ - /* - * Estimate time zone so that set_time can update the clock - */ - clock_cmos_diff = -get_cmos_time(); - clock_cmos_diff += get_seconds(); - sleep_start = get_cmos_time(); - return 0; -} - -static int timer_resume(struct sys_device *dev) -{ - unsigned long flags; - unsigned long sec; - unsigned long sleep_length; - -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled()) - hpet_reenable(); -#endif - sec = get_cmos_time() + clock_cmos_diff; - sleep_length = (get_cmos_time() - sleep_start) * HZ; - write_seqlock_irqsave(&xtime_lock, flags); - xtime.tv_sec = sec; - xtime.tv_nsec = 0; - write_sequnlock_irqrestore(&xtime_lock, flags); - jiffies += sleep_length; - wall_jiffies += sleep_length; - return 0; -} - -static struct sysdev_class timer_sysclass = { - .resume = timer_resume, - .suspend = timer_suspend, - set_kset_name("timer"), -}; - - -/* XXX this driverfs stuff should probably go elsewhere later -john */ -static struct sys_device device_timer = { - .id = 0, - .cls = &timer_sysclass, -}; - -static int time_init_device(void) -{ - int error = sysdev_class_register(&timer_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; -} - -device_initcall(time_init_device); - -#ifdef CONFIG_HPET_TIMER -extern void (*late_time_init)(void); -/* Duplicate of time_init() below, with hpet_enable part added */ -void __init hpet_time_init(void) -{ - xtime.tv_sec = get_cmos_time(); - xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); - - if (hpet_enable() >= 0) { - printk("Using HPET for base-timer\n"); - } - - cur_timer = select_timer(); - printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name); - - time_init_hook(); -} -#endif - -/* Dynamically-mapped IRQ. */ -static DEFINE_PER_CPU(int, timer_irq); - -static struct irqaction irq_timer = { - timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer0", - NULL, NULL -}; - -void __init time_init(void) -{ -#ifdef CONFIG_HPET_TIMER - if (is_hpet_capable()) { - /* - * HPET initialization needs to do memory-mapped io. So, let - * us do a late initialization after mem_init(). - */ - late_time_init = hpet_time_init; - return; - } -#endif - __get_time_values_from_xen(); - xtime.tv_sec = shadow_tv.tv_sec; - xtime.tv_nsec = shadow_tv.tv_usec * NSEC_PER_USEC; - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); - processed_system_time = shadow_system_time; - per_cpu(processed_system_time, 0) = processed_system_time; - - if (timer_tsc_init.init(NULL) != 0) - BUG(); - printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name); - -#if defined(__x86_64__) - vxtime.mode = VXTIME_TSC; - vxtime.quot = (1000000L << 32) / vxtime_hz; - vxtime.tsc_quot = (1000L << 32) / cpu_khz; - vxtime.hz = vxtime_hz; - sync_core(); - rdtscll(vxtime.last_tsc); -#endif - - per_cpu(timer_irq, 0) = bind_virq_to_irq(VIRQ_TIMER); - (void)setup_irq(per_cpu(timer_irq, 0), &irq_timer); -} - -/* Convert jiffies to system time. 
*/ -static inline u64 jiffies_to_st(unsigned long j) -{ - unsigned long seq; - long delta; - u64 st; - - do { - seq = read_seqbegin(&xtime_lock); - delta = j - jiffies; - /* NB. The next check can trigger in some wrap-around cases, - * but that's ok: we'll just end up with a shorter timeout. */ - if (delta < 1) - delta = 1; - st = processed_system_time + (delta * NS_PER_TICK); - } while (read_seqretry(&xtime_lock, seq)); - - return st; -} - -/* - * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu - * These functions are based on implementations from arch/s390/kernel/time.c - */ -void stop_hz_timer(void) -{ - unsigned int cpu = smp_processor_id(); - unsigned long j; - - /* s390 does this /before/ checking rcu_pending(). We copy them. */ - cpu_set(cpu, nohz_cpu_mask); - - /* Leave ourselves in 'tick mode' if rcu or softirq pending. */ - if (rcu_pending(cpu) || local_softirq_pending()) { - cpu_clear(cpu, nohz_cpu_mask); - j = jiffies + 1; - } else { - j = next_timer_interrupt(); - } - - BUG_ON(HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0); -} - -void start_hz_timer(void) -{ - cpu_clear(smp_processor_id(), nohz_cpu_mask); -} - -void time_suspend(void) -{ - /* nothing */ -} - -/* No locking required. We are only CPU running, and interrupts are off. */ -void time_resume(void) -{ - if (timer_tsc_init.init(NULL) != 0) - BUG(); - - /* Get timebases for new environment. */ - __get_time_values_from_xen(); - - /* Reset our own concept of passage of system time. */ - processed_system_time = shadow_system_time; - per_cpu(processed_system_time, 0) = processed_system_time; - - /* Accept a warp in UTC (wall-clock) time. */ - last_seen_tv.tv_sec = 0; - - /* Make sure we resync UTC time with Xen on next timer interrupt. */ - last_update_from_xen = 0; -} - -#ifdef CONFIG_SMP -static char timer_name[NR_CPUS][15]; -void local_setup_timer(void) -{ - int seq, cpu = smp_processor_id(); - - do { - seq = read_seqbegin(&xtime_lock); - per_cpu(processed_system_time, cpu) = shadow_system_time; - } while (read_seqretry(&xtime_lock, seq)); - - per_cpu(timer_irq, cpu) = bind_virq_to_irq(VIRQ_TIMER); - sprintf(timer_name[cpu], "timer%d", cpu); - BUG_ON(request_irq(per_cpu(timer_irq, cpu), timer_interrupt, - SA_INTERRUPT, timer_name[cpu], NULL)); -} -#endif - -/* - * /proc/sys/xen: This really belongs in another file. It can stay here for - * now however. 
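stop_hz_timer() above is the whole of "tickless idle" on this kernel: instead of taking HZ timer interrupts while idle, the CPU asks Xen for a single timer event at the next point anything is due, using the jiffies_to_st() conversion. A standalone sketch of that conversion, including the clamp that turns jiffies wrap-around into a harmlessly short timeout; the function name here is invented:

#include <stdio.h>

#define HZ 100
#define NS_PER_TICK (1000000000ULL / HZ)

static unsigned long long next_wakeup_st(unsigned long long processed_ns,
					 unsigned long jiffies_now,
					 unsigned long j)
{
	long delta = (long)(j - jiffies_now);

	if (delta < 1)		/* stale or wrapped target: fire in one tick */
		delta = 1;
	return processed_ns + (unsigned long long)delta * NS_PER_TICK;
}

int main(void)
{
	/* next timer 7 jiffies away: wake 70 ms past the processed stamp */
	printf("%llu\n", next_wakeup_st(5000000000ULL, 1000, 1007));
	return 0;
}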
- */ -static ctl_table xen_subtable[] = { - {1, "independent_wallclock", &independent_wallclock, - sizeof(independent_wallclock), 0644, NULL, proc_dointvec}, - {0} -}; -static ctl_table xen_table[] = { - {123, "xen", NULL, 0, 0555, xen_subtable}, - {0} -}; -static int __init xen_sysctl_init(void) -{ - (void)register_sysctl_table(xen_table, 0); - return 0; -} -__initcall(xen_sysctl_init); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/kernel/timers/Makefile --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/timers/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,17 +0,0 @@ -# -# Makefile for x86 timers -# - -XENARCH := $(subst ",,$(CONFIG_XENARCH)) - -obj-y := timer_tsc.o -c-obj-y := - -c-link := - -$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)): - @ln -fsn $(srctree)/arch/i386/kernel/timers/$(notdir $@) $@ - -obj-y += $(c-obj-y) - -clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link)) diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/kernel/timers/timer_tsc.c --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/timers/timer_tsc.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,379 +0,0 @@ -/* - * This code largely moved from arch/i386/kernel/time.c. - * See comments there for proper credits. - */ - -#include <linux/spinlock.h> -#include <linux/init.h> -#include <linux/timex.h> -#include <linux/errno.h> -#include <linux/cpufreq.h> -#include <linux/string.h> -#include <linux/jiffies.h> - -#include <asm/timer.h> -#include <asm/io.h> -/* processor.h for distable_tsc flag */ -#include <asm/processor.h> - -#include "io_ports.h" -#include "mach_timer.h" - -#include <asm/hpet.h> - -#ifdef CONFIG_HPET_TIMER -static unsigned long hpet_usec_quotient; -static unsigned long hpet_last; -static struct timer_opts timer_tsc; -#endif - -static inline void cpufreq_delayed_get(void); - -int tsc_disable __initdata = 0; - -extern spinlock_t i8253_lock; - -static int use_tsc; - -static unsigned long long monotonic_base; -static u32 monotonic_offset; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -/* convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_mhz * 10^6)) - * ns = cycles * (10^3 / cpu_mhz) - * - * Then we use scaling math (suggested by george@xxxxxxxxxx) to get: - * ns = cycles * (10^3 * SC / cpu_mhz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * -johnstul@xxxxxxxxxx "math is hard, lets go shopping!" - */ -static unsigned long cyc2ns_scale; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - -static inline void set_cyc2ns_scale(unsigned long cpu_mhz) -{ - cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; -} - -/* Cached *multiplier* to convert TSC counts to microseconds. - * (see the equation below). - * Equal to 2^32 * (1 / (clocks per usec) ). - * Initialized in time_init. - */ -static unsigned long fast_gettimeoffset_quotient; - -extern u32 shadow_tsc_stamp; -extern u64 shadow_system_time; - -static unsigned long get_offset_tsc(void) -{ - register unsigned long eax, edx; - - /* Read the Time Stamp Counter */ - - rdtsc(eax,edx); - - /* .. 
relative to previous jiffy (32 bits is enough) */ - eax -= shadow_tsc_stamp; - - /* - * Time offset = (tsc_low delta) * fast_gettimeoffset_quotient - * = (tsc_low delta) * (usecs_per_clock) - * = (tsc_low delta) * (usecs_per_jiffy / clocks_per_jiffy) - * - * Using a mull instead of a divl saves up to 31 clock cycles - * in the critical path. - */ - - __asm__("mull %2" - :"=a" (eax), "=d" (edx) - :"rm" (fast_gettimeoffset_quotient), - "0" (eax)); - - /* our adjusted time offset in microseconds */ - return edx; -} - -static unsigned long long monotonic_clock_tsc(void) -{ - unsigned long long last_offset, this_offset, base; - unsigned seq; - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = monotonic_offset; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - /* Read the Time Stamp Counter */ - rdtscll(this_offset); - - /* return the value in ns */ - return base + cycles_2_ns(this_offset - last_offset); -} - -/* - * Scheduler clock - returns current time in nanosec units. - */ -unsigned long long sched_clock(void) -{ - unsigned long long this_offset; - - /* - * In the NUMA case we dont use the TSC as they are not - * synchronized across all CPUs. - */ -#ifndef CONFIG_NUMA - if (!use_tsc) -#endif - /* no locking but a rare wrong value is not a big deal */ - return jiffies_64 * (1000000000 / HZ); - - /* Read the Time Stamp Counter */ - rdtscll(this_offset); - - /* return the value in ns */ - return cycles_2_ns(this_offset); -} - - -static void mark_offset_tsc(void) -{ - - /* update the monotonic base value */ - write_seqlock(&monotonic_lock); - monotonic_base = shadow_system_time; - monotonic_offset = shadow_tsc_stamp; - write_sequnlock(&monotonic_lock); -} - -static void delay_tsc(unsigned long loops) -{ - unsigned long bclock, now; - - rdtscl(bclock); - do - { - rep_nop(); - rdtscl(now); - } while ((now-bclock) < loops); -} - -#ifdef CONFIG_HPET_TIMER -static void mark_offset_tsc_hpet(void) -{ - unsigned long long this_offset, last_offset; - unsigned long offset, temp, hpet_current; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - /* - * It is important that these two operations happen almost at - * the same time. We do the RDTSC stuff first, since it's - * faster. To avoid any inconsistencies, we need interrupts - * disabled locally. - */ - /* - * Interrupts are just disabled locally since the timer irq - * has the SA_INTERRUPT flag set. 
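Both TSC conversions above replace a per-call divide with a multiply and a shift: cycles_2_ns() pre-computes (1000 << 10)/cpu_mhz, and get_offset_tsc() pre-computes 2^32 divided by the clocks-per-microsecond rate so that the "mull" keeps only the high 32 bits of the product. Worked standalone, under an assumed 2 GHz part:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned long cpu_khz = 2000000;	/* assumed 2 GHz CPU */
	unsigned long cpu_mhz = cpu_khz / 1000;
	uint64_t cycles = 6000000;		/* 3 ms worth of cycles */

	/* ns = cyc * ((1000 << 10) / MHz) >> 10, as in cycles_2_ns() */
	uint64_t cyc2ns_scale = (1000ULL << 10) / cpu_mhz;
	printf("ns = %llu\n",
	       (unsigned long long)((cycles * cyc2ns_scale) >> 10));

	/* us = (cyc * ((1000 << 32) / kHz)) >> 32, the mull trick in
	 * get_offset_tsc(); truncation costs at most a microsecond */
	uint64_t quotient = (1000ULL << 32) / cpu_khz;
	printf("us = %llu\n",
	       (unsigned long long)((cycles * quotient) >> 32));
	return 0;
}

For these inputs the two lines print 3000000 ns and 2999 us, the latter shaved by integer truncation of the quotient.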
-arca - */ - /* read Pentium cycle counter */ - - hpet_current = hpet_readl(HPET_COUNTER); - rdtsc(last_tsc_low, last_tsc_high); - - /* lost tick compensation */ - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - if (unlikely(((offset - hpet_last) > hpet_tick) && (hpet_last != 0))) { - int lost_ticks = (offset - hpet_last) / hpet_tick; - jiffies_64 += lost_ticks; - } - hpet_last = hpet_current; - - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - monotonic_base += cycles_2_ns(this_offset - last_offset); - write_sequnlock(&monotonic_lock); - - /* calculate delay_at_last_interrupt */ - /* - * Time offset = (hpet delta) * ( usecs per HPET clock ) - * = (hpet delta) * ( usecs per tick / HPET clocks per tick) - * = (hpet delta) * ( hpet_usec_quotient ) / (2^32) - * Where, - * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick - */ - delay_at_last_interrupt = hpet_current - offset; - ASM_MUL64_REG(temp, delay_at_last_interrupt, - hpet_usec_quotient, delay_at_last_interrupt); -} -#endif - - -#ifdef CONFIG_CPU_FREQ -#include <linux/workqueue.h> - -static unsigned int cpufreq_delayed_issched = 0; -static unsigned int cpufreq_init = 0; -static struct work_struct cpufreq_delayed_get_work; - -static void handle_cpufreq_delayed_get(void *v) -{ - unsigned int cpu; - for_each_online_cpu(cpu) { - cpufreq_get(cpu); - } - cpufreq_delayed_issched = 0; -} - -/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries - * to verify the CPU frequency the timing core thinks the CPU is running - * at is still correct. - */ -static inline void cpufreq_delayed_get(void) -{ - if (cpufreq_init && !cpufreq_delayed_issched) { - cpufreq_delayed_issched = 1; - printk(KERN_DEBUG "Losing some ticks... checking if CPU frequency changed.\n"); - schedule_work(&cpufreq_delayed_get_work); - } -} - -/* If the CPU frequency is scaled, TSC-based delays will need a different - * loops_per_jiffy value to function properly. 
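When the notifier below fires, every calibrated constant is rescaled in proportion to the frequency change: loops_per_jiffy grows with the clock, while the TSC-to-microseconds quotient shrinks, since it measures time per cycle. That is why the argument order flips between the two cpufreq_scale() calls. A plain proportional helper re-implemented here to show the effect; it is not the kernel's cpufreq_scale():

#include <stdio.h>
#include <stdint.h>

static unsigned long scale(unsigned long old, unsigned int from_khz,
			   unsigned int to_khz)
{
	return (unsigned long)(((uint64_t)old * to_khz) / from_khz);
}

int main(void)
{
	unsigned int ref = 1000000, cur = 2000000;	/* 1 GHz -> 2 GHz */
	unsigned long lpj = 4980736, quot = 4294967;

	printf("loops_per_jiffy: %lu -> %lu\n", lpj, scale(lpj, ref, cur));
	printf("tsc quotient:    %lu -> %lu\n", quot, scale(quot, cur, ref));
	return 0;
}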
- */ - -static unsigned int ref_freq = 0; -static unsigned long loops_per_jiffy_ref = 0; - -#ifndef CONFIG_SMP -static unsigned long fast_gettimeoffset_ref = 0; -static unsigned long cpu_khz_ref = 0; -#endif - -static int -time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct cpufreq_freqs *freq = data; - - if (val != CPUFREQ_RESUMECHANGE) - write_seqlock_irq(&xtime_lock); - if (!ref_freq) { - ref_freq = freq->old; - loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; -#ifndef CONFIG_SMP - fast_gettimeoffset_ref = fast_gettimeoffset_quotient; - cpu_khz_ref = cpu_khz; -#endif - } - - if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || - (val == CPUFREQ_RESUMECHANGE)) { - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - cpu_data[freq->cpu].loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); -#ifndef CONFIG_SMP - if (cpu_khz) - cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); - if (use_tsc) { - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { - fast_gettimeoffset_quotient = cpufreq_scale(fast_gettimeoffset_ref, freq->new, ref_freq); - set_cyc2ns_scale(cpu_khz/1000); - } - } -#endif - } - - if (val != CPUFREQ_RESUMECHANGE) - write_sequnlock_irq(&xtime_lock); - - return 0; -} - -static struct notifier_block time_cpufreq_notifier_block = { - .notifier_call = time_cpufreq_notifier -}; - - -static int __init cpufreq_tsc(void) -{ - int ret; - INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); - ret = cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - if (!ret) - cpufreq_init = 1; - return ret; -} -core_initcall(cpufreq_tsc); - -#else /* CONFIG_CPU_FREQ */ -static inline void cpufreq_delayed_get(void) { return; } -#endif - - -static int init_tsc(char* override) -{ - u64 __cpu_khz; - - __cpu_khz = HYPERVISOR_shared_info->cpu_freq; - do_div(__cpu_khz, 1000); - cpu_khz = (u32)__cpu_khz; - printk(KERN_INFO "Xen reported: %lu.%03lu MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - - /* (10^6 * 2^32) / cpu_hz = (10^3 * 2^32) / cpu_khz = - (2^32 * 1 / (clocks/us)) */ - { - unsigned long eax=0, edx=1000; - __asm__("divl %2" - :"=a" (fast_gettimeoffset_quotient), "=d" (edx) - :"r" (cpu_khz), - "0" (eax), "1" (edx)); - } - - set_cyc2ns_scale(cpu_khz/1000); - - use_tsc = 1; - - return 0; -} - -static int __init tsc_setup(char *str) -{ - printk(KERN_WARNING "notsc: cannot disable TSC in Xen/Linux.\n"); - return 1; -} -__setup("notsc", tsc_setup); - - - -/************************************************************/ - -/* tsc timer_opts struct */ -struct timer_opts timer_tsc = { - .name = "tsc", - .mark_offset = mark_offset_tsc, - .get_offset = get_offset_tsc, - .monotonic_clock = monotonic_clock_tsc, - .delay = delay_tsc, -}; - -struct init_timer_opts timer_tsc_init = { - .init = init_tsc, - .opts = &timer_tsc, -}; diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/kernel/traps.c --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/traps.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,982 +0,0 @@ -/* - * linux/arch/i386/traps.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * Pentium III FXSR, SSE support - * Gareth Hughes <gareth@xxxxxxxxxxx>, May 2000 - */ - -/* - * 'Traps.c' handles hardware traps and faults after we have saved some - * state in 'asm.s'. 
- */ -#include <linux/config.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/timer.h> -#include <linux/mm.h> -#include <linux/init.h> -#include <linux/delay.h> -#include <linux/spinlock.h> -#include <linux/interrupt.h> -#include <linux/highmem.h> -#include <linux/kallsyms.h> -#include <linux/ptrace.h> -#include <linux/utsname.h> -#include <linux/kprobes.h> - -#ifdef CONFIG_EISA -#include <linux/ioport.h> -#include <linux/eisa.h> -#endif - -#ifdef CONFIG_MCA -#include <linux/mca.h> -#endif - -#include <asm/processor.h> -#include <asm/system.h> -#include <asm/uaccess.h> -#include <asm/io.h> -#include <asm/atomic.h> -#include <asm/debugreg.h> -#include <asm/desc.h> -#include <asm/i387.h> -#include <asm/nmi.h> - -#include <asm/smp.h> -#include <asm/arch_hooks.h> -#include <asm/kdebug.h> - -#include <linux/irq.h> -#include <linux/module.h> - -#include "mach_traps.h" - -asmlinkage int system_call(void); - -/* Do we ignore FPU interrupts ? */ -char ignore_fpu_irq = 0; - -/* - * The IDT has to be page-aligned to simplify the Pentium - * F0 0F bug workaround.. We have a special link segment - * for this. - */ -struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, }; - -asmlinkage void divide_error(void); -asmlinkage void debug(void); -asmlinkage void nmi(void); -asmlinkage void int3(void); -asmlinkage void overflow(void); -asmlinkage void bounds(void); -asmlinkage void invalid_op(void); -asmlinkage void device_not_available(void); -asmlinkage void coprocessor_segment_overrun(void); -asmlinkage void invalid_TSS(void); -asmlinkage void segment_not_present(void); -asmlinkage void stack_segment(void); -asmlinkage void general_protection(void); -asmlinkage void page_fault(void); -asmlinkage void coprocessor_error(void); -asmlinkage void simd_coprocessor_error(void); -asmlinkage void alignment_check(void); -asmlinkage void fixup_4gb_segment(void); -asmlinkage void machine_check(void); - -static int kstack_depth_to_print = 24; -struct notifier_block *i386die_chain; -static DEFINE_SPINLOCK(die_notifier_lock); - -int register_die_notifier(struct notifier_block *nb) -{ - int err = 0; - unsigned long flags; - spin_lock_irqsave(&die_notifier_lock, flags); - err = notifier_chain_register(&i386die_chain, nb); - spin_unlock_irqrestore(&die_notifier_lock, flags); - return err; -} - -static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) -{ - return p > (void *)tinfo && - p < (void *)tinfo + THREAD_SIZE - 3; -} - -static inline unsigned long print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long ebp) -{ - unsigned long addr; - -#ifdef CONFIG_FRAME_POINTER - while (valid_stack_ptr(tinfo, (void *)ebp)) { - addr = *(unsigned long *)(ebp + 4); - printk(" [<%08lx>] ", addr); - print_symbol("%s", addr); - printk("\n"); - ebp = *(unsigned long *)ebp; - } -#else - while (valid_stack_ptr(tinfo, stack)) { - addr = *stack++; - if (__kernel_text_address(addr)) { - printk(" [<%08lx>]", addr); - print_symbol(" %s", addr); - printk("\n"); - } - } -#endif - return ebp; -} - -void show_trace(struct task_struct *task, unsigned long * stack) -{ - unsigned long ebp; - - if (!task) - task = current; - - if (task == current) { - /* Grab ebp right from our regs */ - asm ("movl %%ebp, %0" : "=r" (ebp) : ); - } else { - /* ebp is the last reg pushed by switch_to */ - ebp = *(unsigned long *) task->thread.esp; - } - - while (1) { - struct thread_info *context; - context = (struct 
thread_info *) - ((unsigned long)stack & (~(THREAD_SIZE - 1))); - ebp = print_context_stack(context, stack, ebp); - stack = (unsigned long*)context->previous_esp; - if (!stack) - break; - printk(" =======================\n"); - } -} - -void show_stack(struct task_struct *task, unsigned long *esp) -{ - unsigned long *stack; - int i; - - if (esp == NULL) { - if (task) - esp = (unsigned long*)task->thread.esp; - else - esp = (unsigned long *)&esp; - } - - stack = esp; - for(i = 0; i < kstack_depth_to_print; i++) { - if (kstack_end(stack)) - break; - if (i && ((i % 8) == 0)) - printk("\n "); - printk("%08lx ", *stack++); - } - printk("\nCall Trace:\n"); - show_trace(task, esp); -} - -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - unsigned long stack; - - show_trace(current, &stack); -} - -EXPORT_SYMBOL(dump_stack); - -void show_registers(struct pt_regs *regs) -{ - int i; - int in_kernel = 1; - unsigned long esp; - unsigned short ss; - - esp = (unsigned long) (&regs->esp); - ss = __KERNEL_DS; - if (regs->xcs & 2) { - in_kernel = 0; - esp = regs->esp; - ss = regs->xss & 0xffff; - } - print_modules(); - printk("CPU: %d\nEIP: %04x:[<%08lx>] %s VLI\nEFLAGS: %08lx" - " (%s) \n", - smp_processor_id(), 0xffff & regs->xcs, regs->eip, - print_tainted(), regs->eflags, system_utsname.release); - print_symbol("EIP is at %s\n", regs->eip); - printk("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", - regs->eax, regs->ebx, regs->ecx, regs->edx); - printk("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", - regs->esi, regs->edi, regs->ebp, esp); - printk("ds: %04x es: %04x ss: %04x\n", - regs->xds & 0xffff, regs->xes & 0xffff, ss); - printk("Process %s (pid: %d, threadinfo=%p task=%p)", - current->comm, current->pid, current_thread_info(), current); - /* - * When in-kernel, we also print out the stack and code at the - * time of the fault..
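The CONFIG_FRAME_POINTER branch of print_context_stack() above works because every frame begins with the saved EBP followed by the return address: "*(ebp + 4)" is the caller's address and "*ebp" links to the next frame. The same walk over a fabricated stack, with word indices standing in for raw pointers so the sketch stays portable:

#include <stdio.h>

int main(void)
{
	unsigned long stack[10] = {0};
	unsigned long ebp;

	/* three frames at indices 1, 4 and 7: [saved ebp][return addr] */
	stack[1] = 4; stack[2] = 0x1111;	/* innermost frame */
	stack[4] = 7; stack[5] = 0x2222;
	stack[7] = 0; stack[8] = 0x3333;	/* saved ebp of 0 ends the chain */

	for (ebp = 1; ; ebp = stack[ebp]) {
		printf(" [<%08lx>]\n", stack[ebp + 1]);
		if (stack[ebp] == 0)
			break;
	}
	return 0;
}

The non-frame-pointer fallback in the deleted code scans every stack word and keeps anything that looks like a kernel text address, which is why those traces can contain stale values.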
- */ - if (in_kernel) { - u8 *eip; - - printk("\nStack: "); - show_stack(NULL, (unsigned long*)esp); - - printk("Code: "); - - eip = (u8 *)regs->eip - 43; - for (i = 0; i < 64; i++, eip++) { - unsigned char c; - - if (eip < (u8 *)PAGE_OFFSET || __get_user(c, eip)) { - printk(" Bad EIP value."); - break; - } - if (eip == (u8 *)regs->eip) - printk("<%02x> ", c); - else - printk("%02x ", c); - } - } - printk("\n"); -} - -static void handle_BUG(struct pt_regs *regs) -{ - unsigned short ud2; - unsigned short line; - char *file; - char c; - unsigned long eip; - - if (regs->xcs & 2) - goto no_bug; /* Not in kernel */ - - eip = regs->eip; - - if (eip < PAGE_OFFSET) - goto no_bug; - if (__get_user(ud2, (unsigned short *)eip)) - goto no_bug; - if (ud2 != 0x0b0f) - goto no_bug; - if (__get_user(line, (unsigned short *)(eip + 2))) - goto bug; - if (__get_user(file, (char **)(eip + 4)) || - (unsigned long)file < PAGE_OFFSET || __get_user(c, file)) - file = "<bad filename>"; - - printk("------------[ cut here ]------------\n"); - printk(KERN_ALERT "kernel BUG at %s:%d!\n", file, line); - -no_bug: - return; - - /* Here we know it was a BUG but file-n-line is unavailable */ -bug: - printk("Kernel BUG\n"); -} - -void die(const char * str, struct pt_regs * regs, long err) -{ - static struct { - spinlock_t lock; - u32 lock_owner; - int lock_owner_depth; - } die = { - .lock = SPIN_LOCK_UNLOCKED, - .lock_owner = -1, - .lock_owner_depth = 0 - }; - static int die_counter; - - if (die.lock_owner != _smp_processor_id()) { - console_verbose(); - spin_lock_irq(&die.lock); - die.lock_owner = smp_processor_id(); - die.lock_owner_depth = 0; - bust_spinlocks(1); - } - - if (++die.lock_owner_depth < 3) { - int nl = 0; - handle_BUG(regs); - printk(KERN_ALERT "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); - nl = 1; -#endif -#ifdef CONFIG_SMP - printk("SMP "); - nl = 1; -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); - nl = 1; -#endif - if (nl) - printk("\n"); - notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV); - show_registers(regs); - } else - printk(KERN_ERR "Recursive die() failure, output suppressed\n"); - - bust_spinlocks(0); - die.lock_owner = -1; - spin_unlock_irq(&die.lock); - if (in_interrupt()) - panic("Fatal exception in interrupt"); - - if (panic_on_oops) { - printk(KERN_EMERG "Fatal exception: panic in 5 seconds\n"); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(5 * HZ); - panic("Fatal exception"); - } - do_exit(SIGSEGV); -} - -static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err) -{ - if (!(regs->eflags & VM_MASK) && !(2 & regs->xcs)) - die(str, regs, err); -} - -static void do_trap(int trapnr, int signr, char *str, int vm86, - struct pt_regs * regs, long error_code, siginfo_t *info) -{ - if (regs->eflags & VM_MASK) { - if (vm86) - goto vm86_trap; - goto trap_signal; - } - - if (!(regs->xcs & 2)) - goto kernel_trap; - - trap_signal: { - struct task_struct *tsk = current; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - if (info) - force_sig_info(signr, info, tsk); - else - force_sig(signr, tsk); - return; - } - - kernel_trap: { - if (!fixup_exception(regs)) - die(str, regs, error_code); - return; - } - - vm86_trap: { - int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr); - if (ret) goto trap_signal; - return; - } -} - -#define DO_ERROR(trapnr, signr, str, name) \ -fastcall void do_##name(struct pt_regs * regs, long error_code) \ 
-{ \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ -} - -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -fastcall void do_##name(struct pt_regs * regs, long error_code) \ -{ \ - siginfo_t info; \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ -} - -#define DO_VM86_ERROR(trapnr, signr, str, name) \ -fastcall void do_##name(struct pt_regs * regs, long error_code) \ -{ \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ -} - -#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -fastcall void do_##name(struct pt_regs * regs, long error_code) \ -{ \ - siginfo_t info; \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ -} - -DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip) -#ifndef CONFIG_KPROBES -DO_VM86_ERROR( 3, SIGTRAP, "int3", int3) -#endif -DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow) -DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->eip) -DO_VM86_ERROR( 7, SIGSEGV, "device not available", device_not_available) -DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) -DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) -DO_ERROR(12, SIGBUS, "stack segment", stack_segment) -DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) -#ifdef CONFIG_X86_MCE -DO_ERROR(18, SIGBUS, "machine check", machine_check) -#endif - -fastcall void do_general_protection(struct pt_regs * regs, long error_code) -{ - /* - * If we trapped on an LDT access then ensure that the default_ldt is - * loaded, if nothing else. We load default_ldt lazily because LDT - * switching costs time and many applications don't need it. - */ - if (unlikely((error_code & 6) == 4)) { - unsigned long ldt; - __asm__ __volatile__ ("sldt %0" : "=r" (ldt)); - if (ldt == 0) { - xen_set_ldt((unsigned long)&default_ldt[0], 5); - return; - } - } - - if (regs->eflags & VM_MASK) - goto gp_in_vm86; - - if (!(regs->xcs & 2)) - goto gp_in_kernel; - - current->thread.error_code = error_code; - current->thread.trap_no = 13; - force_sig(SIGSEGV, current); - return; - -gp_in_vm86: - local_irq_enable(); - handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); - return; - -gp_in_kernel: - if (!fixup_exception(regs)) { - if (notify_die(DIE_GPF, "general protection fault", regs, - error_code, 13, SIGSEGV) == NOTIFY_STOP) - return; - die("general protection fault", regs, error_code); - } -} - -static void mem_parity_error(unsigned char reason, struct pt_regs * regs) -{ - printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n"); - printk("You probably have a hardware problem with your RAM chips\n"); - - /* Clear and disable the memory parity error line. 
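mem_parity_error() above and the IOCK/unknown paths below are all driven by two status bits that default_do_nmi() reads from ISA port 0x61: bit 7 reports a memory parity (SERR) NMI, bit 6 an I/O check NMI, and neither bit set means an unknown source. The decode in isolation:

#include <stdio.h>

static void decode_nmi_reason(unsigned char reason)
{
	if (!(reason & 0xc0)) {
		printf("unknown NMI, reason %02x\n", reason);
		return;
	}
	if (reason & 0x80)
		printf("memory parity error NMI\n");
	if (reason & 0x40)
		printf("I/O check error NMI\n");
}

int main(void)
{
	decode_nmi_reason(0x80);
	decode_nmi_reason(0x40);
	decode_nmi_reason(0x00);
	return 0;
}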
*/ - clear_mem_error(reason); -} - -static void io_check_error(unsigned char reason, struct pt_regs * regs) -{ - unsigned long i; - - printk("NMI: IOCK error (debug interrupt?)\n"); - show_registers(regs); - - /* Re-enable the IOCK line, wait for a few seconds */ - reason = (reason & 0xf) | 8; - outb(reason, 0x61); - i = 2000; - while (--i) udelay(1000); - reason &= ~8; - outb(reason, 0x61); -} - -static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) -{ -#ifdef CONFIG_MCA - /* Might actually be able to figure out what the guilty party - * is. */ - if( MCA_bus ) { - mca_handle_nmi(); - return; - } -#endif - printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", - reason, smp_processor_id()); - printk("Dazed and confused, but trying to continue\n"); - printk("Do you have a strange power saving mode enabled?\n"); -} - -static DEFINE_SPINLOCK(nmi_print_lock); - -void die_nmi (struct pt_regs *regs, const char *msg) -{ - spin_lock(&nmi_print_lock); - /* - * We are in trouble anyway, lets at least try - * to get a message out. - */ - bust_spinlocks(1); - printk(msg); - printk(" on CPU%d, eip %08lx, registers:\n", - smp_processor_id(), regs->eip); - show_registers(regs); - printk("console shuts up ...\n"); - console_silent(); - spin_unlock(&nmi_print_lock); - bust_spinlocks(0); - do_exit(SIGSEGV); -} - -static void default_do_nmi(struct pt_regs * regs) -{ - unsigned char reason = 0; - - /* Only the BSP gets external NMIs from the system. */ - if (!smp_processor_id()) - reason = get_nmi_reason(); - - if (!(reason & 0xc0)) { - if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT) - == NOTIFY_STOP) - return; -#ifdef CONFIG_X86_LOCAL_APIC - /* - * Ok, so this is none of the documented NMI sources, - * so it must be the NMI watchdog. - */ - if (nmi_watchdog) { - nmi_watchdog_tick(regs); - return; - } -#endif - unknown_nmi_error(reason, regs); - return; - } - if (notify_die(DIE_NMI, "nmi", regs, reason, 0, SIGINT) == NOTIFY_STOP) - return; - if (reason & 0x80) - mem_parity_error(reason, regs); - if (reason & 0x40) - io_check_error(reason, regs); - /* - * Reassert NMI in case it became active meanwhile - * as it's edge-triggered. - */ - reassert_nmi(); -} - -static int dummy_nmi_callback(struct pt_regs * regs, int cpu) -{ - return 0; -} - -static nmi_callback_t nmi_callback = dummy_nmi_callback; - -fastcall void do_nmi(struct pt_regs * regs, long error_code) -{ - int cpu; - - nmi_enter(); - - cpu = smp_processor_id(); - -#ifdef CONFIG_HOTPLUG_CPU - if (!cpu_online(cpu)) { - nmi_exit(); - return; - } -#endif - - ++nmi_count(cpu); - - if (!nmi_callback(regs, cpu)) - default_do_nmi(regs); - - nmi_exit(); -} - -void set_nmi_callback(nmi_callback_t callback) -{ - nmi_callback = callback; -} - -void unset_nmi_callback(void) -{ - nmi_callback = dummy_nmi_callback; -} - -#ifdef CONFIG_KPROBES -fastcall int do_int3(struct pt_regs *regs, long error_code) -{ - if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) - == NOTIFY_STOP) - return 1; - /* This is an interrupt gate, because kprobes wants interrupts - disabled. Normal trap handlers don't. */ - restore_interrupts(regs); - do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); - return 0; -} -#endif - -/* - * Our handling of the processor debug registers is non-trivial. - * We do not clear them on entry and exit from the kernel. Therefore - * it is possible to get a watchpoint trap here from inside the kernel. 
- * However, the code in ./ptrace.c has ensured that the user can - * only set watchpoints on userspace addresses. Therefore the in-kernel - * watchpoint trap can only occur in code which is reading/writing - * from user space. Such code must not hold kernel locks (since it - * can equally take a page fault), therefore it is safe to call - * force_sig_info even though that claims and releases locks. - * - * Code in ./signal.c ensures that the debug control register - * is restored before we deliver any signal, and therefore that - * user code runs with the correct debug control register even though - * we clear it here. - * - * Being careful here means that we don't have to be as careful in a - * lot of more complicated places (task switching can be a bit lazy - * about restoring all the debug state, and ptrace doesn't have to - * find every occurrence of the TF bit that could be saved away even - * by user code) - */ -fastcall void do_debug(struct pt_regs * regs, long error_code) -{ - unsigned int condition; - struct task_struct *tsk = current; - - condition = HYPERVISOR_get_debugreg(6); - - if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, - SIGTRAP) == NOTIFY_STOP) - return; -#if 0 - /* It's safe to allow irq's after DR6 has been saved */ - if (regs->eflags & X86_EFLAGS_IF) - local_irq_enable(); -#endif - - /* Mask out spurious debug traps due to lazy DR7 setting */ - if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg[7]) - goto clear_dr7; - } - - if (regs->eflags & VM_MASK) - goto debug_vm86; - - /* Save debug status register where ptrace can see it */ - tsk->thread.debugreg[6] = condition; - - /* - * Single-stepping through TF: make sure we ignore any events in - * kernel space (but re-enable TF when returning to user mode). - * And if the event was due to a debugger (PT_DTRACE), clear the - * TF flag so that register information is correct. - */ - if (condition & DR_STEP) { - /* - * We already checked v86 mode above, so we can - * check for kernel mode by just checking the CPL - * of CS. - */ - if ((regs->xcs & 2) == 0) - goto clear_TF_reenable; - - if (likely(tsk->ptrace & PT_DTRACE)) { - tsk->ptrace &= ~PT_DTRACE; - regs->eflags &= ~TF_MASK; - } - } - - /* Ok, finally something we can handle */ - send_sigtrap(tsk, regs, error_code); - - /* Disable additional traps. They'll be re-enabled when - * the signal is delivered. - */ -clear_dr7: - HYPERVISOR_set_debugreg(7, 0); - return; - -debug_vm86: - handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); - return; - -clear_TF_reenable: - set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->eflags &= ~TF_MASK; - return; -} - -/* - * Note that we play around with the 'TS' bit in an attempt to get - * the correct behaviour even in the presence of the asynchronous - * IRQ13 behaviour - */ -void math_error(void __user *eip) -{ - struct task_struct * task; - siginfo_t info; - unsigned short cwd, swd; - - /* - * Save the info for the exception handler and clear the error. - */ - task = current; - save_init_fpu(task); - task->thread.trap_no = 16; - task->thread.error_code = 0; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_code = __SI_FAULT; - info.si_addr = eip; - /* - * (~cwd & swd) will mask out exceptions that are not set to unmasked - * status. 0x3f is the exception bits in these regs, 0x200 is the - * C1 reg you need in case of a stack fault, 0x040 is the stack - * fault bit. 
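
A worked instance of the masking just described, with values assumed for illustration: take cwd = 0x037b (the zero-divide exception unmasked in the control word) and swd = 0x0004 (ZE flagged in the status word). Then

	((~cwd) & swd & 0x3f) | (swd & 0x240)
	  = (0xfc84 & 0x0004 & 0x3f) | (0x0004 & 0x0240)
	  = 0x004 | 0x000 = 0x004

which selects the "Zero Divide" case in the switch below and yields FPE_FLTDIV.
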
We should only be taking one exception at a time,
- * so if this combination doesn't produce any single exception,
- * then we have a bad program that isn't synchronizing its FPU usage
- * and it will suffer the consequences since we won't be able to
- * fully reproduce the context of the exception.
- */
- cwd = get_fpu_cwd(task);
- swd = get_fpu_swd(task);
- switch (((~cwd) & swd & 0x3f) | (swd & 0x240)) {
- case 0x000:
- default:
- break;
- case 0x001: /* Invalid Op */
- case 0x041: /* Stack Fault */
- case 0x241: /* Stack Fault | Direction */
- info.si_code = FPE_FLTINV;
- /* Should we clear the SF or let user space do it ???? */
- break;
- case 0x002: /* Denormalize */
- case 0x010: /* Underflow */
- info.si_code = FPE_FLTUND;
- break;
- case 0x004: /* Zero Divide */
- info.si_code = FPE_FLTDIV;
- break;
- case 0x008: /* Overflow */
- info.si_code = FPE_FLTOVF;
- break;
- case 0x020: /* Precision */
- info.si_code = FPE_FLTRES;
- break;
- }
- force_sig_info(SIGFPE, &info, task);
-}
-
-fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)
-{
- ignore_fpu_irq = 1;
- math_error((void __user *)regs->eip);
-}
-
-void simd_math_error(void __user *eip)
-{
- struct task_struct * task;
- siginfo_t info;
- unsigned short mxcsr;
-
- /*
- * Save the info for the exception handler and clear the error.
- */
- task = current;
- save_init_fpu(task);
- task->thread.trap_no = 19;
- task->thread.error_code = 0;
- info.si_signo = SIGFPE;
- info.si_errno = 0;
- info.si_code = __SI_FAULT;
- info.si_addr = eip;
- /*
- * The SIMD FPU exceptions are handled a little differently, as there
- * is only a single status/control register. Thus, to determine which
- * unmasked exception was caught we must mask the exception mask bits
- * at 0x1f80, and then use these to mask the exception bits at 0x3f.
- */
- mxcsr = get_fpu_mxcsr(task);
- switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
- case 0x000:
- default:
- break;
- case 0x001: /* Invalid Op */
- info.si_code = FPE_FLTINV;
- break;
- case 0x002: /* Denormalize */
- case 0x010: /* Underflow */
- info.si_code = FPE_FLTUND;
- break;
- case 0x004: /* Zero Divide */
- info.si_code = FPE_FLTDIV;
- break;
- case 0x008: /* Overflow */
- info.si_code = FPE_FLTOVF;
- break;
- case 0x020: /* Precision */
- info.si_code = FPE_FLTRES;
- break;
- }
- force_sig_info(SIGFPE, &info, task);
-}
-
-fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
- long error_code)
-{
- if (cpu_has_xmm) {
- /* Handle SIMD FPU exceptions on PIII+ processors. */
- ignore_fpu_irq = 1;
- simd_math_error((void __user *)regs->eip);
- } else {
- /*
- * Handle strange cache flush from user space exception
- * in all other cases. This is undocumented behaviour.
- */
- if (regs->eflags & VM_MASK) {
- handle_vm86_fault((struct kernel_vm86_regs *)regs,
- error_code);
- return;
- }
- die_if_kernel("cache flush denied", regs, error_code);
- current->thread.trap_no = 19;
- current->thread.error_code = error_code;
- force_sig(SIGSEGV, current);
- }
-}
-
-/*
- * 'math_state_restore()' saves the current math information in the
- * old math state array, and gets the new ones from the current task
- *
- * Careful.. There are problems with IBM-designed IRQ13 behaviour.
- * Don't touch unless you *really* know how it works.
- *
- * Must be called with kernel preemption disabled (in this case,
- * local interrupts are disabled at the call-site in entry.S).
- */ -asmlinkage void math_state_restore(struct pt_regs regs) -{ - struct thread_info *thread = current_thread_info(); - struct task_struct *tsk = thread->task; - - /* NB. 'clts' is done for us by Xen during virtual trap. */ - if (!tsk_used_math(tsk)) - init_fpu(tsk); - restore_fpu(tsk); - thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ -} - -#ifndef CONFIG_MATH_EMULATION - -asmlinkage void math_emulate(long arg) -{ - printk("math-emulation not enabled and no coprocessor found.\n"); - printk("killing %s.\n",current->comm); - force_sig(SIGFPE,current); - schedule(); -} - -#endif /* CONFIG_MATH_EMULATION */ - -#ifdef CONFIG_X86_F00F_BUG -void __init trap_init_f00f_bug(void) -{ - __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); - - /* - * Update the IDT descriptor and reload the IDT so that - * it uses the read-only mapped virtual address. - */ - idt_descr.address = fix_to_virt(FIX_F00F_IDT); - __asm__ __volatile__("lidt %0" : : "m" (idt_descr)); -} -#endif - - -/* NB. All these are "trap gates" (i.e. events_mask isn't cleared). */ -static trap_info_t trap_table[] = { - { 0, 0, __KERNEL_CS, (unsigned long)divide_error }, - { 1, 0, __KERNEL_CS, (unsigned long)debug }, - { 3, 3, __KERNEL_CS, (unsigned long)int3 }, - { 4, 3, __KERNEL_CS, (unsigned long)overflow }, - { 5, 3, __KERNEL_CS, (unsigned long)bounds }, - { 6, 0, __KERNEL_CS, (unsigned long)invalid_op }, - { 7, 0, __KERNEL_CS, (unsigned long)device_not_available }, - { 9, 0, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun }, - { 10, 0, __KERNEL_CS, (unsigned long)invalid_TSS }, - { 11, 0, __KERNEL_CS, (unsigned long)segment_not_present }, - { 12, 0, __KERNEL_CS, (unsigned long)stack_segment }, - { 13, 0, __KERNEL_CS, (unsigned long)general_protection }, - { 14, 0, __KERNEL_CS, (unsigned long)page_fault }, - { 15, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment }, - { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error }, - { 17, 0, __KERNEL_CS, (unsigned long)alignment_check }, -#ifdef CONFIG_X86_MCE - { 18, 0, __KERNEL_CS, (unsigned long)machine_check }, -#endif - { 19, 0, __KERNEL_CS, (unsigned long)simd_coprocessor_error }, - { SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)system_call }, - { 0, 0, 0, 0 } -}; - -void __init trap_init(void) -{ - HYPERVISOR_set_trap_table(trap_table); - - /* - * default LDT is a single-entry callgate to lcall7 for iBCS - * and a callgate to lcall27 for Solaris/x86 binaries - */ - make_lowmem_page_readonly(&default_ldt[0]); - - /* - * Should be a barrier for any external CPU state. 
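
Given the zero-terminated layout of trap_table above, a minimal lookup helper would look like this (a sketch, not part of the original file):

static unsigned long trap_handler_for(int vector)
{
	trap_info_t *t;

	/* The table ends with an all-zero sentinel entry (address == 0),
	 * the same termination condition smp_trap_init() relies on. */
	for (t = trap_table; t->address; t++)
		if (t->vector == vector)
			return t->address;
	return 0;
}
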
- */ - cpu_init(); -} - -void smp_trap_init(trap_info_t *trap_ctxt) -{ - trap_info_t *t = trap_table; - - for (t = trap_table; t->address; t++) { - trap_ctxt[t->vector].flags = t->flags; - trap_ctxt[t->vector].cs = t->cs; - trap_ctxt[t->vector].address = t->address; - } -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/kernel/vsyscall.S --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/vsyscall.S Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,15 +0,0 @@ -#include <linux/init.h> - -__INITDATA - - .globl vsyscall_int80_start, vsyscall_int80_end -vsyscall_int80_start: - .incbin "arch/xen/i386/kernel/vsyscall-int80.so" -vsyscall_int80_end: - - .globl vsyscall_sysenter_start, vsyscall_sysenter_end -vsyscall_sysenter_start: - .incbin "arch/xen/i386/kernel/vsyscall-sysenter.so" -vsyscall_sysenter_end: - -__FINIT diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/mach-default/Makefile --- a/linux-2.6.11-xen-sparse/arch/xen/i386/mach-default/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,12 +0,0 @@ -# -# Makefile for the linux kernel. -# - -c-obj-y := topology.o - -$(patsubst %.o,$(obj)/%.c,$(c-obj-y)): - @ln -fsn $(srctree)/arch/i386/mach-default/$(notdir $@) $@ - -obj-y += $(c-obj-y) - -clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-)) diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/mm/Makefile --- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,24 +0,0 @@ -# -# Makefile for the linux i386-specific parts of the memory manager. -# - -XENARCH := $(subst ",,$(CONFIG_XENARCH)) - -CFLAGS += -Iarch/$(XENARCH)/mm - -obj-y := init.o pgtable.o fault.o ioremap.o hypervisor.o -c-obj-y := extable.o mmap.o pageattr.o - -c-obj-$(CONFIG_DISCONTIGMEM) += discontig.o -c-obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_HIGHMEM) += highmem.o -c-obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o - -c-link := - -$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)): - @ln -fsn $(srctree)/arch/i386/mm/$(notdir $@) $@ - -obj-y += $(c-obj-y) - -clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link)) diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/mm/fault.c --- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/fault.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,561 +0,0 @@ -/* - * linux/arch/i386/mm/fault.c - * - * Copyright (C) 1995 Linus Torvalds - */ - -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/ptrace.h> -#include <linux/mman.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/smp_lock.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/tty.h> -#include <linux/vt_kern.h> /* For unblank_screen() */ -#include <linux/highmem.h> -#include <linux/module.h> -#include <linux/percpu.h> - -#include <asm/system.h> -#include <asm/uaccess.h> -#include <asm/desc.h> -#include <asm/kdebug.h> - -extern void die(const char *,struct pt_regs *,long); - -DEFINE_PER_CPU(pgd_t *, cur_pgd); - -/* - * Unlock any spinlocks which will prevent us from getting the - * message out - */ -void bust_spinlocks(int yes) -{ - int loglevel_save = console_loglevel; - - if (yes) { - oops_in_progress = 1; - return; - } -#ifdef CONFIG_VT - unblank_screen(); -#endif - oops_in_progress = 0; - /* - * OK, the message is on the 
console. Now we call printk() - * without oops_in_progress set so that printk will give klogd - * a poke. Hold onto your hats... - */ - console_loglevel = 15; /* NMI oopser may have shut the console up */ - printk(" "); - console_loglevel = loglevel_save; -} - -/* - * Return EIP plus the CS segment base. The segment limit is also - * adjusted, clamped to the kernel/user address space (whichever is - * appropriate), and returned in *eip_limit. - * - * The segment is checked, because it might have been changed by another - * task between the original faulting instruction and here. - * - * If CS is no longer a valid code segment, or if EIP is beyond the - * limit, or if it is a kernel address when CS is not a kernel segment, - * then the returned value will be greater than *eip_limit. - * - * This is slow, but is very rarely executed. - */ -static inline unsigned long get_segment_eip(struct pt_regs *regs, - unsigned long *eip_limit) -{ - unsigned long eip = regs->eip; - unsigned seg = regs->xcs & 0xffff; - u32 seg_ar, seg_limit, base, *desc; - - /* The standard kernel/user address space limit. */ - *eip_limit = (seg & 2) ? USER_DS.seg : KERNEL_DS.seg; - - /* Unlikely, but must come before segment checks. */ - if (unlikely((regs->eflags & VM_MASK) != 0)) - return eip + (seg << 4); - - /* By far the most common cases. */ - if (likely(seg == __USER_CS || seg == __KERNEL_CS)) - return eip; - - /* Check the segment exists, is within the current LDT/GDT size, - that kernel/user (ring 0..3) has the appropriate privilege, - that it's a code segment, and get the limit. */ - __asm__ ("larl %3,%0; lsll %3,%1" - : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg)); - if ((~seg_ar & 0x9800) || eip > seg_limit) { - *eip_limit = 0; - return 1; /* So that returned eip > *eip_limit. */ - } - - /* Get the GDT/LDT descriptor base. - When you look for races in this code remember that - LDT and other horrors are only used in user space. */ - if (seg & (1<<2)) { - /* Must lock the LDT while reading it. */ - down(¤t->mm->context.sem); - desc = current->mm->context.ldt; - desc = (void *)desc + (seg & ~7); - } else { - /* Must disable preemption while reading the GDT. */ - desc = (u32 *)get_cpu_gdt_table(get_cpu()); - desc = (void *)desc + (seg & ~7); - } - - /* Decode the code segment base from the descriptor */ - base = get_desc_base((unsigned long *)desc); - - if (seg & (1<<2)) { - up(¤t->mm->context.sem); - } else - put_cpu(); - - /* Adjust EIP and segment limit, and clamp at the kernel limit. - It's legitimate for segments to wrap at 0xffffffff. */ - seg_limit += base; - if (seg_limit < *eip_limit && seg_limit >= base) - *eip_limit = seg_limit; - return eip + base; -} - -/* - * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. - * Check that here and ignore it. - */ -static int __is_prefetch(struct pt_regs *regs, unsigned long addr) -{ - unsigned long limit; - unsigned long instr = get_segment_eip (regs, &limit); - int scan_more = 1; - int prefetch = 0; - int i; - - for (i = 0; scan_more && i < 15; i++) { - unsigned char opcode; - unsigned char instr_hi; - unsigned char instr_lo; - - if (instr > limit) - break; - if (__get_user(opcode, (unsigned char *) instr)) - break; - - instr_hi = opcode & 0xf0; - instr_lo = opcode & 0x0f; - instr++; - - switch (instr_hi) { - case 0x20: - case 0x30: - /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */ - scan_more = ((instr_lo & 7) == 0x6); - break; - - case 0x60: - /* 0x64 thru 0x67 are valid prefixes in all modes. 
*/
- scan_more = (instr_lo & 0xC) == 0x4;
- break;
- case 0xF0:
- /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
- scan_more = !instr_lo || (instr_lo>>1) == 1;
- break;
- case 0x00:
- /* Prefetch instruction is 0x0F0D or 0x0F18 */
- scan_more = 0;
- if (instr > limit)
- break;
- if (__get_user(opcode, (unsigned char *) instr))
- break;
- prefetch = (instr_lo == 0xF) &&
- (opcode == 0x0D || opcode == 0x18);
- break;
- default:
- scan_more = 0;
- break;
- }
- }
- return prefetch;
-}
-
-static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
- unsigned long error_code)
-{
- if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
- boot_cpu_data.x86 >= 6)) {
- /* Catch an obscure case of prefetch inside an NX page. */
- if (nx_enabled && (error_code & 16))
- return 0;
- return __is_prefetch(regs, addr);
- }
- return 0;
-}
-
-fastcall void do_invalid_op(struct pt_regs *, unsigned long);
-
-/*
- * This routine handles page faults. It determines the address,
- * and the problem, and then passes it off to one of the appropriate
- * routines.
- *
- * error_code:
- * bit 0 == 0 means no page found, 1 means protection fault
- * bit 1 == 0 means read, 1 means write
- * bit 2 == 0 means kernel, 1 means user-mode
- */
-fastcall void do_page_fault(struct pt_regs *regs, unsigned long error_code,
- unsigned long address)
-{
- struct task_struct *tsk;
- struct mm_struct *mm;
- struct vm_area_struct * vma;
- unsigned long page;
- int write;
- siginfo_t info;
-
- /* Set the "privileged fault" bit to something sane. */
- error_code &= 3;
- error_code |= (regs->xcs & 2) << 1;
- if (regs->eflags & X86_EFLAGS_VM)
- error_code |= 4;
-
- if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
- SIGSEGV) == NOTIFY_STOP)
- return;
-#if 0
- /* It's safe to allow irq's after cr2 has been saved */
- if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
- local_irq_enable();
-#endif
-
- tsk = current;
-
- info.si_code = SEGV_MAPERR;
-
- /*
- * We fault-in kernel-space virtual memory on-demand. The
- * 'reference' page table is init_mm.pgd.
- *
- * NOTE! We MUST NOT take any locks for this case. We may
- * be in an interrupt or a critical region, and should
- * only copy the information from the master page table,
- * nothing more.
- *
- * This verifies that the fault happens in kernel space
- * (error_code & 4) == 0, and that the fault was not a
- * protection error (error_code & 1) == 0.
- */
- if (unlikely(address >= TASK_SIZE)) {
- if (!(error_code & 5))
- goto vmalloc_fault;
- /*
- * Don't take the mm semaphore here. If we fixup a prefetch
- * fault we could otherwise deadlock.
- */
- goto bad_area_nosemaphore;
- }
-
- mm = tsk->mm;
-
- /*
- * If we're in an interrupt, have no user context or are running in an
- * atomic region then we must not take the fault..
- */
- if (in_atomic() || !mm)
- goto bad_area_nosemaphore;
-
- /* When running in the kernel we expect faults to occur only to
- * addresses in user space. All other faults represent errors in the
- * kernel and should generate an OOPS. Unfortunately, in the case of an
- * erroneous fault occurring in a code path which already holds mmap_sem
- * we will deadlock attempting to validate the fault against the
- * address space. Luckily the kernel only validly references user
- * space from well defined areas of code, which are listed in the
- * exceptions table.
- *
- * As the vast majority of faults will be valid we will only perform
- * the source reference check when there is a possibility of a deadlock.
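
A minimal decoder for the three error_code bits documented above (an illustrative sketch only, not part of the handler):

static void describe_fault(unsigned long error_code)
{
	/* bit 0: protection vs. not-present; bit 1: write vs. read;
	 * bit 2: user vs. kernel mode. */
	printk("%s fault on %s access from %s mode\n",
	       (error_code & 1) ? "protection" : "no-page",
	       (error_code & 2) ? "write" : "read",
	       (error_code & 4) ? "user" : "kernel");
}
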
- * Attempt to lock the address space, if we cannot we then validate the - * source. If this is invalid we can skip the address space check, - * thus avoiding the deadlock. - */ - if (!down_read_trylock(&mm->mmap_sem)) { - if ((error_code & 4) == 0 && - !search_exception_tables(regs->eip)) - goto bad_area_nosemaphore; - down_read(&mm->mmap_sem); - } - - vma = find_vma(mm, address); - if (!vma) - goto bad_area; - if (vma->vm_start <= address) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (error_code & 4) { - /* - * accessing the stack below %esp is always a bug. - * The "+ 32" is there due to some instructions (like - * pusha) doing post-decrement on the stack and that - * doesn't show up until later.. - */ - if (address + 32 < regs->esp) - goto bad_area; - } - if (expand_stack(vma, address)) - goto bad_area; -/* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ -good_area: - info.si_code = SEGV_ACCERR; - write = 0; - switch (error_code & 3) { - default: /* 3: write, present */ -#ifdef TEST_VERIFY_AREA - if (regs->cs == KERNEL_CS) - printk("WP fault at %08lx\n", regs->eip); -#endif - /* fall through */ - case 2: /* write, not present */ - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - write++; - break; - case 1: /* read, present */ - goto bad_area; - case 0: /* read, not present */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC))) - goto bad_area; - } - - survive: - /* - * If for any reason at all we couldn't handle the fault, - * make sure we exit gracefully rather than endlessly redo - * the fault. - */ - switch (handle_mm_fault(mm, vma, address, write)) { - case VM_FAULT_MINOR: - tsk->min_flt++; - break; - case VM_FAULT_MAJOR: - tsk->maj_flt++; - break; - case VM_FAULT_SIGBUS: - goto do_sigbus; - case VM_FAULT_OOM: - goto out_of_memory; - default: - BUG(); - } - - /* - * Did it hit the DOS screen memory VA from vm86 mode? - */ - if (regs->eflags & VM_MASK) { - unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; - if (bit < 32) - tsk->thread.screen_bitmap |= 1 << bit; - } - up_read(&mm->mmap_sem); - return; - -/* - * Something tried to access memory that isn't in our memory map.. - * Fix it, but check if it's kernel or user first.. - */ -bad_area: - up_read(&mm->mmap_sem); - -bad_area_nosemaphore: - /* User mode accesses just cause a SIGSEGV */ - if (error_code & 4) { - /* - * Valid to do another page fault here because this one came - * from user space. - */ - if (is_prefetch(regs, address, error_code)) - return; - - tsk->thread.cr2 = address; - /* Kernel addresses are always protection faults */ - tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; - info.si_signo = SIGSEGV; - info.si_errno = 0; - /* info.si_code has been set above */ - info.si_addr = (void __user *)address; - force_sig_info(SIGSEGV, &info, tsk); - return; - } - -#ifdef CONFIG_X86_F00F_BUG - /* - * Pentium F0 0F C7 C8 bug workaround. - */ - if (boot_cpu_data.f00f_bug) { - unsigned long nr; - - nr = (address - idt_descr.address) >> 3; - - if (nr == 6) { - do_invalid_op(regs, 0); - return; - } - } -#endif - -no_context: - /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs)) - return; - - /* - * Valid to do another page fault here, because if this fault - * had been triggered by is_prefetch fixup_exception would have - * handled it. - */ - if (is_prefetch(regs, address, error_code)) - return; - -/* - * Oops. The kernel tried to access some bad page. 
We'll have to
- * terminate things with extreme prejudice.
- */
-
- bust_spinlocks(1);
-
-#ifdef CONFIG_X86_PAE
- if (error_code & 16) {
- pte_t *pte = lookup_address(address);
-
- if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
- printk(KERN_CRIT "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n", current->uid);
- }
-#endif
- if (address < PAGE_SIZE)
- printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
- else
- printk(KERN_ALERT "Unable to handle kernel paging request");
- printk(" at virtual address %08lx\n",address);
- printk(KERN_ALERT " printing eip:\n");
- printk("%08lx\n", regs->eip);
- page = ((unsigned long *) per_cpu(cur_pgd, smp_processor_id()))
- [address >> 22];
- printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
- machine_to_phys(page));
- /*
- * We must not directly access the pte in the highpte
- * case, the page table might be allocated in highmem.
- * And let's rather not kmap-atomic the pte, just in case
- * it's allocated already.
- */
-#ifndef CONFIG_HIGHPTE
- if (page & 1) {
- page &= PAGE_MASK;
- address &= 0x003ff000;
- page = machine_to_phys(page);
- page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
- printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page,
- machine_to_phys(page));
- }
-#endif
- die("Oops", regs, error_code);
- bust_spinlocks(0);
- do_exit(SIGKILL);
-
-/*
- * We ran out of memory, or some other thing happened to us that made
- * us unable to handle the page fault gracefully.
- */
-out_of_memory:
- up_read(&mm->mmap_sem);
- if (tsk->pid == 1) {
- yield();
- down_read(&mm->mmap_sem);
- goto survive;
- }
- printk("VM: killing process %s\n", tsk->comm);
- if (error_code & 4)
- do_exit(SIGKILL);
- goto no_context;
-
-do_sigbus:
- up_read(&mm->mmap_sem);
-
- /* Kernel mode? Handle exceptions or die */
- if (!(error_code & 4))
- goto no_context;
-
- /* User space => ok to do another page fault */
- if (is_prefetch(regs, address, error_code))
- return;
-
- tsk->thread.cr2 = address;
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = 14;
- info.si_signo = SIGBUS;
- info.si_errno = 0;
- info.si_code = BUS_ADRERR;
- info.si_addr = (void __user *)address;
- force_sig_info(SIGBUS, &info, tsk);
- return;
-
-vmalloc_fault:
- {
- /*
- * Synchronize this task's top level page-table
- * with the 'reference' page table.
- *
- * Do _not_ use "tsk" here. We might be inside
- * an interrupt in the middle of a task switch..
- */
- int index = pgd_index(address);
- pgd_t *pgd, *pgd_k;
- pud_t *pud, *pud_k;
- pmd_t *pmd, *pmd_k;
- pte_t *pte_k;
-
- pgd = index + per_cpu(cur_pgd, smp_processor_id());
- pgd_k = init_mm.pgd + index;
-
- if (!pgd_present(*pgd_k))
- goto no_context;
-
- /*
- * set_pgd(pgd, *pgd_k); here would be useless on PAE
- * and redundant with the set_pmd() on non-PAE. As would
- * set_pud.
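
The oops dump above prints each page-table entry both as a machine address (ma) and a pseudo-physical address (pa); under Xen the two differ. The translation it relies on is, schematically (a sketch assuming the usual machine_to_phys_mapping table):

static unsigned long ma_to_pa(unsigned long maddr)
{
	unsigned long mfn = maddr >> PAGE_SHIFT;
	/* Xen maintains a global machine-frame to pseudo-physical-frame
	 * table that every guest can read. */
	unsigned long pfn = machine_to_phys_mapping[mfn];

	return (pfn << PAGE_SHIFT) | (maddr & ~PAGE_MASK);
}
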
- */
-
- pud = pud_offset(pgd, address);
- pud_k = pud_offset(pgd_k, address);
- if (!pud_present(*pud_k))
- goto no_context;
-
- pmd = pmd_offset(pud, address);
- pmd_k = pmd_offset(pud_k, address);
- if (!pmd_present(*pmd_k))
- goto no_context;
- set_pmd(pmd, *pmd_k);
-
- pte_k = pte_offset_kernel(pmd_k, address);
- if (!pte_present(*pte_k))
- goto no_context;
- return;
- }
-}
diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/mm/highmem.c
--- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/highmem.c Fri Jul 15 19:57:12 2005
+++ /dev/null Sat Jul 16 14:02:54 2005
@@ -1,100 +0,0 @@
-#include <linux/highmem.h>
-
-void *kmap(struct page *page)
-{
- might_sleep();
- if (!PageHighMem(page))
- return page_address(page);
- return kmap_high(page);
-}
-
-void kunmap(struct page *page)
-{
- if (in_interrupt())
- BUG();
- if (!PageHighMem(page))
- return;
- kunmap_high(page);
-}
-
-/*
- * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
- * no global lock is needed and because the kmap code must perform a global TLB
- * invalidation when the kmap pool wraps.
- *
- * However, when holding an atomic kmap it is not legal to sleep, so atomic
- * kmaps are appropriate for short, tight code paths only.
- */
-static void *__kmap_atomic(struct page *page, enum km_type type, pgprot_t prot)
-{
- enum fixed_addresses idx;
- unsigned long vaddr;
-
- /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
- inc_preempt_count();
- if (!PageHighMem(page))
- return page_address(page);
-
- idx = type + KM_TYPE_NR*smp_processor_id();
- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
-#ifdef CONFIG_DEBUG_HIGHMEM
- if (!pte_none(*(kmap_pte-idx)))
- BUG();
-#endif
- set_pte(kmap_pte-idx, mk_pte(page, prot));
- __flush_tlb_one(vaddr);
-
- return (void*) vaddr;
-}
-
-void *kmap_atomic(struct page *page, enum km_type type)
-{
- return __kmap_atomic(page, type, kmap_prot);
-}
-
-/* Same as kmap_atomic but with PAGE_KERNEL_RO page protection. */
-void *kmap_atomic_pte(struct page *page, enum km_type type)
-{
- return __kmap_atomic(page, type, PAGE_KERNEL_RO);
-}
-
-void kunmap_atomic(void *kvaddr, enum km_type type)
-{
-#ifdef CONFIG_DEBUG_HIGHMEM
- unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
- enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
-
- if (vaddr < FIXADDR_START) { // FIXME
- dec_preempt_count();
- preempt_check_resched();
- return;
- }
-
- if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx))
- BUG();
-
- /*
- * force other mappings to Oops if they'll try to access
- * this pte without first remapping it
- */
- pte_clear(kmap_pte-idx);
- __flush_tlb_one(vaddr);
-#endif
-
- dec_preempt_count();
- preempt_check_resched();
-}
-
-struct page *kmap_atomic_to_page(void *ptr)
-{
- unsigned long idx, vaddr = (unsigned long)ptr;
- pte_t *pte;
-
- if (vaddr < FIXADDR_START)
- return virt_to_page(ptr);
-
- idx = virt_to_fix(vaddr);
- pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
- return pte_page(*pte);
-}
-
diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c
--- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c Fri Jul 15 19:57:12 2005
+++ /dev/null Sat Jul 16 14:02:54 2005
@@ -1,346 +0,0 @@
-/******************************************************************************
- * mm/hypervisor.c
- *
- * Update page tables via the hypervisor.
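
The update helpers that follow each issue a one-element mmu_update hypercall; batching several updates amortizes the hypercall cost, roughly as in this sketch (the helper name and batch size are illustrative, not from the original file):

static void xen_l1_batch_update(pte_t *ptrs[], unsigned long vals[], int nr)
{
	mmu_update_t u[16];
	int i;

	BUG_ON(nr > 16);	/* arbitrary example limit */
	for (i = 0; i < nr; i++) {
		u[i].ptr = virt_to_machine(ptrs[i]);
		u[i].val = vals[i];
	}
	/* One hypercall validates and applies all nr updates. */
	BUG_ON(HYPERVISOR_mmu_update(u, nr, NULL, DOMID_SELF) < 0);
}
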
- * - * Copyright (c) 2002-2004, K A Fraser - * - * This file may be distributed separately from the Linux kernel, or - * incorporated into other software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include <linux/config.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/vmalloc.h> -#include <asm/page.h> -#include <asm/pgtable.h> -#include <asm-xen/hypervisor.h> -#include <asm-xen/balloon.h> -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) -#include <linux/percpu.h> -#include <asm/tlbflush.h> -#endif - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) -#define pte_offset_kernel pte_offset -#define pud_t pgd_t -#define pud_offset(d, va) d -#elif defined(CONFIG_X86_64) -#define pmd_val_ma(v) (v).pmd -#else -#define pmd_val_ma(v) (v).pud.pgd.pgd -#endif - -#ifndef CONFIG_XEN_SHADOW_MODE -void xen_l1_entry_update(pte_t *ptr, unsigned long val) -{ - mmu_update_t u; - u.ptr = virt_to_machine(ptr); - u.val = val; - BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); -} - -void xen_l2_entry_update(pmd_t *ptr, pmd_t val) -{ - mmu_update_t u; - u.ptr = virt_to_machine(ptr); - u.val = pmd_val_ma(val); - BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); -} - -#ifdef CONFIG_X86_64 -void xen_l3_entry_update(pud_t *ptr, pud_t val) -{ - mmu_update_t u; - u.ptr = virt_to_machine(ptr); - u.val = val.pud; - BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); -} - -void xen_l4_entry_update(pgd_t *ptr, pgd_t val) -{ - mmu_update_t u; - u.ptr = virt_to_machine(ptr); - u.val = val.pgd; - BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); -} -#endif /* CONFIG_X86_64 */ -#endif /* CONFIG_XEN_SHADOW_MODE */ - -void xen_machphys_update(unsigned long mfn, unsigned long pfn) -{ - mmu_update_t u; - u.ptr = (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; - u.val = pfn; - BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); -} - -void xen_pt_switch(unsigned long ptr) -{ - struct mmuext_op op; - op.cmd = MMUEXT_NEW_BASEPTR; - op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void xen_new_user_pt(unsigned long ptr) -{ - struct mmuext_op op; - op.cmd = MMUEXT_NEW_USER_BASEPTR; - op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void xen_tlb_flush(void) -{ - struct mmuext_op op; - op.cmd = MMUEXT_TLB_FLUSH_LOCAL; - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void xen_invlpg(unsigned long 
ptr) -{ - struct mmuext_op op; - op.cmd = MMUEXT_INVLPG_LOCAL; - op.linear_addr = ptr & PAGE_MASK; - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -#ifdef CONFIG_SMP - -void xen_tlb_flush_all(void) -{ - struct mmuext_op op; - op.cmd = MMUEXT_TLB_FLUSH_ALL; - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void xen_tlb_flush_mask(cpumask_t *mask) -{ - struct mmuext_op op; - if ( cpus_empty(*mask) ) - return; - op.cmd = MMUEXT_TLB_FLUSH_MULTI; - op.vcpumask = mask->bits; - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void xen_invlpg_all(unsigned long ptr) -{ - struct mmuext_op op; - op.cmd = MMUEXT_INVLPG_ALL; - op.linear_addr = ptr & PAGE_MASK; - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr) -{ - struct mmuext_op op; - if ( cpus_empty(*mask) ) - return; - op.cmd = MMUEXT_INVLPG_MULTI; - op.vcpumask = mask->bits; - op.linear_addr = ptr & PAGE_MASK; - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -#endif /* CONFIG_SMP */ - -#ifndef CONFIG_XEN_SHADOW_MODE -void xen_pgd_pin(unsigned long ptr) -{ - struct mmuext_op op; -#ifdef CONFIG_X86_64 - op.cmd = MMUEXT_PIN_L4_TABLE; -#else - op.cmd = MMUEXT_PIN_L2_TABLE; -#endif - op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void xen_pgd_unpin(unsigned long ptr) -{ - struct mmuext_op op; - op.cmd = MMUEXT_UNPIN_TABLE; - op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void xen_pte_pin(unsigned long ptr) -{ - struct mmuext_op op; - op.cmd = MMUEXT_PIN_L1_TABLE; - op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void xen_pte_unpin(unsigned long ptr) -{ - struct mmuext_op op; - op.cmd = MMUEXT_UNPIN_TABLE; - op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -#ifdef CONFIG_X86_64 -void xen_pud_pin(unsigned long ptr) -{ - struct mmuext_op op; - op.cmd = MMUEXT_PIN_L3_TABLE; - op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void xen_pud_unpin(unsigned long ptr) -{ - struct mmuext_op op; - op.cmd = MMUEXT_UNPIN_TABLE; - op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void xen_pmd_pin(unsigned long ptr) -{ - struct mmuext_op op; - op.cmd = MMUEXT_PIN_L2_TABLE; - op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void xen_pmd_unpin(unsigned long ptr) -{ - struct mmuext_op op; - op.cmd = MMUEXT_UNPIN_TABLE; - op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} -#endif /* CONFIG_X86_64 */ -#endif /* CONFIG_XEN_SHADOW_MODE */ - -void xen_set_ldt(unsigned long ptr, unsigned long len) -{ - struct mmuext_op op; - op.cmd = MMUEXT_SET_LDT; - op.linear_addr = ptr; - op.nr_ents = len; - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void xen_contig_memory(unsigned long vstart, unsigned int order) -{ - /* - * Ensure multi-page extents are contiguous in machine memory. This code - * could be cleaned up some, and the number of hypercalls reduced. - */ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - unsigned long mfn, i, flags; - - scrub_pages(vstart, 1 << order); - - balloon_lock(flags); - - /* 1. 
Zap current PTEs, giving away the underlying pages. */ - for (i = 0; i < (1<<order); i++) { - pgd = pgd_offset_k(vstart + (i*PAGE_SIZE)); - pud = pud_offset(pgd, (vstart + (i*PAGE_SIZE))); - pmd = pmd_offset(pud, (vstart + (i*PAGE_SIZE))); - pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE))); - mfn = pte_mfn(*pte); - HYPERVISOR_update_va_mapping( - vstart + (i*PAGE_SIZE), __pte_ma(0), 0); - phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] = - INVALID_P2M_ENTRY; - BUG_ON(HYPERVISOR_dom_mem_op( - MEMOP_decrease_reservation, &mfn, 1, 0) != 1); - } - - /* 2. Get a new contiguous memory extent. */ - BUG_ON(HYPERVISOR_dom_mem_op( - MEMOP_increase_reservation, &mfn, 1, order) != 1); - - /* 3. Map the new extent in place of old pages. */ - for (i = 0; i < (1<<order); i++) { - HYPERVISOR_update_va_mapping( - vstart + (i*PAGE_SIZE), - __pte_ma(((mfn+i)<<PAGE_SHIFT)|__PAGE_KERNEL), 0); - xen_machphys_update(mfn+i, (__pa(vstart)>>PAGE_SHIFT)+i); - phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] = mfn+i; - } - - flush_tlb_all(); - - balloon_unlock(flags); -} - -#ifdef CONFIG_XEN_PHYSDEV_ACCESS - -unsigned long allocate_empty_lowmem_region(unsigned long pages) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - unsigned long *pfn_array; - unsigned long vstart; - unsigned long i; - unsigned int order = get_order(pages*PAGE_SIZE); - - vstart = __get_free_pages(GFP_KERNEL, order); - if ( vstart == 0 ) - return 0UL; - - scrub_pages(vstart, 1 << order); - - pfn_array = vmalloc((1<<order) * sizeof(*pfn_array)); - if ( pfn_array == NULL ) - BUG(); - - for ( i = 0; i < (1<<order); i++ ) - { - pgd = pgd_offset_k( (vstart + (i*PAGE_SIZE))); - pud = pud_offset(pgd, (vstart + (i*PAGE_SIZE))); - pmd = pmd_offset(pud, (vstart + (i*PAGE_SIZE))); - pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE))); - pfn_array[i] = pte_mfn(*pte); -#ifdef CONFIG_X86_64 - xen_l1_entry_update(pte, 0); -#else - HYPERVISOR_update_va_mapping(vstart + (i*PAGE_SIZE), __pte_ma(0), 0); -#endif - phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] = - INVALID_P2M_ENTRY; - } - - flush_tlb_all(); - - balloon_put_pages(pfn_array, 1 << order); - - vfree(pfn_array); - - return vstart; -} - -#endif /* CONFIG_XEN_PHYSDEV_ACCESS */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/mm/init.c --- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/init.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,795 +0,0 @@ -/* - * linux/arch/i386/mm/init.c - * - * Copyright (C) 1995 Linus Torvalds - * - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 - */ - -#include <linux/config.h> -#include <linux/module.h> -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/ptrace.h> -#include <linux/mman.h> -#include <linux/mm.h> -#include <linux/hugetlb.h> -#include <linux/swap.h> -#include <linux/smp.h> -#include <linux/init.h> -#include <linux/highmem.h> -#include <linux/pagemap.h> -#include <linux/bootmem.h> -#include <linux/slab.h> -#include <linux/proc_fs.h> -#include <linux/efi.h> - -#include <asm/processor.h> -#include <asm/system.h> -#include <asm/uaccess.h> -#include <asm/pgtable.h> -#include <asm/dma.h> -#include <asm/fixmap.h> -#include <asm/e820.h> -#include <asm/apic.h> -#include <asm/tlb.h> -#include <asm/tlbflush.h> -#include <asm/sections.h> -#include <asm-xen/hypervisor.h> - -unsigned int __VMALLOC_RESERVE = 128 << 20; - -DEFINE_PER_CPU(struct 
mmu_gather, mmu_gathers); -unsigned long highstart_pfn, highend_pfn; - -static int noinline do_test_wp_bit(void); - -/* - * Creates a middle page table and puts a pointer to it in the - * given global directory entry. This only returns the gd entry - * in non-PAE compilation mode, since the middle layer is folded. - */ -static pmd_t * __init one_md_table_init(pgd_t *pgd) -{ - pud_t *pud; - pmd_t *pmd_table; - -#ifdef CONFIG_X86_PAE - pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); - set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - pud = pud_offset(pgd, 0); - if (pmd_table != pmd_offset(pud, 0)) - BUG(); -#else - pud = pud_offset(pgd, 0); - pmd_table = pmd_offset(pud, 0); -#endif - - return pmd_table; -} - -/* - * Create a page table and place a pointer to it in a middle page - * directory entry. - */ -static pte_t * __init one_page_table_init(pmd_t *pmd) -{ - if (pmd_none(*pmd)) { - pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); - make_page_readonly(page_table); - set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); - if (page_table != pte_offset_kernel(pmd, 0)) - BUG(); - - return page_table; - } - - return pte_offset_kernel(pmd, 0); -} - -/* - * This function initializes a certain range of kernel virtual memory - * with new bootmem page tables, everywhere page tables are missing in - * the given range. - */ - -/* - * NOTE: The pagetables are allocated contiguous on the physical space - * so we can cache the place of the first one and move around without - * checking the pgd every time. - */ -static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - int pgd_idx, pmd_idx; - unsigned long vaddr; - - vaddr = start; - pgd_idx = pgd_index(vaddr); - pmd_idx = pmd_index(vaddr); - pgd = pgd_base + pgd_idx; - - for ( ; (pgd_idx < PTRS_PER_PGD_NO_HV) && (vaddr != end); pgd++, pgd_idx++) { - if (pgd_none(*pgd)) - one_md_table_init(pgd); - pud = pud_offset(pgd, vaddr); - pmd = pmd_offset(pud, vaddr); - for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { - if (pmd_none(*pmd)) - one_page_table_init(pmd); - - vaddr += PMD_SIZE; - } - pmd_idx = 0; - } -} - -static inline int is_kernel_text(unsigned long addr) -{ - if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end) - return 1; - return 0; -} - -/* - * This maps the physical memory to kernel virtual address space, a total - * of max_low_pfn pages, by creating page tables starting from address - * PAGE_OFFSET. - */ -static void __init kernel_physical_mapping_init(pgd_t *pgd_base) -{ - unsigned long pfn; - pgd_t *pgd; - pmd_t *pmd; - pte_t *pte; - int pgd_idx, pmd_idx, pte_ofs; - - unsigned long max_ram_pfn = xen_start_info.nr_pages; - if (max_ram_pfn > max_low_pfn) - max_ram_pfn = max_low_pfn; - - pgd_idx = pgd_index(PAGE_OFFSET); - pgd = pgd_base + pgd_idx; - pfn = 0; - pmd_idx = pmd_index(PAGE_OFFSET); - pte_ofs = pte_index(PAGE_OFFSET); - - for (; pgd_idx < PTRS_PER_PGD_NO_HV; pgd++, pgd_idx++) { - pmd = one_md_table_init(pgd); - if (pfn >= max_low_pfn) - continue; - pmd += pmd_idx; - for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) { - unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET; - - /* Map with big pages if possible, otherwise create normal page tables. 
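
For scale (a worked note, not new code): with 4 kB pages and PTRS_PER_PTE == 1024, one PSE pmd entry maps 1024 * 4 kB = 4 MB, which is why the big-page branch below advances pfn by PTRS_PER_PTE per pmd slot instead of walking individual PTEs.
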
*/ - if (cpu_has_pse) { - unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; - - if (is_kernel_text(address) || is_kernel_text(address2)) - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); - else - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); - pfn += PTRS_PER_PTE; - } else { - pte = one_page_table_init(pmd); - - pte += pte_ofs; - for (; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) { - /* XEN: Only map initial RAM allocation. */ - if ((pfn >= max_ram_pfn) || pte_present(*pte)) - continue; - if (is_kernel_text(address)) - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); - else - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); - } - pte_ofs = 0; - } - } - pmd_idx = 0; - } -} - -static inline int page_kills_ppro(unsigned long pagenr) -{ - if (pagenr >= 0x70000 && pagenr <= 0x7003F) - return 1; - return 0; -} - -extern int is_available_memory(efi_memory_desc_t *); - -static inline int page_is_ram(unsigned long pagenr) -{ - int i; - unsigned long addr, end; - - if (efi_enabled) { - efi_memory_desc_t *md; - - for (i = 0; i < memmap.nr_map; i++) { - md = &memmap.map[i]; - if (!is_available_memory(md)) - continue; - addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT; - end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT; - - if ((pagenr >= addr) && (pagenr < end)) - return 1; - } - return 0; - } - - for (i = 0; i < e820.nr_map; i++) { - - if (e820.map[i].type != E820_RAM) /* not usable memory */ - continue; - /* - * !!!FIXME!!! Some BIOSen report areas as RAM that - * are not. Notably the 640->1Mb area. We need a sanity - * check here. - */ - addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT; - end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT; - if ((pagenr >= addr) && (pagenr < end)) - return 1; - } - return 0; -} - -#ifdef CONFIG_HIGHMEM -pte_t *kmap_pte; -pgprot_t kmap_prot; - -EXPORT_SYMBOL(kmap_prot); -EXPORT_SYMBOL(kmap_pte); - -#define kmap_get_fixmap_pte(vaddr) \ - pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr)) - -void __init kmap_init(void) -{ - unsigned long kmap_vstart; - - /* cache the first kmap pte */ - kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); - kmap_pte = kmap_get_fixmap_pte(kmap_vstart); - - kmap_prot = PAGE_KERNEL; -} - -void __init permanent_kmaps_init(pgd_t *pgd_base) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - unsigned long vaddr; - - vaddr = PKMAP_BASE; - page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); - - pgd = swapper_pg_dir + pgd_index(vaddr); - pud = pud_offset(pgd, vaddr); - pmd = pmd_offset(pud, vaddr); - pte = pte_offset_kernel(pmd, vaddr); - pkmap_page_table = pte; -} - -void __init one_highpage_init(struct page *page, int pfn, int bad_ppro) -{ - if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { - ClearPageReserved(page); - set_bit(PG_highmem, &page->flags); - set_page_count(page, 1); - if (pfn < xen_start_info.nr_pages) - __free_page(page); - totalhigh_pages++; - } else - SetPageReserved(page); -} - -#ifndef CONFIG_DISCONTIGMEM -void __init set_highmem_pages_init(int bad_ppro) -{ - int pfn; - for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) - one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); - totalram_pages += totalhigh_pages; -} -#else -extern void set_highmem_pages_init(int); -#endif /* !CONFIG_DISCONTIGMEM */ - -#else -#define kmap_init() do { } while (0) -#define permanent_kmaps_init(pgd_base) do { } while (0) -#define set_highmem_pages_init(bad_ppro) do { } while (0) -#endif /* 
CONFIG_HIGHMEM */ - -unsigned long long __PAGE_KERNEL = _PAGE_KERNEL; -unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; - -#ifndef CONFIG_DISCONTIGMEM -#define remap_numa_kva() do {} while (0) -#else -extern void __init remap_numa_kva(void); -#endif - -static void __init pagetable_init (void) -{ - unsigned long vaddr; - pgd_t *pgd_base = swapper_pg_dir; - pgd_t *old_pgd = (pgd_t *)xen_start_info.pt_base; - -#ifdef CONFIG_X86_PAE - int i; - /* Init entries of the first-level page table to the zero page */ - for (i = 0; i < PTRS_PER_PGD; i++) - set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); -#endif - - /* Enable PSE if available */ - if (cpu_has_pse) { - set_in_cr4(X86_CR4_PSE); - } - - /* Enable PGE if available */ - if (cpu_has_pge) { - set_in_cr4(X86_CR4_PGE); - __PAGE_KERNEL |= _PAGE_GLOBAL; - __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL; - } - - /* - * Switch to proper mm_init page directory. Initialise from the current - * page directory, write-protect the new page directory, then switch to - * it. We clean up by write-enabling and then freeing the old page dir. - */ - memcpy(pgd_base, old_pgd, PTRS_PER_PGD_NO_HV*sizeof(pgd_t)); - make_page_readonly(pgd_base); - xen_pgd_pin(__pa(pgd_base)); - load_cr3(pgd_base); - xen_pgd_unpin(__pa(old_pgd)); - make_page_writable(old_pgd); - __flush_tlb_all(); - free_bootmem(__pa(old_pgd), PAGE_SIZE); - init_mm.context.pinned = 1; - - kernel_physical_mapping_init(pgd_base); - remap_numa_kva(); - - /* - * Fixed mappings, only the page table structure has to be - * created - mappings will be set by set_fixmap(): - */ - vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; - page_table_range_init(vaddr, 0, pgd_base); - - permanent_kmaps_init(pgd_base); - -#ifdef CONFIG_X86_PAE - /* - * Add low memory identity-mappings - SMP needs it when - * starting up on an AP from real-mode. In the non-PAE - * case we already have these mappings through head.S. - * All user-space mappings are explicitly cleared after - * SMP startup. - */ - pgd_base[0] = pgd_base[USER_PTRS_PER_PGD]; -#endif -} - -#if defined(CONFIG_PM_DISK) || defined(CONFIG_SOFTWARE_SUSPEND) -/* - * Swap suspend & friends need this for resume because things like the intel-agp - * driver might have split up a kernel 4MB mapping. - */ -char __nosavedata swsusp_pg_dir[PAGE_SIZE] - __attribute__ ((aligned (PAGE_SIZE))); - -static inline void save_pg_dir(void) -{ - memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE); -} -#else -static inline void save_pg_dir(void) -{ -} -#endif - -void zap_low_mappings (void) -{ - int i; - - save_pg_dir(); - - /* - * Zap initial low-memory mappings. - * - * Note that "pgd_clear()" doesn't do it for - * us, because pgd_clear() is a no-op on i386. - */ - for (i = 0; i < USER_PTRS_PER_PGD; i++) -#ifdef CONFIG_X86_PAE - set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); -#else - set_pgd(swapper_pg_dir+i, __pgd(0)); -#endif - flush_tlb_all(); -} - -#ifndef CONFIG_DISCONTIGMEM -void __init zone_sizes_init(void) -{ - unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; - unsigned int /*max_dma,*/ high, low; - - /* - * XEN: Our notion of "DMA memory" is fake when running over Xen. - * We simply put all RAM in the DMA zone so that those drivers which - * needlessly specify GFP_DMA do not get starved of RAM unnecessarily. - * Those drivers that *do* require lowmem are screwed anyway when - * running over Xen! 
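
The ordering of the page-directory switch in pagetable_init() above is strict; schematically (an annotated constraint sketch, not new code):

	memcpy(new, old, ...);		/* build new pgd while old is live  */
	make_page_readonly(new);	/* Xen pins only page tables the    */
					/* guest cannot write directly      */
	xen_pgd_pin(__pa(new));		/* hypervisor validates + pins new  */
	load_cr3(new);			/* switch                           */
	xen_pgd_unpin(__pa(old));	/* old pgd released by Xen...       */
	make_page_writable(old);	/* ...then writable again and freed */
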
- */ - /*max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;*/ - low = max_low_pfn; - high = highend_pfn; - - /*if (low < max_dma)*/ - zones_size[ZONE_DMA] = low; - /*else*/ { - /*zones_size[ZONE_DMA] = max_dma;*/ - /*zones_size[ZONE_NORMAL] = low - max_dma;*/ -#ifdef CONFIG_HIGHMEM - zones_size[ZONE_HIGHMEM] = high - low; -#endif - } - free_area_init(zones_size); -} -#else -extern void zone_sizes_init(void); -#endif /* !CONFIG_DISCONTIGMEM */ - -static int disable_nx __initdata = 0; -u64 __supported_pte_mask = ~_PAGE_NX; - -/* - * noexec = on|off - * - * Control non executable mappings. - * - * on Enable - * off Disable - */ -void __init noexec_setup(const char *str) -{ - if (!strncmp(str, "on",2) && cpu_has_nx) { - __supported_pte_mask |= _PAGE_NX; - disable_nx = 0; - } else if (!strncmp(str,"off",3)) { - disable_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } -} - -int nx_enabled = 0; -#ifdef CONFIG_X86_PAE - -static void __init set_nx(void) -{ - unsigned int v[4], l, h; - - if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { - cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); - if ((v[3] & (1 << 20)) && !disable_nx) { - rdmsr(MSR_EFER, l, h); - l |= EFER_NX; - wrmsr(MSR_EFER, l, h); - nx_enabled = 1; - __supported_pte_mask |= _PAGE_NX; - } - } -} - -/* - * Enables/disables executability of a given kernel page and - * returns the previous setting. - */ -int __init set_kernel_exec(unsigned long vaddr, int enable) -{ - pte_t *pte; - int ret = 1; - - if (!nx_enabled) - goto out; - - pte = lookup_address(vaddr); - BUG_ON(!pte); - - if (!pte_exec_kernel(*pte)) - ret = 0; - - if (enable) - pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32)); - else - pte->pte_high |= 1 << (_PAGE_BIT_NX - 32); - __flush_tlb_all(); -out: - return ret; -} - -#endif - -/* - * paging_init() sets up the page tables - note that the first 8MB are - * already mapped by head.S. - * - * This routines also unmaps the page at virtual kernel address 0, so - * that we can trap those pesky NULL-reference errors in the kernel. - */ -void __init paging_init(void) -{ -#ifdef CONFIG_XEN_PHYSDEV_ACCESS - int i; -#endif - -#ifdef CONFIG_X86_PAE - set_nx(); - if (nx_enabled) - printk("NX (Execute Disable) protection: active\n"); -#endif - - pagetable_init(); - -#ifdef CONFIG_X86_PAE - /* - * We will bail out later - printk doesn't work right now so - * the user would just see a hanging kernel. - */ - if (cpu_has_pae) - set_in_cr4(X86_CR4_PAE); -#endif - __flush_tlb_all(); - - kmap_init(); - zone_sizes_init(); - - /* Switch to the real shared_info page, and clear the dummy page. */ - set_fixmap(FIX_SHARED_INFO, xen_start_info.shared_info); - HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); - memset(empty_zero_page, 0, sizeof(empty_zero_page)); - -#ifdef CONFIG_XEN_PHYSDEV_ACCESS - /* Setup mapping of lower 1st MB */ - for (i = 0; i < NR_FIX_ISAMAPS; i++) - if (xen_start_info.flags & SIF_PRIVILEGED) - set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE); - else - __set_fixmap(FIX_ISAMAP_BEGIN - i, - virt_to_machine(empty_zero_page), - PAGE_KERNEL_RO); -#endif -} - -/* - * Test if the WP bit works in supervisor mode. It isn't supported on 386's - * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This - * used to involve black magic jumps to work around some nasty CPU bugs, - * but fortunately the switch to using exceptions got rid of all that. - */ - -void __init test_wp_bit(void) -{ - printk("Checking if this processor honours the WP bit even in supervisor mode... 
"); - - /* Any page-aligned address will do, the test is non-destructive */ - __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY); - boot_cpu_data.wp_works_ok = do_test_wp_bit(); - clear_fixmap(FIX_WP_TEST); - - if (!boot_cpu_data.wp_works_ok) { - printk("No.\n"); -#ifdef CONFIG_X86_WP_WORKS_OK - panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!"); -#endif - } else { - printk("Ok.\n"); - } -} - -#ifndef CONFIG_DISCONTIGMEM -static void __init set_max_mapnr_init(void) -{ -#ifdef CONFIG_HIGHMEM - max_mapnr = num_physpages = highend_pfn; -#else - max_mapnr = num_physpages = max_low_pfn; -#endif -} -#define __free_all_bootmem() free_all_bootmem() -#else -#define __free_all_bootmem() free_all_bootmem_node(NODE_DATA(0)) -extern void set_max_mapnr_init(void); -#endif /* !CONFIG_DISCONTIGMEM */ - -static struct kcore_list kcore_mem, kcore_vmalloc; - -void __init mem_init(void) -{ - extern int ppro_with_ram_bug(void); - int codesize, reservedpages, datasize, initsize; - int tmp; - int bad_ppro; - unsigned long pfn; - -#ifndef CONFIG_DISCONTIGMEM - if (!mem_map) - BUG(); -#endif - - bad_ppro = ppro_with_ram_bug(); - -#ifdef CONFIG_HIGHMEM - /* check that fixmap and pkmap do not overlap */ - if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { - printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n"); - printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n", - PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START); - BUG(); - } -#endif - - set_max_mapnr_init(); - -#ifdef CONFIG_HIGHMEM - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE); -#else - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); -#endif - printk("vmalloc area: %lx-%lx, maxmem %lx\n", - VMALLOC_START,VMALLOC_END,MAXMEM); - BUG_ON(VMALLOC_START > VMALLOC_END); - - /* this will put all low memory onto the freelists */ - totalram_pages += __free_all_bootmem(); - /* XEN: init and count low-mem pages outside initial allocation. */ - for (pfn = xen_start_info.nr_pages; pfn < max_low_pfn; pfn++) { - ClearPageReserved(&mem_map[pfn]); - set_page_count(&mem_map[pfn], 1); - totalram_pages++; - } - - reservedpages = 0; - for (tmp = 0; tmp < max_low_pfn; tmp++) - /* - * Only count reserved RAM pages - */ - if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) - reservedpages++; - - set_highmem_pages_init(bad_ppro); - - codesize = (unsigned long) &_etext - (unsigned long) &_text; - datasize = (unsigned long) &_edata - (unsigned long) &_etext; - initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; - - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, - VMALLOC_END-VMALLOC_START); - - printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n", - (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), - num_physpages << (PAGE_SHIFT-10), - codesize >> 10, - reservedpages << (PAGE_SHIFT-10), - datasize >> 10, - initsize >> 10, - (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) - ); - -#ifdef CONFIG_X86_PAE - if (!cpu_has_pae) - panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!"); -#endif - if (boot_cpu_data.wp_works_ok < 0) - test_wp_bit(); - - /* - * Subtle. SMP is doing it's boot stuff late (because it has to - * fork idle threads) - but it also needs low mappings for the - * protected-mode entry to work. We zap these entries only after - * the WP-bit has been tested. 
- */ -#ifndef CONFIG_SMP - zap_low_mappings(); -#endif -} - -kmem_cache_t *pgd_cache; -kmem_cache_t *pmd_cache; - -void __init pgtable_cache_init(void) -{ - if (PTRS_PER_PMD > 1) { - pmd_cache = kmem_cache_create("pmd", - PTRS_PER_PMD*sizeof(pmd_t), - PTRS_PER_PMD*sizeof(pmd_t), - 0, - pmd_ctor, - NULL); - if (!pmd_cache) - panic("pgtable_cache_init(): cannot create pmd cache"); - } - pgd_cache = kmem_cache_create("pgd", - PTRS_PER_PGD*sizeof(pgd_t), - PTRS_PER_PGD*sizeof(pgd_t), - 0, - pgd_ctor, - pgd_dtor); - if (!pgd_cache) - panic("pgtable_cache_init(): Cannot create pgd cache"); -} - -/* - * This function cannot be __init, since exceptions don't work in that - * section. Put this after the callers, so that it cannot be inlined. - */ -static int noinline do_test_wp_bit(void) -{ - char tmp_reg; - int flag; - - __asm__ __volatile__( - " movb %0,%1 \n" - "1: movb %1,%0 \n" - " xorl %2,%2 \n" - "2: \n" - ".section __ex_table,\"a\"\n" - " .align 4 \n" - " .long 1b,2b \n" - ".previous \n" - :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)), - "=q" (tmp_reg), - "=r" (flag) - :"2" (1) - :"memory"); - - return flag; -} - -void free_initmem(void) -{ - unsigned long addr; - - addr = (unsigned long)(&__init_begin); - for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { - ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); - memset((void *)addr, 0xcc, PAGE_SIZE); - free_page(addr); - totalram_pages++; - } - printk (KERN_INFO "Freeing unused kernel memory: %dk freed\n", (__init_end - __init_begin) >> 10); -} - -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - if (start < end) - printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); - for (; start < end; start += PAGE_SIZE) { - ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); - free_page(start); - totalram_pages++; - } -} -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/mm/ioremap.c --- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/ioremap.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,442 +0,0 @@ -/* - * arch/i386/mm/ioremap.c - * - * Re-map IO memory to kernel address space so that we can access it. - * This is needed for high PCI addresses that aren't mapped in the - * 640k-1MB IO memory area on PC's - * - * (C) Copyright 1995 1996 Linus Torvalds - */ - -#include <linux/vmalloc.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <asm/io.h> -#include <asm/fixmap.h> -#include <asm/cacheflush.h> -#include <asm/tlbflush.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> - -#ifndef CONFIG_XEN_PHYSDEV_ACCESS - -void * __ioremap(unsigned long phys_addr, unsigned long size, - unsigned long flags) -{ - return NULL; -} - -void *ioremap_nocache (unsigned long phys_addr, unsigned long size) -{ - return NULL; -} - -void iounmap(volatile void __iomem *addr) -{ -} - -void __init *bt_ioremap(unsigned long phys_addr, unsigned long size) -{ - return NULL; -} - -void __init bt_iounmap(void *addr, unsigned long size) -{ -} - -#else - -/* - * Does @address reside within a non-highmem page that is local to this virtual - * machine (i.e., not an I/O page, nor a memory page belonging to another VM). - * See the comment that accompanies pte_pfn() in pgtable-2level.h to understand - * why this works. 
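do_test_wp_bit() above relies on the kernel's __ex_table machinery: the store at label 1 may fault, and the fixup entry resumes execution at label 2. Outside the kernel, the same probe-and-recover shape is usually built from sigsetjmp plus a SIGSEGV handler. The sketch below is only that user-space analogue under POSIX assumptions, not the kernel mechanism (page_is_writable is a made-up name, and long-jumping out of a SIGSEGV handler is the classic but fragile idiom):

    #include <setjmp.h>
    #include <signal.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    static sigjmp_buf env;

    static void on_segv(int sig)
    {
        (void)sig;
        siglongjmp(env, 1);             /* plays the role of the fixup label */
    }

    static int page_is_writable(volatile char *p)
    {
        struct sigaction sa, old;
        int writable = 0;

        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = on_segv;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGSEGV, &sa, &old);
        if (sigsetjmp(env, 1) == 0) {
            p[0] = p[0];                /* the probing store */
            writable = 1;               /* reached only if it didn't fault */
        }
        sigaction(SIGSEGV, &old, NULL);
        return writable;
    }

    int main(void)
    {
        char *ro = mmap(NULL, 4096, PROT_READ,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        printf("read-only page writable? %d\n", page_is_writable(ro));
        munmap(ro, 4096);
        return 0;
    }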
- */ -static inline int is_local_lowmem(unsigned long address) -{ - extern unsigned long max_low_pfn; - unsigned long mfn = address >> PAGE_SHIFT; - unsigned long pfn = mfn_to_pfn(mfn); - return ((pfn < max_low_pfn) && (pfn_to_mfn(pfn) == mfn)); -} - -/* - * Generic mapping function (not visible outside): - */ - -/* - * Remap an arbitrary physical address space into the kernel virtual - * address space. Needed when the kernel wants to access high addresses - * directly. - * - * NOTE! We need to allow non-page-aligned mappings too: we will obviously - * have to convert them into an offset in a page-aligned mapping, but the - * caller shouldn't need to know that small detail. - */ -void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags) -{ - void __iomem * addr; - struct vm_struct * area; - unsigned long offset, last_addr; - domid_t domid = DOMID_IO; - - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) - return NULL; - -#ifdef CONFIG_XEN_PRIVILEGED_GUEST - /* - * Don't remap the low PCI/ISA area, it's always mapped.. - */ - if (phys_addr >= 0x0 && last_addr < 0x100000) - return isa_bus_to_virt(phys_addr); -#endif - - /* - * Don't allow anybody to remap normal RAM that we're using.. - */ - if (is_local_lowmem(phys_addr)) { - char *t_addr, *t_end; - struct page *page; - - t_addr = bus_to_virt(phys_addr); - t_end = t_addr + (size - 1); - - for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) - if(!PageReserved(page)) - return NULL; - - domid = DOMID_SELF; - } - - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr+1) - phys_addr; - - /* - * Ok, go for it.. - */ - area = get_vm_area(size, VM_IOREMAP | (flags << 20)); - if (!area) - return NULL; - area->phys_addr = phys_addr; - addr = (void __iomem *) area->addr; - if (direct_remap_area_pages(&init_mm, (unsigned long) addr, phys_addr, - size, __pgprot(_PAGE_PRESENT | _PAGE_RW | - _PAGE_DIRTY | _PAGE_ACCESSED - | flags), domid)) { - vunmap((void __force *) addr); - return NULL; - } - return (void __iomem *) (offset + (char __iomem *)addr); -} - - -/** - * ioremap_nocache - map bus memory into CPU space - * @offset: bus address of the memory - * @size: size of the resource to map - * - * ioremap_nocache performs a platform specific sequence of operations to - * make bus memory CPU accessible via the readb/readw/readl/writeb/ - * writew/writel functions and the other mmio helpers. The returned - * address is not guaranteed to be usable directly as a virtual - * address. - * - * This version of ioremap ensures that the memory is marked uncachable - * on the CPU as well as honouring existing caching rules from things like - * the PCI bus. Note that there are other caches and buffers on many - * busses. In particular driver authors should read up on PCI writes - * - * It's useful if some control registers are in such an area and - * write combining or read caching is not desirable: - * - * Must be freed with iounmap. 
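__ioremap() above splits an arbitrary physical range into a page-aligned base, a page-multiple size, and an offset the caller adds back to the returned pointer. The arithmetic in isolation, as a self-checking sketch (ioremap_extent and map_extent are made-up names; 4 KiB pages assumed):

    #include <assert.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  ((uint64_t)1 << PAGE_SHIFT)
    #define PAGE_MASK  (~(PAGE_SIZE - 1))
    #define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)

    struct map_extent { uint64_t base, size, offset; };

    static struct map_extent ioremap_extent(uint64_t phys_addr, uint64_t len)
    {
        uint64_t last_addr = phys_addr + len - 1;
        struct map_extent e;

        e.offset = phys_addr & ~PAGE_MASK;  /* added back by the caller */
        e.base   = phys_addr & PAGE_MASK;
        e.size   = PAGE_ALIGN(last_addr + 1) - e.base;
        return e;
    }

    int main(void)
    {
        struct map_extent e = ioremap_extent(0x12345, 0x100);

        assert(e.base == 0x12000 && e.offset == 0x345 && e.size == 0x1000);
        return 0;
    }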
- */ - -void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) -{ - unsigned long last_addr; - void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD); - if (!p) - return p; - - /* Guaranteed to be > phys_addr, as per __ioremap() */ - last_addr = phys_addr + size - 1; - - if (is_local_lowmem(last_addr)) { - struct page *ppage = virt_to_page(bus_to_virt(phys_addr)); - unsigned long npages; - - phys_addr &= PAGE_MASK; - - /* This might overflow and become zero.. */ - last_addr = PAGE_ALIGN(last_addr); - - /* .. but that's ok, because modulo-2**n arithmetic will make - * the page-aligned "last - first" come out right. - */ - npages = (last_addr - phys_addr) >> PAGE_SHIFT; - - if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) { - iounmap(p); - p = NULL; - } - global_flush_tlb(); - } - - return p; -} - -void iounmap(volatile void __iomem *addr) -{ - struct vm_struct *p; - if ((void __force *) addr <= high_memory) - return; -#ifdef CONFIG_XEN_PRIVILEGED_GUEST - if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN)) - return; -#endif - p = remove_vm_area((void *) (PAGE_MASK & (unsigned long __force) addr)); - if (!p) { - printk("__iounmap: bad address %p\n", addr); - return; - } - - if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) { - /* p->size includes the guard page, but cpa doesn't like that */ - change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)), - (p->size - PAGE_SIZE) >> PAGE_SHIFT, - PAGE_KERNEL); - global_flush_tlb(); - } - kfree(p); -} - -void __init *bt_ioremap(unsigned long phys_addr, unsigned long size) -{ - unsigned long offset, last_addr; - unsigned int nrpages; - enum fixed_addresses idx; - - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) - return NULL; - -#ifdef CONFIG_XEN_PRIVILEGED_GUEST - /* - * Don't remap the low PCI/ISA area, it's always mapped.. - */ - if (phys_addr >= 0x0 && last_addr < 0x100000) - return isa_bus_to_virt(phys_addr); -#endif - - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr) - phys_addr; - - /* - * Mappings have to fit in the FIX_BTMAP area. - */ - nrpages = size >> PAGE_SHIFT; - if (nrpages > NR_FIX_BTMAPS) - return NULL; - - /* - * Ok, go for it.. - */ - idx = FIX_BTMAP_BEGIN; - while (nrpages > 0) { - set_fixmap(idx, phys_addr); - phys_addr += PAGE_SIZE; - --idx; - --nrpages; - } - return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN)); -} - -void __init bt_iounmap(void *addr, unsigned long size) -{ - unsigned long virt_addr; - unsigned long offset; - unsigned int nrpages; - enum fixed_addresses idx; - - virt_addr = (unsigned long)addr; - if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) - return; -#ifdef CONFIG_XEN_PRIVILEGED_GUEST - if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN)) - return; -#endif - offset = virt_addr & ~PAGE_MASK; - nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; - - idx = FIX_BTMAP_BEGIN; - while (nrpages > 0) { - clear_fixmap(idx); - --idx; - --nrpages; - } -} - -#endif /* CONFIG_XEN_PHYSDEV_ACCESS */ - -/* These hacky macros avoid phys->machine translations. 
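The "might overflow and become zero" remark in ioremap_nocache() above is worth seeing concretely: with 32-bit unsigned arithmetic, PAGE_ALIGN of an address in the topmost page wraps to 0, yet the page-aligned difference still yields the right page count modulo 2^32. A minimal self-checking sketch:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t first = 0xfffff000u;   /* base of the topmost 4 KiB page */
        uint32_t last  = 0u;            /* PAGE_ALIGN(0xffffffff) wrapped */
        uint32_t npages = (last - first) >> 12;

        assert(npages == 1);            /* still correct modulo 2^32 */
        return 0;
    }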
*/ -#define __direct_pte(x) ((pte_t) { (x) } ) -#define __direct_mk_pte(page_nr,pgprot) \ - __direct_pte(((page_nr) << PAGE_SHIFT) | pgprot_val(pgprot)) -#define direct_mk_pte_phys(physpage, pgprot) \ - __direct_mk_pte((physpage) >> PAGE_SHIFT, pgprot) - -static inline void direct_remap_area_pte(pte_t *pte, - unsigned long address, - unsigned long size, - mmu_update_t **v) -{ - unsigned long end; - - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; - if (address >= end) - BUG(); - - do { - (*v)->ptr = virt_to_machine(pte); - (*v)++; - address += PAGE_SIZE; - pte++; - } while (address && (address < end)); -} - -static inline int direct_remap_area_pmd(struct mm_struct *mm, - pmd_t *pmd, - unsigned long address, - unsigned long size, - mmu_update_t **v) -{ - unsigned long end; - - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - if (address >= end) - BUG(); - do { - pte_t *pte = (mm == &init_mm) ? - pte_alloc_kernel(mm, pmd, address) : - pte_alloc_map(mm, pmd, address); - if (!pte) - return -ENOMEM; - direct_remap_area_pte(pte, address, end - address, v); - pte_unmap(pte); - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); - return 0; -} - -int __direct_remap_area_pages(struct mm_struct *mm, - unsigned long address, - unsigned long size, - mmu_update_t *v) -{ - pgd_t * dir; - unsigned long end = address + size; - int error; - - dir = pgd_offset(mm, address); - if (address >= end) - BUG(); - spin_lock(&mm->page_table_lock); - do { - pud_t *pud; - pmd_t *pmd; - - error = -ENOMEM; - pud = pud_alloc(mm, dir, address); - if (!pud) - break; - pmd = pmd_alloc(mm, pud, address); - if (!pmd) - break; - error = 0; - direct_remap_area_pmd(mm, pmd, address, end - address, &v); - address = (address + PGDIR_SIZE) & PGDIR_MASK; - dir++; - - } while (address && (address < end)); - spin_unlock(&mm->page_table_lock); - return error; -} - - -int direct_remap_area_pages(struct mm_struct *mm, - unsigned long address, - unsigned long machine_addr, - unsigned long size, - pgprot_t prot, - domid_t domid) -{ - int i; - unsigned long start_address; -#define MAX_DIRECTMAP_MMU_QUEUE 130 - mmu_update_t u[MAX_DIRECTMAP_MMU_QUEUE], *v = u; - - start_address = address; - - flush_cache_all(); - - for (i = 0; i < size; i += PAGE_SIZE) { - if ((v - u) == MAX_DIRECTMAP_MMU_QUEUE) { - /* Fill in the PTE pointers. */ - __direct_remap_area_pages(mm, - start_address, - address-start_address, - u); - - if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0) - return -EFAULT; - v = u; - start_address = address; - } - - /* - * Fill in the machine address: PTE ptr is done later by - * __direct_remap_area_pages(). 
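direct_remap_area_pages() above batches its MMU updates into a fixed 130-entry queue, flushing whenever the queue fills and once more at the end for the remainder. The same pattern with the hypercall stripped out, as a runnable sketch (flush() is only a stand-in for HYPERVISOR_mmu_update):

    #include <stdio.h>

    #define BATCH 130                   /* mirrors MAX_DIRECTMAP_MMU_QUEUE */

    struct update { unsigned long ptr, val; };

    static void flush(struct update *q, int n)
    {
        /* stand-in for the HYPERVISOR_mmu_update() call */
        printf("flushing %d updates\n", n);
    }

    int main(void)
    {
        struct update q[BATCH];
        int n = 0, i;

        for (i = 0; i < 300; i++) {
            if (n == BATCH) {           /* queue full: flush and restart */
                flush(q, n);
                n = 0;
            }
            q[n].ptr = i;
            q[n].val = i;
            n++;
        }
        if (n)
            flush(q, n);                /* flush the partial tail */
        return 0;
    }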
- */ - v->val = (machine_addr & PAGE_MASK) | pgprot_val(prot); - - machine_addr += PAGE_SIZE; - address += PAGE_SIZE; - v++; - } - - if (v != u) { - /* get the ptep's filled in */ - __direct_remap_area_pages(mm, - start_address, - address-start_address, - u); - if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)) - return -EFAULT; - } - - flush_tlb_all(); - - return 0; -} - -EXPORT_SYMBOL(direct_remap_area_pages); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c --- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,525 +0,0 @@ -/* - * linux/arch/i386/mm/pgtable.c - */ - -#include <linux/config.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/mm.h> -#include <linux/swap.h> -#include <linux/smp.h> -#include <linux/highmem.h> -#include <linux/slab.h> -#include <linux/pagemap.h> -#include <linux/spinlock.h> - -#include <asm/system.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> -#include <asm/fixmap.h> -#include <asm/e820.h> -#include <asm/tlb.h> -#include <asm/tlbflush.h> -#include <asm/io.h> -#include <asm/mmu_context.h> - -#include <asm-xen/foreign_page.h> - -void show_mem(void) -{ - int total = 0, reserved = 0; - int shared = 0, cached = 0; - int highmem = 0; - struct page *page; - pg_data_t *pgdat; - unsigned long i; - - printk("Mem-info:\n"); - show_free_areas(); - printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); - for_each_pgdat(pgdat) { - for (i = 0; i < pgdat->node_spanned_pages; ++i) { - page = pgdat->node_mem_map + i; - total++; - if (PageHighMem(page)) - highmem++; - if (PageReserved(page)) - reserved++; - else if (PageSwapCache(page)) - cached++; - else if (page_count(page)) - shared += page_count(page) - 1; - } - } - printk("%d pages of RAM\n", total); - printk("%d pages of HIGHMEM\n",highmem); - printk("%d reserved pages\n",reserved); - printk("%d pages shared\n",shared); - printk("%d pages swap cached\n",cached); -} - -/* - * Associate a virtual page frame with a given physical page frame - * and protection flags for that frame. - */ -static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - pgd = swapper_pg_dir + pgd_index(vaddr); - if (pgd_none(*pgd)) { - BUG(); - return; - } - pud = pud_offset(pgd, vaddr); - if (pud_none(*pud)) { - BUG(); - return; - } - pmd = pmd_offset(pud, vaddr); - if (pmd_none(*pmd)) { - BUG(); - return; - } - pte = pte_offset_kernel(pmd, vaddr); - /* <pfn,flags> stored as-is, to permit clearing entries */ - set_pte(pte, pfn_pte(pfn, flags)); - - /* - * It's enough to flush this one mapping. - * (PGE mappings get flushed as well) - */ - __flush_tlb_one(vaddr); -} - -/* - * Associate a virtual page frame with a given physical page frame - * and protection flags for that frame. - */ -static void set_pte_pfn_ma(unsigned long vaddr, unsigned long pfn, - pgprot_t flags) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - pgd = swapper_pg_dir + pgd_index(vaddr); - if (pgd_none(*pgd)) { - BUG(); - return; - } - pud = pud_offset(pgd, vaddr); - if (pud_none(*pud)) { - BUG(); - return; - } - pmd = pmd_offset(pud, vaddr); - if (pmd_none(*pmd)) { - BUG(); - return; - } - pte = pte_offset_kernel(pmd, vaddr); - /* <pfn,flags> stored as-is, to permit clearing entries */ - set_pte(pte, pfn_pte_ma(pfn, flags)); - - /* - * It's enough to flush this one mapping. 
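The pgd/pud/pmd/pte descent in set_pte_pfn() above is driven by fixed bit fields of the virtual address. Here is how an address decomposes into the two live indices on non-PAE i386 (the folded pud/pmd levels contribute nothing), as a self-checking sketch:

    #include <assert.h>

    #define PAGE_SHIFT  12
    #define PGDIR_SHIFT 22
    #define PTRS        1024

    static unsigned int pgd_idx(unsigned long va)
    {
        return (va >> PGDIR_SHIFT) & (PTRS - 1);    /* directory slot */
    }

    static unsigned int pte_idx(unsigned long va)
    {
        return (va >> PAGE_SHIFT) & (PTRS - 1);     /* pte within table */
    }

    int main(void)
    {
        unsigned long va = 0xc0101234ul;    /* a typical kernel address */

        assert(pgd_idx(va) == 0x300);
        assert(pte_idx(va) == 0x101);
        return 0;
    }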
- * (PGE mappings get flushed as well) - */ - __flush_tlb_one(vaddr); -} - -/* - * Associate a large virtual page frame with a given physical page frame - * and protection flags for that frame. pfn is for the base of the page, - * vaddr is what the page gets mapped to - both must be properly aligned. - * The pmd must already be instantiated. Assumes PAE mode. - */ -void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - - if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */ - printk ("set_pmd_pfn: vaddr misaligned\n"); - return; /* BUG(); */ - } - if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */ - printk ("set_pmd_pfn: pfn misaligned\n"); - return; /* BUG(); */ - } - pgd = swapper_pg_dir + pgd_index(vaddr); - if (pgd_none(*pgd)) { - printk ("set_pmd_pfn: pgd_none\n"); - return; /* BUG(); */ - } - pud = pud_offset(pgd, vaddr); - pmd = pmd_offset(pud, vaddr); - set_pmd(pmd, pfn_pmd(pfn, flags)); - /* - * It's enough to flush this one mapping. - * (PGE mappings get flushed as well) - */ - __flush_tlb_one(vaddr); -} - -void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) -{ - unsigned long address = __fix_to_virt(idx); - - if (idx >= __end_of_fixed_addresses) { - BUG(); - return; - } - switch (idx) { - case FIX_WP_TEST: - case FIX_VSYSCALL: -#ifdef CONFIG_X86_F00F_BUG - case FIX_F00F_IDT: -#endif - set_pte_pfn(address, phys >> PAGE_SHIFT, flags); - break; - default: - set_pte_pfn_ma(address, phys >> PAGE_SHIFT, flags); - break; - } -} - -pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) -{ - pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); - if (pte) - make_page_readonly(pte); - return pte; -} - -struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) -{ - struct page *pte; - -#ifdef CONFIG_HIGHPTE - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); -#else - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); - if (pte) { - SetPageForeign(pte, pte_free); - set_page_count(pte, 1); - } -#endif - - return pte; -} - -void pte_free(struct page *pte) -{ - unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT); - - if (!pte_write(*virt_to_ptep(va))) - HYPERVISOR_update_va_mapping( - va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0); - - ClearPageForeign(pte); - set_page_count(pte, 1); - - __free_page(pte); -} - -void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags) -{ - memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); -} - -/* - * List of all pgd's needed for non-PAE so it can invalidate entries - * in both cached and uncached pgd's; not needed for PAE since the - * kernel pmd is shared. If PAE were not to share the pmd a similar - * tactic would be needed. This is essentially codepath-based locking - * against pageattr.c; it is the unique case in which a valid change - * of kernel pagetables can't be lazily synchronized by vmalloc faults. - * vmalloc faults work because attached pagetables are never freed. - * The locking scheme was chosen on the basis of manfred's - * recommendations and having no core impact whatsoever. 
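set_pmd_pfn() above refuses misaligned arguments before installing a large mapping: the virtual address must sit on a PMD boundary and the pfn must start an aligned run of frames. The checks in isolation, shown with non-PAE i386 constants for simplicity (under PAE the PMD_SHIFT would be 21); large_map_args_ok is a made-up name:

    #include <assert.h>

    #define PMD_SHIFT    22                 /* non-PAE i386: 4 MiB mappings */
    #define PMD_SIZE     (1UL << PMD_SHIFT)
    #define PTRS_PER_PTE 1024

    static int large_map_args_ok(unsigned long vaddr, unsigned long pfn)
    {
        if (vaddr & (PMD_SIZE - 1))         /* vaddr on a PMD boundary? */
            return 0;
        if (pfn & (PTRS_PER_PTE - 1))       /* pfn starts an aligned run? */
            return 0;
        return 1;
    }

    int main(void)
    {
        assert(large_map_args_ok(0xc0400000ul, 0x400));
        assert(!large_map_args_ok(0xc0401000ul, 0x400));
        assert(!large_map_args_ok(0xc0400000ul, 0x401));
        return 0;
    }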
- * -- wli - */ -DEFINE_SPINLOCK(pgd_lock); -struct page *pgd_list; - -static inline void pgd_list_add(pgd_t *pgd) -{ - struct page *page = virt_to_page(pgd); - page->index = (unsigned long)pgd_list; - if (pgd_list) - pgd_list->private = (unsigned long)&page->index; - pgd_list = page; - page->private = (unsigned long)&pgd_list; -} - -static inline void pgd_list_del(pgd_t *pgd) -{ - struct page *next, **pprev, *page = virt_to_page(pgd); - next = (struct page *)page->index; - pprev = (struct page **)page->private; - *pprev = next; - if (next) - next->private = (unsigned long)pprev; -} - -void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) -{ - unsigned long flags; - - if (PTRS_PER_PMD == 1) - spin_lock_irqsave(&pgd_lock, flags); - - memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - - if (PTRS_PER_PMD > 1) - return; - - pgd_list_add(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); -} - -/* never called when PTRS_PER_PMD > 1 */ -void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused) -{ - unsigned long flags; /* can be called from interrupt context */ - - if (PTRS_PER_PMD > 1) - return; - - spin_lock_irqsave(&pgd_lock, flags); - pgd_list_del(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); -} - -pgd_t *pgd_alloc(struct mm_struct *mm) -{ - int i; - pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); - - if (PTRS_PER_PMD == 1 || !pgd) - return pgd; - - for (i = 0; i < USER_PTRS_PER_PGD; ++i) { - pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); - if (!pmd) - goto out_oom; - set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); - } - return pgd; - -out_oom: - for (i--; i >= 0; i--) - kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); - kmem_cache_free(pgd_cache, pgd); - return NULL; -} - -void pgd_free(pgd_t *pgd) -{ - int i; - pte_t *ptep = virt_to_ptep(pgd); - - if (!pte_write(*ptep)) { - xen_pgd_unpin(__pa(pgd)); - HYPERVISOR_update_va_mapping( - (unsigned long)pgd, - pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL), - 0); - } - - /* in the PAE case user pgd entries are overwritten before usage */ - if (PTRS_PER_PMD > 1) - for (i = 0; i < USER_PTRS_PER_PGD; ++i) - kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); - /* in the non-PAE case, clear_page_range() clears user pgd entries */ - kmem_cache_free(pgd_cache, pgd); -} - -#ifndef CONFIG_XEN_SHADOW_MODE -void make_lowmem_page_readonly(void *va) -{ - pte_t *pte = virt_to_ptep(va); - set_pte(pte, pte_wrprotect(*pte)); -} - -void make_lowmem_page_writable(void *va) -{ - pte_t *pte = virt_to_ptep(va); - set_pte(pte, pte_mkwrite(*pte)); -} - -void make_page_readonly(void *va) -{ - pte_t *pte = virt_to_ptep(va); - set_pte(pte, pte_wrprotect(*pte)); - if ( (unsigned long)va >= (unsigned long)high_memory ) - { - unsigned long phys; - phys = machine_to_phys(*(unsigned long *)pte & PAGE_MASK); -#ifdef CONFIG_HIGHMEM - if ( (phys >> PAGE_SHIFT) < highstart_pfn ) -#endif - make_lowmem_page_readonly(phys_to_virt(phys)); - } -} - -void make_page_writable(void *va) -{ - pte_t *pte = virt_to_ptep(va); - set_pte(pte, pte_mkwrite(*pte)); - if ( (unsigned long)va >= (unsigned long)high_memory ) - { - unsigned long phys; - phys = machine_to_phys(*(unsigned long *)pte & PAGE_MASK); -#ifdef CONFIG_HIGHMEM - if ( (phys >> PAGE_SHIFT) < highstart_pfn ) -#endif - make_lowmem_page_writable(phys_to_virt(phys)); - } -} - -void make_pages_readonly(void *va, unsigned int nr) -{ - while ( nr-- 
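pgd_list_add()/pgd_list_del() above keep a singly linked list with a back-pointer-to-pointer ("pprev") so that unlinking a node needs no list walk. The same idiom with an ordinary struct instead of struct page fields, as a self-checking sketch:

    #include <assert.h>
    #include <stddef.h>

    struct node {
        struct node *next;
        struct node **pprev;    /* points at whatever points at us */
    };

    static struct node *head;

    static void node_add(struct node *n)
    {
        n->next = head;
        if (head)
            head->pprev = &n->next;
        head = n;
        n->pprev = &head;
    }

    static void node_del(struct node *n)
    {
        *n->pprev = n->next;    /* unlink without knowing our predecessor */
        if (n->next)
            n->next->pprev = n->pprev;
    }

    int main(void)
    {
        struct node a = { 0 }, b = { 0 };

        node_add(&a);
        node_add(&b);
        node_del(&a);
        assert(head == &b && b.next == NULL);
        node_del(&b);
        assert(head == NULL);
        return 0;
    }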
!= 0 ) - { - make_page_readonly(va); - va = (void *)((unsigned long)va + PAGE_SIZE); - } -} - -void make_pages_writable(void *va, unsigned int nr) -{ - while ( nr-- != 0 ) - { - make_page_writable(va); - va = (void *)((unsigned long)va + PAGE_SIZE); - } -} -#endif /* CONFIG_XEN_SHADOW_MODE */ - -LIST_HEAD(mm_unpinned); -DEFINE_SPINLOCK(mm_unpinned_lock); - -static inline void mm_walk_set_prot(void *pt, pgprot_t flags) -{ - struct page *page = virt_to_page(pt); - unsigned long pfn = page_to_pfn(page); - - if (PageHighMem(page)) - return; - HYPERVISOR_update_va_mapping( - (unsigned long)__va(pfn << PAGE_SHIFT), - pfn_pte(pfn, flags), 0); -} - -static void mm_walk(struct mm_struct *mm, pgprot_t flags) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - int g,u,m; - - pgd = mm->pgd; - for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) { - if (pgd_none(*pgd)) - continue; - pud = pud_offset(pgd, 0); - if (PTRS_PER_PUD > 1) /* not folded */ - mm_walk_set_prot(pud,flags); - for (u = 0; u < PTRS_PER_PUD; u++, pud++) { - if (pud_none(*pud)) - continue; - pmd = pmd_offset(pud, 0); - if (PTRS_PER_PMD > 1) /* not folded */ - mm_walk_set_prot(pmd,flags); - for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { - if (pmd_none(*pmd)) - continue; - pte = pte_offset_kernel(pmd,0); - mm_walk_set_prot(pte,flags); - } - } - } -} - -void mm_pin(struct mm_struct *mm) -{ - spin_lock(&mm->page_table_lock); - - mm_walk(mm, PAGE_KERNEL_RO); - HYPERVISOR_update_va_mapping( - (unsigned long)mm->pgd, - pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO), - UVMF_TLB_FLUSH); - xen_pgd_pin(__pa(mm->pgd)); - mm->context.pinned = 1; - spin_lock(&mm_unpinned_lock); - list_del(&mm->context.unpinned); - spin_unlock(&mm_unpinned_lock); - - spin_unlock(&mm->page_table_lock); -} - -void mm_unpin(struct mm_struct *mm) -{ - spin_lock(&mm->page_table_lock); - - xen_pgd_unpin(__pa(mm->pgd)); - HYPERVISOR_update_va_mapping( - (unsigned long)mm->pgd, - pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0); - mm_walk(mm, PAGE_KERNEL); - xen_tlb_flush(); - mm->context.pinned = 0; - spin_lock(&mm_unpinned_lock); - list_add(&mm->context.unpinned, &mm_unpinned); - spin_unlock(&mm_unpinned_lock); - - spin_unlock(&mm->page_table_lock); -} - -void mm_pin_all(void) -{ - while (!list_empty(&mm_unpinned)) - mm_pin(list_entry(mm_unpinned.next, struct mm_struct, - context.unpinned)); -} - -void _arch_exit_mmap(struct mm_struct *mm) -{ - struct task_struct *tsk = current; - - task_lock(tsk); - - /* - * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() - * *much* faster this way, as no tlb flushes means bigger wrpt batches. 
- */ - if ( tsk->active_mm == mm ) - { - tsk->active_mm = &init_mm; - atomic_inc(&init_mm.mm_count); - - switch_mm(mm, &init_mm, tsk); - - atomic_dec(&mm->mm_count); - BUG_ON(atomic_read(&mm->mm_count) == 0); - } - - task_unlock(tsk); - - if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) ) - mm_unpin(mm); -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/pci/Makefile --- a/linux-2.6.11-xen-sparse/arch/xen/i386/pci/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,32 +0,0 @@ -XENARCH := $(subst ",,$(CONFIG_XENARCH)) - -CFLAGS += -Iarch/$(XENARCH)/pci - -c-obj-y := i386.o - -c-obj-$(CONFIG_PCI_BIOS) += pcbios.o -c-obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o -c-obj-$(CONFIG_PCI_DIRECT) += direct.o - -c-pci-y := fixup.o -c-pci-$(CONFIG_ACPI_PCI) += acpi.o -c-pci-y += legacy.o -# Make sure irq.o gets linked in after legacy.o -l-pci-y += irq.o - -c-pci-$(CONFIG_X86_VISWS) := visws.o fixup.o -pci-$(CONFIG_X86_VISWS) := -c-pci-$(CONFIG_X86_NUMAQ) := numa.o -pci-$(CONFIG_X86_NUMAQ) := irq.o - -obj-y += $(pci-y) -c-obj-y += $(c-pci-y) common.o - -c-link := - -$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)): - @ln -fsn $(srctree)/arch/i386/pci/$(notdir $@) $@ - -obj-y += $(c-obj-y) $(l-pci-y) - -clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link)) diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/i386/pci/irq.c --- a/linux-2.6.11-xen-sparse/arch/xen/i386/pci/irq.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,1125 +0,0 @@ -/* - * Low-Level PCI Support for PC -- Routing of Interrupts - * - * (c) 1999--2000 Martin Mares <mj@xxxxxx> - */ - -#include <linux/config.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/pci.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/interrupt.h> -#include <linux/irq.h> -#include <linux/dmi.h> -#include <asm/io.h> -#include <asm/smp.h> -#include <asm/io_apic.h> -#include <asm/hw_irq.h> -#include <linux/acpi.h> - -#include "pci.h" - -#define DBG printk - -#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24)) -#define PIRQ_VERSION 0x0100 - -static int broken_hp_bios_irq9; -static int acer_tm360_irqrouting; - -static struct irq_routing_table *pirq_table; - -static int pirq_enable_irq(struct pci_dev *dev); - -/* - * Never use: 0, 1, 2 (timer, keyboard, and cascade) - * Avoid using: 13, 14 and 15 (FP error and IDE). - * Penalize: 3, 4, 6, 7, 12 (known ISA uses: serial, floppy, parallel and mouse) - */ -unsigned int pcibios_irq_mask = 0xfff8; - -static int pirq_penalty[16] = { - 1000000, 1000000, 1000000, 1000, 1000, 0, 1000, 1000, - 0, 0, 0, 0, 1000, 100000, 100000, 100000 -}; - -struct irq_router { - char *name; - u16 vendor, device; - int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq); - int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new); -}; - -struct irq_router_handler { - u16 vendor; - int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device); -}; - -int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL; - -/* - * Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table. 
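The routine that follows accepts a candidate $PIR table only when the signature matches, the size is a 16-byte multiple no smaller than the header, and all bytes sum to zero modulo 256. Those validity rules applied to a caller-supplied buffer, as a sketch (pirq_table_valid is a made-up name; 32 bytes is this layout's header size):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static int pirq_table_valid(const uint8_t *tbl, size_t size)
    {
        uint8_t sum = 0;
        size_t i;

        /* "$PIR", a 16-byte multiple, at least the 32-byte header */
        if (size < 32 || size % 16 || memcmp(tbl, "$PIR", 4) != 0)
            return 0;
        for (i = 0; i < size; i++)
            sum += tbl[i];              /* checksum byte makes this 0 */
        return sum == 0;
    }

    int main(void)
    {
        uint8_t tbl[32] = "$PIR";
        uint8_t sum = 0;
        size_t i;

        for (i = 0; i < 31; i++)
            sum += tbl[i];
        tbl[31] = (uint8_t)-sum;        /* fix up the checksum byte */
        assert(pirq_table_valid(tbl, 32));
        return 0;
    }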
- */ - -static struct irq_routing_table * __init pirq_find_routing_table(void) -{ - u8 *addr; - struct irq_routing_table *rt; - int i; - u8 sum; - -#ifdef CONFIG_XEN_PRIVILEGED_GUEST - for(addr = (u8 *) isa_bus_to_virt(0xf0000); addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) { - rt = (struct irq_routing_table *) addr; - if (rt->signature != PIRQ_SIGNATURE || - rt->version != PIRQ_VERSION || - rt->size % 16 || - rt->size < sizeof(struct irq_routing_table)) - continue; - sum = 0; - for(i=0; i<rt->size; i++) - sum += addr[i]; - if (!sum) { - DBG("PCI: Interrupt Routing Table found at 0x%p\n", rt); - return rt; - } - } -#endif - - return NULL; -} - -/* - * If we have a IRQ routing table, use it to search for peer host - * bridges. It's a gross hack, but since there are no other known - * ways how to get a list of buses, we have to go this way. - */ - -static void __init pirq_peer_trick(void) -{ - struct irq_routing_table *rt = pirq_table; - u8 busmap[256]; - int i; - struct irq_info *e; - - memset(busmap, 0, sizeof(busmap)); - for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) { - e = &rt->slots[i]; -#ifdef DEBUG - { - int j; - DBG("%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot); - for(j=0; j<4; j++) - DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap); - DBG("\n"); - } -#endif - busmap[e->bus] = 1; - } - for(i = 1; i < 256; i++) { - if (!busmap[i] || pci_find_bus(0, i)) - continue; - if (pci_scan_bus(i, &pci_root_ops, NULL)) - printk(KERN_INFO "PCI: Discovered primary peer bus %02x [IRQ]\n", i); - } - pcibios_last_bus = -1; -} - -/* - * Code for querying and setting of IRQ routes on various interrupt routers. - */ - -void eisa_set_level_irq(unsigned int irq) -{ - unsigned char mask = 1 << (irq & 7); - unsigned int port = 0x4d0 + (irq >> 3); - unsigned char val; - static u16 eisa_irq_mask; - - if (irq >= 16 || (1 << irq) & eisa_irq_mask) - return; - - eisa_irq_mask |= (1 << irq); - printk("PCI: setting IRQ %u as level-triggered\n", irq); - val = inb(port); - if (!(val & mask)) { - DBG(" -> edge"); - outb(val | mask, port); - } -} - -/* - * Common IRQ routing practice: nybbles in config space, - * offset by some magic constant. - */ -static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr) -{ - u8 x; - unsigned reg = offset + (nr >> 1); - - pci_read_config_byte(router, reg, &x); - return (nr & 1) ? (x >> 4) : (x & 0xf); -} - -static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val) -{ - u8 x; - unsigned reg = offset + (nr >> 1); - - pci_read_config_byte(router, reg, &x); - x = (nr & 1) ? ((x & 0x0f) | (val << 4)) : ((x & 0xf0) | val); - pci_write_config_byte(router, reg, x); -} - -/* - * ALI pirq entries are damn ugly, and completely undocumented. - * This has been figured out from pirq tables, and it's not a pretty - * picture. 
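read_config_nybble()/write_config_nybble() above pack two 4-bit link values per byte: even-numbered links live in bits 3:0, odd-numbered links in bits 7:4. The same packing on a plain byte array so it can be tested outside the kernel, as a self-checking sketch (nybble_read/nybble_write are made-up names):

    #include <assert.h>
    #include <stdint.h>

    static unsigned int nybble_read(const uint8_t *regs, unsigned int offset,
                                    unsigned int nr)
    {
        uint8_t x = regs[offset + (nr >> 1)];
        return (nr & 1) ? (x >> 4) : (x & 0xf);
    }

    static void nybble_write(uint8_t *regs, unsigned int offset,
                             unsigned int nr, unsigned int val)
    {
        uint8_t *x = &regs[offset + (nr >> 1)];
        *x = (nr & 1) ? ((*x & 0x0f) | (val << 4)) : ((*x & 0xf0) | val);
    }

    int main(void)
    {
        uint8_t regs[256] = { 0 };

        nybble_write(regs, 0x48, 3, 0xb);   /* odd link: high nybble */
        assert(regs[0x49] == 0xb0);
        assert(nybble_read(regs, 0x48, 3) == 0xb);
        nybble_write(regs, 0x48, 2, 0x5);   /* even link: low nybble */
        assert(regs[0x49] == 0xb5);
        return 0;
    }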
- */ -static int pirq_ali_get(struct pci_dev *router, struct pci_dev *dev, int pirq) -{ - static unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 }; - - return irqmap[read_config_nybble(router, 0x48, pirq-1)]; -} - -static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) -{ - static unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 }; - unsigned int val = irqmap[irq]; - - if (val) { - write_config_nybble(router, 0x48, pirq-1, val); - return 1; - } - return 0; -} - -/* - * The Intel PIIX4 pirq rules are fairly simple: "pirq" is - * just a pointer to the config space. - */ -static int pirq_piix_get(struct pci_dev *router, struct pci_dev *dev, int pirq) -{ - u8 x; - - pci_read_config_byte(router, pirq, &x); - return (x < 16) ? x : 0; -} - -static int pirq_piix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) -{ - pci_write_config_byte(router, pirq, irq); - return 1; -} - -/* - * The VIA pirq rules are nibble-based, like ALI, - * but without the ugly irq number munging. - * However, PIRQD is in the upper instead of lower 4 bits. - */ -static int pirq_via_get(struct pci_dev *router, struct pci_dev *dev, int pirq) -{ - return read_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq); -} - -static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) -{ - write_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq, irq); - return 1; -} - -/* - * ITE 8330G pirq rules are nibble-based - * FIXME: pirqmap may be { 1, 0, 3, 2 }, - * 2+3 are both mapped to irq 9 on my system - */ -static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq) -{ - static unsigned char pirqmap[4] = { 1, 0, 2, 3 }; - return read_config_nybble(router,0x43, pirqmap[pirq-1]); -} - -static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) -{ - static unsigned char pirqmap[4] = { 1, 0, 2, 3 }; - write_config_nybble(router, 0x43, pirqmap[pirq-1], irq); - return 1; -} - -/* - * OPTI: high four bits are nibble pointer.. - * I wonder what the low bits do? - */ -static int pirq_opti_get(struct pci_dev *router, struct pci_dev *dev, int pirq) -{ - return read_config_nybble(router, 0xb8, pirq >> 4); -} - -static int pirq_opti_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) -{ - write_config_nybble(router, 0xb8, pirq >> 4, irq); - return 1; -} - -/* - * Cyrix: nibble offset 0x5C - * 0x5C bits 7:4 is INTB bits 3:0 is INTA - * 0x5D bits 7:4 is INTD bits 3:0 is INTC - */ -static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq) -{ - return read_config_nybble(router, 0x5C, (pirq-1)^1); -} - -static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) -{ - write_config_nybble(router, 0x5C, (pirq-1)^1, irq); - return 1; -} - -/* - * PIRQ routing for SiS 85C503 router used in several SiS chipsets. - * We have to deal with the following issues here: - * - vendors have different ideas about the meaning of link values - * - some onboard devices (integrated in the chipset) have special - * links and are thus routed differently (i.e. 
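The two ALI translation tables above are partial inverses of one another: for every IRQ the router can express, writing irqmap[irq] into the nybble and reading it back through the read-side table must return the original IRQ. A standalone check of that round trip:

    #include <assert.h>

    static const unsigned char ali_read_map[16] =
        { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
    static const unsigned char ali_write_map[16] =
        { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };

    int main(void)
    {
        int irq;

        for (irq = 1; irq < 16; irq++) {
            unsigned char reg = ali_write_map[irq];

            if (reg)                    /* 0 means "cannot route this IRQ" */
                assert(ali_read_map[reg] == irq);
        }
        return 0;
    }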
not via PCI INTA-INTD) - * - different revision of the router have a different layout for - * the routing registers, particularly for the onchip devices - * - * For all routing registers the common thing is we have one byte - * per routeable link which is defined as: - * bit 7 IRQ mapping enabled (0) or disabled (1) - * bits [6:4] reserved (sometimes used for onchip devices) - * bits [3:0] IRQ to map to - * allowed: 3-7, 9-12, 14-15 - * reserved: 0, 1, 2, 8, 13 - * - * The config-space registers located at 0x41/0x42/0x43/0x44 are - * always used to route the normal PCI INT A/B/C/D respectively. - * Apparently there are systems implementing PCI routing table using - * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D. - * We try our best to handle both link mappings. - * - * Currently (2003-05-21) it appears most SiS chipsets follow the - * definition of routing registers from the SiS-5595 southbridge. - * According to the SiS 5595 datasheets the revision id's of the - * router (ISA-bridge) should be 0x01 or 0xb0. - * - * Furthermore we've also seen lspci dumps with revision 0x00 and 0xb1. - * Looks like these are used in a number of SiS 5xx/6xx/7xx chipsets. - * They seem to work with the current routing code. However there is - * some concern because of the two USB-OHCI HCs (original SiS 5595 - * had only one). YMMV. - * - * Onchip routing for router rev-id 0x01/0xb0 and probably 0x00/0xb1: - * - * 0x61: IDEIRQ: - * bits [6:5] must be written 01 - * bit 4 channel-select primary (0), secondary (1) - * - * 0x62: USBIRQ: - * bit 6 OHCI function disabled (0), enabled (1) - * - * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved - * - * 0x7e: Data Acq. Module IRQ - bits 4-6 reserved - * - * We support USBIRQ (in addition to INTA-INTD) and keep the - * IDE, ACPI and DAQ routing untouched as set by the BIOS. - * - * Currently the only reported exception is the new SiS 65x chipset - * which includes the SiS 69x southbridge. Here we have the 85C503 - * router revision 0x04 and there are changes in the register layout - * mostly related to the different USB HCs with USB 2.0 support. - * - * Onchip routing for router rev-id 0x04 (try-and-error observation) - * - * 0x60/0x61/0x62/0x63: 1xEHCI and 3xOHCI (companion) USB-HCs - * bit 6-4 are probably unused, not like 5595 - */ - -#define PIRQ_SIS_IRQ_MASK 0x0f -#define PIRQ_SIS_IRQ_DISABLE 0x80 -#define PIRQ_SIS_USB_ENABLE 0x40 - -static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq) -{ - u8 x; - int reg; - - reg = pirq; - if (reg >= 0x01 && reg <= 0x04) - reg += 0x40; - pci_read_config_byte(router, reg, &x); - return (x & PIRQ_SIS_IRQ_DISABLE) ? 0 : (x & PIRQ_SIS_IRQ_MASK); -} - -static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) -{ - u8 x; - int reg; - - reg = pirq; - if (reg >= 0x01 && reg <= 0x04) - reg += 0x40; - pci_read_config_byte(router, reg, &x); - x &= ~(PIRQ_SIS_IRQ_MASK | PIRQ_SIS_IRQ_DISABLE); - x |= irq ? irq: PIRQ_SIS_IRQ_DISABLE; - pci_write_config_byte(router, reg, x); - return 1; -} - - -/* - * VLSI: nibble offset 0x74 - educated guess due to routing table and - * config space of VLSI 82C534 PCI-bridge/router (1004:0102) - * Tested on HP OmniBook 800 covering PIRQ 1, 2, 4, 8 for onboard - * devices, PIRQ 3 for non-pci(!) soundchip and (untested) PIRQ 6 - * for the busbridge to the docking station. 
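pirq_sis_get()/pirq_sis_set() above fold the two observed link conventions together (links 0x01-0x04 alias registers 0x41-0x44) and treat bit 7 as "not routed". The normalization replayed against a fake config space, as a self-checking sketch (sis_get is a simplified stand-in for the config-space read):

    #include <assert.h>
    #include <stdint.h>

    #define SIS_IRQ_MASK    0x0f
    #define SIS_IRQ_DISABLE 0x80

    static int sis_get(const uint8_t *cfg, int pirq)
    {
        /* link values 0x01-0x04 alias registers 0x41-0x44 */
        int reg = (pirq >= 0x01 && pirq <= 0x04) ? pirq + 0x40 : pirq;
        uint8_t x = cfg[reg];

        return (x & SIS_IRQ_DISABLE) ? 0 : (x & SIS_IRQ_MASK);
    }

    int main(void)
    {
        uint8_t cfg[256] = { 0 };

        cfg[0x41] = 0x0b;               /* INTA routed to IRQ 11 */
        cfg[0x42] = 0x80 | 0x05;        /* INTB disabled */
        assert(sis_get(cfg, 0x01) == 11);
        assert(sis_get(cfg, 0x41) == 11);
        assert(sis_get(cfg, 0x02) == 0);
        return 0;
    }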
- */ - -static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq) -{ - if (pirq > 8) { - printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); - return 0; - } - return read_config_nybble(router, 0x74, pirq-1); -} - -static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) -{ - if (pirq > 8) { - printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); - return 0; - } - write_config_nybble(router, 0x74, pirq-1, irq); - return 1; -} - -/* - * ServerWorks: PCI interrupts mapped to system IRQ lines through Index - * and Redirect I/O registers (0x0c00 and 0x0c01). The Index register - * format is (PCIIRQ## | 0x10), e.g.: PCIIRQ10=0x1a. The Redirect - * register is a straight binary coding of desired PIC IRQ (low nibble). - * - * The 'link' value in the PIRQ table is already in the correct format - * for the Index register. There are some special index values: - * 0x00 for ACPI (SCI), 0x01 for USB, 0x02 for IDE0, 0x04 for IDE1, - * and 0x03 for SMBus. - */ -static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq) -{ - outb_p(pirq, 0xc00); - return inb(0xc01) & 0xf; -} - -static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) -{ - outb_p(pirq, 0xc00); - outb_p(irq, 0xc01); - return 1; -} - -/* Support for AMD756 PCI IRQ Routing - * Jhon H. Caicedo <jhcaiced@xxxxxxxxxxx> - * Jun/21/2001 0.2.0 Release, fixed to use "nybble" functions... (jhcaiced) - * Jun/19/2001 Alpha Release 0.1.0 (jhcaiced) - * The AMD756 pirq rules are nibble-based - * offset 0x56 0-3 PIRQA 4-7 PIRQB - * offset 0x57 0-3 PIRQC 4-7 PIRQD - */ -static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq) -{ - u8 irq; - irq = 0; - if (pirq <= 4) - { - irq = read_config_nybble(router, 0x56, pirq - 1); - } - printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n", - dev->vendor, dev->device, pirq, irq); - return irq; -} - -static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) -{ - printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n", - dev->vendor, dev->device, pirq, irq); - if (pirq <= 4) - { - write_config_nybble(router, 0x56, pirq - 1, irq); - } - return 1; -} - -#ifdef CONFIG_PCI_BIOS - -static int pirq_bios_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) -{ - struct pci_dev *bridge; - int pin = pci_get_interrupt_pin(dev, &bridge); - return pcibios_set_irq_routing(bridge, pin, irq); -} - -#endif - -static __init int intel_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) -{ - static struct pci_device_id pirq_440gx[] = { - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_0) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_2) }, - { }, - }; - - /* 440GX has a proprietary PIRQ router -- don't use it */ - if (pci_dev_present(pirq_440gx)) - return 0; - - switch(device) - { - case PCI_DEVICE_ID_INTEL_82371FB_0: - case PCI_DEVICE_ID_INTEL_82371SB_0: - case PCI_DEVICE_ID_INTEL_82371AB_0: - case PCI_DEVICE_ID_INTEL_82371MX: - case PCI_DEVICE_ID_INTEL_82443MX_0: - case PCI_DEVICE_ID_INTEL_82801AA_0: - case PCI_DEVICE_ID_INTEL_82801AB_0: - case PCI_DEVICE_ID_INTEL_82801BA_0: - case PCI_DEVICE_ID_INTEL_82801BA_10: - case PCI_DEVICE_ID_INTEL_82801CA_0: - case PCI_DEVICE_ID_INTEL_82801CA_12: - case PCI_DEVICE_ID_INTEL_82801DB_0: - case PCI_DEVICE_ID_INTEL_82801E_0: - case PCI_DEVICE_ID_INTEL_82801EB_0: - case PCI_DEVICE_ID_INTEL_ESB_1: - case 
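The ServerWorks router above is a textbook index/data register pair: write the register number to port 0x0c00, then read or write its value through port 0x0c01. The protocol emulated with an array so it is testable, as a sketch (emu_outb/emu_inb are made-up stand-ins for outb_p/inb):

    #include <assert.h>
    #include <stdint.h>

    static uint8_t regs[256];
    static uint8_t index_latch;

    static void emu_outb(uint8_t v, uint16_t port)
    {
        if (port == 0x0c00)
            index_latch = v;            /* select a register */
        else if (port == 0x0c01)
            regs[index_latch] = v;      /* write the selected register */
    }

    static uint8_t emu_inb(uint16_t port)
    {
        return (port == 0x0c01) ? regs[index_latch] : 0xff;
    }

    int main(void)
    {
        emu_outb(0x01, 0x0c00);         /* link 0x01 (USB, per the table) */
        emu_outb(0x0a, 0x0c01);         /* route it to IRQ 10 */
        emu_outb(0x01, 0x0c00);
        assert((emu_inb(0x0c01) & 0xf) == 10);
        return 0;
    }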
PCI_DEVICE_ID_INTEL_ICH6_0: - case PCI_DEVICE_ID_INTEL_ICH6_1: - case PCI_DEVICE_ID_INTEL_ICH7_0: - case PCI_DEVICE_ID_INTEL_ICH7_1: - r->name = "PIIX/ICH"; - r->get = pirq_piix_get; - r->set = pirq_piix_set; - return 1; - } - return 0; -} - -static __init int via_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) -{ - /* FIXME: We should move some of the quirk fixup stuff here */ - switch(device) - { - case PCI_DEVICE_ID_VIA_82C586_0: - case PCI_DEVICE_ID_VIA_82C596: - case PCI_DEVICE_ID_VIA_82C686: - case PCI_DEVICE_ID_VIA_8231: - /* FIXME: add new ones for 8233/5 */ - r->name = "VIA"; - r->get = pirq_via_get; - r->set = pirq_via_set; - return 1; - } - return 0; -} - -static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) -{ - switch(device) - { - case PCI_DEVICE_ID_VLSI_82C534: - r->name = "VLSI 82C534"; - r->get = pirq_vlsi_get; - r->set = pirq_vlsi_set; - return 1; - } - return 0; -} - - -static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) -{ - switch(device) - { - case PCI_DEVICE_ID_SERVERWORKS_OSB4: - case PCI_DEVICE_ID_SERVERWORKS_CSB5: - r->name = "ServerWorks"; - r->get = pirq_serverworks_get; - r->set = pirq_serverworks_set; - return 1; - } - return 0; -} - -static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) -{ - if (device != PCI_DEVICE_ID_SI_503) - return 0; - - r->name = "SIS"; - r->get = pirq_sis_get; - r->set = pirq_sis_set; - return 1; -} - -static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) -{ - switch(device) - { - case PCI_DEVICE_ID_CYRIX_5520: - r->name = "NatSemi"; - r->get = pirq_cyrix_get; - r->set = pirq_cyrix_set; - return 1; - } - return 0; -} - -static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) -{ - switch(device) - { - case PCI_DEVICE_ID_OPTI_82C700: - r->name = "OPTI"; - r->get = pirq_opti_get; - r->set = pirq_opti_set; - return 1; - } - return 0; -} - -static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) -{ - switch(device) - { - case PCI_DEVICE_ID_ITE_IT8330G_0: - r->name = "ITE"; - r->get = pirq_ite_get; - r->set = pirq_ite_set; - return 1; - } - return 0; -} - -static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) -{ - switch(device) - { - case PCI_DEVICE_ID_AL_M1533: - case PCI_DEVICE_ID_AL_M1563: - printk("PCI: Using ALI IRQ Router\n"); - r->name = "ALI"; - r->get = pirq_ali_get; - r->set = pirq_ali_set; - return 1; - } - return 0; -} - -static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) -{ - switch(device) - { - case PCI_DEVICE_ID_AMD_VIPER_740B: - r->name = "AMD756"; - break; - case PCI_DEVICE_ID_AMD_VIPER_7413: - r->name = "AMD766"; - break; - case PCI_DEVICE_ID_AMD_VIPER_7443: - r->name = "AMD768"; - break; - default: - return 0; - } - r->get = pirq_amd756_get; - r->set = pirq_amd756_set; - return 1; -} - -static __initdata struct irq_router_handler pirq_routers[] = { - { PCI_VENDOR_ID_INTEL, intel_router_probe }, - { PCI_VENDOR_ID_AL, ali_router_probe }, - { PCI_VENDOR_ID_ITE, ite_router_probe }, - { PCI_VENDOR_ID_VIA, via_router_probe }, - { PCI_VENDOR_ID_OPTI, opti_router_probe }, - { PCI_VENDOR_ID_SI, sis_router_probe }, - { PCI_VENDOR_ID_CYRIX, cyrix_router_probe }, - { PCI_VENDOR_ID_VLSI, vlsi_router_probe }, - { PCI_VENDOR_ID_SERVERWORKS, serverworks_router_probe }, - { PCI_VENDOR_ID_AMD, 
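The pirq_routers[] array here is a NULL-terminated (vendor, probe) dispatch table: the first probe that recognizes the device claims the router. The same shape in miniature, as a runnable sketch (every name and ID below is made up):

    #include <assert.h>
    #include <string.h>

    struct router { const char *name; };

    struct handler {
        unsigned short vendor;
        int (*probe)(struct router *r, unsigned short device);
    };

    static int fake_probe(struct router *r, unsigned short device)
    {
        if (device != 0x0503)
            return 0;                   /* not ours; keep scanning */
        r->name = "fake";
        return 1;
    }

    static const struct handler handlers[] = {
        { 0x1039, fake_probe },
        { 0, NULL }                     /* terminator */
    };

    static const char *find_router(unsigned short vendor,
                                   unsigned short device)
    {
        const struct handler *h;
        struct router r = { NULL };

        for (h = handlers; h->vendor; h++)
            if (h->vendor == vendor && h->probe(&r, device))
                return r.name;
        return "default";
    }

    int main(void)
    {
        assert(strcmp(find_router(0x1039, 0x0503), "fake") == 0);
        assert(strcmp(find_router(0x8086, 0x1234), "default") == 0);
        return 0;
    }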
amd_router_probe }, - /* Someone with docs needs to add the ATI Radeon IGP */ - { 0, NULL } -}; -static struct irq_router pirq_router; -static struct pci_dev *pirq_router_dev; - - -/* - * FIXME: should we have an option to say "generic for - * chipset" ? - */ - -static void __init pirq_find_router(struct irq_router *r) -{ - struct irq_routing_table *rt = pirq_table; - struct irq_router_handler *h; - -#ifdef CONFIG_PCI_BIOS - if (!rt->signature) { - printk(KERN_INFO "PCI: Using BIOS for IRQ routing\n"); - r->set = pirq_bios_set; - r->name = "BIOS"; - return; - } -#endif - - /* Default unless a driver reloads it */ - r->name = "default"; - r->get = NULL; - r->set = NULL; - - DBG("PCI: Attempting to find IRQ router for %04x:%04x\n", - rt->rtr_vendor, rt->rtr_device); - - pirq_router_dev = pci_find_slot(rt->rtr_bus, rt->rtr_devfn); - if (!pirq_router_dev) { - DBG("PCI: Interrupt router not found at %02x:%02x\n", rt->rtr_bus, rt->rtr_devfn); - return; - } - - for( h = pirq_routers; h->vendor; h++) { - /* First look for a router match */ - if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device)) - break; - /* Fall back to a device match */ - if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device)) - break; - } - printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n", - pirq_router.name, - pirq_router_dev->vendor, - pirq_router_dev->device, - pci_name(pirq_router_dev)); -} - -static struct irq_info *pirq_get_info(struct pci_dev *dev) -{ - struct irq_routing_table *rt = pirq_table; - int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); - struct irq_info *info; - - for (info = rt->slots; entries--; info++) - if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn)) - return info; - return NULL; -} - -static int pcibios_lookup_irq(struct pci_dev *dev, int assign) -{ - u8 pin; - struct irq_info *info; - int i, pirq, newirq; - int irq = 0; - u32 mask; - struct irq_router *r = &pirq_router; - struct pci_dev *dev2 = NULL; - char *msg = NULL; - - /* Find IRQ pin */ - pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); - if (!pin) { - DBG(" -> no interrupt pin\n"); - return 0; - } - pin = pin - 1; - - /* Find IRQ routing entry */ - - if (!pirq_table) - return 0; - - DBG("IRQ for %s[%c]", pci_name(dev), 'A' + pin); - info = pirq_get_info(dev); - if (!info) { - DBG(" -> not found in routing table\n"); - return 0; - } - pirq = info->irq[pin].link; - mask = info->irq[pin].bitmap; - if (!pirq) { - DBG(" -> not routed\n"); - return 0; - } - DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs); - mask &= pcibios_irq_mask; - - /* Work around broken HP Pavilion Notebooks which assign USB to - IRQ 9 even though it is actually wired to IRQ 11 */ - - if (broken_hp_bios_irq9 && pirq == 0x59 && dev->irq == 9) { - dev->irq = 11; - pci_write_config_byte(dev, PCI_INTERRUPT_LINE, 11); - r->set(pirq_router_dev, dev, pirq, 11); - } - - /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */ - if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) { - pirq = 0x68; - mask = 0x400; - dev->irq = r->get(pirq_router_dev, dev, pirq); - pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq); - } - - /* - * Find the best IRQ to assign: use the one - * reported by the device if possible. 
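When no usable IRQ is pre-reported, the selection loop that follows picks, among the IRQs permitted by the link's bitmap, the one with the lowest accumulated penalty. The core of that choice with the can_request_irq() check omitted, as a self-checking sketch:

    #include <assert.h>

    static int pick_irq(unsigned int mask, const int penalty[16])
    {
        int i, best = 0;    /* penalty[0] is huge, so 0 means "none yet" */

        for (i = 0; i < 16; i++) {
            if (!(mask & (1u << i)))
                continue;               /* link is not wired to this IRQ */
            if (penalty[i] < penalty[best])
                best = i;
        }
        return best;
    }

    int main(void)
    {
        /* same initial weights as pirq_penalty[] above */
        int penalty[16] = { 1000000, 1000000, 1000000, 1000, 1000, 0,
                            1000, 1000, 0, 0, 0, 0, 1000, 100000,
                            100000, 100000 };

        /* link may use IRQ 3, 4, 10 or 11; 10 carries no ISA penalty */
        assert(pick_irq((1 << 3) | (1 << 4) | (1 << 10) | (1 << 11),
                        penalty) == 10);
        return 0;
    }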
- */ - newirq = dev->irq; - if (!((1 << newirq) & mask)) { - if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0; - else printk(KERN_WARNING "PCI: IRQ %i for device %s doesn't match PIRQ mask - try pci=usepirqmask\n", newirq, pci_name(dev)); - } - if (!newirq && assign) { - for (i = 0; i < 16; i++) { - if (!(mask & (1 << i))) - continue; - if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, SA_SHIRQ)) - newirq = i; - } - } - DBG(" -> newirq=%d", newirq); - - /* Check if it is hardcoded */ - if ((pirq & 0xf0) == 0xf0) { - irq = pirq & 0xf; - DBG(" -> hardcoded IRQ %d\n", irq); - msg = "Hardcoded"; - } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \ - ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) { - DBG(" -> got IRQ %d\n", irq); - msg = "Found"; - } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) { - DBG(" -> assigning IRQ %d", newirq); - if (r->set(pirq_router_dev, dev, pirq, newirq)) { - eisa_set_level_irq(newirq); - DBG(" ... OK\n"); - msg = "Assigned"; - irq = newirq; - } - } - - if (!irq) { - DBG(" ... failed\n"); - if (newirq && mask == (1 << newirq)) { - msg = "Guessed"; - irq = newirq; - } else - return 0; - } - printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev)); - - /* Update IRQ for all devices with the same pirq value */ - while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) { - pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin); - if (!pin) - continue; - pin--; - info = pirq_get_info(dev2); - if (!info) - continue; - if (info->irq[pin].link == pirq) { - /* We refuse to override the dev->irq information. Give a warning! */ - if ( dev2->irq && dev2->irq != irq && \ - (!(pci_probe & PCI_USE_PIRQ_MASK) || \ - ((1 << dev2->irq) & mask)) ) { -#ifndef CONFIG_PCI_MSI - printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n", - pci_name(dev2), dev2->irq, irq); -#endif - continue; - } - dev2->irq = irq; - pirq_penalty[irq]++; - if (dev != dev2) - printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2)); - } - } - return 1; -} - -static void __init pcibios_fixup_irqs(void) -{ - struct pci_dev *dev = NULL; - u8 pin; - - DBG("PCI: IRQ fixup\n"); - while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - /* - * If the BIOS has set an out of range IRQ number, just ignore it. - * Also keep track of which IRQ's are already in use. - */ - if (dev->irq >= 16) { - DBG("%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq); - dev->irq = 0; - } - /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */ - if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000) - pirq_penalty[dev->irq] = 0; - pirq_penalty[dev->irq]++; - } - - dev = NULL; - while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); -#ifdef CONFIG_X86_IO_APIC - /* - * Recalculate IRQ numbers if we use the I/O APIC. - */ - if (io_apic_assign_pci_irqs) - { - int irq; - - if (pin) { - pin--; /* interrupt pins are numbered starting from 1 */ - irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin); - /* - * Busses behind bridges are typically not listed in the MP-table. - * In this case we have to look up the IRQ based on the parent bus, - * parent slot, and pin number. The SMP code detects such bridged - * busses itself so we should get into this branch reliably. 
- */ - if (irq < 0 && dev->bus->parent) { /* go back to the bridge */ - struct pci_dev * bridge = dev->bus->self; - - pin = (pin + PCI_SLOT(dev->devfn)) % 4; - irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, - PCI_SLOT(bridge->devfn), pin); - if (irq >= 0) - printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n", - pci_name(bridge), 'A' + pin, irq); - } - if (irq >= 0) { - if (use_pci_vector() && - !platform_legacy_irq(irq)) - irq = IO_APIC_VECTOR(irq); - - printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", - pci_name(dev), 'A' + pin, irq); - dev->irq = irq; - } - } - } -#endif - /* - * Still no IRQ? Try to lookup one... - */ - if (pin && !dev->irq) - pcibios_lookup_irq(dev, 0); - } -} - -/* - * Work around broken HP Pavilion Notebooks which assign USB to - * IRQ 9 even though it is actually wired to IRQ 11 - */ -static int __init fix_broken_hp_bios_irq9(struct dmi_system_id *d) -{ - if (!broken_hp_bios_irq9) { - broken_hp_bios_irq9 = 1; - printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident); - } - return 0; -} - -/* - * Work around broken Acer TravelMate 360 Notebooks which assign - * Cardbus to IRQ 11 even though it is actually wired to IRQ 10 - */ -static int __init fix_acer_tm360_irqrouting(struct dmi_system_id *d) -{ - if (!acer_tm360_irqrouting) { - acer_tm360_irqrouting = 1; - printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident); - } - return 0; -} - -static struct dmi_system_id __initdata pciirq_dmi_table[] = { - { - .callback = fix_broken_hp_bios_irq9, - .ident = "HP Pavilion N5400 Series Laptop", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), - DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"), - DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"), - DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"), - }, - }, - { - .callback = fix_acer_tm360_irqrouting, - .ident = "Acer TravelMate 36x Laptop", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Acer"), - DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"), - }, - }, - { } -}; - -static int __init pcibios_irq_init(void) -{ - DBG("PCI: IRQ init\n"); - - if (pcibios_enable_irq || raw_pci_ops == NULL) - return 0; - - dmi_check_system(pciirq_dmi_table); - - pirq_table = pirq_find_routing_table(); - -#ifdef CONFIG_PCI_BIOS - if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN)) - pirq_table = pcibios_get_irq_routing_table(); -#endif - if (pirq_table) { - pirq_peer_trick(); - pirq_find_router(&pirq_router); - if (pirq_table->exclusive_irqs) { - int i; - for (i=0; i<16; i++) - if (!(pirq_table->exclusive_irqs & (1 << i))) - pirq_penalty[i] += 100; - } - /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */ - if (io_apic_assign_pci_irqs) - pirq_table = NULL; - } - - pcibios_enable_irq = pirq_enable_irq; - - pcibios_fixup_irqs(); - return 0; -} - -subsys_initcall(pcibios_irq_init); - - -static void pirq_penalize_isa_irq(int irq) -{ - /* - * If any ISAPnP device reports an IRQ in its list of possible - * IRQ's, we try to avoid assigning it to PCI devices. 
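The bridge fallback above walks back toward the root by "swizzling" the interrupt pin: with 0-based pins (INTA == 0), each hop adds the device's slot number modulo 4. The rotation in isolation, as a self-checking sketch:

    #include <assert.h>

    /* one hop of the PCI-PCI bridge pin rotation, 0-based pins */
    static int swizzle(int pin, int slot)
    {
        return (pin + slot) % 4;
    }

    int main(void)
    {
        /* a device in slot 3 asserting INTA appears as INTD upstream */
        assert(swizzle(0, 3) == 3);
        /* nested bridges compose hop by hop */
        assert(swizzle(swizzle(0, 3), 2) == 1);
        return 0;
    }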
- */ - if (irq < 16) - pirq_penalty[irq] += 100; -} - -void pcibios_penalize_isa_irq(int irq) -{ -#ifdef CONFIG_ACPI_PCI - if (!acpi_noirq) - acpi_penalize_isa_irq(irq); - else -#endif - pirq_penalize_isa_irq(irq); -} - -static int pirq_enable_irq(struct pci_dev *dev) -{ - u8 pin; - extern int via_interrupt_line_quirk; - struct pci_dev *temp_dev; - - pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); - if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) { - char *msg; - msg = ""; - if (io_apic_assign_pci_irqs) { - int irq; - - if (pin) { - pin--; /* interrupt pins are numbered starting from 1 */ - irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin); - /* - * Busses behind bridges are typically not listed in the MP-table. - * In this case we have to look up the IRQ based on the parent bus, - * parent slot, and pin number. The SMP code detects such bridged - * busses itself so we should get into this branch reliably. - */ - temp_dev = dev; - while (irq < 0 && dev->bus->parent) { /* go back to the bridge */ - struct pci_dev * bridge = dev->bus->self; - - pin = (pin + PCI_SLOT(dev->devfn)) % 4; - irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, - PCI_SLOT(bridge->devfn), pin); - if (irq >= 0) - printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n", - pci_name(bridge), 'A' + pin, irq); - dev = bridge; - } - dev = temp_dev; - if (irq >= 0) { -#ifdef CONFIG_PCI_MSI - if (!platform_legacy_irq(irq)) - irq = IO_APIC_VECTOR(irq); -#endif - printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", - pci_name(dev), 'A' + pin, irq); - dev->irq = irq; - return 0; - } else - msg = " Probably buggy MP table."; - } - } else if (pci_probe & PCI_BIOS_IRQ_SCAN) - msg = ""; - else - msg = " Please try using pci=biosirq."; - - /* With IDE legacy devices the IRQ lookup failure is not a problem.. */ - if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5)) - return 0; - - printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n", - 'A' + pin - 1, pci_name(dev), msg); - } - /* VIA bridges use interrupt line for apic/pci steering across - the V-Link */ - else if (via_interrupt_line_quirk) - pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq & 15); - return 0; -} - -int pci_vector_resources(int last, int nr_released) -{ - int count = nr_released; - - int next = last; - int offset = (last % 8); - - while (next < FIRST_SYSTEM_VECTOR) { - next += 8; -#ifdef CONFIG_X86_64 - if (next == IA32_SYSCALL_VECTOR) - continue; -#else - if (next == SYSCALL_VECTOR) - continue; -#endif - count++; - if (next >= FIRST_SYSTEM_VECTOR) { - if (offset%8) { - next = FIRST_DEVICE_VECTOR + offset; - offset++; - continue; - } - count--; - } - } - - return count; -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/kernel/Makefile --- a/linux-2.6.11-xen-sparse/arch/xen/kernel/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,18 +0,0 @@ -# -# Makefile for the linux kernel. 
-# - -XENARCH := $(subst ",,$(CONFIG_XENARCH)) - -CPPFLAGS_vmlinux.lds += -U$(XENARCH) - -$(obj)/vmlinux.lds.S: - @ln -fsn $(srctree)/arch/$(XENARCH)/kernel/vmlinux.lds.S $@ - -extra-y += vmlinux.lds - -obj-y := ctrl_if.o evtchn.o fixup.o reboot.o gnttab.o devmem.o - -obj-$(CONFIG_PROC_FS) += xen_proc.o -obj-$(CONFIG_NET) += skbuff.o -obj-$(CONFIG_SMP) += smp.o diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/kernel/devmem.c --- a/linux-2.6.11-xen-sparse/arch/xen/kernel/devmem.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,158 +0,0 @@ -/* - * Originally from linux/drivers/char/mem.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * Added devfs support. - * Jan-11-1998, C. Scott Ananian <cananian@xxxxxxxxxxxxxxxxxxxx> - * Shared /dev/zero mmaping support, Feb 2000, Kanoj Sarcar <kanoj@xxxxxxx> - */ - -#include <linux/config.h> -#include <linux/mm.h> -#include <linux/miscdevice.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/mman.h> -#include <linux/random.h> -#include <linux/init.h> -#include <linux/raw.h> -#include <linux/tty.h> -#include <linux/capability.h> -#include <linux/smp_lock.h> -#include <linux/devfs_fs_kernel.h> -#include <linux/ptrace.h> -#include <linux/device.h> -#include <asm/pgalloc.h> -#include <asm/uaccess.h> -#include <asm/io.h> - -static inline int uncached_access(struct file *file, unsigned long addr) -{ - if (file->f_flags & O_SYNC) - return 1; - /* Xen sets correct MTRR type on non-RAM for us. */ - return 0; -} - -/* - * This funcion reads the *physical* memory. The f_pos points directly to the - * memory location. - */ -static ssize_t read_mem(struct file * file, char __user * buf, - size_t count, loff_t *ppos) -{ - unsigned long i, p = *ppos; - ssize_t read = -EFAULT; - void *v; - - if ((v = ioremap(p, count)) == NULL) { - /* - * Some programs (e.g., dmidecode) groove off into weird RAM - * areas where no table scan possibly exist (because Xen will - * have stomped on them!). These programs get rather upset if - * we let them know that Xen failed their access, so we fake - * out a read of all zeroes. :-) - */ - for (i = 0; i < count; i++) - if (put_user(0, buf+i)) - return -EFAULT; - return count; - } - if (copy_to_user(buf, v, count)) - goto out; - - read = count; - *ppos += read; -out: - iounmap(v); - return read; -} - -static ssize_t write_mem(struct file * file, const char __user * buf, - size_t count, loff_t *ppos) -{ - unsigned long p = *ppos; - ssize_t written = -EFAULT; - void *v; - - if ((v = ioremap(p, count)) == NULL) - return -EFAULT; - if (copy_to_user(v, buf, count)) - goto out; - - written = count; - *ppos += written; -out: - iounmap(v); - return written; -} - -static int mmap_mem(struct file * file, struct vm_area_struct * vma) -{ - unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; - int uncached; - - uncached = uncached_access(file, offset); - if (uncached) - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - - /* Don't try to swap out physical pages.. */ - vma->vm_flags |= VM_RESERVED; - - /* - * Don't dump addresses that are not real memory to a core file. - */ - if (uncached) - vma->vm_flags |= VM_IO; - - if (io_remap_page_range(vma, vma->vm_start, offset, - vma->vm_end-vma->vm_start, vma->vm_page_prot)) - return -EAGAIN; - - return 0; -} - -/* - * The memory devices use the full 32/64 bits of the offset, and so we cannot - * check against negative addresses: they are ok. The return value is weird, - * though, in that case (0). 
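The put_user() loop in read_mem() above fakes a successful read of zeroes one byte at a time when ioremap() fails. Assuming the standard clear_user() helper (which returns the number of bytes left uncleared), the same fallback can be written more compactly:

	if ((v = ioremap(p, count)) == NULL) {
		/* fake a successful all-zeroes read, as above */
		if (clear_user(buf, count))
			return -EFAULT;
		return count;
	}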
- * - * also note that seeking relative to the "end of file" isn't supported: - * it has no meaning, so it returns -EINVAL. - */ -static loff_t memory_lseek(struct file * file, loff_t offset, int orig) -{ - loff_t ret; - - down(&file->f_dentry->d_inode->i_sem); - switch (orig) { - case 0: - file->f_pos = offset; - ret = file->f_pos; - force_successful_syscall_return(); - break; - case 1: - file->f_pos += offset; - ret = file->f_pos; - force_successful_syscall_return(); - break; - default: - ret = -EINVAL; - } - up(&file->f_dentry->d_inode->i_sem); - return ret; -} - -static int open_mem(struct inode * inode, struct file * filp) -{ - return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; -} - -struct file_operations mem_fops = { - .llseek = memory_lseek, - .read = read_mem, - .write = write_mem, - .mmap = mmap_mem, - .open = open_mem, -}; diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/kernel/fixup.c --- a/linux-2.6.11-xen-sparse/arch/xen/kernel/fixup.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,87 +0,0 @@ -/****************************************************************************** - * fixup.c - * - * Binary-rewriting of certain IA32 instructions, on notification by Xen. - * Used to avoid repeated slow emulation of common instructions used by the - * user-space TLS (Thread-Local Storage) libraries. - * - * **** NOTE **** - * Issues with the binary rewriting have caused it to be removed. Instead - * we rely on Xen's emulator to boot the kernel, and then print a banner - * message recommending that the user disables /lib/tls. - * - * Copyright (c) 2004, K A Fraser - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include <linux/config.h> -#include <linux/init.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/kernel.h> -#include <linux/delay.h> -#include <linux/version.h> - -#define DP(_f, _args...) printk(KERN_ALERT " " _f "\n" , ## _args ) - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) -#define __LINKAGE fastcall -#else -#define __LINKAGE asmlinkage -#endif - -__LINKAGE void do_fixup_4gb_segment(struct pt_regs *regs, long error_code) -{ - static unsigned long printed = 0; - char info[100]; - int i; - - if ( !test_and_set_bit(0, &printed) ) - { - HYPERVISOR_vm_assist(VMASST_CMD_disable, - VMASST_TYPE_4gb_segments_notify); - - sprintf(info, "%s (pid=%d)", current->comm, current->tgid); - - DP(""); - DP("***************************************************************"); - DP("***************************************************************"); - DP("** WARNING: Currently emulating unsupported memory accesses **"); - DP("** in /lib/tls libraries. The emulation is very **"); - DP("** slow. 
To ensure full performance you should **"); - DP("** execute the following as root: **"); - DP("** mv /lib/tls /lib/tls.disabled **"); - DP("** Offending process: %-38.38s **", info); - DP("***************************************************************"); - DP("***************************************************************"); - DP(""); - - for ( i = 5; i > 0; i-- ) - { - printk("Pausing... %d", i); - mdelay(1000); - printk("\b\b\b\b\b\b\b\b\b\b\b\b"); - } - printk("Continuing...\n\n"); - } -} - -static int __init fixup_init(void) -{ - HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments_notify); - return 0; -} -__initcall(fixup_init); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/kernel/gnttab.c --- a/linux-2.6.11-xen-sparse/arch/xen/kernel/gnttab.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,390 +0,0 @@ -/****************************************************************************** - * gnttab.c - * - * Two sets of functionality: - * 1. Granting foreign access to our memory reservation. - * 2. Accessing others' memory reservations via grant references. - * (i.e., mechanisms for both sender and recipient of grant references) - * - * Copyright (c) 2005, Christopher Clark - * Copyright (c) 2004, K A Fraser - */ - -#include <linux/config.h> -#include <linux/module.h> -#include <linux/sched.h> -#include <asm/pgtable.h> -#include <asm/fixmap.h> -#include <asm/uaccess.h> -#include <asm-xen/xen_proc.h> -#include <asm-xen/linux-public/privcmd.h> -#include <asm-xen/gnttab.h> -#include <asm-xen/synch_bitops.h> - -#if 1 -#define ASSERT(_p) \ - if ( !(_p) ) { printk(KERN_ALERT"Assertion '%s': line %d, file %s\n", \ - #_p , __LINE__, __FILE__); *(int*)0=0; } -#else -#define ASSERT(_p) ((void)0) -#endif - -#define WPRINTK(fmt, args...) \ - printk(KERN_WARNING "xen_grant: " fmt, ##args) - - -EXPORT_SYMBOL(gnttab_grant_foreign_access); -EXPORT_SYMBOL(gnttab_end_foreign_access); -EXPORT_SYMBOL(gnttab_query_foreign_access); -EXPORT_SYMBOL(gnttab_grant_foreign_transfer); -EXPORT_SYMBOL(gnttab_end_foreign_transfer); -EXPORT_SYMBOL(gnttab_alloc_grant_references); -EXPORT_SYMBOL(gnttab_free_grant_references); -EXPORT_SYMBOL(gnttab_claim_grant_reference); -EXPORT_SYMBOL(gnttab_release_grant_reference); -EXPORT_SYMBOL(gnttab_grant_foreign_access_ref); -EXPORT_SYMBOL(gnttab_grant_foreign_transfer_ref); - -static grant_ref_t gnttab_free_list[NR_GRANT_ENTRIES]; -static grant_ref_t gnttab_free_head; - -static grant_entry_t *shared; - -/* - * Lock-free grant-entry allocator - */ - -static inline int -get_free_entry( - void) -{ - grant_ref_t fh, nfh = gnttab_free_head; - do { if ( unlikely((fh = nfh) == NR_GRANT_ENTRIES) ) return -1; } - while ( unlikely((nfh = cmpxchg(&gnttab_free_head, fh, - gnttab_free_list[fh])) != fh) ); - return fh; -} - -static inline void -put_free_entry( - grant_ref_t ref) -{ - grant_ref_t fh, nfh = gnttab_free_head; - do { gnttab_free_list[ref] = fh = nfh; wmb(); } - while ( unlikely((nfh = cmpxchg(&gnttab_free_head, fh, ref)) != fh) ); -} - -/* - * Public grant-issuing interface functions - */ - -int -gnttab_grant_foreign_access( - domid_t domid, unsigned long frame, int readonly) -{ - int ref; - - if ( unlikely((ref = get_free_entry()) == -1) ) - return -ENOSPC; - - shared[ref].frame = frame; - shared[ref].domid = domid; - wmb(); - shared[ref].flags = GTF_permit_access | (readonly ? 
GTF_readonly : 0); - - return ref; -} - -void -gnttab_grant_foreign_access_ref( - grant_ref_t ref, domid_t domid, unsigned long frame, int readonly) -{ - shared[ref].frame = frame; - shared[ref].domid = domid; - wmb(); - shared[ref].flags = GTF_permit_access | (readonly ? GTF_readonly : 0); -} - - -int -gnttab_query_foreign_access( grant_ref_t ref ) -{ - u16 nflags; - - nflags = shared[ref].flags; - - return ( nflags & (GTF_reading|GTF_writing) ); -} - -void -gnttab_end_foreign_access( grant_ref_t ref, int readonly ) -{ - u16 flags, nflags; - - nflags = shared[ref].flags; - do { - if ( (flags = nflags) & (GTF_reading|GTF_writing) ) - printk(KERN_ALERT "WARNING: g.e. still in use!\n"); - } - while ( (nflags = synch_cmpxchg(&shared[ref].flags, flags, 0)) != flags ); - - put_free_entry(ref); -} - -int -gnttab_grant_foreign_transfer( - domid_t domid, unsigned long pfn ) -{ - int ref; - - if ( unlikely((ref = get_free_entry()) == -1) ) - return -ENOSPC; - - shared[ref].frame = pfn; - shared[ref].domid = domid; - wmb(); - shared[ref].flags = GTF_accept_transfer; - - return ref; -} - -void -gnttab_grant_foreign_transfer_ref( - grant_ref_t ref, domid_t domid, unsigned long pfn ) -{ - shared[ref].frame = pfn; - shared[ref].domid = domid; - wmb(); - shared[ref].flags = GTF_accept_transfer; -} - -unsigned long -gnttab_end_foreign_transfer( - grant_ref_t ref) -{ - unsigned long frame = 0; - u16 flags; - - flags = shared[ref].flags; - ASSERT(flags == (GTF_accept_transfer | GTF_transfer_committed)); - - /* - * If a transfer is committed then wait for the frame address to appear. - * Otherwise invalidate the grant entry against future use. - */ - if ( likely(flags != GTF_accept_transfer) || - (synch_cmpxchg(&shared[ref].flags, flags, 0) != GTF_accept_transfer) ) - while ( unlikely((frame = shared[ref].frame) == 0) ) - cpu_relax(); - - put_free_entry(ref); - - return frame; -} - -void -gnttab_free_grant_references( u16 count, grant_ref_t head ) -{ - /* TODO: O(N)...? 
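Putting the pieces above together, the granting side follows one pattern: claim a free entry, publish frame and domid, set the flags last (behind the wmb()), and eventually revoke. A minimal usage sketch; remote_domid, page_addr and the virt_to_mfn() helper are placeholders, only the gnttab_* calls are from this file:

	/* grant another domain read-write access to one of our pages */
	int ref = gnttab_grant_foreign_access(remote_domid,
	                                      virt_to_mfn(page_addr), 0);
	if (ref < 0)
		return -ENOSPC; /* grant-entry free list exhausted */

	/* ... pass 'ref' to the peer, e.g. through a shared ring ... */

	gnttab_end_foreign_access(ref, 0); /* revoke and recycle the entry */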
*/ - grant_ref_t to_die = 0, next = head; - int i; - - for ( i = 0; i < count; i++ ) - { - to_die = next; - next = gnttab_free_list[next]; - put_free_entry( to_die ); - } -} - -int -gnttab_alloc_grant_references( u16 count, - grant_ref_t *head, - grant_ref_t *terminal ) -{ - int i; - grant_ref_t h = gnttab_free_head; - - for ( i = 0; i < count; i++ ) - if ( unlikely(get_free_entry() == -1) ) - goto not_enough_refs; - - *head = h; - *terminal = gnttab_free_head; - - return 0; - -not_enough_refs: - gnttab_free_head = h; - return -ENOSPC; -} - -int -gnttab_claim_grant_reference( grant_ref_t *private_head, - grant_ref_t terminal ) -{ - grant_ref_t g; - if ( unlikely((g = *private_head) == terminal) ) - return -ENOSPC; - *private_head = gnttab_free_list[g]; - return g; -} - -void -gnttab_release_grant_reference( grant_ref_t *private_head, - grant_ref_t release ) -{ - gnttab_free_list[release] = *private_head; - *private_head = release; -} - -/* - * ProcFS operations - */ - -#ifdef CONFIG_PROC_FS - -static struct proc_dir_entry *grant_pde; - -static int grant_ioctl(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long data) -{ - int ret; - privcmd_hypercall_t hypercall; - - /* XXX Need safety checks here if using for anything other - * than debugging */ - return -ENOSYS; - - if ( cmd != IOCTL_PRIVCMD_HYPERCALL ) - return -ENOSYS; - - if ( copy_from_user(&hypercall, (void *)data, sizeof(hypercall)) ) - return -EFAULT; - - if ( hypercall.op != __HYPERVISOR_grant_table_op ) - return -ENOSYS; - - /* hypercall-invoking asm taken from privcmd.c */ - __asm__ __volatile__ ( - "pushl %%ebx; pushl %%ecx; pushl %%edx; pushl %%esi; pushl %%edi; " - "movl 4(%%eax),%%ebx ;" - "movl 8(%%eax),%%ecx ;" - "movl 12(%%eax),%%edx ;" - "movl 16(%%eax),%%esi ;" - "movl 20(%%eax),%%edi ;" - "movl (%%eax),%%eax ;" - TRAP_INSTR "; " - "popl %%edi; popl %%esi; popl %%edx; popl %%ecx; popl %%ebx" - : "=a" (ret) : "0" (&hypercall) : "memory" ); - - return ret; -} - -static struct file_operations grant_file_ops = { - ioctl: grant_ioctl, -}; - -static int grant_read(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int len; - unsigned int i; - grant_entry_t *gt; - - gt = (grant_entry_t *)shared; - len = 0; - - for ( i = 0; i < NR_GRANT_ENTRIES; i++ ) - /* TODO: safety catch here until this can handle >PAGE_SIZE output */ - if (len > (PAGE_SIZE - 200)) - { - len += sprintf( page + len, "Truncated.\n"); - break; - } - - if ( gt[i].flags ) - len += sprintf( page + len, - "Grant: ref (0x%x) flags (0x%hx) dom (0x%hx) frame (0x%x)\n", - i, - gt[i].flags, - gt[i].domid, - gt[i].frame ); - - *eof = 1; - return len; -} - -static int grant_write(struct file *file, const char __user *buffer, - unsigned long count, void *data) -{ - /* TODO: implement this */ - return -ENOSYS; -} - -#endif /* CONFIG_PROC_FS */ - -int gnttab_resume(void) -{ - gnttab_setup_table_t setup; - unsigned long frames[NR_GRANT_FRAMES]; - int i; - - setup.dom = DOMID_SELF; - setup.nr_frames = NR_GRANT_FRAMES; - setup.frame_list = frames; - - BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1) != 0); - BUG_ON(setup.status != 0); - - for ( i = 0; i < NR_GRANT_FRAMES; i++ ) - set_fixmap(FIX_GNTTAB_END - i, frames[i] << PAGE_SHIFT); - - return 0; -} - -int gnttab_suspend(void) -{ - int i; - - for ( i = 0; i < NR_GRANT_FRAMES; i++ ) - clear_fixmap(FIX_GNTTAB_END - i); - - return 0; -} - -static int __init gnttab_init(void) -{ - int i; - - BUG_ON(gnttab_resume()); - - shared = (grant_entry_t 
*)fix_to_virt(FIX_GNTTAB_END); - - for ( i = 0; i < NR_GRANT_ENTRIES; i++ ) - gnttab_free_list[i] = i + 1; - -#ifdef CONFIG_PROC_FS - /* - * /proc/xen/grant : used by libxc to access grant tables - */ - if ( (grant_pde = create_xen_proc_entry("grant", 0600)) == NULL ) - { - WPRINTK("Unable to create grant xen proc entry\n"); - return -1; - } - - grant_file_ops.read = grant_pde->proc_fops->read; - grant_file_ops.write = grant_pde->proc_fops->write; - - grant_pde->proc_fops = &grant_file_ops; - - grant_pde->read_proc = &grant_read; - grant_pde->write_proc = &grant_write; -#endif - - printk("Grant table initialized\n"); - return 0; -} - -__initcall(gnttab_init); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/kernel/reboot.c --- a/linux-2.6.11-xen-sparse/arch/xen/kernel/reboot.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,269 +0,0 @@ - -#define __KERNEL_SYSCALLS__ -static int errno; -#include <linux/errno.h> -#include <linux/version.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/unistd.h> -#include <linux/module.h> -#include <linux/reboot.h> -#include <linux/sysrq.h> -#include <asm/irq.h> -#include <asm/mmu_context.h> -#include <asm-xen/ctrl_if.h> -#include <asm-xen/hypervisor.h> -#include <asm-xen/xen-public/dom0_ops.h> -#include <asm-xen/linux-public/suspend.h> -#include <asm-xen/queues.h> - -void machine_restart(char * __unused) -{ - /* We really want to get pending console data out before we die. */ - extern void xencons_force_flush(void); - xencons_force_flush(); - HYPERVISOR_reboot(); -} - -void machine_halt(void) -{ - machine_power_off(); -} - -void machine_power_off(void) -{ - /* We really want to get pending console data out before we die. */ - extern void xencons_force_flush(void); - xencons_force_flush(); - HYPERVISOR_shutdown(); -} - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) -int reboot_thru_bios = 0; /* for dmi_scan.c */ -EXPORT_SYMBOL(machine_restart); -EXPORT_SYMBOL(machine_halt); -EXPORT_SYMBOL(machine_power_off); -#endif - - -/****************************************************************************** - * Stop/pickle callback handling. - */ - -/* Ignore multiple shutdown requests. */ -static int shutting_down = -1; - -static void __do_suspend(void) -{ - int i, j; - suspend_record_t *suspend_record; - - /* Hmmm... a cleaner interface to suspend/resume blkdevs would be nice. 
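For scale: gnttab_init() above threads every entry onto the free list, and NR_GRANT_ENTRIES is bounded by how many grant_entry_t records fit in the NR_GRANT_FRAMES shared frames mapped through the fixmap. The sizing is conventionally of this shape (the frame count of 4 is illustrative, not taken from this patch):

	#define NR_GRANT_FRAMES  4 /* illustrative value */
	#define NR_GRANT_ENTRIES \
	        (NR_GRANT_FRAMES * PAGE_SIZE / sizeof(grant_entry_t))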
*/ - /* XXX SMH: yes it would :-( */ -#ifdef CONFIG_XEN_BLKDEV_FRONTEND - extern void blkdev_suspend(void); - extern void blkdev_resume(void); -#else -#define blkdev_suspend() do{}while(0) -#define blkdev_resume() do{}while(0) -#endif - -#ifdef CONFIG_XEN_NETDEV_FRONTEND - extern void netif_suspend(void); - extern void netif_resume(void); -#else -#define netif_suspend() do{}while(0) -#define netif_resume() do{}while(0) -#endif - -#ifdef CONFIG_XEN_USB_FRONTEND - extern void usbif_resume(); -#else -#define usbif_resume() do{}while(0) -#endif - -#ifdef CONFIG_XEN_BLKDEV_GRANT - extern int gnttab_suspend(void); - extern int gnttab_resume(void); -#else -#define gnttab_suspend() do{}while(0) -#define gnttab_resume() do{}while(0) -#endif - - extern void time_suspend(void); - extern void time_resume(void); - extern unsigned long max_pfn; - extern unsigned int *pfn_to_mfn_frame_list; - - suspend_record = (suspend_record_t *)__get_free_page(GFP_KERNEL); - if ( suspend_record == NULL ) - goto out; - - suspend_record->nr_pfns = max_pfn; /* final number of pfns */ - - __cli(); - -#ifdef __i386__ - mm_pin_all(); - kmem_cache_shrink(pgd_cache); -#endif - - netif_suspend(); - - blkdev_suspend(); - - time_suspend(); - - ctrl_if_suspend(); - - irq_suspend(); - - gnttab_suspend(); - - HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page; - clear_fixmap(FIX_SHARED_INFO); - - memcpy(&suspend_record->resume_info, &xen_start_info, - sizeof(xen_start_info)); - - HYPERVISOR_suspend(virt_to_machine(suspend_record) >> PAGE_SHIFT); - - shutting_down = -1; - - memcpy(&xen_start_info, &suspend_record->resume_info, - sizeof(xen_start_info)); - - set_fixmap(FIX_SHARED_INFO, xen_start_info.shared_info); - - HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); - - memset(empty_zero_page, 0, PAGE_SIZE); - - for ( i=0, j=0; i < max_pfn; i+=(PAGE_SIZE/sizeof(unsigned long)), j++ ) - { - pfn_to_mfn_frame_list[j] = - virt_to_machine(&phys_to_machine_mapping[i]) >> PAGE_SHIFT; - } - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list = - virt_to_machine(pfn_to_mfn_frame_list) >> PAGE_SHIFT; - - gnttab_resume(); - - irq_resume(); - - ctrl_if_resume(); - - time_resume(); - - blkdev_resume(); - - netif_resume(); - - usbif_resume(); - - __sti(); - - out: - if ( suspend_record != NULL ) - free_page((unsigned long)suspend_record); -} - -static int shutdown_process(void *__unused) -{ - static char *envp[] = { "HOME=/", "TERM=linux", - "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL }; - static char *restart_argv[] = { "/sbin/shutdown", "-r", "now", NULL }; - static char *poweroff_argv[] = { "/sbin/halt", "-p", NULL }; - - extern asmlinkage long sys_reboot(int magic1, int magic2, - unsigned int cmd, void *arg); - - daemonize( -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - "shutdown" -#endif - ); - - switch ( shutting_down ) - { - case CMSG_SHUTDOWN_POWEROFF: - if ( execve("/sbin/halt", poweroff_argv, envp) < 0 ) - { - sys_reboot(LINUX_REBOOT_MAGIC1, - LINUX_REBOOT_MAGIC2, - LINUX_REBOOT_CMD_POWER_OFF, - NULL); - } - break; - - case CMSG_SHUTDOWN_REBOOT: - if ( execve("/sbin/shutdown", restart_argv, envp) < 0 ) - { - sys_reboot(LINUX_REBOOT_MAGIC1, - LINUX_REBOOT_MAGIC2, - LINUX_REBOOT_CMD_RESTART, - NULL); - } - break; - } - - shutting_down = -1; /* could try again */ - - return 0; -} - -static void __shutdown_handler(void *unused) -{ - int err; - - if ( shutting_down != CMSG_SHUTDOWN_SUSPEND ) - { - err = kernel_thread(shutdown_process, NULL, CLONE_FS | CLONE_FILES); - if ( err < 0 ) - printk(KERN_ALERT "Error 
creating shutdown process!\n"); - } - else - { - __do_suspend(); - } -} - -static void shutdown_handler(ctrl_msg_t *msg, unsigned long id) -{ - static DECLARE_WORK(shutdown_work, __shutdown_handler, NULL); - - if ( msg->subtype == CMSG_SHUTDOWN_SYSRQ ) - { - int sysrq = ((shutdown_sysrq_t *)&msg->msg[0])->key; - -#ifdef CONFIG_MAGIC_SYSRQ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - handle_sysrq(sysrq, NULL, NULL); -#else - handle_sysrq(sysrq, NULL, NULL, NULL); -#endif -#endif - } - else if ( (shutting_down == -1) && - ((msg->subtype == CMSG_SHUTDOWN_POWEROFF) || - (msg->subtype == CMSG_SHUTDOWN_REBOOT) || - (msg->subtype == CMSG_SHUTDOWN_SUSPEND)) ) - { - shutting_down = msg->subtype; - schedule_work(&shutdown_work); - } - else - { - printk("Ignore spurious shutdown request\n"); - } - - ctrl_if_send_response(msg); -} - -static int __init setup_shutdown_event(void) -{ - ctrl_if_register_receiver(CMSG_SHUTDOWN, shutdown_handler, 0); - return 0; -} - -__initcall(setup_shutdown_event); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/kernel/skbuff.c --- a/linux-2.6.11-xen-sparse/arch/xen/kernel/skbuff.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,47 +0,0 @@ - -#include <linux/config.h> -#include <linux/module.h> -#include <linux/version.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/netdevice.h> -#include <linux/inetdevice.h> -#include <linux/etherdevice.h> -#include <linux/skbuff.h> -#include <linux/init.h> -#include <asm/io.h> -#include <asm/page.h> - -EXPORT_SYMBOL(__dev_alloc_skb); - -/* Referenced in netback.c. */ -/*static*/ kmem_cache_t *skbuff_cachep; - -/* Size must be cacheline-aligned (alloc_skb uses SKB_DATA_ALIGN). */ -#define XEN_SKB_SIZE \ - ((PAGE_SIZE - sizeof(struct skb_shared_info)) & ~(SMP_CACHE_BYTES - 1)) - -struct sk_buff *__dev_alloc_skb(unsigned int length, int gfp_mask) -{ - struct sk_buff *skb; - skb = alloc_skb_from_cache(skbuff_cachep, length + 16, gfp_mask); - if ( likely(skb != NULL) ) - skb_reserve(skb, 16); - return skb; -} - -static void skbuff_ctor(void *buf, kmem_cache_t *cachep, unsigned long unused) -{ - scrub_pages(buf, 1); -} - -static int __init skbuff_init(void) -{ - skbuff_cachep = kmem_cache_create( - "xen-skb", PAGE_SIZE, PAGE_SIZE, 0, skbuff_ctor, NULL); - return 0; -} -__initcall(skbuff_init); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/kernel/smp.c --- a/linux-2.6.11-xen-sparse/arch/xen/kernel/smp.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,16 +0,0 @@ -/* Copyright (C) 2004, Christian Limpach */ - -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/threads.h> - -/* - * the frequency of the profiling timer can be changed - * by writing a multiplier value into /proc/profile. 
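The 16 bytes reserved by __dev_alloc_skb() above follow the usual dev_alloc_skb() convention: 16 bytes of headroom so that after a 14-byte ethernet header the IP header lands aligned. Callers use it like any other skb allocator; an illustrative receive-path fragment:

	struct sk_buff *skb = __dev_alloc_skb(ETH_FRAME_LEN, GFP_ATOMIC);
	if (skb == NULL)
		return -ENOMEM;
	/* skb->data now starts 16 bytes into the page-sized buffer */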
- */ -int setup_profiling_timer(unsigned int multiplier) -{ - printk("setup_profiling_timer\n"); - - return 0; -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/kernel/xen_proc.c --- a/linux-2.6.11-xen-sparse/arch/xen/kernel/xen_proc.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,18 +0,0 @@ - -#include <linux/config.h> -#include <linux/proc_fs.h> - -static struct proc_dir_entry *xen_base; - -struct proc_dir_entry *create_xen_proc_entry(const char *name, mode_t mode) -{ - if ( xen_base == NULL ) - if ( (xen_base = proc_mkdir("xen", &proc_root)) == NULL ) - panic("Couldn't create /proc/xen"); - return create_proc_entry(name, mode, xen_base); -} - -void remove_xen_proc_entry(const char *name) -{ - remove_proc_entry(name, xen_base); -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/Kconfig --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/Kconfig Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,455 +0,0 @@ -# -# For a description of the syntax of this configuration file, -# see Documentation/kbuild/kconfig-language.txt. -# -# Note: ISA is disabled and will hopefully never be enabled. -# If you managed to buy an ISA x86-64 box you'll have to fix all the -# ISA drivers you need yourself. -# - -menu "X86_64 processor configuration" - -config XENARCH - string - default x86_64 - -config X86_64 - bool - default y - help - Port to the x86-64 architecture. x86-64 is a 64-bit extension to the - classical 32-bit x86 architecture. For details see - <http://www.x86-64.org/>. - -config X86 - bool - default y - -config 64BIT - def_bool y - -config MMU - bool - default y - -config ISA - bool - -config SBUS - bool - -config RWSEM_GENERIC_SPINLOCK - bool - default y - -config RWSEM_XCHGADD_ALGORITHM - bool - -config GENERIC_CALIBRATE_DELAY - bool - default y - -config X86_CMPXCHG - bool - default y - -config EARLY_PRINTK - bool "Early Printk" - default n - help - Write kernel log output directly into the VGA buffer or to a serial - port. - - This is useful for kernel debugging when your machine crashes very - early before the console code is initialized. For normal operation - it is not recommended because it looks ugly and doesn't cooperate - with klogd/syslogd or the X server. You should normally N here, - unless you want to debug such a crash. - -config HPET_TIMER - bool - default n - help - Use the IA-PC HPET (High Precision Event Timer) to manage - time in preference to the PIT and RTC, if a HPET is - present. The HPET provides a stable time base on SMP - systems, unlike the RTC, but it is more expensive to access, - as it is off-chip. You can find the HPET spec at - <http://www.intel.com/labs/platcomp/hpet/hpetspec.htm>. - - If unsure, say Y. - -config HPET_EMULATE_RTC - bool "Provide RTC interrupt" - depends on HPET_TIMER && RTC=y - -config GENERIC_ISA_DMA - bool - default y - -config GENERIC_IOMAP - bool - default y - -#source "init/Kconfig" - - -menu "Processor type and features" - -choice - prompt "Processor family" - default MK8 - -#config MK8 -# bool "AMD-Opteron/Athlon64" -# help -# Optimize for AMD Opteron/Athlon64/Hammer/K8 CPUs. - -config MPSC - bool "Intel x86-64" - help - Optimize for Intel IA32 with 64bit extension CPUs - (Prescott/Nocona/Potomac) - -config GENERIC_CPU - bool "Generic-x86-64" - help - Generic x86-64 CPU. 
- -endchoice - -# -# Define implied options from the CPU selection here -# -config X86_L1_CACHE_BYTES - int - default "128" if GENERIC_CPU || MPSC - default "64" if MK8 - -config X86_L1_CACHE_SHIFT - int - default "7" if GENERIC_CPU || MPSC - default "6" if MK8 - -config X86_TSC - bool - default n - -config X86_GOOD_APIC - bool - default y - -config X86_IO_APIC - bool - default XEN_PRIVILEGED_GUEST - -config X86_LOCAL_APIC - bool - default XEN_PRIVILEGED_GUEST - -config MICROCODE - tristate "/dev/cpu/microcode - Intel CPU microcode support" - ---help--- - If you say Y here the 'File systems' section, you will be - able to update the microcode on Intel processors. You will - obviously need the actual microcode binary data itself which is - not shipped with the Linux kernel. - - For latest news and information on obtaining all the required - ingredients for this driver, check: - <http://www.urbanmyth.org/microcode/>. - - To compile this driver as a module, choose M here: the - module will be called microcode. - If you use modprobe or kmod you may also want to add the line - 'alias char-major-10-184 microcode' to your /etc/modules.conf file. - -config X86_MSR - tristate "/dev/cpu/*/msr - Model-specific register support" - help - This device gives privileged processes access to the x86 - Model-Specific Registers (MSRs). It is a character device with - major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr. - MSR accesses are directed to a specific CPU on multi-processor - systems. - -config X86_CPUID - tristate "/dev/cpu/*/cpuid - CPU information support" - help - This device gives processes access to the x86 CPUID instruction to - be executed on a specific processor. It is a character device - with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to - /dev/cpu/31/cpuid. - -# disable it for opteron optimized builds because it pulls in ACPI_BOOT -config X86_HT - bool - depends on SMP && !MK8 - default y - -config MATH_EMULATION - bool - -config MCA - bool - -config EISA - bool - -config MTRR - bool "MTRR (Memory Type Range Register) support" - ---help--- - On Intel P6 family processors (Pentium Pro, Pentium II and later) - the Memory Type Range Registers (MTRRs) may be used to control - processor access to memory ranges. This is most useful if you have - a video (VGA) card on a PCI or AGP bus. Enabling write-combining - allows bus write transfers to be combined into a larger transfer - before bursting over the PCI/AGP bus. This can increase performance - of image write operations 2.5 times or more. Saying Y here creates a - /proc/mtrr file which may be used to manipulate your processor's - MTRRs. Typically the X server should use this. - - This code has a reasonably generic interface so that similar - control registers on other processors can be easily supported - as well. - - Saying Y here also fixes a problem with buggy SMP BIOSes which only - set the MTRRs for the boot CPU and not for the secondary CPUs. This - can lead to all sorts of problems, so it's good to say Y here. - - Just say Y here, all x86-64 machines support MTRRs. - - See <file:Documentation/mtrr.txt> for more information. - -config SMP - bool "Symmetric multi-processing support" - ---help--- - This enables support for systems with more than one CPU. If you have - a system with only one CPU, like most personal computers, say N. If - you have a system with more than one CPU, say Y. 
- - If you say N here, the kernel will run on single and multiprocessor - machines, but will use only one CPU of a multiprocessor machine. If - you say Y here, the kernel will run on many, but not all, - singleprocessor machines. On a singleprocessor machine, the kernel - will run faster if you say N here. - - If you don't know what to do here, say N. - -config PREEMPT - bool "Preemptible Kernel" - ---help--- - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. On contrary it may also break your drivers and add - priority inheritance problems to your system. Don't select it if - you rely on a stable system or have slightly obscure hardware. - It's also not very well tested on x86-64 currently. - You have been warned. - - Say Y here if you are feeling brave and building a kernel for a - desktop, embedded or real-time system. Say N if you are unsure. - -config SCHED_SMT - bool "SMT (Hyperthreading) scheduler support" - depends on SMP - default off - help - SMT scheduler support improves the CPU scheduler's decision making - when dealing with Intel Pentium 4 chips with HyperThreading at a - cost of slightly increased overhead in some places. If unsure say - N here. - -config K8_NUMA - bool "K8 NUMA support" - select NUMA - depends on SMP - help - Enable NUMA (Non Unified Memory Architecture) support for - AMD Opteron Multiprocessor systems. The kernel will try to allocate - memory used by a CPU on the local memory controller of the CPU - and add some more NUMA awareness to the kernel. - This code is recommended on all multiprocessor Opteron systems - and normally doesn't hurt on others. - -config NUMA_EMU - bool "NUMA emulation support" - select NUMA - depends on SMP - help - Enable NUMA emulation. A flat machine will be split - into virtual nodes when booted with "numa=fake=N", where N is the - number of nodes. This is only useful for debugging. - -config DISCONTIGMEM - bool - depends on NUMA - default y - -config NUMA - bool - default n - -config HAVE_DEC_LOCK - bool - depends on SMP - default y - -# actually 64 maximum, but you need to fix the APIC code first -# to use clustered mode or whatever your big iron needs -config NR_CPUS - int "Maximum number of CPUs (2-8)" - range 2 8 - depends on SMP - default "8" - help - This allows you to specify the maximum number of CPUs which this - kernel will support. The maximum supported value is 32 and the - minimum value which makes sense is 2. - - This is purely to save memory - each supported CPU requires - memory in the static kernel configuration. - -config GART_IOMMU - bool "IOMMU support" - depends on PCI - help - Support the K8 IOMMU. Needed to run systems with more than 4GB of memory - properly with 32-bit PCI devices that do not support DAC (Double Address - Cycle). The IOMMU can be turned off at runtime with the iommu=off parameter. - Normally the kernel will take the right choice by itself. - If unsure, say Y. - -# need this always enabled with GART_IOMMU for the VIA workaround -config SWIOTLB - bool - depends on GART_IOMMU - default y - -config DUMMY_IOMMU - bool - depends on !GART_IOMMU && !SWIOTLB - default y - help - Don't use IOMMU code. This will cause problems when you have more than 4GB - of memory and any 32-bit devices. Don't turn on unless you know what you - are doing. 
- -config X86_MCE - bool "Machine check support" if EMBEDDED - default n - help - Include a machine check error handler to report hardware errors. - This version will require the mcelog utility to decode some - machine check error logs. See - ftp://ftp.x86-64.org/pub/linux/tools/mcelog - -endmenu - -# -# Use the generic interrupt handling code in kernel/irq/: -# -config GENERIC_HARDIRQS - bool - default y - -config GENERIC_IRQ_PROBE - bool - default y - -menu "Power management options" - -source kernel/power/Kconfig - -source "arch/x86_64/kernel/cpufreq/Kconfig" - -endmenu - -menu "Bus options (PCI etc.)" - -config PCI - bool "PCI support" - -# x86-64 doesn't support PCI BIOS access from long mode so always go direct. -config PCI_DIRECT - bool - depends on PCI - default y - -config PCI_MMCONFIG - bool "Support mmconfig PCI config space access" - depends on PCI - select ACPI_BOOT - -config UNORDERED_IO - bool "Unordered IO mapping access" - depends on EXPERIMENTAL - help - Use unordered stores to access IO memory mappings in device drivers. - Still very experimental. When a driver works on IA64/ppc64/pa-risc it should - work with this option, but it makes the drivers behave differently - from i386. Requires that the driver writer used memory barriers - properly. - -#source "drivers/pci/Kconfig" - -#source "drivers/pcmcia/Kconfig" - -#source "drivers/pci/hotplug/Kconfig" - -endmenu - - -menu "Executable file formats / Emulations" - -# source "fs/Kconfig.binfmt" - -config IA32_EMULATION - bool "IA32 Emulation" - help - Include code to run 32-bit programs under a 64-bit kernel. You should likely - turn this on, unless you're 100% sure that you don't have any 32-bit programs - left. - -config IA32_AOUT - bool "IA32 a.out support" - depends on IA32_EMULATION - help - Support old a.out binaries in the 32bit emulation. - -config COMPAT - bool - depends on IA32_EMULATION - default y - -config SYSVIPC_COMPAT - bool - depends on COMPAT && SYSVIPC - default y - -config UID16 - bool - depends on IA32_EMULATION - default y - -endmenu - -# source drivers/Kconfig - -# source "drivers/firmware/Kconfig" - -# source fs/Kconfig - -#source "arch/x86_64/oprofile/Kconfig" - -#source "arch/x86_64/Kconfig.debug" - -# source "security/Kconfig" - -# source "crypto/Kconfig" - -# source "lib/Kconfig" - -endmenu diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/Makefile --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,92 +0,0 @@ -# -# x86_64/Makefile -# -# This file is included by the global makefile so that you can add your own -# architecture-specific flags and dependencies. Remember to do have actions -# for "archclean" and "archdep" for cleaning up and making dependencies for -# this architecture -# -# This file is subject to the terms and conditions of the GNU General Public -# License. See the file "COPYING" in the main directory of this archive -# for more details. -# -# Copyright (C) 1994 by Linus Torvalds -# -# 19990713 Artur Skawina <skawina@xxxxxxxxxxxxx> -# Added '-march' and '-mpreferred-stack-boundary' support -# 20000913 Pavel Machek <pavel@xxxxxxx> -# Converted for x86_64 architecture -# 20010105 Andi Kleen, add IA32 compiler. -# ....and later removed it again.... -# 20050205 Jun Nakajima <jun.nakajima@xxxxxxxxx> -# Modified for Xen -# -# $Id: Makefile,v 1.31 2002/03/22 15:56:07 ak Exp $ - -# -# early bootup linking needs 32bit. You can either use real 32bit tools -# here or 64bit tools in 32bit mode. 
-# -XENARCH := $(subst ",,$(CONFIG_XENARCH)) - -IA32_CC := $(CC) $(CPPFLAGS) -m32 -O2 -fomit-frame-pointer -IA32_LD := $(LD) -m elf_i386 -IA32_AS := $(CC) $(AFLAGS) -m32 -Wa,--32 -traditional -c -IA32_OBJCOPY := $(CROSS_COMPILE)objcopy -IA32_CPP := $(CROSS_COMPILE)gcc -m32 -E -export IA32_CC IA32_LD IA32_AS IA32_OBJCOPY IA32_CPP - - -LDFLAGS := -m elf_x86_64 -#LDFLAGS_vmlinux := -e stext - -CHECKFLAGS += -D__x86_64__ -m64 - -cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) -cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) -CFLAGS += $(cflags-y) - -CFLAGS += -mno-red-zone -CFLAGS += -mcmodel=kernel -CFLAGS += -pipe -# this makes reading assembly source easier, but produces worse code -# actually it makes the kernel smaller too. -CFLAGS += -fno-reorder-blocks -CFLAGS += -Wno-sign-compare -ifneq ($(CONFIG_DEBUG_INFO),y) -CFLAGS += -fno-asynchronous-unwind-tables -# -fweb shrinks the kernel a bit, but the difference is very small -# it also messes up debugging, so don't use it for now. -#CFLAGS += $(call cc-option,-fweb) -endif -# -funit-at-a-time shrinks the kernel .text considerably -# unfortunately it makes reading oopses harder. -CFLAGS += $(call cc-option,-funit-at-a-time,) - -head-y := arch/xen/x86_64/kernel/head.o arch/xen/x86_64/kernel/head64.o arch/xen/x86_64/kernel/init_task.o - -libs-y += arch/x86_64/lib/ -core-y += arch/xen/x86_64/kernel/ arch/xen/x86_64/mm/ -core-$(CONFIG_IA32_EMULATION) += arch/xen/x86_64/ia32/ -drivers-$(CONFIG_PCI) += arch/xen/x86_64/pci/ -drivers-$(CONFIG_OPROFILE) += arch/x86_64/oprofile/ - -# for clean -obj- += kernel/ mm/ pci/ - -xenflags-y += -Iinclude/asm-xen/asm-x86_64/mach-xen - -CFLAGS += $(xenflags-y) -AFLAGS += $(xenflags-y) - -prepare: include/asm-$(XENARCH)/asm_offset.h -CLEAN_FILES += include/asm-$(XENARCH)/asm_offset.h - -arch/$(XENARCH)/kernel/asm-offsets.s: include/asm include/.asm-ignore \ - include/linux/version.h include/config/MARKER - - -include/asm-$(XENARCH)/asm_offset.h: arch/xen/x86_64/kernel/asm-offsets.s - $(call filechk,gen-asm-offsets) - ln -fsn asm_offset.h include/asm-$(XENARCH)/offset.h - diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/ia32/Makefile --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/ia32/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,58 +0,0 @@ -# -# Makefile for the ia32 kernel emulation subsystem. 
-# -XENARCH := $(subst ",,$(CONFIG_XENARCH)) - -CFLAGS += -Iarch/$(XENARCH)/kernel - -obj-$(CONFIG_IA32_EMULATION) := ia32entry.o syscall32.o - -c-obj-$(CONFIG_IA32_EMULATION) := sys_ia32.o ia32_ioctl.o \ - ia32_signal.o tls32.o \ - ia32_binfmt.o fpu32.o ptrace32.o - -s-obj-y := - -sysv-$(CONFIG_SYSVIPC) := ipc32.o -c-obj-$(CONFIG_IA32_EMULATION) += $(sysv-y) - -c-obj-$(CONFIG_IA32_AOUT) += ia32_aout.o - -$(obj)/syscall32.o: $(src)/syscall32.c \ - $(foreach F,int80 sysenter syscall,$(obj)/vsyscall-$F.so) - -# Teach kbuild about targets -targets := $(foreach F,int80 sysenter syscall,vsyscall-$F.o vsyscall-$F.so) - -# The DSO images are built using a special linker script -quiet_cmd_syscall = SYSCALL $@ - cmd_syscall = $(CC) -m32 -nostdlib -shared -s \ - -Wl,-soname=linux-gate.so.1 -o $@ \ - -Wl,-T,$(filter-out FORCE,$^) - - -$(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so $(obj)/vsyscall-syscall.so: \ -$(obj)/vsyscall-%.so: $(obj)/vsyscall.lds $(obj)/vsyscall-%.o FORCE - $(call if_changed,syscall) - -AFLAGS_vsyscall-int80.o = -m32 -I$(obj) -AFLAGS_vsyscall-sysenter.o = -m32 -AFLAGS_vsyscall-syscall.o = -m32 -CFLAGS_ia32_ioctl.o += -Ifs/ - -s-link := vsyscall-syscall.o vsyscall-sysenter.o vsyscall-sigreturn.o - -$(obj)/vsyscall.lds: - @ln -fsn $(srctree)/arch/x86_64/ia32/$(notdir $@) $@ - -$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)) $(patsubst %.o,$(obj)/%.S,$(s-obj-y) $(s-link)): - @ln -fsn $(srctree)/arch/x86_64/ia32/$(notdir $@) $@ - -$(obj)/vsyscall-int80.S: $(obj)/vsyscall-sigreturn.S -$(obj)/vsyscall-sysenter.S: $(obj)/vsyscall-sigreturn.S -$(obj)/vsyscall-syscall.S: $(obj)/vsyscall-sigreturn.S - -obj-y += $(c-obj-y) $(s-obj-y) - -clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link)) -clean-files += $(patsubst %.o,%.S,$(s-obj-y) $(s-obj-) $(s-link)) diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/ia32/ia32entry.S --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/ia32/ia32entry.S Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,629 +0,0 @@ -/* - * Compatibility mode system call entry point for x86-64. - * - * Copyright 2000-2002 Andi Kleen, SuSE Labs. - */ - -#include <asm/dwarf2.h> -#include <asm/calling.h> -#include <asm/offset.h> -#include <asm/current.h> -#include <asm/errno.h> -#include <asm/ia32_unistd.h> -#include <asm/thread_info.h> -#include <asm/segment.h> -#include <asm/vsyscall32.h> -#include <linux/linkage.h> - -#define __XEN_X86_64 1 - - .macro IA32_ARG_FIXUP noebp=0 - movl %edi,%r8d - .if \noebp - .else - movl %ebp,%r9d - .endif - xchg %ecx,%esi - movl %ebx,%edi - movl %edx,%edx /* zero extension */ - .endm - - /* clobbers %eax */ - .macro CLEAR_RREGS - xorl %eax,%eax - movq %rax,R11(%rsp) - movq %rax,R10(%rsp) - movq %rax,R9(%rsp) - movq %rax,R8(%rsp) - .endm - -#if defined (__XEN_X86_64) -#include "../kernel/xen_entry.S" - -#define __swapgs -#define __cli -#define __sti -#else -/* - * Use the native instructions - */ -#define __swapgs swapgs -#define __cli cli -#define __sti sti -#endif - -/* - * 32bit SYSENTER instruction entry. - * - * Arguments: - * %eax System call number. - * %ebx Arg1 - * %ecx Arg2 - * %edx Arg3 - * %esi Arg4 - * %edi Arg5 - * %ebp user stack - * 0(%ebp) Arg6 - * - * Interrupts off. - * - * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. Set up a complete hardware stack frame to share code - * with the int 0x80 path. 
- */ -ENTRY(ia32_sysenter_target) - CFI_STARTPROC - __swapgs - movq %gs:pda_kernelstack, %rsp - addq $(PDA_STACKOFFSET),%rsp - XEN_UNBLOCK_EVENTS(%r11) - __sti - movl %ebp,%ebp /* zero extension */ - pushq $__USER32_DS - pushq %rbp - pushfq - movl $VSYSCALL32_SYSEXIT, %r10d - pushq $__USER32_CS - movl %eax, %eax - pushq %r10 - pushq %rax - cld - SAVE_ARGS 0,0,1 - /* no need to do an access_ok check here because rbp has been - 32bit zero extended */ -1: movl (%rbp),%r9d - .section __ex_table,"a" - .quad 1b,ia32_badarg - .previous - GET_THREAD_INFO(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10) - jnz sysenter_tracesys -sysenter_do_call: - cmpl $(IA32_NR_syscalls),%eax - jae ia32_badsys - IA32_ARG_FIXUP 1 - call *ia32_sys_call_table(,%rax,8) - movq %rax,RAX-ARGOFFSET(%rsp) - GET_THREAD_INFO(%r10) - XEN_BLOCK_EVENTS(%r11) - __cli - testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10) - jnz int_ret_from_sys_call - /* clear IF, that popfq doesn't enable interrupts early */ - andl $~0x200,EFLAGS-R11(%rsp) - RESTORE_ARGS 1,24,1,1,1,1 - popfq - popq %rcx /* User %esp */ - movl $VSYSCALL32_SYSEXIT,%edx /* User %eip */ - __swapgs - XEN_UNBLOCK_EVENTS(%r11) - __sti /* sti only takes effect after the next instruction */ - /* sysexit */ - .byte 0xf, 0x35 /* TBD */ - -sysenter_tracesys: - SAVE_REST - CLEAR_RREGS - movq $-ENOSYS,RAX(%rsp) /* really needed? */ - movq %rsp,%rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ - RESTORE_REST - movl %ebp, %ebp - /* no need to do an access_ok check here because rbp has been - 32bit zero extended */ -1: movl (%rbp),%r9d - .section __ex_table,"a" - .quad 1b,ia32_badarg - .previous - jmp sysenter_do_call - CFI_ENDPROC - -/* - * 32bit SYSCALL instruction entry. - * - * Arguments: - * %eax System call number. - * %ebx Arg1 - * %ecx return EIP - * %edx Arg3 - * %esi Arg4 - * %edi Arg5 - * %ebp Arg2 [note: not saved in the stack frame, should not be touched] - * %esp user stack - * 0(%esp) Arg6 - * - * Interrupts off. - * - * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. Set up a complete hardware stack frame to share code - * with the int 0x80 path. 
- */ -ENTRY(ia32_cstar_target) - CFI_STARTPROC - __swapgs - movl %esp,%r8d - movq %gs:pda_kernelstack,%rsp - XEN_UNBLOCK_EVENTS(%r11) - __sti - SAVE_ARGS 8,1,1 - movl %eax,%eax /* zero extension */ - movq %rax,ORIG_RAX-ARGOFFSET(%rsp) - movq %rcx,RIP-ARGOFFSET(%rsp) - movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */ - movl %ebp,%ecx - movq $__USER32_CS,CS-ARGOFFSET(%rsp) - movq $__USER32_DS,SS-ARGOFFSET(%rsp) - movq %r11,EFLAGS-ARGOFFSET(%rsp) - movq %r8,RSP-ARGOFFSET(%rsp) - /* no need to do an access_ok check here because r8 has been - 32bit zero extended */ - /* hardware stack frame is complete now */ -1: movl (%r8),%r9d - .section __ex_table,"a" - .quad 1b,ia32_badarg - .previous - GET_THREAD_INFO(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10) - jnz cstar_tracesys -cstar_do_call: - cmpl $IA32_NR_syscalls,%eax - jae ia32_badsys - IA32_ARG_FIXUP 1 - call *ia32_sys_call_table(,%rax,8) - movq %rax,RAX-ARGOFFSET(%rsp) - GET_THREAD_INFO(%r10) - XEN_BLOCK_EVENTS(%r11) - __cli - testl $_TIF_ALLWORK_MASK,threadinfo_flags(%r10) - jnz int_ret_from_sys_call - RESTORE_ARGS 1,-ARG_SKIP,1,1,1 - movl RIP-ARGOFFSET(%rsp),%ecx - movl EFLAGS-ARGOFFSET(%rsp),%r11d - movl RSP-ARGOFFSET(%rsp),%esp - __swapgs - sysretl /* TBD */ - -cstar_tracesys: - SAVE_REST - CLEAR_RREGS - movq $-ENOSYS,RAX(%rsp) /* really needed? */ - movq %rsp,%rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ - RESTORE_REST - movl RSP-ARGOFFSET(%rsp), %r8d - /* no need to do an access_ok check here because r8 has been - 32bit zero extended */ -1: movl (%r8),%r9d - .section __ex_table,"a" - .quad 1b,ia32_badarg - .previous - jmp cstar_do_call - -ia32_badarg: - movq $-EFAULT,%rax - jmp ia32_sysret - CFI_ENDPROC - -/* - * Emulated IA32 system calls via int 0x80. - * - * Arguments: - * %eax System call number. - * %ebx Arg1 - * %ecx Arg2 - * %edx Arg3 - * %esi Arg4 - * %edi Arg5 - * %ebp Arg6 [note: not saved in the stack frame, should not be touched] - * - * Notes: - * Uses the same stack frame as the x86-64 version. - * All registers except %eax must be saved (but ptrace may violate that) - * Arguments are zero extended. For system calls that want sign extension and - * take long arguments a wrapper is needed. Most calls can just be called - * directly. - * Assumes it is only called from user space and entered with interrupts off. - */ - -ENTRY(ia32_syscall) - CFI_STARTPROC - __swapgs - XEN_UNBLOCK_EVENTS(%r11) - __sti - movq (%rsp),%rcx - movq 8(%rsp),%r11 - addq $0x10,%rsp /* skip rcx and r11 */ - movl %eax,%eax - pushq %rax - cld -/* 1: jmp 1b */ - /* note the registers are not zero extended to the sf. - this could be a problem. */ - SAVE_ARGS 0,0,1 - GET_THREAD_INFO(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10) - jnz ia32_tracesys -ia32_do_syscall: - cmpl $(IA32_NR_syscalls),%eax - jae ia32_badsys - IA32_ARG_FIXUP - call *ia32_sys_call_table(,%rax,8) # xxx: rip relative -ia32_sysret: - movq %rax,RAX-ARGOFFSET(%rsp) - jmp int_ret_from_sys_call - -ia32_tracesys: - SAVE_REST - movq $-ENOSYS,RAX(%rsp) /* really needed? 
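All three entry points above funnel into the same ia32_sys_call_table dispatch; the slowest but most general of them is int 0x80. From a 32-bit process the invocation is just the classic trap, e.g. as C inline assembly (illustrative only; 20 is the 32-bit __NR_getpid):

	static inline long int80_getpid(void)
	{
		long ret;
		/* eax carries the syscall number in, the return value out */
		asm volatile("int $0x80" : "=a" (ret) : "a" (20) : "memory");
		return ret;
	}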
*/ - movq %rsp,%rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ - RESTORE_REST - jmp ia32_do_syscall - -ia32_badsys: - movq $0,ORIG_RAX-ARGOFFSET(%rsp) - movq $-ENOSYS,RAX-ARGOFFSET(%rsp) - jmp int_ret_from_sys_call - -ni_syscall: - movq %rax,%rdi - jmp sys32_ni_syscall - -quiet_ni_syscall: - movq $-ENOSYS,%rax - ret - CFI_ENDPROC - - .macro PTREGSCALL label, func, arg - .globl \label -\label: - leaq \func(%rip),%rax - leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ - jmp ia32_ptregs_common - .endm - - PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn, %rdi - PTREGSCALL stub32_sigreturn, sys32_sigreturn, %rdi - PTREGSCALL stub32_sigaltstack, sys32_sigaltstack, %rdx - PTREGSCALL stub32_sigsuspend, sys32_sigsuspend, %rcx - PTREGSCALL stub32_execve, sys32_execve, %rcx - PTREGSCALL stub32_fork, sys_fork, %rdi - PTREGSCALL stub32_clone, sys32_clone, %rdx - PTREGSCALL stub32_vfork, sys_vfork, %rdi - PTREGSCALL stub32_iopl, sys_iopl, %rsi - PTREGSCALL stub32_rt_sigsuspend, sys_rt_sigsuspend, %rdx - -ENTRY(ia32_ptregs_common) - CFI_STARTPROC - popq %r11 - SAVE_REST - call *%rax - RESTORE_REST - jmp ia32_sysret /* misbalances the return cache */ - CFI_ENDPROC - - .data - .align 8 - .globl ia32_sys_call_table -ia32_sys_call_table: - .quad sys_restart_syscall - .quad sys_exit - .quad stub32_fork - .quad sys_read - .quad sys_write - .quad sys32_open /* 5 */ - .quad sys_close - .quad sys32_waitpid - .quad sys_creat - .quad sys_link - .quad sys_unlink /* 10 */ - .quad stub32_execve - .quad sys_chdir - .quad compat_sys_time - .quad sys_mknod - .quad sys_chmod /* 15 */ - .quad sys_lchown16 - .quad quiet_ni_syscall /* old break syscall holder */ - .quad sys_stat - .quad sys32_lseek - .quad sys_getpid /* 20 */ - .quad compat_sys_mount /* mount */ - .quad sys_oldumount /* old_umount */ - .quad sys_setuid16 - .quad sys_getuid16 - .quad compat_sys_stime /* stime */ /* 25 */ - .quad sys32_ptrace /* ptrace */ - .quad sys_alarm - .quad sys_fstat /* (old)fstat */ - .quad sys_pause - .quad compat_sys_utime /* 30 */ - .quad quiet_ni_syscall /* old stty syscall holder */ - .quad quiet_ni_syscall /* old gtty syscall holder */ - .quad sys_access - .quad sys_nice - .quad quiet_ni_syscall /* 35 */ /* old ftime syscall holder */ - .quad sys_sync - .quad sys32_kill - .quad sys_rename - .quad sys_mkdir - .quad sys_rmdir /* 40 */ - .quad sys_dup - .quad sys32_pipe - .quad compat_sys_times - .quad quiet_ni_syscall /* old prof syscall holder */ - .quad sys_brk /* 45 */ - .quad sys_setgid16 - .quad sys_getgid16 - .quad sys_signal - .quad sys_geteuid16 - .quad sys_getegid16 /* 50 */ - .quad sys_acct - .quad sys_umount /* new_umount */ - .quad quiet_ni_syscall /* old lock syscall holder */ - .quad compat_sys_ioctl - .quad compat_sys_fcntl64 /* 55 */ - .quad quiet_ni_syscall /* old mpx syscall holder */ - .quad sys_setpgid - .quad quiet_ni_syscall /* old ulimit syscall holder */ - .quad sys32_olduname - .quad sys_umask /* 60 */ - .quad sys_chroot - .quad sys32_ustat - .quad sys_dup2 - .quad sys_getppid - .quad sys_getpgrp /* 65 */ - .quad sys_setsid - .quad sys32_sigaction - .quad sys_sgetmask - .quad sys_ssetmask - .quad sys_setreuid16 /* 70 */ - .quad sys_setregid16 - .quad stub32_sigsuspend - .quad compat_sys_sigpending - .quad sys_sethostname - .quad compat_sys_setrlimit /* 75 */ - .quad compat_sys_old_getrlimit /* old_getrlimit */ - .quad compat_sys_getrusage - .quad sys32_gettimeofday - .quad sys32_settimeofday - .quad sys_getgroups16 
/* 80 */ - .quad sys_setgroups16 - .quad sys32_old_select - .quad sys_symlink - .quad sys_lstat - .quad sys_readlink /* 85 */ -#ifdef CONFIG_IA32_AOUT - .quad sys_uselib -#else - .quad quiet_ni_syscall -#endif - .quad sys_swapon - .quad sys_reboot - .quad compat_sys_old_readdir - .quad sys32_mmap /* 90 */ - .quad sys_munmap - .quad sys_truncate - .quad sys_ftruncate - .quad sys_fchmod - .quad sys_fchown16 /* 95 */ - .quad sys_getpriority - .quad sys_setpriority - .quad quiet_ni_syscall /* old profil syscall holder */ - .quad compat_sys_statfs - .quad compat_sys_fstatfs /* 100 */ - .quad sys_ioperm - .quad compat_sys_socketcall - .quad sys_syslog - .quad compat_sys_setitimer - .quad compat_sys_getitimer /* 105 */ - .quad compat_sys_newstat - .quad compat_sys_newlstat - .quad compat_sys_newfstat - .quad sys32_uname - .quad stub32_iopl /* 110 */ - .quad sys_vhangup - .quad quiet_ni_syscall /* old "idle" system call */ - .quad sys32_vm86_warning /* vm86old */ - .quad compat_sys_wait4 - .quad sys_swapoff /* 115 */ - .quad sys32_sysinfo - .quad sys32_ipc - .quad sys_fsync - .quad stub32_sigreturn - .quad stub32_clone /* 120 */ - .quad sys_setdomainname - .quad sys_uname - .quad sys_modify_ldt - .quad sys32_adjtimex - .quad sys32_mprotect /* 125 */ - .quad compat_sys_sigprocmask - .quad quiet_ni_syscall /* create_module */ - .quad sys_init_module - .quad sys_delete_module - .quad quiet_ni_syscall /* 130 get_kernel_syms */ - .quad sys_quotactl - .quad sys_getpgid - .quad sys_fchdir - .quad quiet_ni_syscall /* bdflush */ - .quad sys_sysfs /* 135 */ - .quad sys_personality - .quad quiet_ni_syscall /* for afs_syscall */ - .quad sys_setfsuid16 - .quad sys_setfsgid16 - .quad sys_llseek /* 140 */ - .quad compat_sys_getdents - .quad compat_sys_select - .quad sys_flock - .quad sys_msync - .quad compat_sys_readv /* 145 */ - .quad compat_sys_writev - .quad sys_getsid - .quad sys_fdatasync - .quad sys32_sysctl /* sysctl */ - .quad sys_mlock /* 150 */ - .quad sys_munlock - .quad sys_mlockall - .quad sys_munlockall - .quad sys_sched_setparam - .quad sys_sched_getparam /* 155 */ - .quad sys_sched_setscheduler - .quad sys_sched_getscheduler - .quad sys_sched_yield - .quad sys_sched_get_priority_max - .quad sys_sched_get_priority_min /* 160 */ - .quad sys_sched_rr_get_interval - .quad compat_sys_nanosleep - .quad sys_mremap - .quad sys_setresuid16 - .quad sys_getresuid16 /* 165 */ - .quad sys32_vm86_warning /* vm86 */ - .quad quiet_ni_syscall /* query_module */ - .quad sys_poll - .quad compat_sys_nfsservctl - .quad sys_setresgid16 /* 170 */ - .quad sys_getresgid16 - .quad sys_prctl - .quad stub32_rt_sigreturn - .quad sys32_rt_sigaction - .quad sys32_rt_sigprocmask /* 175 */ - .quad sys32_rt_sigpending - .quad compat_sys_rt_sigtimedwait - .quad sys32_rt_sigqueueinfo - .quad stub32_rt_sigsuspend - .quad sys32_pread /* 180 */ - .quad sys32_pwrite - .quad sys_chown16 - .quad sys_getcwd - .quad sys_capget - .quad sys_capset - .quad stub32_sigaltstack - .quad sys32_sendfile - .quad quiet_ni_syscall /* streams1 */ - .quad quiet_ni_syscall /* streams2 */ - .quad stub32_vfork /* 190 */ - .quad compat_sys_getrlimit - .quad sys32_mmap2 - .quad sys32_truncate64 - .quad sys32_ftruncate64 - .quad sys32_stat64 /* 195 */ - .quad sys32_lstat64 - .quad sys32_fstat64 - .quad sys_lchown - .quad sys_getuid - .quad sys_getgid /* 200 */ - .quad sys_geteuid - .quad sys_getegid - .quad sys_setreuid - .quad sys_setregid - .quad sys_getgroups /* 205 */ - .quad sys_setgroups - .quad sys_fchown - .quad sys_setresuid - .quad sys_getresuid - 
.quad sys_setresgid /* 210 */ - .quad sys_getresgid - .quad sys_chown - .quad sys_setuid - .quad sys_setgid - .quad sys_setfsuid /* 215 */ - .quad sys_setfsgid - .quad sys_pivot_root - .quad sys_mincore - .quad sys_madvise - .quad compat_sys_getdents64 /* 220 getdents64 */ - .quad compat_sys_fcntl64 - .quad quiet_ni_syscall /* tux */ - .quad quiet_ni_syscall /* security */ - .quad sys_gettid - .quad sys_readahead /* 225 */ - .quad sys_setxattr - .quad sys_lsetxattr - .quad sys_fsetxattr - .quad sys_getxattr - .quad sys_lgetxattr /* 230 */ - .quad sys_fgetxattr - .quad sys_listxattr - .quad sys_llistxattr - .quad sys_flistxattr - .quad sys_removexattr /* 235 */ - .quad sys_lremovexattr - .quad sys_fremovexattr - .quad sys_tkill - .quad sys_sendfile64 - .quad compat_sys_futex /* 240 */ - .quad compat_sys_sched_setaffinity - .quad compat_sys_sched_getaffinity - .quad sys32_set_thread_area - .quad sys32_get_thread_area - .quad compat_sys_io_setup /* 245 */ - .quad sys_io_destroy - .quad compat_sys_io_getevents - .quad compat_sys_io_submit - .quad sys_io_cancel - .quad sys_fadvise64 /* 250 */ - .quad quiet_ni_syscall /* free_huge_pages */ - .quad sys_exit_group - .quad sys32_lookup_dcookie - .quad sys_epoll_create - .quad sys_epoll_ctl /* 255 */ - .quad sys_epoll_wait - .quad sys_remap_file_pages - .quad sys_set_tid_address - .quad sys32_timer_create - .quad compat_sys_timer_settime /* 260 */ - .quad compat_sys_timer_gettime - .quad sys_timer_getoverrun - .quad sys_timer_delete - .quad compat_sys_clock_settime - .quad compat_sys_clock_gettime /* 265 */ - .quad compat_sys_clock_getres - .quad compat_sys_clock_nanosleep - .quad compat_sys_statfs64 - .quad compat_sys_fstatfs64 - .quad sys_tgkill /* 270 */ - .quad compat_sys_utimes - .quad sys32_fadvise64_64 - .quad quiet_ni_syscall /* sys_vserver */ - .quad sys_mbind - .quad compat_sys_get_mempolicy /* 275 */ - .quad sys_set_mempolicy - .quad compat_sys_mq_open - .quad sys_mq_unlink - .quad compat_sys_mq_timedsend - .quad compat_sys_mq_timedreceive /* 280 */ - .quad compat_sys_mq_notify - .quad compat_sys_mq_getsetattr - .quad quiet_ni_syscall /* reserved for kexec */ - .quad sys32_waitid - .quad quiet_ni_syscall /* sys_altroot */ - .quad sys_add_key - .quad sys_request_key - .quad sys_keyctl - /* don't forget to change IA32_NR_syscalls */ -ia32_syscall_end: - .rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8 - .quad ni_syscall - .endr diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/ia32/syscall32.c --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/ia32/syscall32.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,143 +0,0 @@ -/* Copyright 2002,2003 Andi Kleen, SuSE Labs */ - -/* vsyscall handling for 32bit processes. Map a stub page into it - on demand because 32bit cannot reach the kernel's fixmaps */ - -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/gfp.h> -#include <linux/init.h> -#include <linux/stringify.h> -#include <asm/proto.h> -#include <asm/tlbflush.h> -#include <asm/ia32_unistd.h> - -#define USE_INT80 - -#ifdef USE_INT80 -/* 32bit VDSOs mapped into user space. 
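The ia32_sys_call_table deleted above is a flat array of 8-byte handler
pointers indexed by the 32-bit syscall number, and the trailing .rept fills
every remaining slot up to IA32_NR_syscalls with the ENOSYS stub, which is
what the "don't forget to change IA32_NR_syscalls" comment polices. A minimal
user-space C sketch of that dispatch scheme, with an illustrative table size
and a toy getpid handler (neither is the kernel's):

    #include <stdio.h>
    #include <errno.h>

    #define IA32_NR_SYSCALLS 294            /* illustrative size only */

    typedef long (*syscall_fn)(long, long, long);

    static long sys_ni_syscall(long a, long b, long c) { return -ENOSYS; }
    static long sys_getpid_demo(long a, long b, long c) { return 1234; }

    static syscall_fn table[IA32_NR_SYSCALLS];

    static long do_syscall(unsigned int nr, long a, long b, long c)
    {
        if (nr >= IA32_NR_SYSCALLS)         /* out-of-range number */
            return -ENOSYS;
        return table[nr](a, b, c);          /* indexed indirect call */
    }

    int main(void)
    {
        for (int i = 0; i < IA32_NR_SYSCALLS; i++)
            table[i] = sys_ni_syscall;      /* the .rept padding */
        table[20] = sys_getpid_demo;        /* slot 20 is getpid above */
        printf("getpid -> %ld\n", do_syscall(20, 0, 0, 0));
        printf("unused -> %ld\n", do_syscall(293, 0, 0, 0));
        return 0;
    }

The out-of-range branch corresponds to the ia32_badsys path above, which
stuffs -ENOSYS into the saved RAX slot instead of calling through the table.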
*/ -asm(".section \".init.data\",\"aw\"\n" - "syscall32_int80:\n" - ".incbin \"arch/xen/x86_64/ia32/vsyscall-int80.so\"\n" - "syscall32_int80_end:\n" - "syscall32_syscall:\n" - ".incbin \"arch/xen/x86_64/ia32/vsyscall-syscall.so\"\n" - "syscall32_syscall_end:\n" - "syscall32_sysenter:\n" - ".incbin \"arch/xen/x86_64/ia32/vsyscall-sysenter.so\"\n" - "syscall32_sysenter_end:\n" - ".previous"); - -extern unsigned char syscall32_int80[], syscall32_int80_end[]; -#else -/* 32bit VDSOs mapped into user space. */ -asm(".section \".init.data\",\"aw\"\n" - "syscall32_syscall:\n" - ".incbin \"arch/xen/x86_64/ia32/vsyscall-syscall.so\"\n" - "syscall32_syscall_end:\n" - "syscall32_sysenter:\n" - ".incbin \"arch/xen/x86_64/ia32/vsyscall-sysenter.so\"\n" - "syscall32_sysenter_end:\n" - ".previous"); - -static int use_sysenter = -1; -#endif - -extern unsigned char syscall32_syscall[], syscall32_syscall_end[]; -extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[]; -extern int sysctl_vsyscall32; - -char *syscall32_page; - -/* - * Map the 32bit vsyscall page on demand. - * - * RED-PEN: This knows too much about high level VM. - * - * Alternative would be to generate a vma with appropriate backing options - * and let it be handled by generic VM. - */ -int __map_syscall32(struct mm_struct *mm, unsigned long address) -{ - pgd_t *pgd; - pud_t *pud; - pte_t *pte; - pmd_t *pmd; - int err = -ENOMEM; - - spin_lock(&mm->page_table_lock); - pgd = pgd_offset(mm, address); - pud = pud_alloc(mm, pgd, address); - if (pud) { - pmd = pmd_alloc(mm, pud, address); - if (pmd && (pte = pte_alloc_map(mm, pmd, address)) != NULL) { - if (pte_none(*pte)) { - set_pte(pte, - mk_pte(virt_to_page(syscall32_page), - PAGE_KERNEL_VSYSCALL32)); - } - /* Flush only the local CPU. Other CPUs taking a fault - will just end up here again - This probably not needed and just paranoia. */ - __flush_tlb_one(address); - err = 0; - } - } - spin_unlock(&mm->page_table_lock); - return err; -} - -int map_syscall32(struct mm_struct *mm, unsigned long address) -{ - int err; - down_read(&mm->mmap_sem); - err = __map_syscall32(mm, address); - up_read(&mm->mmap_sem); - return err; -} - -static int __init init_syscall32(void) -{ - syscall32_page = (void *)get_zeroed_page(GFP_KERNEL); - if (!syscall32_page) - panic("Cannot allocate syscall32 page"); - SetPageReserved(virt_to_page(syscall32_page)); - -#ifdef USE_INT80 - /* - * At this point we use int 0x80. - */ - memcpy(syscall32_page, syscall32_int80, - syscall32_int80_end - syscall32_int80); -#else - - if (use_sysenter > 0) { - memcpy(syscall32_page, syscall32_sysenter, - syscall32_sysenter_end - syscall32_sysenter); - } else { - memcpy(syscall32_page, syscall32_syscall, - syscall32_syscall_end - syscall32_syscall); - } -#endif - return 0; -} - -__initcall(init_syscall32); - -/* May not be __init: called during resume */ -void syscall32_cpu_init(void) -{ -#ifndef USE_INT80 - if (use_sysenter < 0) - use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL); - - /* Load these always in case some future AMD CPU supports - SYSENTER from compat mode too. 
*/ - checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)(__KERNEL_CS | 3)); - checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL); - checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); - - wrmsrl(MSR_CSTAR, ia32_cstar_target); -#endif -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/ia32/vsyscall-int80.S --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/ia32/vsyscall-int80.S Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,57 +0,0 @@ -/* - * Code for the vsyscall page. This version uses the old int $0x80 method. - * - * NOTE: - * 1) __kernel_vsyscall _must_ be first in this page. - * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S - * for details. - */ -#include <asm/ia32_unistd.h> -#include <asm/offset.h> - - .text - .section .text.vsyscall,"ax" - .globl __kernel_vsyscall - .type __kernel_vsyscall,@function -__kernel_vsyscall: -.LSTART_vsyscall: - int $0x80 - ret -.LEND_vsyscall: - .size __kernel_vsyscall,.-.LSTART_vsyscall - .previous - - .section .eh_frame,"a",@progbits -.LSTARTFRAME: - .long .LENDCIE-.LSTARTCIE -.LSTARTCIE: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zR" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIE: - - .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */ -.LSTARTFDE1: - .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */ - .long .LSTART_vsyscall-. /* PC-relative start address */ - .long .LEND_vsyscall-.LSTART_vsyscall - .uleb128 0 /* Augmentation length */ - .align 4 -.LENDFDE1: - -/* - * Get the common code for the sigreturn entry points. - */ -#define SYSCALL_ENTER_KERNEL int $0x80 -#include "vsyscall-sigreturn.S" diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/Makefile --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,68 +0,0 @@ -# -# Makefile for the linux kernel. 
-# -XENARCH := $(subst ",,$(CONFIG_XENARCH)) - -CFLAGS += -Iarch/$(XENARCH)/kernel - -extra-y := head.o head64.o init_task.o - -obj-y := process.o signal.o entry.o traps.o \ - ioport.o ldt.o setup.o \ - x8664_ksyms.o vsyscall.o \ - setup64.o e820.o irq.o early_printk.o -c-obj-y := semaphore.o i387.o sys_x86_64.o \ - ptrace.o quirks.o syscall.o bootflag.o - -i386-obj-y := time.o -obj-y += ../../i386/kernel/timers/ - -s-obj-y := - -#obj-$(CONFIG_X86_MCE) += mce.o -#obj-$(CONFIG_MTRR) += ../../i386/kernel/cpu/mtrr/ -obj-$(CONFIG_ACPI_BOOT) += acpi/ -c-obj-$(CONFIG_X86_MSR) += msr.o -obj-$(CONFIG_MICROCODE) += microcode.o -obj-$(CONFIG_X86_CPUID) += cpuid.o -#obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o -c-obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o -obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o -c-obj-$(CONFIG_X86_IO_APIC) += genapic.o genapic_cluster.o genapic_flat.o -#obj-$(CONFIG_PM) += suspend.o -#obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o -#obj-$(CONFIG_CPU_FREQ) += cpufreq/ -#obj-$(CONFIG_EARLY_PRINTK) += early_printk.o -#obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o -c-obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o -#obj-$(CONFIG_SWIOTLB) += swiotlb.o -obj-$(CONFIG_KPROBES) += kprobes.o - -c-obj-$(CONFIG_MODULES) += module.o - -#obj-y += topology.o -c-obj-y += intel_cacheinfo.o - -bootflag-y += ../../../i386/kernel/bootflag.o -cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../../i386/kernel/cpuid.o -topology-y += ../../../i386/mach-default/topology.o -swiotlb-$(CONFIG_SWIOTLB) += ../../../ia64/lib/swiotlb.o -microcode-$(subst m,y,$(CONFIG_MICROCODE)) += ../../../i386/kernel/microcode.o -intel_cacheinfo-y += ../../../i386/kernel/cpu/intel_cacheinfo.o -quirks-y += ../../../i386/kernel/quirks.o - -c-link := init_task.o -s-link := vsyscall.o - -$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-obj-m) $(c-link)) $(patsubst %.o,$(obj)/%.S,$(s-obj-y) $(s-link)): - @ln -fsn $(srctree)/arch/x86_64/kernel/$(notdir $@) $@ - -$(patsubst %.o,$(obj)/%.c,$(i386-obj-y)): - @ln -fsn $(srctree)/arch/xen/i386/kernel/$(notdir $@) $@ - -obj-y += $(c-obj-y) $(s-obj-y) $(i386-obj-y) -obj-m += $(c-obj-m) - -clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-m) $(c-obj-) $(c-link) $(i386-obj-y)) -clean-files += $(patsubst %.o,%.S,$(s-obj-y) $(s-obj-) $(s-link)) diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/acpi/Makefile --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/acpi/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,20 +0,0 @@ -i386-obj-$(CONFIG_ACPI_BOOT) := boot.o -c-obj-$(CONFIG_X86_IO_APIC) := earlyquirk.o -c-obj-$(CONFIG_ACPI_SLEEP) += sleep.o -s-obj-$(CONFIG_ACPI_SLEEP) += wakeup.o - -c-link := - -$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)): - @ln -fsn $(srctree)/arch/i386/kernel/acpi/$(notdir $@) $@ - -$(patsubst %.o,$(obj)/%.S,$(s-obj-y) $(s-link)): - @ln -fsn $(srctree)/arch/x86_64/kernel/acpi/$(notdir $@) $@ - -$(patsubst %.o,$(obj)/%.c,$(i386-obj-y)): - @ln -fsn $(srctree)/arch/xen/i386/kernel/acpi/$(notdir $@) $@ - -obj-y += $(c-obj-y) $(s-obj-y) $(i386-obj-y) - -clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link)) -clean-files += $(patsubst %.o,%.S,$(s-obj-y) $(s-obj-) $(s-link)) diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/apic.c --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/apic.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,200 +0,0 @@ -/* - * Local APIC handling, local APIC timers - * - 
* (c) 1999, 2000 Ingo Molnar <mingo@xxxxxxxxxx> - * - * Fixes - * Maciej W. Rozycki : Bits for genuine 82489DX APICs; - * thanks to Eric Gilmore - * and Rolf G. Tews - * for testing these extensively. - * Maciej W. Rozycki : Various updates and fixes. - * Mikael Pettersson : Power Management for UP-APIC. - * Pavel Machek and - * Mikael Pettersson : PM converted to driver model. - */ - -#include <linux/config.h> -#include <linux/init.h> - -#include <linux/mm.h> -#include <linux/irq.h> -#include <linux/delay.h> -#include <linux/bootmem.h> -#include <linux/smp_lock.h> -#include <linux/interrupt.h> -#include <linux/mc146818rtc.h> -#include <linux/kernel_stat.h> -#include <linux/sysdev.h> - -#include <asm/atomic.h> -#include <asm/smp.h> -#include <asm/mtrr.h> -#include <asm/mpspec.h> -#include <asm/desc.h> -#include <asm/arch_hooks.h> -#include <asm/hpet.h> - -#include "io_ports.h" - -/* - * Debug level - */ -int apic_verbosity; -int disable_apic; - -void smp_local_timer_interrupt(struct pt_regs *regs) -{ - int cpu = smp_processor_id(); - - profile_tick(CPU_PROFILING, regs); -#if 0 - if (--per_cpu(prof_counter, cpu) <= 0) { - /* - * The multiplier may have changed since the last time we got - * to this point as a result of the user writing to - * /proc/profile. In this case we need to adjust the APIC - * timer accordingly. - * - * Interrupts are already masked off at this point. - */ - per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu); - if (per_cpu(prof_counter, cpu) != - per_cpu(prof_old_multiplier, cpu)) { - __setup_APIC_LVTT(calibration_result/ - per_cpu(prof_counter, cpu)); - per_cpu(prof_old_multiplier, cpu) = - per_cpu(prof_counter, cpu); - } - -#ifdef CONFIG_SMP - update_process_times(user_mode(regs)); -#endif - } -#endif - - /* - * We take the 'long' return path, and there every subsystem - * grabs the appropriate locks (kernel lock/ irq lock). - * - * we might want to decouple profiling from the 'long path', - * and do the profiling totally in assembly. - * - * Currently this isn't too much of an issue (performance wise), - * we can take more than 100K local irqs per second on a 100 MHz P5. - */ -} - -/* - * Local APIC timer interrupt. This is the most natural way for doing - * local interrupts, but local timer interrupts can be emulated by - * broadcast interrupts too. [in case the hw doesn't support APIC timers] - * - * [ if a single-CPU system runs an SMP kernel then we call the local - * interrupt as well. Thus we cannot inline the local irq ... ] - */ -void smp_apic_timer_interrupt(struct pt_regs *regs) -{ - /* - * the NMI deadlock-detector uses this. - */ - add_pda(apic_timer_irqs, 1); - - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. - */ - ack_APIC_irq(); - /* - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ - irq_enter(); - smp_local_timer_interrupt(regs); - irq_exit(); -} - -/* - * This interrupt should _never_ happen with our APIC/SMP architecture - */ -asmlinkage void smp_spurious_interrupt(void) -{ - unsigned int v; - irq_enter(); - /* - * Check if this really is a spurious interrupt and ACK it - * if it is a vectored one. Just in case... - * Spurious interrupts should not be ACKed. 
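The deleted smp_apic_timer_interrupt() above follows a deliberate order:
ACK the APIC immediately, because timer handling can be slow, then run the
local handler bracketed by irq_enter()/irq_exit() so that
update_process_times() sees correct interrupt-context accounting. The shape
of that handler, with stub functions in place of the kernel APIs (all names
here are stand-ins):

    static int in_interrupt_depth;              /* irq_enter/exit model */

    static void irq_enter(void) { in_interrupt_depth++; }
    static void irq_exit(void)  { in_interrupt_depth--; }
    static void ack_apic_irq(void) { /* EOI write would go here */ }

    static void local_timer_tick(void)
    {
        /* profile_tick() and process-time accounting run here */
    }

    static void apic_timer_interrupt(void)
    {
        ack_apic_irq();     /* ACK first: timer handling can be slow */
        irq_enter();        /* handler then runs in irq context */
        local_timer_tick();
        irq_exit();
    }

    int main(void)
    {
        apic_timer_interrupt();
        return in_interrupt_depth;  /* 0: enter/exit balanced */
    }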
- */ - v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); - if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) - ack_APIC_irq(); - -#if 0 - static unsigned long last_warning; - static unsigned long skipped; - - /* see sw-dev-man vol 3, chapter 7.4.13.5 */ - if (time_before(last_warning+30*HZ,jiffies)) { - printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n", - smp_processor_id(), skipped); - last_warning = jiffies; - skipped = 0; - } else { - skipped++; - } -#endif - irq_exit(); -} - -/* - * This interrupt should never happen with our APIC/SMP architecture - */ - -asmlinkage void smp_error_interrupt(void) -{ - unsigned int v, v1; - - irq_enter(); - /* First tickle the hardware, only then report what went on. -- REW */ - v = apic_read(APIC_ESR); - apic_write(APIC_ESR, 0); - v1 = apic_read(APIC_ESR); - ack_APIC_irq(); - atomic_inc(&irq_err_count); - - /* Here is what the APIC error bits mean: - 0: Send CS error - 1: Receive CS error - 2: Send accept error - 3: Receive accept error - 4: Reserved - 5: Send illegal vector - 6: Received illegal vector - 7: Illegal register address - */ - printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", - smp_processor_id(), v , v1); - irq_exit(); -} - -int get_physical_broadcast(void) -{ - return 0xff; -} - -/* - * This initializes the IO-APIC and APIC hardware if this is - * a UP kernel. - */ -int __init APIC_init_uniprocessor (void) -{ -#ifdef CONFIG_X86_IO_APIC - if (smp_found_config) - if (!skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); -#endif - - return 0; -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/asm-offsets.c --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/asm-offsets.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,70 +0,0 @@ -/* - * Generate definitions needed by assembly language modules. - * This code generates raw asm output which is post-processed to extract - * and format the required data. 
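smp_error_interrupt() above first reads the ESR, writes 0 to re-arm it,
reads it again, and only then reports, per its "first tickle the hardware"
note; the meaning of each status bit is spelled out in the deleted comment.
A small self-contained decoder for those same eight bits:

    #include <stdio.h>

    static const char *apic_esr_bits[8] = {
        "Send CS error",            /* bit 0 */
        "Receive CS error",         /* bit 1 */
        "Send accept error",        /* bit 2 */
        "Receive accept error",     /* bit 3 */
        "Reserved",                 /* bit 4 */
        "Send illegal vector",      /* bit 5 */
        "Received illegal vector",  /* bit 6 */
        "Illegal register address", /* bit 7 */
    };

    static void decode_esr(unsigned v)
    {
        for (int i = 0; i < 8; i++)
            if (v & (1u << i))
                printf("APIC error: %s\n", apic_esr_bits[i]);
    }

    int main(void)
    {
        decode_esr(0x21);   /* reports bits 0 and 5 */
        return 0;
    }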
- */ - -#include <linux/sched.h> -#include <linux/stddef.h> -#include <linux/errno.h> -#include <linux/hardirq.h> -#include <linux/suspend.h> -#include <asm/pda.h> -#include <asm/processor.h> -#include <asm/segment.h> -#include <asm/thread_info.h> -#include <asm/ia32.h> - -#define DEFINE(sym, val) \ - asm volatile("\n->" #sym " %0 " #val : : "i" (val)) - -#define BLANK() asm volatile("\n->" : : ) - -int main(void) -{ -#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry)) - ENTRY(state); - ENTRY(flags); - ENTRY(thread); - ENTRY(pid); - BLANK(); -#undef ENTRY -#define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry)) - ENTRY(flags); - ENTRY(addr_limit); - ENTRY(preempt_count); - BLANK(); -#undef ENTRY -#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry)) - ENTRY(kernelstack); - ENTRY(oldrsp); - ENTRY(pcurrent); - ENTRY(irqrsp); - ENTRY(irqcount); - ENTRY(cpunumber); - ENTRY(irqstackptr); - ENTRY(kernel_mode); - BLANK(); -#undef ENTRY -#ifdef CONFIG_IA32_EMULATION -#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry)) - ENTRY(eax); - ENTRY(ebx); - ENTRY(ecx); - ENTRY(edx); - ENTRY(esi); - ENTRY(edi); - ENTRY(ebp); - ENTRY(esp); - ENTRY(eip); - BLANK(); -#undef ENTRY - DEFINE(IA32_RT_SIGFRAME_sigcontext, - offsetof (struct rt_sigframe32, uc.uc_mcontext)); - BLANK(); -#endif - DEFINE(SIZEOF_PBE, sizeof(struct pbe)); - DEFINE(pbe_address, offsetof(struct pbe, address)); - DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address)); - return 0; -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/e820.c --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/e820.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,533 +0,0 @@ -/* - * Handle the memory map. - * The functions here do the job until bootmem takes over. - * $Id: e820.c,v 1.4 2002/09/19 19:25:32 ak Exp $ - */ -#include <linux/config.h> -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/bootmem.h> -#include <linux/ioport.h> -#include <linux/string.h> -#include <asm/page.h> -#include <asm/e820.h> -#include <asm/proto.h> -#include <asm/bootsetup.h> - -/* - * PFN of last memory page. - */ -unsigned long end_pfn; -unsigned long end_user_pfn = MAXMEM>>PAGE_SHIFT; -unsigned long end_pfn_map; - -/* - * Add a memory region to the kernel e820 map. - */ -void __init add_memory_region(unsigned long start, unsigned long size, int type) -{ - int x = e820.nr_map; - - if (x == E820MAX) { - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); - return; - } - - e820.map[x].addr = start; - e820.map[x].size = size; - e820.map[x].type = type; - e820.nr_map++; -} - -#if 0 -extern char _end[]; - -/* - * end_pfn only includes RAM, while end_pfn_map includes all e820 entries. - * The direct mapping extends to end_pfn_map, so that we can directly access - * apertures, ACPI and other tables without having to play with fixmaps. - */ - -/* - * Last pfn which the user wants to use. 
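The asm-offsets.c deleted above is compiled only for its side effect: each
DEFINE() plants a "->sym value" marker in the compiler's assembly output,
which the build post-processes into a header of structure offsets that
entry.S consumes as, for example, threadinfo_flags(%rcx). The same idea can
be shown portably with offsetof(); here printf replaces the asm-marker
trick and the struct layout is a stand-in, not the kernel's:

    #include <stdio.h>
    #include <stddef.h>

    struct thread_info_demo {           /* illustrative layout only */
        unsigned long flags;
        unsigned long addr_limit;
        int           preempt_count;
    };

    #define DEFINE(sym, val) \
        printf("#define %s %lu\n", #sym, (unsigned long)(val))

    int main(void)
    {
        /* emits the offsets entry.S reads as threadinfo_flags(%rcx) */
        DEFINE(threadinfo_flags,
               offsetof(struct thread_info_demo, flags));
        DEFINE(threadinfo_addr_limit,
               offsetof(struct thread_info_demo, addr_limit));
        DEFINE(threadinfo_preempt_count,
               offsetof(struct thread_info_demo, preempt_count));
        return 0;
    }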
- */ - -extern struct resource code_resource, data_resource; - -/* Check for some hardcoded bad areas that early boot is not allowed to touch */ -static inline int bad_addr(unsigned long *addrp, unsigned long size) -{ - unsigned long addr = *addrp, last = addr + size; - - /* various gunk below that needed for SMP startup */ - if (addr < 0x8000) { - *addrp = 0x8000; - return 1; - } - - /* direct mapping tables of the kernel */ - if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) { - *addrp = table_end << PAGE_SHIFT; - return 1; - } - - /* initrd */ -#ifdef CONFIG_BLK_DEV_INITRD - if (LOADER_TYPE && INITRD_START && last >= INITRD_START && - addr < INITRD_START+INITRD_SIZE) { - *addrp = INITRD_START + INITRD_SIZE; - return 1; - } -#endif - /* kernel code + 640k memory hole (later should not be needed, but - be paranoid for now) */ - if (last >= 640*1024 && addr < __pa_symbol(&_end)) { - *addrp = __pa_symbol(&_end); - return 1; - } - /* XXX ramdisk image here? */ - return 0; -} - -int __init e820_mapped(unsigned long start, unsigned long end, unsigned type) -{ - int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - if (type && ei->type != type) - continue; - if (ei->addr >= end || ei->addr + ei->size < start) - continue; - return 1; - } - return 0; -} - -/* - * Find a free area in a specific range. - */ -unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) -{ - int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long addr = ei->addr, last; - if (ei->type != E820_RAM) - continue; - if (addr < start) - addr = start; - if (addr > ei->addr + ei->size) - continue; - while (bad_addr(&addr, size) && addr+size < ei->addr + ei->size) - ; - last = addr + size; - if (last > ei->addr + ei->size) - continue; - if (last > end) - continue; - return addr; - } - return -1UL; -} - -/* - * Free bootmem based on the e820 table for a node. - */ -void __init e820_bootmem_free(pg_data_t *pgdat, unsigned long start,unsigned long end) -{ - int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long last, addr; - - if (ei->type != E820_RAM || - ei->addr+ei->size <= start || - ei->addr > end) - continue; - - addr = round_up(ei->addr, PAGE_SIZE); - if (addr < start) - addr = start; - - last = round_down(ei->addr + ei->size, PAGE_SIZE); - if (last >= end) - last = end; - - if (last > addr && last-addr >= PAGE_SIZE) - free_bootmem_node(pgdat, addr, last-addr); - } -} - -/* - * Find the highest page frame number we have available - */ -unsigned long __init e820_end_of_ram(void) -{ - int i; - unsigned long end_pfn = 0; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long start, end; - - start = round_up(ei->addr, PAGE_SIZE); - end = round_down(ei->addr + ei->size, PAGE_SIZE); - if (start >= end) - continue; - if (ei->type == E820_RAM) { - if (end > end_pfn<<PAGE_SHIFT) - end_pfn = end>>PAGE_SHIFT; - } else { - if (end > end_pfn_map<<PAGE_SHIFT) - end_pfn_map = end>>PAGE_SHIFT; - } - } - - if (end_pfn > end_pfn_map) - end_pfn_map = end_pfn; - if (end_pfn_map > MAXMEM>>PAGE_SHIFT) - end_pfn_map = MAXMEM>>PAGE_SHIFT; - if (end_pfn > end_user_pfn) - end_pfn = end_user_pfn; - if (end_pfn > end_pfn_map) - end_pfn = end_pfn_map; - - return end_pfn; -} - -/* - * Mark e820 reserved areas as busy for the resource manager. 
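find_e820_area() above (inside the #if 0 block) returns the first address
in an E820_RAM entry where a block of the requested size fits within both
the entry and the caller's [start, end) window, stepping over the
bad_addr() holes. Its core fit test, reduced to user-space types and with
the hole handling omitted (types and names are simplified stand-ins):

    #include <stdint.h>

    #define E820_RAM 1

    struct e820entry_demo { uint64_t addr, size; int type; };

    static uint64_t find_area(const struct e820entry_demo *map, int n,
                              uint64_t start, uint64_t end, uint64_t size)
    {
        for (int i = 0; i < n; i++) {
            if (map[i].type != E820_RAM)
                continue;                       /* RAM entries only */
            uint64_t addr = map[i].addr > start ? map[i].addr : start;
            uint64_t last = addr + size;
            if (last > map[i].addr + map[i].size || last > end)
                continue;                       /* block does not fit */
            return addr;
        }
        return (uint64_t)-1;                    /* the -1UL failure value */
    }

    int main(void)
    {
        struct e820entry_demo map[] = {
            { 0x0,      0xA0000,  E820_RAM },
            { 0x100000, 0x700000, E820_RAM },
        };
        return find_area(map, 2, 0x100000, 0x800000, 0x4000)
               == 0x100000 ? 0 : 1;
    }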
- */ -void __init e820_reserve_resources(void) -{ - int i; - for (i = 0; i < e820.nr_map; i++) { - struct resource *res; - if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL) - continue; - res = alloc_bootmem_low(sizeof(struct resource)); - switch (e820.map[i].type) { - case E820_RAM: res->name = "System RAM"; break; - case E820_ACPI: res->name = "ACPI Tables"; break; - case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; - default: res->name = "reserved"; - } - res->start = e820.map[i].addr; - res->end = res->start + e820.map[i].size - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; - request_resource(&iomem_resource, res); - if (e820.map[i].type == E820_RAM) { - /* - * We don't know which RAM region contains kernel data, - * so we try it repeatedly and let the resource manager - * test it. - */ - request_resource(res, &code_resource); - request_resource(res, &data_resource); - } - } -} - - -void __init e820_print_map(char *who) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - printk(" %s: %016Lx - %016Lx ", who, - (unsigned long long) e820.map[i].addr, - (unsigned long long) (e820.map[i].addr + e820.map[i].size)); - switch (e820.map[i].type) { - case E820_RAM: printk("(usable)\n"); - break; - case E820_RESERVED: - printk("(reserved)\n"); - break; - case E820_ACPI: - printk("(ACPI data)\n"); - break; - case E820_NVS: - printk("(ACPI NVS)\n"); - break; - default: printk("type %u\n", e820.map[i].type); - break; - } - } -} - -/* - * Sanitize the BIOS e820 map. - * - * Some e820 responses include overlapping entries. The following - * replaces the original e820 map with a new one, removing overlaps. - * - */ -static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) -{ - struct change_member { - struct e820entry *pbios; /* pointer to original bios entry */ - unsigned long long addr; /* address for this change point */ - }; - static struct change_member change_point_list[2*E820MAX] __initdata; - static struct change_member *change_point[2*E820MAX] __initdata; - static struct e820entry *overlap_list[E820MAX] __initdata; - static struct e820entry new_bios[E820MAX] __initdata; - struct change_member *change_tmp; - unsigned long current_type, last_type; - unsigned long long last_addr; - int chgidx, still_changing; - int overlap_entries; - int new_bios_entry; - int old_nr, new_nr; - int i; - - /* - Visually we're performing the following (1,2,3,4 = memory types)... 
- - Sample memory map (w/overlaps): - ____22__________________ - ______________________4_ - ____1111________________ - _44_____________________ - 11111111________________ - ____________________33__ - ___________44___________ - __________33333_________ - ______________22________ - ___________________2222_ - _________111111111______ - _____________________11_ - _________________4______ - - Sanitized equivalent (no overlap): - 1_______________________ - _44_____________________ - ___1____________________ - ____22__________________ - ______11________________ - _________1______________ - __________3_____________ - ___________44___________ - _____________33_________ - _______________2________ - ________________1_______ - _________________4______ - ___________________2____ - ____________________33__ - ______________________4_ - */ - - /* if there's only one memory region, don't bother */ - if (*pnr_map < 2) - return -1; - - old_nr = *pnr_map; - - /* bail out if we find any unreasonable addresses in bios map */ - for (i=0; i<old_nr; i++) - if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) - return -1; - - /* create pointers for initial change-point information (for sorting) */ - for (i=0; i < 2*old_nr; i++) - change_point[i] = &change_point_list[i]; - - /* record all known change-points (starting and ending addresses) */ - chgidx = 0; - for (i=0; i < old_nr; i++) { - change_point[chgidx]->addr = biosmap[i].addr; - change_point[chgidx++]->pbios = &biosmap[i]; - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; - change_point[chgidx++]->pbios = &biosmap[i]; - } - - /* sort change-point list by memory addresses (low -> high) */ - still_changing = 1; - while (still_changing) { - still_changing = 0; - for (i=1; i < 2*old_nr; i++) { - /* if <current_addr> > <last_addr>, swap */ - /* or, if current=<start_addr> & last=<end_addr>, swap */ - if ((change_point[i]->addr < change_point[i-1]->addr) || - ((change_point[i]->addr == change_point[i-1]->addr) && - (change_point[i]->addr == change_point[i]->pbios->addr) && - (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) - ) - { - change_tmp = change_point[i]; - change_point[i] = change_point[i-1]; - change_point[i-1] = change_tmp; - still_changing=1; - } - } - } - - /* create a new bios memory map, removing overlaps */ - overlap_entries=0; /* number of entries in the overlap table */ - new_bios_entry=0; /* index for creating new bios map entries */ - last_type = 0; /* start with undefined memory type */ - last_addr = 0; /* start with 0 as last starting address */ - /* loop through change-points, determining affect on the new bios map */ - for (chgidx=0; chgidx < 2*old_nr; chgidx++) - { - /* keep track of all overlapping bios entries */ - if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) - { - /* add map entry to overlap list (> 1 entry implies an overlap) */ - overlap_list[overlap_entries++]=change_point[chgidx]->pbios; - } - else - { - /* remove entry from list (order independent, so swap with last) */ - for (i=0; i<overlap_entries; i++) - { - if (overlap_list[i] == change_point[chgidx]->pbios) - overlap_list[i] = overlap_list[overlap_entries-1]; - } - overlap_entries--; - } - /* if there are overlapping entries, decide which "type" to use */ - /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ - current_type = 0; - for (i=0; i<overlap_entries; i++) - if (overlap_list[i]->type > current_type) - current_type = overlap_list[i]->type; - /* continue building up new bios map based on this 
information */ - if (current_type != last_type) { - if (last_type != 0) { - new_bios[new_bios_entry].size = - change_point[chgidx]->addr - last_addr; - /* move forward only if the new size was non-zero */ - if (new_bios[new_bios_entry].size != 0) - if (++new_bios_entry >= E820MAX) - break; /* no more space left for new bios entries */ - } - if (current_type != 0) { - new_bios[new_bios_entry].addr = change_point[chgidx]->addr; - new_bios[new_bios_entry].type = current_type; - last_addr=change_point[chgidx]->addr; - } - last_type = current_type; - } - } - new_nr = new_bios_entry; /* retain count for new bios entries */ - - /* copy new bios mapping into original location */ - memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); - *pnr_map = new_nr; - - return 0; -} - -/* - * Copy the BIOS e820 map into a safe place. - * - * Sanity-check it while we're at it.. - * - * If we're lucky and live on a modern system, the setup code - * will have given us a memory map that we can use to properly - * set up memory. If we aren't, we'll fake a memory map. - * - * We check to see that the memory map contains at least 2 elements - * before we'll use it, because the detection code in setup.S may - * not be perfect and most every PC known to man has two memory - * regions: one from 0 to 640k, and one from 1mb up. (The IBM - * thinkpad 560x, for example, does not cooperate with the memory - * detection code.) - */ -static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) -{ - /* Only one memory region (or negative)? Ignore it */ - if (nr_map < 2) - return -1; - - do { - unsigned long start = biosmap->addr; - unsigned long size = biosmap->size; - unsigned long end = start + size; - unsigned long type = biosmap->type; - - /* Overflow in 64 bits? Ignore the memory map. */ - if (start > end) - return -1; - - /* - * Some BIOSes claim RAM in the 640k - 1M region. - * Not right. Fix it up. - * - * This should be removed on Hammer which is supposed to not - * have non e820 covered ISA mappings there, but I had some strange - * problems so it stays for now. -AK - */ - if (type == E820_RAM) { - if (start < 0x100000ULL && end > 0xA0000ULL) { - if (start < 0xA0000ULL) - add_memory_region(start, 0xA0000ULL-start, type); - if (end <= 0x100000ULL) - continue; - start = 0x100000ULL; - size = end - start; - } - } - - add_memory_region(start, size, type); - } while (biosmap++,--nr_map); - return 0; -} - -void __init setup_memory_region(void) -{ - char *who = "BIOS-e820"; - - /* - * Try to copy the BIOS-supplied E820-map. - * - * Otherwise fake a memory map; one section from 0k->640k, - * the next section from 1mb->appropriate_mem_k - */ - sanitize_e820_map(E820_MAP, &E820_MAP_NR); - if (copy_e820_map(E820_MAP, E820_MAP_NR) < 0) { - unsigned long mem_size; - - /* compare results from other methods and take the greater */ - if (ALT_MEM_K < EXT_MEM_K) { - mem_size = EXT_MEM_K; - who = "BIOS-88"; - } else { - mem_size = ALT_MEM_K; - who = "BIOS-e801"; - } - - e820.nr_map = 0; - add_memory_region(0, LOWMEMSIZE(), E820_RAM); - add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM); - } - printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - e820_print_map(who); -} -#endif - -extern unsigned long xen_override_max_pfn; - -void __init parse_memopt(char *p, char **from) -{ - /* - * mem=XXX[kKmM] limits kernel memory to XXX+1MB - * - * It would be more logical to count from 0 instead of from - * HIGH_MEMORY, but we keep that for now for i386 compatibility. - * - * No support for custom mapping like i386. 
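copy_e820_map() above clips BIOS RAM claims that straddle the legacy
640K-1M hole: the part below 0xA0000 is kept, the hole itself is dropped,
and the claim resumes at 0x100000. That clipping rule in isolation, with a
printf standing in for add_memory_region():

    #include <stdio.h>
    #include <stdint.h>

    static void add_region(uint64_t start, uint64_t size)
    {
        printf("RAM: %#llx + %#llx\n",
               (unsigned long long)start, (unsigned long long)size);
    }

    static void add_ram_clipped(uint64_t start, uint64_t end)
    {
        if (start < 0x100000ULL && end > 0xA0000ULL) {
            if (start < 0xA0000ULL)
                add_region(start, 0xA0000ULL - start);  /* below hole */
            if (end <= 0x100000ULL)
                return;                      /* entirely swallowed */
            start = 0x100000ULL;             /* resume above 1MB */
        }
        add_region(start, end - start);
    }

    int main(void)
    {
        add_ram_clipped(0, 0x200000);  /* 0-2MB -> 0-640K and 1MB-2MB */
        return 0;
    }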
The reason is - * that we need to read the e820 map anyways to handle the - * ACPI mappings in the direct map. Also on x86-64 there - * should be always a good e820 map. This is only an upper - * limit, you cannot force usage of memory not in e820. - * - * -AK - */ - end_user_pfn = memparse(p, from) + HIGH_MEMORY; - end_user_pfn >>= PAGE_SHIFT; - xen_override_max_pfn = (unsigned long) end_user_pfn; -} - diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/early_printk.c --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/early_printk.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,242 +0,0 @@ -#include <linux/config.h> -#include <linux/console.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/string.h> -#include <asm/io.h> -#include <asm/processor.h> - -/* Simple VGA output */ - -#ifdef __i386__ -#define VGABASE (__ISA_IO_base + 0xb8000) -#else -#define VGABASE ((void __iomem *)0xffffffff800b8000UL) -#endif - -#define MAX_YPOS 25 -#define MAX_XPOS 80 - -static int current_ypos = 1, current_xpos = 0; - -static void early_vga_write(struct console *con, const char *str, unsigned n) -{ - char c; - int i, k, j; - - while ((c = *str++) != '\0' && n-- > 0) { - if (current_ypos >= MAX_YPOS) { - /* scroll 1 line up */ - for (k = 1, j = 0; k < MAX_YPOS; k++, j++) { - for (i = 0; i < MAX_XPOS; i++) { - writew(readw(VGABASE + 2*(MAX_XPOS*k + i)), - VGABASE + 2*(MAX_XPOS*j + i)); - } - } - for (i = 0; i < MAX_XPOS; i++) - writew(0x720, VGABASE + 2*(MAX_XPOS*j + i)); - current_ypos = MAX_YPOS-1; - } - if (c == '\n') { - current_xpos = 0; - current_ypos++; - } else if (c != '\r') { - writew(((0x7 << 8) | (unsigned short) c), - VGABASE + 2*(MAX_XPOS*current_ypos + - current_xpos++)); - if (current_xpos >= MAX_XPOS) { - current_xpos = 0; - current_ypos++; - } - } - } -} - -static struct console early_vga_console = { - .name = "earlyvga", - .write = early_vga_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; - -#ifndef CONFIG_XEN -/* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ - -int early_serial_base = 0x3f8; /* ttyS0 */ - -#define XMTRDY 0x20 - -#define DLAB 0x80 - -#define TXR 0 /* Transmit register (WRITE) */ -#define RXR 0 /* Receive register (READ) */ -#define IER 1 /* Interrupt Enable */ -#define IIR 2 /* Interrupt ID */ -#define FCR 2 /* FIFO control */ -#define LCR 3 /* Line control */ -#define MCR 4 /* Modem control */ -#define LSR 5 /* Line Status */ -#define MSR 6 /* Modem Status */ -#define DLL 0 /* Divisor Latch Low */ -#define DLH 1 /* Divisor latch High */ - -static int early_serial_putc(unsigned char ch) -{ - unsigned timeout = 0xffff; - while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) - cpu_relax(); - outb(ch, early_serial_base + TXR); - return timeout ? 
0 : -1; -} - -static void early_serial_write(struct console *con, const char *s, unsigned n) -{ - while (*s && n-- > 0) { - early_serial_putc(*s); - if (*s == '\n') - early_serial_putc('\r'); - s++; - } -} - -#define DEFAULT_BAUD 9600 - -static __init void early_serial_init(char *s) -{ - unsigned char c; - unsigned divisor; - unsigned baud = DEFAULT_BAUD; - char *e; - - if (*s == ',') - ++s; - - if (*s) { - unsigned port; - if (!strncmp(s,"0x",2)) { - early_serial_base = simple_strtoul(s, &e, 16); - } else { - static int bases[] = { 0x3f8, 0x2f8 }; - - if (!strncmp(s,"ttyS",4)) - s += 4; - port = simple_strtoul(s, &e, 10); - if (port > 1 || s == e) - port = 0; - early_serial_base = bases[port]; - } - s += strcspn(s, ","); - if (*s == ',') - s++; - } - - outb(0x3, early_serial_base + LCR); /* 8n1 */ - outb(0, early_serial_base + IER); /* no interrupt */ - outb(0, early_serial_base + FCR); /* no fifo */ - outb(0x3, early_serial_base + MCR); /* DTR + RTS */ - - if (*s) { - baud = simple_strtoul(s, &e, 0); - if (baud == 0 || s == e) - baud = DEFAULT_BAUD; - } - - divisor = 115200 / baud; - c = inb(early_serial_base + LCR); - outb(c | DLAB, early_serial_base + LCR); - outb(divisor & 0xff, early_serial_base + DLL); - outb((divisor >> 8) & 0xff, early_serial_base + DLH); - outb(c & ~DLAB, early_serial_base + LCR); -} -#else - -static void -early_serial_write(struct console *con, const char *s, unsigned count) -{ - int n; - - while (count > 0) { - n = HYPERVISOR_console_io(CONSOLEIO_write, count, (char *)s); - if (n <= 0) - break; - count -= n; - s += n; - } -} - -static __init void early_serial_init(char *s) -{ -} -#endif - -static struct console early_serial_console = { - .name = "earlyser", - .write = early_serial_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; - -/* Direct interface for emergencies */ -struct console *early_console = &early_vga_console; -static int early_console_initialized = 0; - -void early_printk(const char *fmt, ...) 
-{ - char buf[512]; - int n; - va_list ap; - - va_start(ap,fmt); - n = vscnprintf(buf,512,fmt,ap); - early_console->write(early_console,buf,n); - va_end(ap); -} - -static int keep_early; - -int __init setup_early_printk(char *opt) -{ - char *space; - char buf[256]; - - if (early_console_initialized) - return -1; - - opt = strchr(opt, '=') + 1; - - strlcpy(buf,opt,sizeof(buf)); - space = strchr(buf, ' '); - if (space) - *space = 0; - - if (strstr(buf,"keep")) - keep_early = 1; - - if (!strncmp(buf, "serial", 6)) { - early_serial_init(buf + 6); - early_console = &early_serial_console; - } else if (!strncmp(buf, "ttyS", 4)) { - early_serial_init(buf); - early_console = &early_serial_console; - } else if (!strncmp(buf, "vga", 3)) { - early_console = &early_vga_console; - } - early_console_initialized = 1; - register_console(early_console); - return 0; -} - -void __init disable_early_printk(void) -{ - if (!early_console_initialized || !early_console) - return; - if (!keep_early) { - printk("disabling early console\n"); - unregister_console(early_console); - early_console_initialized = 0; - } else { - printk("keeping early console\n"); - } -} - -__setup("earlyprintk=", setup_early_printk); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/entry.S --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/entry.S Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,1047 +0,0 @@ -/* - * linux/arch/x86_64/entry.S - * - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs - * Copyright (C) 2000 Pavel Machek <pavel@xxxxxxx> - * - * $Id$ - * - * Jun Nakajima <jun.nakajima@xxxxxxxxx> - * Asit Mallick <asit.k.mallick@xxxxxxxxx> - * Modified for Xen - */ - -/* - * entry.S contains the system-call and fault low-level handling routines. - * - * NOTE: This code handles signal-recognition, which happens every time - * after an interrupt and after each system call. - * - * Normal syscalls and interrupts don't save a full stack frame, this is - * only done for syscall tracing, signals or fork/exec et.al. - * - * A note on terminology: - * - top of stack: Architecture defined interrupt frame from SS to RIP - * at the top of the kernel process stack. - * - partial stack frame: partially saved registers upto R11. - * - full stack frame: Like partial stack frame, but all register saved. - * - * TODO: - * - schedule it carefully for the final hardware. - */ - -#define ASSEMBLY 1 -#include <linux/config.h> -#include <linux/linkage.h> -#include <asm/segment.h> -#include <asm/smp.h> -#include <asm/cache.h> -#include <asm/errno.h> -#include <asm/dwarf2.h> -#include <asm/calling.h> -#include <asm/asm_offset.h> -#include <asm/msr.h> -#include <asm/unistd.h> -#include <asm/thread_info.h> -#include <asm/hw_irq.h> -#include <asm/errno.h> -#include <asm-xen/xen-public/arch-x86_64.h> - -#include "irq_vectors.h" - -#include "xen_entry.S" - - .code64 - -#ifdef CONFIG_PREEMPT -#define preempt_stop XEN_BLOCK_EVENTS(%rsi) -#else -#define preempt_stop -#define retint_kernel retint_restore_args -#endif - -/* - * C code is not supposed to know about undefined top of stack. Every time - * a C function with an pt_regs argument is called from the SYSCALL based - * fast path FIXUP_TOP_OF_STACK is needed. - * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs - * manipulation. 
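Baud programming in the early_printk.c deleted above is the standard 16550
sequence: divisor = 115200 / baud, written to the DLL/DLH latch pair while
the DLAB bit is set in the LCR. A self-contained sketch with port I/O
stubbed into an array (register offsets as in the deleted defines; the
outb()/inb() stubs are illustrative, not real port access):

    #include <stdio.h>

    #define LCR  3                  /* Line control */
    #define DLL  0                  /* Divisor Latch Low */
    #define DLH  1                  /* Divisor Latch High */
    #define DLAB 0x80

    static unsigned char regs[8];   /* stand-in for real port I/O */
    static void outb(unsigned char v, int port) { regs[port & 7] = v; }
    static unsigned char inb(int port) { return regs[port & 7]; }

    static void set_baud(int base, unsigned baud)
    {
        unsigned divisor = 115200 / baud;       /* e.g. 9600 -> 12 */
        unsigned char c = inb(base + LCR);
        outb(c | DLAB, base + LCR);             /* expose the latch */
        outb(divisor & 0xff, base + DLL);
        outb((divisor >> 8) & 0xff, base + DLH);
        outb(c & ~DLAB, base + LCR);            /* back to data mode */
    }

    int main(void)
    {
        set_baud(0, 9600);
        printf("DLL=%u DLH=%u\n", regs[DLL], regs[DLH]);  /* 12 and 0 */
        return 0;
    }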
- */ - - /* %rsp:at FRAMEEND */ - .macro FIXUP_TOP_OF_STACK tmp - movq $__USER_CS,CS(%rsp) - movq $-1,RCX(%rsp) - .endm - - .macro RESTORE_TOP_OF_STACK tmp,offset=0 - .endm - - .macro FAKE_STACK_FRAME child_rip - /* push in order ss, rsp, eflags, cs, rip */ - xorq %rax, %rax - pushq %rax /* ss */ - CFI_ADJUST_CFA_OFFSET 8 - pushq %rax /* rsp */ - CFI_ADJUST_CFA_OFFSET 8 - CFI_OFFSET rip,0 - pushq $(1<<9) /* eflags - interrupts on */ - CFI_ADJUST_CFA_OFFSET 8 - pushq $__KERNEL_CS /* cs */ - CFI_ADJUST_CFA_OFFSET 8 - pushq \child_rip /* rip */ - CFI_ADJUST_CFA_OFFSET 8 - CFI_OFFSET rip,0 - pushq %rax /* orig rax */ - CFI_ADJUST_CFA_OFFSET 8 - .endm - - .macro UNFAKE_STACK_FRAME - addq $8*6, %rsp - CFI_ADJUST_CFA_OFFSET -(6*8) - .endm - - .macro CFI_DEFAULT_STACK - CFI_ADJUST_CFA_OFFSET (SS) - CFI_OFFSET r15,R15-SS - CFI_OFFSET r14,R14-SS - CFI_OFFSET r13,R13-SS - CFI_OFFSET r12,R12-SS - CFI_OFFSET rbp,RBP-SS - CFI_OFFSET rbx,RBX-SS - CFI_OFFSET r11,R11-SS - CFI_OFFSET r10,R10-SS - CFI_OFFSET r9,R9-SS - CFI_OFFSET r8,R8-SS - CFI_OFFSET rax,RAX-SS - CFI_OFFSET rcx,RCX-SS - CFI_OFFSET rdx,RDX-SS - CFI_OFFSET rsi,RSI-SS - CFI_OFFSET rdi,RDI-SS - CFI_OFFSET rsp,RSP-SS - CFI_OFFSET rip,RIP-SS - .endm - - /* - * Must be consistent with the definition in arch_x86_64.h: - * struct switch_to_user { - * u64 rax, r11, rcx, flags, rip, cs, rflags, rsp, ss; - * }; - * #define VGCF_IN_SYSCALL (1<<8) - */ - .macro SWITCH_TO_USER flag - movl $0,%gs:pda_kernel_mode # change to user mode - subq $8*4,%rsp # reuse rip, cs, rflags, rsp, ss in the stack - movq %rax,(%rsp) - movq %r11,1*8(%rsp) - movq %rcx,2*8(%rsp) # we saved %rcx upon exceptions - movq $\flag,3*8(%rsp) - movq $__HYPERVISOR_switch_to_user,%rax - syscall - .endm - - .macro SWITCH_TO_KERNEL ssoff,adjust=0 - btsq $0,%gs:pda_kernel_mode - jc 1f - orb $1,\ssoff-\adjust+4(%rsp) -1: - .endm - -/* - * A newly forked process directly context switches into this. - */ -/* rdi: prev */ -ENTRY(ret_from_fork) - CFI_STARTPROC - CFI_DEFAULT_STACK - call schedule_tail - GET_THREAD_INFO(%rcx) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) - jnz rff_trace -rff_action: - RESTORE_REST - testl $3,CS-ARGOFFSET(%rsp) # from kernel_thread? - je int_ret_from_sys_call - testl $_TIF_IA32,threadinfo_flags(%rcx) - jnz int_ret_from_sys_call - RESTORE_TOP_OF_STACK %rdi,ARGOFFSET - jmp ret_from_sys_call -rff_trace: - movq %rsp,%rdi - call syscall_trace_leave - GET_THREAD_INFO(%rcx) - jmp rff_action - CFI_ENDPROC - -/* - * System call entry. Upto 6 arguments in registers are supported. - * - * SYSCALL does not save anything on the stack and does not change the - * stack pointer. - */ - -/* - * Register setup: - * rax system call number - * rdi arg0 - * rcx return address for syscall/sysret, C arg3 - * rsi arg1 - * rdx arg2 - * r10 arg3 (--> moved to rcx for C) - * r8 arg4 - * r9 arg5 - * r11 eflags for syscall/sysret, temporary for C - * r12-r15,rbp,rbx saved by C code, not touched. - * - * Interrupts are off on entry. - * Only called from user space. - * - * XXX if we had a free scratch register we could save the RSP into the stack frame - * and report it properly in ps. Unfortunately we haven't. 
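The register convention documented just above (rdi, rsi, rdx, r10, r8, r9
carrying arguments, rax the syscall number and return value, rcx and r11
clobbered by SYSCALL/SYSRET) is exactly what a raw user-space stub must
respect. A minimal three-argument wrapper for x86-64 Linux using GCC-style
inline assembly (a user-space illustration, not kernel code):

    static long raw_syscall3(long nr, long a1, long a2, long a3)
    {
        long ret;
        asm volatile("syscall"
                     : "=a"(ret)
                     : "0"(nr), "D"(a1), "S"(a2), "d"(a3)
                     : "rcx", "r11", "memory");  /* SYSCALL clobbers */
        return ret;
    }

    int main(void)
    {
        static const char msg[] = "hello from a raw syscall\n";
        raw_syscall3(1 /* __NR_write */, 1 /* stdout */,
                     (long)msg, sizeof msg - 1);
        return 0;
    }

SYSCALL overwrites rcx with the return address, which is why the fourth
syscall argument travels in r10 and why the entry code below does
movq %r10,%rcx before calling into the C handler table.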
- */ - -ENTRY(system_call) - CFI_STARTPROC - SAVE_ARGS -8,0 - movq %rax,ORIG_RAX-ARGOFFSET(%rsp) - XEN_UNBLOCK_EVENTS(%r11) - GET_THREAD_INFO(%rcx) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx) - jnz tracesys - cmpq $__NR_syscall_max,%rax - ja badsys - movq %r10,%rcx - call *sys_call_table(,%rax,8) # XXX: rip relative - movq %rax,RAX-ARGOFFSET(%rsp) -/* - * Syscall return path ending with SYSRET (fast path) - * Has incomplete stack frame and undefined top of stack. - */ - .globl ret_from_sys_call -ret_from_sys_call: - movl $_TIF_WORK_MASK,%edi - /* edi: flagmask */ -sysret_check: - GET_THREAD_INFO(%rcx) - XEN_BLOCK_EVENTS(%rsi) - movl threadinfo_flags(%rcx),%edx - andl %edi,%edx - jnz sysret_careful - XEN_UNBLOCK_EVENTS(%rsi) - RESTORE_ARGS 0,8,0 - SWITCH_TO_USER VGCF_IN_SYSCALL - - /* Handle reschedules */ - /* edx: work, edi: workmask */ -sysret_careful: - bt $TIF_NEED_RESCHED,%edx - jnc sysret_signal - XEN_BLOCK_EVENTS(%rsi) - pushq %rdi - call schedule - popq %rdi - jmp sysret_check - - /* Handle a signal */ -sysret_signal: -/* sti */ - XEN_UNBLOCK_EVENTS(%rsi) - testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx - jz 1f - - /* Really a signal */ - /* edx: work flags (arg3) */ - leaq do_notify_resume(%rip),%rax - leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 - xorl %esi,%esi # oldset -> arg2 - call ptregscall_common -1: movl $_TIF_NEED_RESCHED,%edi - jmp sysret_check - - /* Do syscall tracing */ -tracesys: - SAVE_REST - movq $-ENOSYS,RAX(%rsp) - FIXUP_TOP_OF_STACK %rdi - movq %rsp,%rdi - call syscall_trace_enter - LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ - RESTORE_REST - cmpq $__NR_syscall_max,%rax - ja 1f - movq %r10,%rcx /* fixup for C */ - call *sys_call_table(,%rax,8) - movq %rax,RAX-ARGOFFSET(%rsp) -1: SAVE_REST - movq %rsp,%rdi - call syscall_trace_leave - RESTORE_TOP_OF_STACK %rbx - RESTORE_REST - jmp ret_from_sys_call - -badsys: - movq $-ENOSYS,RAX-ARGOFFSET(%rsp) - jmp ret_from_sys_call - -/* - * Syscall return path ending with IRET. - * Has correct top of stack, but partial stack frame. - */ -ENTRY(int_ret_from_sys_call) - XEN_BLOCK_EVENTS(%rsi) - testb $3,CS-ARGOFFSET(%rsp) - jnz 1f - /* Need to set the proper %ss (not NULL) for ring 3 iretq */ - movl $__KERNEL_DS,SS-ARGOFFSET(%rsp) - jmp retint_restore_args # retrun from ring3 kernel -1: - movl $_TIF_ALLWORK_MASK,%edi - /* edi: mask to check */ -int_with_check: - GET_THREAD_INFO(%rcx) - movl threadinfo_flags(%rcx),%edx - andl %edi,%edx - jnz int_careful - jmp retint_restore_args - - /* Either reschedule or signal or syscall exit tracking needed. */ - /* First do a reschedule test. 
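ret_from_sys_call and the sysret_check loop above implement the usual exit
protocol: with event delivery blocked, re-read the thread-info work flags;
if anything is set, run a reschedule or deliver a signal and test again;
fall through to the fast SYSRET path only when no work remains. The control
flow in C (flag values and handlers are stand-ins):

    #include <stdio.h>

    #define TIF_NEED_RESCHED (1u << 0)      /* illustrative values */
    #define TIF_SIGPENDING   (1u << 1)

    static void schedule(void)         { printf("schedule()\n"); }
    static void do_notify_resume(void) { printf("deliver signal\n"); }

    static void return_to_user(unsigned *flags)
    {
        for (;;) {
            /* events are blocked here so the test cannot race */
            unsigned work = *flags;
            if (!work)
                break;                       /* fast path: SYSRET */
            if (work & TIF_NEED_RESCHED) {
                *flags &= ~TIF_NEED_RESCHED; /* demo only; really the */
                schedule();                  /* scheduler clears this */
            } else if (work & TIF_SIGPENDING) {
                *flags &= ~TIF_SIGPENDING;
                do_notify_resume();
            }
            /* loop: work may have appeared while we were busy */
        }
    }

    int main(void)
    {
        unsigned flags = TIF_NEED_RESCHED | TIF_SIGPENDING;
        return_to_user(&flags);
        return 0;
    }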
*/ - /* edx: work, edi: workmask */ -int_careful: - bt $TIF_NEED_RESCHED,%edx - jnc int_very_careful -/* sti */ - XEN_UNBLOCK_EVENTS(%rsi) - pushq %rdi - call schedule - popq %rdi - jmp int_with_check - - /* handle signals and tracing -- both require a full stack frame */ -int_very_careful: -/* sti */ - XEN_UNBLOCK_EVENTS(%rsi) - SAVE_REST - /* Check for syscall exit trace */ - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx - jz int_signal - pushq %rdi - leaq 8(%rsp),%rdi # &ptregs -> arg1 - call syscall_trace_leave - popq %rdi - btr $TIF_SYSCALL_TRACE,%edi - btr $TIF_SYSCALL_AUDIT,%edi - btr $TIF_SINGLESTEP,%edi - jmp int_restore_rest - -int_signal: - testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx - jz 1f - movq %rsp,%rdi # &ptregs -> arg1 - xorl %esi,%esi # oldset -> arg2 - call do_notify_resume -1: movl $_TIF_NEED_RESCHED,%edi -int_restore_rest: - RESTORE_REST - jmp int_with_check - CFI_ENDPROC - -/* - * Certain special system calls that need to save a complete full stack frame. - */ - - .macro PTREGSCALL label,func,arg - .globl \label -\label: - leaq \func(%rip),%rax - leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ - jmp ptregscall_common - .endm - - PTREGSCALL stub_clone, sys_clone, %r8 - PTREGSCALL stub_fork, sys_fork, %rdi - PTREGSCALL stub_vfork, sys_vfork, %rdi - PTREGSCALL stub_rt_sigsuspend, sys_rt_sigsuspend, %rdx - PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx - PTREGSCALL stub_iopl, sys_iopl, %rsi - -ENTRY(ptregscall_common) - CFI_STARTPROC - popq %r11 - CFI_ADJUST_CFA_OFFSET -8 - SAVE_REST - movq %r11, %r15 - FIXUP_TOP_OF_STACK %r11 - call *%rax - RESTORE_TOP_OF_STACK %r11 - movq %r15, %r11 - RESTORE_REST - pushq %r11 - CFI_ADJUST_CFA_OFFSET 8 - ret - CFI_ENDPROC - -ENTRY(stub_execve) - CFI_STARTPROC - popq %r11 - CFI_ADJUST_CFA_OFFSET -8 - SAVE_REST - movq %r11, %r15 - FIXUP_TOP_OF_STACK %r11 - call sys_execve - GET_THREAD_INFO(%rcx) - bt $TIF_IA32,threadinfo_flags(%rcx) - jc exec_32bit - RESTORE_TOP_OF_STACK %r11 - movq %r15, %r11 - RESTORE_REST - push %r11 - ret - -exec_32bit: - CFI_ADJUST_CFA_OFFSET REST_SKIP - movq %rax,RAX(%rsp) - RESTORE_REST - jmp int_ret_from_sys_call - CFI_ENDPROC - -/* - * sigreturn is special because it needs to restore all registers on return. - * This cannot be done with SYSRET, so use the IRET return path instead. - */ -ENTRY(stub_rt_sigreturn) - CFI_STARTPROC - addq $8, %rsp - SAVE_REST - movq %rsp,%rdi - FIXUP_TOP_OF_STACK %r11 - call sys_rt_sigreturn - movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer - RESTORE_REST - jmp int_ret_from_sys_call - CFI_ENDPROC - - -/* - * Interrupt entry/exit. - * - * Interrupt entry points save only callee clobbered registers in fast path. - * - * Entry runs with interrupts off. - */ - -/* 0(%rsp): interrupt number */ - .macro interrupt func - CFI_STARTPROC simple - CFI_DEF_CFA rsp,(SS-RDI) - CFI_REL_OFFSET rsp,(RSP-ORIG_RAX) - CFI_REL_OFFSET rip,(RIP-ORIG_RAX) - cld -#ifdef CONFIG_DEBUG_INFO - SAVE_ALL - movq %rsp,%rdi - /* - * Setup a stack frame pointer. This allows gdb to trace - * back to the original stack. 
- */ - movq %rsp,%rbp - CFI_DEF_CFA_REGISTER rbp -#else - SAVE_ARGS - leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler -#endif -#if 0 /* For Xen we don't need to do this */ - testl $3,CS(%rdi) - je 1f - swapgs -#endif -1: addl $1,%gs:pda_irqcount # RED-PEN should check preempt count - movq %gs:pda_irqstackptr,%rax - cmoveq %rax,%rsp - pushq %rdi # save old stack - call \func - .endm - -retint_check: - movl threadinfo_flags(%rcx),%edx - andl %edi,%edx - jnz retint_careful -retint_restore_args: - movb EVENT_MASK-REST_SKIP(%rsp), %al - notb %al # %al == ~saved_mask - XEN_GET_VCPU_INFO(%rsi) - andb evtchn_upcall_mask(%rsi),%al - andb $1,%al # %al == mask & ~saved_mask - jnz restore_all_enable_events # != 0 => reenable event delivery - XEN_PUT_VCPU_INFO(%rsi) - - RESTORE_ARGS 0,8,0 - testb $3,8(%rsp) # check CS - jnz user_mode -kernel_mode: - orb $3,1*8(%rsp) - iretq -user_mode: - SWITCH_TO_USER 0 - - /* edi: workmask, edx: work */ -retint_careful: - bt $TIF_NEED_RESCHED,%edx - jnc retint_signal - XEN_UNBLOCK_EVENTS(%rsi) -/* sti */ - pushq %rdi - call schedule - popq %rdi - XEN_BLOCK_EVENTS(%rsi) - GET_THREAD_INFO(%rcx) -/* cli */ - jmp retint_check - -retint_signal: - testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx - jz retint_restore_args - XEN_UNBLOCK_EVENTS(%rsi) - SAVE_REST - movq $-1,ORIG_RAX(%rsp) - xorq %rsi,%rsi # oldset - movq %rsp,%rdi # &pt_regs - call do_notify_resume - RESTORE_REST - XEN_BLOCK_EVENTS(%rsi) - movl $_TIF_NEED_RESCHED,%edi - GET_THREAD_INFO(%rcx) - jmp retint_check - -#ifdef CONFIG_PREEMPT - /* Returning to kernel space. Check if we need preemption */ - /* rcx: threadinfo. interrupts off. */ - .p2align -retint_kernel: - cmpl $0,threadinfo_preempt_count(%rcx) - jnz retint_restore_args - bt $TIF_NEED_RESCHED,threadinfo_flags(%rcx) - jnc retint_restore_args - bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ - jc retint_restore_args - movl $PREEMPT_ACTIVE,threadinfo_preempt_count(%rcx) -/* sti */ - XEN_UNBLOCK_EVENTS(%rsi) - call schedule - XEN_BLOCK_EVENTS(%rsi) -/* cli */ - GET_THREAD_INFO(%rcx) - movl $0,threadinfo_preempt_count(%rcx) - jmp retint_kernel /* check again */ -#endif - CFI_ENDPROC - -/* - * APIC interrupts. - */ - .macro apicinterrupt num,func - pushq $\num-256 - interrupt \func - jmp error_entry - CFI_ENDPROC - .endm - -#ifdef CONFIG_SMP -ENTRY(reschedule_interrupt) - apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt - -ENTRY(invalidate_interrupt) - apicinterrupt INVALIDATE_TLB_VECTOR,smp_invalidate_interrupt - -ENTRY(call_function_interrupt) - apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt -#endif - -#ifdef CONFIG_X86_LOCAL_APIC -ENTRY(apic_timer_interrupt) - apicinterrupt LOCAL_TIMER_VECTOR,smp_apic_timer_interrupt - -ENTRY(error_interrupt) - apicinterrupt ERROR_APIC_VECTOR,smp_error_interrupt - -ENTRY(spurious_interrupt) - apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt -#endif - -/* - * Exception entry points. 
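Under CONFIG_PREEMPT, retint_kernel above agrees to preempt the kernel only
when the preempt count is zero, TIF_NEED_RESCHED is set, and the bt $9 test
on the saved flags word says it is safe; in this Xen port that word
reflects virtualized event delivery rather than the hardware IF bit, so the
exact sense of that last test is left abstract below. A rough C rendering
of the predicate (bit positions and names illustrative):

    #define TIF_NEED_RESCHED (1u << 3)      /* illustrative bit */

    static int may_preempt(int preempt_count, unsigned ti_flags,
                           int events_safe)
    {
        if (preempt_count != 0)
            return 0;               /* non-preemptible section */
        if (!(ti_flags & TIF_NEED_RESCHED))
            return 0;               /* nobody wants the CPU */
        return events_safe;         /* the saved-flags test above */
    }

    int main(void)
    {
        return !may_preempt(0, TIF_NEED_RESCHED, 1);
    }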
- */ - .macro zeroentry sym - movq (%rsp),%rcx - movq 8(%rsp),%r11 - addq $0x10,%rsp /* skip rcx and r11 */ - pushq $0 /* push error code/oldrax */ - pushq %rax /* push real oldrax to the rdi slot */ - leaq \sym(%rip),%rax - jmp error_entry - .endm - - .macro errorentry sym - movq (%rsp),%rcx - movq 8(%rsp),%r11 - addq $0x10,%rsp /* rsp points to the error code */ - pushq %rax - leaq \sym(%rip),%rax - jmp error_entry - .endm - - /* error code is on the stack already */ - /* handle NMI like exceptions that can happen everywhere */ - .macro paranoidentry sym - movq (%rsp),%rcx - movq 8(%rsp),%r11 - addq $0x10,%rsp /* skip rcx and r11 */ - SAVE_ALL - cld - movl $1,%ebx - movl $MSR_GS_BASE,%ecx - rdmsr - testl %edx,%edx - js 1f -/* swapgs */ - xorl %ebx,%ebx -1: movq %rsp,%rdi - movq ORIG_RAX(%rsp),%rsi - movq $-1,ORIG_RAX(%rsp) - call \sym - .endm - -/* - * Exception entry point. This expects an error code/orig_rax on the stack - * and the exception handler in %rax. - */ -ENTRY(error_entry) - CFI_STARTPROC simple - CFI_DEF_CFA rsp,(SS-RDI) - CFI_REL_OFFSET rsp,(RSP-RDI) - CFI_REL_OFFSET rip,(RIP-RDI) - /* rdi slot contains rax, oldrax contains error code */ - cld - subq $14*8,%rsp - CFI_ADJUST_CFA_OFFSET (14*8) - movq %rsi,13*8(%rsp) - CFI_REL_OFFSET rsi,RSI - movq 14*8(%rsp),%rsi /* load rax from rdi slot */ - movq %rdx,12*8(%rsp) - CFI_REL_OFFSET rdx,RDX - movq %rcx,11*8(%rsp) - CFI_REL_OFFSET rcx,RCX - movq %rsi,10*8(%rsp) /* store rax */ - CFI_REL_OFFSET rax,RAX - movq %r8, 9*8(%rsp) - CFI_REL_OFFSET r8,R8 - movq %r9, 8*8(%rsp) - CFI_REL_OFFSET r9,R9 - movq %r10,7*8(%rsp) - CFI_REL_OFFSET r10,R10 - movq %r11,6*8(%rsp) - CFI_REL_OFFSET r11,R11 - movq %rbx,5*8(%rsp) - CFI_REL_OFFSET rbx,RBX - movq %rbp,4*8(%rsp) - CFI_REL_OFFSET rbp,RBP - movq %r12,3*8(%rsp) - CFI_REL_OFFSET r12,R12 - movq %r13,2*8(%rsp) - CFI_REL_OFFSET r13,R13 - movq %r14,1*8(%rsp) - CFI_REL_OFFSET r14,R14 - movq %r15,(%rsp) - CFI_REL_OFFSET r15,R15 -#if 0 - cmpl $__KERNEL_CS,CS(%rsp) - je error_kernelspace -#endif -error_call_handler: - movq %rdi, RDI(%rsp) - movq %rsp,%rdi - movq ORIG_RAX(%rsp),%rsi # get error code - movq $-1,ORIG_RAX(%rsp) - call *%rax -error_exit: - RESTORE_REST -/* cli */ - XEN_BLOCK_EVENTS(%rsi) - GET_THREAD_INFO(%rcx) - testb $3,CS-ARGOFFSET(%rsp) - jz retint_kernel - movl threadinfo_flags(%rcx),%edx - movl $_TIF_WORK_MASK,%edi - andl %edi,%edx - jnz retint_careful - jmp retint_restore_args - -error_kernelspace: - /* - * We need to re-write the logic here because we don't do iretq to - * to return to user mode. It's still possible that we get trap/fault - * in the kernel (when accessing buffers pointed to by system calls, - * for example). - * - */ -#if 0 - incl %ebx - /* There are two places in the kernel that can potentially fault with - usergs. Handle them here. The exception handlers after - iret run with kernel gs again, so don't set the user space flag. - B stepping K8s sometimes report an truncated RIP for IRET - exceptions returning to compat mode. Check for these here too. */ - leaq iret_label(%rip),%rbp - cmpq %rbp,RIP(%rsp) - je error_swapgs - movl %ebp,%ebp /* zero extend */ - cmpq %rbp,RIP(%rsp) - je error_swapgs - cmpq $gs_change,RIP(%rsp) - je error_swapgs - jmp error_sti -#endif - -ENTRY(hypervisor_callback) - zeroentry do_hypervisor_callback - -/* - * Copied from arch/xen/i386/kernel/entry.S - */ -# A note on the "critical region" in our callback handler. -# We want to avoid stacking callback handlers due to events occurring -# during handling of the last event. 
To do this, we keep events disabled
-# until we've done all processing. HOWEVER, we must enable events before
-# popping the stack frame (can't be done atomically) and so it would still
-# be possible to get enough handler activations to overflow the stack.
-# Although unlikely, bugs of that kind are hard to track down, so we'd
-# like to avoid the possibility.
-# So, on entry to the handler we detect whether we interrupted an
-# existing activation in its critical region -- if so, we pop the current
-# activation and restart the handler using the previous one.
-ENTRY(do_hypervisor_callback)   # do_hypervisor_callback(struct *pt_regs)
-# Since we don't modify %rdi, evtchn_do_upcall(struct *pt_regs) will
-# see the correct pointer to the pt_regs
-        addq $8, %rsp            # we don't return, adjust the stack frame
-11:     movb $0, EVENT_MASK(%rsp)
-        call evtchn_do_upcall
-        jmp  error_exit
-
-        ALIGN
-restore_all_enable_events:
-        XEN_UNBLOCK_EVENTS(%rsi)        # %rsi is already set up...
-
-scrit:  /**** START OF CRITICAL REGION ****/
-        XEN_TEST_PENDING(%rsi)
-        jnz  14f                        # process more events if necessary...
-        XEN_PUT_VCPU_INFO(%rsi)
-        RESTORE_ARGS 0,8,0
-        testb $3,8(%rsp)                # check CS
-        jnz  crit_user_mode
-        orb   $3,1*8(%rsp)
-        iretq
-crit_user_mode:
-        SWITCH_TO_USER 0
-
-14:     XEN_LOCKED_BLOCK_EVENTS(%rsi)
-        XEN_PUT_VCPU_INFO(%rsi)
-        SAVE_REST
-        movq %rsp,%rdi                  # set the argument again
-        jmp  11b
-ecrit:  /**** END OF CRITICAL REGION ****/
-# At this point, unlike on x86-32, we don't do the fixup to simplify the
-# code and the stack frame is more complex on x86-64.
-# When the kernel is interrupted in the critical section, it will simply do
-# IRET, and everything will be restored at that point,
-# i.e. it just resumes from the next instruction interrupted with the same context.
-
-# Hypervisor uses this for application faults while it executes.
-ENTRY(failsafe_callback)
-        addq $0x10,%rsp /* skip rcx and r11 */
-1:      movl (%rsp),%ds
-2:      movl 8(%rsp),%es
-3:      movl 16(%rsp),%fs
-4:      movl 24(%rsp),%gs
-        addq $0x20,%rsp /* skip the above selectors */
-        SAVE_ALL
-        jmp  error_exit
-.section .fixup,"ax";   \
-6:      movq $0,(%rsp); \
-        jmp 1b;         \
-7:      movq $0,8(%rsp);        \
-        jmp 2b;         \
-8:      movq $0,16(%rsp);       \
-        jmp 3b;         \
-9:      movq $0,24(%rsp);       \
-        jmp 4b;         \
-.previous;              \
-.section __ex_table,"a";\
-        .align 16;      \
-        .quad 1b,6b;    \
-        .quad 2b,7b;    \
-        .quad 3b,8b;    \
-        .quad 4b,9b;    \
-.previous
-
-#if 0
-        .section __ex_table,"a"
-        .align 8
-        .quad gs_change,bad_gs
-        .previous
-        .section .fixup,"ax"
-        /* running with kernelgs */
-bad_gs:
-/*      swapgs          */      /* switch back to user gs */
-        xorl %eax,%eax
-        movl %eax,%gs
-        jmp  2b
-        .previous
-#endif
-
-/*
- * Create a kernel thread.
- *
- * C extern interface:
- *      extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
- *
- * asm input arguments:
- *      rdi: fn, rsi: arg, rdx: flags
- */
-ENTRY(kernel_thread)
-        CFI_STARTPROC
-        FAKE_STACK_FRAME $child_rip
-        SAVE_ALL
-
-        # rdi: flags, rsi: usp, rdx: will be &pt_regs
-        movq %rdx,%rdi
-        orq  kernel_thread_flags(%rip),%rdi
-        movq $-1, %rsi
-        movq %rsp, %rdx
-
-        xorl %r8d,%r8d
-        xorl %r9d,%r9d
-
-        # clone now
-        call do_fork
-        movq %rax,RAX(%rsp)
-        xorl %edi,%edi
-
-        /*
-         * It isn't worth checking for a reschedule here,
-         * so internally to the x86_64 port you can rely on kernel_thread()
-         * not to reschedule the child before returning; this avoids the need
-         * for hacks, for example to fork off the per-CPU idle tasks.
- * [Hopefully no generic code relies on the reschedule -AK] - */ - RESTORE_ALL - UNFAKE_STACK_FRAME - ret - CFI_ENDPROC - - -child_rip: - /* - * Here we are in the child and the registers are set as they were - * at kernel_thread() invocation in the parent. - */ - movq %rdi, %rax - movq %rsi, %rdi - call *%rax - # exit - xorq %rdi, %rdi - call do_exit - -/* - * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. - * - * C extern interface: - * extern long execve(char *name, char **argv, char **envp) - * - * asm input arguments: - * rdi: name, rsi: argv, rdx: envp - * - * We want to fallback into: - * extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs) - * - * do_sys_execve asm fallback arguments: - * rdi: name, rsi: argv, rdx: envp, fake frame on the stack - */ -ENTRY(execve) - CFI_STARTPROC - FAKE_STACK_FRAME $0 - SAVE_ALL - call sys_execve - movq %rax, RAX(%rsp) - RESTORE_REST - testq %rax,%rax - jne 1f - jmp int_ret_from_sys_call -1: RESTORE_ARGS - UNFAKE_STACK_FRAME - ret - CFI_ENDPROC - - - /* - * Copy error_entry because of the different stack frame - */ -ENTRY(page_fault) - movq (%rsp),%rcx - movq 8(%rsp),%r11 - addq $0x10,%rsp # now %rsp points to %cr2 - pushq %rax - leaq do_page_fault(%rip),%rax - cld - subq $13*8,%rsp - movq %rdx,12*8(%rsp) # save %rdx - movq 13*8(%rsp),%rdx # load rax - movq %rcx,11*8(%rsp) - movq %rdx,10*8(%rsp) # store rax - movq %rsi,13*8(%rsp) # now save %rsi - movq 14*8(%rsp),%rdx # load %cr2, 3rd argument - movq %r8, 9*8(%rsp) - movq %r9, 8*8(%rsp) - movq %r10,7*8(%rsp) - movq %r11,6*8(%rsp) - movq %rbx,5*8(%rsp) - movq %rbp,4*8(%rsp) - movq %r12,3*8(%rsp) - movq %r13,2*8(%rsp) - movq %r14,1*8(%rsp) - movq %r15,(%rsp) -#if 0 - cmpl $__KERNEL_CS,CS(%rsp) - je error_kernelspace -#endif - /* - * 1st and 2nd arguments are set by error_call_handler - */ - jmp error_call_handler - -ENTRY(coprocessor_error) - zeroentry do_coprocessor_error - -ENTRY(simd_coprocessor_error) - zeroentry do_simd_coprocessor_error - -ENTRY(device_not_available) - zeroentry math_state_restore - - /* runs on exception stack */ -ENTRY(debug) - CFI_STARTPROC - pushq $0 - CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_debug - /* switch back to process stack to restore the state ptrace touched */ - movq %rax,%rsp - jmp paranoid_exit - CFI_ENDPROC - -#if 0 - /* runs on exception stack */ -ENTRY(nmi) - CFI_STARTPROC - pushq $-1 - CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_nmi - /* ebx: no swapgs flag */ -#endif -paranoid_exit: - testl %ebx,%ebx /* swapgs needed? 
*/
-        jnz paranoid_restore
-paranoid_swapgs:
-/*      cli
-        swapgs */
-paranoid_restore:
-        RESTORE_ALL 8
-/*      iretq */
-paranoid_userspace:
-/*      cli */
-        GET_THREAD_INFO(%rcx)
-        movl threadinfo_flags(%rcx),%edx
-        testl $_TIF_NEED_RESCHED,%edx
-        jnz paranoid_resched
-        testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx
-        jnz paranoid_signal
-        jmp paranoid_swapgs
-paranoid_resched:
-/*      sti */
-        call schedule
-        jmp paranoid_exit
-paranoid_signal:
-/*      sti */
-        xorl %esi,%esi /* oldset */
-        movq %rsp,%rdi /* &pt_regs */
-        call do_notify_resume
-        jmp paranoid_exit
-        CFI_ENDPROC
-
-ENTRY(int3)
-        zeroentry do_int3
-
-ENTRY(overflow)
-        zeroentry do_overflow
-
-ENTRY(bounds)
-        zeroentry do_bounds
-
-ENTRY(invalid_op)
-        zeroentry do_invalid_op
-
-ENTRY(coprocessor_segment_overrun)
-        zeroentry do_coprocessor_segment_overrun
-
-ENTRY(reserved)
-        zeroentry do_reserved
-
-        /* runs on exception stack */
-ENTRY(double_fault)
-        CFI_STARTPROC
-        paranoidentry do_double_fault
-        movq %rax,%rsp
-        jmp paranoid_exit
-        CFI_ENDPROC
-
-ENTRY(invalid_TSS)
-        errorentry do_invalid_TSS
-
-ENTRY(segment_not_present)
-        errorentry do_segment_not_present
-
-        /* runs on exception stack */
-ENTRY(stack_segment)
-        CFI_STARTPROC
-        paranoidentry do_stack_segment
-        movq %rax,%rsp
-        jmp paranoid_exit
-        CFI_ENDPROC
-
-ENTRY(general_protection)
-        errorentry do_general_protection
-
-ENTRY(alignment_check)
-        errorentry do_alignment_check
-
-ENTRY(divide_error)
-        zeroentry do_divide_error
-
-ENTRY(spurious_interrupt_bug)
-        zeroentry do_spurious_interrupt_bug
-
-#ifdef CONFIG_X86_MCE
-        /* runs on exception stack */
-ENTRY(machine_check)
-        CFI_STARTPROC
-        pushq $0
-        CFI_ADJUST_CFA_OFFSET 8
-        paranoidentry do_machine_check
-        jmp paranoid_exit
-        CFI_ENDPROC
-#endif
-
-ENTRY(call_debug)
-        zeroentry do_call_debug
-
-
diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/head.S
--- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/head.S    Fri Jul 15 19:57:12 2005
+++ /dev/null  Sat Jul 16 14:02:54 2005
@@ -1,207 +0,0 @@
-/*
- *  linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
- *
- *  Copyright (C) 2000 Andrea Arcangeli <andrea@xxxxxxx> SuSE
- *  Copyright (C) 2000 Pavel Machek <pavel@xxxxxxx>
- *  Copyright (C) 2000 Karsten Keil <kkeil@xxxxxxx>
- *  Copyright (C) 2001,2002 Andi Kleen <ak@xxxxxxx>
- *
- *  $Id: head.S,v 1.49 2002/03/19 17:39:25 ak Exp $
- *
- *  Jun Nakajima <jun.nakajima@xxxxxxxxx>
- *    Modified for Xen
- */
-
-
-#include <linux/linkage.h>
-
-.section __xen_guest
-        .ascii "GUEST_OS=linux,GUEST_VER=2.6,XEN_VER=3.0,VIRT_BASE=0xffffffff80000000"
-        .ascii ",LOADER=generic"
-/*      .ascii ",PT_MODE_WRITABLE" */
-        .byte 0
-
-
-#include <linux/threads.h>
-#include <asm/desc.h>
-#include <asm/segment.h>
-#include <asm/page.h>
-#include <asm/msr.h>
-#include <asm/cache.h>
-/* #include <asm/thread_info.h> */
-
-
-/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
- * because we need identity-mapped pages on setup, so define __START_KERNEL to
- * 0x100000 for this stage
- *
- */
-
-        .text
-        .code64
-ENTRY(_start)
-        cld
-        movq init_rsp(%rip),%rsp
-        /* Copy the necessary stuff from xen_start_info structure. */
-        movq  $xen_start_info_union,%rdi
-        movq  $64,%rcx          /* sizeof (union xen_start_info_union) / sizeof (long) */
-        rep movsq
-
-#ifdef CONFIG_SMP
-        ENTRY(startup_64_smp)
-        cld
-#endif  /* CONFIG_SMP */
-
-        /* zero EFLAGS after setting rsp */
-        pushq $0
-        popfq
-        movq    initial_code(%rip),%rax
-        jmp     *%rax
-
-        /* SMP bootup changes these two */
-        .globl initial_code
-initial_code:
-        .quad   x86_64_start_kernel
-        .globl init_rsp
-init_rsp:
-        .quad  init_thread_union+THREAD_SIZE-8
-
-ENTRY(early_idt_handler)
-        xorl %eax,%eax
-        movq 8(%rsp),%rsi       # get rip
-        movq (%rsp),%rdx
-        leaq early_idt_msg(%rip),%rdi
-1:      hlt                     # generate #GP
-        jmp 1b
-
-early_idt_msg:
-        .asciz "PANIC: early exception rip %lx error %lx cr2 %lx\n"
-
-#if 0
-ENTRY(lgdt_finish)
-        movl $(__USER_DS),%eax          # DS/ES contains default USER segment
-        movw %ax,%ds
-        movw %ax,%es
-        movl $(__KERNEL_DS),%eax
-        movw %ax,%ss                    # after changing gdt.
-        popq %rax                       # get the return address
-        pushq $(__KERNEL_CS)
-        pushq %rax
-        lretq
-#endif
-
-ENTRY(stext)
-ENTRY(_stext)
-
-        /*
-         * This default setting generates an ident mapping at address 0x100000
-         * and a mapping for the kernel that precisely maps virtual address
-         * 0xffffffff80000000 to physical address 0x000000. (always using
-         * 2Mbyte large pages provided by PAE mode)
-         */
-.org 0x1000
-ENTRY(init_level4_pgt)
-        .fill   512,8,0
-
-        /*
-         * We update two pgd entries to make kernel and user pgd consistent
-         * at pgd_populate(). It can be used for kernel modules. So we place
-         * this page here for those cases to avoid memory corruption.
-         * We also use this page to establish the initial mapping for the
-         * vsyscall area.
-         */
-.org 0x2000
-ENTRY(init_level4_user_pgt)
-        .fill   512,8,0
-
-        /*
-         * This is used for vsyscall area mapping as we have a different
-         * level4 page table for user.
-         */
-.org 0x3000
ENTRY(level3_user_pgt)
-        .fill   512,8,0
-
-.org 0x4000
-ENTRY(cpu_gdt_table)
-/* The TLS descriptors are currently at a different place compared to i386.
-   Hopefully nobody expects them at a fixed place (Wine?)
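cpu_gdt_table packs each segment descriptor into a single 64-bit literal. As a worked example, here is a hedged decoder for two of the quadwords that follow; the field offsets are the standard x86 descriptor layout, and the code is purely illustrative, not part of any kernel source.

#include <stdint.h>
#include <stdio.h>

/* Pull the usual x86 descriptor fields out of one packed GDT
 * quadword: type/S/DPL/P live in bits 40..47, L at bit 53, D/B at 54. */
static void decode_desc(uint64_t d)
{
    unsigned type = (d >> 40) & 0xf;
    unsigned s    = (d >> 44) & 1;   /* code/data vs system   */
    unsigned dpl  = (d >> 45) & 3;
    unsigned p    = (d >> 47) & 1;
    unsigned l    = (d >> 53) & 1;   /* 64-bit code segment   */
    unsigned db   = (d >> 54) & 1;
    printf("type=%x s=%u dpl=%u p=%u l=%u d/b=%u\n",
           type, s, dpl, p, l, db);
}

int main(void)
{
    decode_desc(0x00affa000000ffffULL); /* __USER_CS: dpl=3, l=1   */
    decode_desc(0x00cff2000000ffffULL); /* __USER_DS: dpl=3, d/b=1 */
    return 0;
}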
*/ - .quad 0x0000000000000000 /* NULL descriptor */ - .quad 0x008ffa000000ffff /* __KERNEL_COMPAT32_CS */ - .quad 0x00affa000000ffff /* __KERNEL_CS */ - .quad 0x00cff2000000ffff /* __KERNEL_DS */ - - .quad 0x00cffa000000ffff /* __USER32_CS */ - .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */ - .quad 0x00affa000000ffff /* __USER_CS */ - .quad 0x00cffa000000ffff /* __KERNEL32_CS */ - .quad 0,0 /* TSS */ - .quad 0 /* LDT */ - .quad 0,0,0 /* three TLS descriptors */ - .quad 0 /* unused now */ - -gdt_end: - /* asm/segment.h:GDT_ENTRIES must match this */ - /* This should be a multiple of the cache line size */ - /* GDTs of other CPUs: */ - .fill (GDT_SIZE * NR_CPUS) - (gdt_end - cpu_gdt_table) - -.org 0x5000 -ENTRY(empty_zero_page) - -.org 0x6000 -ENTRY(empty_bad_page) - -.org 0x7000 -ENTRY(empty_bad_pte_table) - -.org 0x8000 -ENTRY(empty_bad_pmd_table) - - .org 0x9000 -#ifdef CONFIG_ACPI_SLEEP -ENTRY(wakeup_level4_pgt) - .quad 0x0000000000102007 /* -> level3_ident_pgt */ - .fill 255,8,0 - .quad 0x000000000010a007 - .fill 254,8,0 - /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad 0x0000000000103007 /* -> level3_kernel_pgt */ -#endif - - .data - - .align 16 - .globl cpu_gdt_descr -cpu_gdt_descr: - .word gdt_end-cpu_gdt_table -gdt: - .quad cpu_gdt_table -#ifdef CONFIG_SMP - .rept NR_CPUS-1 - .word 0 - .quad 0 - .endr -#endif - -ENTRY(gdt_table32) - .quad 0x0000000000000000 /* This one is magic */ - .quad 0x0000000000000000 /* unused */ - .quad 0x00af9a000000ffff /* __KERNEL_CS */ -gdt32_end: - -/* We need valid kernel segments for data and code in long mode too - * IRET will check the segment types kkeil 2000/10/28 - * Also sysret mandates a special GDT layout - */ - -#if 0 -.align L1_CACHE_BYTES -#endif - .align L1_CACHE_BYTES -ENTRY(idt_table) - .rept 256 - .quad 0 - .quad 0 - .endr - diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/head64.c --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/head64.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,132 +0,0 @@ -/* - * linux/arch/x86_64/kernel/head64.c -- prepare to run common code - * - * Copyright (C) 2000 Andrea Arcangeli <andrea@xxxxxxx> SuSE - * - * $Id: head64.c,v 1.22 2001/07/06 14:28:20 ak Exp $ - * - * Jun Nakajima <jun.nakajima@xxxxxxxxx> - * Modified for Xen. - */ - -#include <linux/init.h> -#include <linux/linkage.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/percpu.h> - -#include <asm/processor.h> -#include <asm/proto.h> -#include <asm/smp.h> -#include <asm/bootsetup.h> -#include <asm/setup.h> -#include <asm/desc.h> - -unsigned long start_pfn; - -/* Don't add a printk in there. printk relies on the PDA which is not initialized - yet. 
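The idt_table reservation above (256 pairs of .quad) corresponds to the 16-byte long-mode gate format: a handler address split across three fields plus a selector, IST index, and attribute byte. A hedged C view of that layout, using the architectural field names rather than any kernel struct:

#include <stdint.h>
#include <stdio.h>

/* 64-bit IDT gate: 16 bytes, matching the ".quad 0; .quad 0" pairs
 * reserved by idt_table in head.S above. */
struct idt_gate64 {
    uint16_t offset_low;    /* handler bits 0..15   */
    uint16_t selector;      /* code segment         */
    uint8_t  ist;           /* bits 0..2 used       */
    uint8_t  type_attr;     /* P, DPL, gate type    */
    uint16_t offset_mid;    /* handler bits 16..31  */
    uint32_t offset_high;   /* handler bits 32..63  */
    uint32_t reserved;
};

/* 256 entries * 16 bytes = 4096 bytes, exactly one page. */
_Static_assert(sizeof(struct idt_gate64) == 16, "gate must be 16 bytes");

int main(void)
{
    printf("%zu bytes per gate\n", sizeof(struct idt_gate64));
    return 0;
}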
*/ -#if 0 -static void __init clear_bss(void) -{ - extern char __bss_start[], __bss_end[]; - memset(__bss_start, 0, - (unsigned long) __bss_end - (unsigned long) __bss_start); -} -#endif - -extern char x86_boot_params[2048]; - -#define NEW_CL_POINTER 0x228 /* Relative to real mode data */ -#define OLD_CL_MAGIC_ADDR 0x90020 -#define OLD_CL_MAGIC 0xA33F -#define OLD_CL_BASE_ADDR 0x90000 -#define OLD_CL_OFFSET 0x90022 - -extern char saved_command_line[]; - -#if 0 -static void __init copy_bootdata(char *real_mode_data) -{ - int new_data; - char * command_line; - - memcpy(x86_boot_params, real_mode_data, 2048); - new_data = *(int *) (x86_boot_params + NEW_CL_POINTER); - if (!new_data) { - if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) { - printk("so old bootloader that it does not support commandline?!\n"); - return; - } - new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET; - printk("old bootloader convention, maybe loadlin?\n"); - } - command_line = (char *) ((u64)(new_data)); - memcpy(saved_command_line, command_line, COMMAND_LINE_SIZE); - printk("Bootdata ok (command line is %s)\n", saved_command_line); -} -#endif - -static void __init setup_boot_cpu_data(void) -{ - int dummy, eax; - - /* get vendor info */ - cpuid(0, &boot_cpu_data.cpuid_level, - (int *)&boot_cpu_data.x86_vendor_id[0], - (int *)&boot_cpu_data.x86_vendor_id[8], - (int *)&boot_cpu_data.x86_vendor_id[4]); - - /* get cpu type */ - cpuid(1, &eax, &dummy, &dummy, - (unsigned int *) &boot_cpu_data.x86_capability); - boot_cpu_data.x86 = (eax >> 8) & 0xf; - boot_cpu_data.x86_model = (eax >> 4) & 0xf; - boot_cpu_data.x86_mask = eax & 0xf; -} - -extern char _end[]; - -void __init x86_64_start_kernel(char * real_mode_data) -{ - int i; - - phys_to_machine_mapping = (u32 *)xen_start_info.mfn_list; - start_pfn = (__pa(xen_start_info.pt_base) >> PAGE_SHIFT) + xen_start_info.nr_pt_frames; - - for (i = 0; i < 256; i++) - set_intr_gate(i, early_idt_handler); -#if 0 - asm volatile("lidt %0" :: "m" (idt_descr)); -#endif - pda_init(0); - /* copy_bootdata(real_mode_data); */ -#ifdef CONFIG_SMP - cpu_set(0, cpu_online_map); -#endif - /* default console: */ - if (!strstr(saved_command_line, "console=")) - strcat(saved_command_line, " console=tty0"); -#if 0 - s = strstr(saved_command_line, "earlyprintk="); - if (s != NULL) - setup_early_printk(s); -#endif - -#ifdef CONFIG_DISCONTIGMEM - s = strstr(saved_command_line, "numa="); - if (s != NULL) - numa_setup(s+5); -#endif -#ifdef CONFIG_X86_IO_APIC - if (strstr(saved_command_line, "disableapic")) - disable_apic = 1; -#endif - /* You need early console to see that */ - if (__pa_symbol(&_end) >= KERNEL_TEXT_SIZE) - panic("Kernel too big for kernel mapping\n"); - - setup_boot_cpu_data(); - start_kernel(); -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/init_task.c --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/init_task.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,49 +0,0 @@ -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/init.h> -#include <linux/init_task.h> -#include <linux/fs.h> -#include <linux/mqueue.h> - -#include <asm/uaccess.h> -#include <asm/pgtable.h> -#include <asm/desc.h> - -static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; -static struct signal_struct init_signals = INIT_SIGNALS(init_signals); -static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - 
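setup_boot_cpu_data() earlier in head64.c derives the CPU family, model, and stepping from CPUID leaf 1's EAX with plain shifts and masks. A hedged standalone sketch of the same bit slicing, fed a canned EAX value instead of a live cpuid instruction (base fields only; the extended family/model encoding is ignored, as it is in the original):

#include <stdio.h>

/* The same shifts as setup_boot_cpu_data(): stepping in bits 0..3,
 * model in 4..7, family in 8..11 of CPUID.1:EAX. */
static void decode_leaf1_eax(unsigned eax)
{
    unsigned family   = (eax >> 8) & 0xf;
    unsigned model    = (eax >> 4) & 0xf;
    unsigned stepping = eax & 0xf;
    printf("family %u, model %u, stepping %u\n",
           family, model, stepping);
}

int main(void)
{
    decode_leaf1_eax(0x0f48); /* e.g. family 15, model 4, stepping 8 */
    return 0;
}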
-EXPORT_SYMBOL(init_mm); - -/* - * Initial task structure. - * - * We need to make sure that this is 8192-byte aligned due to the - * way process stacks are handled. This is done by having a special - * "init_task" linker map entry.. - */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; - -/* - * Initial task structure. - * - * All other task structs will be allocated on slabs in fork.c - */ -struct task_struct init_task = INIT_TASK(init_task); - -EXPORT_SYMBOL(init_task); -/* - * per-CPU TSS segments. Threads are completely 'soft' on Linux, - * no more per-task TSS's. The TSS size is kept cacheline-aligned - * so they are allowed to end up in the .data.cacheline_aligned - * section. Since TSS's are completely CPU-local, we want them - * on exact cacheline boundaries, to eliminate cacheline ping-pong. - */ -DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_maxaligned_in_smp; - -#define ALIGN_TO_4K __attribute__((section(".data.init_task"))) diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/io_apic.c --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/io_apic.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,2051 +0,0 @@ -/* - * Intel IO-APIC support for multi-Pentium hosts. - * - * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo - * - * Many thanks to Stig Venaas for trying out countless experimental - * patches and reporting/debugging problems patiently! - * - * (c) 1999, Multiple IO-APIC support, developed by - * Ken-ichi Yaku <yaku@xxxxxxxxxxxxxxxxxxxx> and - * Hidemi Kishimoto <kisimoto@xxxxxxxxxxxxxxxxxxxx>, - * further tested and cleaned up by Zach Brown <zab@xxxxxxxxxx> - * and Ingo Molnar <mingo@xxxxxxxxxx> - * - * Fixes - * Maciej W. Rozycki : Bits for genuine 82489DX APICs; - * thanks to Eric Gilmore - * and Rolf G. Tews - * for testing these extensively - * Paul Diefenbaugh : Added full ACPI support - */ - -#include <linux/mm.h> -#include <linux/irq.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/delay.h> -#include <linux/sched.h> -#include <linux/config.h> -#include <linux/smp_lock.h> -#include <linux/mc146818rtc.h> -#include <linux/acpi.h> -#include <linux/sysdev.h> - -#include <asm/io.h> -#include <asm/smp.h> -#include <asm/desc.h> -#include <asm/proto.h> -#include <asm/mach_apic.h> - -#define __apicdebuginit __init - -int sis_apic_bug; /* not actually supported, dummy for compile */ - -static DEFINE_SPINLOCK(ioapic_lock); - -/* - * # of IRQ routing registers - */ -int nr_ioapic_registers[MAX_IO_APICS]; - -/* - * Rough estimation of how many shared IRQs there are, can - * be changed anytime. - */ -#define MAX_PLUS_SHARED_IRQS NR_IRQS -#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) - -/* - * This is performance-critical, we want to do it O(1) - * - * the indexing order of this array favors 1:1 mappings - * between pins and IRQs. - */ - -static struct irq_pin_list { - short apic, pin, next; -} irq_2_pin[PIN_MAP_SIZE]; - -int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1}; -#ifdef CONFIG_PCI_MSI -#define vector_to_irq(vector) \ - (platform_legacy_irq(vector) ? 
vector : vector_irq[vector]) -#else -#define vector_to_irq(vector) (vector) -#endif - -#ifdef CONFIG_XEN - -#include <asm-xen/xen-public/xen.h> -#include <asm-xen/xen-public/physdev.h> - -/* Fake i8259 */ -#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq))) -#define disable_8259A_irq(_irq) ((void)0) -#define i8259A_irq_pending(_irq) (0) - -unsigned long io_apic_irqs; - -static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg) -{ - physdev_op_t op; - int ret; - - op.cmd = PHYSDEVOP_APIC_READ; - op.u.apic_op.apic = mp_ioapics[apic].mpc_apicid; - op.u.apic_op.offset = reg; - ret = HYPERVISOR_physdev_op(&op); - if (ret) - return ret; - return op.u.apic_op.value; -} - -static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) -{ - physdev_op_t op; - - op.cmd = PHYSDEVOP_APIC_WRITE; - op.u.apic_op.apic = mp_ioapics[apic].mpc_apicid; - op.u.apic_op.offset = reg; - op.u.apic_op.value = value; - HYPERVISOR_physdev_op(&op); -} - -#define io_apic_read(a,r) xen_io_apic_read(a,r) -#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v) - -#define clear_IO_APIC() ((void)0) - -#endif /* !CONFIG_XEN */ - -/* - * The common case is 1:1 IRQ<->pin mappings. Sometimes there are - * shared ISA-space IRQs, so we have to support them. We are super - * fast in the common case, and fast for shared ISA-space IRQs. - */ -static void add_pin_to_irq(unsigned int irq, int apic, int pin) -{ - static int first_free_entry = NR_IRQS; - struct irq_pin_list *entry = irq_2_pin + irq; - - while (entry->next) - entry = irq_2_pin + entry->next; - - if (entry->pin != -1) { - entry->next = first_free_entry; - entry = irq_2_pin + entry->next; - if (++first_free_entry >= PIN_MAP_SIZE) - panic("io_apic.c: whoops"); - } - entry->apic = apic; - entry->pin = pin; -} - -#ifndef CONFIG_XEN -#define __DO_ACTION(R, ACTION, FINAL) \ - \ -{ \ - int pin; \ - struct irq_pin_list *entry = irq_2_pin + irq; \ - \ - for (;;) { \ - unsigned int reg; \ - pin = entry->pin; \ - if (pin == -1) \ - break; \ - reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ - reg ACTION; \ - io_apic_modify(entry->apic, reg); \ - if (!entry->next) \ - break; \ - entry = irq_2_pin + entry->next; \ - } \ - FINAL; \ -} - -#define DO_ACTION(name,R,ACTION, FINAL) \ - \ - static void name##_IO_APIC_irq (unsigned int irq) \ - __DO_ACTION(R, ACTION, FINAL) - -DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) - /* mask = 1 */ -DO_ACTION( __unmask, 0, &= 0xfffeffff, ) - /* mask = 0 */ - -static void mask_IO_APIC_irq (unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - __mask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void unmask_IO_APIC_irq (unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - __unmask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) -{ - struct IO_APIC_route_entry entry; - unsigned long flags; - - /* Check delivery_mode to be sure we're not clearing an SMI pin */ - spin_lock_irqsave(&ioapic_lock, flags); - *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin); - *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); - if (entry.delivery_mode == dest_SMI) - return; - /* - * Disable it in the IO-APIC irq-routing table: - */ - memset(&entry, 0, sizeof(entry)); - entry.mask = 1; - spin_lock_irqsave(&ioapic_lock, flags); - 
io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0)); - io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1)); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void clear_IO_APIC (void) -{ - int apic, pin; - - for (apic = 0; apic < nr_ioapics; apic++) - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) - clear_IO_APIC_pin(apic, pin); -} - -#endif /* !CONFIG_XEN */ - -/* - * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to - * specific CPU-side IRQs. - */ - -#define MAX_PIRQS 8 -int pirq_entries [MAX_PIRQS]; -int pirqs_enabled; -int skip_ioapic_setup; -int ioapic_force; - -/* dummy parsing: see setup.c */ - -static int __init disable_ioapic_setup(char *str) -{ - skip_ioapic_setup = 1; - return 1; -} - -static int __init enable_ioapic_setup(char *str) -{ - ioapic_force = 1; - skip_ioapic_setup = 0; - return 1; -} - - -__setup("noapic", disable_ioapic_setup); -__setup("apic", enable_ioapic_setup); - - -#include <asm/pci-direct.h> -#include <linux/pci_ids.h> -#include <linux/pci.h> - -/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC - off. Check for an Nvidia or VIA PCI bridge and turn it off. - Use pci direct infrastructure because this runs before the PCI subsystem. - - Can be overwritten with "apic" - - And another hack to disable the IOMMU on VIA chipsets. - - Kludge-O-Rama. */ -void __init check_ioapic(void) -{ - int num,slot,func; - if (ioapic_force) - return; - - /* Poor man's PCI discovery */ - for (num = 0; num < 32; num++) { - for (slot = 0; slot < 32; slot++) { - for (func = 0; func < 8; func++) { - u32 class; - u32 vendor; - u8 type; - class = read_pci_config(num,slot,func, - PCI_CLASS_REVISION); - if (class == 0xffffffff) - break; - - if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) - continue; - - vendor = read_pci_config(num, slot, func, - PCI_VENDOR_ID); - vendor &= 0xffff; - switch (vendor) { - case PCI_VENDOR_ID_VIA: -#ifdef CONFIG_GART_IOMMU - if ((end_pfn >= (0xffffffff>>PAGE_SHIFT) || - force_iommu) && - !iommu_aperture_allowed) { - printk(KERN_INFO - "Looks like a VIA chipset. Disabling IOMMU. Overwrite with \"iommu=allowed\"\n"); - iommu_aperture_disabled = 1; - } -#endif - return; - case PCI_VENDOR_ID_NVIDIA: -#ifdef CONFIG_ACPI - /* All timer overrides on Nvidia - seem to be wrong. Skip them. */ - acpi_skip_timer_override = 1; - printk(KERN_INFO - "Nvidia board detected. Ignoring ACPI timer override.\n"); -#endif - /* RED-PEN skip them on mptables too? */ - return; - } - - /* No multi-function device? */ - type = read_pci_config_byte(num,slot,func, - PCI_HEADER_TYPE); - if (!(type & 0x80)) - break; - } - } - } -} - -static int __init ioapic_pirq_setup(char *str) -{ - int i, max; - int ints[MAX_PIRQS+1]; - - get_options(str, ARRAY_SIZE(ints), ints); - - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; - - pirqs_enabled = 1; - apic_printk(APIC_VERBOSE, "PIRQ redirection, working around broken MP-BIOS.\n"); - max = MAX_PIRQS; - if (ints[0] < MAX_PIRQS) - max = ints[0]; - - for (i = 0; i < max; i++) { - apic_printk(APIC_VERBOSE, "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); - /* - * PIRQs are mapped upside down, usually. - */ - pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; - } - return 1; -} - -__setup("pirq=", ioapic_pirq_setup); - -/* - * Find the IRQ entry number of a certain pin. 
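check_ioapic() above does its "poor man's PCI discovery" through read_pci_config(), which is the classic configuration mechanism #1: write an address word to port 0xCF8, read the data dword from 0xCFC. Real port I/O needs ring-0 or iopl(), so this hedged sketch only shows the address arithmetic; the helper name and layout are assumptions for illustration.

#include <stdint.h>
#include <stdio.h>

/* Config mechanism #1 address word: enable bit 31, bus in 23..16,
 * device in 15..11, function in 10..8, dword-aligned register in 7..2. */
static uint32_t pci_cfg_addr(unsigned bus, unsigned dev,
                             unsigned fn, unsigned reg)
{
    return 0x80000000u | (bus << 16) | (dev << 11) | (fn << 8)
           | (reg & 0xfc);
}

int main(void)
{
    /* PCI_CLASS_REVISION lives at config offset 0x08. */
    printf("0x%08x\n", pci_cfg_addr(0, 3, 0, 0x08)); /* 0x80001808 */
    return 0;
}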
- */ -static int find_irq_entry(int apic, int pin, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mpc_irqtype == type && - (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || - mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && - mp_irqs[i].mpc_dstirq == pin) - return i; - - return -1; -} - -#ifndef CONFIG_XEN -/* - * Find the pin to which IRQ[irq] (ISA) is connected - */ -static int __init find_isa_irq_pin(int irq, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mpc_srcbus; - - if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || - mp_bus_id_to_type[lbus] == MP_BUS_EISA || - mp_bus_id_to_type[lbus] == MP_BUS_MCA) && - (mp_irqs[i].mpc_irqtype == type) && - (mp_irqs[i].mpc_srcbusirq == irq)) - - return mp_irqs[i].mpc_dstirq; - } - return -1; -} -#endif - -/* - * Find a specific PCI IRQ entry. - * Not an __init, possibly needed by modules - */ -static int pin_2_irq(int idx, int apic, int pin); - -int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) -{ - int apic, i, best_guess = -1; - - apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", - bus, slot, pin); - if (mp_bus_id_to_pci_bus[bus] == -1) { - apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); - return -1; - } - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mpc_srcbus; - - for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || - mp_irqs[i].mpc_dstapic == MP_APIC_ALL) - break; - - if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) && - !mp_irqs[i].mpc_irqtype && - (bus == lbus) && - (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); - - if (!(apic || IO_APIC_IRQ(irq))) - continue; - - if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) - return irq; - /* - * Use the first all-but-pin matching entry as a - * best-guess fuzzy result for broken mptables. - */ - if (best_guess < 0) - best_guess = irq; - } - } - return best_guess; -} - -/* - * EISA Edge/Level control register, ELCR - */ -static int EISA_ELCR(unsigned int irq) -{ - if (irq < 16) { - unsigned int port = 0x4d0 + (irq >> 3); - return (inb(port) >> (irq & 7)) & 1; - } - apic_printk(APIC_VERBOSE, "Broken MPtable reports ISA irq %d\n", irq); - return 0; -} - -/* EISA interrupts are always polarity zero and can be edge or level - * trigger depending on the ELCR value. If an interrupt is listed as - * EISA conforming in the MP table, that means its trigger type must - * be read in from the ELCR */ - -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) -#define default_EISA_polarity(idx) (0) - -/* ISA interrupts are always polarity zero edge triggered, - * when listed as conforming in the MP table. */ - -#define default_ISA_trigger(idx) (0) -#define default_ISA_polarity(idx) (0) - -/* PCI interrupts are always polarity one level triggered, - * when listed as conforming in the MP table. */ - -#define default_PCI_trigger(idx) (1) -#define default_PCI_polarity(idx) (1) - -/* MCA interrupts are always polarity zero level triggered, - * when listed as conforming in the MP table. */ - -#define default_MCA_trigger(idx) (1) -#define default_MCA_polarity(idx) (0) - -static int __init MPBIOS_polarity(int idx) -{ - int bus = mp_irqs[idx].mpc_srcbus; - int polarity; - - /* - * Determine IRQ line polarity (high active or low active): - */ - switch (mp_irqs[idx].mpc_irqflag & 3) - { - case 0: /* conforms, ie. 
bus-type dependent polarity */ - { - switch (mp_bus_id_to_type[bus]) - { - case MP_BUS_ISA: /* ISA pin */ - { - polarity = default_ISA_polarity(idx); - break; - } - case MP_BUS_EISA: /* EISA pin */ - { - polarity = default_EISA_polarity(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - polarity = default_PCI_polarity(idx); - break; - } - case MP_BUS_MCA: /* MCA pin */ - { - polarity = default_MCA_polarity(idx); - break; - } - default: - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - } - break; - } - case 1: /* high active */ - { - polarity = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - case 3: /* low active */ - { - polarity = 1; - break; - } - default: /* invalid */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - } - return polarity; -} - -static int MPBIOS_trigger(int idx) -{ - int bus = mp_irqs[idx].mpc_srcbus; - int trigger; - - /* - * Determine IRQ trigger mode (edge or level sensitive): - */ - switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) - { - case 0: /* conforms, ie. bus-type dependent */ - { - switch (mp_bus_id_to_type[bus]) - { - case MP_BUS_ISA: /* ISA pin */ - { - trigger = default_ISA_trigger(idx); - break; - } - case MP_BUS_EISA: /* EISA pin */ - { - trigger = default_EISA_trigger(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - trigger = default_PCI_trigger(idx); - break; - } - case MP_BUS_MCA: /* MCA pin */ - { - trigger = default_MCA_trigger(idx); - break; - } - default: - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - } - break; - } - case 1: /* edge */ - { - trigger = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - case 3: /* level */ - { - trigger = 1; - break; - } - default: /* invalid */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 0; - break; - } - } - return trigger; -} - -static inline int irq_polarity(int idx) -{ - return MPBIOS_polarity(idx); -} - -static inline int irq_trigger(int idx) -{ - return MPBIOS_trigger(idx); -} - -static int pin_2_irq(int idx, int apic, int pin) -{ - int irq, i; - int bus = mp_irqs[idx].mpc_srcbus; - - /* - * Debugging check, we are in big trouble if this message pops up! - */ - if (mp_irqs[idx].mpc_dstirq != pin) - printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); - - switch (mp_bus_id_to_type[bus]) - { - case MP_BUS_ISA: /* ISA pin */ - case MP_BUS_EISA: - case MP_BUS_MCA: - { - irq = mp_irqs[idx].mpc_srcbusirq; - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - /* - * PCI IRQs are mapped in order - */ - i = irq = 0; - while (i < apic) - irq += nr_ioapic_registers[i++]; - irq += pin; - break; - } - default: - { - printk(KERN_ERR "unknown bus type %d.\n",bus); - irq = 0; - break; - } - } - - /* - * PCI IRQ command line redirection. Yes, limits are hardcoded. 
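For PCI pins, pin_2_irq() above turns an (apic, pin) pair into a global IRQ number by summing the pin counts of all earlier IO-APICs, so IRQ numbers are allocated contiguously in discovery order. A hedged reimplementation of just that arithmetic:

#include <stdio.h>

/* Sketch of pin_2_irq()'s MP_BUS_PCI branch: skip over the pins of
 * every IO-APIC before this one, then add the local pin index. */
static int pci_pin_to_irq(const int *nr_registers, int apic, int pin)
{
    int i, irq = 0;
    for (i = 0; i < apic; i++)
        irq += nr_registers[i];
    return irq + pin;
}

int main(void)
{
    int nr_registers[] = { 24, 24 };  /* two 24-pin IO-APICs */
    printf("%d\n", pci_pin_to_irq(nr_registers, 1, 3)); /* 27 */
    return 0;
}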
- */ - if ((pin >= 16) && (pin <= 23)) { - if (pirq_entries[pin-16] != -1) { - if (!pirq_entries[pin-16]) { - apic_printk(APIC_VERBOSE, "disabling PIRQ%d\n", pin-16); - } else { - irq = pirq_entries[pin-16]; - apic_printk(APIC_VERBOSE, "using PIRQ%d -> IRQ %d\n", - pin-16, irq); - } - } - } - return irq; -} - -static inline int IO_APIC_irq_trigger(int irq) -{ - int apic, idx, pin; - - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - idx = find_irq_entry(apic,pin,mp_INT); - if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin))) - return irq_trigger(idx); - } - } - /* - * nonexistent IRQs are edge default - */ - return 0; -} - -/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ -u8 irq_vector[NR_IRQ_VECTORS]; - -int assign_irq_vector(int irq) -{ - static int current_vector = FIRST_DEVICE_VECTOR; - physdev_op_t op; - - BUG_ON(irq >= NR_IRQ_VECTORS); - if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) - return IO_APIC_VECTOR(irq); - - op.cmd = PHYSDEVOP_ASSIGN_VECTOR; - op.u.irq_op.irq = irq; - if (HYPERVISOR_physdev_op(&op)) - return -ENOSPC; - current_vector = op.u.irq_op.vector; - - vector_irq[current_vector] = irq; - if (irq != AUTO_ASSIGN) - IO_APIC_VECTOR(irq) = current_vector; - - return current_vector; -} - -extern void (*interrupt[NR_IRQS])(void); -#ifndef CONFIG_XEN -static struct hw_interrupt_type ioapic_level_type; -static struct hw_interrupt_type ioapic_edge_type; - -#define IOAPIC_AUTO -1 -#define IOAPIC_EDGE 0 -#define IOAPIC_LEVEL 1 - -static inline void ioapic_register_intr(int irq, int vector, unsigned long trigger) -{ - if (use_pci_vector() && !platform_legacy_irq(irq)) { - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - irq_desc[vector].handler = &ioapic_level_type; - else - irq_desc[vector].handler = &ioapic_edge_type; - set_intr_gate(vector, interrupt[vector]); - } else { - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - irq_desc[irq].handler = &ioapic_level_type; - else - irq_desc[irq].handler = &ioapic_edge_type; - set_intr_gate(vector, interrupt[irq]); - } -} -#else -#define ioapic_register_intr(_irq,_vector,_trigger) ((void)0) -#endif /* !CONFIG_XEN */ - -void __init setup_IO_APIC_irqs(void) -{ - struct IO_APIC_route_entry entry; - int apic, pin, idx, irq, first_notcon = 1, vector; - unsigned long flags; - - apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); - - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - - /* - * add it to the IO-APIC irq-routing table: - */ - memset(&entry,0,sizeof(entry)); - - entry.delivery_mode = INT_DELIVERY_MODE; - entry.dest_mode = INT_DEST_MODE; - entry.mask = 0; /* enable IRQ */ - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); - - idx = find_irq_entry(apic,pin,mp_INT); - if (idx == -1) { - if (first_notcon) { - apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mpc_apicid, pin); - first_notcon = 0; - } else - apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mpc_apicid, pin); - continue; - } - - entry.trigger = irq_trigger(idx); - entry.polarity = irq_polarity(idx); - - if (irq_trigger(idx)) { - entry.trigger = 1; - entry.mask = 1; - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); - } - - irq = pin_2_irq(idx, apic, pin); - add_pin_to_irq(irq, apic, pin); - - if (/* !apic && */ !IO_APIC_IRQ(irq)) - continue; - - if (IO_APIC_IRQ(irq)) { - vector = 
assign_irq_vector(irq); - entry.vector = vector; - - ioapic_register_intr(irq, vector, IOAPIC_AUTO); - if (!apic && (irq < 16)) - disable_8259A_irq(irq); - } - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0)); - spin_unlock_irqrestore(&ioapic_lock, flags); - } - } - - if (!first_notcon) - apic_printk(APIC_VERBOSE," not connected.\n"); -} - -#ifndef CONFIG_XEN -/* - * Set up the 8259A-master output pin as broadcast to all - * CPUs. - */ -void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector) -{ - struct IO_APIC_route_entry entry; - unsigned long flags; - - memset(&entry,0,sizeof(entry)); - - disable_8259A_irq(0); - - /* mask LVT0 */ - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); - - /* - * We use logical delivery to get the timer IRQ - * to the first CPU. - */ - entry.dest_mode = INT_DEST_MODE; - entry.mask = 0; /* unmask IRQ now */ - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); - entry.delivery_mode = INT_DELIVERY_MODE; - entry.polarity = 0; - entry.trigger = 0; - entry.vector = vector; - - /* - * The timer IRQ doesn't have to know that behind the - * scene we have a 8259A-master in AEOI mode ... - */ - irq_desc[0].handler = &ioapic_edge_type; - - /* - * Add it to the IO-APIC irq-routing table: - */ - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0)); - spin_unlock_irqrestore(&ioapic_lock, flags); - - enable_8259A_irq(0); -} - -void __init UNEXPECTED_IO_APIC(void) -{ -} - -void __apicdebuginit print_IO_APIC(void) -{ - int apic, i; - union IO_APIC_reg_00 reg_00; - union IO_APIC_reg_01 reg_01; - union IO_APIC_reg_02 reg_02; - unsigned long flags; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); - for (i = 0; i < nr_ioapics; i++) - printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", - mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); - - /* - * We are a bit conservative about what we expect. We have to - * know about every hardware change ASAP. - */ - printk(KERN_INFO "testing the IO APIC.......................\n"); - - for (apic = 0; apic < nr_ioapics; apic++) { - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - reg_01.raw = io_apic_read(apic, 1); - if (reg_01.bits.version >= 0x10) - reg_02.raw = io_apic_read(apic, 2); - spin_unlock_irqrestore(&ioapic_lock, flags); - - printk("\n"); - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); - printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); - printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); - if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) - UNEXPECTED_IO_APIC(); - - printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); - printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); - if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ - (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ - (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ - (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ - (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ - (reg_01.bits.entries != 0x2E) && - (reg_01.bits.entries != 0x3F) && - (reg_01.bits.entries != 0x03) - ) - UNEXPECTED_IO_APIC(); - - printk(KERN_DEBUG "....... 
: PRQ implemented: %X\n", reg_01.bits.PRQ); - printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); - if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ - (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */ - (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ - (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ - (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ - (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ - ) - UNEXPECTED_IO_APIC(); - if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) - UNEXPECTED_IO_APIC(); - - if (reg_01.bits.version >= 0x10) { - printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); - printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); - if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) - UNEXPECTED_IO_APIC(); - } - - printk(KERN_DEBUG ".... IRQ redirection table:\n"); - - printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" - " Stat Dest Deli Vect: \n"); - - for (i = 0; i <= reg_01.bits.entries; i++) { - struct IO_APIC_route_entry entry; - - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); - *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); - spin_unlock_irqrestore(&ioapic_lock, flags); - - printk(KERN_DEBUG " %02x %03X %02X ", - i, - entry.dest.logical.logical_dest, - entry.dest.physical.physical_dest - ); - - printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", - entry.mask, - entry.trigger, - entry.irr, - entry.polarity, - entry.delivery_status, - entry.dest_mode, - entry.delivery_mode, - entry.vector - ); - } - } - if (use_pci_vector()) - printk(KERN_INFO "Using vector-based indexing\n"); - printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for (i = 0; i < NR_IRQS; i++) { - struct irq_pin_list *entry = irq_2_pin + i; - if (entry->pin < 0) - continue; - if (use_pci_vector() && !platform_legacy_irq(i)) - printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i)); - else - printk(KERN_DEBUG "IRQ%d ", i); - for (;;) { - printk("-> %d:%d", entry->apic, entry->pin); - if (!entry->next) - break; - entry = irq_2_pin + entry->next; - } - printk("\n"); - } - - printk(KERN_INFO ".................................... done.\n"); - - return; -} - -static __apicdebuginit void print_APIC_bitfield (int base) -{ - unsigned int v; - int i, j; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); - for (i = 0; i < 8; i++) { - v = apic_read(base + i*0x10); - for (j = 0; j < 32; j++) { - if (v & (1<<j)) - printk("1"); - else - printk("0"); - } - printk("\n"); - } -} - -void __apicdebuginit print_local_APIC(void * dummy) -{ - unsigned int v, ver, maxlvt; - - if (apic_verbosity == APIC_QUIET) - return; - - printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", - smp_processor_id(), hard_smp_processor_id()); - v = apic_read(APIC_ID); - printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v)); - v = apic_read(APIC_LVR); - printk(KERN_INFO "... APIC VERSION: %08x\n", v); - ver = GET_APIC_VERSION(v); - maxlvt = get_maxlvt(); - - v = apic_read(APIC_TASKPRI); - printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); - - if (APIC_INTEGRATED(ver)) { /* !82489DX */ - v = apic_read(APIC_ARBPRI); - printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, - v & APIC_ARBPRI_MASK); - v = apic_read(APIC_PROCPRI); - printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); - } - - v = apic_read(APIC_EOI); - printk(KERN_DEBUG "... 
APIC EOI: %08x\n", v); - v = apic_read(APIC_RRR); - printk(KERN_DEBUG "... APIC RRR: %08x\n", v); - v = apic_read(APIC_LDR); - printk(KERN_DEBUG "... APIC LDR: %08x\n", v); - v = apic_read(APIC_DFR); - printk(KERN_DEBUG "... APIC DFR: %08x\n", v); - v = apic_read(APIC_SPIV); - printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); - - printk(KERN_DEBUG "... APIC ISR field:\n"); - print_APIC_bitfield(APIC_ISR); - printk(KERN_DEBUG "... APIC TMR field:\n"); - print_APIC_bitfield(APIC_TMR); - printk(KERN_DEBUG "... APIC IRR field:\n"); - print_APIC_bitfield(APIC_IRR); - - if (APIC_INTEGRATED(ver)) { /* !82489DX */ - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - v = apic_read(APIC_ESR); - printk(KERN_DEBUG "... APIC ESR: %08x\n", v); - } - - v = apic_read(APIC_ICR); - printk(KERN_DEBUG "... APIC ICR: %08x\n", v); - v = apic_read(APIC_ICR2); - printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); - - v = apic_read(APIC_LVTT); - printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); - - if (maxlvt > 3) { /* PC is LVT#4. */ - v = apic_read(APIC_LVTPC); - printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); - } - v = apic_read(APIC_LVT0); - printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); - v = apic_read(APIC_LVT1); - printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); - - if (maxlvt > 2) { /* ERR is LVT#3. */ - v = apic_read(APIC_LVTERR); - printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); - } - - v = apic_read(APIC_TMICT); - printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); - v = apic_read(APIC_TMCCT); - printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); - v = apic_read(APIC_TDCR); - printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); - printk("\n"); -} - -void print_all_local_APICs (void) -{ - on_each_cpu(print_local_APIC, NULL, 1, 1); -} - -void __apicdebuginit print_PIC(void) -{ - extern spinlock_t i8259A_lock; - unsigned int v; - unsigned long flags; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "\nprinting PIC contents\n"); - - spin_lock_irqsave(&i8259A_lock, flags); - - v = inb(0xa1) << 8 | inb(0x21); - printk(KERN_DEBUG "... PIC IMR: %04x\n", v); - - v = inb(0xa0) << 8 | inb(0x20); - printk(KERN_DEBUG "... PIC IRR: %04x\n", v); - - outb(0x0b,0xa0); - outb(0x0b,0x20); - v = inb(0xa0) << 8 | inb(0x20); - outb(0x0a,0xa0); - outb(0x0a,0x20); - - spin_unlock_irqrestore(&i8259A_lock, flags); - - printk(KERN_DEBUG "... PIC ISR: %04x\n", v); - - v = inb(0x4d1) << 8 | inb(0x4d0); - printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); -} -#else -void __init print_IO_APIC(void) { } -#endif /* !CONFIG_XEN */ - -static void __init enable_IO_APIC(void) -{ - union IO_APIC_reg_01 reg_01; - int i; - unsigned long flags; - - for (i = 0; i < PIN_MAP_SIZE; i++) { - irq_2_pin[i].pin = -1; - irq_2_pin[i].next = 0; - } - if (!pirqs_enabled) - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; - - /* - * The number of IO-APIC IRQ registers (== #pins): - */ - for (i = 0; i < nr_ioapics; i++) { - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(i, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - nr_ioapic_registers[i] = reg_01.bits.entries+1; - } - - /* - * Do not trust the IO-APIC being empty at bootup - */ - clear_IO_APIC(); -} - -/* - * Not an __init, needed by the reboot code - */ -void disable_IO_APIC(void) -{ - /* - * Clear the IO-APIC before rebooting: - */ - clear_IO_APIC(); -#ifndef CONFIG_XEN - disconnect_bsp_APIC(); -#endif -} - -/* - * function to set the IO-APIC physical IDs based on the - * values stored in the MPC table. 
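enable_IO_APIC() above sizes each routing table by reading IO-APIC register 1 and adding one to its "entries" field, which architecturally reports the index of the last redirection entry. A hedged decode of that register (the bit positions are the documented IO-APIC layout, written out here rather than copied from the kernel's IO_APIC_reg_01 union):

#include <stdint.h>
#include <stdio.h>

/* IO-APIC register 1: version in bits 0..7, index of the last
 * redirection entry in bits 16..23, so pin count = entries + 1. */
static unsigned ioapic_nr_pins(uint32_t reg_01_raw)
{
    unsigned entries = (reg_01_raw >> 16) & 0xff;
    return entries + 1;
}

int main(void)
{
    printf("%u pins\n", ioapic_nr_pins(0x00170011)); /* 0x17 -> 24 */
    return 0;
}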
- *
- * by Matt Domsch <Matt_Domsch@xxxxxxxx>  Tue Dec 21 12:25:05 CST 1999
- */
-#ifndef CONFIG_XEN
-static void __init setup_ioapic_ids_from_mpc (void)
-{
-        union IO_APIC_reg_00 reg_00;
-        int apic;
-        int i;
-        unsigned char old_id;
-        unsigned long flags;
-
-        /*
-         * Set the IOAPIC ID to the value stored in the MPC table.
-         */
-        for (apic = 0; apic < nr_ioapics; apic++) {
-
-                /* Read the register 0 value */
-                spin_lock_irqsave(&ioapic_lock, flags);
-                reg_00.raw = io_apic_read(apic, 0);
-                spin_unlock_irqrestore(&ioapic_lock, flags);
-
-                old_id = mp_ioapics[apic].mpc_apicid;
-
-
-                printk(KERN_INFO "Using IO-APIC %d\n", mp_ioapics[apic].mpc_apicid);
-
-
-                /*
-                 * We need to adjust the IRQ routing table
-                 * if the ID changed.
-                 */
-                if (old_id != mp_ioapics[apic].mpc_apicid)
-                        for (i = 0; i < mp_irq_entries; i++)
-                                if (mp_irqs[i].mpc_dstapic == old_id)
-                                        mp_irqs[i].mpc_dstapic
-                                                = mp_ioapics[apic].mpc_apicid;
-
-                /*
-                 * Read the right value from the MPC table and
-                 * write it into the ID register.
-                 */
-                apic_printk(APIC_VERBOSE,KERN_INFO "...changing IO-APIC physical APIC ID to %d ...",
-                                mp_ioapics[apic].mpc_apicid);
-
-                reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
-                spin_lock_irqsave(&ioapic_lock, flags);
-                io_apic_write(apic, 0, reg_00.raw);
-                spin_unlock_irqrestore(&ioapic_lock, flags);
-
-                /*
-                 * Sanity check
-                 */
-                spin_lock_irqsave(&ioapic_lock, flags);
-                reg_00.raw = io_apic_read(apic, 0);
-                spin_unlock_irqrestore(&ioapic_lock, flags);
-                if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
-                        printk("could not set ID!\n");
-                else
-                        apic_printk(APIC_VERBOSE," ok.\n");
-        }
-}
-#else
-static void __init setup_ioapic_ids_from_mpc(void) { }
-#endif
-
-/*
- * There is a nasty bug in some older SMP boards: their mptable lies
- * about the timer IRQ. We do the following to work around the situation:
- *
- *      - timer IRQ defaults to IO-APIC IRQ
- *      - if this function detects that timer IRQs are defunct, then we fall
- *        back to ISA timer IRQs
- */
-#ifndef CONFIG_XEN
-static int __init timer_irq_works(void)
-{
-        unsigned long t1 = jiffies;
-
-        local_irq_enable();
-        /* Let ten ticks pass... */
-        mdelay((10 * 1000) / HZ);
-
-        /*
-         * Expect a few ticks at least, to be sure some possible
-         * glue logic does not lock up after one or two first
-         * ticks in a non-ExtINT mode. Also the local APIC
-         * might have cached one ExtINT interrupt. Finally, at
-         * least one tick may be lost due to delays.
-         */
-
-        /* jiffies wrap? */
-        if (jiffies - t1 > 4)
-                return 1;
-        return 0;
-}
-
-/*
- * In the SMP+IOAPIC case it might happen that there are an unspecified
- * number of pending IRQ events unhandled. These cases are very rare,
- * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
- * better to do it this way since then we do not have to be aware of
- * 'pending' interrupts in the IRQ path, except at this point.
- */
-/*
- * Edge triggered needs to resend any interrupt
- * that was delayed but this is now handled in the device
- * independent code.
- */
-
-/*
- * Starting up an edge-triggered IO-APIC interrupt is
- * nasty - we need to make sure that we get the edge.
- * If it is already asserted for some reason, we need to
- * return 1 to indicate that it was pending.
- *
- * This is not complete - we should be able to fake
- * an edge even if it isn't on the 8259A...
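timer_irq_works() above just enables interrupts, waits roughly ten tick periods, and demands that jiffies advanced by more than four before trusting the timer wiring. A hedged analogue of that check, with the timing replaced by canned counter values:

#include <stdio.h>

/* Analogue of timer_irq_works(): after ~10 tick periods, accept the
 * wiring only if at least 5 ticks were observed (some may be lost to
 * glue logic or a cached ExtINT). Unsigned subtraction keeps the test
 * correct even across a jiffies wrap. */
static int timer_works(unsigned long t1, unsigned long jiffies_now)
{
    return (jiffies_now - t1) > 4;
}

int main(void)
{
    printf("%d\n", timer_works(1000, 1010)); /* healthy: 10 ticks */
    printf("%d\n", timer_works(1000, 1001)); /* defunct: 1 tick   */
    return 0;
}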
- */ - -static unsigned int startup_edge_ioapic_irq(unsigned int irq) -{ - int was_pending = 0; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - if (irq < 16) { - disable_8259A_irq(irq); - if (i8259A_irq_pending(irq)) - was_pending = 1; - } - __unmask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return was_pending; -} - -/* - * Once we have recorded IRQ_PENDING already, we can mask the - * interrupt for real. This prevents IRQ storms from unhandled - * devices. - */ -static void ack_edge_ioapic_irq(unsigned int irq) -{ - if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED)) - == (IRQ_PENDING | IRQ_DISABLED)) - mask_IO_APIC_irq(irq); - ack_APIC_irq(); -} - -/* - * Level triggered interrupts can just be masked, - * and shutting down and starting up the interrupt - * is the same as enabling and disabling them -- except - * with a startup need to return a "was pending" value. - * - * Level triggered interrupts are special because we - * do not touch any IO-APIC register while handling - * them. We ack the APIC in the end-IRQ handler, not - * in the start-IRQ-handler. Protection against reentrance - * from the same interrupt is still provided, both by the - * generic IRQ layer and by the fact that an unacked local - * APIC does not accept IRQs. - */ -static unsigned int startup_level_ioapic_irq (unsigned int irq) -{ - unmask_IO_APIC_irq(irq); - - return 0; /* don't check for pending */ -} - -static void end_level_ioapic_irq (unsigned int irq) -{ - ack_APIC_irq(); -} - -static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) -{ - unsigned long flags; - unsigned int dest; - - dest = cpu_mask_to_apicid(mask); - - /* - * Only the high 8 bits are valid. - */ - dest = SET_APIC_LOGICAL_ID(dest); - - spin_lock_irqsave(&ioapic_lock, flags); - __DO_ACTION(1, = dest, ) - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -#ifdef CONFIG_PCI_MSI -static unsigned int startup_edge_ioapic_vector(unsigned int vector) -{ - int irq = vector_to_irq(vector); - - return startup_edge_ioapic_irq(irq); -} - -static void ack_edge_ioapic_vector(unsigned int vector) -{ - int irq = vector_to_irq(vector); - - ack_edge_ioapic_irq(irq); -} - -static unsigned int startup_level_ioapic_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - return startup_level_ioapic_irq (irq); -} - -static void end_level_ioapic_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - end_level_ioapic_irq(irq); -} - -static void mask_IO_APIC_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - mask_IO_APIC_irq(irq); -} - -static void unmask_IO_APIC_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - unmask_IO_APIC_irq(irq); -} - -static void set_ioapic_affinity_vector (unsigned int vector, - cpumask_t cpu_mask) -{ - int irq = vector_to_irq(vector); - - set_ioapic_affinity_irq(irq, cpu_mask); -} -#endif - -/* - * Level and edge triggered IO-APIC interrupts need different handling, - * so we use two separate IRQ descriptors. Edge triggered IRQs can be - * handled with the level-triggered descriptor, but that one has slightly - * more overhead. Level-triggered interrupts cannot be handled with the - * edge-triggered handler, without risking IRQ storms and other ugly - * races. 
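The two descriptors that follow are instances of the hw_interrupt_type pattern: one struct of function pointers per triggering mode, chosen once at setup so the generic IRQ code never branches on edge versus level. A hedged, self-contained miniature of the pattern (names and behavior are illustrative, not the kernel's):

#include <stdio.h>

/* Per-mode controller ops bundled in a vtable, mirroring how
 * ioapic_edge_type / ioapic_level_type are selected below. */
struct irq_ops {
    const char *typename;
    void (*ack)(unsigned irq);
    void (*end)(unsigned irq);
};

static void ack_edge(unsigned irq)  { printf("edge ack %u\n", irq); }
static void end_level(unsigned irq) { printf("level eoi %u\n", irq); }
static void nop(unsigned irq)       { (void)irq; }

static const struct irq_ops edge_ops  = { "edge",  ack_edge, nop };
static const struct irq_ops level_ops = { "level", nop, end_level };

int main(void)
{
    const struct irq_ops *ops = &level_ops; /* chosen at setup time */
    ops->ack(9);
    ops->end(9);   /* level: APIC is acked at end-of-interrupt */
    (void)edge_ops;
    return 0;
}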
- */ - -static struct hw_interrupt_type ioapic_edge_type = { - .typename = "IO-APIC-edge", - .startup = startup_edge_ioapic, - .shutdown = shutdown_edge_ioapic, - .enable = enable_edge_ioapic, - .disable = disable_edge_ioapic, - .ack = ack_edge_ioapic, - .end = end_edge_ioapic, - .set_affinity = set_ioapic_affinity, -}; - -static struct hw_interrupt_type ioapic_level_type = { - .typename = "IO-APIC-level", - .startup = startup_level_ioapic, - .shutdown = shutdown_level_ioapic, - .enable = enable_level_ioapic, - .disable = disable_level_ioapic, - .ack = mask_and_ack_level_ioapic, - .end = end_level_ioapic, - .set_affinity = set_ioapic_affinity, -}; -#endif /* !CONFIG_XEN */ - -static inline void init_IO_APIC_traps(void) -{ - int irq; - - /* - * NOTE! The local APIC isn't very good at handling - * multiple interrupts at the same interrupt level. - * As the interrupt level is determined by taking the - * vector number and shifting that right by 4, we - * want to spread these out a bit so that they don't - * all fall in the same interrupt level. - * - * Also, we've got to be careful not to trash gate - * 0x80, because int 0x80 is hm, kind of importantish. ;) - */ - for (irq = 0; irq < NR_IRQS ; irq++) { - int tmp = irq; - if (use_pci_vector()) { - if (!platform_legacy_irq(tmp)) - if ((tmp = vector_to_irq(tmp)) == -1) - continue; - } - if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) { - /* - * Hmm.. We don't have an entry for this, - * so default to an old-fashioned 8259 - * interrupt if we can.. - */ - if (irq < 16) - make_8259A_irq(irq); -#ifndef CONFIG_XEN - else - /* Strange. Oh, well.. */ - irq_desc[irq].handler = &no_irq_type; -#endif - } - } -} - -#ifndef CONFIG_XEN -static void enable_lapic_irq (unsigned int irq) -{ - unsigned long v; - - v = apic_read(APIC_LVT0); - apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); -} - -static void disable_lapic_irq (unsigned int irq) -{ - unsigned long v; - - v = apic_read(APIC_LVT0); - apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); -} - -static void ack_lapic_irq (unsigned int irq) -{ - ack_APIC_irq(); -} - -static void end_lapic_irq (unsigned int i) { /* nothing */ } - -static struct hw_interrupt_type lapic_irq_type = { - .typename = "local-APIC-edge", - .startup = NULL, /* startup_irq() not used for IRQ0 */ - .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ - .enable = enable_lapic_irq, - .disable = disable_lapic_irq, - .ack = ack_lapic_irq, - .end = end_lapic_irq, -}; - -static void setup_nmi (void) -{ - /* - * Dirty trick to enable the NMI watchdog ... - * We put the 8259A master into AEOI mode and - * unmask on all local APICs LVT0 as NMI. - * - * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') - * is from Maciej W. Rozycki - so we do not have to EOI from - * the NMI handler or the timer interrupt. - */ - printk(KERN_INFO "activating NMI Watchdog ..."); - - enable_NMI_through_LVT0(NULL); - - printk(" done.\n"); -} - -/* - * This looks a bit hackish but it's about the only one way of sending - * a few INTA cycles to 8259As and any associated glue logic. ICR does - * not support the ExtINT mode, unfortunately. We need to send these - * cycles as some i82489DX-based boards have glue logic that keeps the - * 8259A interrupt line asserted until INTA. 
--macro - */ -static inline void unlock_ExtINT_logic(void) -{ - int pin, i; - struct IO_APIC_route_entry entry0, entry1; - unsigned char save_control, save_freq_select; - unsigned long flags; - - pin = find_isa_irq_pin(8, mp_INT); - if (pin == -1) - return; - - spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry0) + 1) = io_apic_read(0, 0x11 + 2 * pin); - *(((int *)&entry0) + 0) = io_apic_read(0, 0x10 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); - clear_IO_APIC_pin(0, pin); - - memset(&entry1, 0, sizeof(entry1)); - - entry1.dest_mode = 0; /* physical delivery */ - entry1.mask = 0; /* unmask IRQ now */ - entry1.dest.physical.physical_dest = hard_smp_processor_id(); - entry1.delivery_mode = dest_ExtINT; - entry1.polarity = entry0.polarity; - entry1.trigger = 0; - entry1.vector = 0; - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry1) + 1)); - io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry1) + 0)); - spin_unlock_irqrestore(&ioapic_lock, flags); - - save_control = CMOS_READ(RTC_CONTROL); - save_freq_select = CMOS_READ(RTC_FREQ_SELECT); - CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, - RTC_FREQ_SELECT); - CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); - - i = 100; - while (i-- > 0) { - mdelay(10); - if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) - i -= 10; - } - - CMOS_WRITE(save_control, RTC_CONTROL); - CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); - clear_IO_APIC_pin(0, pin); - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry0) + 1)); - io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry0) + 0)); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -/* - * This code may look a bit paranoid, but it's supposed to cooperate with - * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ - * is so screwy. Thanks to Brian Perkins for testing/hacking this beast - * fanatically on his truly buggy board. - */ -static inline void check_timer(void) -{ - int pin1, pin2; - int vector; - - /* - * get/set the timer IRQ vector: - */ - disable_8259A_irq(0); - vector = assign_irq_vector(0); - set_intr_gate(vector, interrupt[0]); - - /* - * Subtle, code in do_timer_interrupt() expects an AEOI - * mode for the 8259A whenever interrupts are routed - * through I/O APICs. Also IRQ0 has to be enabled in - * the 8259A which implies the virtual wire has to be - * disabled in the local APIC. - */ - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); - init_8259A(1); - enable_8259A_irq(0); - - pin1 = find_isa_irq_pin(0, mp_INT); - pin2 = find_isa_irq_pin(0, mp_ExtINT); - - apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X pin1=%d pin2=%d\n", vector, pin1, pin2); - - if (pin1 != -1) { - /* - * Ok, does IRQ0 through the IOAPIC work? - */ - unmask_IO_APIC_irq(0); - if (timer_irq_works()) { - nmi_watchdog_default(); - if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); - setup_nmi(); - enable_8259A_irq(0); - check_nmi_watchdog(); - } - return; - } - clear_IO_APIC_pin(0, pin1); - apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not connected to IO-APIC\n"); - } - - apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... "); - if (pin2 != -1) { - apic_printk(APIC_VERBOSE,"\n..... 
(found pin %d) ...", pin2); - /* - * legacy devices should be connected to IO APIC #0 - */ - setup_ExtINT_IRQ0_pin(pin2, vector); - if (timer_irq_works()) { - printk("works.\n"); - nmi_watchdog_default(); - if (nmi_watchdog == NMI_IO_APIC) { - setup_nmi(); - check_nmi_watchdog(); - } - return; - } - /* - * Cleanup, just in case ... - */ - clear_IO_APIC_pin(0, pin2); - } - printk(" failed.\n"); - - if (nmi_watchdog) { - printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); - nmi_watchdog = 0; - } - - apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); - - disable_8259A_irq(0); - irq_desc[0].handler = &lapic_irq_type; - apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ - enable_8259A_irq(0); - - if (timer_irq_works()) { - apic_printk(APIC_QUIET, " works.\n"); - return; - } - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); - apic_printk(APIC_VERBOSE," failed.\n"); - - apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ..."); - - init_8259A(0); - make_8259A_irq(0); - apic_write_around(APIC_LVT0, APIC_DM_EXTINT); - - unlock_ExtINT_logic(); - - if (timer_irq_works()) { - apic_printk(APIC_VERBOSE," works.\n"); - return; - } - apic_printk(APIC_VERBOSE," failed :(.\n"); - panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); -} -#else -#define check_timer() ((void)0) -#endif /* !CONFIG_XEN */ - -/* - * - * IRQ's that are handled by the PIC in the MPS IOAPIC case. - * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. - * Linux doesn't really care, as it's not actually used - * for any interrupt handling anyway. - */ -#define PIC_IRQS (1<<2) - -void __init setup_IO_APIC(void) -{ - enable_IO_APIC(); - - if (acpi_ioapic) - io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ - else - io_apic_irqs = ~PIC_IRQS; - - apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); - - /* - * Set up the IO-APIC IRQ routing table. 
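
io_apic_irqs in setup_IO_APIC is a plain one-bit-per-IRQ mask, and
~PIC_IRQS leaves exactly one line, the IRQ2 cascade, with the 8259A. A
small sketch of the mask arithmetic:

    #include <stdio.h>

    #define PIC_IRQS (1u << 2)          /* IRQ2: the 8259A cascade */

    int main(void)
    {
            unsigned io_apic_irqs = ~PIC_IRQS;

            for (int irq = 0; irq < 4; irq++)
                    printf("IRQ%d routed via IO-APIC: %s\n", irq,
                           (io_apic_irqs & (1u << irq)) ? "yes" : "no");
            return 0;
    }
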
- */ - if (!acpi_ioapic) - setup_ioapic_ids_from_mpc(); -#ifndef CONFIG_XEN - sync_Arb_IDs(); -#endif /* !CONFIG_XEN */ - setup_IO_APIC_irqs(); - init_IO_APIC_traps(); - check_timer(); - if (!acpi_ioapic) - print_IO_APIC(); -} - -struct sysfs_ioapic_data { - struct sys_device dev; - struct IO_APIC_route_entry entry[0]; -}; -static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; - -static int ioapic_suspend(struct sys_device *dev, u32 state) -{ - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; - unsigned long flags; - int i; - - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; - spin_lock_irqsave(&ioapic_lock, flags); - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { - *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i); - *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i); - } - spin_unlock_irqrestore(&ioapic_lock, flags); - - return 0; -} - -static int ioapic_resume(struct sys_device *dev) -{ - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; - unsigned long flags; - union IO_APIC_reg_00 reg_00; - int i; - - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(dev->id, 0); - if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { - reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; - io_apic_write(dev->id, 0, reg_00.raw); - } - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) { - io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1)); - io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0)); - } - spin_unlock_irqrestore(&ioapic_lock, flags); - - return 0; -} - -static struct sysdev_class ioapic_sysdev_class = { - set_kset_name("ioapic"), - .suspend = ioapic_suspend, - .resume = ioapic_resume, -}; - -static int __init ioapic_init_sysfs(void) -{ - struct sys_device * dev; - int i, size, error = 0; - - error = sysdev_class_register(&ioapic_sysdev_class); - if (error) - return error; - - for (i = 0; i < nr_ioapics; i++ ) { - size = sizeof(struct sys_device) + nr_ioapic_registers[i] - * sizeof(struct IO_APIC_route_entry); - mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); - if (!mp_ioapic_data[i]) { - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - memset(mp_ioapic_data[i], 0, size); - dev = &mp_ioapic_data[i]->dev; - dev->id = i; - dev->cls = &ioapic_sysdev_class; - error = sysdev_register(dev); - if (error) { - kfree(mp_ioapic_data[i]); - mp_ioapic_data[i] = NULL; - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - } - - return 0; -} - -device_initcall(ioapic_init_sysfs); - -/* -------------------------------------------------------------------------- - ACPI-based IOAPIC Configuration - -------------------------------------------------------------------------- */ - -#ifdef CONFIG_ACPI_BOOT - -#define IO_APIC_MAX_ID 0xFE - -int __init io_apic_get_unique_id (int ioapic, int apic_id) -{ -#ifndef CONFIG_XEN - union IO_APIC_reg_00 reg_00; - static physid_mask_t apic_id_map; - unsigned long flags; - int i = 0; - - /* - * The P4 platform supports up to 256 APIC IDs on two separate APIC - * buses (one for LAPICs, one for IOAPICs), where predecessors only - * supports up to 16 on one shared APIC bus. - * - * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full - * advantage of new APIC bus architecture. 
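
ioapic_suspend above saves each 64-bit redirection entry as two 32-bit
reads, the low word from register 0x10 + 2*i and the high word from
0x11 + 2*i, and ioapic_resume writes them back the same way. A host-side
sketch of that split; the union trick assumes a little-endian machine,
which holds on x86:

    #include <stdio.h>
    #include <stdint.h>

    union route_entry {
            uint64_t raw;
            uint32_t half[2];   /* little-endian: half[0] is the low word */
    };

    int main(void)
    {
            union route_entry e = { .raw = 0x000000ff00010941ull };
            int pin = 3;

            printf("reg 0x%02x <- 0x%08x\n",
                   0x10 + 2 * pin, (unsigned)e.half[0]);
            printf("reg 0x%02x <- 0x%08x\n",
                   0x11 + 2 * pin, (unsigned)e.half[1]);
            return 0;
    }
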
- */ - - if (physids_empty(apic_id_map)) - apic_id_map = phys_cpu_present_map; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - if (apic_id >= IO_APIC_MAX_ID) { - apic_printk(APIC_QUIET, KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " - "%d\n", ioapic, apic_id, reg_00.bits.ID); - apic_id = reg_00.bits.ID; - } - - /* - * Every APIC in a system must have a unique ID or we get lots of nice - * 'stuck on smp_invalidate_needed IPI wait' messages. - */ - if (physid_isset(apic_id, apic_id_map)) { - - for (i = 0; i < IO_APIC_MAX_ID; i++) { - if (!physid_isset(i, apic_id_map)) - break; - } - - if (i == IO_APIC_MAX_ID) - panic("Max apic_id exceeded!\n"); - - apic_printk(APIC_VERBOSE, KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " - "trying %d\n", ioapic, apic_id, i); - - apic_id = i; - } - - physid_set(apic_id, apic_id_map); - - if (reg_00.bits.ID != apic_id) { - reg_00.bits.ID = apic_id; - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic, 0, reg_00.raw); - reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - /* Sanity check */ - if (reg_00.bits.ID != apic_id) - panic("IOAPIC[%d]: Unable change apic_id!\n", ioapic); - } - - apic_printk(APIC_VERBOSE,KERN_INFO "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); -#endif /* !CONFIG_XEN */ - - return apic_id; -} - - -int __init io_apic_get_version (int ioapic) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return reg_01.bits.version; -} - - -int __init io_apic_get_redir_entries (int ioapic) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return reg_01.bits.entries; -} - - -int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low) -{ - struct IO_APIC_route_entry entry; - unsigned long flags; - - if (!IO_APIC_IRQ(irq)) { - apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", - ioapic); - return -EINVAL; - } - - /* - * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. - * Note that we mask (disable) IRQs now -- these get enabled when the - * corresponding device driver registers for this IRQ. 
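
The collision handling in io_apic_get_unique_id boils down to "keep the
requested ID if it is free, otherwise hand out the first free one". The
same policy standalone, with a plain array standing in for
physid_mask_t and an invented MAX_ID bound:

    #include <stdio.h>

    #define MAX_ID 16

    static int id_used[MAX_ID];

    /* Returns the assigned ID, or -1 when the map is exhausted. */
    static int get_unique_id(int want)
    {
            if (want < 0 || want >= MAX_ID)
                    return -1;
            if (!id_used[want]) {
                    id_used[want] = 1;
                    return want;
            }
            for (int i = 0; i < MAX_ID; i++) {
                    if (!id_used[i]) {
                            id_used[i] = 1;
                            return i;
                    }
            }
            return -1;
    }

    int main(void)
    {
            printf("%d\n", get_unique_id(2));   /* 2 */
            printf("%d\n", get_unique_id(2));   /* 0: 2 already taken */
            return 0;
    }
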
- */ - - memset(&entry,0,sizeof(entry)); - - entry.delivery_mode = INT_DELIVERY_MODE; - entry.dest_mode = INT_DEST_MODE; - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); - entry.trigger = edge_level; - entry.polarity = active_high_low; - entry.mask = 1; /* Disabled (masked) */ - - /* - * IRQs < 16 are already in the irq_2_pin[] map - */ - if (irq >= 16) - add_pin_to_irq(irq, ioapic, pin); - - entry.vector = assign_irq_vector(irq); - - apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> " - "IRQ %d Mode:%i Active:%i)\n", ioapic, - mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, - edge_level, active_high_low); - - ioapic_register_intr(irq, entry.vector, edge_level); - - if (!ioapic && (irq < 16)) - disable_8259A_irq(irq); - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); - io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return 0; -} - -#endif /*CONFIG_ACPI_BOOT*/ - - -#ifndef CONFIG_XEN -/* - * This function currently is only a helper for the i386 smp boot process where - * we need to reprogram the ioredtbls to cater for the cpus which have come online - * so mask in all cases should simply be TARGET_CPUS - */ -void __init setup_ioapic_dest(void) -{ - int pin, ioapic, irq, irq_entry; - - if (skip_ioapic_setup == 1) - return; - - for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { - for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { - irq_entry = find_irq_entry(ioapic, pin, mp_INT); - if (irq_entry == -1) - continue; - irq = pin_2_irq(irq_entry, ioapic, pin); - set_ioapic_affinity_irq(irq, TARGET_CPUS); - } - - } -} -#endif /* !CONFIG_XEN */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/ioport.c --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/ioport.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,63 +0,0 @@ -/* - * linux/arch/x86_64/kernel/ioport.c - * - * This contains the io-permission bitmap code - written by obz, with changes - * by Linus. - */ - -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/ioport.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/smp_lock.h> -#include <linux/stddef.h> -#include <linux/slab.h> -#include <linux/thread_info.h> -#include <asm-xen/xen-public/physdev.h> - -/* - * sys_iopl has to be used when you want to access the IO ports - * beyond the 0x3ff range: to get the full 65536 ports bitmapped - * you'd need 8kB of bitmaps/process, which is a bit excessive. - * - */ - -// asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) -asmlinkage long sys_iopl(unsigned int new_io_pl) -{ - unsigned int old_io_pl = current->thread.io_pl; - physdev_op_t op; - - - if (new_io_pl > 3) - return -EINVAL; - - /* Need "raw I/O" privileges for direct port access. */ - if ((new_io_pl > old_io_pl) && !capable(CAP_SYS_RAWIO)) - return -EPERM; - - /* Maintain OS privileges even if user attempts to relinquish them. */ - if (new_io_pl == 0) - new_io_pl = 1; - - /* Change our version of the privilege levels. */ - current->thread.io_pl = new_io_pl; - - /* Force the change at ring 0. */ - op.cmd = PHYSDEVOP_SET_IOPL; - op.u.set_iopl.iopl = new_io_pl; - HYPERVISOR_physdev_op(&op); - - return 0; -} - -/* - * this changes the io permissions bitmap in the current task. 
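
From user space the interface implemented by this file is the ordinary
iopl(2)/ioperm(2) pair. A hedged usage sketch for x86 Linux with glibc;
it needs CAP_SYS_RAWIO (typically root), and port 0x80 is used only
because it is the conventional scratch/POST port:

    #include <stdio.h>
    #include <sys/io.h>         /* iopl(), outb() -- x86 Linux only */

    int main(void)
    {
            if (iopl(3)) {              /* request full port access */
                    perror("iopl");     /* EPERM without CAP_SYS_RAWIO */
                    return 1;
            }
            outb(0x00, 0x80);           /* value first, then port */
            iopl(0);                    /* note: this Xen sys_iopl keeps
                                           the domain at iopl 1 anyway */
            return 0;
    }
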
- */ -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) -{ - return turn_on ? sys_iopl(3) : 0; -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/irq.c --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/irq.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,105 +0,0 @@ -/* - * linux/arch/x86_64/kernel/irq.c - * - * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar - * - * This file contains the lowest level x86_64-specific interrupt - * entry and irq statistics code. All the remaining irq logic is - * done by the generic kernel/irq/ code and in the - * x86_64-specific irq controller code. (e.g. i8259.c and - * io_apic.c.) - */ -#include <asm/uaccess.h> -#include <linux/module.h> -#include <linux/seq_file.h> -#include <linux/interrupt.h> -#include <linux/kernel_stat.h> - -/* - * Interrupt statistics: - */ - -atomic_t irq_err_count; - - -/* - * Generic, controller-independent functions: - */ - -int show_interrupts(struct seq_file *p, void *v) -{ - int i = *(loff_t *) v, j; - struct irqaction * action; - unsigned long flags; - - if (i == 0) { - seq_printf(p, " "); - for (j=0; j<NR_CPUS; j++) - if (cpu_online(j)) - seq_printf(p, "CPU%d ",j); - seq_putc(p, '\n'); - } - - if (i < NR_IRQS) { - spin_lock_irqsave(&irq_desc[i].lock, flags); - action = irq_desc[i].action; - if (!action) - goto skip; - seq_printf(p, "%3d: ",i); -#ifndef CONFIG_SMP - seq_printf(p, "%10u ", kstat_irqs(i)); -#else - for (j=0; j<NR_CPUS; j++) - if (cpu_online(j)) - seq_printf(p, "%10u ", - kstat_cpu(j).irqs[i]); -#endif - seq_printf(p, " %14s", irq_desc[i].handler->typename); - - seq_printf(p, " %s", action->name); - for (action=action->next; action; action = action->next) - seq_printf(p, ", %s", action->name); - seq_putc(p, '\n'); -skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); - } else if (i == NR_IRQS) { - seq_printf(p, "NMI: "); - for (j = 0; j < NR_CPUS; j++) - if (cpu_online(j)) - seq_printf(p, "%10u ", cpu_pda[j].__nmi_count); - seq_putc(p, '\n'); -#ifdef CONFIG_X86_LOCAL_APIC - seq_printf(p, "LOC: "); - for (j = 0; j < NR_CPUS; j++) - if (cpu_online(j)) - seq_printf(p, "%10u ", cpu_pda[j].apic_timer_irqs); - seq_putc(p, '\n'); -#endif - seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); -#ifdef CONFIG_X86_IO_APIC -#ifdef APIC_MISMATCH_DEBUG - seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); -#endif -#endif - } - return 0; -} - -/* - * do_IRQ handles all normal device IRQ's (the special - * SMP cross-CPU interrupts have their own specific - * handlers). - */ -asmlinkage unsigned int do_IRQ(struct pt_regs *regs) -{ - /* high bits used in ret_from_ code */ - int irq = regs->orig_rax & __IRQ_MASK(HARDIRQ_BITS); - - irq_enter(); - - __do_IRQ(irq, regs); - irq_exit(); - - return 1; -} - diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/ldt.c --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/ldt.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,263 +0,0 @@ -/* - * linux/arch/x86_64/kernel/ldt.c - * - * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds - * Copyright (C) 1999 Ingo Molnar <mingo@xxxxxxxxxx> - * Copyright (C) 2002 Andi Kleen - * - * This handles calls from both 32bit and 64bit mode. 
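
do_IRQ above recovers the vector number by masking regs->orig_rax,
because the entry code borrows the high bits. Assuming __IRQ_MASK(n)
expands to a mask of the low n bits (an assumption about the hardirq
headers, not something this file shows), the arithmetic is just:

    #include <stdio.h>

    /* Assumed expansion of __IRQ_MASK(x). */
    #define IRQ_MASK(x) ((1ul << (x)) - 1)

    int main(void)
    {
            /* Pretend the entry code parked flags in the high bits and
             * the vector (0x5e here) in the low ones; 12 is an
             * illustrative width, not a claim about HARDIRQ_BITS. */
            unsigned long orig_rax = (0xabcdul << 12) | 0x5e;

            printf("irq = 0x%lx\n", orig_rax & IRQ_MASK(12));  /* 0x5e */
            return 0;
    }
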
- */ - -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/smp_lock.h> -#include <linux/vmalloc.h> -#include <linux/slab.h> - -#include <asm/uaccess.h> -#include <asm/system.h> -#include <asm/ldt.h> -#include <asm/desc.h> -#include <asm/proto.h> -#include <asm/pgalloc.h> - -#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ -static void flush_ldt(void *null) -{ - if (current->active_mm) - load_LDT(¤t->active_mm->context); -} -#endif - -static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload) -{ - void *oldldt; - void *newldt; - unsigned oldsize; - - if (mincount <= (unsigned)pc->size) - return 0; - oldsize = pc->size; - mincount = (mincount+511)&(~511); - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); - else - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); - - if (!newldt) - return -ENOMEM; - - if (oldsize) - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); - oldldt = pc->ldt; - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); - wmb(); - pc->ldt = newldt; - wmb(); - pc->size = mincount; - wmb(); - if (reload) { -#ifdef CONFIG_SMP - cpumask_t mask; - preempt_disable(); -#endif - make_pages_readonly(pc->ldt, (pc->size * LDT_ENTRY_SIZE) / - PAGE_SIZE); - load_LDT(pc); -#ifdef CONFIG_SMP - mask = cpumask_of_cpu(smp_processor_id()); - if (!cpus_equal(current->mm->cpu_vm_mask, mask)) - smp_call_function(flush_ldt, NULL, 1, 1); - preempt_enable(); -#endif - } - if (oldsize) { - make_pages_writable(oldldt, (oldsize * LDT_ENTRY_SIZE) / - PAGE_SIZE); - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(oldldt); - else - kfree(oldldt); - } - return 0; -} - -static inline int copy_ldt(mm_context_t *new, mm_context_t *old) -{ - int err = alloc_ldt(new, old->size, 0); - if (err < 0) - return err; - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); - make_pages_readonly(new->ldt, (new->size * LDT_ENTRY_SIZE) / - PAGE_SIZE); - return 0; -} - -/* - * we do not have to muck with descriptors here, that is - * done in switch_mm() as needed. - */ -int init_new_context(struct task_struct *tsk, struct mm_struct *mm) -{ - struct mm_struct * old_mm; - int retval = 0; - - init_MUTEX(&mm->context.sem); - mm->context.size = 0; - old_mm = current->mm; - if (old_mm && old_mm->context.size > 0) { - down(&old_mm->context.sem); - retval = copy_ldt(&mm->context, &old_mm->context); - up(&old_mm->context.sem); - } - return retval; -} - -/* - * - * Don't touch the LDT register - we're already in the next thread. 
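
alloc_ldt above rounds the requested entry count up to a multiple of 512
and switches from kmalloc to vmalloc once the table no longer fits in a
page. With 8-byte descriptors and 4 KiB pages (both stated here as
assumptions; they are the usual x86 values), the break-even point is
exactly 512 entries:

    #include <stdio.h>

    #define LDT_ENTRY_SIZE 8        /* assumed: 8-byte descriptors */
    #define PAGE_SIZE      4096     /* assumed: 4 KiB pages */

    int main(void)
    {
            for (unsigned want = 1; want <= 1024; want *= 4) {
                    unsigned count = (want + 511) & ~511u;
                    unsigned bytes = count * LDT_ENTRY_SIZE;

                    printf("want %4u -> %4u entries, %5u bytes (%s)\n",
                           want, count, bytes,
                           bytes > PAGE_SIZE ? "vmalloc" : "kmalloc");
            }
            return 0;
    }
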
- */ -void destroy_context(struct mm_struct *mm) -{ - if (mm->context.size) { - if (mm == current->active_mm) - clear_LDT(); - make_pages_writable(mm->context.ldt, - (mm->context.size * LDT_ENTRY_SIZE) / - PAGE_SIZE); - if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(mm->context.ldt); - else - kfree(mm->context.ldt); - mm->context.size = 0; - } -} - -static int read_ldt(void __user * ptr, unsigned long bytecount) -{ - int err; - unsigned long size; - struct mm_struct * mm = current->mm; - - if (!mm->context.size) - return 0; - if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) - bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; - - down(&mm->context.sem); - size = mm->context.size*LDT_ENTRY_SIZE; - if (size > bytecount) - size = bytecount; - - err = 0; - if (copy_to_user(ptr, mm->context.ldt, size)) - err = -EFAULT; - up(&mm->context.sem); - if (err < 0) - goto error_return; - if (size != bytecount) { - /* zero-fill the rest */ - if (clear_user(ptr+size, bytecount-size) != 0) { - err = -EFAULT; - goto error_return; - } - } - return bytecount; -error_return: - return err; -} - -static int read_default_ldt(void __user * ptr, unsigned long bytecount) -{ - /* Arbitrary number */ - /* x86-64 default LDT is all zeros */ - if (bytecount > 128) - bytecount = 128; - if (clear_user(ptr, bytecount)) - return -EFAULT; - return bytecount; -} - -static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) -{ - struct task_struct *me = current; - struct mm_struct * mm = me->mm; - __u32 entry_1, entry_2, *lp; - unsigned long mach_lp; - int error; - struct user_desc ldt_info; - - error = -EINVAL; - if (bytecount != sizeof(ldt_info)) - goto out; - error = -EFAULT; - if (copy_from_user(&ldt_info, ptr, bytecount)) - goto out; - - error = -EINVAL; - if (ldt_info.entry_number >= LDT_ENTRIES) - goto out; - if (ldt_info.contents == 3) { - if (oldmode) - goto out; - if (ldt_info.seg_not_present == 0) - goto out; - } - - down(&mm->context.sem); - if (ldt_info.entry_number >= (unsigned)mm->context.size) { - error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); - if (error < 0) - goto out_unlock; - } - - lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); - mach_lp = arbitrary_virt_to_machine(lp); - - /* Allow LDTs to be cleared by the user. */ - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { - if (oldmode || LDT_empty(&ldt_info)) { - entry_1 = 0; - entry_2 = 0; - goto install; - } - } - - entry_1 = LDT_entry_a(&ldt_info); - entry_2 = LDT_entry_b(&ldt_info); - if (oldmode) - entry_2 &= ~(1 << 20); - - /* Install the new entry ... */ -install: - error = HYPERVISOR_update_descriptor(mach_lp, (unsigned long)((entry_1 | (unsigned long) entry_2 << 32))); - -out_unlock: - up(&mm->context.sem); -out: - return error; -} - -asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) -{ - int ret = -ENOSYS; - - switch (func) { - case 0: - ret = read_ldt(ptr, bytecount); - break; - case 1: - ret = write_ldt(ptr, bytecount, 1); - break; - case 2: - ret = read_default_ldt(ptr, bytecount); - break; - case 0x11: - ret = write_ldt(ptr, bytecount, 0); - break; - } - return ret; -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/mpparse.c --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/mpparse.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,954 +0,0 @@ -/* - * Intel Multiprocessor Specification 1.1 and 1.4 - * compliant MP-table parsing routines. 
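
sys_modify_ldt above is reachable from user space through syscall(2). A
hedged sketch that installs one small 32-bit data segment; struct
user_desc comes from <asm/ldt.h>, and func 0x11 selects the "new mode"
write_ldt path seen in the dispatch:

    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <asm/ldt.h>            /* struct user_desc */
    #include <sys/syscall.h>

    int main(void)
    {
            struct user_desc d;

            memset(&d, 0, sizeof(d));
            d.entry_number = 0;
            d.base_addr    = 0;
            d.limit        = 0xfff;     /* 4 KiB, byte granularity */
            d.seg_32bit    = 1;

            if (syscall(SYS_modify_ldt, 0x11, &d, sizeof(d)) != 0)
                    perror("modify_ldt");
            else
                    puts("LDT entry 0 installed");
            return 0;
    }
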
- * - * (c) 1995 Alan Cox, Building #3 <alan@xxxxxxxxxx> - * (c) 1998, 1999, 2000 Ingo Molnar <mingo@xxxxxxxxxx> - * - * Fixes - * Erich Boleyn : MP v1.4 and additional changes. - * Alan Cox : Added EBDA scanning - * Ingo Molnar : various cleanups and rewrites - * Maciej W. Rozycki: Bits for default MP configurations - * Paul Diefenbaugh: Added full ACPI support - */ - -#include <linux/mm.h> -#include <linux/irq.h> -#include <linux/init.h> -#include <linux/delay.h> -#include <linux/config.h> -#include <linux/bootmem.h> -#include <linux/smp_lock.h> -#include <linux/kernel_stat.h> -#include <linux/mc146818rtc.h> -#include <linux/acpi.h> - -#include <asm/smp.h> -#include <asm/mtrr.h> -#include <asm/mpspec.h> -#include <asm/pgalloc.h> -#include <asm/io_apic.h> -#include <asm/proto.h> - -/* Have we found an MP table */ -int smp_found_config; -unsigned int __initdata maxcpus = NR_CPUS; - -int acpi_found_madt; - -/* - * Various Linux-internal data structures created from the - * MP-table. - */ -int apic_version [MAX_APICS]; -unsigned char mp_bus_id_to_type [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; -int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; -cpumask_t pci_bus_to_cpumask [256] = { [0 ... 255] = CPU_MASK_ALL }; - -int mp_current_pci_id = 0; -/* I/O APIC entries */ -struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; - -/* # of MP IRQ source entries */ -struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; - -/* MP IRQ source entries */ -int mp_irq_entries; - -int nr_ioapics; -int pic_mode; -unsigned long mp_lapic_addr = 0; - - - -/* Processor that is doing the boot up */ -unsigned int boot_cpu_id = -1U; -/* Internal processor count */ -static unsigned int num_processors = 0; - -/* Bitmask of physically existing CPUs */ -physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; - -/* ACPI MADT entry parsing functions */ -#ifdef CONFIG_ACPI_BOOT -extern struct acpi_boot_flags acpi_boot; -#ifdef CONFIG_X86_LOCAL_APIC -extern int acpi_parse_lapic (acpi_table_entry_header *header); -extern int acpi_parse_lapic_addr_ovr (acpi_table_entry_header *header); -extern int acpi_parse_lapic_nmi (acpi_table_entry_header *header); -#endif /*CONFIG_X86_LOCAL_APIC*/ -#ifdef CONFIG_X86_IO_APIC -extern int acpi_parse_ioapic (acpi_table_entry_header *header); -#endif /*CONFIG_X86_IO_APIC*/ -#endif /*CONFIG_ACPI_BOOT*/ - -u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; - - -/* - * Intel MP BIOS table parsing routines: - */ - -/* - * Checksum an MP configuration block. - */ - -static int __init mpf_checksum(unsigned char *mp, int len) -{ - int sum = 0; - - while (len--) - sum += *mp++; - - return sum & 0xFF; -} - -#ifndef CONFIG_XEN -static void __init MP_processor_info (struct mpc_config_processor *m) -{ - int ver; - - if (!(m->mpc_cpuflag & CPU_ENABLED)) - return; - - printk(KERN_INFO "Processor #%d %d:%d APIC version %d\n", - m->mpc_apicid, - (m->mpc_cpufeature & CPU_FAMILY_MASK)>>8, - (m->mpc_cpufeature & CPU_MODEL_MASK)>>4, - m->mpc_apicver); - - if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { - Dprintk(" Bootup CPU\n"); - boot_cpu_id = m->mpc_apicid; - } - if (num_processors >= NR_CPUS) { - printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." - " Processor ignored.\n", NR_CPUS); - return; - } - if (num_processors >= maxcpus) { - printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." - " Processor ignored.\n", maxcpus); - return; - } - - num_processors++; - - if (m->mpc_apicid > MAX_APICS) { - printk(KERN_ERR "Processor #%d INVALID. 
(Max ID: %d).\n", - m->mpc_apicid, MAX_APICS); - return; - } - ver = m->mpc_apicver; - - physid_set(m->mpc_apicid, phys_cpu_present_map); - /* - * Validate version - */ - if (ver == 0x0) { - printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! fixing up to 0x10. (tell your hw vendor)\n", m->mpc_apicid); - ver = 0x10; - } - apic_version[m->mpc_apicid] = ver; - bios_cpu_apicid[num_processors - 1] = m->mpc_apicid; -} -#else -void __init MP_processor_info (struct mpc_config_processor *m) -{ - num_processors++; -} -#endif /* CONFIG_XEN */ - -static void __init MP_bus_info (struct mpc_config_bus *m) -{ - char str[7]; - - memcpy(str, m->mpc_bustype, 6); - str[6] = 0; - Dprintk("Bus #%d is %s\n", m->mpc_busid, str); - - if (strncmp(str, "ISA", 3) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; - } else if (strncmp(str, "EISA", 4) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; - } else if (strncmp(str, "PCI", 3) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; - mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; - mp_current_pci_id++; - } else if (strncmp(str, "MCA", 3) == 0) { - mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; - } else { - printk(KERN_ERR "Unknown bustype %s\n", str); - } -} - -static void __init MP_ioapic_info (struct mpc_config_ioapic *m) -{ - if (!(m->mpc_flags & MPC_APIC_USABLE)) - return; - - printk("I/O APIC #%d Version %d at 0x%X.\n", - m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); - if (nr_ioapics >= MAX_IO_APICS) { - printk(KERN_ERR "Max # of I/O APICs (%d) exceeded (found %d).\n", - MAX_IO_APICS, nr_ioapics); - panic("Recompile kernel with bigger MAX_IO_APICS!.\n"); - } - if (!m->mpc_apicaddr) { - printk(KERN_ERR "WARNING: bogus zero I/O APIC address" - " found in MP table, skipping!\n"); - return; - } - mp_ioapics[nr_ioapics] = *m; - nr_ioapics++; -} - -static void __init MP_intsrc_info (struct mpc_config_intsrc *m) -{ - mp_irqs [mp_irq_entries] = *m; - Dprintk("Int: type %d, pol %d, trig %d, bus %d," - " IRQ %02x, APIC ID %x, APIC INT %02x\n", - m->mpc_irqtype, m->mpc_irqflag & 3, - (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, - m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); - if (++mp_irq_entries == MAX_IRQ_SOURCES) - panic("Max # of irq sources exceeded!!\n"); -} - -static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) -{ - Dprintk("Lint: type %d, pol %d, trig %d, bus %d," - " IRQ %02x, APIC ID %x, APIC LINT %02x\n", - m->mpc_irqtype, m->mpc_irqflag & 3, - (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, - m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); - /* - * Well it seems all SMP boards in existence - * use ExtINT/LVT1 == LINT0 and - * NMI/LVT2 == LINT1 - the following check - * will show us if this assumptions is false. - * Until then we do not have to add baggage. 
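
Every MP structure in this parser is validated by the byte-sum rule in
mpf_checksum: a block is good only when all of its bytes, checksum field
included, sum to zero mod 256. Standalone:

    #include <stdio.h>

    /* Same rule as mpf_checksum above. */
    static int mpf_checksum(const unsigned char *mp, int len)
    {
            int sum = 0;

            while (len--)
                    sum += *mp++;
            return sum & 0xFF;          /* 0 means valid */
    }

    int main(void)
    {
            unsigned char blk[16] = { '_', 'M', 'P', '_' }; /* rest 0 */

            /* Forge the last byte so the block sums to 0 mod 256. */
            blk[15] = (unsigned char)-mpf_checksum(blk, sizeof blk);
            printf("valid: %s\n",
                   mpf_checksum(blk, sizeof blk) == 0 ? "yes" : "no");
            return 0;
    }
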
- */ - if ((m->mpc_irqtype == mp_ExtINT) && - (m->mpc_destapiclint != 0)) - BUG(); - if ((m->mpc_irqtype == mp_NMI) && - (m->mpc_destapiclint != 1)) - BUG(); -} - -/* - * Read/parse the MPC - */ - -static int __init smp_read_mpc(struct mp_config_table *mpc) -{ - char str[16]; - int count=sizeof(*mpc); - unsigned char *mpt=((unsigned char *)mpc)+count; - - if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { - printk("SMP mptable: bad signature [%c%c%c%c]!\n", - mpc->mpc_signature[0], - mpc->mpc_signature[1], - mpc->mpc_signature[2], - mpc->mpc_signature[3]); - return 0; - } - if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { - printk("SMP mptable: checksum error!\n"); - return 0; - } - if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { - printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n", - mpc->mpc_spec); - return 0; - } - if (!mpc->mpc_lapic) { - printk(KERN_ERR "SMP mptable: null local APIC address!\n"); - return 0; - } - memcpy(str,mpc->mpc_oem,8); - str[8]=0; - printk(KERN_INFO "OEM ID: %s ",str); - - memcpy(str,mpc->mpc_productid,12); - str[12]=0; - printk(KERN_INFO "Product ID: %s ",str); - - printk(KERN_INFO "APIC at: 0x%X\n",mpc->mpc_lapic); - - /* save the local APIC address, it might be non-default */ - if (!acpi_lapic) - mp_lapic_addr = mpc->mpc_lapic; - - /* - * Now process the configuration blocks. - */ - while (count < mpc->mpc_length) { - switch(*mpt) { - case MP_PROCESSOR: - { - struct mpc_config_processor *m= - (struct mpc_config_processor *)mpt; - if (!acpi_lapic) - MP_processor_info(m); - mpt += sizeof(*m); - count += sizeof(*m); - break; - } - case MP_BUS: - { - struct mpc_config_bus *m= - (struct mpc_config_bus *)mpt; - MP_bus_info(m); - mpt += sizeof(*m); - count += sizeof(*m); - break; - } - case MP_IOAPIC: - { - struct mpc_config_ioapic *m= - (struct mpc_config_ioapic *)mpt; - MP_ioapic_info(m); - mpt+=sizeof(*m); - count+=sizeof(*m); - break; - } - case MP_INTSRC: - { - struct mpc_config_intsrc *m= - (struct mpc_config_intsrc *)mpt; - - MP_intsrc_info(m); - mpt+=sizeof(*m); - count+=sizeof(*m); - break; - } - case MP_LINTSRC: - { - struct mpc_config_lintsrc *m= - (struct mpc_config_lintsrc *)mpt; - MP_lintsrc_info(m); - mpt+=sizeof(*m); - count+=sizeof(*m); - break; - } - } - } - clustered_apic_check(); - if (!num_processors) - printk(KERN_ERR "SMP mptable: no processors registered!\n"); - return num_processors; -} - -static int __init ELCR_trigger(unsigned int irq) -{ - unsigned int port; - - port = 0x4d0 + (irq >> 3); - return (inb(port) >> (irq & 7)) & 1; -} - -static void __init construct_default_ioirq_mptable(int mpc_default_type) -{ - struct mpc_config_intsrc intsrc; - int i; - int ELCR_fallback = 0; - - intsrc.mpc_type = MP_INTSRC; - intsrc.mpc_irqflag = 0; /* conforming */ - intsrc.mpc_srcbus = 0; - intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; - - intsrc.mpc_irqtype = mp_INT; - - /* - * If true, we have an ISA/PCI system with no IRQ entries - * in the MP table. To prevent the PCI interrupts from being set up - * incorrectly, we try to use the ELCR. The sanity check to see if - * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can - * never be level sensitive, so we simply see if the ELCR agrees. - * If it does, we assume it's valid. - */ - if (mpc_default_type == 5) { - printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n"); - - if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13)) - printk(KERN_ERR "ELCR contains invalid data... 
not using ELCR\n"); - else { - printk(KERN_INFO "Using ELCR to identify PCI interrupts\n"); - ELCR_fallback = 1; - } - } - - for (i = 0; i < 16; i++) { - switch (mpc_default_type) { - case 2: - if (i == 0 || i == 13) - continue; /* IRQ0 & IRQ13 not connected */ - /* fall through */ - default: - if (i == 2) - continue; /* IRQ2 is never connected */ - } - - if (ELCR_fallback) { - /* - * If the ELCR indicates a level-sensitive interrupt, we - * copy that information over to the MP table in the - * irqflag field (level sensitive, active high polarity). - */ - if (ELCR_trigger(i)) - intsrc.mpc_irqflag = 13; - else - intsrc.mpc_irqflag = 0; - } - - intsrc.mpc_srcbusirq = i; - intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ - MP_intsrc_info(&intsrc); - } - - intsrc.mpc_irqtype = mp_ExtINT; - intsrc.mpc_srcbusirq = 0; - intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */ - MP_intsrc_info(&intsrc); -} - -static inline void __init construct_default_ISA_mptable(int mpc_default_type) -{ - struct mpc_config_processor processor; - struct mpc_config_bus bus; - struct mpc_config_ioapic ioapic; - struct mpc_config_lintsrc lintsrc; - int linttypes[2] = { mp_ExtINT, mp_NMI }; - int i; - - /* - * local APIC has default address - */ - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - - /* - * 2 CPUs, numbered 0 & 1. - */ - processor.mpc_type = MP_PROCESSOR; - /* Either an integrated APIC or a discrete 82489DX. */ - processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; - processor.mpc_cpuflag = CPU_ENABLED; - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | - (boot_cpu_data.x86_model << 4) | - boot_cpu_data.x86_mask; - processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; - processor.mpc_reserved[0] = 0; - processor.mpc_reserved[1] = 0; - for (i = 0; i < 2; i++) { - processor.mpc_apicid = i; - MP_processor_info(&processor); - } - - bus.mpc_type = MP_BUS; - bus.mpc_busid = 0; - switch (mpc_default_type) { - default: - printk(KERN_ERR "???\nUnknown standard configuration %d\n", - mpc_default_type); - /* fall through */ - case 1: - case 5: - memcpy(bus.mpc_bustype, "ISA ", 6); - break; - case 2: - case 6: - case 3: - memcpy(bus.mpc_bustype, "EISA ", 6); - break; - case 4: - case 7: - memcpy(bus.mpc_bustype, "MCA ", 6); - } - MP_bus_info(&bus); - if (mpc_default_type > 4) { - bus.mpc_busid = 1; - memcpy(bus.mpc_bustype, "PCI ", 6); - MP_bus_info(&bus); - } - - ioapic.mpc_type = MP_IOAPIC; - ioapic.mpc_apicid = 2; - ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; - ioapic.mpc_flags = MPC_APIC_USABLE; - ioapic.mpc_apicaddr = 0xFEC00000; - MP_ioapic_info(&ioapic); - - /* - * We set up most of the low 16 IO-APIC pins according to MPS rules. - */ - construct_default_ioirq_mptable(mpc_default_type); - - lintsrc.mpc_type = MP_LINTSRC; - lintsrc.mpc_irqflag = 0; /* conforming */ - lintsrc.mpc_srcbusid = 0; - lintsrc.mpc_srcbusirq = 0; - lintsrc.mpc_destapic = MP_APIC_ALL; - for (i = 0; i < 2; i++) { - lintsrc.mpc_irqtype = linttypes[i]; - lintsrc.mpc_destapiclint = i; - MP_lintsrc_info(&lintsrc); - } -} - -static struct intel_mp_floating *mpf_found; - -/* - * Scan the memory blocks for an SMP configuration block. - */ -void __init get_smp_config (void) -{ - struct intel_mp_floating *mpf = mpf_found; - - /* - * ACPI may be used to obtain the entire SMP configuration or just to - * enumerate/configure processors (CONFIG_ACPI_BOOT). Note that - * ACPI supports both logical (e.g. Hyper-Threading) and physical - * processors, where MPS only supports physical. 
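
Two bits of arithmetic in the ELCR fallback above deserve unpacking: the
ELCR is two one-byte I/O ports (0x4d0 for IRQ0-7, 0x4d1 for IRQ8-15, one
bit per line), and the MPS irqflag packs polarity into bits 0-1 and
trigger mode into bits 2-3, so the magic value 13 decodes as
level-triggered, active-high. A sketch of both:

    #include <stdio.h>

    int main(void)
    {
            /* ELCR addressing, as in ELCR_trigger(). */
            for (int irq = 0; irq < 16; irq += 5)
                    printf("IRQ%-2d -> port 0x%x, bit %d\n",
                           irq, 0x4d0 + (irq >> 3), irq & 7);

            /* MPS irqflag decoding, matching the Dprintk format above. */
            int irqflag = 13;
            printf("irqflag %d: polarity %d, trigger %d\n",
                   irqflag, irqflag & 3, (irqflag >> 2) & 3);
            return 0;
    }
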
- */ - if (acpi_lapic && acpi_ioapic) { - printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n"); - return; - } - else if (acpi_lapic) - printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); - - printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); - if (mpf->mpf_feature2 & (1<<7)) { - printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); - pic_mode = 1; - } else { - printk(KERN_INFO " Virtual Wire compatibility mode.\n"); - pic_mode = 0; - } - - /* - * Now see if we need to read further. - */ - if (mpf->mpf_feature1 != 0) { - - printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1); - construct_default_ISA_mptable(mpf->mpf_feature1); - - } else if (mpf->mpf_physptr) { - - /* - * Read the physical hardware table. Anything here will - * override the defaults. - */ - if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) { - smp_found_config = 0; - printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); - printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); - return; - } - /* - * If there are no explicit MP IRQ entries, then we are - * broken. We set up most of the low 16 IO-APIC pins to - * ISA defaults and hope it will work. - */ - if (!mp_irq_entries) { - struct mpc_config_bus bus; - - printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n"); - - bus.mpc_type = MP_BUS; - bus.mpc_busid = 0; - memcpy(bus.mpc_bustype, "ISA ", 6); - MP_bus_info(&bus); - - construct_default_ioirq_mptable(0); - } - - } else - BUG(); - - printk(KERN_INFO "Processors: %d\n", num_processors); - /* - * Only use the first configuration found. - */ -} - -static int __init smp_scan_config (unsigned long base, unsigned long length) -{ - extern void __bad_mpf_size(void); - unsigned int *bp = isa_bus_to_virt(base); - struct intel_mp_floating *mpf; - - Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); - if (sizeof(*mpf) != 16) - __bad_mpf_size(); - - while (length > 0) { - mpf = (struct intel_mp_floating *)bp; - if ((*bp == SMP_MAGIC_IDENT) && - (mpf->mpf_length == 1) && - !mpf_checksum((unsigned char *)bp, 16) && - ((mpf->mpf_specification == 1) - || (mpf->mpf_specification == 4)) ) { - - smp_found_config = 1; - mpf_found = mpf; - return 1; - } - bp += 4; - length -= 16; - } - return 0; -} - -void __init find_intel_smp (void) -{ - unsigned int address; - - /* - * FIXME: Linux assumes you have 640K of base ram.. - * this continues the error... - * - * 1) Scan the bottom 1K for a signature - * 2) Scan the top 1K of base RAM - * 3) Scan the 64K of bios - */ - if (smp_scan_config(0x0,0x400) || - smp_scan_config(639*0x400,0x400) || - smp_scan_config(0xF0000,0x10000)) - return; - /* - * If it is an SMP machine we should know now, unless the - * configuration is in an EISA/MCA bus machine with an - * extended bios data area. - * - * there is a real-mode segmented pointer pointing to the - * 4K EBDA area at 0x40E, calculate and scan it here. - * - * NOTE! There are Linux loaders that will corrupt the EBDA - * area, and as such this kind of SMP config may be less - * trustworthy, simply because the SMP table may have been - * stomped on during early boot. These loaders are buggy and - * should be fixed. 
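
smp_scan_config above walks candidate memory in 16-byte steps, looking
for the "_MP_" magic followed by a valid checksum. The same loop shape
over an ordinary buffer; reading the magic as a little-endian u32
matches the *bp == SMP_MAGIC_IDENT comparison on x86:

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    static int sums_to_zero(const unsigned char *p, int len)
    {
            int sum = 0;

            while (len--)
                    sum += *p++;
            return (sum & 0xff) == 0;
    }

    int main(void)
    {
            unsigned char mem[256] = { 0 };
            uint32_t magic, v;
            int s = 0;

            memcpy(&magic, "_MP_", 4);
            memcpy(mem + 64, "_MP_", 4);        /* plant a candidate */
            for (int i = 0; i < 16; i++)
                    s += mem[64 + i];
            mem[64 + 15] = (unsigned char)-s;   /* forge its checksum */

            for (int off = 0; off + 16 <= (int)sizeof(mem); off += 16) {
                    memcpy(&v, mem + off, 4);
                    if (v == magic && sums_to_zero(mem + off, 16)) {
                            printf("floating pointer at %d\n", off);
                            return 0;
                    }
            }
            puts("no mptable found");
            return 1;
    }
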
- */ - - address = *(unsigned short *)phys_to_virt(0x40E); - address <<= 4; - if (smp_scan_config(address, 0x1000)) - return; - - /* If we have come this far, we did not find an MP table */ - printk(KERN_INFO "No mptable found.\n"); -} - -/* - * - Intel MP Configuration Table - */ -void __init find_smp_config (void) -{ -#ifdef CONFIG_X86_LOCAL_APIC - find_intel_smp(); -#endif -} - - -/* -------------------------------------------------------------------------- - ACPI-based MP Configuration - -------------------------------------------------------------------------- */ - -#ifdef CONFIG_ACPI_BOOT - -void __init mp_register_lapic_address ( - u64 address) -{ -#ifndef CONFIG_XEN - mp_lapic_addr = (unsigned long) address; - - if (boot_cpu_id == -1U) - boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); - - Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); -#endif -} - - -void __init mp_register_lapic ( - u8 id, - u8 enabled) -{ - struct mpc_config_processor processor; - int boot_cpu = 0; - - if (id >= MAX_APICS) { - printk(KERN_WARNING "Processor #%d invalid (max %d)\n", - id, MAX_APICS); - return; - } - - if (id == boot_cpu_physical_apicid) - boot_cpu = 1; - -#ifndef CONFIG_XEN - processor.mpc_type = MP_PROCESSOR; - processor.mpc_apicid = id; - processor.mpc_apicver = 0x10; /* TBD: lapic version */ - processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); - processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); - processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | - (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; - processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; - processor.mpc_reserved[0] = 0; - processor.mpc_reserved[1] = 0; -#endif - - MP_processor_info(&processor); -} - -#ifdef CONFIG_X86_IO_APIC - -#define MP_ISA_BUS 0 -#define MP_MAX_IOAPIC_PIN 127 - -struct mp_ioapic_routing { - int apic_id; - int gsi_start; - int gsi_end; - u32 pin_programmed[4]; -} mp_ioapic_routing[MAX_IO_APICS]; - - -static int mp_find_ioapic ( - int gsi) -{ - int i = 0; - - /* Find the IOAPIC that manages this GSI. */ - for (i = 0; i < nr_ioapics; i++) { - if ((gsi >= mp_ioapic_routing[i].gsi_start) - && (gsi <= mp_ioapic_routing[i].gsi_end)) - return i; - } - - printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); - - return -1; -} - - -void __init mp_register_ioapic ( - u8 id, - u32 address, - u32 gsi_base) -{ - int idx = 0; - - if (nr_ioapics >= MAX_IO_APICS) { - printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " - "(found %d)\n", MAX_IO_APICS, nr_ioapics); - panic("Recompile kernel with bigger MAX_IO_APICS!\n"); - } - if (!address) { - printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" - " found in MADT table, skipping!\n"); - return; - } - - idx = nr_ioapics++; - - mp_ioapics[idx].mpc_type = MP_IOAPIC; - mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; - mp_ioapics[idx].mpc_apicaddr = address; - - mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id); - mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); - - /* - * Build basic IRQ lookup table to facilitate gsi->io_apic lookups - * and to prevent reprogramming of IOAPIC pins (PCI IRQs). 
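
mp_find_ioapic above is a range lookup followed by a subtraction: find
the controller whose [gsi_start, gsi_end] window contains the GSI, then
pin = gsi - gsi_start. A compact sketch with two invented controllers:

    #include <stdio.h>

    struct routing { int apic_id, gsi_start, gsi_end; };

    static const struct routing rt[] = {
            { 2,  0, 23 },      /* invented: 24-pin legacy IOAPIC */
            { 3, 24, 55 },      /* invented: a second controller */
    };

    int main(void)
    {
            int gsi = 30;

            for (unsigned i = 0; i < sizeof(rt) / sizeof(rt[0]); i++) {
                    if (gsi >= rt[i].gsi_start && gsi <= rt[i].gsi_end) {
                            printf("GSI %d -> IOAPIC %d, pin %d\n", gsi,
                                   rt[i].apic_id, gsi - rt[i].gsi_start);
                            return 0;
                    }
            }
            printf("no IOAPIC for GSI %d\n", gsi);
            return 1;
    }
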
- */ - mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; - mp_ioapic_routing[idx].gsi_start = gsi_base; - mp_ioapic_routing[idx].gsi_end = gsi_base + - io_apic_get_redir_entries(idx); - - printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, - mp_ioapic_routing[idx].gsi_start, - mp_ioapic_routing[idx].gsi_end); - - return; -} - - -void __init mp_override_legacy_irq ( - u8 bus_irq, - u8 polarity, - u8 trigger, - u32 gsi) -{ - struct mpc_config_intsrc intsrc; - int ioapic = -1; - int pin = -1; - - /* - * Convert 'gsi' to 'ioapic.pin'. - */ - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) - return; - pin = gsi - mp_ioapic_routing[ioapic].gsi_start; - - /* - * TBD: This check is for faulty timer entries, where the override - * erroneously sets the trigger to level, resulting in a HUGE - * increase of timer interrupts! - */ - if ((bus_irq == 0) && (trigger == 3)) - trigger = 1; - - intsrc.mpc_type = MP_INTSRC; - intsrc.mpc_irqtype = mp_INT; - intsrc.mpc_irqflag = (trigger << 2) | polarity; - intsrc.mpc_srcbus = MP_ISA_BUS; - intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ - intsrc.mpc_dstirq = pin; /* INTIN# */ - - Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", - intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, - (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, - intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq); - - mp_irqs[mp_irq_entries] = intsrc; - if (++mp_irq_entries == MAX_IRQ_SOURCES) - panic("Max # of irq sources exceeded!\n"); - - return; -} - - -void __init mp_config_acpi_legacy_irqs (void) -{ - struct mpc_config_intsrc intsrc; - int i = 0; - int ioapic = -1; - - /* - * Fabricate the legacy ISA bus (bus #31). - */ - mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; - Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); - - /* - * Locate the IOAPIC that manages the ISA IRQs (0-15). - */ - ioapic = mp_find_ioapic(0); - if (ioapic < 0) - return; - - intsrc.mpc_type = MP_INTSRC; - intsrc.mpc_irqflag = 0; /* Conforming */ - intsrc.mpc_srcbus = MP_ISA_BUS; - intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; - - /* - * Use the default configuration for the IRQs 0-15. Unless - * overridden by (MADT) interrupt source override entries. - */ - for (i = 0; i < 16; i++) { - int idx; - - for (idx = 0; idx < mp_irq_entries; idx++) { - struct mpc_config_intsrc *irq = mp_irqs + idx; - - /* Do we already have a mapping for this ISA IRQ? 
*/ - if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i) - break; - - /* Do we already have a mapping for this IOAPIC pin */ - if ((irq->mpc_dstapic == intsrc.mpc_dstapic) && - (irq->mpc_dstirq == i)) - break; - } - - if (idx != mp_irq_entries) { - printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); - continue; /* IRQ already used */ - } - - intsrc.mpc_irqtype = mp_INT; - intsrc.mpc_srcbusirq = i; /* Identity mapped */ - intsrc.mpc_dstirq = i; - - Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, " - "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, - (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, - intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, - intsrc.mpc_dstirq); - - mp_irqs[mp_irq_entries] = intsrc; - if (++mp_irq_entries == MAX_IRQ_SOURCES) - panic("Max # of irq sources exceeded!\n"); - } - - return; -} - -int mp_register_gsi(u32 gsi, int edge_level, int active_high_low) -{ - int ioapic = -1; - int ioapic_pin = 0; - int idx, bit = 0; - - if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) - return gsi; - -#ifdef CONFIG_ACPI_BUS - /* Don't set up the ACPI SCI because it's already set up */ - if (acpi_fadt.sci_int == gsi) - return gsi; -#endif - - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) { - printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); - return gsi; - } - - ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start; - - /* - * Avoid pin reprogramming. PRTs typically include entries - * with redundant pin->gsi mappings (but unique PCI devices); - * we only program the IOAPIC on the first. - */ - bit = ioapic_pin % 32; - idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32); - if (idx > 3) { - printk(KERN_ERR "Invalid reference to IOAPIC pin " - "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, - ioapic_pin); - return gsi; - } - if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { - Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", - mp_ioapic_routing[ioapic].apic_id, ioapic_pin); - return gsi; - } - - mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); - - io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, - edge_level == ACPI_EDGE_SENSITIVE ? 0 : 1, - active_high_low == ACPI_ACTIVE_HIGH ? 0 : 1); - return gsi; -} - -#endif /*CONFIG_X86_IO_APIC*/ -#endif /*CONFIG_ACPI_BOOT*/ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/pci-dma.c --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/pci-dma.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,205 +0,0 @@ -/* - * Dynamic DMA mapping support. - */ - -#include <linux/types.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/pci.h> -#include <linux/module.h> -#include <asm/io.h> -#include <asm-xen/balloon.h> - -/* Map a set of buffers described by scatterlist in streaming - * mode for DMA. This is the scatter-gather version of the - * above pci_map_single interface. Here the scatter gather list - * elements are each tagged with the appropriate dma address - * and length. They are obtained via sg_dma_{address,length}(SG). - * - * NOTE: An implementation may be able to use a smaller number of - * DMA address/length pairs than there are SG table elements. - * (for example via virtual mapping capabilities) - * The routine returns the number of addr/length pairs actually - * used, at most nents. - * - * Device ownership issues as mentioned above for pci_map_single are - * the same here. 
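
The dma_map_sg contract described in the comment above is implemented by
the function that follows; in this no-IOMMU Xen variant the translation
is a 1:1 virt_to_bus per element. A toy model of the contract, with
plain pointers standing in for page/offset and a cast standing in for
virt_to_bus:

    #include <stdio.h>
    #include <stddef.h>
    #include <stdint.h>

    struct sg {                 /* toy scatterlist element */
            void    *buf;
            size_t   len;
            uint64_t dma_address;
            size_t   dma_length;
    };

    static int map_sg(struct sg *sg, int nents)
    {
            for (int i = 0; i < nents; i++) {
                    sg[i].dma_address = (uintptr_t)sg[i].buf;
                    sg[i].dma_length  = sg[i].len;
            }
            return nents;       /* one pair per element, no merging */
    }

    int main(void)
    {
            char a[64], b[128];
            struct sg sg[2] = { { a, sizeof(a) }, { b, sizeof(b) } };

            int n = map_sg(sg, 2);
            for (int i = 0; i < n; i++)
                    printf("sg[%d]: bus 0x%llx, len %zu\n", i,
                           (unsigned long long)sg[i].dma_address,
                           sg[i].dma_length);
            return 0;
    }
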
- */ -int dma_map_sg(struct device *hwdev, struct scatterlist *sg, - int nents, int direction) -{ - int i; - - BUG_ON(direction == DMA_NONE); - for (i = 0; i < nents; i++ ) { - struct scatterlist *s = &sg[i]; - BUG_ON(!s->page); - s->dma_address = virt_to_bus(page_address(s->page) +s->offset); - s->dma_length = s->length; - } - return nents; -} - -EXPORT_SYMBOL(dma_map_sg); - -/* Unmap a set of streaming mode DMA translations. - * Again, cpu read rules concerning calls here are the same as for - * pci_unmap_single() above. - */ -void dma_unmap_sg(struct device *dev, struct scatterlist *sg, - int nents, int dir) -{ - int i; - for (i = 0; i < nents; i++) { - struct scatterlist *s = &sg[i]; - BUG_ON(s->page == NULL); - BUG_ON(s->dma_address == 0); - dma_unmap_single(dev, s->dma_address, s->dma_length, dir); - } -} - -EXPORT_SYMBOL(dma_unmap_sg); - -struct dma_coherent_mem { - void *virt_base; - u32 device_base; - int size; - int flags; - unsigned long *bitmap; -}; - -void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, unsigned gfp) -{ - void *ret; - unsigned int order = get_order(size); - unsigned long vstart; - - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; - - /* ignore region specifiers */ - gfp &= ~(__GFP_DMA | __GFP_HIGHMEM); - - if (mem) { - int page = bitmap_find_free_region(mem->bitmap, mem->size, - order); - if (page >= 0) { - *dma_handle = mem->device_base + (page << PAGE_SHIFT); - ret = mem->virt_base + (page << PAGE_SHIFT); - memset(ret, 0, size); - return ret; - } - if (mem->flags & DMA_MEMORY_EXCLUSIVE) - return NULL; - } - - if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff)) - gfp |= GFP_DMA; - - vstart = __get_free_pages(gfp, order); - ret = (void *)vstart; - if (ret == NULL) - return ret; - - xen_contig_memory(vstart, order); - - memset(ret, 0, size); - *dma_handle = virt_to_bus(ret); - - return ret; -} -EXPORT_SYMBOL(dma_alloc_coherent); - -void dma_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle) -{ - struct dma_coherent_mem *mem = dev ? 
dev->dma_mem : NULL; - int order = get_order(size); - - if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) { - int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; - - bitmap_release_region(mem->bitmap, page, order); - } else - free_pages((unsigned long)vaddr, order); -} -EXPORT_SYMBOL(dma_free_coherent); - -#if 0 -int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, - dma_addr_t device_addr, size_t size, int flags) -{ - void __iomem *mem_base; - int pages = size >> PAGE_SHIFT; - int bitmap_size = (pages + 31)/32; - - if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) - goto out; - if (!size) - goto out; - if (dev->dma_mem) - goto out; - - /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ - - mem_base = ioremap(bus_addr, size); - if (!mem_base) - goto out; - - dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); - if (!dev->dma_mem) - goto out; - memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem)); - dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL); - if (!dev->dma_mem->bitmap) - goto free1_out; - memset(dev->dma_mem->bitmap, 0, bitmap_size); - - dev->dma_mem->virt_base = mem_base; - dev->dma_mem->device_base = device_addr; - dev->dma_mem->size = pages; - dev->dma_mem->flags = flags; - - if (flags & DMA_MEMORY_MAP) - return DMA_MEMORY_MAP; - - return DMA_MEMORY_IO; - - free1_out: - kfree(dev->dma_mem->bitmap); - out: - return 0; -} -EXPORT_SYMBOL(dma_declare_coherent_memory); - -void dma_release_declared_memory(struct device *dev) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - - if(!mem) - return; - dev->dma_mem = NULL; - iounmap(mem->virt_base); - kfree(mem->bitmap); - kfree(mem); -} -EXPORT_SYMBOL(dma_release_declared_memory); - -void *dma_mark_declared_memory_occupied(struct device *dev, - dma_addr_t device_addr, size_t size) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT; - int pos, err; - - if (!mem) - return ERR_PTR(-EINVAL); - - pos = (device_addr - mem->device_base) >> PAGE_SHIFT; - err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages)); - if (err != 0) - return ERR_PTR(err); - return mem->virt_base + (pos << PAGE_SHIFT); -} -EXPORT_SYMBOL(dma_mark_declared_memory_occupied); -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/pci-nommu.c --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/pci-nommu.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,96 +0,0 @@ -/* Fallback functions when the main IOMMU code is not compiled in. This - code is roughly equivalent to i386. 
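
The per-device pool in dma_alloc_coherent above hands out naturally
aligned power-of-two runs of pages via bitmap_find_free_region(order)
and takes them back with bitmap_release_region. A toy version of that
contract, using a byte per page instead of a real bitmap, which is
enough to show the alignment rule:

    #include <stdio.h>

    #define POOL_PAGES 16

    static char used[POOL_PAGES];

    /* A free, naturally aligned run of 1 << order pages; -1 if none. */
    static int find_free_region(int order)
    {
            int n = 1 << order;

            for (int base = 0; base + n <= POOL_PAGES; base += n) {
                    int busy = 0;

                    for (int i = 0; i < n; i++)
                            busy |= used[base + i];
                    if (!busy) {
                            for (int i = 0; i < n; i++)
                                    used[base + i] = 1;
                            return base;
                    }
            }
            return -1;
    }

    int main(void)
    {
            printf("order 1 -> page %d\n", find_free_region(1)); /* 0 */
            printf("order 2 -> page %d\n", find_free_region(2)); /* 4,
                    not 2: regions stay naturally aligned */
            return 0;
    }
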
*/ -#include <linux/mm.h> -#include <linux/init.h> -#include <linux/pci.h> -#include <linux/string.h> -#include <asm/proto.h> -#include <asm/processor.h> - -int iommu_merge = 0; -EXPORT_SYMBOL(iommu_merge); - -dma_addr_t bad_dma_address; -EXPORT_SYMBOL(bad_dma_address); - -int iommu_bio_merge = 0; -EXPORT_SYMBOL(iommu_bio_merge); - -int iommu_sac_force = 0; -EXPORT_SYMBOL(iommu_sac_force); - -#if 0 -/* - * Dummy IO MMU functions - */ - -void *dma_alloc_coherent(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, unsigned gfp) -{ - void *ret; - u64 mask; - int order = get_order(size); - - if (hwdev) - mask = hwdev->coherent_dma_mask & *hwdev->dma_mask; - else - mask = 0xffffffff; - for (;;) { - ret = (void *)__get_free_pages(gfp, order); - if (ret == NULL) - return NULL; - *dma_handle = virt_to_bus(ret); - if ((*dma_handle & ~mask) == 0) - break; - free_pages((unsigned long)ret, order); - if (gfp & GFP_DMA) - return NULL; - gfp |= GFP_DMA; - } - - memset(ret, 0, size); - return ret; -} -EXPORT_SYMBOL(dma_alloc_coherent); - -void dma_free_coherent(struct device *hwdev, size_t size, - void *vaddr, dma_addr_t dma_handle) -{ - free_pages((unsigned long)vaddr, get_order(size)); -} -EXPORT_SYMBOL(dma_free_coherent); -#endif - -int dma_supported(struct device *hwdev, u64 mask) -{ - /* - * we fall back to GFP_DMA when the mask isn't all 1s, - * so we can't guarantee allocations that must be - * within a tighter range than GFP_DMA.. - * RED-PEN this won't work for pci_map_single. Caller has to - * use GFP_DMA in the first place. - */ - if (mask < 0x00ffffff) - return 0; - - return 1; -} -EXPORT_SYMBOL(dma_supported); - -int dma_get_cache_alignment(void) -{ - return boot_cpu_data.x86_clflush_size; -} -EXPORT_SYMBOL(dma_get_cache_alignment); - -static int __init check_ram(void) -{ - if (end_pfn >= 0xffffffff>>PAGE_SHIFT) { - printk( - KERN_ERR "WARNING more than 4GB of memory but IOMMU not compiled in.\n" - KERN_ERR "WARNING 32bit PCI may malfunction.\n"); - } - return 0; -} -__initcall(check_ram); - diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/process.c --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/process.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,725 +0,0 @@ -/* - * linux/arch/x86-64/kernel/process.c - * - * Copyright (C) 1995 Linus Torvalds - * - * Pentium III FXSR, SSE support - * Gareth Hughes <gareth@xxxxxxxxxxx>, May 2000 - * - * X86-64 port - * Andi Kleen. - * - * $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $ - * - * Jun Nakajima <jun.nakajima@xxxxxxxxx> - * Modified for Xen - */ - -/* - * This file handles the architecture-dependent parts of process handling.. 
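
dma_supported in pci-nommu.c above refuses any mask smaller than the
full 24-bit ISA mask: without an IOMMU the only fallback is the GFP_DMA
(16 MiB) zone, so nothing tighter can be honoured. The check itself:

    #include <stdio.h>
    #include <stdint.h>

    static int dma_supported(uint64_t mask)
    {
            return mask < 0x00ffffffull ? 0 : 1;
    }

    int main(void)
    {
            printf("24-bit mask: %d\n", dma_supported(0x00ffffff)); /* 1 */
            printf("16-bit mask: %d\n", dma_supported(0x0000ffff)); /* 0 */
            printf("64-bit mask: %d\n", dma_supported(~0ull));      /* 1 */
            return 0;
    }
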
- */ - -#include <stdarg.h> - -#include <linux/cpu.h> -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/elfcore.h> -#include <linux/smp.h> -#include <linux/slab.h> -#include <linux/user.h> -#include <linux/module.h> -#include <linux/a.out.h> -#include <linux/interrupt.h> -#include <linux/delay.h> -#include <linux/irq.h> -#include <linux/ptrace.h> -#include <linux/utsname.h> - -#include <asm/uaccess.h> -#include <asm/pgtable.h> -#include <asm/system.h> -#include <asm/io.h> -#include <asm/processor.h> -#include <asm/i387.h> -#include <asm/mmu_context.h> -#include <asm/pda.h> -#include <asm/prctl.h> -#include <asm/kdebug.h> -#include <asm-xen/xen-public/dom0_ops.h> -#include <asm-xen/xen-public/physdev.h> -#include <asm/desc.h> -#include <asm/proto.h> -#include <asm/hardirq.h> -#include <asm/ia32.h> - -asmlinkage extern void ret_from_fork(void); - -unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; - -atomic_t hlt_counter = ATOMIC_INIT(0); - -unsigned long boot_option_idle_override = 0; -EXPORT_SYMBOL(boot_option_idle_override); - -/* - * Powermanagement idle function, if any.. - */ -void (*pm_idle)(void); -static cpumask_t cpu_idle_map; - -void disable_hlt(void) -{ - atomic_inc(&hlt_counter); -} - -EXPORT_SYMBOL(disable_hlt); - -void enable_hlt(void) -{ - atomic_dec(&hlt_counter); -} - -EXPORT_SYMBOL(enable_hlt); - -/* XXX XEN doesn't use default_idle(), poll_idle(). Use xen_idle() instead. */ -extern void stop_hz_timer(void); -extern void start_hz_timer(void); -void xen_idle(void) -{ - local_irq_disable(); - - if (need_resched()) { - local_irq_enable(); - } else { - stop_hz_timer(); - HYPERVISOR_block(); /* implicit local_irq_enable() */ - start_hz_timer(); - } -} - -#ifdef CONFIG_HOTPLUG_CPU -#include <asm/nmi.h> -/* We don't actually take CPU down, just spin without interrupts. */ -static inline void play_dead(void) -{ - /* Ack it */ - __get_cpu_var(cpu_state) = CPU_DEAD; - - /* We shouldn't have to disable interrupts while dead, but - * some interrupts just don't seem to go away, and this makes - * it "work" for testing purposes. */ - /* Death loop */ - while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE) - HYPERVISOR_yield(); - - local_irq_disable(); - __flush_tlb_all(); - cpu_set(smp_processor_id(), cpu_online_map); - local_irq_enable(); -} -#else -static inline void play_dead(void) -{ - BUG(); -} -#endif /* CONFIG_HOTPLUG_CPU */ - -/* - * The idle thread. There's no useful work to be - * done, so just try to conserve power and have a - * low exit latency (ie sit in a loop waiting for - * somebody to say that they'd like to reschedule) - */ -void cpu_idle (void) -{ - int cpu = smp_processor_id(); - - /* endless idle loop with no priority at all */ - while (1) { - while (!need_resched()) { - if (cpu_isset(cpu, cpu_idle_map)) - cpu_clear(cpu, cpu_idle_map); - rmb(); - - if (cpu_is_offline(cpu)) - play_dead(); - - __IRQ_STAT(cpu,idle_timestamp) = jiffies; - xen_idle(); - } - schedule(); - } -} - -void cpu_idle_wait(void) -{ - int cpu; - cpumask_t map; - - for_each_online_cpu(cpu) - cpu_set(cpu, cpu_idle_map); - - wmb(); - do { - ssleep(1); - cpus_and(map, cpu_idle_map, cpu_online_map); - } while (!cpus_empty(map)); -} -EXPORT_SYMBOL_GPL(cpu_idle_wait); - -/* XXX XEN doesn't use mwait_idle(), select_idle_routine(), idle_setup(). */ -/* Always use xen_idle() instead. 
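One detail of xen_idle() above deserves a note: the ordering is what makes it race-free. Interrupts are disabled before the final need_resched() check, so a wakeup cannot slip in between the check and the blocking hypercall (the classic lost-wakeup window), and HYPERVISOR_block() re-enables event delivery itself. The annotated shape of the sequence, using the symbols from the deleted file (kernel context, not standalone-compilable):

    local_irq_disable();            /* close the lost-wakeup window */
    if (need_resched()) {
        local_irq_enable();         /* work is pending: do not block */
    } else {
        stop_hz_timer();            /* no periodic tick while idle */
        HYPERVISOR_block();         /* sleep until an event channel fires;
                                       implicitly re-enables interrupts */
        start_hz_timer();           /* restore the tick on wakeup */
    }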
*/ -void __init select_idle_routine(const struct cpuinfo_x86 *c) {} - -/* Also prints some state that isn't saved in the pt_regs */ -void __show_regs(struct pt_regs * regs) -{ - unsigned long fs, gs, shadowgs; - unsigned int fsindex,gsindex; - unsigned int ds,cs,es; - - printk("\n"); - print_modules(); - printk("Pid: %d, comm: %.20s %s %s\n", - current->pid, current->comm, print_tainted(), system_utsname.release); - printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); - printk_address(regs->rip); - printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags); - printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", - regs->rax, regs->rbx, regs->rcx); - printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", - regs->rdx, regs->rsi, regs->rdi); - printk("RBP: %016lx R08: %016lx R09: %016lx\n", - regs->rbp, regs->r8, regs->r9); - printk("R10: %016lx R11: %016lx R12: %016lx\n", - regs->r10, regs->r11, regs->r12); - printk("R13: %016lx R14: %016lx R15: %016lx\n", - regs->r13, regs->r14, regs->r15); - - asm("movl %%ds,%0" : "=r" (ds)); - asm("movl %%cs,%0" : "=r" (cs)); - asm("movl %%es,%0" : "=r" (es)); - asm("movl %%fs,%0" : "=r" (fsindex)); - asm("movl %%gs,%0" : "=r" (gsindex)); - - rdmsrl(MSR_FS_BASE, fs); - rdmsrl(MSR_GS_BASE, gs); - rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); - - printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", - fs,fsindex,gs,gsindex,shadowgs); - printk("CS: %04x DS: %04x ES: %04x\n", cs, ds, es); - -} - -void show_regs(struct pt_regs *regs) -{ - __show_regs(regs); - show_trace(&regs->rsp); -} - -/* - * Free current thread data structures etc. - */ -void exit_thread(void) -{ - struct task_struct *me = current; - struct thread_struct *t = &me->thread; - if (me->thread.io_bitmap_ptr) { - struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); - - kfree(t->io_bitmap_ptr); - t->io_bitmap_ptr = NULL; - /* - * Careful, clear this in the TSS too: - */ - memset(tss->io_bitmap, 0xff, t->io_bitmap_max); - t->io_bitmap_max = 0; - put_cpu(); - } -} - -void load_gs_index(unsigned gs) -{ - HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, gs); -} - -void flush_thread(void) -{ - struct task_struct *tsk = current; - struct thread_info *t = current_thread_info(); - - if (t->flags & _TIF_ABI_PENDING) - t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32); - - tsk->thread.debugreg0 = 0; - tsk->thread.debugreg1 = 0; - tsk->thread.debugreg2 = 0; - tsk->thread.debugreg3 = 0; - tsk->thread.debugreg6 = 0; - tsk->thread.debugreg7 = 0; - memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - /* - * Forget coprocessor state. - */ - clear_fpu(tsk); - clear_used_math(); -} - -void release_thread(struct task_struct *dead_task) -{ - if (dead_task->mm) { - if (dead_task->mm->context.size) { - printk("WARNING: dead process %8s still has LDT? 
<%p/%d>\n", - dead_task->comm, - dead_task->mm->context.ldt, - dead_task->mm->context.size); - BUG(); - } - } -} - -static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) -{ - struct user_desc ud = { - .base_addr = addr, - .limit = 0xfffff, - .contents = (3 << 3), /* user */ - .seg_32bit = 1, - .limit_in_pages = 1, - .useable = 1, - }; - struct n_desc_struct *desc = (void *)t->thread.tls_array; - desc += tls; - desc->a = LDT_entry_a(&ud); - desc->b = LDT_entry_b(&ud); -} - -static inline u32 read_32bit_tls(struct task_struct *t, int tls) -{ - struct desc_struct *desc = (void *)t->thread.tls_array; - desc += tls; - return desc->base0 | - (((u32)desc->base1) << 16) | - (((u32)desc->base2) << 24); -} - -/* - * This gets called before we allocate a new thread and copy - * the current task into it. - */ -void prepare_to_copy(struct task_struct *tsk) -{ - unlazy_fpu(tsk); -} - -int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, - unsigned long unused, - struct task_struct * p, struct pt_regs * regs) -{ - int err; - struct pt_regs * childregs; - struct task_struct *me = current; - - childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1; - - *childregs = *regs; - - childregs->rax = 0; - childregs->rsp = rsp; - if (rsp == ~0UL) { - childregs->rsp = (unsigned long)childregs; - } - - p->thread.rsp = (unsigned long) childregs; - p->thread.rsp0 = (unsigned long) (childregs+1); - p->thread.userrsp = me->thread.userrsp; - - set_ti_thread_flag(p->thread_info, TIF_FORK); - - p->thread.fs = me->thread.fs; - p->thread.gs = me->thread.gs; - - asm("movl %%gs,%0" : "=m" (p->thread.gsindex)); - asm("movl %%fs,%0" : "=m" (p->thread.fsindex)); - asm("movl %%es,%0" : "=m" (p->thread.es)); - asm("movl %%ds,%0" : "=m" (p->thread.ds)); - - if (unlikely(me->thread.io_bitmap_ptr != NULL)) { - p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); - if (!p->thread.io_bitmap_ptr) { - p->thread.io_bitmap_max = 0; - return -ENOMEM; - } - memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES); - } - - /* - * Set a new TLS for the child thread? - */ - if (clone_flags & CLONE_SETTLS) { -#ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_IA32)) - err = ia32_child_tls(p, childregs); - else -#endif - err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); - if (err) - goto out; - } - p->thread.io_pl = current->thread.io_pl; - - err = 0; -out: - if (err && p->thread.io_bitmap_ptr) { - kfree(p->thread.io_bitmap_ptr); - p->thread.io_bitmap_max = 0; - } - return err; -} - -/* - * This special macro can be used to load a debugging register - */ -#define loaddebug(thread,register) \ - HYPERVISOR_set_debugreg((register), \ - (thread->debugreg ## register)) - - -static inline void __save_init_fpu( struct task_struct *tsk ) -{ - asm volatile( "rex64 ; fxsave %0 ; fnclex" - : "=m" (tsk->thread.i387.fxsave)); - tsk->thread_info->status &= ~TS_USEDFPU; -} - -/* - * switch_to(x,y) should switch tasks from x to y. - * - * This could still be optimized: - * - fold all the options into a flag word and test it with a single test. 
- * - could test fs/gs bitsliced - */ -struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p) -{ - struct thread_struct *prev = &prev_p->thread, - *next = &next_p->thread; - int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); - physdev_op_t iopl_op, iobmp_op; - multicall_entry_t _mcl[8], *mcl = _mcl; - - /* - * This is basically '__unlazy_fpu', except that we queue a - * multicall to indicate FPU task switch, rather than - * synchronously trapping to Xen. - */ - if (prev_p->thread_info->status & TS_USEDFPU) { - __save_init_fpu(prev_p); /* _not_ save_init_fpu() */ - mcl->op = __HYPERVISOR_fpu_taskswitch; - mcl->args[0] = 1; - mcl++; - } - - /* - * Reload esp0, LDT and the page table pointer: - */ - tss->rsp0 = next->rsp0; - mcl->op = __HYPERVISOR_stack_switch; - mcl->args[0] = __KERNEL_DS; - mcl->args[1] = tss->rsp0; - mcl++; - - /* - * Load the per-thread Thread-Local Storage descriptor. - * This is load_TLS(next, cpu) with multicalls. - */ -#define C(i) do { \ - if (unlikely(next->tls_array[i] != prev->tls_array[i])) { \ - mcl->op = __HYPERVISOR_update_descriptor; \ - mcl->args[0] = virt_to_machine(&get_cpu_gdt_table(cpu) \ - [GDT_ENTRY_TLS_MIN + i]); \ - mcl->args[1] = next->tls_array[i]; \ - mcl++; \ - } \ -} while (0) - C(0); C(1); C(2); -#undef C - - if (unlikely(prev->io_pl != next->io_pl)) { - iopl_op.cmd = PHYSDEVOP_SET_IOPL; - iopl_op.u.set_iopl.iopl = next->io_pl; - mcl->op = __HYPERVISOR_physdev_op; - mcl->args[0] = (unsigned long)&iopl_op; - mcl++; - } - - if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { - iobmp_op.cmd = - PHYSDEVOP_SET_IOBITMAP; - iobmp_op.u.set_iobitmap.bitmap = - (unsigned long)next->io_bitmap_ptr; - iobmp_op.u.set_iobitmap.nr_ports = - next->io_bitmap_ptr ? IO_BITMAP_BITS : 0; - mcl->op = __HYPERVISOR_physdev_op; - mcl->args[0] = (unsigned long)&iobmp_op; - mcl++; - } - - (void)HYPERVISOR_multicall(_mcl, mcl - _mcl); - /* - * Switch DS and ES. - * This won't pick up thread selector changes, but I guess that is ok. - */ - if (unlikely(next->es)) - loadsegment(es, next->es); - - if (unlikely(next->ds)) - loadsegment(ds, next->ds); - - /* - * Switch FS and GS. - */ - if (unlikely(next->fsindex)) - loadsegment(fs, next->fsindex); - - if (next->fs) - HYPERVISOR_set_segment_base(SEGBASE_FS, next->fs); - - if (unlikely(next->gsindex)) - load_gs_index(next->gsindex); - - if (next->gs) - HYPERVISOR_set_segment_base(SEGBASE_GS_USER, next->gs); - - /* - * Switch the PDA context. - */ - prev->userrsp = read_pda(oldrsp); - write_pda(oldrsp, next->userrsp); - write_pda(pcurrent, next_p); - write_pda(kernelstack, (unsigned long)next_p->thread_info + THREAD_SIZE - PDA_STACKOFFSET); - - /* - * Now maybe reload the debug registers - */ - if (unlikely(next->debugreg7)) { - loaddebug(next, 0); - loaddebug(next, 1); - loaddebug(next, 2); - loaddebug(next, 3); - /* no 4 and 5 */ - loaddebug(next, 6); - loaddebug(next, 7); - } - - return prev_p; -} - -/* - * sys_execve() executes a new program. 
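Stepping back to __switch_to() above: its distinguishing trick is that it queues up to eight operations (FPU task switch, stack switch, TLS descriptor updates, IOPL and IO-bitmap changes) in a multicall_entry_t array and crosses into Xen once, via HYPERVISOR_multicall, instead of trapping per operation. The batching pattern in isolation, as a runnable sketch with invented names (struct call, flush) standing in for the hypercall machinery:

    #include <stdio.h>

    /* "Queue now, flush once": the shape of the _mcl array logic in
     * __switch_to() above. Every name here is made up. */
    struct call { int op; unsigned long arg; };
    #define BATCH_MAX 8

    static void flush(struct call *q, int n)  /* stands in for HYPERVISOR_multicall */
    {
        for (int i = 0; i < n; i++)
            printf("op=%d arg=%lu\n", q[i].op, q[i].arg);
    }

    int main(void)
    {
        struct call batch[BATCH_MAX], *mcl = batch;

        /* Queue only the operations this particular switch needs... */
        *mcl++ = (struct call){ .op = 1, .arg = 42 };   /* e.g. FPU switch   */
        *mcl++ = (struct call){ .op = 2, .arg = 7 };    /* e.g. stack switch */

        /* ...then pay the boundary-crossing cost a single time. */
        flush(batch, (int)(mcl - batch));
        return 0;
    }

The mcl - _mcl pointer arithmetic in the kernel version counts the queued entries the same way.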
- */ -asmlinkage -long sys_execve(char __user *name, char __user * __user *argv, - char __user * __user *envp, struct pt_regs regs) -{ - long error; - char * filename; - - filename = getname(name); - error = PTR_ERR(filename); - if (IS_ERR(filename)) - return error; - error = do_execve(filename, argv, envp, &regs); - if (error == 0) { - task_lock(current); - current->ptrace &= ~PT_DTRACE; - task_unlock(current); - } - putname(filename); - return error; -} - -void set_personality_64bit(void) -{ - /* inherit personality from parent */ - - /* Make sure to be in 64bit mode */ - clear_thread_flag(TIF_IA32); - - /* TBD: overwrites user setup. Should have two bits. - But 64bit processes have always behaved this way, - so it's not too bad. The main problem is just that - 32bit children are affected again. */ - current->personality &= ~READ_IMPLIES_EXEC; -} - -asmlinkage long sys_fork(struct pt_regs *regs) -{ - return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL); -} - -asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) -{ - if (!newsp) - newsp = regs->rsp; - return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); -} - -/* - * This is trivial, and on the face of it looks like it - * could equally well be done in user mode. - * - * Not so, for quite non-obvious reasons - register pressure. - * In user mode vfork() cannot have a stack frame, and if - * done by calling the "clone()" system call directly, you - * do not have enough call-clobbered registers to hold all - * the information you need. - */ -asmlinkage long sys_vfork(struct pt_regs *regs) -{ - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0, - NULL, NULL); -} - -unsigned long get_wchan(struct task_struct *p) -{ - unsigned long stack; - u64 fp,rip; - int count = 0; - - if (!p || p == current || p->state==TASK_RUNNING) - return 0; - stack = (unsigned long)p->thread_info; - if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE) - return 0; - fp = *(u64 *)(p->thread.rsp); - do { - if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE) - return 0; - rip = *(u64 *)(fp+8); - if (!in_sched_functions(rip)) - return rip; - fp = *(u64 *)fp; - } while (count++ < 16); - return 0; -} - -long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) -{ - int ret = 0; - int doit = task == current; - int cpu; - - switch (code) { - case ARCH_SET_GS: - if (addr >= TASK_SIZE) - return -EPERM; - cpu = get_cpu(); - /* handle small bases via the GDT because that's faster to - switch. */ - if (addr <= 0xffffffff) { - set_32bit_tls(task, GS_TLS, addr); - if (doit) { - load_TLS(&task->thread, cpu); - load_gs_index(GS_TLS_SEL); - } - task->thread.gsindex = GS_TLS_SEL; - task->thread.gs = 0; - } else { - task->thread.gsindex = 0; - task->thread.gs = addr; - if (doit) { - load_gs_index(0); - ret = HYPERVISOR_set_segment_base(SEGBASE_GS_USER, addr); - } - } - put_cpu(); - break; - case ARCH_SET_FS: - /* Not strictly needed for fs, but do it for symmetry - with gs */ - if (addr >= TASK_SIZE) - return -EPERM; - cpu = get_cpu(); - /* handle small bases via the GDT because that's faster to - switch. 
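The reason small bases are cheaper here: a base below 4GB fits in an ordinary GDT descriptor, which the context switch already reloads, while a full 64-bit base needs a separate MSR write or, under Xen, the set_segment_base hypercall. The descriptor format scatters the 32-bit base across three fields, which is exactly what set_32bit_tls()/read_32bit_tls() earlier in this file pack and unpack. A standalone round trip of that packing (sample value, plain C, not the kernel structures):

    #include <stdio.h>
    #include <stdint.h>

    /* x86 descriptors split a 32-bit base: base0 = bits 0-15,
     * base1 = bits 16-23, base2 = bits 24-31. */
    struct desc { uint16_t base0; uint8_t base1, base2; };

    static struct desc pack(uint32_t base)
    {
        return (struct desc){
            .base0 = base & 0xffff,
            .base1 = (base >> 16) & 0xff,
            .base2 = (base >> 24) & 0xff,
        };
    }

    static uint32_t unpack(struct desc d)   /* same maths as read_32bit_tls() */
    {
        return d.base0 | ((uint32_t)d.base1 << 16) | ((uint32_t)d.base2 << 24);
    }

    int main(void)
    {
        uint32_t base = 0xb7f12000;          /* sample TLS base */
        printf("round trip ok: %d\n", unpack(pack(base)) == base);
        return 0;
    }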
*/ - if (addr <= 0xffffffff) { - set_32bit_tls(task, FS_TLS, addr); - if (doit) { - load_TLS(&task->thread, cpu); - asm volatile("movl %0,%%fs" :: "r" (FS_TLS_SEL)); - } - task->thread.fsindex = FS_TLS_SEL; - task->thread.fs = 0; - } else { - task->thread.fsindex = 0; - task->thread.fs = addr; - if (doit) { - /* set the selector to 0 to not confuse - __switch_to */ - asm volatile("movl %0,%%fs" :: "r" (0)); - ret = HYPERVISOR_set_segment_base(SEGBASE_FS, addr); - - } - } - put_cpu(); - break; - case ARCH_GET_FS: { - unsigned long base; - if (task->thread.fsindex == FS_TLS_SEL) - base = read_32bit_tls(task, FS_TLS); - else if (doit) { - rdmsrl(MSR_FS_BASE, base); - } else - base = task->thread.fs; - ret = put_user(base, (unsigned long __user *)addr); - break; - } - case ARCH_GET_GS: { - unsigned long base; - if (task->thread.gsindex == GS_TLS_SEL) - base = read_32bit_tls(task, GS_TLS); - else if (doit) { - rdmsrl(MSR_KERNEL_GS_BASE, base); - } else - base = task->thread.gs; - ret = put_user(base, (unsigned long __user *)addr); - break; - } - - default: - ret = -EINVAL; - break; - } - - return ret; -} - -long sys_arch_prctl(int code, unsigned long addr) -{ - return do_arch_prctl(current, code, addr); -} - -/* - * Capture the user space registers if the task is not running (in user space) - */ -int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) -{ - struct pt_regs *pp, ptregs; - - pp = (struct pt_regs *)(tsk->thread.rsp0); - --pp; - - ptregs = *pp; - ptregs.cs &= 0xffff; - ptregs.ss &= 0xffff; - - elf_core_copy_regs(regs, &ptregs); - - boot_option_idle_override = 1; - return 1; -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/setup.c --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/setup.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,1375 +0,0 @@ -/* - * linux/arch/x86-64/kernel/setup.c - * - * Copyright (C) 1995 Linus Torvalds - * - * Nov 2001 Dave Jones <davej@xxxxxxx> - * Forked from i386 setup code. 
- * - * $Id$ - */ - -/* - * This file handles the architecture-dependent parts of initialization - */ - -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/stddef.h> -#include <linux/unistd.h> -#include <linux/ptrace.h> -#include <linux/slab.h> -#include <linux/user.h> -#include <linux/a.out.h> -#include <linux/tty.h> -#include <linux/ioport.h> -#include <linux/delay.h> -#include <linux/config.h> -#include <linux/init.h> -#include <linux/initrd.h> -#include <linux/highmem.h> -#include <linux/bootmem.h> -#include <linux/module.h> -#include <asm/processor.h> -#include <linux/console.h> -#include <linux/seq_file.h> -#include <linux/root_dev.h> -#include <linux/pci.h> -#include <linux/acpi.h> -#include <linux/kallsyms.h> -#include <linux/edd.h> -#include <linux/percpu.h> -#include <asm/mtrr.h> -#include <asm/uaccess.h> -#include <asm/system.h> -#include <asm/io.h> -#include <asm/smp.h> -#include <asm/msr.h> -#include <asm/desc.h> -#include <video/edid.h> -#include <asm/e820.h> -#include <asm/dma.h> -#include <asm/mpspec.h> -#include <asm/mmu_context.h> -#include <asm/bootsetup.h> -#include <asm/proto.h> -#include <asm/setup.h> -#include <asm/mach_apic.h> -#include <asm/numa.h> -#include <asm-xen/xen-public/physdev.h> -#include "setup_arch_pre.h" -#include <asm-xen/hypervisor.h> - -#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) -#define PFN_PHYS(x) ((x) << PAGE_SHIFT) - -#include <asm/mach-xen/setup_arch_post.h> - -extern unsigned long start_pfn; - -#if 0 -struct edid_info { - unsigned char dummy[128]; -}; -#endif - -extern struct edid_info edid_info; - -/* Allows setting of maximum possible memory size */ -unsigned long xen_override_max_pfn; -/* - * Machine setup.. - */ - -struct cpuinfo_x86 boot_cpu_data; - -unsigned long mmu_cr4_features; -EXPORT_SYMBOL_GPL(mmu_cr4_features); - -int acpi_disabled; -EXPORT_SYMBOL(acpi_disabled); -#ifdef CONFIG_ACPI_BOOT -extern int __initdata acpi_ht; -extern acpi_interrupt_flags acpi_sci_flags; -int __initdata acpi_force = 0; -#endif - -int acpi_numa __initdata; - -/* For PCI or other memory-mapped resources */ -unsigned long pci_mem_start = 0x10000000; - -/* Boot loader ID as an integer, for the benefit of proc_dointvec */ -int bootloader_type; - -unsigned long saved_video_mode; - -#ifdef CONFIG_SWIOTLB -int swiotlb; -EXPORT_SYMBOL(swiotlb); -#endif - -/* - * Setup options - */ -struct drive_info_struct { char dummy[32]; } drive_info; -struct screen_info screen_info; -struct sys_desc_table_struct { - unsigned short length; - unsigned char table[0]; -}; - -struct edid_info edid_info; -struct e820map e820; - -unsigned char aux_device_present; - -extern int root_mountflags; -extern char _text, _etext, _edata, _end; - -char command_line[COMMAND_LINE_SIZE]; - -struct resource standard_io_resources[] = { - { .name = "dma1", .start = 0x00, .end = 0x1f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "pic1", .start = 0x20, .end = 0x21, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "timer0", .start = 0x40, .end = 0x43, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "timer1", .start = 0x50, .end = 0x53, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "keyboard", .start = 0x60, .end = 0x6f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "dma page reg", .start = 0x80, .end = 0x8f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "pic2", .start = 0xa0, .end = 0xa1, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "dma2", 
.start = 0xc0, .end = 0xdf, - .flags = IORESOURCE_BUSY | IORESOURCE_IO }, - { .name = "fpu", .start = 0xf0, .end = 0xff, - .flags = IORESOURCE_BUSY | IORESOURCE_IO } -}; - -#define STANDARD_IO_RESOURCES \ - (sizeof standard_io_resources / sizeof standard_io_resources[0]) - -#define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM) - -struct resource data_resource = { - .name = "Kernel data", - .start = 0, - .end = 0, - .flags = IORESOURCE_RAM, -}; -struct resource code_resource = { - .name = "Kernel code", - .start = 0, - .end = 0, - .flags = IORESOURCE_RAM, -}; - -#define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM) - -#ifdef CONFIG_XEN_PRIVILEGED_GUEST -static struct resource system_rom_resource = { - .name = "System ROM", - .start = 0xf0000, - .end = 0xfffff, - .flags = IORESOURCE_ROM, -}; - -static struct resource extension_rom_resource = { - .name = "Extension ROM", - .start = 0xe0000, - .end = 0xeffff, - .flags = IORESOURCE_ROM, -}; - -static struct resource adapter_rom_resources[] = { - { .name = "Adapter ROM", .start = 0xc8000, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM }, - { .name = "Adapter ROM", .start = 0, .end = 0, - .flags = IORESOURCE_ROM } -}; -#endif - -#define ADAPTER_ROM_RESOURCES \ - (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0]) - -static struct resource video_rom_resource = { - .name = "Video ROM", - .start = 0xc0000, - .end = 0xc7fff, - .flags = IORESOURCE_ROM, -}; - -static struct resource video_ram_resource = { - .name = "Video RAM area", - .start = 0xa0000, - .end = 0xbffff, - .flags = IORESOURCE_RAM, -}; - -#ifdef CONFIG_XEN_PRIVILEGED_GUEST -#define romsignature(x) (*(unsigned short *)(x) == 0xaa55) - -static int __init romchecksum(unsigned char *rom, unsigned long length) -{ - unsigned char *p, sum = 0; - - for (p = rom; p < rom + length; p++) - sum += *p; - return sum == 0; -} - -static void __init probe_roms(void) -{ - unsigned long start, length, upper; - unsigned char *rom; - int i; - - /* video rom */ - upper = adapter_rom_resources[0].start; - for (start = video_rom_resource.start; start < upper; start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - video_rom_resource.start = start; - - /* 0 < length <= 0x7f * 512, historically */ - length = rom[2] * 512; - - /* if checksum okay, trust length byte */ - if (length && romchecksum(rom, length)) - video_rom_resource.end = start + length - 1; - - request_resource(&iomem_resource, &video_rom_resource); - break; - } - - start = (video_rom_resource.end + 1 + 2047) & ~2047UL; - if (start < upper) - start = upper; - - /* system rom */ - request_resource(&iomem_resource, &system_rom_resource); - upper = system_rom_resource.start; - - /* check for extension rom (ignore length byte!) 
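To make the acceptance test concrete: romsignature() above requires the image to start with the little-endian marker 0xaa55 (bytes 0x55, 0xaa in memory), and romchecksum() requires all bytes of the declared length to sum to zero modulo 256. A self-contained version of the pair (toy 4-byte image; rom_ok() is an invented name):

    #include <stdio.h>

    /* Legacy option-ROM validity: 0x55 0xaa signature, then the bytes
     * of the declared length must sum to 0 mod 256, as in
     * romsignature()/romchecksum() above. */
    static int rom_ok(const unsigned char *rom, unsigned long len)
    {
        unsigned char sum = 0;

        if (rom[0] != 0x55 || rom[1] != 0xaa)
            return 0;
        for (unsigned long i = 0; i < len; i++)
            sum += rom[i];                   /* unsigned char wraps mod 256 */
        return sum == 0;
    }

    int main(void)
    {
        unsigned char img[4] = { 0x55, 0xaa, 0x01, 0x00 };
        img[3] = (unsigned char)-(0x55 + 0xaa + 0x01);   /* force sum == 0 */
        printf("valid: %d\n", rom_ok(img, sizeof img));  /* prints 1 */
        return 0;
    }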
*/ - rom = isa_bus_to_virt(extension_rom_resource.start); - if (romsignature(rom)) { - length = extension_rom_resource.end - extension_rom_resource.start + 1; - if (romchecksum(rom, length)) { - request_resource(&iomem_resource, &extension_rom_resource); - upper = extension_rom_resource.start; - } - } - - /* check for adapter roms on 2k boundaries */ - for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - /* 0 < length <= 0x7f * 512, historically */ - length = rom[2] * 512; - - /* but accept any length that fits if checksum okay */ - if (!length || start + length > upper || !romchecksum(rom, length)) - continue; - - adapter_rom_resources[i].start = start; - adapter_rom_resources[i].end = start + length - 1; - request_resource(&iomem_resource, &adapter_rom_resources[i]); - - start = adapter_rom_resources[i++].end & ~2047UL; - } -} -#endif - -/* - * Point at the empty zero page to start with. We map the real shared_info - * page as soon as fixmap is up and running. - */ -shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page; -EXPORT_SYMBOL(HYPERVISOR_shared_info); - -u32 *phys_to_machine_mapping, *pfn_to_mfn_frame_list; - -EXPORT_SYMBOL(phys_to_machine_mapping); - -DEFINE_PER_CPU(multicall_entry_t, multicall_list[8]); -DEFINE_PER_CPU(int, nr_multicall_ents); - -/* Raw start-of-day parameters from the hypervisor. */ -union xen_start_info_union xen_start_info_union; - -static __init void parse_cmdline_early (char ** cmdline_p) -{ - char c = ' ', *to = command_line, *from = COMMAND_LINE; - int len = 0, max_cmdline; - - if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE) - max_cmdline = COMMAND_LINE_SIZE; - memcpy(saved_command_line, xen_start_info.cmd_line, max_cmdline); - /* Save unparsed command line copy for /proc/cmdline */ - saved_command_line[max_cmdline-1] = '\0'; - - for (;;) { - if (c != ' ') - goto next_char; - -#ifdef CONFIG_SMP - /* - * If the BIOS enumerates physical processors before logical, - * maxcpus=N at enumeration-time can be used to disable HT. 
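All of the option handling in parse_cmdline_early() below follows one pattern: compare a fixed prefix with memcmp(), then parse whatever follows it, with numeric arguments pulled out via simple_strtoul(). A user-space miniature of the same dispatch (strtoul standing in for simple_strtoul; parse_option is an invented name):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Prefix-match dispatch in the style of parse_cmdline_early(). */
    static void parse_option(const char *from)
    {
        if (!memcmp(from, "maxcpus=", 8))
            printf("maxcpus -> %lu\n", strtoul(from + 8, NULL, 0));
        else if (!memcmp(from, "acpi=off", 8))
            printf("acpi disabled\n");
        else if (!memcmp(from, "mem=", 4))
            printf("mem option: %s\n", from + 4);
    }

    int main(void)
    {
        parse_option("maxcpus=2");      /* maxcpus -> 2  */
        parse_option("acpi=off");       /* acpi disabled */
        return 0;
    }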
- */ - else if (!memcmp(from, "maxcpus=", 8)) { - extern unsigned int maxcpus; - - maxcpus = simple_strtoul(from + 8, NULL, 0); - } -#endif -#ifdef CONFIG_ACPI_BOOT - /* "acpi=off" disables both ACPI table parsing and interpreter init */ - if (!memcmp(from, "acpi=off", 8)) - disable_acpi(); - - if (!memcmp(from, "acpi=force", 10)) { - /* add later when we do DMI horrors: */ - acpi_force = 1; - acpi_disabled = 0; - } - - /* acpi=ht just means: do ACPI MADT parsing - at bootup, but don't enable the full ACPI interpreter */ - if (!memcmp(from, "acpi=ht", 7)) { - if (!acpi_force) - disable_acpi(); - acpi_ht = 1; - } - else if (!memcmp(from, "pci=noacpi", 10)) - acpi_disable_pci(); - else if (!memcmp(from, "acpi=noirq", 10)) - acpi_noirq_set(); - - else if (!memcmp(from, "acpi_sci=edge", 13)) - acpi_sci_flags.trigger = 1; - else if (!memcmp(from, "acpi_sci=level", 14)) - acpi_sci_flags.trigger = 3; - else if (!memcmp(from, "acpi_sci=high", 13)) - acpi_sci_flags.polarity = 1; - else if (!memcmp(from, "acpi_sci=low", 12)) - acpi_sci_flags.polarity = 3; - - /* acpi=strict disables out-of-spec workarounds */ - else if (!memcmp(from, "acpi=strict", 11)) { - acpi_strict = 1; - } -#endif - -#if 0 - if (!memcmp(from, "nolapic", 7) || - !memcmp(from, "disableapic", 11)) - disable_apic = 1; - - if (!memcmp(from, "noapic", 6)) - skip_ioapic_setup = 1; - - if (!memcmp(from, "apic", 4)) { - skip_ioapic_setup = 0; - ioapic_force = 1; - } -#endif - - if (!memcmp(from, "mem=", 4)) - parse_memopt(from+4, &from); - -#ifdef CONFIG_DISCONTIGMEM - if (!memcmp(from, "numa=", 5)) - numa_setup(from+5); -#endif - -#ifdef CONFIG_GART_IOMMU - if (!memcmp(from,"iommu=",6)) { - iommu_setup(from+6); - } -#endif - - if (!memcmp(from,"oops=panic", 10)) - panic_on_oops = 1; - - if (!memcmp(from, "noexec=", 7)) - nonx_setup(from + 7); - - next_char: - c = *(from++); - if (!c) - break; - if (COMMAND_LINE_SIZE <= ++len) - break; - *(to++) = c; - } - *to = '\0'; - *cmdline_p = command_line; -} - -#ifndef CONFIG_DISCONTIGMEM -static void __init contig_initmem_init(void) -{ - unsigned long bootmap_size, bootmap; - - /* - * partially used pages are not usable - thus - * we are rounding upwards: - */ - - bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; - bootmap = start_pfn; - bootmap_size = init_bootmem(bootmap, end_pfn); - reserve_bootmem(bootmap, bootmap_size); - - free_bootmem(start_pfn << PAGE_SHIFT, (end_pfn - start_pfn) << PAGE_SHIFT); - printk("Registering memory for bootmem: from %lx, size = %lx\n", - start_pfn << PAGE_SHIFT, (end_pfn - start_pfn) << PAGE_SHIFT); - /* - * This should cover kernel_end - */ -#if 0 - reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) + - bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY)); -#endif - reserve_bootmem(0, (PFN_PHYS(start_pfn) + - bootmap_size + PAGE_SIZE-1)); - -} -#endif - -/* Use inline assembly to define this because the nops are defined - as inline assembly strings in the include files and we cannot - get them easily into strings. */ -asm("\t.data\nk8nops: " - K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 - K8_NOP7 K8_NOP8); - -extern unsigned char k8nops[]; -static unsigned char *k8_nops[ASM_NOP_MAX+1] = { - NULL, - k8nops, - k8nops + 1, - k8nops + 1 + 2, - k8nops + 1 + 2 + 3, - k8nops + 1 + 2 + 3 + 4, - k8nops + 1 + 2 + 3 + 4 + 5, - k8nops + 1 + 2 + 3 + 4 + 5 + 6, - k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, -}; - -/* Replace instructions with better alternatives for this CPU type. - - This runs before SMP is initialized to avoid SMP problems with - self modifying code. 
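The patching step itself is simple: copy the replacement sequence over the original instructions, then fill the leftover bytes with NOPs so the instruction stream stays well formed; that is what the k8_nops table above is for. A toy, self-contained version of the copy-and-pad step, using the single-byte 0x90 NOP rather than the kernel's multi-byte K8 NOPs:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        unsigned char insn[8] = { 0xde, 0xad, 0xbe, 0xef,
                                  0xde, 0xad, 0xbe, 0xef };   /* original */
        unsigned char repl[3] = { 0x0f, 0x1f, 0x00 };         /* shorter  */

        memcpy(insn, repl, sizeof repl);                      /* copy     */
        memset(insn + sizeof repl, 0x90,
               sizeof insn - sizeof repl);                    /* pad NOPs */

        for (size_t i = 0; i < sizeof insn; i++)
            printf("%02x ", insn[i]);              /* 0f 1f 00 90 90 ...  */
        printf("\n");
        return 0;
    }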
This implies that asymmetric systems where - APs have fewer capabilities than the boot processor are not handled. - In this case boot with "noreplacement". */ -void apply_alternatives(void *start, void *end) -{ - struct alt_instr *a; - int diff, i, k; - for (a = start; (void *)a < end; a++) { - if (!boot_cpu_has(a->cpuid)) - continue; - - BUG_ON(a->replacementlen > a->instrlen); - __inline_memcpy(a->instr, a->replacement, a->replacementlen); - diff = a->instrlen - a->replacementlen; - - /* Pad the rest with nops */ - for (i = a->replacementlen; diff > 0; diff -= k, i += k) { - k = diff; - if (k > ASM_NOP_MAX) - k = ASM_NOP_MAX; - __inline_memcpy(a->instr + i, k8_nops[k], k); - } - } -} - -static int no_replacement __initdata = 0; - -void __init alternative_instructions(void) -{ - extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; - if (no_replacement) - return; - apply_alternatives(__alt_instructions, __alt_instructions_end); -} - -static int __init noreplacement_setup(char *s) -{ - no_replacement = 1; - return 0; -} - -__setup("noreplacement", noreplacement_setup); - -#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) -struct edd edd; -#ifdef CONFIG_EDD_MODULE -EXPORT_SYMBOL(edd); -#endif -/** - * copy_edd() - Copy the BIOS EDD information - * from boot_params into a safe place. - * - */ -static inline void copy_edd(void) -{ - memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature)); - memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info)); - edd.mbr_signature_nr = EDD_MBR_SIG_NR; - edd.edd_info_nr = EDD_NR; -} -#else -static inline void copy_edd(void) -{ -} -#endif - -#if 0 -#define EBDA_ADDR_POINTER 0x40E -static void __init reserve_ebda_region(void) -{ - unsigned int addr; - /** - * there is a real-mode segmented pointer pointing to the - * 4K EBDA area at 0x40E - */ - addr = *(unsigned short *)phys_to_virt(EBDA_ADDR_POINTER); - addr <<= 4; - if (addr) - reserve_bootmem_generic(addr, PAGE_SIZE); -} -#endif - -/* - * Guest physical starts from 0. 
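One more note on the disabled reserve_ebda_region() above: it is a textbook real-mode pointer decode. The word at physical address 0x40E holds a segment value, and a real-mode segment maps to a physical address by multiplying by 16, hence the left shift by 4. Worked through with a typical value:

    #include <stdio.h>

    int main(void)
    {
        unsigned int segment = 0x9fc0;        /* sample word read at 0x40E */
        unsigned int phys = segment << 4;     /* segment * 16 = 0x9fc00    */
        printf("EBDA at 0x%x (%u KB)\n", phys, phys / 1024);  /* 639 KB    */
        return 0;
    }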
- */ - -unsigned long __init xen_end_of_ram(void) -{ - unsigned long max_end_pfn = xen_start_info.nr_pages; - - if ( xen_override_max_pfn < max_end_pfn) - xen_override_max_pfn = max_end_pfn; - - return xen_override_max_pfn; -} - -static void __init print_memory_map(char *who) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - early_printk(" %s: %016Lx - %016Lx ", who, - e820.map[i].addr, - e820.map[i].addr + e820.map[i].size); - switch (e820.map[i].type) { - case E820_RAM: early_printk("(usable)\n"); - break; - case E820_RESERVED: - early_printk("(reserved)\n"); - break; - case E820_ACPI: - early_printk("(ACPI data)\n"); - break; - case E820_NVS: - early_printk("(ACPI NVS)\n"); - break; - default: early_printk("type %u\n", e820.map[i].type); - break; - } - } -} - -void __init setup_arch(char **cmdline_p) -{ - unsigned long low_mem_size; - int i, j; - physdev_op_t op; - -#if 0 - ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); -#else - ROOT_DEV = MKDEV(RAMDISK_MAJOR,0); -#endif - drive_info = DRIVE_INFO; - -#ifdef CONFIG_XEN_PHYSDEV_ACCESS - screen_info = SCREEN_INFO; -#endif - edid_info = EDID_INFO; - aux_device_present = AUX_DEVICE_INFO; - saved_video_mode = SAVED_VIDEO_MODE; - bootloader_type = LOADER_TYPE; - -#ifdef CONFIG_BLK_DEV_RAM - rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK; - rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0); - rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0); -#endif -/* register_console(&xen_console); */ - -#ifdef CONFIG_XEN_PHYSDEV_ACCESS - /* This is drawn from a dump from vgacon:startup in standard Linux. */ - screen_info.orig_video_mode = 3; - screen_info.orig_video_isVGA = 1; - screen_info.orig_video_lines = 25; - screen_info.orig_video_cols = 80; - screen_info.orig_video_ega_bx = 3; - screen_info.orig_video_points = 16; -#endif - ARCH_SETUP - print_memory_map(machine_specific_memory_setup()); - - /* copy_edd(); */ - - if (!MOUNT_ROOT_RDONLY) - root_mountflags &= ~MS_RDONLY; - init_mm.start_code = (unsigned long) &_text; - init_mm.end_code = (unsigned long) &_etext; - init_mm.end_data = (unsigned long) &_edata; -/* init_mm.brk = (unsigned long) &_end; */ - init_mm.brk = start_pfn << PAGE_SHIFT; - - -#if 0 /* XEN: This is nonsense: kernel may not even be contiguous in RAM. */ - code_resource.start = virt_to_phys(&_text); - code_resource.end = virt_to_phys(&_etext)-1; - data_resource.start = virt_to_phys(&_etext); - data_resource.end = virt_to_phys(&_edata)-1; -#endif - parse_cmdline_early(cmdline_p); - - early_identify_cpu(&boot_cpu_data); - - /* - * partially used pages are not usable - thus - * we are rounding upwards: - */ -#if 0 - end_pfn = e820_end_of_ram(); -#else - end_pfn = xen_end_of_ram(); -#endif - - check_efer(); - - init_memory_mapping(0, (end_pfn << PAGE_SHIFT)); - -#ifdef CONFIG_ACPI_NUMA - /* - * Parse SRAT to discover nodes. - */ - acpi_numa_init(); -#endif - -#ifdef CONFIG_DISCONTIGMEM - numa_initmem_init(0, end_pfn); -#else - contig_initmem_init(); -#endif - - /* Reserve direct mapping and shared info etc. */ -// reserve_bootmem_generic(table_start << PAGE_SHIFT, (table_end + 1 - table_start) << PAGE_SHIFT); - -// reserve_bootmem_generic(0, (table_end + 1) << PAGE_SHIFT); - - /* reserve kernel */ -// kernel_end = round_up(__pa_symbol(&_end),PAGE_SIZE); - -#if 0 - /* - * reserve physical page 0 - it's a special BIOS page on many boxes, - * enabling clean reboots, SMP operation, laptop functions. 
- */ - reserve_bootmem_generic(0, PAGE_SIZE); -#endif - - /* reserve ebda region */ -/* reserve_ebda_region(); */ - -#ifdef CONFIG_SMP - /* - * But first pinch a few for the stack/trampoline stuff - * FIXME: Don't need the extra page at 4K, but need to fix - * trampoline before removing it. (see the GDT stuff) - */ - reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE); - - /* Reserve SMP trampoline */ - reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE); -#endif - -#ifdef CONFIG_ACPI_SLEEP - /* - * Reserve low memory region for sleep support. - */ - acpi_reserve_bootmem(); -#endif -#ifdef CONFIG_BLK_DEV_INITRD - if (xen_start_info.mod_start) { - if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) { - /*reserve_bootmem_generic(INITRD_START, INITRD_SIZE);*/ - initrd_start = INITRD_START + PAGE_OFFSET; - initrd_end = initrd_start+INITRD_SIZE; - initrd_below_start_ok = 1; - } else { - printk(KERN_ERR "initrd extends beyond end of memory " - "(0x%08lx > 0x%08lx)\ndisabling initrd\n", - (unsigned long)(INITRD_START + INITRD_SIZE), - (unsigned long)(end_pfn << PAGE_SHIFT)); - initrd_start = 0; - } - } -#endif - paging_init(); -#ifdef CONFIG_X86_LOCAL_APIC - /* - * Find and reserve possible boot-time SMP configuration: - */ - find_smp_config(); -#endif - /* Make sure we have a large enough P->M table. */ - if (end_pfn > xen_start_info.nr_pages) { - phys_to_machine_mapping = alloc_bootmem( - max_pfn * sizeof(unsigned long)); - memset(phys_to_machine_mapping, ~0, - max_pfn * sizeof(unsigned long)); - memcpy(phys_to_machine_mapping, - (unsigned long *)xen_start_info.mfn_list, - xen_start_info.nr_pages * sizeof(unsigned long)); - free_bootmem( - __pa(xen_start_info.mfn_list), - PFN_PHYS(PFN_UP(xen_start_info.nr_pages * - sizeof(unsigned long)))); - } - - pfn_to_mfn_frame_list = alloc_bootmem(PAGE_SIZE); - - for ( i=0, j=0; i < end_pfn; i+=(PAGE_SIZE/sizeof(unsigned long)), j++ ) - { - pfn_to_mfn_frame_list[j] = - virt_to_machine(&phys_to_machine_mapping[i]) >> PAGE_SHIFT; - } - -#if 0 - check_ioapic(); -#endif - -#ifdef CONFIG_ACPI_BOOT - /* - * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). - * Call this early for SRAT node setup. - */ - acpi_boot_table_init(); - - /* - * Read APIC and some other early information from ACPI tables. - */ - acpi_boot_init(); -#endif -#ifdef CONFIG_X86_LOCAL_APIC - /* - * get boot-time SMP configuration: - */ - if (smp_found_config) - get_smp_config(); -#ifndef CONFIG_XEN - init_apic_mappings(); -#endif -#endif - - /* XXX Disable irqdebug until we have a way to avoid interrupt - * conflicts. */ -/* noirqdebug_setup(""); */ - -#ifdef CONFIG_XEN_PRIVILEGED_GUEST - /* - * Request address space for all standard RAM and ROM resources - * and also for regions reported as reserved by the e820. - */ - probe_roms(); -#endif -/* e820_reserve_resources(); */ - - request_resource(&iomem_resource, &video_ram_resource); - - { - unsigned i; - /* request I/O space for devices used on all i[345]86 PCs */ - for (i = 0; i < STANDARD_IO_RESOURCES; i++) - request_resource(&ioport_resource, &standard_io_resources[i]); - } - - /* Will likely break when you have unassigned resources with more - than 4GB memory and bridges that don't support more than 4GB. - Doing it properly would require to use pci_alloc_consistent - in this case. 
*/ - low_mem_size = ((end_pfn << PAGE_SHIFT) + 0xfffff) & ~0xfffff; - if (low_mem_size > pci_mem_start) - pci_mem_start = low_mem_size; - -#ifdef CONFIG_GART_IOMMU - iommu_hole_init(); -#endif - - op.cmd = PHYSDEVOP_SET_IOPL; - op.u.set_iopl.iopl = current->thread.io_pl = 1; - HYPERVISOR_physdev_op(&op); - - if (xen_start_info.flags & SIF_INITDOMAIN) { - if (!(xen_start_info.flags & SIF_PRIVILEGED)) - panic("Xen granted us console access " - "but not privileged status"); - -#ifdef CONFIG_VT -#if defined(CONFIG_VGA_CONSOLE) - conswitchp = &vga_con; -#elif defined(CONFIG_DUMMY_CONSOLE) - conswitchp = &dummy_con; -#endif -#endif - } else { -#ifdef CONFIG_XEN_PRIVILEGED_GUEST - extern const struct consw xennull_con; - extern int console_use_vt; -#if defined(CONFIG_VGA_CONSOLE) - /* disable VGA driver */ - ORIG_VIDEO_ISVGA = VIDEO_TYPE_VLFB; -#endif - conswitchp = &xennull_con; - console_use_vt = 0; -#endif - } -} - -static int __init get_model_name(struct cpuinfo_x86 *c) -{ - unsigned int *v; - - if (c->x86_cpuid_level < 0x80000004) - return 0; - - v = (unsigned int *) c->x86_model_id; - cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); - cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); - cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); - c->x86_model_id[48] = 0; - return 1; -} - - -static void __init display_cacheinfo(struct cpuinfo_x86 *c) -{ - unsigned int n, dummy, eax, ebx, ecx, edx; - - n = c->x86_cpuid_level; - - if (n >= 0x80000005) { - cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); - c->x86_cache_size=(ecx>>24)+(edx>>24); - /* On K8 L1 TLB is inclusive, so don't count it */ - c->x86_tlbsize = 0; - } - - if (n >= 0x80000006) { - cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); - ecx = cpuid_ecx(0x80000006); - c->x86_cache_size = ecx >> 16; - c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); - - printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", - c->x86_cache_size, ecx & 0xFF); - } - - if (n >= 0x80000007) - cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power); - if (n >= 0x80000008) { - cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - } -} - - -static int __init init_amd(struct cpuinfo_x86 *c) -{ - int r; - int level; -#ifdef CONFIG_NUMA - int cpu; -#endif - - /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; - 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ - clear_bit(0*32+31, &c->x86_capability); - - /* C-stepping K8? */ - level = cpuid_eax(1); - if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) - set_bit(X86_FEATURE_K8_C, &c->x86_capability); - - r = get_model_name(c); - if (!r) { - switch (c->x86) { - case 15: - /* Should distinguish Models here, but this is only - a fallback anyways. */ - strcpy(c->x86_model_id, "Hammer"); - break; - } - } - display_cacheinfo(c); - - if (c->x86_cpuid_level >= 0x80000008) { - c->x86_num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; - if (c->x86_num_cores & (c->x86_num_cores - 1)) - c->x86_num_cores = 1; - -#ifdef CONFIG_NUMA - /* On a dual core setup the lower bits of apic id - distingush the cores. Fix up the CPU<->node mappings - here based on that. - Assumes number of cores is a power of two. - When using SRAT use mapping from SRAT. 
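The fix-up applied just below recovers the package (node) number by shifting the core-index bits out of the APIC ID; with a power-of-two core count, hweight32(num_cores - 1) is exactly the number of those bits. A self-contained illustration (popcount32 is a stand-in for the kernel's hweight32):

    #include <stdio.h>

    /* Count set bits, like the kernel's hweight32(). */
    static int popcount32(unsigned int x)
    {
        int n = 0;
        for (; x; x &= x - 1)
            n++;
        return n;
    }

    int main(void)
    {
        int num_cores = 2;                      /* power of two, as assumed */
        int core_bits = popcount32(num_cores - 1);

        /* APIC IDs 0,1 share node 0; 2,3 share node 1; ... */
        for (int apicid = 0; apicid < 4; apicid++)
            printf("apicid %d -> node %d\n", apicid, apicid >> core_bits);
        return 0;
    }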
*/ - cpu = c->x86_apicid; - if (acpi_numa <= 0 && c->x86_num_cores > 1) { - cpu_to_node[cpu] = cpu >> hweight32(c->x86_num_cores - 1); - if (!node_online(cpu_to_node[cpu])) - cpu_to_node[cpu] = first_node(node_online_map); - } - printk(KERN_INFO "CPU %d(%d) -> Node %d\n", - cpu, c->x86_num_cores, cpu_to_node[cpu]); -#endif - } - - return r; -} - -static void __init detect_ht(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - u32 eax, ebx, ecx, edx; - int index_lsb, index_msb, tmp; - int cpu = smp_processor_id(); - - if (!cpu_has(c, X86_FEATURE_HT)) - return; - - cpuid(1, &eax, &ebx, &ecx, &edx); - smp_num_siblings = (ebx & 0xff0000) >> 16; - - if (smp_num_siblings == 1) { - printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); - } else if (smp_num_siblings > 1) { - index_lsb = 0; - index_msb = 31; - /* - * At this point we only support two siblings per - * processor package. - */ - if (smp_num_siblings > NR_CPUS) { - printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings); - smp_num_siblings = 1; - return; - } - tmp = smp_num_siblings; - while ((tmp & 1) == 0) { - tmp >>=1 ; - index_lsb++; - } - tmp = smp_num_siblings; - while ((tmp & 0x80000000 ) == 0) { - tmp <<=1 ; - index_msb--; - } - if (index_lsb != index_msb ) - index_msb++; - phys_proc_id[cpu] = phys_pkg_id(index_msb); - - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - phys_proc_id[cpu]); - } -#endif -} - -static void __init sched_cmp_hack(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - /* AMD dual core looks like HT but isn't really. Hide it from the - scheduler. This works around problems with the domain scheduler. - Also probably gives slightly better scheduling and disables - SMT nice which is harmful on dual core. - TBD tune the domain scheduler for dual core. */ - if (c->x86_vendor == X86_VENDOR_AMD && cpu_has(c, X86_FEATURE_CMP_LEGACY)) - smp_num_siblings = 1; -#endif -} - -static void __init init_intel(struct cpuinfo_x86 *c) -{ - /* Cache sizes */ - unsigned n; - - init_intel_cacheinfo(c); - n = c->x86_cpuid_level; - if (n >= 0x80000008) { - unsigned eax = cpuid_eax(0x80000008); - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - } - - if (c->x86 == 15) - c->x86_cache_alignment = c->x86_clflush_size * 2; -} - -void __init get_cpu_vendor(struct cpuinfo_x86 *c) -{ - char *v = c->x86_vendor_id; - - if (!strcmp(v, "AuthenticAMD")) - c->x86_vendor = X86_VENDOR_AMD; - else if (!strcmp(v, "GenuineIntel")) - c->x86_vendor = X86_VENDOR_INTEL; - else - c->x86_vendor = X86_VENDOR_UNKNOWN; -} - -struct cpu_model_info { - int vendor; - int family; - char *model_names[16]; -}; - -/* Do some early cpuid on the boot CPU to get some parameter that are - needed before check_bugs. Everything advanced is in identify_cpu - below. */ -void __init early_identify_cpu(struct cpuinfo_x86 *c) -{ - u32 tfms; - - c->loops_per_jiffy = loops_per_jiffy; - c->x86_cache_size = -1; - c->x86_vendor = X86_VENDOR_UNKNOWN; - c->x86_model = c->x86_mask = 0; /* So far unknown... */ - c->x86_vendor_id[0] = '\0'; /* Unset */ - c->x86_model_id[0] = '\0'; /* Unset */ - c->x86_clflush_size = 64; - c->x86_cache_alignment = c->x86_clflush_size; - c->x86_num_cores = 1; - c->x86_apicid = c == &boot_cpu_data ? 
0 : c - cpu_data; - c->x86_cpuid_level = 0; - memset(&c->x86_capability, 0, sizeof c->x86_capability); - - /* Get vendor name */ - cpuid(0x00000000, (unsigned int *)&c->cpuid_level, - (unsigned int *)&c->x86_vendor_id[0], - (unsigned int *)&c->x86_vendor_id[8], - (unsigned int *)&c->x86_vendor_id[4]); - - get_cpu_vendor(c); - - /* Initialize the standard set of capabilities */ - /* Note that the vendor-specific code below might override */ - - /* Intel-defined flags: level 0x00000001 */ - if (c->cpuid_level >= 0x00000001) { - __u32 misc; - cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4], - &c->x86_capability[0]); - c->x86 = (tfms >> 8) & 0xf; - c->x86_model = (tfms >> 4) & 0xf; - c->x86_mask = tfms & 0xf; - if (c->x86 == 0xf) { - c->x86 += (tfms >> 20) & 0xff; - c->x86_model += ((tfms >> 16) & 0xF) << 4; - } - if (c->x86_capability[0] & (1<<19)) - c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; - c->x86_apicid = misc >> 24; - } else { - /* Have CPUID level 0 only - unheard of */ - c->x86 = 4; - } -} - -/* - * This does the hard work of actually picking apart the CPU stuff... - */ -void __init identify_cpu(struct cpuinfo_x86 *c) -{ - int i; - u32 xlvl; - - early_identify_cpu(c); - - /* AMD-defined flags: level 0x80000001 */ - xlvl = cpuid_eax(0x80000000); - c->x86_cpuid_level = xlvl; - if ((xlvl & 0xffff0000) == 0x80000000) { - if (xlvl >= 0x80000001) { - c->x86_capability[1] = cpuid_edx(0x80000001); - c->x86_capability[5] = cpuid_ecx(0x80000001); - } - if (xlvl >= 0x80000004) - get_model_name(c); /* Default name */ - } - - /* Transmeta-defined flags: level 0x80860001 */ - xlvl = cpuid_eax(0x80860000); - if ((xlvl & 0xffff0000) == 0x80860000) { - /* Don't set x86_cpuid_level here for now to not confuse. */ - if (xlvl >= 0x80860001) - c->x86_capability[2] = cpuid_edx(0x80860001); - } - - /* - * Vendor-specific initialization. In this section we - * canonicalize the feature flags, meaning if there are - * features a certain CPU supports which CPUID doesn't - * tell us, CPUID claiming incorrect flags, or other bugs, - * we handle them here. - * - * At the end of this section, c->x86_capability better - * indicate the features this CPU genuinely supports! - */ - switch (c->x86_vendor) { - case X86_VENDOR_AMD: - init_amd(c); - break; - - case X86_VENDOR_INTEL: - init_intel(c); - break; - - case X86_VENDOR_UNKNOWN: - default: - display_cacheinfo(c); - break; - } - - select_idle_routine(c); - detect_ht(c); - sched_cmp_hack(c); - - /* - * On SMP, boot_cpu_data holds the common feature set between - * all CPUs; so make sure that we indicate which features are - * common between the CPUs. The first time this routine gets - * executed, c == &boot_cpu_data. - */ - if (c != &boot_cpu_data) { - /* AND the already accumulated flags with these */ - for (i = 0 ; i < NCAPINTS ; i++) - boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; - } - -#ifdef CONFIG_X86_MCE - mcheck_init(c); -#endif -#ifdef CONFIG_NUMA - if (c != &boot_cpu_data) - numa_add_cpu(c - cpu_data); -#endif -} - - -void __init print_cpu_info(struct cpuinfo_x86 *c) -{ - if (c->x86_model_id[0]) - printk("%s", c->x86_model_id); - - if (c->x86_mask || c->cpuid_level >= 0) - printk(" stepping %02x\n", c->x86_mask); - else - printk("\n"); -} - -/* - * Get CPU information for use by the procfs. - */ - -static int show_cpuinfo(struct seq_file *m, void *v) -{ - struct cpuinfo_x86 *c = v; - - /* - * These flag bits must match the definitions in <asm/cpufeature.h>. 
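Back in early_identify_cpu() above, the family/model/stepping decode from CPUID leaf 1 EAX is worth a worked example, because family 0xf (and only family 0xf, in this code) spills into the extended family and model fields. Standalone, with a sample value:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t tfms = 0x00000f48;              /* sample leaf-1 EAX (a K8) */

        unsigned family   = (tfms >> 8) & 0xf;
        unsigned model    = (tfms >> 4) & 0xf;
        unsigned stepping = tfms & 0xf;

        if (family == 0xf) {                     /* extended fields kick in */
            family += (tfms >> 20) & 0xff;
            model  += ((tfms >> 16) & 0xf) << 4;
        }
        /* prints: family 15 model 4 stepping 8 */
        printf("family %u model %u stepping %u\n", family, model, stepping);
        return 0;
    }

The sample value is the same eax reading that init_amd() earlier tests when flagging C-stepping K8 parts.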
- * NULL means this bit is undefined or reserved; either way it doesn't - * have meaning as far as Linux is concerned. Note that it's important - * to realize there is a difference between this table and CPUID -- if - * applications want to get the raw CPUID data, they should access - * /dev/cpu/<cpu_nr>/cpuid instead. - */ - static char *x86_cap_flags[] = { - /* Intel-defined */ - "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", - "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", - "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", - "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", NULL, - - /* AMD-defined */ - "pni", NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL, - NULL, "fxsr_opt", NULL, NULL, NULL, "lm", "3dnowext", "3dnow", - - /* Transmeta-defined */ - "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Other (Linux-defined) */ - "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Intel-defined (#2) */ - "pni", NULL, NULL, "monitor", "ds_cpl", NULL, NULL, "est", - "tm2", NULL, "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* AMD-defined (#2) */ - "lahf_lm", "cmp_legacy", NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL - }; - static char *x86_power_flags[] = { - "ts", /* temperature sensor */ - "fid", /* frequency id control */ - "vid", /* voltage id control */ - "ttp", /* thermal trip */ - }; - - -#ifdef CONFIG_SMP - if (!cpu_online(c-cpu_data)) - return 0; -#endif - - seq_printf(m,"processor\t: %u\n" - "vendor_id\t: %s\n" - "cpu family\t: %d\n" - "model\t\t: %d\n" - "model name\t: %s\n", - (unsigned)(c-cpu_data), - c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", - c->x86, - (int)c->x86_model, - c->x86_model_id[0] ? 
c->x86_model_id : "unknown"); - - if (c->x86_mask || c->cpuid_level >= 0) - seq_printf(m, "stepping\t: %d\n", c->x86_mask); - else - seq_printf(m, "stepping\t: unknown\n"); - - if (cpu_has(c,X86_FEATURE_TSC)) { - seq_printf(m, "cpu MHz\t\t: %u.%03u\n", - cpu_khz / 1000, (cpu_khz % 1000)); - } - - /* Cache size */ - if (c->x86_cache_size >= 0) - seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); - -#ifdef CONFIG_SMP - seq_printf(m, "physical id\t: %d\n", phys_proc_id[c - cpu_data]); - seq_printf(m, "siblings\t: %d\n", c->x86_num_cores * smp_num_siblings); -#endif - - seq_printf(m, - "fpu\t\t: yes\n" - "fpu_exception\t: yes\n" - "cpuid level\t: %d\n" - "wp\t\t: yes\n" - "flags\t\t:", - c->cpuid_level); - - { - int i; - for ( i = 0 ; i < 32*NCAPINTS ; i++ ) - if ( test_bit(i, &c->x86_capability) && - x86_cap_flags[i] != NULL ) - seq_printf(m, " %s", x86_cap_flags[i]); - } - - seq_printf(m, "\nbogomips\t: %lu.%02lu\n", - c->loops_per_jiffy/(500000/HZ), - (c->loops_per_jiffy/(5000/HZ)) % 100); - - if (c->x86_tlbsize > 0) - seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); - seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size); - seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); - - seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", - c->x86_phys_bits, c->x86_virt_bits); - - seq_printf(m, "power management:"); - { - unsigned i; - for (i = 0; i < 32; i++) - if (c->x86_power & (1 << i)) { - if (i < ARRAY_SIZE(x86_power_flags)) - seq_printf(m, " %s", x86_power_flags[i]); - else - seq_printf(m, " [%d]", i); - } - } - seq_printf(m, "\n"); - - if (c->x86_num_cores > 1) - seq_printf(m, "cpu cores\t: %d\n", c->x86_num_cores); - - seq_printf(m, "\n\n"); - - return 0; -} - -static void *c_start(struct seq_file *m, loff_t *pos) -{ - return *pos < NR_CPUS ? cpu_data + *pos : NULL; -} - -static void *c_next(struct seq_file *m, void *v, loff_t *pos) -{ - ++*pos; - return c_start(m, pos); -} - -static void c_stop(struct seq_file *m, void *v) -{ -} - -struct seq_operations cpuinfo_op = { - .start =c_start, - .next = c_next, - .stop = c_stop, - .show = show_cpuinfo, -}; diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/setup64.c --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/setup64.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,344 +0,0 @@ -/* - * X86-64 specific CPU setup. - * Copyright (C) 1995 Linus Torvalds - * Copyright 2001, 2002, 2003 SuSE Labs / Andi Kleen. - * See setup.c for older changelog. 
- * $Id: setup64.c,v 1.12 2002/03/21 10:09:17 ak Exp $ - * - * Jun Nakajima <jun.nakajima@xxxxxxxxx> - * Modified for Xen - * - */ -#include <linux/config.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/bootmem.h> -#include <linux/bitops.h> -#include <asm/pda.h> -#include <asm/pgtable.h> -#include <asm/processor.h> -#include <asm/desc.h> -#include <asm/atomic.h> -#include <asm/mmu_context.h> -#include <asm/smp.h> -#include <asm/i387.h> -#include <asm/percpu.h> -#include <asm/mtrr.h> -#include <asm/proto.h> -#include <asm/mman.h> -#include <asm/numa.h> - -#include <asm-xen/hypervisor.h> - -char x86_boot_params[2048] __initdata = {0,}; - -cpumask_t cpu_initialized __initdata = CPU_MASK_NONE; - -struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned; - -extern struct task_struct init_task; - -extern unsigned char __per_cpu_start[], __per_cpu_end[]; - -extern struct desc_ptr cpu_gdt_descr[]; -struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table }; - -char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); - -unsigned long __supported_pte_mask = ~0UL; -static int do_not_nx __initdata = 0; - -/* noexec=on|off -Control non executable mappings for 64bit processes. - -on Enable(default) -off Disable -*/ -int __init nonx_setup(char *str) -{ - if (!strncmp(str, "on", 2)) { - __supported_pte_mask |= _PAGE_NX; - do_not_nx = 0; - } else if (!strncmp(str, "off", 3)) { - do_not_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } - return 0; -} -__setup("noexec=", nonx_setup); /* parsed early actually */ - -int force_personality32 = READ_IMPLIES_EXEC; - -/* noexec32=on|off -Control non executable heap for 32bit processes. -To control the stack too use noexec=off - -on PROT_READ does not imply PROT_EXEC for 32bit processes -off PROT_READ implies PROT_EXEC (default) -*/ -static int __init nonx32_setup(char *str) -{ - if (!strcmp(str, "on")) - force_personality32 &= ~READ_IMPLIES_EXEC; - else if (!strcmp(str, "off")) - force_personality32 |= READ_IMPLIES_EXEC; - return 0; -} -__setup("noexec32=", nonx32_setup); - -/* - * Great future plan: - * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. 
- * Always point %gs to its beginning - */ -void __init setup_per_cpu_areas(void) -{ - int i; - unsigned long size; - - /* Copy section for each CPU (we discard the original) */ - size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES); -#ifdef CONFIG_MODULES - if (size < PERCPU_ENOUGH_ROOM) - size = PERCPU_ENOUGH_ROOM; -#endif - - for (i = 0; i < NR_CPUS; i++) { - unsigned char *ptr; - - if (!NODE_DATA(cpu_to_node(i))) { - printk("cpu with no node %d, num_online_nodes %d\n", - i, num_online_nodes()); - ptr = alloc_bootmem(size); - } else { - ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size); - } - if (!ptr) - panic("Cannot allocate cpu data for CPU %d\n", i); - cpu_pda[i].data_offset = ptr - __per_cpu_start; - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - } -} - -void pda_init(int cpu) -{ - pgd_t *old_level4 = (pgd_t *)xen_start_info.pt_base; - struct x8664_pda *pda = &cpu_pda[cpu]; - - /* Setup up data that may be needed in __get_free_pages early */ - asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); - HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, - (unsigned long)(cpu_pda + cpu)); - - pda->me = pda; - pda->cpunumber = cpu; - pda->irqcount = -1; - pda->kernelstack = - (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; - pda->active_mm = &init_mm; - pda->mmu_state = 0; - pda->kernel_mode = 1; - - if (cpu == 0) { - memcpy((void *)init_level4_pgt, - (void *) xen_start_info.pt_base, PAGE_SIZE); - /* others are initialized in smpboot.c */ - pda->pcurrent = &init_task; - pda->irqstackptr = boot_cpu_stack; - make_page_readonly(init_level4_pgt); - make_page_readonly(init_level4_user_pgt); - make_page_readonly(level3_user_pgt); /* for vsyscall stuff */ - xen_pgd_pin(__pa_symbol(init_level4_user_pgt)); - xen_pud_pin(__pa_symbol(level3_user_pgt)); - set_pgd((pgd_t *)(init_level4_user_pgt + 511), - mk_kernel_pgd(__pa_symbol(level3_user_pgt))); - } else { - pda->irqstackptr = (char *) - __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); - if (!pda->irqstackptr) - panic("cannot allocate irqstack for cpu %d", cpu); - } - - xen_pt_switch(__pa(init_level4_pgt)); - xen_new_user_pt(__pa(init_level4_user_pgt)); - - if (cpu == 0) { - xen_pgd_unpin(__pa(old_level4)); -#if 0 - early_printk("__pa: %x, <machine_phys> old_level 4 %x\n", - __pa(xen_start_info.pt_base), - pfn_to_mfn(__pa(old_level4) >> PAGE_SHIFT)); -#endif -// make_page_writable(old_level4); -// free_bootmem(__pa(old_level4), PAGE_SIZE); - } - - pda->irqstackptr += IRQSTACKSIZE-64; -} - -char boot_exception_stacks[N_EXCEPTION_STACKS * EXCEPTION_STKSZ] -__attribute__((section(".bss.page_aligned"))); - -/* May not be marked __init: used by software suspend */ -void syscall_init(void) -{ -#ifdef CONFIG_IA32_EMULATION - syscall32_cpu_init (); -#endif -} - -void __init check_efer(void) -{ - unsigned long efer; - - /* rdmsrl(MSR_EFER, efer); */ - - /* - * At this point, Xen does not like the bit 63. - * So NX is not supported. Come back later. 
- */ - efer = 0; - - if (!(efer & EFER_NX) || do_not_nx) { - __supported_pte_mask &= ~_PAGE_NX; - } -} - -void __init cpu_gdt_init(struct desc_ptr *gdt_descr) -{ - unsigned long frames[16]; - unsigned long va; - int f; - - for (va = gdt_descr->address, f = 0; - va < gdt_descr->address + gdt_descr->size; - va += PAGE_SIZE, f++) { - frames[f] = virt_to_machine(va) >> PAGE_SHIFT; - make_page_readonly((void *)va); - } - if (HYPERVISOR_set_gdt(frames, gdt_descr->size / - sizeof (struct desc_struct))) - BUG(); -} - - -/* - * cpu_init() initializes state that is per-CPU. Some data is already - * initialized (naturally) in the bootstrap process, such as the GDT - * and IDT. We reload them nevertheless, this function acts as a - * 'CPU state barrier', nothing should get across. - * A lot of state is already set up in PDA init. - */ -void __init cpu_init (void) -{ -#ifdef CONFIG_SMP - int cpu = stack_smp_processor_id(); -#else - int cpu = smp_processor_id(); -#endif - struct tss_struct *t = &per_cpu(init_tss, cpu); - unsigned long v; - char *estacks = NULL; - struct task_struct *me; - int i; - - /* CPU 0 is initialised in head64.c */ - if (cpu != 0) { - pda_init(cpu); - } else - estacks = boot_exception_stacks; - - me = current; - - if (test_and_set_bit(cpu, &cpu_initialized)) - panic("CPU#%d already initialized!\n", cpu); - - printk("Initializing CPU#%d\n", cpu); - -#if 0 - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); -#endif - /* - * Initialize the per-CPU GDT with the boot GDT, - * and set up the GDT descriptor: - */ - if (cpu) { - memcpy(cpu_gdt_table[cpu], cpu_gdt_table[0], GDT_SIZE); - } - - cpu_gdt_descr[cpu].size = GDT_SIZE; - cpu_gdt_descr[cpu].address = (unsigned long)cpu_gdt_table[cpu]; -#if 0 - asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu])); - asm volatile("lidt %0" :: "m" (idt_descr)); -#endif - cpu_gdt_init(&cpu_gdt_descr[cpu]); - -#if 0 - memcpy(me->thread.tls_array, cpu_gdt_table[cpu], GDT_ENTRY_TLS_ENTRIES * 8); - -#endif - memcpy(me->thread.tls_array, &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN], - GDT_ENTRY_TLS_ENTRIES * 8); - - /* - * Delete NT - */ - - asm volatile("pushfq ; popq %%rax ; btr $14,%%rax ; pushq %%rax ; popfq" ::: "eax"); - - if (cpu == 0) - early_identify_cpu(&boot_cpu_data); - - syscall_init(); - - barrier(); - check_efer(); - - /* - * set up and load the per-CPU TSS - */ - for (v = 0; v < N_EXCEPTION_STACKS; v++) { - if (cpu) { - estacks = (char *)__get_free_pages(GFP_ATOMIC, - EXCEPTION_STACK_ORDER); - if (!estacks) - panic("Cannot allocate exception stack %ld %d\n", - v, cpu); - } - estacks += EXCEPTION_STKSZ; - t->ist[v] = (unsigned long)estacks; - } - - t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap); - /* - * <= is required because the CPU will access up to - * 8 bits beyond the end of the IO permission bitmap. 
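[Aside: the loop just below fills IO_BITMAP_LONGS + 1 words, so the extra word also covers the byte the CPU may read past the bitmap's end. A standalone sketch of the sizing, assuming the usual 65536-port bitmap:

#include <stdio.h>

#define IO_BITMAP_BITS  65536                  /* one bit per I/O port */
#define IO_BITMAP_LONGS (IO_BITMAP_BITS / 64)  /* 1024 on 64-bit */

int main(void)
{
        /* One extra word so the CPU's one-byte overrun stays in bounds. */
        static unsigned long io_bitmap[IO_BITMAP_LONGS + 1];

        for (int i = 0; i <= IO_BITMAP_LONGS; i++)
                io_bitmap[i] = ~0UL;           /* all ports fault (deny all) */

        printf("%zu bitmap bytes + %zu bytes of padding\n",
               sizeof(io_bitmap) - sizeof(io_bitmap[0]), sizeof(io_bitmap[0]));
        return 0;
}
]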
- */ - for (i = 0; i <= IO_BITMAP_LONGS; i++) - t->io_bitmap[i] = ~0UL; - - atomic_inc(&init_mm.mm_count); - me->active_mm = &init_mm; - if (me->mm) - BUG(); - enter_lazy_tlb(&init_mm, me); - - load_LDT(&init_mm.context); - - /* - * Clear all 6 debug registers: - */ -#define CD(register) HYPERVISOR_set_debugreg(register, 0) - - CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7); - -#undef CD - fpu_init(); - -#ifdef CONFIG_NUMA - numa_add_cpu(cpu); -#endif -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/signal.c --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/signal.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,493 +0,0 @@ -/* - * linux/arch/x86_64/kernel/signal.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs - * - * 1997-11-28 Modified for POSIX.1b signals by Richard Henderson - * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes - * 2000-2002 x86-64 support by Andi Kleen - * - * $Id: signal.c,v 1.18 2001/10/17 22:30:37 ak Exp $ - */ - -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/smp_lock.h> -#include <linux/kernel.h> -#include <linux/signal.h> -#include <linux/errno.h> -#include <linux/wait.h> -#include <linux/ptrace.h> -#include <linux/unistd.h> -#include <linux/stddef.h> -#include <linux/personality.h> -#include <linux/compiler.h> -#include <asm/ucontext.h> -#include <asm/uaccess.h> -#include <asm/i387.h> -#include <asm/proto.h> - -/* #define DEBUG_SIG 1 */ - -#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) - -void ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs * regs); -void ia32_setup_frame(int sig, struct k_sigaction *ka, - sigset_t *set, struct pt_regs * regs); - -asmlinkage long -sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize, struct pt_regs *regs) -{ - sigset_t saveset, newset; - - /* XXX: Don't preclude handling different sized sigset_t's. */ - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - - if (copy_from_user(&newset, unewset, sizeof(newset))) - return -EFAULT; - sigdelsetmask(&newset, ~_BLOCKABLE); - - spin_lock_irq(¤t->sighand->siglock); - saveset = current->blocked; - current->blocked = newset; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); -#ifdef DEBUG_SIG - printk("rt_sigsuspend savset(%lx) newset(%lx) regs(%p) rip(%lx)\n", - saveset, newset, regs, regs->rip); -#endif - regs->rax = -EINTR; - while (1) { - current->state = TASK_INTERRUPTIBLE; - schedule(); - if (do_signal(regs, &saveset)) - return -EINTR; - } -} - -asmlinkage long -sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, - struct pt_regs *regs) -{ - return do_sigaltstack(uss, uoss, regs->rsp); -} - - -/* - * Do a signal return; undo the signal stack. 
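[Aside: sys_rt_sigsuspend() above installs a temporary mask and sleeps until a handled signal arrives, restoring the old mask afterwards. The same atomic unblock-and-wait pattern from user space (a minimal sketch using the POSIX API):

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t got_usr1;

static void on_usr1(int sig) { (void)sig; got_usr1 = 1; }

int main(void)
{
        sigset_t block, waitmask;
        struct sigaction sa;

        sa.sa_handler = on_usr1;
        sa.sa_flags = 0;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGUSR1, &sa, NULL);

        /* Block SIGUSR1 so it can't slip in before we sleep. */
        sigemptyset(&block);
        sigaddset(&block, SIGUSR1);
        sigprocmask(SIG_BLOCK, &block, NULL);

        printf("kill -USR1 %d to continue\n", (int)getpid());

        /* Atomically swap in an empty mask and sleep: no lost wakeup. */
        sigemptyset(&waitmask);
        while (!got_usr1)
                sigsuspend(&waitmask);   /* returns -1/EINTR per signal */

        printf("resumed\n");
        return 0;
}
]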
- */ - -struct rt_sigframe -{ - char *pretcode; - struct ucontext uc; - struct siginfo info; -}; - -static int -restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *prax) -{ - unsigned int err = 0; - - /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; - -#define COPY(x) err |= __get_user(regs->x, &sc->x) - - COPY(rdi); COPY(rsi); COPY(rbp); COPY(rsp); COPY(rbx); - COPY(rdx); COPY(rcx); COPY(rip); - COPY(r8); - COPY(r9); - COPY(r10); - COPY(r11); - COPY(r12); - COPY(r13); - COPY(r14); - COPY(r15); - - { - unsigned int tmpflags; - err |= __get_user(tmpflags, &sc->eflags); - regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); - regs->orig_rax = -1; /* disable syscall checks */ - } - - { - struct _fpstate __user * buf; - err |= __get_user(buf, &sc->fpstate); - - if (buf) { - if (verify_area(VERIFY_READ, buf, sizeof(*buf))) - goto badframe; - err |= restore_i387(buf); - } else { - struct task_struct *me = current; - if (used_math()) { - clear_fpu(me); - clear_used_math(); - } - } - } - - err |= __get_user(*prax, &sc->rax); - return err; - -badframe: - return 1; -} - -asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) -{ - struct rt_sigframe __user *frame; - sigset_t set; - unsigned long eax; - - frame = (struct rt_sigframe __user *)(regs->rsp - 8); - if (verify_area(VERIFY_READ, frame, sizeof(*frame))) { - goto badframe; - } - if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) { - goto badframe; - } - - sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(¤t->sighand->siglock); - current->blocked = set; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) { - goto badframe; - } - -#ifdef DEBUG_SIG - printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs.rip,regs.rsp,frame,eax); -#endif - - if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT) - goto badframe; - - return eax; - -badframe: - signal_fault(regs,frame,"sigreturn"); - return 0; -} - -/* - * Set up a signal frame. - */ - -static inline int -setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me) -{ - int err = 0; - unsigned long eflags; - - err |= __put_user(0, &sc->gs); - err |= __put_user(0, &sc->fs); - - err |= __put_user(regs->rdi, &sc->rdi); - err |= __put_user(regs->rsi, &sc->rsi); - err |= __put_user(regs->rbp, &sc->rbp); - err |= __put_user(regs->rsp, &sc->rsp); - err |= __put_user(regs->rbx, &sc->rbx); - err |= __put_user(regs->rdx, &sc->rdx); - err |= __put_user(regs->rcx, &sc->rcx); - err |= __put_user(regs->rax, &sc->rax); - err |= __put_user(regs->r8, &sc->r8); - err |= __put_user(regs->r9, &sc->r9); - err |= __put_user(regs->r10, &sc->r10); - err |= __put_user(regs->r11, &sc->r11); - err |= __put_user(regs->r12, &sc->r12); - err |= __put_user(regs->r13, &sc->r13); - err |= __put_user(regs->r14, &sc->r14); - err |= __put_user(regs->r15, &sc->r15); - err |= __put_user(me->thread.trap_no, &sc->trapno); - err |= __put_user(me->thread.error_code, &sc->err); - err |= __put_user(regs->rip, &sc->rip); - eflags = regs->eflags; - if (current->ptrace & PT_PTRACED) { - eflags &= ~TF_MASK; - } - err |= __put_user(eflags, &sc->eflags); - err |= __put_user(mask, &sc->oldmask); - err |= __put_user(me->thread.cr2, &sc->cr2); - - return err; -} - -/* - * Determine which stack to use.. 
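[Aside: restore_sigcontext() above accumulates every __get_user() result into one err value and token-pastes register names with COPY() so each field is spelled once. The pattern in isolation, with a stand-in for __get_user() (all types here are illustrative):

#include <stdio.h>

struct ctx  { long rdi, rsi, rip; };   /* illustrative sigcontext slice */
struct regs { long rdi, rsi, rip; };

/* Stand-in for __get_user(): returns 0 on success, nonzero on fault. */
static int get_long(long *dst, const long *src)
{
        *dst = *src;
        return 0;
}

int main(void)
{
        struct ctx sc = { 1, 2, 0x400a2cL };
        struct regs regs;
        int err = 0;

        /* Paste one name into both sides, OR-ing failures together. */
#define COPY(x) (err |= get_long(&regs.x, &sc.x))
        COPY(rdi); COPY(rsi); COPY(rip);
#undef COPY

        printf("err=%d rip=%#lx\n", err, regs.rip);
        return 0;
}
]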
- */ - -static void __user * -get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) -{ - unsigned long rsp; - - /* Default to using normal stack - redzone*/ - rsp = regs->rsp - 128; - - /* This is the X/Open sanctioned signal stack switching. */ - /* RED-PEN: redzone on that stack? */ - if (ka->sa.sa_flags & SA_ONSTACK) { - if (sas_ss_flags(rsp) == 0) - rsp = current->sas_ss_sp + current->sas_ss_size; - } - - return (void __user *)round_down(rsp - size, 16); -} - -static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs * regs) -{ - struct rt_sigframe __user *frame; - struct _fpstate __user *fp = NULL; - int err = 0; - struct task_struct *me = current; - - if (used_math()) { - fp = get_stack(ka, regs, sizeof(struct _fpstate)); - frame = (void __user *)round_down((unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; - - if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate))) { - goto give_sigsegv; - } - - if (save_i387(fp) < 0) - err |= -1; - } else { - frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; - } - - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) { - goto give_sigsegv; - } - - if (ka->sa.sa_flags & SA_SIGINFO) { - err |= copy_siginfo_to_user(&frame->info, info); - if (err) { - goto give_sigsegv; - } - } - - /* Create the ucontext. */ - err |= __put_user(0, &frame->uc.uc_flags); - err |= __put_user(0, &frame->uc.uc_link); - err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(regs->rsp), - &frame->uc.uc_stack.ss_flags); - err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); - err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate); - if (sizeof(*set) == 16) { - __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); - __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); - } else { - err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - } - - /* Set up to return from userspace. If provided, use a stub - already in userspace. */ - /* x86-64 should always use SA_RESTORER. */ - if (ka->sa.sa_flags & SA_RESTORER) { - err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); - } else { - /* could use a vstub here */ - goto give_sigsegv; - } - - if (err) { - goto give_sigsegv; - } - -#ifdef DEBUG_SIG - printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax); -#endif - - /* Set up registers for signal handler */ - { - struct exec_domain *ed = current_thread_info()->exec_domain; - if (unlikely(ed && ed->signal_invmap && sig < 32)) - sig = ed->signal_invmap[sig]; - } - regs->rdi = sig; - /* In case the signal handler was declared without prototypes */ - regs->rax = 0; - - /* This also works for non SA_SIGINFO handlers because they expect the - next argument after the signal number on the stack. 
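[Aside: get_stack() above steps over the 128-byte red zone below rsp and rounds the frame down to 16 bytes as the x86-64 ABI requires. The arithmetic on its own (round_down() spelled out; the sample values are made up):

#include <stdio.h>

#define REDZONE 128        /* x86-64 ABI: bytes below rsp a leaf may use */

#define round_down(x, a) ((x) & ~((unsigned long)(a) - 1))

static unsigned long frame_addr(unsigned long rsp, unsigned long size)
{
        rsp -= REDZONE;                     /* don't clobber the red zone */
        return round_down(rsp - size, 16);  /* ABI stack alignment */
}

int main(void)
{
        unsigned long f = frame_addr(0x7fffffffe360UL, 0x1d8);
        printf("frame at %#lx, 16-aligned: %s\n", f, f % 16 ? "no" : "yes");
        return 0;
}
]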
*/ - regs->rsi = (unsigned long)&frame->info; - regs->rdx = (unsigned long)&frame->uc; - regs->rip = (unsigned long) ka->sa.sa_handler; - - regs->rsp = (unsigned long)frame; - - set_fs(USER_DS); - if (regs->eflags & TF_MASK) { - if ((current->ptrace & (PT_PTRACED | PT_DTRACE)) == (PT_PTRACED | PT_DTRACE)) { - ptrace_notify(SIGTRAP); - } else { - regs->eflags &= ~TF_MASK; - } - } - -#ifdef DEBUG_SIG - printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", - current->comm, current->pid, frame, regs->rip, frame->pretcode); -#endif - - return; - -give_sigsegv: - force_sigsegv(sig, current); -} - -/* - * OK, we're invoking a handler - */ - -static void -handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, - sigset_t *oldset, struct pt_regs *regs) -{ -#ifdef DEBUG_SIG - printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n", current->pid, sig, - regs->rip, regs->rsp, regs); -#endif - - /* Are we from a system call? */ - if ((long)regs->orig_rax >= 0) { - /* If so, check system call restarting.. */ - switch (regs->rax) { - case -ERESTART_RESTARTBLOCK: - case -ERESTARTNOHAND: - regs->rax = -EINTR; - break; - - case -ERESTARTSYS: - if (!(ka->sa.sa_flags & SA_RESTART)) { - regs->rax = -EINTR; - break; - } - /* fallthrough */ - case -ERESTARTNOINTR: - regs->rax = regs->orig_rax; - regs->rip -= 2; - } - } - -#ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_IA32)) { - if (ka->sa.sa_flags & SA_SIGINFO) - ia32_setup_rt_frame(sig, ka, info, oldset, regs); - else - ia32_setup_frame(sig, ka, oldset, regs); - } else -#endif - setup_rt_frame(sig, ka, info, oldset, regs); - - if (!(ka->sa.sa_flags & SA_NODEFER)) { - spin_lock_irq(¤t->sighand->siglock); - sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); - sigaddset(¤t->blocked,sig); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - } -} - -/* - * Note that 'init' is a special process: it doesn't get signals it doesn't - * want to handle. Thus you cannot kill init even with a SIGKILL even by - * mistake. - */ -int do_signal(struct pt_regs *regs, sigset_t *oldset) -{ - struct k_sigaction ka; - siginfo_t info; - int signr; - - /* - * We want the common case to go fast, which - * is why we may in certain cases get here from - * kernel mode. Just return without doing anything - * if so. - */ - if ((regs->cs & 2) != 2) { - return 1; - } - - if (try_to_freeze(0)) - goto no_signal; - - if (!oldset) - oldset = ¤t->blocked; - - signr = get_signal_to_deliver(&info, &ka, regs, NULL); - if (signr > 0) { - /* Reenable any watchpoints before delivering the - * signal to user space. The processor register will - * have been cleared if the watchpoint triggered - * inside the kernel. - */ - if (current->thread.debugreg7) { - HYPERVISOR_set_debugreg(7, - current->thread.debugreg7); - } - - /* Whee! Actually deliver the signal. */ - handle_signal(signr, &info, &ka, oldset, regs); - return 1; - } - - no_signal: - /* Did we come from a system call? 
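[Aside: the switch in handle_signal() above decides whether to rewind rip by two bytes (the length of the syscall instruction) and retry, or to hand the handler -EINTR instead. The decision table as plain C (restart codes shown with their usual kernel-internal values):

#include <stdio.h>

/* Kernel-private restart codes, never visible to user space. */
#define ERESTARTSYS           512
#define ERESTARTNOINTR        513
#define ERESTARTNOHAND        514
#define ERESTART_RESTARTBLOCK 516

/* 1: rewind rip and retry the syscall before running the handler. */
static int restart_before_handler(long rax, int sa_restart)
{
        switch (rax) {
        case -ERESTARTNOINTR:
                return 1;               /* restart unconditionally */
        case -ERESTARTSYS:
                return sa_restart;      /* only for SA_RESTART handlers */
        case -ERESTARTNOHAND:
        case -ERESTART_RESTARTBLOCK:
        default:
                return 0;               /* handler sees -EINTR */
        }
}

int main(void)
{
        printf("%d %d %d\n",
               restart_before_handler(-ERESTARTSYS, 1),
               restart_before_handler(-ERESTARTSYS, 0),
               restart_before_handler(-ERESTARTNOINTR, 0));
        return 0;
}
]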
*/ - if ((long)regs->orig_rax >= 0) { - /* Restart the system call - no handlers present */ - long res = regs->rax; - if (res == -ERESTARTNOHAND || - res == -ERESTARTSYS || - res == -ERESTARTNOINTR) { - regs->rax = regs->orig_rax; - regs->rip -= 2; - } - if (regs->rax == (unsigned long)-ERESTART_RESTARTBLOCK) { - regs->rax = __NR_restart_syscall; - regs->rip -= 2; - } - } - return 0; -} - -void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, __u32 thread_info_flags) -{ -#ifdef DEBUG_SIG - printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%lx pending:%lx\n", - thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current)); -#endif - - /* Pending single-step? */ - if (thread_info_flags & _TIF_SINGLESTEP) { - regs->eflags |= TF_MASK; - clear_thread_flag(TIF_SINGLESTEP); - } - - /* deal with pending signal delivery */ - if (thread_info_flags & _TIF_SIGPENDING) - do_signal(regs,oldset); -} - -void signal_fault(struct pt_regs *regs, void __user *frame, char *where) -{ - struct task_struct *me = current; - if (exception_trace) - printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n", - me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax); - - force_sig(SIGSEGV, me); -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/smp.c --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/smp.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,411 +0,0 @@ -/* - * Intel SMP support routines. - * - * (c) 1995 Alan Cox, Building #3 <alan@xxxxxxxxxx> - * (c) 1998-99, 2000 Ingo Molnar <mingo@xxxxxxxxxx> - * (c) 2002,2003 Andi Kleen, SuSE Labs. - * - * This code is released under the GNU General Public License version 2 or - * later. - */ - -#include <linux/init.h> - -#include <linux/mm.h> -#include <linux/irq.h> -#include <linux/delay.h> -#include <linux/spinlock.h> -#include <linux/smp_lock.h> -#include <linux/smp.h> -#include <linux/kernel_stat.h> -#include <linux/mc146818rtc.h> -#include <linux/interrupt.h> - -#include <asm/mtrr.h> -#include <asm/pgalloc.h> -#include <asm/tlbflush.h> -#include <asm/mach_apic.h> -#include <asm/proto.h> - -/* - * Smarter SMP flushing macros. - * c/o Linus Torvalds. - * - * These mean you can really definitely utterly forget about - * writing to user space from interrupts. (Its not allowed anyway). - * - * Optimizations Manfred Spraul <manfred@xxxxxxxxxxxxxxxx> - */ - -static cpumask_t flush_cpumask; -static struct mm_struct * flush_mm; -static unsigned long flush_va; -static DEFINE_SPINLOCK(tlbstate_lock); -#define FLUSH_ALL 0xffffffff - -/* - * We cannot call mmdrop() because we are in interrupt context, - * instead update mm->cpu_vm_mask. - */ -static inline void leave_mm (unsigned long cpu) -{ - if (read_pda(mmu_state) == TLBSTATE_OK) - BUG(); - clear_bit(cpu, &read_pda(active_mm)->cpu_vm_mask); - __flush_tlb(); -} - -/* - * - * The flush IPI assumes that a thread switch happens in this order: - * [cpu0: the cpu that switches] - * 1) switch_mm() either 1a) or 1b) - * 1a) thread switch to a different mm - * 1a1) clear_bit(cpu, &old_mm->cpu_vm_mask); - * Stop ipi delivery for the old mm. This is not synchronized with - * the other cpus, but smp_invalidate_interrupt ignore flush ipis - * for the wrong mm, and in the worst case we perform a superfluous - * tlb flush. - * 1a2) set cpu mmu_state to TLBSTATE_OK - * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 - * was in lazy tlb mode. 
- * 1a3) update cpu active_mm - * Now cpu0 accepts tlb flushes for the new mm. - * 1a4) set_bit(cpu, &new_mm->cpu_vm_mask); - * Now the other cpus will send tlb flush ipis. - * 1a4) change cr3. - * 1b) thread switch without mm change - * cpu active_mm is correct, cpu0 already handles - * flush ipis. - * 1b1) set cpu mmu_state to TLBSTATE_OK - * 1b2) test_and_set the cpu bit in cpu_vm_mask. - * Atomically set the bit [other cpus will start sending flush ipis], - * and test the bit. - * 1b3) if the bit was 0: leave_mm was called, flush the tlb. - * 2) switch %%esp, ie current - * - * The interrupt must handle 2 special cases: - * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. - * - the cpu performs speculative tlb reads, i.e. even if the cpu only - * runs in kernel space, the cpu could load tlb entries for user space - * pages. - * - * The good news is that cpu mmu_state is local to each cpu, no - * write/read ordering problems. - */ - -/* - * TLB flush IPI: - * - * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. - * 2) Leave the mm if we are in the lazy tlb mode. - */ - -asmlinkage void smp_invalidate_interrupt (void) -{ - unsigned long cpu; - - cpu = get_cpu(); - - if (!cpu_isset(cpu, flush_cpumask)) - goto out; - /* - * This was a BUG() but until someone can quote me the - * line from the intel manual that guarantees an IPI to - * multiple CPUs is retried _only_ on the erroring CPUs - * its staying as a return - * - * BUG(); - */ - - if (flush_mm == read_pda(active_mm)) { - if (read_pda(mmu_state) == TLBSTATE_OK) { - if (flush_va == FLUSH_ALL) - local_flush_tlb(); - else - __flush_tlb_one(flush_va); - } else - leave_mm(cpu); - } - ack_APIC_irq(); - cpu_clear(cpu, flush_cpumask); - -out: - put_cpu_no_resched(); -} - -static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, - unsigned long va) -{ - cpumask_t tmp; - /* - * A couple of (to be removed) sanity checks: - * - * - we do not send IPIs to not-yet booted CPUs. - * - current CPU must not be in mask - * - mask must exist :) - */ - BUG_ON(cpus_empty(cpumask)); - cpus_and(tmp, cpumask, cpu_online_map); - BUG_ON(!cpus_equal(tmp, cpumask)); - BUG_ON(cpu_isset(smp_processor_id(), cpumask)); - if (!mm) - BUG(); - - /* - * I'm not happy about this global shared spinlock in the - * MM hot path, but we'll see how contended it is. - * Temporarily this turns IRQs off, so that lockups are - * detected by the NMI watchdog. - */ - spin_lock(&tlbstate_lock); - - flush_mm = mm; - flush_va = va; - cpus_or(flush_cpumask, cpumask, flush_cpumask); - - /* - * We have to send the IPI only to - * CPUs affected. - */ - send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR); - - while (!cpus_empty(flush_cpumask)) - mb(); /* nothing. 
lockup detection does not belong here */; - - flush_mm = NULL; - flush_va = 0; - spin_unlock(&tlbstate_lock); -} - -void flush_tlb_current_task(void) -{ - struct mm_struct *mm = current->mm; - cpumask_t cpu_mask; - - preempt_disable(); - cpu_mask = mm->cpu_vm_mask; - cpu_clear(smp_processor_id(), cpu_mask); - - local_flush_tlb(); - if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); - preempt_enable(); -} - -void flush_tlb_mm (struct mm_struct * mm) -{ - cpumask_t cpu_mask; - - preempt_disable(); - cpu_mask = mm->cpu_vm_mask; - cpu_clear(smp_processor_id(), cpu_mask); - - if (current->active_mm == mm) { - if (current->mm) - local_flush_tlb(); - else - leave_mm(smp_processor_id()); - } - if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); - - preempt_enable(); -} - -void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) -{ - struct mm_struct *mm = vma->vm_mm; - cpumask_t cpu_mask; - - preempt_disable(); - cpu_mask = mm->cpu_vm_mask; - cpu_clear(smp_processor_id(), cpu_mask); - - if (current->active_mm == mm) { - if(current->mm) - __flush_tlb_one(va); - else - leave_mm(smp_processor_id()); - } - - if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, va); - - preempt_enable(); -} - -static void do_flush_tlb_all(void* info) -{ - unsigned long cpu = smp_processor_id(); - - __flush_tlb_all(); - if (read_pda(mmu_state) == TLBSTATE_LAZY) - leave_mm(cpu); -} - -void flush_tlb_all(void) -{ - on_each_cpu(do_flush_tlb_all, NULL, 1, 1); -} - -void smp_kdb_stop(void) -{ - send_IPI_allbutself(KDB_VECTOR); -} - -/* - * this function sends a 'reschedule' IPI to another CPU. - * it goes straight through and wastes no time serializing - * anything. Worst case is that we lose a reschedule ... - */ - -void smp_send_reschedule(int cpu) -{ - send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); -} - -/* - * Structure and data for smp_call_function(). This is designed to minimise - * static memory requirements. It also looks cleaner. - */ -static DEFINE_SPINLOCK(call_lock); - -struct call_data_struct { - void (*func) (void *info); - void *info; - atomic_t started; - atomic_t finished; - int wait; -}; - -static struct call_data_struct * call_data; - -/* - * this function sends a 'generic call function' IPI to all other CPUs - * in the system. - */ -static void __smp_call_function (void (*func) (void *info), void *info, - int nonatomic, int wait) -{ - struct call_data_struct data; - int cpus = num_online_cpus()-1; - - if (!cpus) - return; - - data.func = func; - data.info = info; - atomic_set(&data.started, 0); - data.wait = wait; - if (wait) - atomic_set(&data.finished, 0); - - call_data = &data; - wmb(); - /* Send a message to all other CPUs and wait for them to respond */ - send_IPI_allbutself(CALL_FUNCTION_VECTOR); - - /* Wait for response */ - while (atomic_read(&data.started) != cpus) - cpu_relax(); - - if (!wait) - return; - - while (atomic_read(&data.finished) != cpus) - cpu_relax(); -} - -/* - * smp_call_function - run a function on all other CPUs. - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @nonatomic: currently unused. - * @wait: If true, wait (atomically) until function has completed on other - * CPUs. - * - * Returns 0 on success, else a negative status code. Does not return until - * remote CPUs are nearly ready to execute func or are or have executed. 
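[Aside: __smp_call_function() above publishes call_data, raises the IPI, then spins on the started and finished counters that each target increments from smp_call_function_interrupt(). A user-space sketch of that two-counter handshake, with threads in place of CPUs (compile with -pthread; names are illustrative):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define OTHERS 3   /* num_online_cpus() - 1 */

static void (*func)(void *);
static void *info;
static atomic_int ipi_sent, started, finished;

static void greet(void *arg) { printf("cpu says: %s\n", (const char *)arg); }

static void *other_cpu(void *unused)
{
        (void)unused;
        while (!atomic_load(&ipi_sent))
                ;                              /* wait for the "IPI" */
        void (*f)(void *) = func;              /* grab the data... */
        void *i = info;
        atomic_fetch_add(&started, 1);         /* ...then ack, as above */
        f(i);
        atomic_fetch_add(&finished, 1);        /* the wait == 1 ack */
        return NULL;
}

int main(void)
{
        pthread_t t[OTHERS];
        for (int i = 0; i < OTHERS; i++)
                pthread_create(&t[i], NULL, other_cpu, NULL);

        func = greet;                          /* publish call_data */
        info = "hello";
        atomic_store(&ipi_sent, 1);            /* send_IPI_allbutself() */

        while (atomic_load(&started) != OTHERS)
                ;                              /* all grabbed the data */
        while (atomic_load(&finished) != OTHERS)
                ;                              /* all ran func (wait=1) */

        for (int i = 0; i < OTHERS; i++)
                pthread_join(t[i], NULL);
        return 0;
}
]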
- * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. - * Actually there are a few legal cases, like panic. - */ -int smp_call_function (void (*func) (void *info), void *info, int nonatomic, - int wait) -{ - spin_lock(&call_lock); - __smp_call_function(func,info,nonatomic,wait); - spin_unlock(&call_lock); - return 0; -} - -void smp_stop_cpu(void) -{ - /* - * Remove this CPU: - */ - cpu_clear(smp_processor_id(), cpu_online_map); - local_irq_disable(); - disable_local_APIC(); - local_irq_enable(); -} - -static void smp_really_stop_cpu(void *dummy) -{ - smp_stop_cpu(); - for (;;) - asm("hlt"); -} - -void smp_send_stop(void) -{ - int nolock = 0; - if (reboot_force) - return; - /* Don't deadlock on the call lock in panic */ - if (!spin_trylock(&call_lock)) { - /* ignore locking because we have paniced anyways */ - nolock = 1; - } - __smp_call_function(smp_really_stop_cpu, NULL, 0, 0); - if (!nolock) - spin_unlock(&call_lock); - smp_stop_cpu(); -} - -/* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. - */ -asmlinkage void smp_reschedule_interrupt(void) -{ - ack_APIC_irq(); -} - -asmlinkage void smp_call_function_interrupt(void) -{ - void (*func) (void *info) = call_data->func; - void *info = call_data->info; - int wait = call_data->wait; - - ack_APIC_irq(); - /* - * Notify initiating CPU that I've grabbed the data and am - * about to execute the function - */ - mb(); - atomic_inc(&call_data->started); - /* - * At this point the info structure may be out of scope unless wait==1 - */ - irq_enter(); - (*func)(info); - irq_exit(); - if (wait) { - mb(); - atomic_inc(&call_data->finished); - } -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/smpboot.c --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/smpboot.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,958 +0,0 @@ -/* - * x86 SMP booting functions - * - * (c) 1995 Alan Cox, Building #3 <alan@xxxxxxxxxx> - * (c) 1998, 1999, 2000 Ingo Molnar <mingo@xxxxxxxxxx> - * Copyright 2001 Andi Kleen, SuSE Labs. - * - * Much of the core SMP work is based on previous work by Thomas Radke, to - * whom a great many thanks are extended. - * - * Thanks to Intel for making available several different Pentium, - * Pentium Pro and Pentium-II/Xeon MP machines. - * Original development of Linux SMP code supported by Caldera. - * - * This code is released under the GNU General Public License version 2 or - * later. - * - * Fixes - * Felix Koop : NR_CPUS used properly - * Jose Renau : Handle single CPU case. - * Alan Cox : By repeated request 8) - Total BogoMIP report. - * Greg Wright : Fix for kernel stacks panic. - * Erich Boleyn : MP v1.4 and additional changes. - * Matthias Sattler : Changes for 2.1 kernel map. - * Michel Lespinasse : Changes for 2.1 kernel map. - * Michael Chastain : Change trampoline.S to gnu as. - * Alan Cox : Dumb bug: 'B' step PPro's are fine - * Ingo Molnar : Added APIC timers, based on code - * from Jose Renau - * Ingo Molnar : various cleanups and rewrites - * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug. - * Maciej W. Rozycki : Bits for genuine 82489DX APICs - * Andi Kleen : Changed for SMP boot into long mode. - * Rusty Russell : Hacked into shape for new "hotplug" boot process. 
- */ - -#include <linux/config.h> -#include <linux/init.h> - -#include <linux/mm.h> -#include <linux/kernel_stat.h> -#include <linux/smp_lock.h> -#include <linux/irq.h> -#include <linux/bootmem.h> -#include <linux/thread_info.h> -#include <linux/module.h> - -#include <linux/delay.h> -#include <linux/mc146818rtc.h> -#include <asm/mtrr.h> -#include <asm/pgalloc.h> -#include <asm/desc.h> -#include <asm/kdebug.h> -#include <asm/tlbflush.h> -#include <asm/proto.h> - -/* Number of siblings per CPU package */ -int smp_num_siblings = 1; -/* Package ID of each logical CPU */ -u8 phys_proc_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; -EXPORT_SYMBOL(phys_proc_id); - -/* Bitmask of currently online CPUs */ -cpumask_t cpu_online_map; - -cpumask_t cpu_callin_map; -cpumask_t cpu_callout_map; -static cpumask_t smp_commenced_mask; - -/* Per CPU bogomips and other parameters */ -struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; - -/* Set when the idlers are all forked */ -int smp_threads_ready; - -cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; - -/* - * Trampoline 80x86 program as an array. - */ - -extern unsigned char trampoline_data []; -extern unsigned char trampoline_end []; - -/* - * Currently trivial. Write the real->protected mode - * bootstrap into the page concerned. The caller - * has made sure it's suitably aligned. - */ - -static unsigned long __init setup_trampoline(void) -{ - void *tramp = __va(SMP_TRAMPOLINE_BASE); - extern volatile __u32 tramp_gdt_ptr; - tramp_gdt_ptr = __pa_symbol(&cpu_gdt_table); - memcpy(tramp, trampoline_data, trampoline_end - trampoline_data); - return virt_to_phys(tramp); -} - -/* - * The bootstrap kernel entry code has set these up. Save them for - * a given CPU - */ - -static void __init smp_store_cpu_info(int id) -{ - struct cpuinfo_x86 *c = cpu_data + id; - - *c = boot_cpu_data; - identify_cpu(c); -} - -/* - * TSC synchronization. - * - * We first check whether all CPUs have their TSC's synchronized, - * then we print a warning if not, and always resync. - */ - -static atomic_t tsc_start_flag = ATOMIC_INIT(0); -static atomic_t tsc_count_start = ATOMIC_INIT(0); -static atomic_t tsc_count_stop = ATOMIC_INIT(0); -static unsigned long long tsc_values[NR_CPUS]; - -#define NR_LOOPS 5 - -extern unsigned int fast_gettimeoffset_quotient; - -static void __init synchronize_tsc_bp (void) -{ - int i; - unsigned long long t0; - unsigned long long sum, avg; - long long delta; - long one_usec; - int buggy = 0; - - printk(KERN_INFO "checking TSC synchronization across %u CPUs: ",num_booting_cpus()); - - one_usec = cpu_khz; - - atomic_set(&tsc_start_flag, 1); - wmb(); - - /* - * We loop a few times to get a primed instruction cache, - * then the last pass is more or less synchronized and - * the BP and APs set their cycle counters to zero all at - * once. This reduces the chance of having random offsets - * between the processors, and guarantees that the maximum - * delay between the cycle counters is never bigger than - * the latency of information-passing (cachelines) between - * two CPUs. 
- */ - for (i = 0; i < NR_LOOPS; i++) { - /* - * all APs synchronize but they loop on '== num_cpus' - */ - while (atomic_read(&tsc_count_start) != num_booting_cpus()-1) mb(); - atomic_set(&tsc_count_stop, 0); - wmb(); - /* - * this lets the APs save their current TSC: - */ - atomic_inc(&tsc_count_start); - - sync_core(); - rdtscll(tsc_values[smp_processor_id()]); - /* - * We clear the TSC in the last loop: - */ - if (i == NR_LOOPS-1) - write_tsc(0, 0); - - /* - * Wait for all APs to leave the synchronization point: - */ - while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1) mb(); - atomic_set(&tsc_count_start, 0); - wmb(); - atomic_inc(&tsc_count_stop); - } - - sum = 0; - for (i = 0; i < NR_CPUS; i++) { - if (cpu_isset(i, cpu_callout_map)) { - t0 = tsc_values[i]; - sum += t0; - } - } - avg = sum / num_booting_cpus(); - - sum = 0; - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_isset(i, cpu_callout_map)) - continue; - - delta = tsc_values[i] - avg; - if (delta < 0) - delta = -delta; - /* - * We report bigger than 2 microseconds clock differences. - */ - if (delta > 2*one_usec) { - long realdelta; - if (!buggy) { - buggy = 1; - printk("\n"); - } - realdelta = delta / one_usec; - if (tsc_values[i] < avg) - realdelta = -realdelta; - - printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", - i, realdelta); - } - - sum += delta; - } - if (!buggy) - printk("passed.\n"); -} - -static void __init synchronize_tsc_ap (void) -{ - int i; - - /* - * Not every cpu is online at the time - * this gets called, so we first wait for the BP to - * finish SMP initialization: - */ - while (!atomic_read(&tsc_start_flag)) mb(); - - for (i = 0; i < NR_LOOPS; i++) { - atomic_inc(&tsc_count_start); - while (atomic_read(&tsc_count_start) != num_booting_cpus()) mb(); - - sync_core(); - rdtscll(tsc_values[smp_processor_id()]); - if (i == NR_LOOPS-1) - write_tsc(0, 0); - - atomic_inc(&tsc_count_stop); - while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb(); - } -} -#undef NR_LOOPS - -static atomic_t init_deasserted; - -void __init smp_callin(void) -{ - int cpuid, phys_id; - unsigned long timeout; - - /* - * If waken up by an INIT in an 82489DX configuration - * we may get here before an INIT-deassert IPI reaches - * our local APIC. We have to wait for the IPI or we'll - * lock up on an APIC access. - */ - while (!atomic_read(&init_deasserted)); - - /* - * (This works even if the APIC is not enabled.) - */ - phys_id = GET_APIC_ID(apic_read(APIC_ID)); - cpuid = smp_processor_id(); - if (cpu_isset(cpuid, cpu_callin_map)) { - panic("smp_callin: phys CPU#%d, CPU#%d already present??\n", - phys_id, cpuid); - } - Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); - - /* - * STARTUP IPIs are fragile beasts as they might sometimes - * trigger some glue motherboard logic. Complete APIC bus - * silence for 1 second, this overestimates the time the - * boot CPU is spending to send the up to 2 STARTUP IPIs - * by a factor of two. This should be enough. - */ - - /* - * Waiting 2s total for startup (udelay is not yet working) - */ - timeout = jiffies + 2*HZ; - while (time_before(jiffies, timeout)) { - /* - * Has the boot CPU finished it's STARTUP sequence? - */ - if (cpu_isset(cpuid, cpu_callout_map)) - break; - rep_nop(); - } - - if (!time_before(jiffies, timeout)) { - panic("smp_callin: CPU%d started up but did not get a callout!\n", - cpuid); - } - - /* - * the boot CPU has finished the init stage and is spinning - * on callin_map until we finish. 
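[Aside: the BP pass above averages the counters every AP saved and reports any CPU more than two microseconds from the mean. The skew arithmetic in isolation, with made-up sample values:

#include <stdio.h>
#include <stdlib.h>

#define NCPUS 4

int main(void)
{
        const long long one_usec = 2400;   /* TSC ticks per usec at 2.4 GHz */
        long long tsc[NCPUS] = { 1000000, 1000100, 1015000, 999950 };
        long long sum = 0, avg;

        for (int i = 0; i < NCPUS; i++)
                sum += tsc[i];
        avg = sum / NCPUS;

        for (int i = 0; i < NCPUS; i++) {
                long long delta = llabs(tsc[i] - avg);
                if (delta > 2 * one_usec)
                        printf("CPU#%d has %lld usecs TSC skew\n",
                               i, delta / one_usec);
        }
        return 0;
}
]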
We are free to set up this - * CPU, first the APIC. (this is probably redundant on most - * boards) - */ - - Dprintk("CALLIN, before setup_local_APIC().\n"); - setup_local_APIC(); - - local_irq_enable(); - - /* - * Get our bogomips. - */ - calibrate_delay(); - Dprintk("Stack at about %p\n",&cpuid); - - disable_APIC_timer(); - - /* - * Save our processor parameters - */ - smp_store_cpu_info(cpuid); - - local_irq_disable(); - - /* - * Allow the master to continue. - */ - cpu_set(cpuid, cpu_callin_map); - - /* - * Synchronize the TSC with the BP - */ - if (cpu_has_tsc) - synchronize_tsc_ap(); -} - -int cpucount; - -/* - * Activate a secondary processor. - */ -void __init start_secondary(void) -{ - /* - * Dont put anything before smp_callin(), SMP - * booting is too fragile that we want to limit the - * things done here to the most necessary things. - */ - cpu_init(); - smp_callin(); - - /* otherwise gcc will move up the smp_processor_id before the cpu_init */ - barrier(); - - Dprintk("cpu %d: waiting for commence\n", smp_processor_id()); - while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) - rep_nop(); - - Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); - setup_secondary_APIC_clock(); - - Dprintk("cpu %d: enabling apic timer\n", smp_processor_id()); - - if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); - enable_NMI_through_LVT0(NULL); - enable_8259A_irq(0); - } - - - enable_APIC_timer(); - - /* - * low-memory mappings have been cleared, flush them from - * the local TLBs too. - */ - local_flush_tlb(); - - Dprintk("cpu %d eSetting cpu_online_map\n", smp_processor_id()); - cpu_set(smp_processor_id(), cpu_online_map); - wmb(); - - cpu_idle(); -} - -extern volatile unsigned long init_rsp; -extern void (*initial_code)(void); - -#if APIC_DEBUG -static inline void inquire_remote_apic(int apicid) -{ - unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; - char *names[] = { "ID", "VERSION", "SPIV" }; - int timeout, status; - - printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid); - - for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) { - printk("... APIC #%d %s: ", apicid, names[i]); - - /* - * Wait for idle. 
- */ - apic_wait_icr_idle(); - - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); - apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); - - timeout = 0; - do { - udelay(100); - status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK; - } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000); - - switch (status) { - case APIC_ICR_RR_VALID: - status = apic_read(APIC_RRR); - printk("%08x\n", status); - break; - default: - printk("failed\n"); - } - } -} -#endif - -static int __init wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip) -{ - unsigned long send_status = 0, accept_status = 0; - int maxlvt, timeout, num_starts, j; - - Dprintk("Asserting INIT.\n"); - - /* - * Turn INIT on target chip - */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); - - /* - * Send IPI - */ - apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT - | APIC_DM_INIT); - - Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); - - mdelay(10); - - Dprintk("Deasserting INIT.\n"); - - /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); - - /* Send IPI */ - apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); - - Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); - - atomic_set(&init_deasserted, 1); - - /* - * Should we send STARTUP IPIs ? - * - * Determine this based on the APIC version. - * If we don't have an integrated APIC, don't send the STARTUP IPIs. - */ - if (APIC_INTEGRATED(apic_version[phys_apicid])) - num_starts = 2; - else - num_starts = 0; - - /* - * Run STARTUP IPI loop. - */ - Dprintk("#startup loops: %d.\n", num_starts); - - maxlvt = get_maxlvt(); - - for (j = 1; j <= num_starts; j++) { - Dprintk("Sending STARTUP #%d.\n",j); - apic_read_around(APIC_SPIV); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - Dprintk("After apic_write.\n"); - - /* - * STARTUP IPI - */ - - /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); - - /* Boot on the stack */ - /* Kick the second */ - apic_write_around(APIC_ICR, APIC_DM_STARTUP - | (start_rip >> 12)); - - /* - * Give the other CPU some time to accept the IPI. - */ - udelay(300); - - Dprintk("Startup point 1.\n"); - - Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); - - /* - * Give the other CPU some time to accept the IPI. - */ - udelay(200); - /* - * Due to the Pentium erratum 3AP. - */ - if (maxlvt > 3) { - apic_read_around(APIC_SPIV); - apic_write(APIC_ESR, 0); - } - accept_status = (apic_read(APIC_ESR) & 0xEF); - if (send_status || accept_status) - break; - } - Dprintk("After Startup.\n"); - - if (send_status) - printk(KERN_ERR "APIC never delivered???\n"); - if (accept_status) - printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); - - return (send_status | accept_status); -} - -static void __init do_boot_cpu (int apicid) -{ - struct task_struct *idle; - unsigned long boot_error; - int timeout, cpu; - unsigned long start_rip; - - cpu = ++cpucount; - /* - * We can't use kernel_thread since we must avoid to - * reschedule the child. 
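[Aside: each ICR write in wakeup_secondary_via_INIT() above is followed by a bounded poll of the delivery-status bit rather than an unbounded spin. The polling idiom by itself, with the APIC read stubbed out:

#include <stdio.h>

#define APIC_ICR_BUSY 0x01000   /* delivery-status bit in the ICR */

/* Stub register: reports busy for the first couple of polls. */
static unsigned apic_read_icr(void)
{
        static int polls;
        return ++polls < 3 ? APIC_ICR_BUSY : 0;
}

int main(void)
{
        unsigned send_status;
        int timeout = 0;

        do {
                /* udelay(100) between polls in the real code */
                send_status = apic_read_icr() & APIC_ICR_BUSY;
        } while (send_status && timeout++ < 1000);

        if (send_status)
                printf("APIC never delivered???\n");
        else
                printf("delivered after %d polls\n", timeout + 1);
        return 0;
}
]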
- */ - idle = fork_idle(cpu); - if (IS_ERR(idle)) - panic("failed fork for CPU %d", cpu); - x86_cpu_to_apicid[cpu] = apicid; - - cpu_pda[cpu].pcurrent = idle; - - start_rip = setup_trampoline(); - - init_rsp = idle->thread.rsp; - per_cpu(init_tss,cpu).rsp0 = init_rsp; - initial_code = start_secondary; - clear_ti_thread_flag(idle->thread_info, TIF_FORK); - - printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid, - start_rip, init_rsp); - - /* - * This grunge runs the startup process for - * the targeted processor. - */ - - atomic_set(&init_deasserted, 0); - - Dprintk("Setting warm reset code and vector.\n"); - - CMOS_WRITE(0xa, 0xf); - local_flush_tlb(); - Dprintk("1.\n"); - *((volatile unsigned short *) phys_to_virt(0x469)) = start_rip >> 4; - Dprintk("2.\n"); - *((volatile unsigned short *) phys_to_virt(0x467)) = start_rip & 0xf; - Dprintk("3.\n"); - - /* - * Be paranoid about clearing APIC errors. - */ - if (APIC_INTEGRATED(apic_version[apicid])) { - apic_read_around(APIC_SPIV); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - } - - /* - * Status is now clean - */ - boot_error = 0; - - /* - * Starting actual IPI sequence... - */ - boot_error = wakeup_secondary_via_INIT(apicid, start_rip); - - if (!boot_error) { - /* - * allow APs to start initializing. - */ - Dprintk("Before Callout %d.\n", cpu); - cpu_set(cpu, cpu_callout_map); - Dprintk("After Callout %d.\n", cpu); - - /* - * Wait 5s total for a response - */ - for (timeout = 0; timeout < 50000; timeout++) { - if (cpu_isset(cpu, cpu_callin_map)) - break; /* It has booted */ - udelay(100); - } - - if (cpu_isset(cpu, cpu_callin_map)) { - /* number CPUs logically, starting from 1 (BSP is 0) */ - Dprintk("OK.\n"); - print_cpu_info(&cpu_data[cpu]); - Dprintk("CPU has booted.\n"); - } else { - boot_error = 1; - if (*((volatile unsigned char *)phys_to_virt(SMP_TRAMPOLINE_BASE)) - == 0xA5) - /* trampoline started but...? */ - printk("Stuck ??\n"); - else - /* trampoline code not run */ - printk("Not responding.\n"); -#if APIC_DEBUG - inquire_remote_apic(apicid); -#endif - } - } - if (boot_error) { - cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ - clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ - cpucount--; - x86_cpu_to_apicid[cpu] = BAD_APICID; - x86_cpu_to_log_apicid[cpu] = BAD_APICID; - } -} - -cycles_t cacheflush_time; -unsigned long cache_decay_ticks; - -static void smp_tune_scheduling (void) -{ - int cachesize; /* kB */ - unsigned long bandwidth = 1000; /* MB/s */ - /* - * Rough estimation for SMP scheduling, this is the number of - * cycles it takes for a fully memory-limited process to flush - * the SMP-local cache. - * - * (For a P5 this pretty much means we will choose another idle - * CPU almost always at wakeup time (this is due to the small - * L1 cache), on PIIs it's around 50-100 usecs, depending on - * the cache size) - */ - - if (!cpu_khz) { - /* - * this basically disables processor-affinity - * scheduling on SMP without a TSC. 
- */ - cacheflush_time = 0; - return; - } else { - cachesize = boot_cpu_data.x86_cache_size; - if (cachesize == -1) { - cachesize = 16; /* Pentiums, 2x8kB cache */ - bandwidth = 100; - } - - cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth; - } - - cache_decay_ticks = (long)cacheflush_time/cpu_khz * HZ / 1000; - - printk(KERN_INFO "per-CPU timeslice cutoff: %ld.%02ld usecs.\n", - (long)cacheflush_time/(cpu_khz/1000), - ((long)cacheflush_time*100/(cpu_khz/1000)) % 100); - printk(KERN_INFO "task migration cache decay timeout: %ld msecs.\n", - (cache_decay_ticks + 1) * 1000 / HZ); -} - -/* - * Cycle through the processors sending APIC IPIs to boot each. - */ - -static void __init smp_boot_cpus(unsigned int max_cpus) -{ - unsigned apicid, cpu, bit, kicked; - - nmi_watchdog_default(); - - /* - * Setup boot CPU information - */ - smp_store_cpu_info(0); /* Final full version of the data */ - printk(KERN_INFO "CPU%d: ", 0); - print_cpu_info(&cpu_data[0]); - - current_thread_info()->cpu = 0; - smp_tune_scheduling(); - - if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { - printk("weird, boot CPU (#%d) not listed by the BIOS.\n", - hard_smp_processor_id()); - physid_set(hard_smp_processor_id(), phys_cpu_present_map); - } - - /* - * If we couldn't find an SMP configuration at boot time, - * get out of here now! - */ - if (!smp_found_config) { - printk(KERN_NOTICE "SMP motherboard not detected.\n"); - io_apic_irqs = 0; - cpu_online_map = cpumask_of_cpu(0); - phys_cpu_present_map = physid_mask_of_physid(0); - if (APIC_init_uniprocessor()) - printk(KERN_NOTICE "Local APIC not detected." - " Using dummy APIC emulation.\n"); - goto smp_done; - } - - /* - * Should not be necessary because the MP table should list the boot - * CPU too, but we do it for the sake of robustness anyway. - */ - if (!physid_isset(boot_cpu_id, phys_cpu_present_map)) { - printk(KERN_NOTICE "weird, boot CPU (#%d) not listed by the BIOS.\n", - boot_cpu_id); - physid_set(hard_smp_processor_id(), phys_cpu_present_map); - } - - /* - * If we couldn't find a local APIC, then get out of here now! - */ - if (APIC_INTEGRATED(apic_version[boot_cpu_id]) && !cpu_has_apic) { - printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", - boot_cpu_id); - printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n"); - io_apic_irqs = 0; - cpu_online_map = cpumask_of_cpu(0); - phys_cpu_present_map = physid_mask_of_physid(0); - disable_apic = 1; - goto smp_done; - } - - verify_local_APIC(); - - /* - * If SMP should be disabled, then really disable it! - */ - if (!max_cpus) { - smp_found_config = 0; - printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n"); - io_apic_irqs = 0; - cpu_online_map = cpumask_of_cpu(0); - phys_cpu_present_map = physid_mask_of_physid(0); - disable_apic = 1; - goto smp_done; - } - - connect_bsp_APIC(); - setup_local_APIC(); - - if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) - BUG(); - - x86_cpu_to_apicid[0] = boot_cpu_id; - - /* - * Now scan the CPU present map and fire up the other CPUs. - */ - Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map)); - - kicked = 1; - for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) { - apicid = cpu_present_to_apicid(bit); - /* - * Don't even attempt to start the boot CPU! 
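[Aside: smp_tune_scheduling() above estimates cache refill cost as cache size over memory bandwidth, converted to TSC cycles. The same arithmetic with the units spelled out (sample CPU and cache figures assumed):

#include <stdio.h>

int main(void)
{
        unsigned long cpu_khz   = 2400000;  /* 2.4 GHz part */
        int           cachesize = 512;      /* L2 size in kB */
        unsigned long bandwidth = 1000;     /* memory MB/s, default above */

        /* (~MHz) * bytes / (MB/s) -> cycles, as in the deleted code */
        unsigned long long cacheflush_time =
                (unsigned long long)(cpu_khz >> 10) * (cachesize << 10)
                / bandwidth;

        printf("cacheflush_time ~ %llu cycles = %llu usecs\n",
               cacheflush_time, cacheflush_time / (cpu_khz / 1000));
        return 0;
}
]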
- */ - if (apicid == boot_cpu_id || (apicid == BAD_APICID)) - continue; - - if (!physid_isset(apicid, phys_cpu_present_map)) - continue; - if ((max_cpus >= 0) && (max_cpus <= cpucount+1)) - continue; - - do_boot_cpu(apicid); - ++kicked; - } - - /* - * Cleanup possible dangling ends... - */ - { - /* - * Install writable page 0 entry to set BIOS data area. - */ - local_flush_tlb(); - - /* - * Paranoid: Set warm reset code and vector here back - * to default values. - */ - CMOS_WRITE(0, 0xf); - - *((volatile int *) phys_to_virt(0x467)) = 0; - } - - /* - * Allow the user to impress friends. - */ - - Dprintk("Before bogomips.\n"); - if (!cpucount) { - printk(KERN_INFO "Only one processor found.\n"); - } else { - unsigned long bogosum = 0; - for (cpu = 0; cpu < NR_CPUS; cpu++) - if (cpu_isset(cpu, cpu_callout_map)) - bogosum += cpu_data[cpu].loops_per_jiffy; - printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", - cpucount+1, - bogosum/(500000/HZ), - (bogosum/(5000/HZ))%100); - Dprintk("Before bogocount - setting activated=1.\n"); - } - - /* - * Construct cpu_sibling_map[], so that we can tell the - * sibling CPU efficiently. - */ - for (cpu = 0; cpu < NR_CPUS; cpu++) - cpus_clear(cpu_sibling_map[cpu]); - - for (cpu = 0; cpu < NR_CPUS; cpu++) { - int siblings = 0; - int i; - if (!cpu_isset(cpu, cpu_callout_map)) - continue; - - if (smp_num_siblings > 1) { - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_isset(i, cpu_callout_map)) - continue; - if (phys_proc_id[cpu] == phys_proc_id[i]) { - siblings++; - cpu_set(i, cpu_sibling_map[cpu]); - } - } - } else { - siblings++; - cpu_set(cpu, cpu_sibling_map[cpu]); - } - - if (siblings != smp_num_siblings) { - printk(KERN_WARNING - "WARNING: %d siblings found for CPU%d, should be %d\n", - siblings, cpu, smp_num_siblings); - smp_num_siblings = siblings; - } - } - - Dprintk("Boot done.\n"); - - /* - * Here we can be sure that there is an IO-APIC in the system. Let's - * go and set it up: - */ - if (!skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); - else - nr_ioapics = 0; - - setup_boot_APIC_clock(); - - /* - * Synchronize the TSC with the AP - */ - if (cpu_has_tsc && cpucount) - synchronize_tsc_bp(); - - smp_done: - time_init_smp(); -} - -/* These are wrappers to interface to the new boot process. Someone - who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */ -void __init smp_prepare_cpus(unsigned int max_cpus) -{ - smp_boot_cpus(max_cpus); -} - -void __devinit smp_prepare_boot_cpu(void) -{ - cpu_set(smp_processor_id(), cpu_online_map); - cpu_set(smp_processor_id(), cpu_callout_map); -} - -int __devinit __cpu_up(unsigned int cpu) -{ - /* This only works at boot for x86. See "rewrite" above. */ - if (cpu_isset(cpu, smp_commenced_mask)) { - local_irq_enable(); - return -ENOSYS; - } - - /* In case one didn't come up */ - if (!cpu_isset(cpu, cpu_callin_map)) { - local_irq_enable(); - return -EIO; - } - local_irq_enable(); - - /* Unleash the CPU! 
*/ - Dprintk("waiting for cpu %d\n", cpu); - - cpu_set(cpu, smp_commenced_mask); - while (!cpu_isset(cpu, cpu_online_map)) - mb(); - return 0; -} - -void __init smp_cpus_done(unsigned int max_cpus) -{ -#ifdef CONFIG_X86_IO_APIC - setup_ioapic_dest(); -#endif - zap_low_mappings(); -} - diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/traps.c --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/traps.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,975 +0,0 @@ -/* - * linux/arch/x86-64/traps.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs - * - * Pentium III FXSR, SSE support - * Gareth Hughes <gareth@xxxxxxxxxxx>, May 2000 - * - * $Id: traps.c,v 1.36 2002/03/24 11:09:10 ak Exp $ - */ - -/* - * 'Traps.c' handles hardware traps and faults after we have saved some - * state in 'entry.S'. - */ -#include <linux/config.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/ptrace.h> -#include <linux/timer.h> -#include <linux/mm.h> -#include <linux/init.h> -#include <linux/delay.h> -#include <linux/spinlock.h> -#include <linux/interrupt.h> -#include <linux/module.h> -#include <linux/moduleparam.h> - -#include <asm/system.h> -#include <asm/uaccess.h> -#include <asm/io.h> -#include <asm/atomic.h> -#include <asm/debugreg.h> -#include <asm/desc.h> -#include <asm/i387.h> -#include <asm/kdebug.h> -#include <asm/processor.h> - -#include <asm/smp.h> -#include <asm/pgalloc.h> -#include <asm/pda.h> -#include <asm/proto.h> -#include <asm/nmi.h> - -#include <linux/irq.h> - - -extern struct gate_struct idt_table[256]; - -asmlinkage void divide_error(void); -asmlinkage void debug(void); -asmlinkage void nmi(void); -asmlinkage void int3(void); -asmlinkage void overflow(void); -asmlinkage void bounds(void); -asmlinkage void invalid_op(void); -asmlinkage void device_not_available(void); -asmlinkage void double_fault(void); -asmlinkage void coprocessor_segment_overrun(void); -asmlinkage void invalid_TSS(void); -asmlinkage void segment_not_present(void); -asmlinkage void stack_segment(void); -asmlinkage void general_protection(void); -asmlinkage void page_fault(void); -asmlinkage void coprocessor_error(void); -asmlinkage void simd_coprocessor_error(void); -asmlinkage void reserved(void); -asmlinkage void alignment_check(void); -asmlinkage void machine_check(void); -asmlinkage void spurious_interrupt_bug(void); -asmlinkage void call_debug(void); - -struct notifier_block *die_chain; -static DEFINE_SPINLOCK(die_notifier_lock); - -int register_die_notifier(struct notifier_block *nb) -{ - int err = 0; - unsigned long flags; - spin_lock_irqsave(&die_notifier_lock, flags); - err = notifier_chain_register(&die_chain, nb); - spin_unlock_irqrestore(&die_notifier_lock, flags); - return err; -} - -static inline void conditional_sti(struct pt_regs *regs) -{ - if (regs->eflags & X86_EFLAGS_IF) - local_irq_enable(); -} - -static int kstack_depth_to_print = 10; - -#ifdef CONFIG_KALLSYMS -#include <linux/kallsyms.h> -int printk_address(unsigned long address) -{ - unsigned long offset = 0, symsize; - const char *symname; - char *modname; - char *delim = ":"; - char namebuf[128]; - - symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); - if (!symname) - return printk("[<%016lx>]", address); - if (!modname) - modname = delim = ""; - return printk("<%016lx>{%s%s%s%s%+ld}", - address,delim,modname,delim,symname,offset); -} -#else 
-int printk_address(unsigned long address) -{ - return printk("[<%016lx>]", address); -} -#endif - -unsigned long *in_exception_stack(int cpu, unsigned long stack) -{ - int k; - for (k = 0; k < N_EXCEPTION_STACKS; k++) { - struct tss_struct *tss = &per_cpu(init_tss, cpu); - unsigned long end = tss->ist[k] + EXCEPTION_STKSZ; - - if (stack >= tss->ist[k] && stack <= end) - return (unsigned long *)end; - } - return NULL; -} - -/* - * x86-64 can have upto three kernel stacks: - * process stack - * interrupt stack - * severe exception (double fault, nmi, stack fault) hardware stack - * Check and process them in order. - */ - -void show_trace(unsigned long *stack) -{ - unsigned long addr; - unsigned long *irqstack, *irqstack_end, *estack_end; - const int cpu = safe_smp_processor_id(); - int i; - - printk("\nCall Trace:"); - i = 0; - - estack_end = in_exception_stack(cpu, (unsigned long)stack); - if (estack_end) { - while (stack < estack_end) { - addr = *stack++; - if (__kernel_text_address(addr)) { - i += printk_address(addr); - i += printk(" "); - if (i > 50) { - printk("\n"); - i = 0; - } - } - } - i += printk(" <EOE> "); - i += 7; - stack = (unsigned long *) estack_end[-2]; - } - - irqstack_end = (unsigned long *) (cpu_pda[cpu].irqstackptr); - irqstack = (unsigned long *) (cpu_pda[cpu].irqstackptr - IRQSTACKSIZE + 64); - - if (stack >= irqstack && stack < irqstack_end) { - printk("<IRQ> "); - while (stack < irqstack_end) { - addr = *stack++; - /* - * If the address is either in the text segment of the - * kernel, or in the region which contains vmalloc'ed - * memory, it *may* be the address of a calling - * routine; if so, print it so that someone tracing - * down the cause of the crash will be able to figure - * out the call path that was taken. - */ - if (__kernel_text_address(addr)) { - i += printk_address(addr); - i += printk(" "); - if (i > 50) { - printk("\n "); - i = 0; - } - } - } - stack = (unsigned long *) (irqstack_end[-1]); - printk(" <EOI> "); - i += 7; - } - - while (((long) stack & (THREAD_SIZE-1)) != 0) { - addr = *stack++; - if (__kernel_text_address(addr)) { - i += printk_address(addr); - i += printk(" "); - if (i > 50) { - printk("\n "); - i = 0; - } - } - } - printk("\n"); -} - -void show_stack(struct task_struct *tsk, unsigned long * rsp) -{ - unsigned long *stack; - int i; - const int cpu = safe_smp_processor_id(); - unsigned long *irqstack_end = (unsigned long *) (cpu_pda[cpu].irqstackptr); - unsigned long *irqstack = (unsigned long *) (cpu_pda[cpu].irqstackptr - IRQSTACKSIZE); - - // debugging aid: "show_stack(NULL, NULL);" prints the - // back trace for this cpu. 
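[Aside: show_trace() above walks raw stack words and prints only those that pass __kernel_text_address(), accepting the occasional false positive from stale data. A user-space analogue of that conservative scan (the text-segment bounds are invented):

#include <stdio.h>

/* Stand-in for __kernel_text_address(): a made-up text segment. */
#define TEXT_START 0x400000UL
#define TEXT_END   0x480000UL

static int looks_like_text(unsigned long addr)
{
        return addr >= TEXT_START && addr < TEXT_END;
}

int main(void)
{
        /* Raw stack words: data interleaved with saved return addresses. */
        unsigned long stack[] = { 0xdeadbeefUL, 0x401a2cUL, 7, 0x40ff10UL, 0 };

        printf("Call Trace:");
        for (unsigned i = 0; i < sizeof(stack) / sizeof(stack[0]); i++)
                if (looks_like_text(stack[i]))
                        printf(" [<%016lx>]", stack[i]);
        printf("\n");
        return 0;
}
]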
- - if (rsp == NULL) { - if (tsk) - rsp = (unsigned long *)tsk->thread.rsp; - else - rsp = (unsigned long *)&rsp; - } - - stack = rsp; - for(i=0; i < kstack_depth_to_print; i++) { - if (stack >= irqstack && stack <= irqstack_end) { - if (stack == irqstack_end) { - stack = (unsigned long *) (irqstack_end[-1]); - printk(" <EOI> "); - } - } else { - if (((long) stack & (THREAD_SIZE-1)) == 0) - break; - } - if (i && ((i % 4) == 0)) - printk("\n "); - printk("%016lx ", *stack++); - } - show_trace((unsigned long *)rsp); -} - -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - unsigned long dummy; - show_trace(&dummy); -} - -EXPORT_SYMBOL(dump_stack); - -void show_registers(struct pt_regs *regs) -{ - int i; - int in_kernel = (regs->cs & 3) == 0; - unsigned long rsp; - const int cpu = safe_smp_processor_id(); - struct task_struct *cur = cpu_pda[cpu].pcurrent; - - rsp = regs->rsp; - - printk("CPU %d ", cpu); - __show_regs(regs); - printk("Process %s (pid: %d, threadinfo %p, task %p)\n", - cur->comm, cur->pid, cur->thread_info, cur); - - /* - * When in-kernel, we also print out the stack and code at the - * time of the fault.. - */ - if (in_kernel) { - - printk("Stack: "); - show_stack(NULL, (unsigned long*)rsp); - - printk("\nCode: "); - if(regs->rip < PAGE_OFFSET) - goto bad; - - for(i=0;i<20;i++) - { - unsigned char c; - if(__get_user(c, &((unsigned char*)regs->rip)[i])) { -bad: - printk(" Bad RIP value."); - break; - } - printk("%02x ", c); - } - } - printk("\n"); -} - -void handle_BUG(struct pt_regs *regs) -{ - struct bug_frame f; - char tmp; - - if (regs->cs & 3) - return; - if (__copy_from_user(&f, (struct bug_frame *) regs->rip, - sizeof(struct bug_frame))) - return; - if ((unsigned long)f.filename < __PAGE_OFFSET || - f.ud2[0] != 0x0f || f.ud2[1] != 0x0b) - return; - if (__get_user(tmp, f.filename)) - f.filename = "unmapped filename"; - printk("----------- [cut here ] --------- [please bite here ] ---------\n"); - printk(KERN_ALERT "Kernel BUG at %.50s:%d\n", f.filename, f.line); -} - -void out_of_line_bug(void) -{ - BUG(); -} - -static DEFINE_SPINLOCK(die_lock); -static int die_owner = -1; - -void oops_begin(void) -{ - int cpu = safe_smp_processor_id(); - /* racy, but better than risking deadlock. */ - local_irq_disable(); - if (!spin_trylock(&die_lock)) { - if (cpu == die_owner) - /* nested oops. 
should stop eventually */; - else - spin_lock(&die_lock); - } - die_owner = cpu; - console_verbose(); - bust_spinlocks(1); -} - -void oops_end(void) -{ - die_owner = -1; - bust_spinlocks(0); - spin_unlock(&die_lock); - local_irq_enable(); /* make sure back scroll still works */ - if (panic_on_oops) - panic("Oops"); -} - -void __die(const char * str, struct pt_regs * regs, long err) -{ - static int die_counter; - printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); -#endif - printk("\n"); - notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV); - show_registers(regs); - /* Executive summary in case the oops scrolled away */ - printk(KERN_ALERT "RIP "); - printk_address(regs->rip); - printk(" RSP <%016lx>\n", regs->rsp); -} - -void die(const char * str, struct pt_regs * regs, long err) -{ - oops_begin(); - handle_BUG(regs); - __die(str, regs, err); - oops_end(); - do_exit(SIGSEGV); -} -static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err) -{ - if (!(regs->eflags & VM_MASK) && (regs->cs == __KERNEL_CS)) - die(str, regs, err); -} - -#ifdef CONFIG_X86_LOCAL_APIC -void die_nmi(char *str, struct pt_regs *regs) -{ - oops_begin(); - /* - * We are in trouble anyway, lets at least try - * to get a message out. - */ - printk(str, safe_smp_processor_id()); - show_registers(regs); - if (panic_on_timeout || panic_on_oops) - panic("nmi watchdog"); - printk("console shuts up ...\n"); - oops_end(); - do_exit(SIGSEGV); -} -#endif - -static void do_trap(int trapnr, int signr, char *str, - struct pt_regs * regs, long error_code, siginfo_t *info) -{ - conditional_sti(regs); - -#ifdef CONFIG_CHECKING - { - unsigned long gs; - struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); - rdmsrl(MSR_GS_BASE, gs); - if (gs != (unsigned long)pda) { - wrmsrl(MSR_GS_BASE, pda); - printk("%s: wrong gs %lx expected %p rip %lx\n", str, gs, pda, - regs->rip); - } - } -#endif - - if ((regs->cs & 3) != 0) { - struct task_struct *tsk = current; - - if (exception_trace && unhandled_signal(tsk, signr)) - printk(KERN_INFO - "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", - tsk->comm, tsk->pid, str, - regs->rip,regs->rsp,error_code); - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - if (info) - force_sig_info(signr, info, tsk); - else - force_sig(signr, tsk); - return; - } - - - /* kernel trap */ - { - const struct exception_table_entry *fixup; - fixup = search_exception_tables(regs->rip); - if (fixup) { - regs->rip = fixup->fixup; - } else - die(str, regs, error_code); - return; - } -} - -#define DO_ERROR(trapnr, signr, str, name) \ -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ -{ \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - do_trap(trapnr, signr, str, regs, error_code, NULL); \ -} - -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ -{ \ - siginfo_t info; \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - do_trap(trapnr, signr, str, regs, error_code, &info); \ -} - -DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip) -DO_ERROR( 4, SIGSEGV, 
"overflow", overflow) -DO_ERROR( 5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->rip) -DO_ERROR( 7, SIGSEGV, "device not available", device_not_available) -DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) -DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) -DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) -DO_ERROR(18, SIGSEGV, "reserved", reserved) - -#define DO_ERROR_STACK(trapnr, signr, str, name) \ -asmlinkage void *do_##name(struct pt_regs * regs, long error_code) \ -{ \ - struct pt_regs *pr = ((struct pt_regs *)(current->thread.rsp0))-1; \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return regs; \ - if (regs->cs & 3) { \ - memcpy(pr, regs, sizeof(struct pt_regs)); \ - regs = pr; \ - } \ - do_trap(trapnr, signr, str, regs, error_code, NULL); \ - return regs; \ -} - -DO_ERROR_STACK(12, SIGBUS, "stack segment", stack_segment) -DO_ERROR_STACK( 8, SIGSEGV, "double fault", double_fault) - -asmlinkage void do_general_protection(struct pt_regs * regs, long error_code) -{ - conditional_sti(regs); - -#ifdef CONFIG_CHECKING - { - unsigned long gs; - struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); - rdmsrl(MSR_GS_BASE, gs); - if (gs != (unsigned long)pda) { - wrmsrl(MSR_GS_BASE, pda); - oops_in_progress++; - printk("general protection handler: wrong gs %lx expected %p\n", gs, pda); - oops_in_progress--; - } - } -#endif - - if ((regs->cs & 3)!=0) { - struct task_struct *tsk = current; - - if (exception_trace && unhandled_signal(tsk, SIGSEGV)) - printk(KERN_INFO - "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", - tsk->comm, tsk->pid, - regs->rip,regs->rsp,error_code); - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - force_sig(SIGSEGV, tsk); - return; - } - - /* kernel gp */ - { - const struct exception_table_entry *fixup; - fixup = search_exception_tables(regs->rip); - if (fixup) { - regs->rip = fixup->fixup; - return; - } - if (notify_die(DIE_GPF, "general protection fault", regs, - error_code, 13, SIGSEGV) == NOTIFY_STOP) - return; - die("general protection fault", regs, error_code); - } -} - -static void mem_parity_error(unsigned char reason, struct pt_regs * regs) -{ - printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n"); - printk("You probably have a hardware problem with your RAM chips\n"); - - /* Clear and disable the memory parity error line. */ - reason = (reason & 0xf) | 4; - outb(reason, 0x61); -} - -static void io_check_error(unsigned char reason, struct pt_regs * regs) -{ - printk("NMI: IOCK error (debug interrupt?)\n"); - show_registers(regs); - - /* Re-enable the IOCK line, wait for a few seconds */ - reason = (reason & 0xf) | 8; - outb(reason, 0x61); - mdelay(2000); - reason &= ~8; - outb(reason, 0x61); -} - -static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs) -{ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason); - printk("Dazed and confused, but trying to continue\n"); - printk("Do you have a strange power saving mode enabled?\n"); -} - -asmlinkage void default_do_nmi(struct pt_regs *regs) -{ - unsigned char reason = 0; - - /* Only the BSP gets external NMIs from the system. 
*/ - if (!smp_processor_id()) - reason = get_nmi_reason(); - - if (!(reason & 0xc0)) { - if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT) - == NOTIFY_STOP) - return; -#ifdef CONFIG_X86_LOCAL_APIC - /* - * Ok, so this is none of the documented NMI sources, - * so it must be the NMI watchdog. - */ - if (nmi_watchdog > 0) { - nmi_watchdog_tick(regs,reason); - return; - } -#endif - unknown_nmi_error(reason, regs); - return; - } - if (notify_die(DIE_NMI, "nmi", regs, reason, 0, SIGINT) == NOTIFY_STOP) - return; - - /* AK: following checks seem to be broken on modern chipsets. FIXME */ - - if (reason & 0x80) - mem_parity_error(reason, regs); - if (reason & 0x40) - io_check_error(reason, regs); - - /* - * Reassert NMI in case it became active meanwhile - * as it's edge-triggered. - */ - outb(0x8f, 0x70); - inb(0x71); /* dummy */ - outb(0x0f, 0x70); - inb(0x71); /* dummy */ -} - -asmlinkage void do_int3(struct pt_regs * regs, long error_code) -{ - if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) { - return; - } - do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); - return; -} - -/* runs on IST stack. */ -asmlinkage void *do_debug(struct pt_regs * regs, unsigned long error_code) -{ - struct pt_regs *pr; - unsigned long condition; - struct task_struct *tsk = current; - siginfo_t info; - - pr = (struct pt_regs *)(current->thread.rsp0)-1; - if (regs->cs & 3) { - memcpy(pr, regs, sizeof(struct pt_regs)); - regs = pr; - } - -#ifdef CONFIG_CHECKING - { - /* RED-PEN interaction with debugger - could destroy gs */ - unsigned long gs; - struct x8664_pda *pda = cpu_pda + safe_smp_processor_id(); - rdmsrl(MSR_GS_BASE, gs); - if (gs != (unsigned long)pda) { - wrmsrl(MSR_GS_BASE, pda); - printk("debug handler: wrong gs %lx expected %p\n", gs, pda); - } - } -#endif - - asm("movq %%db6,%0" : "=r" (condition)); - - if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, - SIGTRAP) == NOTIFY_STOP) { - return regs; - } - conditional_sti(regs); - - /* Mask out spurious debug traps due to lazy DR7 setting */ - if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg7) { - goto clear_dr7; - } - } - - tsk->thread.debugreg6 = condition; - - /* Mask out spurious TF errors due to lazy TF clearing */ - if ((condition & DR_STEP) && - (notify_die(DIE_DEBUGSTEP, "debugstep", regs, condition, - 1, SIGTRAP) != NOTIFY_STOP)) { - /* - * The TF error should be masked out only if the current - * process is not traced and if the TRAP flag has been set - * previously by a tracing process (condition detected by - * the PT_DTRACE flag); remember that the i386 TRAP flag - * can be modified by the process itself in user mode, - * allowing programs to debug themselves without the ptrace() - * interface. 
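 *
 * [Editorial aside, not from the original comment: user code can set
 *  TF by itself, e.g. "pushf ; orw $0x100,(%rsp) ; popf" flips
 *  EFLAGS.TF (bit 8), so a DR_STEP trap here need not mean a ptracer
 *  is attached.]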
- */ - if ((regs->cs & 3) == 0) - goto clear_TF_reenable; - if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE) - goto clear_TF; - } - - /* Ok, finally something we can handle */ - tsk->thread.trap_no = 1; - tsk->thread.error_code = error_code; - info.si_signo = SIGTRAP; - info.si_errno = 0; - info.si_code = TRAP_BRKPT; - if ((regs->cs & 3) == 0) - goto clear_dr7; - - info.si_addr = (void __user *)regs->rip; - force_sig_info(SIGTRAP, &info, tsk); -clear_dr7: - asm volatile("movq %0,%%db7"::"r"(0UL)); - notify_die(DIE_DEBUG, "debug", regs, condition, 1, SIGTRAP); - return regs; - -clear_TF_reenable: - set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - -clear_TF: - /* RED-PEN could cause spurious errors */ - if (notify_die(DIE_DEBUG, "debug2", regs, condition, 1, SIGTRAP) - != NOTIFY_STOP) - regs->eflags &= ~TF_MASK; - return regs; -} - -static int kernel_math_error(struct pt_regs *regs, char *str) -{ - const struct exception_table_entry *fixup; - fixup = search_exception_tables(regs->rip); - if (fixup) { - regs->rip = fixup->fixup; - return 1; - } - notify_die(DIE_GPF, str, regs, 0, 16, SIGFPE); -#if 0 - /* This should be a die, but warn only for now */ - die(str, regs, 0); -#else - printk(KERN_DEBUG "%s: %s at ", current->comm, str); - printk_address(regs->rip); - printk("\n"); -#endif - return 0; -} - -/* - * Note that we play around with the 'TS' bit in an attempt to get - * the correct behaviour even in the presence of the asynchronous - * IRQ13 behaviour - */ -asmlinkage void do_coprocessor_error(struct pt_regs *regs) -{ - void __user *rip = (void __user *)(regs->rip); - struct task_struct * task; - siginfo_t info; - unsigned short cwd, swd; - - conditional_sti(regs); - if ((regs->cs & 3) == 0 && - kernel_math_error(regs, "kernel x87 math error")) - return; - - /* - * Save the info for the exception handler and clear the error. - */ - task = current; - save_init_fpu(task); - task->thread.trap_no = 16; - task->thread.error_code = 0; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_code = __SI_FAULT; - info.si_addr = rip; - /* - * (~cwd & swd) will mask out exceptions that are not set to unmasked - * status. 0x3f is the exception bits in these regs, 0x200 is the - * C1 reg you need in case of a stack fault, 0x040 is the stack - * fault bit. 
We should only be taking one exception at a time, - * so if this combination doesn't produce any single exception, - * then we have a bad program that isn't synchronizing its FPU usage - * and it will suffer the consequences since we won't be able to - * fully reproduce the context of the exception - */ - cwd = get_fpu_cwd(task); - swd = get_fpu_swd(task); - switch (((~cwd) & swd & 0x3f) | (swd & 0x240)) { - case 0x000: - default: - break; - case 0x001: /* Invalid Op */ - case 0x041: /* Stack Fault */ - case 0x241: /* Stack Fault | Direction */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; - } - force_sig_info(SIGFPE, &info, task); -} - -asmlinkage void bad_intr(void) -{ - printk("bad interrupt"); -} - -asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) -{ - void __user *rip = (void __user *)(regs->rip); - struct task_struct * task; - siginfo_t info; - unsigned short mxcsr; - - conditional_sti(regs); - if ((regs->cs & 3) == 0 && - kernel_math_error(regs, "simd math error")) - return; - - /* - * Save the info for the exception handler and clear the error. - */ - task = current; - save_init_fpu(task); - task->thread.trap_no = 19; - task->thread.error_code = 0; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_code = __SI_FAULT; - info.si_addr = rip; - /* - * The SIMD FPU exceptions are handled a little differently, as there - * is only a single status/control register. Thus, to determine which - * unmasked exception was caught we must mask the exception mask bits - * at 0x1f80, and then use these to mask the exception bits at 0x3f. - */ - mxcsr = get_fpu_mxcsr(task); - switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { - case 0x000: - default: - break; - case 0x001: /* Invalid Op */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; - } - force_sig_info(SIGFPE, &info, task); -} - -asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs) -{ -} - -#if 0 -asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) -{ -} -#endif - -/* - * 'math_state_restore()' saves the current math information in the - * old math state array, and gets the new ones from the current task - * - * Careful.. There are problems with IBM-designed IRQ13 behaviour. - * Don't touch unless you *really* know how it works. - */ -asmlinkage void math_state_restore(void) -{ - struct task_struct *me = current; - - /* clts(); */ /* 'clts' is done for us by Xen during virtual trap. 
*/
-
-	if (!used_math())
-		init_fpu(me);
-	restore_fpu_checking(&me->thread.i387.fxsave);
-	me->thread_info->status |= TS_USEDFPU;
-}
-
-void do_call_debug(struct pt_regs *regs)
-{
-	notify_die(DIE_CALL, "debug call", regs, 0, 255, SIGINT);
-}
-
-
-static trap_info_t trap_table[] = {
-        {  0, 0, (__KERNEL_CS|0x3), (unsigned long)divide_error               },
-        {  1, 0, (__KERNEL_CS|0x3), (unsigned long)debug                      },
-        {  3, 3, (__KERNEL_CS|0x3), (unsigned long)int3                       },
-        {  4, 3, (__KERNEL_CS|0x3), (unsigned long)overflow                   },
-        {  5, 3, (__KERNEL_CS|0x3), (unsigned long)bounds                     },
-        {  6, 0, (__KERNEL_CS|0x3), (unsigned long)invalid_op                 },
-        {  7, 0, (__KERNEL_CS|0x3), (unsigned long)device_not_available       },
-        {  9, 0, (__KERNEL_CS|0x3), (unsigned long)coprocessor_segment_overrun},
-        { 10, 0, (__KERNEL_CS|0x3), (unsigned long)invalid_TSS                },
-        { 11, 0, (__KERNEL_CS|0x3), (unsigned long)segment_not_present        },
-        { 12, 0, (__KERNEL_CS|0x3), (unsigned long)stack_segment              },
-        { 13, 0, (__KERNEL_CS|0x3), (unsigned long)general_protection         },
-        { 14, 0, (__KERNEL_CS|0x3), (unsigned long)page_fault                 },
-        { 15, 0, (__KERNEL_CS|0x3), (unsigned long)spurious_interrupt_bug     },
-        { 16, 0, (__KERNEL_CS|0x3), (unsigned long)coprocessor_error          },
-        { 17, 0, (__KERNEL_CS|0x3), (unsigned long)alignment_check            },
-#ifdef CONFIG_X86_MCE
-        { 18, 0, (__KERNEL_CS|0x3), (unsigned long)machine_check              },
-#endif
-        { 19, 0, (__KERNEL_CS|0x3), (unsigned long)simd_coprocessor_error     },
-        { SYSCALL_VECTOR, 3, (__KERNEL_CS|0x3), (unsigned long)system_call    },
-#ifdef CONFIG_IA32_EMULATION
-	{ IA32_SYSCALL_VECTOR, 3, (__KERNEL_CS|0x3), (unsigned long)ia32_syscall},
-#endif
-        {  0, 0,           0, 0                                              }
-};
-
-void __init trap_init(void)
-{
-	int ret;
-
-	ret = HYPERVISOR_set_trap_table(trap_table);
-
-	if (ret)
-		printk("HYPERVISOR_set_trap_table failed: error %d\n",
-		       ret);
-
-#ifdef CONFIG_IA32_EMULATION
-	set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
-#endif
-
-	/*
-	 * Should be a barrier for any external CPU state.
-	 */
-	cpu_init();
-}
-
-
-/* Actual parsing is done early in setup.c. */
-static int __init oops_dummy(char *s)
-{
-	panic_on_oops = 1;
-	return -1;
-}
-__setup("oops=", oops_dummy);
-
-static int __init kstack_setup(char *s)
-{
-	kstack_depth_to_print = simple_strtoul(s,NULL,0);
-	return 0;
-}
-__setup("kstack=", kstack_setup);
-
diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/vsyscall.c
--- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/vsyscall.c	Fri Jul 15 19:57:12 2005
+++ /dev/null	Sat Jul 16 14:02:54 2005
@@ -1,190 +0,0 @@
-/*
- * linux/arch/x86_64/kernel/vsyscall.c
- *
- * Copyright (C) 2001 Andrea Arcangeli <andrea@xxxxxxx> SuSE
- * Copyright 2003 Andi Kleen, SuSE Labs.
- *
- * Thanks to hpa@xxxxxxxxxxxxx for some useful hints.
- * Special thanks to Ingo Molnar for his early experience with
- * a different vsyscall implementation for Linux/IA32 and for the name.
- *
- * vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
- * at virtual address -10Mbyte+1024bytes etc... There are at max 8192
- * vsyscalls. One vsyscall can reserve more than 1 slot to avoid
- * jumping out of line if necessary.
- *
- * Note: the concept clashes with user mode linux. If you use UML just
- * set the kernel.vsyscall sysctl to 0.
- */
-
-/*
- * TODO 2001-03-20:
- *
- * 1) make page fault handler detect faults on page1-page-last of the vsyscall
- * virtual space, and make it increase %rip and write -ENOSYS in %rax (so
- * we'll be able to upgrade to a new glibc without upgrading the kernel after
- * we add more vsyscalls.)
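 *    [Editorial aside, not part of the original TODO: slots are 1024
 *    bytes apart, so slot N lives at VSYSCALL_ADDR(N) = -10MB + N*1024;
 *    slot 0 is at 0xffffffffff600000 and slot 1 at 0xffffffffff600400.]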
- * 2) Possibly we need a fixmap table for the vsyscalls too if we want - * to avoid SIGSEGV and we want to return -EFAULT from the vsyscalls as well. - * Can we segfault inside a "syscall"? We can fix this anytime and those fixes - * won't be visible for userspace. Not fixing this is a noop for correct programs, - * broken programs will segfault and there's no security risk until we choose to - * fix it. - * - * These are not urgent things that we need to address only before shipping the first - * production binary kernels. - */ - -#include <linux/time.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/timer.h> -#include <linux/seqlock.h> -#include <linux/jiffies.h> - -#include <asm/vsyscall.h> -#include <asm/pgtable.h> -#include <asm/page.h> -#include <asm/fixmap.h> -#include <asm/errno.h> -#include <asm/io.h> - -#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) -#define force_inline __attribute__((always_inline)) inline - -int __sysctl_vsyscall __section_sysctl_vsyscall = 1; -seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; - -#include <asm/unistd.h> - -static force_inline void timeval_normalize(struct timeval * tv) -{ - time_t __sec; - - __sec = tv->tv_usec / 1000000; - if (__sec) - { - tv->tv_usec %= 1000000; - tv->tv_sec += __sec; - } -} - -static force_inline void do_vgettimeofday(struct timeval * tv) -{ - long sequence, t; - unsigned long sec, usec; - - do { - sequence = read_seqbegin(&__xtime_lock); - - sec = __xtime.tv_sec; - usec = (__xtime.tv_nsec / 1000) + - (__jiffies - __wall_jiffies) * (1000000 / HZ); - - if (__vxtime.mode == VXTIME_TSC) { - sync_core(); - rdtscll(t); - if (t < __vxtime.last_tsc) t = __vxtime.last_tsc; - usec += ((t - __vxtime.last_tsc) * - __vxtime.tsc_quot) >> 32; - /* See comment in x86_64 do_gettimeofday. */ - } else { - usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) - - __vxtime.last) * __vxtime.quot) >> 32; - } - } while (read_seqretry(&__xtime_lock, sequence)); - - tv->tv_sec = sec + usec / 1000000; - tv->tv_usec = usec % 1000000; -} - -/* RED-PEN may want to readd seq locking, but then the variable should be write-once. 
*/
-static force_inline void do_get_tz(struct timezone * tz)
-{
-	*tz = __sys_tz;
-}
-
-
-static force_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
-{
-	int ret;
-	asm volatile("syscall"
-		: "=a" (ret)
-		: "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber );
-	return ret;
-}
-
-static force_inline long time_syscall(long *t)
-{
-	long secs;
-	asm volatile("syscall"
-		: "=a" (secs)
-		: "0" (__NR_time),"D" (t) : __syscall_clobber);
-	return secs;
-}
-
-static int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
-{
-	if (unlikely(!__sysctl_vsyscall))
-		return gettimeofday(tv,tz);
-	if (tv)
-		do_vgettimeofday(tv);
-	if (tz)
-		do_get_tz(tz);
-	return 0;
-}
-
-/* This will break when the xtime seconds get inaccurate, but that is
- * unlikely */
-static time_t __vsyscall(1) vtime(time_t *t)
-{
-	if (unlikely(!__sysctl_vsyscall))
-		return time_syscall(t);
-	else if (t)
-		*t = __xtime.tv_sec;
-	return __xtime.tv_sec;
-}
-
-static long __vsyscall(2) venosys_0(void)
-{
-	return -ENOSYS;
-}
-
-static long __vsyscall(3) venosys_1(void)
-{
-	return -ENOSYS;
-
-}
-
-static void __init map_vsyscall(void)
-{
-	extern char __vsyscall_0;
-	unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
-
-	__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
-}
-
-extern void __set_fixmap_user (enum fixed_addresses, unsigned long, pgprot_t);
-
-static void __init map_vsyscall_user(void)
-{
-	extern char __vsyscall_0;
-	unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
-
-	__set_fixmap_user(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
-}
-
-static int __init vsyscall_init(void)
-{
-	BUG_ON(((unsigned long) &vgettimeofday !=
-			VSYSCALL_ADDR(__NR_vgettimeofday)));
-	BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
-	BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
-	map_vsyscall();
-	map_vsyscall_user();	/* establish translation for user address space */
-	sysctl_vsyscall = 0;	/* TBD */
-
-	return 0;
-}
-
-__initcall(vsyscall_init);
diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/x8664_ksyms.c
--- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/x8664_ksyms.c	Fri Jul 15 19:57:12 2005
+++ /dev/null	Sat Jul 16 14:02:54 2005
@@ -1,225 +0,0 @@
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/smp.h>
-#include <linux/user.h>
-#include <linux/sched.h>
-#include <linux/in6.h>
-#include <linux/interrupt.h>
-#include <linux/smp_lock.h>
-#include <linux/pm.h>
-#include <linux/pci.h>
-#include <linux/apm_bios.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/syscalls.h>
-#include <linux/tty.h>
-#include <linux/ioctl32.h>
-
-#include <asm/semaphore.h>
-#include <asm/processor.h>
-#include <asm/i387.h>
-#include <asm/uaccess.h>
-#include <asm/checksum.h>
-#include <asm/io.h>
-#include <asm/delay.h>
-#include <asm/irq.h>
-#include <asm/mmx.h>
-#include <asm/desc.h>
-#include <asm/pgtable.h>
-#include <asm/pgalloc.h>
-#include <asm/nmi.h>
-#include <asm/kdebug.h>
-#include <asm/unistd.h>
-#include <asm/tlbflush.h>
-#include <asm/kdebug.h>
-
-extern spinlock_t rtc_lock;
-
-#ifdef CONFIG_SMP
-extern void __write_lock_failed(rwlock_t *rw);
-extern void __read_lock_failed(rwlock_t *rw);
-#endif
-
-#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE)
-extern struct drive_info_struct drive_info;
-EXPORT_SYMBOL(drive_info);
-#endif
-
-extern unsigned
long get_cmos_time(void); - -/* platform dependent support */ -EXPORT_SYMBOL(boot_cpu_data); -//EXPORT_SYMBOL(dump_fpu); -EXPORT_SYMBOL(__ioremap); -EXPORT_SYMBOL(ioremap_nocache); -EXPORT_SYMBOL(iounmap); -EXPORT_SYMBOL(enable_irq); -EXPORT_SYMBOL(disable_irq); -EXPORT_SYMBOL(disable_irq_nosync); -EXPORT_SYMBOL(probe_irq_mask); -EXPORT_SYMBOL(kernel_thread); -EXPORT_SYMBOL(pm_idle); -#ifdef CONFIG_ACPI_BOOT -EXPORT_SYMBOL(pm_power_off); -#endif -EXPORT_SYMBOL(get_cmos_time); - -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); -/* Networking helper routines. */ -EXPORT_SYMBOL(csum_partial_copy_nocheck); -EXPORT_SYMBOL(ip_compute_csum); -/* Delay loops */ -EXPORT_SYMBOL(__udelay); -EXPORT_SYMBOL(__ndelay); -EXPORT_SYMBOL(__delay); -EXPORT_SYMBOL(__const_udelay); - -EXPORT_SYMBOL(__get_user_1); -EXPORT_SYMBOL(__get_user_2); -EXPORT_SYMBOL(__get_user_4); -EXPORT_SYMBOL(__get_user_8); -EXPORT_SYMBOL(__put_user_1); -EXPORT_SYMBOL(__put_user_2); -EXPORT_SYMBOL(__put_user_4); -EXPORT_SYMBOL(__put_user_8); - -EXPORT_SYMBOL(strpbrk); -EXPORT_SYMBOL(strstr); - -EXPORT_SYMBOL(strncpy_from_user); -EXPORT_SYMBOL(__strncpy_from_user); -EXPORT_SYMBOL(clear_user); -EXPORT_SYMBOL(__clear_user); -EXPORT_SYMBOL(copy_user_generic); -EXPORT_SYMBOL(copy_from_user); -EXPORT_SYMBOL(copy_to_user); -EXPORT_SYMBOL(copy_in_user); -EXPORT_SYMBOL(strnlen_user); - -#ifdef CONFIG_PCI -EXPORT_SYMBOL(pci_alloc_consistent); -EXPORT_SYMBOL(pci_free_consistent); -#endif - -#ifdef CONFIG_PCI -EXPORT_SYMBOL(pcibios_penalize_isa_irq); -EXPORT_SYMBOL(pci_mem_start); -#endif - -EXPORT_SYMBOL(copy_page); -EXPORT_SYMBOL(clear_page); - -EXPORT_SYMBOL(cpu_pda); -#ifdef CONFIG_SMP -EXPORT_SYMBOL(cpu_data); -EXPORT_SYMBOL(cpu_online_map); -EXPORT_SYMBOL(__write_lock_failed); -EXPORT_SYMBOL(__read_lock_failed); - -EXPORT_SYMBOL(synchronize_irq); -EXPORT_SYMBOL(smp_call_function); -EXPORT_SYMBOL(cpu_callout_map); -#endif - -#ifdef CONFIG_VT -EXPORT_SYMBOL(screen_info); -#endif - -EXPORT_SYMBOL(get_wchan); - -EXPORT_SYMBOL(rtc_lock); - -/* EXPORT_SYMBOL_GPL(set_nmi_callback); - EXPORT_SYMBOL_GPL(unset_nmi_callback); */ - -/* Export string functions. We normally rely on gcc builtin for most of these, - but gcc sometimes decides not to inline them. 
*/ -#undef memcpy -#undef memset -#undef memmove -#undef memchr -#undef strlen -#undef strcpy -#undef strncmp -#undef strncpy -#undef strchr -#undef strcmp -#undef strcpy -#undef strcat -#undef memcmp - -extern void * memset(void *,int,__kernel_size_t); -extern size_t strlen(const char *); -extern void * memmove(void * dest,const void *src,size_t count); -extern char * strcpy(char * dest,const char *src); -extern int strcmp(const char * cs,const char * ct); -extern void *memchr(const void *s, int c, size_t n); -extern void * memcpy(void *,const void *,__kernel_size_t); -extern void * __memcpy(void *,const void *,__kernel_size_t); -extern char * strcat(char *, const char *); -extern int memcmp(const void * cs,const void * ct,size_t count); - -EXPORT_SYMBOL(memset); -EXPORT_SYMBOL(strlen); -EXPORT_SYMBOL(memmove); -EXPORT_SYMBOL(strcpy); -EXPORT_SYMBOL(strncmp); -EXPORT_SYMBOL(strncpy); -EXPORT_SYMBOL(strchr); -EXPORT_SYMBOL(strcmp); -EXPORT_SYMBOL(strcat); -EXPORT_SYMBOL(strncat); -EXPORT_SYMBOL(memchr); -EXPORT_SYMBOL(strrchr); -EXPORT_SYMBOL(strnlen); -EXPORT_SYMBOL(memscan); -EXPORT_SYMBOL(memcpy); -EXPORT_SYMBOL(__memcpy); -EXPORT_SYMBOL(memcmp); - -#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM -/* prototypes are wrong, these are assembly with custom calling functions */ -extern void rwsem_down_read_failed_thunk(void); -extern void rwsem_wake_thunk(void); -extern void rwsem_downgrade_thunk(void); -extern void rwsem_down_write_failed_thunk(void); -EXPORT_SYMBOL(rwsem_down_read_failed_thunk); -EXPORT_SYMBOL(rwsem_wake_thunk); -EXPORT_SYMBOL(rwsem_downgrade_thunk); -EXPORT_SYMBOL(rwsem_down_write_failed_thunk); -#endif - -EXPORT_SYMBOL(empty_zero_page); - -#ifdef CONFIG_HAVE_DEC_LOCK -EXPORT_SYMBOL(_atomic_dec_and_lock); -#endif - -EXPORT_SYMBOL(die_chain); -EXPORT_SYMBOL(register_die_notifier); - -#ifdef CONFIG_SMP -EXPORT_SYMBOL(cpu_sibling_map); -EXPORT_SYMBOL(smp_num_siblings); -#endif - -extern void do_softirq_thunk(void); -EXPORT_SYMBOL(do_softirq_thunk); - -void out_of_line_bug(void); -EXPORT_SYMBOL(out_of_line_bug); - -EXPORT_SYMBOL(init_level4_pgt); - -extern unsigned long __supported_pte_mask; -EXPORT_SYMBOL(__supported_pte_mask); - -#ifdef CONFIG_SMP -EXPORT_SYMBOL(flush_tlb_page); -EXPORT_SYMBOL_GPL(flush_tlb_all); -#endif - -EXPORT_SYMBOL(cpu_khz); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/xen_entry.S --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/kernel/xen_entry.S Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,38 +0,0 @@ -/* - * Copied from arch/xen/i386/kernel/entry.S - */ -/* Offsets into shared_info_t. 
*/
-#define evtchn_upcall_pending		0
-#define evtchn_upcall_mask		1
-
-#define sizeof_vcpu_shift		3
-
-#ifdef CONFIG_SMP
-#define preempt_disable(reg)	incl threadinfo_preempt_count(reg)
-#define preempt_enable(reg)	decl threadinfo_preempt_count(reg)
-#define XEN_GET_VCPU_INFO(reg)	preempt_disable(%rbp)			; \
-				movq %gs:pda_cpunumber,reg		; \
-				shl  $sizeof_vcpu_shift,reg		; \
-				addq HYPERVISOR_shared_info,reg
-#define XEN_PUT_VCPU_INFO(reg)	preempt_enable(%rbp)
-#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff
-#else
-#define XEN_GET_VCPU_INFO(reg)	movq HYPERVISOR_shared_info,reg
-#define XEN_PUT_VCPU_INFO(reg)
-#define XEN_PUT_VCPU_INFO_fixup
-#endif
-
-#define XEN_LOCKED_BLOCK_EVENTS(reg)	movb $1,evtchn_upcall_mask(reg)
-#define XEN_LOCKED_UNBLOCK_EVENTS(reg)	movb $0,evtchn_upcall_mask(reg)
-#define XEN_BLOCK_EVENTS(reg)	XEN_GET_VCPU_INFO(reg)			; \
-				XEN_LOCKED_BLOCK_EVENTS(reg)		; \
-				XEN_PUT_VCPU_INFO(reg)
-#define XEN_UNBLOCK_EVENTS(reg)	XEN_GET_VCPU_INFO(reg)			; \
-				XEN_LOCKED_UNBLOCK_EVENTS(reg)		; \
-				XEN_PUT_VCPU_INFO(reg)
-#define XEN_TEST_PENDING(reg)	testb $0xFF,evtchn_upcall_pending(reg)
-
-EVENT_MASK      = (CS+4)
-VGCF_IN_SYSCALL = (1<<8)
-
-
diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/mm/Makefile
--- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/mm/Makefile	Fri Jul 15 19:57:12 2005
+++ /dev/null	Sat Jul 16 14:02:54 2005
@@ -1,31 +0,0 @@
-#
-# Makefile for the linux x86_64-specific parts of the memory manager.
-#
-
-XENARCH	:= $(subst ",,$(CONFIG_XENARCH))
-
-CFLAGS	+= -Iarch/$(XENARCH)/mm
-
-obj-y	:= init.o fault.o ioremap.o pageattr.o
-c-obj-y	:= extable.o
-
-i386-obj-y := hypervisor.o
-
-#obj-y	 := init.o fault.o ioremap.o extable.o pageattr.o
-#c-obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-c-obj-$(CONFIG_DISCONTIGMEM) += numa.o
-c-obj-$(CONFIG_K8_NUMA) += k8topology.o
-
-hugetlbpage-y = ../../../i386/mm/hugetlbpage.o
-
-c-link :=
-
-$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)):
-	@ln -fsn $(srctree)/arch/x86_64/mm/$(notdir $@) $@
-
-$(patsubst %.o,$(obj)/%.c,$(i386-obj-y)):
-	ln -fsn $(srctree)/arch/xen/i386/mm/$(notdir $@) $@
-
-obj-y	+= $(c-obj-y) $(i386-obj-y)
-
-clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link) $(i386-obj-y))
diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/mm/fault.c
--- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/mm/fault.c	Fri Jul 15 19:57:12 2005
+++ /dev/null	Sat Jul 16 14:02:54 2005
@@ -1,592 +0,0 @@
-/*
- *  linux/arch/x86-64/mm/fault.c
- *
- *  Copyright (C) 1995  Linus Torvalds
- *  Copyright (C) 2001,2002 Andi Kleen, SuSE Labs.
- */ - -#include <linux/config.h> -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/ptrace.h> -#include <linux/mman.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/smp_lock.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/tty.h> -#include <linux/vt_kern.h> /* For unblank_screen() */ -#include <linux/compiler.h> -#include <linux/module.h> -#include <linux/percpu.h> -#include <linux/kprobes.h> - -#include <asm/system.h> -#include <asm/uaccess.h> -#include <asm/pgalloc.h> -#include <asm/smp.h> -#include <asm/tlbflush.h> -#include <asm/proto.h> -#include <asm/kdebug.h> -#include <asm-generic/sections.h> -#include <asm/kdebug.h> - -DEFINE_PER_CPU(pgd_t *, cur_pgd); - -void bust_spinlocks(int yes) -{ - int loglevel_save = console_loglevel; - if (yes) { - oops_in_progress = 1; - } else { -#ifdef CONFIG_VT - unblank_screen(); -#endif - oops_in_progress = 0; - /* - * OK, the message is on the console. Now we call printk() - * without oops_in_progress set so that printk will give klogd - * a poke. Hold onto your hats... - */ - console_loglevel = 15; /* NMI oopser may have shut the console up */ - printk(" "); - console_loglevel = loglevel_save; - } -} - -/* Sometimes the CPU reports invalid exceptions on prefetch. - Check that here and ignore. - Opcode checker based on code by Richard Brunner */ -static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, - unsigned long error_code) -{ - unsigned char *instr = (unsigned char *)(regs->rip); - int scan_more = 1; - int prefetch = 0; - unsigned char *max_instr = instr + 15; - - /* If it was a exec fault ignore */ - if (error_code & (1<<4)) - return 0; - - /* Code segments in LDT could have a non zero base. Don't check - when that's possible */ - if (regs->cs & (1<<2)) - return 0; - - if ((regs->cs & 3) != 0 && regs->rip >= TASK_SIZE) - return 0; - - while (scan_more && instr < max_instr) { - unsigned char opcode; - unsigned char instr_hi; - unsigned char instr_lo; - - if (__get_user(opcode, instr)) - break; - - instr_hi = opcode & 0xf0; - instr_lo = opcode & 0x0f; - instr++; - - switch (instr_hi) { - case 0x20: - case 0x30: - /* Values 0x26,0x2E,0x36,0x3E are valid x86 - prefixes. In long mode, the CPU will signal - invalid opcode if some of these prefixes are - present so we will never get here anyway */ - scan_more = ((instr_lo & 7) == 0x6); - break; - - case 0x40: - /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes - Need to figure out under what instruction mode the - instruction was issued ... */ - /* Could check the LDT for lm, but for now it's good - enough to assume that long mode only uses well known - segments or kernel. */ - scan_more = ((regs->cs & 3) == 0) || (regs->cs == __USER_CS); - break; - - case 0x60: - /* 0x64 thru 0x67 are valid prefixes in all modes. */ - scan_more = (instr_lo & 0xC) == 0x4; - break; - case 0xF0: - /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. 
*/ - scan_more = !instr_lo || (instr_lo>>1) == 1; - break; - case 0x00: - /* Prefetch instruction is 0x0F0D or 0x0F18 */ - scan_more = 0; - if (__get_user(opcode, instr)) - break; - prefetch = (instr_lo == 0xF) && - (opcode == 0x0D || opcode == 0x18); - break; - default: - scan_more = 0; - break; - } - } - return prefetch; -} - -static int bad_address(void *p) -{ - unsigned long dummy; - return __get_user(dummy, (unsigned long *)p); -} - -void dump_pagetable(unsigned long address) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - pgd = (pgd_t *)per_cpu(cur_pgd, smp_processor_id()); - pgd += pgd_index(address); - - printk("PGD %lx ", pgd_val(*pgd)); - if (bad_address(pgd)) goto bad; - if (!pgd_present(*pgd)) goto ret; - - pud = __pud_offset_k((pud_t *)pgd_page(*pgd), address); - if (bad_address(pud)) goto bad; - printk("PUD %lx ", pud_val(*pud)); - if (!pud_present(*pud)) goto ret; - - pmd = pmd_offset(pud, address); - if (bad_address(pmd)) goto bad; - printk("PMD %lx ", pmd_val(*pmd)); - if (!pmd_present(*pmd)) goto ret; - - pte = pte_offset_kernel(pmd, address); - if (bad_address(pte)) goto bad; - printk("PTE %lx", pte_val(*pte)); -ret: - printk("\n"); - return; -bad: - printk("BAD\n"); -} - -static const char errata93_warning[] = -KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" -KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" -KERN_ERR "******* Please consider a BIOS update.\n" -KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; - -/* Workaround for K8 erratum #93 & buggy BIOS. - BIOS SMM functions are required to use a specific workaround - to avoid corruption of the 64bit RIP register on C stepping K8. - A lot of BIOS that didn't get tested properly miss this. - The OS sees this as a page fault with the upper 32bits of RIP cleared. - Try to work around it here. - Note we only handle faults in kernel here. */ - -static int is_errata93(struct pt_regs *regs, unsigned long address) -{ - static int warned; - if (address != regs->rip) - return 0; - if ((address >> 32) != 0) - return 0; - address |= 0xffffffffUL << 32; - if ((address >= (u64)_stext && address <= (u64)_etext) || - (address >= MODULES_VADDR && address <= MODULES_END)) { - if (!warned) { - printk(errata93_warning); - warned = 1; - } - regs->rip = address; - return 1; - } - return 0; -} - -int unhandled_signal(struct task_struct *tsk, int sig) -{ - if (tsk->pid == 1) - return 1; - /* Warn for strace, but not for gdb */ - if (!test_ti_thread_flag(tsk->thread_info, TIF_SYSCALL_TRACE) && - (tsk->ptrace & PT_PTRACED)) - return 0; - return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) || - (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL); -} - -static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, - unsigned long error_code) -{ - oops_begin(); - printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", - current->comm, address); - dump_pagetable(address); - __die("Bad pagetable", regs, error_code); - oops_end(); - do_exit(SIGKILL); -} - -/* - * Handle a fault on the vmalloc or module mapping area - */ -static int vmalloc_fault(unsigned long address) -{ - pgd_t *pgd, *pgd_ref; - pud_t *pud, *pud_ref; - pmd_t *pmd, *pmd_ref; - pte_t *pte, *pte_ref; - - /* Copy kernel mappings over when needed. This can also - happen within a race in page table update. In the later - case just flush. 
*/ - - pgd = pgd_offset(current->mm ?: &init_mm, address); - pgd_ref = pgd_offset_k(address); - if (pgd_none(*pgd_ref)) - return -1; - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - - /* Below here mismatches are bugs because these lower tables - are shared */ - - pud = pud_offset(pgd, address); - pud_ref = pud_offset(pgd_ref, address); - if (pud_none(*pud_ref)) - return -1; - if (pud_none(*pud) || pud_page(*pud) != pud_page(*pud_ref)) - BUG(); - pmd = pmd_offset(pud, address); - pmd_ref = pmd_offset(pud_ref, address); - if (pmd_none(*pmd_ref)) - return -1; - if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) - BUG(); - pte_ref = pte_offset_kernel(pmd_ref, address); - if (!pte_present(*pte_ref)) - return -1; - pte = pte_offset_kernel(pmd, address); - if (!pte_present(*pte) || pte_page(*pte) != pte_page(*pte_ref)) - BUG(); - __flush_tlb_all(); - return 0; -} - -int page_fault_trace = 0; -int exception_trace = 1; - - -#define MEM_VERBOSE 1 - -#ifdef MEM_VERBOSE -#define MEM_LOG(_f, _a...) \ - printk("fault.c:[%d]-> " _f "\n", \ - __LINE__ , ## _a ) -#else -#define MEM_LOG(_f, _a...) ((void)0) -#endif - -/* - * This routine handles page faults. It determines the address, - * and the problem, and then passes it off to one of the appropriate - * routines. - * - * error_code: - * bit 0 == 0 means no page found, 1 means protection fault - * bit 1 == 0 means read, 1 means write - * bit 2 == 0 means kernel, 1 means user-mode - * bit 3 == 1 means fault was an instruction fetch - */ -asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code, - unsigned long address) -{ - struct task_struct *tsk; - struct mm_struct *mm; - struct vm_area_struct * vma; - const struct exception_table_entry *fixup; - int write; - siginfo_t info; - - if (!user_mode(regs)) - error_code &= ~4; /* means kernel */ - -#ifdef CONFIG_CHECKING - { - unsigned long gs; - struct x8664_pda *pda = cpu_pda + stack_smp_processor_id(); - rdmsrl(MSR_GS_BASE, gs); - if (gs != (unsigned long)pda) { - wrmsrl(MSR_GS_BASE, pda); - printk("page_fault: wrong gs %lx expected %p\n", gs, pda); - } - } -#endif - if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14, - SIGSEGV) == NOTIFY_STOP) - return; - - if (likely(regs->eflags & X86_EFLAGS_IF)) - local_irq_enable(); - - if (unlikely(page_fault_trace)) - printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n", - regs->rip,regs->rsp,regs->cs,regs->ss,address,error_code); - - tsk = current; - mm = tsk->mm; - info.si_code = SEGV_MAPERR; - - - /* - * We fault-in kernel-space virtual memory on-demand. The - * 'reference' page table is init_mm.pgd. - * - * NOTE! We MUST NOT take any locks for this case. We may - * be in an interrupt or a critical region, and should - * only copy the information from the master page table, - * nothing more. - * - * This verifies that the fault happens in kernel space - * (error_code & 4) == 0, and that the fault was not a - * protection error (error_code & 1) == 0. - */ - if (unlikely(address >= TASK_SIZE)) { - if (!(error_code & 5)) { - if (vmalloc_fault(address) < 0) - goto bad_area_nosemaphore; - return; - } - /* - * Don't take the mm semaphore here. If we fixup a prefetch - * fault we could otherwise deadlock. - */ - goto bad_area_nosemaphore; - } - - if (unlikely(error_code & (1 << 3))) - pgtable_bad(address, regs, error_code); - - /* - * If we're in an interrupt or have no user - * context, we must not take the fault.. 
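 *
 * [Editorial worked example, not from the original: using the
 *  error_code legend above, a value of 6 (binary 110) decodes as a
 *  user-mode write to a not-present page, the common demand-paging
 *  case.]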
- */
-	if (unlikely(in_atomic() || !mm))
-		goto bad_area_nosemaphore;
-
- again:
-	/* When running in the kernel we expect faults to occur only to
-	 * addresses in user space.  All other faults represent errors in the
-	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
-	 * erroneous fault occurring in a code path which already holds mmap_sem
-	 * we will deadlock attempting to validate the fault against the
-	 * address space.  Luckily the kernel only validly references user
-	 * space from well defined areas of code, which are listed in the
-	 * exceptions table.
-	 *
-	 * As the vast majority of faults will be valid we will only perform
-	 * the source reference check when there is a possibility of a deadlock.
-	 * Attempt to lock the address space, if we cannot we then validate the
-	 * source.  If this is invalid we can skip the address space check,
-	 * thus avoiding the deadlock.
-	 */
-	if (!down_read_trylock(&mm->mmap_sem)) {
-		if ((error_code & 4) == 0 &&
-		    !search_exception_tables(regs->rip))
-			goto bad_area_nosemaphore;
-		down_read(&mm->mmap_sem);
-	}
-
-	vma = find_vma(mm, address);
-	if (!vma)
-		goto bad_area;
-	if (likely(vma->vm_start <= address))
-		goto good_area;
-	if (!(vma->vm_flags & VM_GROWSDOWN))
-		goto bad_area;
-	if (error_code & 4) {
-		// XXX: align red zone size with ABI
-		if (address + 128 < regs->rsp)
-			goto bad_area;
-	}
-	if (expand_stack(vma, address))
-		goto bad_area;
-/*
- * Ok, we have a good vm_area for this memory access, so
- * we can handle it..
- */
-good_area:
-	info.si_code = SEGV_ACCERR;
-	write = 0;
-	switch (error_code & 3) {
-		default:	/* 3: write, present */
-			/* fall through */
-		case 2:		/* write, not present */
-			if (!(vma->vm_flags & VM_WRITE))
-				goto bad_area;
-			write++;
-			break;
-		case 1:		/* read, present */
-			goto bad_area;
-		case 0:		/* read, not present */
-			if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
-				goto bad_area;
-	}
-
-	/*
-	 * If for any reason at all we couldn't handle the fault,
-	 * make sure we exit gracefully rather than endlessly redo
-	 * the fault.
-	 */
-	switch (handle_mm_fault(mm, vma, address, write)) {
-	case 1:
-		tsk->min_flt++;
-		break;
-	case 2:
-		tsk->maj_flt++;
-		break;
-	case 0:
-		goto do_sigbus;
-	default:
-		goto out_of_memory;
-	}
-
-	up_read(&mm->mmap_sem);
-	return;
-
-/*
- * Something tried to access memory that isn't in our memory map..
- * Fix it, but check if it's kernel or user first..
- */
-bad_area:
-	up_read(&mm->mmap_sem);
-
-bad_area_nosemaphore:
-
-#ifdef CONFIG_IA32_EMULATION
-	/* 32bit vsyscall. map on demand. */
-	if (test_thread_flag(TIF_IA32) &&
-	    address >= VSYSCALL32_BASE && address < VSYSCALL32_END) {
-		if (map_syscall32(mm, address) < 0)
-			goto out_of_memory2;
-		return;
-	}
-#endif
-
-	/* User mode accesses just cause a SIGSEGV */
-	if (error_code & 4) {
-		if (is_prefetch(regs, address, error_code))
-			return;
-
-		/* Work around K8 erratum #100: K8 in compat mode
-		   occasionally jumps to illegal addresses >4GB.  We
-		   catch this here in the page fault handler because
-		   these addresses are not reachable. Just detect this
-		   case and return.  Any code segment in LDT is
-		   compatibility mode. */
-		if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) &&
-		    (address >> 32))
-			return;
-
-		if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
-			printk(
-		       "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
-					tsk->pid > 1 ?
KERN_INFO : KERN_EMERG, - tsk->comm, tsk->pid, address, regs->rip, - regs->rsp, error_code); - } - - tsk->thread.cr2 = address; - /* Kernel addresses are always protection faults */ - tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; - info.si_signo = SIGSEGV; - info.si_errno = 0; - /* info.si_code has been set above */ - info.si_addr = (void __user *)address; - force_sig_info(SIGSEGV, &info, tsk); - return; - } - -no_context: - - /* Are we prepared to handle this kernel fault? */ - fixup = search_exception_tables(regs->rip); - if (fixup) { - regs->rip = fixup->fixup; - return; - } - - /* - * Hall of shame of CPU/BIOS bugs. - */ - - if (is_prefetch(regs, address, error_code)) - return; - - if (is_errata93(regs, address)) - return; - -/* - * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. - */ - - oops_begin(); - - if (address < PAGE_SIZE) - printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); - else - printk(KERN_ALERT "Unable to handle kernel paging request"); - printk(" at %016lx RIP: \n" KERN_ALERT,address); - printk_address(regs->rip); - printk("\n"); - dump_pagetable(address); - __die("Oops", regs, error_code); - /* Executive summary in case the body of the oops scrolled away */ - printk(KERN_EMERG "CR2: %016lx\n", address); - oops_end(); - do_exit(SIGKILL); - -/* - * We ran out of memory, or some other thing happened to us that made - * us unable to handle the page fault gracefully. - */ -out_of_memory: - up_read(&mm->mmap_sem); -out_of_memory2: - if (current->pid == 1) { - yield(); - goto again; - } - printk("VM: killing process %s\n", tsk->comm); - if (error_code & 4) - do_exit(SIGKILL); - goto no_context; - -do_sigbus: - up_read(&mm->mmap_sem); - - /* Kernel mode? Handle exceptions or die */ - if (!(error_code & 4)) - goto no_context; - - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; - info.si_signo = SIGBUS; - info.si_errno = 0; - info.si_code = BUS_ADRERR; - info.si_addr = (void __user *)address; - force_sig_info(SIGBUS, &info, tsk); - return; -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/mm/init.c --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/mm/init.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,966 +0,0 @@ -/* - * linux/arch/x86_64/mm/init.c - * - * Copyright (C) 1995 Linus Torvalds - * Copyright (C) 2000 Pavel Machek <pavel@xxxxxxx> - * Copyright (C) 2002,2003 Andi Kleen <ak@xxxxxxx> - * - * Jun Nakajima <jun.nakajima@xxxxxxxxx> - * Modified for Xen. - */ - -#include <linux/config.h> -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/ptrace.h> -#include <linux/mman.h> -#include <linux/mm.h> -#include <linux/swap.h> -#include <linux/smp.h> -#include <linux/init.h> -#include <linux/pagemap.h> -#include <linux/bootmem.h> -#include <linux/proc_fs.h> - -#include <asm/processor.h> -#include <asm/system.h> -#include <asm/uaccess.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> -#include <asm/dma.h> -#include <asm/fixmap.h> -#include <asm/e820.h> -#include <asm/apic.h> -#include <asm/tlb.h> -#include <asm/mmu_context.h> -#include <asm/proto.h> -#include <asm/smp.h> - -#ifndef Dprintk -#define Dprintk(x...) 
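/*
 * Editorial note, not in the original file: Dprintk() expands to
 * nothing by default; defining Dprintk as printk ahead of this
 * #ifndef (for example on the compiler command line) would turn on
 * the debug tracing used later in this file.
 */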
-#endif
-
-#ifdef CONFIG_GART_IOMMU
-extern int swiotlb;
-#endif
-
-extern char _stext[];
-
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-extern unsigned long start_pfn;
-
-static int init_mapping_done;
-
-/*
- * Use this until direct mapping is established, i.e. before __va() is
- * available in init_memory_mapping().
- */
-
-#define addr_to_page(addr, page)				\
-	(addr) &= PHYSICAL_PAGE_MASK;				\
-	(page) = ((unsigned long *) ((unsigned long)(((mfn_to_pfn((addr) >> PAGE_SHIFT)) << PAGE_SHIFT) + __START_KERNEL_map)))
-
-static void __make_page_readonly(unsigned long va)
-{
-	unsigned long addr;
-	pte_t *pte;
-	unsigned long *page = (unsigned long *) init_level4_pgt;
-
-	addr = (unsigned long) page[pgd_index(va)];
-	addr_to_page(addr, page);
-
-	addr = page[pud_index(va)];
-	addr_to_page(addr, page);
-
-	addr = page[pmd_index(va)];
-	addr_to_page(addr, page);
-
-	pte = (pte_t *) &page[pte_index(va)];
-	xen_l1_entry_update(pte, (*(unsigned long*)pte) & ~_PAGE_RW);
-	__flush_tlb_one(addr);
-}
-
-static void __make_page_writable(unsigned long va)
-{
-	unsigned long addr;
-	pte_t *pte;
-	unsigned long *page = (unsigned long *) init_level4_pgt;
-
-	addr = (unsigned long) page[pgd_index(va)];
-	addr_to_page(addr, page);
-
-	addr = page[pud_index(va)];
-	addr_to_page(addr, page);
-
-	addr = page[pmd_index(va)];
-	addr_to_page(addr, page);
-
-	pte = (pte_t *) &page[pte_index(va)];
-	xen_l1_entry_update(pte, (*(unsigned long*)pte)| _PAGE_RW);
-	__flush_tlb_one(addr);
-}
-
-
-/*
- * Assume the translation is already established.
- */
-void make_page_readonly(void *va)
-{
-	pgd_t* pgd; pud_t *pud; pmd_t* pmd; pte_t *pte;
-	unsigned long addr = (unsigned long) va;
-
-	if (!init_mapping_done) {
-		__make_page_readonly(addr);
-		return;
-	}
-
-	pgd = pgd_offset_k(addr);
-	pud = pud_offset(pgd, addr);
-	pmd = pmd_offset(pud, addr);
-	pte = pte_offset_kernel(pmd, addr);
-	xen_l1_entry_update(pte, (*(unsigned long*)pte)&~_PAGE_RW);
-	__flush_tlb_one(addr);
-}
-
-void make_page_writable(void *va)
-{
-	pgd_t* pgd; pud_t *pud; pmd_t* pmd; pte_t *pte;
-	unsigned long addr = (unsigned long) va;
-
-	if (!init_mapping_done) {
-		__make_page_writable(addr);
-		return;
-	}
-
-	pgd = pgd_offset_k(addr);
-	pud = pud_offset(pgd, addr);
-	pmd = pmd_offset(pud, addr);
-	pte = pte_offset_kernel(pmd, addr);
-	xen_l1_entry_update(pte, (*(unsigned long*)pte)|_PAGE_RW);
-	__flush_tlb_one(addr);
-}
-
-void make_pages_readonly(void* va, unsigned nr)
-{
-	while ( nr-- != 0 ) {
-		make_page_readonly(va);
-		va = (void*)((unsigned long)va + PAGE_SIZE);
-	}
-}
-
-void make_pages_writable(void* va, unsigned nr)
-{
-	while ( nr-- != 0 ) {
-		make_page_writable(va);
-		va = (void*)((unsigned long)va + PAGE_SIZE);
-	}
-}
-
-/*
- * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
- * in physical space so we can cache the place of the first one and move
- * around without checking the pgd every time.
- */ - -void show_mem(void) -{ - int i, total = 0, reserved = 0; - int shared = 0, cached = 0; - pg_data_t *pgdat; - struct page *page; - - printk("Mem-info:\n"); - show_free_areas(); - printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); - - for_each_pgdat(pgdat) { - for (i = 0; i < pgdat->node_spanned_pages; ++i) { - page = pfn_to_page(pgdat->node_start_pfn + i); - total++; - if (PageReserved(page)) - reserved++; - else if (PageSwapCache(page)) - cached++; - else if (page_count(page)) - shared += page_count(page) - 1; - } - } - printk("%d pages of RAM\n", total); - printk("%d reserved pages\n",reserved); - printk("%d pages shared\n",shared); - printk("%d pages swap cached\n",cached); -} - -/* References to section boundaries */ - -extern char _text, _etext, _edata, __bss_start, _end[]; -extern char __init_begin, __init_end; - -int after_bootmem; - -static void *spp_getpage(void) -{ - void *ptr; - if (after_bootmem) - ptr = (void *) get_zeroed_page(GFP_ATOMIC); - else - ptr = alloc_bootmem_pages(PAGE_SIZE); - if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) - panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":""); - - Dprintk("spp_getpage %p\n", ptr); - return ptr; -} - -#define pgd_offset_u(address) (pgd_t *)(init_level4_user_pgt + pgd_index(address)) - -static inline pud_t *pud_offset_u(unsigned long address) -{ - pud_t *pud = level3_user_pgt; - - return pud + pud_index(address); -} - -static void set_pte_phys(unsigned long vaddr, - unsigned long phys, pgprot_t prot, int user_mode) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte, new_pte; - - Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys); - - pgd = (user_mode ? pgd_offset_u(vaddr) : pgd_offset_k(vaddr)); - - if (pgd_none(*pgd)) { - printk("PGD FIXMAP MISSING, it should be setup in head.S!\n"); - return; - } - - pud = (user_mode ? pud_offset_u(vaddr) : pud_offset(pgd, vaddr)); - - if (pud_none(*pud)) { - pmd = (pmd_t *) spp_getpage(); - - make_page_readonly(pmd); - xen_pmd_pin(__pa(pmd)); - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); - if (pmd != pmd_offset(pud, 0)) { - printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0)); - return; - } - } - - pmd = pmd_offset(pud, vaddr); - - if (pmd_none(*pmd)) { - pte = (pte_t *) spp_getpage(); - make_page_readonly(pte); - - xen_pte_pin(__pa(pte)); - set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); - if (pte != pte_offset_kernel(pmd, 0)) { - printk("PAGETABLE BUG #02!\n"); - return; - } - } - new_pte = pfn_pte(phys >> PAGE_SHIFT, prot); - - pte = pte_offset_kernel(pmd, vaddr); - - if (!pte_none(*pte) && - pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask)) - pte_ERROR(*pte); - xen_l1_entry_update(pte, new_pte.pte); - - /* - * It's enough to flush this one mapping. - * (PGE mappings get flushed as well) - */ - __flush_tlb_one(vaddr); -} - -static void set_pte_phys_ma(unsigned long vaddr, - unsigned long phys, pgprot_t prot) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte, new_pte; - - Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys); - - pgd = pgd_offset_k(vaddr); - if (pgd_none(*pgd)) { - printk("PGD FIXMAP MISSING, it should be setup in head.S!\n"); - return; - } - pud = pud_offset(pgd, vaddr); - if (pud_none(*pud)) { - - pmd = (pmd_t *) spp_getpage(); - make_page_readonly(pmd); - xen_pmd_pin(__pa(pmd)); - - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); - - if (pmd != pmd_offset(pud, 0)) { - printk("PAGETABLE BUG #01! 
%p <-> %p\n", pmd, pmd_offset(pud,0)); - return; - } - } - pmd = pmd_offset(pud, vaddr); - - if (pmd_none(*pmd)) { - pte = (pte_t *) spp_getpage(); - make_page_readonly(pte); - xen_pte_pin(__pa(pte)); - - set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); - if (pte != pte_offset_kernel(pmd, 0)) { - printk("PAGETABLE BUG #02!\n"); - return; - } - } - - new_pte = pfn_pte_ma(phys >> PAGE_SHIFT, prot); - pte = pte_offset_kernel(pmd, vaddr); - - if (!pte_none(*pte) && - pte_val_ma(*pte) != (pte_val_ma(new_pte) & __supported_pte_mask)) - pte_ERROR(*pte); - - /* - * Note that the pte page is already RO, thus we want to use - * xen_l1_entry_update(), not set_pte(). - */ - xen_l1_entry_update(pte, - (pfn_pte_ma(phys >> PAGE_SHIFT, prot).pte)); - - /* - * It's enough to flush this one mapping. - * (PGE mappings get flushed as well) - */ - __flush_tlb_one(vaddr); -} - -#define SET_FIXMAP_KERNEL 0 -#define SET_FIXMAP_USER 1 - -/* NOTE: this is meant to be run only at boot */ -void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot) -{ - unsigned long address = __fix_to_virt(idx); - - if (idx >= __end_of_fixed_addresses) { - printk("Invalid __set_fixmap\n"); - return; - } - switch (idx) { - case VSYSCALL_FIRST_PAGE: - set_pte_phys(address, phys, prot, SET_FIXMAP_KERNEL); - break; - default: - set_pte_phys_ma(address, phys, prot); - break; - } -} - - -/* - * At this point it only supports vsyscall area. - */ -void __set_fixmap_user (enum fixed_addresses idx, unsigned long phys, pgprot_t prot) -{ - unsigned long address = __fix_to_virt(idx); - - if (idx >= __end_of_fixed_addresses) { - printk("Invalid __set_fixmap\n"); - return; - } - - set_pte_phys(address, phys, prot, SET_FIXMAP_USER); -} - -unsigned long __initdata table_start, table_end, tables_space; - -unsigned long get_machine_pfn(unsigned long addr) -{ - pud_t* pud = pud_offset_k(addr); - pmd_t* pmd = pmd_offset(pud, addr); - pte_t *pte = pte_offset_kernel(pmd, addr); - - return pte_mfn(*pte); -} - -#define ALIGN_TO_4K __attribute__((section(".data.page_aligned"))) -#define MAX_LOW_PAGES 0x20 -static unsigned long __init_pgt[MAX_LOW_PAGES][512] ALIGN_TO_4K; -static int __init_pgt_index; - -/* - * We start using from start_pfn - */ -static __init void *alloc_static_page(unsigned long *phys) -{ - int i = __init_pgt_index++; - - if (__init_pgt_index >= MAX_LOW_PAGES) { - printk("Need to increase MAX_LOW_PAGES"); - BUG(); - } - - *phys = __pa(__init_pgt[i]); - - return (void *) __init_pgt[i]; -} - -/* - * Get RO page - */ -static void __init *alloc_low_page(unsigned long *phys) -{ - unsigned long pfn = table_end++; - - *phys = (pfn << PAGE_SHIFT); - memset((void *) ((pfn << PAGE_SHIFT) + __START_KERNEL_map), 0, PAGE_SIZE); - return (void *)((pfn << PAGE_SHIFT) + __START_KERNEL_map); -} - -#define PTE_SIZE PAGE_SIZE - -static inline void __set_pte(pte_t *dst, pte_t val) -{ - *dst = val; -} - -void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) -{ - long i, j, k; - unsigned long paddr; - - i = pud_index(address); - pud = pud + i; - - for (; i < PTRS_PER_PUD; pud++, i++) { - unsigned long pmd_phys; - pmd_t *pmd; - - paddr = address + i*PUD_SIZE; - if (paddr >= end) { - for (; i < PTRS_PER_PUD; i++, pud++) - set_pud(pud, __pud(0)); - break; - } - - pmd = alloc_low_page(&pmd_phys); - make_page_readonly(pmd); - xen_pmd_pin(pmd_phys); - set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); - - for (j = 0; j < PTRS_PER_PMD; pmd++, j++) { - unsigned long pte_phys; - pte_t *pte, *pte_save; - - if (paddr >= 
end) { - for (; j < PTRS_PER_PMD; j++, pmd++) - set_pmd(pmd, __pmd(0)); - break; - } - pte = alloc_low_page(&pte_phys); - pte_save = pte; - for (k = 0; k < PTRS_PER_PTE; pte++, k++, paddr += PTE_SIZE) { - if (paddr < (table_start << PAGE_SHIFT) - + tables_space) - { - __set_pte(pte, - __pte(paddr | (_KERNPG_TABLE & ~_PAGE_RW))); - continue; - } - if (paddr >= end) { - for (; k < PTRS_PER_PTE; k++, pte++) - __set_pte(pte, __pte(0)); - break; - } - __set_pte(pte, __pte(paddr | _KERNPG_TABLE)); - } - pte = pte_save; - make_page_readonly(pte); - xen_pte_pin(pte_phys); - set_pmd(pmd, __pmd(pte_phys | _KERNPG_TABLE)); - } - } - __flush_tlb(); -} - -static void __init find_early_table_space(unsigned long end) -{ - unsigned long puds, pmds, ptes; - - puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; - pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; - ptes = (end + PTE_SIZE - 1) >> PAGE_SHIFT; - - tables_space = round_up(puds * 8, PAGE_SIZE) + - round_up(pmds * 8, PAGE_SIZE) + - round_up(ptes * 8, PAGE_SIZE); -} - - -/* - * Extend kernel mapping to access pages for page tables. The initial - * mapping done by Xen is minimal (e.g. 8MB) and we need to extend the - * mapping for early initialization. - */ - -#define MIN_INIT_SIZE 0x800000 -static unsigned long current_size, extended_size; - -void __init extend_init_mapping(void) -{ - unsigned long va = __START_KERNEL_map; - unsigned long addr, *pte_page; - - unsigned long phys; - pmd_t *pmd; - pte_t *pte, new_pte; - unsigned long *page = (unsigned long *) init_level4_pgt; - int i; - - addr = (unsigned long) page[pgd_index(va)]; - addr_to_page(addr, page); - - addr = page[pud_index(va)]; - addr_to_page(addr, page); - - for (;;) { - pmd = (pmd_t *) &page[pmd_index(va)]; - if (pmd_present(*pmd)) { - /* - * if pmd is valid, check pte. - */ - addr = page[pmd_index(va)]; - addr_to_page(addr, pte_page); - - for (i = 0; i < PTRS_PER_PTE; i++) { - pte = (pte_t *) &pte_page[pte_index(va)]; - - if (pte_present(*pte)) { - va += PAGE_SIZE; - current_size += PAGE_SIZE; - } else - break; - } - - } else - break; - } - - for (; va < __START_KERNEL_map + current_size + tables_space; ) { - pmd = (pmd_t *) &page[pmd_index(va)]; - - if (pmd_none(*pmd)) { - pte_page = (unsigned long *) alloc_static_page(&phys); - make_page_readonly(pte_page); - xen_pte_pin(phys); - set_pmd(pmd, __pmd(phys | _KERNPG_TABLE | _PAGE_USER)); - - for (i = 0; i < PTRS_PER_PTE; i++, va += PAGE_SIZE) { - new_pte = pfn_pte((va - __START_KERNEL_map) >> PAGE_SHIFT, - __pgprot(_KERNPG_TABLE | _PAGE_USER)); - - pte = (pte_t *) &pte_page[pte_index(va)]; - xen_l1_entry_update(pte, new_pte.pte); - extended_size += PAGE_SIZE; - } - } - } -} - - -/* Setup the direct mapping of the physical memory at PAGE_OFFSET. - This runs before bootmem is initialized and gets pages directly from the - physical memory. To access them they are temporarily mapped. 
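 *
 * As a worked illustration of the find_early_table_space() sizing
 * above (numbers invented, not from the original source): for
 * end = 1 GiB we need 1 PUD entry, 512 PMD entries and 2^18 PTE
 * entries, so
 *
 *	tables_space = round_up(1 * 8, PAGE_SIZE)
 *		     + round_up(512 * 8, PAGE_SIZE)
 *		     + round_up(262144 * 8, PAGE_SIZE)
 *		     = 4 KiB + 4 KiB + 2 MiB,
 *
 * which is what extend_init_mapping() maps on top of the minimal
 * Xen-built mapping before the loop below starts allocating from it.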
*/ -void __init init_memory_mapping(unsigned long start, unsigned long end) -{ - unsigned long next; - - Dprintk("init_memory_mapping\n"); - - find_early_table_space(end); - extend_init_mapping(); - start_pfn = current_size >> PAGE_SHIFT; - - table_start = start_pfn; - table_end = table_start; - - start = (unsigned long)__va(start); - end = (unsigned long)__va(end); - - for (; start < end; start = next) { - unsigned long pud_phys; - pud_t *pud = alloc_low_page(&pud_phys); - make_page_readonly(pud); - xen_pud_pin(pud_phys); - next = start + PGDIR_SIZE; - if (next > end) - next = end; - phys_pud_init(pud, __pa(start), __pa(next)); - set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); - } - - printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end, - table_start<<PAGE_SHIFT, - table_end<<PAGE_SHIFT); - - start_pfn = ((current_size + extended_size) >> PAGE_SHIFT); - - /* - * TBD: Need to calculate at runtime - */ - - __flush_tlb_all(); - init_mapping_done = 1; -} - -extern struct x8664_pda cpu_pda[NR_CPUS]; - -void zap_low_mappings(void) -{ - /* this is not required for Xen */ -#if 0 - swap_low_mappings(); -#endif -} - -#ifndef CONFIG_DISCONTIGMEM -void __init paging_init(void) -{ - int i; - - { - unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0}; - /* unsigned int max_dma; */ - /* max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; */ - /* if (end_pfn < max_dma) */ - zones_size[ZONE_DMA] = end_pfn; -#if 0 - else { - zones_size[ZONE_DMA] = max_dma; - zones_size[ZONE_NORMAL] = end_pfn - max_dma; - } -#endif - free_area_init(zones_size); - } - - set_fixmap(FIX_SHARED_INFO, xen_start_info.shared_info); - HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); - - memset(empty_zero_page, 0, sizeof(empty_zero_page)); - -#ifdef CONFIG_XEN_PHYSDEV_ACCESS - /* Setup mapping of lower 1st MB */ - for (i = 0; i < NR_FIX_ISAMAPS; i++) - if (xen_start_info.flags & SIF_PRIVILEGED) - set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE); - else - __set_fixmap(FIX_ISAMAP_BEGIN - i, - virt_to_machine(empty_zero_page), - PAGE_KERNEL_RO); -#endif - -} -#endif - -/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches - from the CPU leading to inconsistent cache lines. address and size - must be aligned to 2MB boundaries. - Does nothing when the mapping doesn't exist. */ -void __init clear_kernel_mapping(unsigned long address, unsigned long size) -{ - unsigned long end = address + size; - - BUG_ON(address & ~LARGE_PAGE_MASK); - BUG_ON(size & ~LARGE_PAGE_MASK); - - for (; address < end; address += LARGE_PAGE_SIZE) { - pgd_t *pgd = pgd_offset_k(address); - pud_t *pud; - pmd_t *pmd; - if (pgd_none(*pgd)) - continue; - pud = pud_offset(pgd, address); - if (pud_none(*pud)) - continue; - pmd = pmd_offset(pud, address); - if (!pmd || pmd_none(*pmd)) - continue; - if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { - /* Could handle this, but it should not happen currently. */ - printk(KERN_ERR - "clear_kernel_mapping: mapping has been split. 
will leak memory\n"); - pmd_ERROR(*pmd); - } - set_pmd(pmd, __pmd(0)); - } - __flush_tlb_all(); -} - -static inline int page_is_ram (unsigned long pagenr) -{ - if (pagenr < start_pfn || pagenr >= end_pfn) - return 0; - - return 1; -} - -extern int swiotlb_force; - -static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, - kcore_vsyscall; - -void __init mem_init(void) -{ - int codesize, reservedpages, datasize, initsize; - int tmp; - -#ifdef CONFIG_SWIOTLB - if (swiotlb_force) - swiotlb = 1; - if (!iommu_aperture && - (end_pfn >= 0xffffffff>>PAGE_SHIFT || force_iommu)) - swiotlb = 1; - if (swiotlb) - swiotlb_init(); -#endif - - /* How many end-of-memory variables you have, grandma! */ - max_low_pfn = end_pfn; - max_pfn = end_pfn; - num_physpages = end_pfn; - high_memory = (void *) __va(end_pfn * PAGE_SIZE); - - /* clear the zero-page */ - memset(empty_zero_page, 0, PAGE_SIZE); - - reservedpages = 0; - - /* this will put all low memory onto the freelists */ -#ifdef CONFIG_DISCONTIGMEM - totalram_pages += numa_free_all_bootmem(); - tmp = 0; - /* should count reserved pages here for all nodes */ -#else - max_mapnr = end_pfn; - if (!mem_map) BUG(); - - totalram_pages += free_all_bootmem(); - - for (tmp = 0; tmp < end_pfn; tmp++) - /* - * Only count reserved RAM pages - */ - if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) - reservedpages++; -#endif - - after_bootmem = 1; - - codesize = (unsigned long) &_etext - (unsigned long) &_text; - datasize = (unsigned long) &_edata - (unsigned long) &_etext; - initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; - - /* Register memory areas for /proc/kcore */ - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, - VMALLOC_END-VMALLOC_START); - kclist_add(&kcore_kernel, &_stext, _end - _stext); - kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN); - kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, - VSYSCALL_END - VSYSCALL_START); - - printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init)\n", - (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), - end_pfn << (PAGE_SHIFT-10), - codesize >> 10, - reservedpages << (PAGE_SHIFT-10), - datasize >> 10, - initsize >> 10); - - /* - * Subtle. SMP is doing its boot stuff late (because it has to - * fork idle threads) - but it also needs low mappings for the - * protected-mode entry to work. We zap these entries only after - * the WP-bit has been tested. - */ -#ifndef CONFIG_SMP - zap_low_mappings(); -#endif -} - -extern char __initdata_begin[], __initdata_end[]; - -void free_initmem(void) -{ -#ifdef __DO_LATER__ - /* - * Some pages can be pinned, but some are not. Unpinning such pages - * triggers BUG(). 
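 *
 * (A plausible reading of the caveat above: Xen refuses to unpin a
 * page that was never pinned as a page table, so unconditionally
 * calling xen_pte_unpin() on every init page can BUG. A safe variant
 * would need to track which init pages were actually pinned, e.g. a
 * hypothetical per-page flag set wherever xen_pte_pin() is issued,
 * and skip the rest.)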
- */ - unsigned long addr; - - addr = (unsigned long)(&__init_begin); - for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { - ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); - memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); - xen_pte_unpin(__pa(addr)); - make_page_writable(__va(__pa(addr))); - /* - * Make pages from __PAGE_OFFSET address as well - */ - make_page_writable((void *)addr); - free_page(addr); - totalram_pages++; - } - memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin); - printk ("Freeing unused kernel memory: %luk freed\n", (&__init_end - &__init_begin) >> 10); -#endif -} - -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - if (start < (unsigned long)&_end) - return; - printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); - for (; start < end; start += PAGE_SIZE) { - ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); - free_page(start); - totalram_pages++; - } -} -#endif - -void __init reserve_bootmem_generic(unsigned long phys, unsigned len) -{ - /* Should check here against the e820 map to avoid double free */ -#ifdef CONFIG_DISCONTIGMEM - int nid = phys_to_nid(phys); - reserve_bootmem_node(NODE_DATA(nid), phys, len); -#else - reserve_bootmem(phys, len); -#endif -} - -int kern_addr_valid(unsigned long addr) -{ - unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - if (above != 0 && above != -1UL) - return 0; - - pgd = pgd_offset_k(addr); - if (pgd_none(*pgd)) - return 0; - - pud = pud_offset_k(addr); - if (pud_none(*pud)) - return 0; - - pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) - return 0; - if (pmd_large(*pmd)) - return pfn_valid(pmd_pfn(*pmd)); - - pte = pte_offset_kernel(pmd, addr); - if (pte_none(*pte)) - return 0; - return pfn_valid(pte_pfn(*pte)); -} - -#ifdef CONFIG_SYSCTL -#include <linux/sysctl.h> - -extern int exception_trace, page_fault_trace; - -static ctl_table debug_table2[] = { - { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL, - proc_dointvec }, -#ifdef CONFIG_CHECKING - { 100, "page-fault-trace", &page_fault_trace, sizeof(int), 0644, NULL, - proc_dointvec }, -#endif - { 0, } -}; - -static ctl_table debug_root_table2[] = { - { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555, - .child = debug_table2 }, - { 0 }, -}; - -static __init int x8664_sysctl_init(void) -{ - register_sysctl_table(debug_root_table2, 1); - return 0; -} -__initcall(x8664_sysctl_init); -#endif - -/* Pseudo VMAs to allow ptrace access for the vsyscall pages. x86-64 has two - different ones: one for 32bit and one for 64bit. Use the appropiate - for the target task. */ - -static struct vm_area_struct gate_vma = { - .vm_start = VSYSCALL_START, - .vm_end = VSYSCALL_END, - .vm_page_prot = PAGE_READONLY -}; - -static struct vm_area_struct gate32_vma = { - .vm_start = VSYSCALL32_BASE, - .vm_end = VSYSCALL32_END, - .vm_page_prot = PAGE_READONLY -}; - -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) -{ -#ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(tsk, TIF_IA32)) { - /* lookup code assumes the pages are present. 
set them up - now */ - if (__map_syscall32(tsk->mm, VSYSCALL32_BASE) < 0) - return NULL; - return &gate32_vma; - } -#endif - return &gate_vma; -} - -int in_gate_area(struct task_struct *task, unsigned long addr) -{ - struct vm_area_struct *vma = get_gate_vma(task); - return (addr >= vma->vm_start) && (addr < vma->vm_end); -} - -/* Use this when you have no reliable task/vma, typically from interrupt - * context. It is less reliable than using the task's vma and may give - * false positives. - */ -int in_gate_area_no_task(unsigned long addr) -{ - return (((addr >= VSYSCALL_START) && (addr < VSYSCALL_END)) || - ((addr >= VSYSCALL32_BASE) && (addr < VSYSCALL32_END))); -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/mm/ioremap.c --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/mm/ioremap.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,466 +0,0 @@ -/* - * arch/x86_64/mm/ioremap.c - * - * Re-map IO memory to kernel address space so that we can access it. - * This is needed for high PCI addresses that aren't mapped in the - * 640k-1MB IO memory area on PC's - * - * (C) Copyright 1995 1996 Linus Torvalds - */ - -#include <linux/vmalloc.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <asm/io.h> -#include <asm/fixmap.h> -#include <asm/cacheflush.h> -#include <asm/tlbflush.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> - -/* - * Reuse arch/xen/i396/mm/ioremap.c. Need to merge later - */ -#ifndef CONFIG_XEN_PHYSDEV_ACCESS - -void * __ioremap(unsigned long phys_addr, unsigned long size, - unsigned long flags) -{ - return NULL; -} - -void *ioremap_nocache (unsigned long phys_addr, unsigned long size) -{ - return NULL; -} - -void iounmap(volatile void __iomem *addr) -{ -} - -void __init *bt_ioremap(unsigned long phys_addr, unsigned long size) -{ - return NULL; -} - -void __init bt_iounmap(void *addr, unsigned long size) -{ -} - -#else - -#if defined(__i386__) -/* - * Does @address reside within a non-highmem page that is local to this virtual - * machine (i.e., not an I/O page, nor a memory page belonging to another VM). - * See the comment that accompanies pte_pfn() in pgtable-2level.h to understand - * why this works. - */ -static inline int is_local_lowmem(unsigned long address) -{ - extern unsigned long max_low_pfn; - unsigned long mfn = address >> PAGE_SHIFT; - unsigned long pfn = mfn_to_pfn(mfn); - return ((pfn < max_low_pfn) && (pfn_to_mfn(pfn) == mfn)); -} -#elif defined(__x86_64__) -/* - * - */ -static inline int is_local_lowmem(unsigned long address) -{ - return 0; -} -#endif - -/* - * Generic mapping function (not visible outside): - */ - -/* - * Remap an arbitrary physical address space into the kernel virtual - * address space. Needed when the kernel wants to access high addresses - * directly. - * - * NOTE! We need to allow non-page-aligned mappings too: we will obviously - * have to convert them into an offset in a page-aligned mapping, but the - * caller shouldn't need to know that small detail. - */ -void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags) -{ - void __iomem * addr; - struct vm_struct * area; - unsigned long offset, last_addr; - domid_t domid = DOMID_IO; - - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) - return NULL; - -#ifdef CONFIG_XEN_PRIVILEGED_GUEST - /* - * Don't remap the low PCI/ISA area, it's always mapped.. 
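 *
 * (For a privileged guest the first megabyte is kept permanently
 * mapped through the FIX_ISAMAP fixmap slots, set up in paging_init()
 * earlier in this patch, so e.g. an __ioremap(0xB8000, 0x1000, 0) of
 * the VGA text buffer -- values purely illustrative -- allocates no
 * vm_struct and simply returns isa_bus_to_virt(0xB8000).)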
- */ - if (phys_addr >= 0x0 && last_addr < 0x100000) - return isa_bus_to_virt(phys_addr); -#endif - - /* - * Don't allow anybody to remap normal RAM that we're using.. - */ - if (is_local_lowmem(phys_addr)) { - char *t_addr, *t_end; - struct page *page; - - t_addr = bus_to_virt(phys_addr); - t_end = t_addr + (size - 1); - - for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) - if(!PageReserved(page)) - return NULL; - - domid = DOMID_LOCAL; - } - - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr+1) - phys_addr; - - /* - * Ok, go for it.. - */ - area = get_vm_area(size, VM_IOREMAP | (flags << 20)); - if (!area) - return NULL; - area->phys_addr = phys_addr; - addr = (void __iomem *) area->addr; - if (direct_remap_area_pages(&init_mm, (unsigned long) addr, phys_addr, - size, __pgprot(_PAGE_PRESENT | _PAGE_RW | - _PAGE_DIRTY | _PAGE_ACCESSED -#if defined(__x86_64__) - | _PAGE_USER -#endif - | flags), domid)) { - vunmap((void __force *) addr); - return NULL; - } - return (void __iomem *) (offset + (char __iomem *)addr); -} - - -/** - * ioremap_nocache - map bus memory into CPU space - * @offset: bus address of the memory - * @size: size of the resource to map - * - * ioremap_nocache performs a platform specific sequence of operations to - * make bus memory CPU accessible via the readb/readw/readl/writeb/ - * writew/writel functions and the other mmio helpers. The returned - * address is not guaranteed to be usable directly as a virtual - * address. - * - * This version of ioremap ensures that the memory is marked uncachable - * on the CPU as well as honouring existing caching rules from things like - * the PCI bus. Note that there are other caches and buffers on many - * busses. In particular driver authors should read up on PCI writes - * - * It's useful if some control registers are in such an area and - * write combining or read caching is not desirable: - * - * Must be freed with iounmap. - */ - -void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) -{ - unsigned long last_addr; - void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD); - if (!p) - return p; - - /* Guaranteed to be > phys_addr, as per __ioremap() */ - last_addr = phys_addr + size - 1; - - if (is_local_lowmem(last_addr)) { - struct page *ppage = virt_to_page(bus_to_virt(phys_addr)); - unsigned long npages; - - phys_addr &= PAGE_MASK; - - /* This might overflow and become zero.. */ - last_addr = PAGE_ALIGN(last_addr); - - /* .. but that's ok, because modulo-2**n arithmetic will make - * the page-aligned "last - first" come out right. 
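 *
 * Illustrative overflow case (invented numbers, 32-bit arithmetic for
 * brevity): phys_addr = 0xffffe000 and size = 0x2000 give
 * last_addr = 0xffffffff; PAGE_ALIGN() wraps that to 0, yet
 * (0 - 0xffffe000) mod 2^32 = 0x2000, so npages below still comes
 * out as 2.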
- */ - npages = (last_addr - phys_addr) >> PAGE_SHIFT; - - if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) { - iounmap(p); - p = NULL; - } - global_flush_tlb(); - } - - return p; -} - -void iounmap(volatile void __iomem *addr) -{ - struct vm_struct *p; - if ((void __force *) addr <= high_memory) - return; -#ifdef CONFIG_XEN_PRIVILEGED_GUEST - if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN)) - return; -#endif - p = remove_vm_area((void *) (PAGE_MASK & (unsigned long __force) addr)); - if (!p) { - printk("__iounmap: bad address %p\n", addr); - return; - } - - if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) { - /* p->size includes the guard page, but cpa doesn't like that */ - change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)), - (p->size - PAGE_SIZE) >> PAGE_SHIFT, - PAGE_KERNEL); - global_flush_tlb(); - } - kfree(p); -} - -#if defined(__i386__) -void __init *bt_ioremap(unsigned long phys_addr, unsigned long size) -{ - unsigned long offset, last_addr; - unsigned int nrpages; - enum fixed_addresses idx; - - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) - return NULL; - -#ifdef CONFIG_XEN_PRIVILEGED_GUEST - /* - * Don't remap the low PCI/ISA area, it's always mapped.. - */ - if (phys_addr >= 0x0 && last_addr < 0x100000) - return isa_bus_to_virt(phys_addr); -#endif - - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr) - phys_addr; - - /* - * Mappings have to fit in the FIX_BTMAP area. - */ - nrpages = size >> PAGE_SHIFT; - if (nrpages > NR_FIX_BTMAPS) - return NULL; - - /* - * Ok, go for it.. - */ - idx = FIX_BTMAP_BEGIN; - while (nrpages > 0) { - set_fixmap(idx, phys_addr); - phys_addr += PAGE_SIZE; - --idx; - --nrpages; - } - return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN)); -} - -void __init bt_iounmap(void *addr, unsigned long size) -{ - unsigned long virt_addr; - unsigned long offset; - unsigned int nrpages; - enum fixed_addresses idx; - - virt_addr = (unsigned long)addr; - if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) - return; -#ifdef CONFIG_XEN_PRIVILEGED_GUEST - if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN)) - return; -#endif - offset = virt_addr & ~PAGE_MASK; - nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; - - idx = FIX_BTMAP_BEGIN; - while (nrpages > 0) { - clear_fixmap(idx); - --idx; - --nrpages; - } -} -#endif /* defined(__i386__) */ - -#endif /* CONFIG_XEN_PHYSDEV_ACCESS */ - -/* These hacky macros avoid phys->machine translations. 
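 *
 * pfn_pte() and friends would push the frame number back through the
 * phys-to-machine table, but callers of direct_remap_area_pages()
 * already pass machine addresses, so the PTE value must be assembled
 * with plain arithmetic; e.g. direct_mk_pte_phys(maddr, prot) expands
 * to ((maddr >> PAGE_SHIFT) << PAGE_SHIFT) | pgprot_val(prot).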
*/ -#define __direct_pte(x) ((pte_t) { (x) } ) -#define __direct_mk_pte(page_nr,pgprot) \ - __direct_pte(((page_nr) << PAGE_SHIFT) | pgprot_val(pgprot)) -#define direct_mk_pte_phys(physpage, pgprot) \ - __direct_mk_pte((physpage) >> PAGE_SHIFT, pgprot) - -static inline void direct_remap_area_pte(pte_t *pte, - unsigned long address, - unsigned long size, - mmu_update_t **v) -{ - unsigned long end; - - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; - if (address >= end) - BUG(); - - do { - (*v)->ptr = virt_to_machine(pte); - (*v)++; - address += PAGE_SIZE; - pte++; - } while (address && (address < end)); -} - -static inline int direct_remap_area_pmd(struct mm_struct *mm, - pmd_t *pmd, - unsigned long address, - unsigned long size, - mmu_update_t **v) -{ - unsigned long end; - - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - if (address >= end) - BUG(); - do { - pte_t *pte = (mm == &init_mm) ? - pte_alloc_kernel(mm, pmd, address) : - pte_alloc_map(mm, pmd, address); - if (!pte) - return -ENOMEM; - direct_remap_area_pte(pte, address, end - address, v); - pte_unmap(pte); - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); - return 0; -} - -int __direct_remap_area_pages(struct mm_struct *mm, - unsigned long address, - unsigned long size, - mmu_update_t *v) -{ - pgd_t * dir; - unsigned long end = address + size; - int error; - -#if defined(__i386__) - dir = pgd_offset(mm, address); -#elif defined (__x86_64) - dir = (mm == &init_mm) ? - pgd_offset_k(address): - pgd_offset(mm, address); -#endif - if (address >= end) - BUG(); - spin_lock(&mm->page_table_lock); - do { - pud_t *pud; - pmd_t *pmd; - - error = -ENOMEM; - pud = pud_alloc(mm, dir, address); - if (!pud) - break; - pmd = pmd_alloc(mm, pud, address); - if (!pmd) - break; - error = 0; - direct_remap_area_pmd(mm, pmd, address, end - address, &v); - address = (address + PGDIR_SIZE) & PGDIR_MASK; - dir++; - - } while (address && (address < end)); - spin_unlock(&mm->page_table_lock); - return error; -} - - -int direct_remap_area_pages(struct mm_struct *mm, - unsigned long address, - unsigned long machine_addr, - unsigned long size, - pgprot_t prot, - domid_t domid) -{ - int i; - unsigned long start_address; -#define MAX_DIRECTMAP_MMU_QUEUE 130 - mmu_update_t u[MAX_DIRECTMAP_MMU_QUEUE], *v = u; - - start_address = address; - - flush_cache_all(); - - for (i = 0; i < size; i += PAGE_SIZE) { - if ((v - u) == MAX_DIRECTMAP_MMU_QUEUE) { - /* Fill in the PTE pointers. */ - __direct_remap_area_pages(mm, - start_address, - address-start_address, - u); - - if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0) - return -EFAULT; - v = u; - start_address = address; - } - - /* - * Fill in the machine address: PTE ptr is done later by - * __direct_remap_area_pages(). 
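 *
 * Each mmu_update_t is thus built in two halves: .val (machine frame
 * plus protection bits) here, .ptr (machine address of the PTE slot)
 * during the later page-table walk, after which a single
 * HYPERVISOR_mmu_update() commits up to MAX_DIRECTMAP_MMU_QUEUE
 * entries at once. An illustrative call (all values invented):
 *
 *	direct_remap_area_pages(&init_mm, vaddr, maddr,
 *				16 << PAGE_SHIFT, PAGE_KERNEL, domid);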
- */ - v->val = (machine_addr & PAGE_MASK) | pgprot_val(prot); - - machine_addr += PAGE_SIZE; - address += PAGE_SIZE; - v++; - } - - if (v != u) { - /* get the ptep's filled in */ - __direct_remap_area_pages(mm, - start_address, - address-start_address, - u); - if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)) - return -EFAULT; - } - - flush_tlb_all(); - - return 0; -} - -EXPORT_SYMBOL(direct_remap_area_pages); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/mm/pageattr.c --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/mm/pageattr.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,254 +0,0 @@ -/* - * Copyright 2002 Andi Kleen, SuSE Labs. - * Thanks to Ben LaHaise for precious feedback. - */ - -#include <linux/config.h> -#include <linux/mm.h> -#include <linux/sched.h> -#include <linux/highmem.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <asm/uaccess.h> -#include <asm/processor.h> -#include <asm/tlbflush.h> -#include <asm/pgalloc.h> -#include <asm/io.h> - -void pte_free(struct page *pte) -{ - pte_t *ptep; - - ptep = pfn_to_kaddr(page_to_pfn(pte)); - - xen_pte_unpin(__pa(ptep)); - make_page_writable(ptep); - __free_page(pte); -} - -static inline pte_t *lookup_address(unsigned long address) -{ - pgd_t *pgd = pgd_offset_k(address); - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - if (pgd_none(*pgd)) - return NULL; - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return NULL; - pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) - return NULL; - if (pmd_large(*pmd)) - return (pte_t *)pmd; - pte = pte_offset_kernel(pmd, address); - if (pte && !pte_present(*pte)) - pte = NULL; - return pte; -} - -static struct page *split_large_page(unsigned long address, pgprot_t prot, - pgprot_t ref_prot) -{ - int i; - unsigned long addr; - struct page *base = alloc_pages(GFP_KERNEL, 0); - pte_t *pbase; - if (!base) - return NULL; - address = __pa(address); - addr = address & LARGE_PAGE_MASK; - pbase = (pte_t *)page_address(base); - for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { - pbase[i] = pfn_pte(addr >> PAGE_SHIFT, - addr == address ? prot : ref_prot); - } - return base; -} - - -static void flush_kernel_map(void *address) -{ - if (0 && address && cpu_has_clflush) { - /* is this worth it? */ - int i; - for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) - asm volatile("clflush (%0)" :: "r" (address + i)); - } else - asm volatile("wbinvd":::"memory"); - if (address) - __flush_tlb_one((unsigned long) address); - else - __flush_tlb_all(); -} - - -static inline void flush_map(unsigned long address) -{ - on_each_cpu(flush_kernel_map, (void *)address, 1, 1); -} - -struct deferred_page { - struct deferred_page *next; - struct page *fpage; - unsigned long address; -}; -static struct deferred_page *df_list; /* protected by init_mm.mmap_sem */ - -static inline void save_page(unsigned long address, struct page *fpage) -{ - struct deferred_page *df; - df = kmalloc(sizeof(struct deferred_page), GFP_KERNEL); - if (!df) { - flush_map(address); - __free_page(fpage); - } else { - df->next = df_list; - df->fpage = fpage; - df->address = address; - df_list = df; - } -} - -/* - * No more special protections in this 2/4MB area - revert to a - * large page again. 
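 *
 * "Revert" means rebuilding the PMD entry as one PSE mapping:
 * _PAGE_PSE is OR'ed into ref_prot and mk_pte_phys() of the 2MB-
 * aligned physical address is written straight into the pmd; the
 * small-page table created by split_large_page() is then freed
 * lazily via save_page()/global_flush_tlb().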
- */ -static void revert_page(unsigned long address, pgprot_t ref_prot) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t large_pte; - - pgd = pgd_offset_k(address); - BUG_ON(pgd_none(*pgd)); - pud = pud_offset(pgd,address); - BUG_ON(pud_none(*pud)); - pmd = pmd_offset(pud, address); - BUG_ON(pmd_val(*pmd) & _PAGE_PSE); - pgprot_val(ref_prot) |= _PAGE_PSE; - large_pte = mk_pte_phys(__pa(address) & LARGE_PAGE_MASK, ref_prot); - set_pte((pte_t *)pmd, large_pte); -} - -static int -__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, - pgprot_t ref_prot) -{ - pte_t *kpte; - struct page *kpte_page; - unsigned kpte_flags; - kpte = lookup_address(address); - if (!kpte) return 0; - kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); - kpte_flags = pte_val(*kpte); - if (pgprot_val(prot) != pgprot_val(ref_prot)) { - if ((kpte_flags & _PAGE_PSE) == 0) { - set_pte(kpte, pfn_pte(pfn, prot)); - } else { - /* - * split_large_page will take the reference for this change_page_attr - * on the split page. - */ - struct page *split = split_large_page(address, prot, ref_prot); - if (!split) - return -ENOMEM; - set_pte(kpte,mk_pte(split, ref_prot)); - kpte_page = split; - } - get_page(kpte_page); - } else if ((kpte_flags & _PAGE_PSE) == 0) { - set_pte(kpte, pfn_pte(pfn, ref_prot)); - __put_page(kpte_page); - } else - BUG(); - - /* on x86-64 the direct mapping set at boot is not using 4k pages */ -// BUG_ON(PageReserved(kpte_page)); - /* - * ..., but the XEN guest kernels (currently) do: - * If the pte was reserved, it means it was created at boot - * time (not via split_large_page) and in turn we must not - * replace it with a large page. - */ - if (!PageReserved(kpte_page)) { - switch (page_count(kpte_page)) { - case 1: - save_page(address, kpte_page); - revert_page(address, ref_prot); - break; - case 0: - BUG(); /* memleak and failed 2M page regeneration */ - } - } - return 0; -} - -/* - * Change the page attributes of an page in the linear mapping. - * - * This should be used when a page is mapped with a different caching policy - * than write-back somewhere - some CPUs do not like it when mappings with - * different caching policies exist. This changes the page attributes of the - * in kernel linear mapping too. - * - * The caller needs to ensure that there are no conflicting mappings elsewhere. - * This function only deals with the kernel linear map. - * - * Caller must call global_flush_tlb() after this. 
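 *
 * Typical use, mirroring the ioremap_nocache() path earlier in this
 * patch (fragment for illustration only):
 *
 *	change_page_attr(virt_to_page(addr), npages, PAGE_KERNEL_NOCACHE);
 *	global_flush_tlb();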
- */ -int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) -{ - int err = 0; - int i; - - down_write(&init_mm.mmap_sem); - for (i = 0; i < numpages; i++, address += PAGE_SIZE) { - unsigned long pfn = __pa(address) >> PAGE_SHIFT; - - err = __change_page_attr(address, pfn, prot, PAGE_KERNEL); - if (err) - break; - /* Handle kernel mapping too which aliases part of the - * lowmem */ - if (__pa(address) < KERNEL_TEXT_SIZE) { - unsigned long addr2; - pgprot_t prot2 = prot; - addr2 = __START_KERNEL_map + __pa(address); - pgprot_val(prot2) &= ~_PAGE_NX; - err = __change_page_attr(addr2, pfn, prot2, PAGE_KERNEL_EXEC); - } - } - up_write(&init_mm.mmap_sem); - return err; -} - -/* Don't call this for MMIO areas that may not have a mem_map entry */ -int change_page_attr(struct page *page, int numpages, pgprot_t prot) -{ - unsigned long addr = (unsigned long)page_address(page); - return change_page_attr_addr(addr, numpages, prot); -} - -void global_flush_tlb(void) -{ - struct deferred_page *df, *next_df; - - down_read(&init_mm.mmap_sem); - df = xchg(&df_list, NULL); - up_read(&init_mm.mmap_sem); - if (!df) - return; - flush_map((df && !df->next) ? df->address : 0); - for (; df; df = next_df) { - next_df = df->next; - if (df->fpage) - __free_page(df->fpage); - kfree(df); - } -} - -EXPORT_SYMBOL(change_page_attr); -EXPORT_SYMBOL(global_flush_tlb); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/pci/Makefile --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/pci/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,38 +0,0 @@ -# -# Makefile for X86_64 specific PCI routines -# -# Reuse the i386 PCI subsystem -# -XENARCH := $(subst ",,$(CONFIG_XENARCH)) -CFLAGS += -Iarch/$(XENARCH)/pci - -CFLAGS += -Iarch/i386/pci - -c-i386-obj-y := i386.o -c-i386-obj-y += fixup.o -c-i386-obj-$(CONFIG_ACPI_PCI) += acpi.o -c-i386-obj-y += legacy.o common.o -c-i386-obj-$(CONFIG_PCI_DIRECT)+= direct.o -c-xen-obj-y += irq.o -# mmconfig has a 64bit special -c-obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o - -c-obj-$(CONFIG_NUMA) += k8-bus.o - -c-link := - -$(patsubst %.o,$(obj)/%.c,$(c-xen-obj-y)): - @ln -fsn $(srctree)/arch/xen/i386/pci/$(notdir $@) $@ - -$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)): - @ln -fsn $(srctree)/arch/x86_64/pci/$(notdir $@) $@ - -$(patsubst %.o,$(obj)/%.c,$(c-i386-obj-y)): - @ln -fsn $(srctree)/arch/i386/pci/$(notdir $@) $@ - -obj-y += $(c-i386-obj-y) $(c-obj-y) -obj-y += $(c-xen-obj-y) - -clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link)) -clean-files += $(patsubst %.o,%.c,$(c-i386-obj-y) $(c-i386-obj-)) -clean-files += $(patsubst %.o,%.c,$(c-xen-obj-y) $(c-xen-obj-)) diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/arch/xen/x86_64/pci/Makefile-BUS --- a/linux-2.6.11-xen-sparse/arch/xen/x86_64/pci/Makefile-BUS Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,22 +0,0 @@ -# -# Makefile for X86_64 specific PCI routines -# -# Reuse the i386 PCI subsystem -# -CFLAGS += -I arch/i386/pci - -obj-y := i386.o -obj-$(CONFIG_PCI_DIRECT)+= direct.o -obj-y += fixup.o -obj-$(CONFIG_ACPI_PCI) += acpi.o -obj-y += legacy.o irq.o common.o -# mmconfig has a 64bit special -obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o - -direct-y += ../../i386/pci/direct.o -acpi-y += ../../i386/pci/acpi.o -legacy-y += ../../i386/pci/legacy.o -irq-y += ../../i386/pci/irq.o -common-y += ../../i386/pci/common.o -fixup-y += ../../i386/pci/fixup.o -i386-y += ../../i386/pci/i386.o diff -r d75a502b45eb -r 43e28a2f6037 
linux-2.6.11-xen-sparse/drivers/Makefile --- a/linux-2.6.11-xen-sparse/drivers/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,66 +0,0 @@ -# -# Makefile for the Linux kernel device drivers. -# -# 15 Sep 2000, Christoph Hellwig <hch@xxxxxxxxxxxxx> -# Rewritten to use lists instead of if-statements. -# - -obj-$(CONFIG_PCI) += pci/ -obj-$(CONFIG_PARISC) += parisc/ -obj-y += video/ -obj-$(CONFIG_ACPI_BOOT) += acpi/ -# PnP must come after ACPI since it will eventually need to check if acpi -# was used and do nothing if so -obj-$(CONFIG_PNP) += pnp/ - -# char/ comes before serial/ etc so that the VT console is the boot-time -# default. -obj-y += char/ - -# i810fb and intelfb depend on char/agp/ -obj-$(CONFIG_FB_I810) += video/i810/ -obj-$(CONFIG_FB_INTEL) += video/intelfb/ - -# we also need input/serio early so serio bus is initialized by the time -# serial drivers start registering their serio ports -obj-$(CONFIG_SERIO) += input/serio/ -obj-y += serial/ -obj-$(CONFIG_PARPORT) += parport/ -obj-y += base/ block/ misc/ net/ media/ -obj-$(CONFIG_NUBUS) += nubus/ -obj-$(CONFIG_ATM) += atm/ -obj-$(CONFIG_PPC_PMAC) += macintosh/ -obj-$(CONFIG_ARCH_XEN) += xen/ -obj-$(CONFIG_IDE) += ide/ -obj-$(CONFIG_FC4) += fc4/ -obj-$(CONFIG_SCSI) += scsi/ -obj-$(CONFIG_FUSION) += message/ -obj-$(CONFIG_IEEE1394) += ieee1394/ -obj-y += cdrom/ -obj-$(CONFIG_MTD) += mtd/ -obj-$(CONFIG_PCCARD) += pcmcia/ -obj-$(CONFIG_DIO) += dio/ -obj-$(CONFIG_SBUS) += sbus/ -obj-$(CONFIG_ZORRO) += zorro/ -obj-$(CONFIG_MAC) += macintosh/ -obj-$(CONFIG_ATA_OVER_ETH) += block/aoe/ -obj-$(CONFIG_PARIDE) += block/paride/ -obj-$(CONFIG_TC) += tc/ -obj-$(CONFIG_USB) += usb/ -obj-$(CONFIG_USB_GADGET) += usb/gadget/ -obj-$(CONFIG_INPUT) += input/ -obj-$(CONFIG_GAMEPORT) += input/gameport/ -obj-$(CONFIG_I2O) += message/ -obj-$(CONFIG_I2C) += i2c/ -obj-$(CONFIG_W1) += w1/ -obj-$(CONFIG_PHONE) += telephony/ -obj-$(CONFIG_MD) += md/ -obj-$(CONFIG_BT) += bluetooth/ -obj-$(CONFIG_ISDN) += isdn/ -obj-$(CONFIG_MCA) += mca/ -obj-$(CONFIG_EISA) += eisa/ -obj-$(CONFIG_CPU_FREQ) += cpufreq/ -obj-$(CONFIG_MMC) += mmc/ -obj-$(CONFIG_INFINIBAND) += infiniband/ -obj-y += firmware/ -obj-$(CONFIG_CRYPTO) += crypto/ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/acpi/tables.c --- a/linux-2.6.11-xen-sparse/drivers/acpi/tables.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,615 +0,0 @@ -/* - * acpi_tables.c - ACPI Boot-Time Table Parsing - * - * Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@xxxxxxxxx> - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * - */ - -#include <linux/config.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/smp.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/irq.h> -#include <linux/errno.h> -#include <linux/acpi.h> -#include <linux/bootmem.h> - -#define PREFIX "ACPI: " - -#define ACPI_MAX_TABLES 256 - -static char *acpi_table_signatures[ACPI_TABLE_COUNT] = { - [ACPI_TABLE_UNKNOWN] = "????", - [ACPI_APIC] = "APIC", - [ACPI_BOOT] = "BOOT", - [ACPI_DBGP] = "DBGP", - [ACPI_DSDT] = "DSDT", - [ACPI_ECDT] = "ECDT", - [ACPI_ETDT] = "ETDT", - [ACPI_FADT] = "FACP", - [ACPI_FACS] = "FACS", - [ACPI_OEMX] = "OEM", - [ACPI_PSDT] = "PSDT", - [ACPI_SBST] = "SBST", - [ACPI_SLIT] = "SLIT", - [ACPI_SPCR] = "SPCR", - [ACPI_SRAT] = "SRAT", - [ACPI_SSDT] = "SSDT", - [ACPI_SPMI] = "SPMI", - [ACPI_HPET] = "HPET", - [ACPI_MCFG] = "MCFG", -}; - -static char *mps_inti_flags_polarity[] = { "dfl", "high", "res", "low" }; -static char *mps_inti_flags_trigger[] = { "dfl", "edge", "res", "level" }; - -/* System Description Table (RSDT/XSDT) */ -struct acpi_table_sdt { - unsigned long pa; - enum acpi_table_id id; - unsigned long size; -} __attribute__ ((packed)); - -static unsigned long sdt_pa; /* Physical Address */ -static unsigned long sdt_count; /* Table count */ - -static struct acpi_table_sdt sdt_entry[ACPI_MAX_TABLES]; - -void -acpi_table_print ( - struct acpi_table_header *header, - unsigned long phys_addr) -{ - char *name = NULL; - - if (!header) - return; - - /* Some table signatures aren't good table names */ - - if (!strncmp((char *) &header->signature, - acpi_table_signatures[ACPI_APIC], - sizeof(header->signature))) { - name = "MADT"; - } - else if (!strncmp((char *) &header->signature, - acpi_table_signatures[ACPI_FADT], - sizeof(header->signature))) { - name = "FADT"; - } - else - name = header->signature; - - printk(KERN_DEBUG PREFIX "%.4s (v%3.3d %6.6s %8.8s 0x%08x %.4s 0x%08x) @ 0x%p\n", - name, header->revision, header->oem_id, - header->oem_table_id, header->oem_revision, - header->asl_compiler_id, header->asl_compiler_revision, - (void *) phys_addr); -} - - -void -acpi_table_print_madt_entry ( - acpi_table_entry_header *header) -{ - if (!header) - return; - - switch (header->type) { - - case ACPI_MADT_LAPIC: - { - struct acpi_table_lapic *p = - (struct acpi_table_lapic*) header; - printk(KERN_INFO PREFIX "LAPIC (acpi_id[0x%02x] lapic_id[0x%02x] %s)\n", - p->acpi_id, p->id, p->flags.enabled?"enabled":"disabled"); - } - break; - - case ACPI_MADT_IOAPIC: - { - struct acpi_table_ioapic *p = - (struct acpi_table_ioapic*) header; - printk(KERN_INFO PREFIX "IOAPIC (id[0x%02x] address[0x%08x] gsi_base[%d])\n", - p->id, p->address, p->global_irq_base); - } - break; - - case ACPI_MADT_INT_SRC_OVR: - { - struct acpi_table_int_src_ovr *p = - (struct acpi_table_int_src_ovr*) header; - printk(KERN_INFO PREFIX "INT_SRC_OVR (bus %d bus_irq %d global_irq %d %s %s)\n", - p->bus, p->bus_irq, p->global_irq, - mps_inti_flags_polarity[p->flags.polarity], - mps_inti_flags_trigger[p->flags.trigger]); - if(p->flags.reserved) - printk(KERN_INFO PREFIX "INT_SRC_OVR unexpected reserved flags: 0x%x\n", - p->flags.reserved); - - } - break; - - case ACPI_MADT_NMI_SRC: - { - struct 
acpi_table_nmi_src *p = - (struct acpi_table_nmi_src*) header; - printk(KERN_INFO PREFIX "NMI_SRC (%s %s global_irq %d)\n", - mps_inti_flags_polarity[p->flags.polarity], - mps_inti_flags_trigger[p->flags.trigger], p->global_irq); - } - break; - - case ACPI_MADT_LAPIC_NMI: - { - struct acpi_table_lapic_nmi *p = - (struct acpi_table_lapic_nmi*) header; - printk(KERN_INFO PREFIX "LAPIC_NMI (acpi_id[0x%02x] %s %s lint[0x%x])\n", - p->acpi_id, - mps_inti_flags_polarity[p->flags.polarity], - mps_inti_flags_trigger[p->flags.trigger], p->lint); - } - break; - - case ACPI_MADT_LAPIC_ADDR_OVR: - { - struct acpi_table_lapic_addr_ovr *p = - (struct acpi_table_lapic_addr_ovr*) header; - printk(KERN_INFO PREFIX "LAPIC_ADDR_OVR (address[%p])\n", - (void *) (unsigned long) p->address); - } - break; - - case ACPI_MADT_IOSAPIC: - { - struct acpi_table_iosapic *p = - (struct acpi_table_iosapic*) header; - printk(KERN_INFO PREFIX "IOSAPIC (id[0x%x] address[%p] gsi_base[%d])\n", - p->id, (void *) (unsigned long) p->address, p->global_irq_base); - } - break; - - case ACPI_MADT_LSAPIC: - { - struct acpi_table_lsapic *p = - (struct acpi_table_lsapic*) header; - printk(KERN_INFO PREFIX "LSAPIC (acpi_id[0x%02x] lsapic_id[0x%02x] lsapic_eid[0x%02x] %s)\n", - p->acpi_id, p->id, p->eid, p->flags.enabled?"enabled":"disabled"); - } - break; - - case ACPI_MADT_PLAT_INT_SRC: - { - struct acpi_table_plat_int_src *p = - (struct acpi_table_plat_int_src*) header; - printk(KERN_INFO PREFIX "PLAT_INT_SRC (%s %s type[0x%x] id[0x%04x] eid[0x%x] iosapic_vector[0x%x] global_irq[0x%x]\n", - mps_inti_flags_polarity[p->flags.polarity], - mps_inti_flags_trigger[p->flags.trigger], - p->type, p->id, p->eid, p->iosapic_vector, p->global_irq); - } - break; - - default: - printk(KERN_WARNING PREFIX "Found unsupported MADT entry (type = 0x%x)\n", - header->type); - break; - } -} - - -static int -acpi_table_compute_checksum ( - void *table_pointer, - unsigned long length) -{ - u8 *p = (u8 *) table_pointer; - unsigned long remains = length; - unsigned long sum = 0; - - if (!p || !length) - return -EINVAL; - - while (remains--) - sum += *p++; - - return (sum & 0xFF); -} - -/* - * acpi_get_table_header_early() - * for acpi_blacklisted(), acpi_table_get_sdt() - */ -int __init -acpi_get_table_header_early ( - enum acpi_table_id id, - struct acpi_table_header **header) -{ - unsigned int i; - enum acpi_table_id temp_id; - - /* DSDT is different from the rest */ - if (id == ACPI_DSDT) - temp_id = ACPI_FADT; - else - temp_id = id; - - /* Locate the table. 
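 *
 * (The DSDT is the one table not listed in the RSDT/XSDT, so for
 * id == ACPI_DSDT the loop below actually locates the FADT; the
 * header is then re-mapped through the FADT's Xdsdt or V1_dsdt
 * pointer further down.)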
*/ - - for (i = 0; i < sdt_count; i++) { - if (sdt_entry[i].id != temp_id) - continue; - *header = (void *) - __acpi_map_table(sdt_entry[i].pa, sdt_entry[i].size); - if (!*header) { - printk(KERN_WARNING PREFIX "Unable to map %s\n", - acpi_table_signatures[temp_id]); - return -ENODEV; - } - break; - } - - if (!*header) { - printk(KERN_WARNING PREFIX "%s not present\n", - acpi_table_signatures[id]); - return -ENODEV; - } - - /* Map the DSDT header via the pointer in the FADT */ - if (id == ACPI_DSDT) { - struct fadt_descriptor_rev2 *fadt = (struct fadt_descriptor_rev2 *) *header; - - if (fadt->revision == 3 && fadt->Xdsdt) { - *header = (void *) __acpi_map_table(fadt->Xdsdt, - sizeof(struct acpi_table_header)); - } else if (fadt->V1_dsdt) { - *header = (void *) __acpi_map_table(fadt->V1_dsdt, - sizeof(struct acpi_table_header)); - } else - *header = NULL; - - if (!*header) { - printk(KERN_WARNING PREFIX "Unable to map DSDT\n"); - return -ENODEV; - } - } - - return 0; -} - - -int __init -acpi_table_parse_madt_family ( - enum acpi_table_id id, - unsigned long madt_size, - int entry_id, - acpi_madt_entry_handler handler, - unsigned int max_entries) -{ - void *madt = NULL; - acpi_table_entry_header *entry; - unsigned int count = 0; - unsigned long madt_end; - unsigned int i; - - if (!handler) - return -EINVAL; - - /* Locate the MADT (if exists). There should only be one. */ - - for (i = 0; i < sdt_count; i++) { - if (sdt_entry[i].id != id) - continue; - madt = (void *) - __acpi_map_table(sdt_entry[i].pa, sdt_entry[i].size); - if (!madt) { - printk(KERN_WARNING PREFIX "Unable to map %s\n", - acpi_table_signatures[id]); - return -ENODEV; - } - break; - } - - if (!madt) { - printk(KERN_WARNING PREFIX "%s not present\n", - acpi_table_signatures[id]); - return -ENODEV; - } - - madt_end = (unsigned long) madt + sdt_entry[i].size; - - /* Parse all entries looking for a match. 
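 *
 * Callers supply an acpi_madt_entry_handler that is invoked once per
 * matching subtable; a minimal sketch (names invented):
 *
 *	static int __init count_lapics(acpi_table_entry_header *h,
 *				       const unsigned long end)
 *	{
 *		return 0;	returning nonzero aborts with -EINVAL
 *	}
 *
 * registered via acpi_table_parse_madt(ACPI_MADT_LAPIC, count_lapics, 0),
 * where max_entries == 0 means "no limit".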
*/ - - entry = (acpi_table_entry_header *) - ((unsigned long) madt + madt_size); - - while (((unsigned long) entry) + sizeof(acpi_table_entry_header) < madt_end) { - if (entry->type == entry_id && - (!max_entries || count++ < max_entries)) - if (handler(entry, madt_end)) - return -EINVAL; - - entry = (acpi_table_entry_header *) - ((unsigned long) entry + entry->length); - } - if (max_entries && count > max_entries) { - printk(KERN_WARNING PREFIX "[%s:0x%02x] ignored %i entries of " - "%i found\n", acpi_table_signatures[id], entry_id, - count - max_entries, count); - } - - return count; -} - - -int __init -acpi_table_parse_madt ( - enum acpi_madt_entry_id id, - acpi_madt_entry_handler handler, - unsigned int max_entries) -{ - return acpi_table_parse_madt_family(ACPI_APIC, sizeof(struct acpi_table_madt), - id, handler, max_entries); -} - - -int __init -acpi_table_parse ( - enum acpi_table_id id, - acpi_table_handler handler) -{ - int count = 0; - unsigned int i = 0; - - if (!handler) - return -EINVAL; - - for (i = 0; i < sdt_count; i++) { - if (sdt_entry[i].id != id) - continue; - count++; - if (count == 1) - handler(sdt_entry[i].pa, sdt_entry[i].size); - - else - printk(KERN_WARNING PREFIX "%d duplicate %s table ignored.\n", - count, acpi_table_signatures[id]); - } - - return count; -} - - -static int __init -acpi_table_get_sdt ( - struct acpi_table_rsdp *rsdp) -{ - struct acpi_table_header *header = NULL; - unsigned int i, id = 0; - - if (!rsdp) - return -EINVAL; - - /* First check XSDT (but only on ACPI 2.0-compatible systems) */ - - if ((rsdp->revision >= 2) && - (((struct acpi20_table_rsdp*)rsdp)->xsdt_address)) { - - struct acpi_table_xsdt *mapped_xsdt = NULL; - - sdt_pa = ((struct acpi20_table_rsdp*)rsdp)->xsdt_address; - - /* map in just the header */ - header = (struct acpi_table_header *) - __acpi_map_table(sdt_pa, sizeof(struct acpi_table_header)); - - if (!header) { - printk(KERN_WARNING PREFIX "Unable to map XSDT header\n"); - return -ENODEV; - } - - /* remap in the entire table before processing */ - mapped_xsdt = (struct acpi_table_xsdt *) - __acpi_map_table(sdt_pa, header->length); - if (!mapped_xsdt) { - printk(KERN_WARNING PREFIX "Unable to map XSDT\n"); - return -ENODEV; - } - header = &mapped_xsdt->header; - - if (strncmp(header->signature, "XSDT", 4)) { - printk(KERN_WARNING PREFIX "XSDT signature incorrect\n"); - return -ENODEV; - } - - if (acpi_table_compute_checksum(header, header->length)) { - printk(KERN_WARNING PREFIX "Invalid XSDT checksum\n"); - return -ENODEV; - } - - sdt_count = (header->length - sizeof(struct acpi_table_header)) >> 3; - if (sdt_count > ACPI_MAX_TABLES) { - printk(KERN_WARNING PREFIX "Truncated %lu XSDT entries\n", - (sdt_count - ACPI_MAX_TABLES)); - sdt_count = ACPI_MAX_TABLES; - } - - for (i = 0; i < sdt_count; i++) - sdt_entry[i].pa = (unsigned long) mapped_xsdt->entry[i]; - } - - /* Then check RSDT */ - - else if (rsdp->rsdt_address) { - - struct acpi_table_rsdt *mapped_rsdt = NULL; - - sdt_pa = rsdp->rsdt_address; - - /* map in just the header */ - header = (struct acpi_table_header *) - __acpi_map_table(sdt_pa, sizeof(struct acpi_table_header)); - if (!header) { - printk(KERN_WARNING PREFIX "Unable to map RSDT header\n"); - return -ENODEV; - } - - /* remap in the entire table before processing */ - mapped_rsdt = (struct acpi_table_rsdt *) - __acpi_map_table(sdt_pa, header->length); - if (!mapped_rsdt) { - printk(KERN_WARNING PREFIX "Unable to map RSDT\n"); - return -ENODEV; - } - header = &mapped_rsdt->header; - - if 
(strncmp(header->signature, "RSDT", 4)) { - printk(KERN_WARNING PREFIX "RSDT signature incorrect\n"); - return -ENODEV; - } - - if (acpi_table_compute_checksum(header, header->length)) { - printk(KERN_WARNING PREFIX "Invalid RSDT checksum\n"); - return -ENODEV; - } - - sdt_count = (header->length - sizeof(struct acpi_table_header)) >> 2; - if (sdt_count > ACPI_MAX_TABLES) { - printk(KERN_WARNING PREFIX "Truncated %lu RSDT entries\n", - (sdt_count - ACPI_MAX_TABLES)); - sdt_count = ACPI_MAX_TABLES; - } - - for (i = 0; i < sdt_count; i++) - sdt_entry[i].pa = (unsigned long) mapped_rsdt->entry[i]; - } - - else { - printk(KERN_WARNING PREFIX "No System Description Table (RSDT/XSDT) specified in RSDP\n"); - return -ENODEV; - } - - acpi_table_print(header, sdt_pa); - - for (i = 0; i < sdt_count; i++) { - - /* map in just the header */ - header = (struct acpi_table_header *) - __acpi_map_table(sdt_entry[i].pa, - sizeof(struct acpi_table_header)); - if (!header) - continue; - - /* remap in the entire table before processing */ - header = (struct acpi_table_header *) - __acpi_map_table(sdt_entry[i].pa, - header->length); - if (!header) - continue; - - acpi_table_print(header, sdt_entry[i].pa); - - if (acpi_table_compute_checksum(header, header->length)) { - printk(KERN_WARNING " >>> ERROR: Invalid checksum\n"); - continue; - } - - sdt_entry[i].size = header->length; - - for (id = 0; id < ACPI_TABLE_COUNT; id++) { - if (!strncmp((char *) &header->signature, - acpi_table_signatures[id], - sizeof(header->signature))) { - sdt_entry[i].id = id; - } - } - } - - /* - * The DSDT is *not* in the RSDT (why not? no idea.) but we want - * to print its info, because this is what people usually blacklist - * against. Unfortunately, we don't know the phys_addr, so just - * print 0. Maybe no one will notice. - */ - if(!acpi_get_table_header_early(ACPI_DSDT, &header)) - acpi_table_print(header, 0); - - return 0; -} - -/* - * acpi_table_init() - * - * find RSDP, find and checksum SDT/XSDT. 
- * checksum all tables, print SDT/XSDT - * - * result: sdt_entry[] is initialized - */ -#if CONFIG_XEN -#define acpi_rsdp_phys_to_va(rsdp_phys) (__fix_to_virt(FIX_ACPI_RSDP_PAGE) + \ - (rsdp_phys & ~PAGE_MASK)) -#else -#define acpi_rsdp_phys_to_va(rsdp_phys) __va(rsdp_phys) -#endif - -int __init -acpi_table_init (void) -{ - struct acpi_table_rsdp *rsdp = NULL; - unsigned long rsdp_phys = 0; - int result = 0; - - /* Locate and map the Root System Description Table (RSDP) */ - - rsdp_phys = acpi_find_rsdp(); - if (!rsdp_phys) { - printk(KERN_ERR PREFIX "Unable to locate RSDP\n"); - return -ENODEV; - } - - rsdp = (struct acpi_table_rsdp *) acpi_rsdp_phys_to_va(rsdp_phys); - if (!rsdp) { - printk(KERN_WARNING PREFIX "Unable to map RSDP\n"); - return -ENODEV; - } - - printk(KERN_DEBUG PREFIX "RSDP (v%3.3d %6.6s ) @ 0x%p\n", - rsdp->revision, rsdp->oem_id, (void *) rsdp_phys); - - if (rsdp->revision < 2) - result = acpi_table_compute_checksum(rsdp, sizeof(struct acpi_table_rsdp)); - else - result = acpi_table_compute_checksum(rsdp, ((struct acpi20_table_rsdp *)rsdp)->length); - - if (result) { - printk(KERN_WARNING " >>> ERROR: Invalid checksum\n"); - return -ENODEV; - } - - /* Locate and map the System Description table (RSDT/XSDT) */ - - if (acpi_table_get_sdt(rsdp)) - return -ENODEV; - - return 0; -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/char/mem.c --- a/linux-2.6.11-xen-sparse/drivers/char/mem.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,733 +0,0 @@ -/* - * linux/drivers/char/mem.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * Added devfs support. - * Jan-11-1998, C. Scott Ananian <cananian@xxxxxxxxxxxxxxxxxxxx> - * Shared /dev/zero mmaping support, Feb 2000, Kanoj Sarcar <kanoj@xxxxxxx> - */ - -#include <linux/config.h> -#include <linux/mm.h> -#include <linux/miscdevice.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/mman.h> -#include <linux/random.h> -#include <linux/init.h> -#include <linux/raw.h> -#include <linux/tty.h> -#include <linux/capability.h> -#include <linux/smp_lock.h> -#include <linux/devfs_fs_kernel.h> -#include <linux/ptrace.h> -#include <linux/device.h> - -#include <asm/uaccess.h> -#include <asm/io.h> - -#ifdef CONFIG_IA64 -# include <linux/efi.h> -#endif - -#if defined(CONFIG_S390_TAPE) && defined(CONFIG_S390_TAPE_CHAR) -extern void tapechar_init(void); -#endif - -/* - * Architectures vary in how they handle caching for addresses - * outside of main memory. - * - */ -static inline int uncached_access(struct file *file, unsigned long addr) -{ -#if defined(__i386__) - /* - * On the PPro and successors, the MTRRs are used to set - * memory types for physical addresses outside main memory, - * so blindly setting PCD or PWT on those pages is wrong. - * For Pentiums and earlier, the surround logic should disable - * caching for the high addresses through the KEN pin, but - * we maintain the tradition of paranoia in this code. 
- */ - if (file->f_flags & O_SYNC) - return 1; - return !( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) || - test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) || - test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) || - test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability) ) - && addr >= __pa(high_memory); -#elif defined(__x86_64__) - /* - * This is broken because it can generate memory type aliases, - * which can cause cache corruptions - * But it is only available for root and we have to be bug-to-bug - * compatible with i386. - */ - if (file->f_flags & O_SYNC) - return 1; - /* same behaviour as i386. PAT always set to cached and MTRRs control the - caching behaviour. - Hopefully a full PAT implementation will fix that soon. */ - return 0; -#elif defined(CONFIG_IA64) - /* - * On ia64, we ignore O_SYNC because we cannot tolerate memory attribute aliases. - */ - return !(efi_mem_attributes(addr) & EFI_MEMORY_WB); -#elif defined(CONFIG_PPC64) - /* On PPC64, we always do non-cacheable access to the IO hole and - * cacheable elsewhere. Cache paradox can checkstop the CPU and - * the high_memory heuristic below is wrong on machines with memory - * above the IO hole... Ah, and of course, XFree86 doesn't pass - * O_SYNC when mapping us to tap IO space. Surprised ? - */ - return !page_is_ram(addr >> PAGE_SHIFT); -#else - /* - * Accessing memory above the top the kernel knows about or through a file pointer - * that was marked O_SYNC will be done non-cached. - */ - if (file->f_flags & O_SYNC) - return 1; - return addr >= __pa(high_memory); -#endif -} - -#ifndef ARCH_HAS_VALID_PHYS_ADDR_RANGE -static inline int valid_phys_addr_range(unsigned long addr, size_t *count) -{ - unsigned long end_mem; - - end_mem = __pa(high_memory); - if (addr >= end_mem) - return 0; - - if (*count > end_mem - addr) - *count = end_mem - addr; - - return 1; -} -#endif - -static ssize_t do_write_mem(void *p, unsigned long realp, - const char __user * buf, size_t count, loff_t *ppos) -{ - ssize_t written; - unsigned long copied; - - written = 0; -#if defined(__sparc__) || (defined(__mc68000__) && defined(CONFIG_MMU)) - /* we don't have page 0 mapped on sparc and m68k.. */ - if (realp < PAGE_SIZE) { - unsigned long sz = PAGE_SIZE-realp; - if (sz > count) sz = count; - /* Hmm. Do something? */ - buf+=sz; - p+=sz; - count-=sz; - written+=sz; - } -#endif - copied = copy_from_user(p, buf, count); - if (copied) { - ssize_t ret = written + (count - copied); - - if (ret) - return ret; - return -EFAULT; - } - written += count; - *ppos += written; - return written; -} - -#ifndef ARCH_HAS_DEV_MEM -/* - * This funcion reads the *physical* memory. The f_pos points directly to the - * memory location. - */ -static ssize_t read_mem(struct file * file, char __user * buf, - size_t count, loff_t *ppos) -{ - unsigned long p = *ppos; - ssize_t read; - - if (!valid_phys_addr_range(p, &count)) - return -EFAULT; - read = 0; -#if defined(__sparc__) || (defined(__mc68000__) && defined(CONFIG_MMU)) - /* we don't have page 0 mapped on sparc and m68k.. 
*/ - if (p < PAGE_SIZE) { - unsigned long sz = PAGE_SIZE-p; - if (sz > count) - sz = count; - if (sz > 0) { - if (clear_user(buf, sz)) - return -EFAULT; - buf += sz; - p += sz; - count -= sz; - read += sz; - } - } -#endif - if (copy_to_user(buf, __va(p), count)) - return -EFAULT; - read += count; - *ppos += read; - return read; -} - -static ssize_t write_mem(struct file * file, const char __user * buf, - size_t count, loff_t *ppos) -{ - unsigned long p = *ppos; - - if (!valid_phys_addr_range(p, &count)) - return -EFAULT; - return do_write_mem(__va(p), p, buf, count, ppos); -} -#endif - -static int mmap_kmem(struct file * file, struct vm_area_struct * vma) -{ -#ifdef pgprot_noncached - unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; - int uncached; - - uncached = uncached_access(file, offset); - if (uncached) - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); -#endif - - /* Remap-pfn-range will mark the range VM_IO and VM_RESERVED */ - if (remap_pfn_range(vma, - vma->vm_start, - vma->vm_pgoff, - vma->vm_end-vma->vm_start, - vma->vm_page_prot)) - return -EAGAIN; - return 0; -} - -extern long vread(char *buf, char *addr, unsigned long count); -extern long vwrite(char *buf, char *addr, unsigned long count); - -/* - * This function reads the *virtual* memory as seen by the kernel. - */ -static ssize_t read_kmem(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - unsigned long p = *ppos; - ssize_t read = 0; - ssize_t virtr = 0; - char * kbuf; /* k-addr because vread() takes vmlist_lock rwlock */ - - if (p < (unsigned long) high_memory) { - read = count; - if (count > (unsigned long) high_memory - p) - read = (unsigned long) high_memory - p; - -#if defined(__sparc__) || (defined(__mc68000__) && defined(CONFIG_MMU)) - /* we don't have page 0 mapped on sparc and m68k.. */ - if (p < PAGE_SIZE && read > 0) { - size_t tmp = PAGE_SIZE - p; - if (tmp > read) tmp = read; - if (clear_user(buf, tmp)) - return -EFAULT; - buf += tmp; - p += tmp; - read -= tmp; - count -= tmp; - } -#endif - if (copy_to_user(buf, (char *)p, read)) - return -EFAULT; - p += read; - buf += read; - count -= read; - } - - if (count > 0) { - kbuf = (char *)__get_free_page(GFP_KERNEL); - if (!kbuf) - return -ENOMEM; - while (count > 0) { - int len = count; - - if (len > PAGE_SIZE) - len = PAGE_SIZE; - len = vread(kbuf, (char *)p, len); - if (!len) - break; - if (copy_to_user(buf, kbuf, len)) { - free_page((unsigned long)kbuf); - return -EFAULT; - } - count -= len; - buf += len; - virtr += len; - p += len; - } - free_page((unsigned long)kbuf); - } - *ppos = p; - return virtr + read; -} - -/* - * This function writes to the *virtual* memory as seen by the kernel. - */ -static ssize_t write_kmem(struct file * file, const char __user * buf, - size_t count, loff_t *ppos) -{ - unsigned long p = *ppos; - ssize_t wrote = 0; - ssize_t virtr = 0; - ssize_t written; - char * kbuf; /* k-addr because vwrite() takes vmlist_lock rwlock */ - - if (p < (unsigned long) high_memory) { - - wrote = count; - if (count > (unsigned long) high_memory - p) - wrote = (unsigned long) high_memory - p; - - written = do_write_mem((void*)p, p, buf, wrote, ppos); - if (written != wrote) - return written; - wrote = written; - p += wrote; - buf += wrote; - count -= wrote; - } - - if (count > 0) { - kbuf = (char *)__get_free_page(GFP_KERNEL); - if (!kbuf) - return wrote ? 
wrote : -ENOMEM; - while (count > 0) { - int len = count; - - if (len > PAGE_SIZE) - len = PAGE_SIZE; - if (len) { - written = copy_from_user(kbuf, buf, len); - if (written) { - ssize_t ret; - - free_page((unsigned long)kbuf); - ret = wrote + virtr + (len - written); - return ret ? ret : -EFAULT; - } - } - len = vwrite(kbuf, (char *)p, len); - count -= len; - buf += len; - virtr += len; - p += len; - } - free_page((unsigned long)kbuf); - } - - *ppos = p; - return virtr + wrote; -} - -#if defined(CONFIG_ISA) || !defined(__mc68000__) -static ssize_t read_port(struct file * file, char __user * buf, - size_t count, loff_t *ppos) -{ - unsigned long i = *ppos; - char __user *tmp = buf; - - if (verify_area(VERIFY_WRITE,buf,count)) - return -EFAULT; - while (count-- > 0 && i < 65536) { - if (__put_user(inb(i),tmp) < 0) - return -EFAULT; - i++; - tmp++; - } - *ppos = i; - return tmp-buf; -} - -static ssize_t write_port(struct file * file, const char __user * buf, - size_t count, loff_t *ppos) -{ - unsigned long i = *ppos; - const char __user * tmp = buf; - - if (verify_area(VERIFY_READ,buf,count)) - return -EFAULT; - while (count-- > 0 && i < 65536) { - char c; - if (__get_user(c, tmp)) - return -EFAULT; - outb(c,i); - i++; - tmp++; - } - *ppos = i; - return tmp-buf; -} -#endif - -static ssize_t read_null(struct file * file, char __user * buf, - size_t count, loff_t *ppos) -{ - return 0; -} - -static ssize_t write_null(struct file * file, const char __user * buf, - size_t count, loff_t *ppos) -{ - return count; -} - -#ifdef CONFIG_MMU -/* - * For fun, we are using the MMU for this. - */ -static inline size_t read_zero_pagealigned(char __user * buf, size_t size) -{ - struct mm_struct *mm; - struct vm_area_struct * vma; - unsigned long addr=(unsigned long)buf; - - mm = current->mm; - /* Oops, this was forgotten before. -ben */ - down_read(&mm->mmap_sem); - - /* For private mappings, just map in zero pages. */ - for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) { - unsigned long count; - - if (vma->vm_start > addr || (vma->vm_flags & VM_WRITE) == 0) - goto out_up; - if (vma->vm_flags & (VM_SHARED | VM_HUGETLB)) - break; - count = vma->vm_end - addr; - if (count > size) - count = size; - - zap_page_range(vma, addr, count, NULL); - zeromap_page_range(vma, addr, count, PAGE_COPY); - - size -= count; - buf += count; - addr += count; - if (size == 0) - goto out_up; - } - - up_read(&mm->mmap_sem); - - /* The shared case is hard. Let's do the conventional zeroing. */ - do { - unsigned long unwritten = clear_user(buf, PAGE_SIZE); - if (unwritten) - return size + unwritten - PAGE_SIZE; - cond_resched(); - buf += PAGE_SIZE; - size -= PAGE_SIZE; - } while (size); - - return size; -out_up: - up_read(&mm->mmap_sem); - return size; -} - -static ssize_t read_zero(struct file * file, char __user * buf, - size_t count, loff_t *ppos) -{ - unsigned long left, unwritten, written = 0; - - if (!count) - return 0; - - if (!access_ok(VERIFY_WRITE, buf, count)) - return -EFAULT; - - left = count; - - /* do we want to be clever? Arbitrary cut-off */ - if (count >= PAGE_SIZE*4) { - unsigned long partial; - - /* How much left of the page? 
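
To make the /dev/zero semantics implemented here concrete: read() always yields zero bytes, and a private mapping is demand-zeroed anonymous memory (mmap_zero() below uses shmem_zero_setup() only for shared mappings). A self-contained user-space check, with the 4 KiB length an arbitrary choice:

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        char buf[16];
        int fd = open("/dev/zero", O_RDWR);
        if (fd < 0) { perror("open"); return 1; }

        /* the read path fills the user buffer with zeroes */
        if (read(fd, buf, sizeof(buf)) != (ssize_t)sizeof(buf)) { perror("read"); return 1; }
        for (size_t i = 0; i < sizeof(buf); i++)
            if (buf[i] != 0) abort();

        /* a MAP_PRIVATE mapping of /dev/zero reads back as zeroes */
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
        if (p == MAP_FAILED) { perror("mmap"); return 1; }
        printf("mapped byte: %d\n", p[0]);
        munmap(p, 4096);
        close(fd);
        return 0;
    }
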
*/ - partial = (PAGE_SIZE-1) & -(unsigned long) buf; - unwritten = clear_user(buf, partial); - written = partial - unwritten; - if (unwritten) - goto out; - left -= partial; - buf += partial; - unwritten = read_zero_pagealigned(buf, left & PAGE_MASK); - written += (left & PAGE_MASK) - unwritten; - if (unwritten) - goto out; - buf += left & PAGE_MASK; - left &= ~PAGE_MASK; - } - unwritten = clear_user(buf, left); - written += left - unwritten; -out: - return written ? written : -EFAULT; -} - -static int mmap_zero(struct file * file, struct vm_area_struct * vma) -{ - if (vma->vm_flags & VM_SHARED) - return shmem_zero_setup(vma); - if (zeromap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, vma->vm_page_prot)) - return -EAGAIN; - return 0; -} -#else /* CONFIG_MMU */ -static ssize_t read_zero(struct file * file, char * buf, - size_t count, loff_t *ppos) -{ - size_t todo = count; - - while (todo) { - size_t chunk = todo; - - if (chunk > 4096) - chunk = 4096; /* Just for latency reasons */ - if (clear_user(buf, chunk)) - return -EFAULT; - buf += chunk; - todo -= chunk; - cond_resched(); - } - return count; -} - -static int mmap_zero(struct file * file, struct vm_area_struct * vma) -{ - return -ENOSYS; -} -#endif /* CONFIG_MMU */ - -static ssize_t write_full(struct file * file, const char __user * buf, - size_t count, loff_t *ppos) -{ - return -ENOSPC; -} - -/* - * Special lseek() function for /dev/null and /dev/zero. Most notably, you - * can fopen() both devices with "a" now. This was previously impossible. - * -- SRB. - */ - -static loff_t null_lseek(struct file * file, loff_t offset, int orig) -{ - return file->f_pos = 0; -} - -/* - * The memory devices use the full 32/64 bits of the offset, and so we cannot - * check against negative addresses: they are ok. The return value is weird, - * though, in that case (0). - * - * also note that seeking relative to the "end of file" isn't supported: - * it has no meaning, so it returns -EINVAL. - */ -static loff_t memory_lseek(struct file * file, loff_t offset, int orig) -{ - loff_t ret; - - down(&file->f_dentry->d_inode->i_sem); - switch (orig) { - case 0: - file->f_pos = offset; - ret = file->f_pos; - force_successful_syscall_return(); - break; - case 1: - file->f_pos += offset; - ret = file->f_pos; - force_successful_syscall_return(); - break; - default: - ret = -EINVAL; - } - up(&file->f_dentry->d_inode->i_sem); - return ret; -} - -static int open_port(struct inode * inode, struct file * filp) -{ - return capable(CAP_SYS_RAWIO) ? 
0 : -EPERM; -} - -#define mmap_mem mmap_kmem -#define zero_lseek null_lseek -#define full_lseek null_lseek -#define write_zero write_null -#define read_full read_zero -#define open_mem open_port -#define open_kmem open_mem - -#ifndef ARCH_HAS_DEV_MEM -static struct file_operations mem_fops = { - .llseek = memory_lseek, - .read = read_mem, - .write = write_mem, - .mmap = mmap_mem, - .open = open_mem, -}; -#else -extern struct file_operations mem_fops; -#endif - -static struct file_operations kmem_fops = { - .llseek = memory_lseek, - .read = read_kmem, - .write = write_kmem, - .mmap = mmap_kmem, - .open = open_kmem, -}; - -static struct file_operations null_fops = { - .llseek = null_lseek, - .read = read_null, - .write = write_null, -}; - -#if defined(CONFIG_ISA) || !defined(__mc68000__) -static struct file_operations port_fops = { - .llseek = memory_lseek, - .read = read_port, - .write = write_port, - .open = open_port, -}; -#endif - -static struct file_operations zero_fops = { - .llseek = zero_lseek, - .read = read_zero, - .write = write_zero, - .mmap = mmap_zero, -}; - -static struct file_operations full_fops = { - .llseek = full_lseek, - .read = read_full, - .write = write_full, -}; - -static ssize_t kmsg_write(struct file * file, const char __user * buf, - size_t count, loff_t *ppos) -{ - char *tmp; - int ret; - - tmp = kmalloc(count + 1, GFP_KERNEL); - if (tmp == NULL) - return -ENOMEM; - ret = -EFAULT; - if (!copy_from_user(tmp, buf, count)) { - tmp[count] = 0; - ret = printk("%s", tmp); - } - kfree(tmp); - return ret; -} - -static struct file_operations kmsg_fops = { - .write = kmsg_write, -}; - -static int memory_open(struct inode * inode, struct file * filp) -{ - switch (iminor(inode)) { - case 1: - filp->f_op = &mem_fops; - break; - case 2: - filp->f_op = &kmem_fops; - break; - case 3: - filp->f_op = &null_fops; - break; -#if defined(CONFIG_ISA) || !defined(__mc68000__) - case 4: - filp->f_op = &port_fops; - break; -#endif - case 5: - filp->f_op = &zero_fops; - break; - case 7: - filp->f_op = &full_fops; - break; - case 8: - filp->f_op = &random_fops; - break; - case 9: - filp->f_op = &urandom_fops; - break; - case 11: - filp->f_op = &kmsg_fops; - break; - default: - return -ENXIO; - } - if (filp->f_op && filp->f_op->open) - return filp->f_op->open(inode,filp); - return 0; -} - -static struct file_operations memory_fops = { - .open = memory_open, /* just a selector for the real open */ -}; - -static const struct { - unsigned int minor; - char *name; - umode_t mode; - struct file_operations *fops; -} devlist[] = { /* list of minor devices */ - {1, "mem", S_IRUSR | S_IWUSR | S_IRGRP, &mem_fops}, - {2, "kmem", S_IRUSR | S_IWUSR | S_IRGRP, &kmem_fops}, - {3, "null", S_IRUGO | S_IWUGO, &null_fops}, -#if defined(CONFIG_ISA) || !defined(__mc68000__) - {4, "port", S_IRUSR | S_IWUSR | S_IRGRP, &port_fops}, -#endif - {5, "zero", S_IRUGO | S_IWUGO, &zero_fops}, - {7, "full", S_IRUGO | S_IWUGO, &full_fops}, - {8, "random", S_IRUGO | S_IWUSR, &random_fops}, - {9, "urandom", S_IRUGO | S_IWUSR, &urandom_fops}, - {11,"kmsg", S_IRUGO | S_IWUSR, &kmsg_fops}, -}; - -static struct class_simple *mem_class; - -static int __init chr_dev_init(void) -{ - int i; - - if (register_chrdev(MEM_MAJOR,"mem",&memory_fops)) - printk("unable to get major %d for memory devs\n", MEM_MAJOR); - - mem_class = class_simple_create(THIS_MODULE, "mem"); - for (i = 0; i < ARRAY_SIZE(devlist); i++) { - class_simple_device_add(mem_class, - MKDEV(MEM_MAJOR, devlist[i].minor), - NULL, devlist[i].name); - 
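
The devlist[] table above is the authoritative map from minor number to device node, and memory_open() dispatches on iminor() accordingly. A quick user-space way to observe that mapping (sys/sysmacros.h assumed for major()/minor(); /dev/null is minor 3 under MEM_MAJOR, which is 1):

    #include <stdio.h>
    #include <sys/stat.h>
    #include <sys/sysmacros.h>

    int main(void)
    {
        struct stat st;
        if (stat("/dev/null", &st) != 0) { perror("stat"); return 1; }
        /* expect "character device 1:3", matching the devlist entry */
        printf("/dev/null is character device %u:%u\n",
               major(st.st_rdev), minor(st.st_rdev));
        return 0;
    }
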
devfs_mk_cdev(MKDEV(MEM_MAJOR, devlist[i].minor), - S_IFCHR | devlist[i].mode, devlist[i].name); - } - - return 0; -} - -fs_initcall(chr_dev_init); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/char/tty_io.c --- a/linux-2.6.11-xen-sparse/drivers/char/tty_io.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,2988 +0,0 @@ -/* - * linux/drivers/char/tty_io.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -/* - * 'tty_io.c' gives an orthogonal feeling to tty's, be they consoles - * or rs-channels. It also implements echoing, cooked mode etc. - * - * Kill-line thanks to John T Kohl, who also corrected VMIN = VTIME = 0. - * - * Modified by Theodore Ts'o, 9/14/92, to dynamically allocate the - * tty_struct and tty_queue structures. Previously there was an array - * of 256 tty_struct's which was statically allocated, and the - * tty_queue structures were allocated at boot time. Both are now - * dynamically allocated only when the tty is open. - * - * Also restructured routines so that there is more of a separation - * between the high-level tty routines (tty_io.c and tty_ioctl.c) and - * the low-level tty routines (serial.c, pty.c, console.c). This - * makes for cleaner and more compact code. -TYT, 9/17/92 - * - * Modified by Fred N. van Kempen, 01/29/93, to add line disciplines - * which can be dynamically activated and de-activated by the line - * discipline handling modules (like SLIP). - * - * NOTE: pay no attention to the line discipline code (yet); its - * interface is still subject to change in this version... - * -- TYT, 1/31/92 - * - * Added functionality to the OPOST tty handling. No delays, but all - * other bits should be there. - * -- Nick Holloway <alfie@xxxxxxxxxxxxxxxxx>, 27th May 1993. - * - * Rewrote canonical mode and added more termios flags. - * -- julian@xxxxxxxxxxxxxxxxxxxxxx (J. Cowley), 13Jan94 - * - * Reorganized FASYNC support so mouse code can share it. - * -- ctm@xxxxxxxx, 9Sep95 - * - * New TIOCLINUX variants added. - * -- mj@xxxxxxxxxxxxxxxxx, 19-Nov-95 - * - * Restrict vt switching via ioctl() - * -- grif@xxxxxxxxxx, 5-Dec-95 - * - * Move console and virtual terminal code to more appropriate files, - * implement CONFIG_VT and generalize console device interface. - * -- Marko Kohtala <Marko.Kohtala@xxxxxx>, March 97 - * - * Rewrote init_dev and release_dev to eliminate races. - * -- Bill Hawes <whawes@xxxxxxxx>, June 97 - * - * Added devfs support. - * -- C. Scott Ananian <cananian@xxxxxxxxxxxxxxxxxxxx>, 13-Jan-1998 - * - * Added support for a Unix98-style ptmx device. - * -- C. Scott Ananian <cananian@xxxxxxxxxxxxxxxxxxxx>, 14-Jan-1998 - * - * Reduced memory usage for older ARM systems - * -- Russell King <rmk@xxxxxxxxxxxxxxxx> - * - * Move do_SAK() into process context. Less stack use in devfs functions. 
- * alloc_tty_struct() always uses kmalloc() -- Andrew Morton <andrewm@xxxxxxxxxx> 17Mar01 - */ - -#include <linux/config.h> -#include <linux/types.h> -#include <linux/major.h> -#include <linux/errno.h> -#include <linux/signal.h> -#include <linux/fcntl.h> -#include <linux/sched.h> -#include <linux/interrupt.h> -#include <linux/tty.h> -#include <linux/tty_driver.h> -#include <linux/tty_flip.h> -#include <linux/devpts_fs.h> -#include <linux/file.h> -#include <linux/console.h> -#include <linux/timer.h> -#include <linux/ctype.h> -#include <linux/kd.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/slab.h> -#include <linux/poll.h> -#include <linux/proc_fs.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/smp_lock.h> -#include <linux/device.h> -#include <linux/idr.h> -#include <linux/wait.h> -#include <linux/bitops.h> - -#include <asm/uaccess.h> -#include <asm/system.h> - -#include <linux/kbd_kern.h> -#include <linux/vt_kern.h> -#include <linux/selection.h> -#include <linux/devfs_fs_kernel.h> - -#include <linux/kmod.h> - -#undef TTY_DEBUG_HANGUP - -#define TTY_PARANOIA_CHECK 1 -#define CHECK_TTY_COUNT 1 - -struct termios tty_std_termios = { /* for the benefit of tty drivers */ - .c_iflag = ICRNL | IXON, - .c_oflag = OPOST | ONLCR, - .c_cflag = B38400 | CS8 | CREAD | HUPCL, - .c_lflag = ISIG | ICANON | ECHO | ECHOE | ECHOK | - ECHOCTL | ECHOKE | IEXTEN, - .c_cc = INIT_C_CC -}; - -EXPORT_SYMBOL(tty_std_termios); - -/* This list gets poked at by procfs and various bits of boot up code. This - could do with some rationalisation such as pulling the tty proc function - into this file */ - -LIST_HEAD(tty_drivers); /* linked list of tty drivers */ - -/* Semaphore to protect creating and releasing a tty. This is shared with - vt.c for deeply disgusting hack reasons */ -DECLARE_MUTEX(tty_sem); - -int console_use_vt = 1; - -#ifdef CONFIG_UNIX98_PTYS -extern struct tty_driver *ptm_driver; /* Unix98 pty masters; for /dev/ptmx */ -extern int pty_limit; /* Config limit on Unix98 ptys */ -static DEFINE_IDR(allocated_ptys); -static DECLARE_MUTEX(allocated_ptys_lock); -static int ptmx_open(struct inode *, struct file *); -#endif - -extern void disable_early_printk(void); - -static void initialize_tty_struct(struct tty_struct *tty); - -static ssize_t tty_read(struct file *, char __user *, size_t, loff_t *); -static ssize_t tty_write(struct file *, const char __user *, size_t, loff_t *); -ssize_t redirected_tty_write(struct file *, const char __user *, size_t, loff_t *); -static unsigned int tty_poll(struct file *, poll_table *); -static int tty_open(struct inode *, struct file *); -static int tty_release(struct inode *, struct file *); -int tty_ioctl(struct inode * inode, struct file * file, - unsigned int cmd, unsigned long arg); -static int tty_fasync(int fd, struct file * filp, int on); -extern void rs_360_init(void); -static void release_mem(struct tty_struct *tty, int idx); - - -static struct tty_struct *alloc_tty_struct(void) -{ - struct tty_struct *tty; - - tty = kmalloc(sizeof(struct tty_struct), GFP_KERNEL); - if (tty) - memset(tty, 0, sizeof(struct tty_struct)); - return tty; -} - -static inline void free_tty_struct(struct tty_struct *tty) -{ - kfree(tty->write_buf); - kfree(tty); -} - -#define TTY_NUMBER(tty) ((tty)->index + (tty)->driver->name_base) - -char *tty_name(struct tty_struct *tty, char *buf) -{ - if (!tty) /* Hmm. NULL pointer. That's fun. 
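
As a reference point for tty_std_termios above: those c_lflag defaults (ICANON, ECHO and friends) are exactly what user space observes on a freshly opened terminal. A compilable probe, assuming stdin is a tty:

    #include <stdio.h>
    #include <termios.h>
    #include <unistd.h>

    int main(void)
    {
        struct termios t;
        if (!isatty(STDIN_FILENO) || tcgetattr(STDIN_FILENO, &t) != 0) {
            fprintf(stderr, "stdin is not a tty\n");
            return 1;
        }
        /* both bits are set in tty_std_termios, so a virgin tty
         * starts out in canonical, echoing mode */
        printf("ICANON=%d ECHO=%d\n",
               (t.c_lflag & ICANON) != 0, (t.c_lflag & ECHO) != 0);
        return 0;
    }
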
*/ - strcpy(buf, "NULL tty"); - else - strcpy(buf, tty->name); - return buf; -} - -EXPORT_SYMBOL(tty_name); - -inline int tty_paranoia_check(struct tty_struct *tty, struct inode *inode, - const char *routine) -{ -#ifdef TTY_PARANOIA_CHECK - if (!tty) { - printk(KERN_WARNING - "null TTY for (%d:%d) in %s\n", - imajor(inode), iminor(inode), routine); - return 1; - } - if (tty->magic != TTY_MAGIC) { - printk(KERN_WARNING - "bad magic number for tty struct (%d:%d) in %s\n", - imajor(inode), iminor(inode), routine); - return 1; - } -#endif - return 0; -} - -static int check_tty_count(struct tty_struct *tty, const char *routine) -{ -#ifdef CHECK_TTY_COUNT - struct list_head *p; - int count = 0; - - file_list_lock(); - list_for_each(p, &tty->tty_files) { - count++; - } - file_list_unlock(); - if (tty->driver->type == TTY_DRIVER_TYPE_PTY && - tty->driver->subtype == PTY_TYPE_SLAVE && - tty->link && tty->link->count) - count++; - if (tty->count != count) { - printk(KERN_WARNING "Warning: dev (%s) tty->count(%d) " - "!= #fd's(%d) in %s\n", - tty->name, tty->count, count, routine); - return count; - } -#endif - return 0; -} - -/* - * This is probably overkill for real world processors but - * they are not on hot paths so a little discipline won't do - * any harm. - */ - -static void tty_set_termios_ldisc(struct tty_struct *tty, int num) -{ - down(&tty->termios_sem); - tty->termios->c_line = num; - up(&tty->termios_sem); -} - -/* - * This guards the refcounted line discipline lists. The lock - * must be taken with irqs off because there are hangup path - * callers who will do ldisc lookups and cannot sleep. - */ - -static DEFINE_SPINLOCK(tty_ldisc_lock); -static DECLARE_WAIT_QUEUE_HEAD(tty_ldisc_wait); -static struct tty_ldisc tty_ldiscs[NR_LDISCS]; /* line disc dispatch table */ - -int tty_register_ldisc(int disc, struct tty_ldisc *new_ldisc) -{ - unsigned long flags; - int ret = 0; - - if (disc < N_TTY || disc >= NR_LDISCS) - return -EINVAL; - - spin_lock_irqsave(&tty_ldisc_lock, flags); - if (new_ldisc) { - tty_ldiscs[disc] = *new_ldisc; - tty_ldiscs[disc].num = disc; - tty_ldiscs[disc].flags |= LDISC_FLAG_DEFINED; - tty_ldiscs[disc].refcount = 0; - } else { - if(tty_ldiscs[disc].refcount) - ret = -EBUSY; - else - tty_ldiscs[disc].flags &= ~LDISC_FLAG_DEFINED; - } - spin_unlock_irqrestore(&tty_ldisc_lock, flags); - - return ret; -} - -EXPORT_SYMBOL(tty_register_ldisc); - -struct tty_ldisc *tty_ldisc_get(int disc) -{ - unsigned long flags; - struct tty_ldisc *ld; - - if (disc < N_TTY || disc >= NR_LDISCS) - return NULL; - - spin_lock_irqsave(&tty_ldisc_lock, flags); - - ld = &tty_ldiscs[disc]; - /* Check the entry is defined */ - if(ld->flags & LDISC_FLAG_DEFINED) - { - /* If the module is being unloaded we can't use it */ - if (!try_module_get(ld->owner)) - ld = NULL; - else /* lock it */ - ld->refcount++; - } - else - ld = NULL; - spin_unlock_irqrestore(&tty_ldisc_lock, flags); - return ld; -} - -EXPORT_SYMBOL_GPL(tty_ldisc_get); - -void tty_ldisc_put(int disc) -{ - struct tty_ldisc *ld; - unsigned long flags; - - if (disc < N_TTY || disc >= NR_LDISCS) - BUG(); - - spin_lock_irqsave(&tty_ldisc_lock, flags); - ld = &tty_ldiscs[disc]; - if(ld->refcount == 0) - BUG(); - ld->refcount --; - module_put(ld->owner); - spin_unlock_irqrestore(&tty_ldisc_lock, flags); -} - -EXPORT_SYMBOL_GPL(tty_ldisc_put); - -static void tty_ldisc_assign(struct tty_struct *tty, struct tty_ldisc *ld) -{ - tty->ldisc = *ld; - tty->ldisc.refcount = 0; -} - -/** - * tty_ldisc_try - internal helper - * @tty: the tty - * - * Make 
a single attempt to grab and bump the refcount on - * the tty ldisc. Return 0 on failure or 1 on success. This is - * used to implement both the waiting and non waiting versions - * of tty_ldisc_ref - */ - -static int tty_ldisc_try(struct tty_struct *tty) -{ - unsigned long flags; - struct tty_ldisc *ld; - int ret = 0; - - spin_lock_irqsave(&tty_ldisc_lock, flags); - ld = &tty->ldisc; - if(test_bit(TTY_LDISC, &tty->flags)) - { - ld->refcount++; - ret = 1; - } - spin_unlock_irqrestore(&tty_ldisc_lock, flags); - return ret; -} - -/** - * tty_ldisc_ref_wait - wait for the tty ldisc - * @tty: tty device - * - * Dereference the line discipline for the terminal and take a - * reference to it. If the line discipline is in flux then - * wait patiently until it changes. - * - * Note: Must not be called from an IRQ/timer context. The caller - * must also be careful not to hold other locks that will deadlock - * against a discipline change, such as an existing ldisc reference - * (which we check for) - */ - -struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *tty) -{ - /* wait_event is a macro */ - wait_event(tty_ldisc_wait, tty_ldisc_try(tty)); - if(tty->ldisc.refcount == 0) - printk(KERN_ERR "tty_ldisc_ref_wait\n"); - return &tty->ldisc; -} - -EXPORT_SYMBOL_GPL(tty_ldisc_ref_wait); - -/** - * tty_ldisc_ref - get the tty ldisc - * @tty: tty device - * - * Dereference the line discipline for the terminal and take a - * reference to it. If the line discipline is in flux then - * return NULL. Can be called from IRQ and timer functions. - */ - -struct tty_ldisc *tty_ldisc_ref(struct tty_struct *tty) -{ - if(tty_ldisc_try(tty)) - return &tty->ldisc; - return NULL; -} - -EXPORT_SYMBOL_GPL(tty_ldisc_ref); - -/** - * tty_ldisc_deref - free a tty ldisc reference - * @ld: reference to free up - * - * Undoes the effect of tty_ldisc_ref or tty_ldisc_ref_wait. May - * be called in IRQ context. - */ - -void tty_ldisc_deref(struct tty_ldisc *ld) -{ - unsigned long flags; - - if(ld == NULL) - BUG(); - - spin_lock_irqsave(&tty_ldisc_lock, flags); - if(ld->refcount == 0) - printk(KERN_ERR "tty_ldisc_deref: no references.\n"); - else - ld->refcount--; - if(ld->refcount == 0) - wake_up(&tty_ldisc_wait); - spin_unlock_irqrestore(&tty_ldisc_lock, flags); -} - -EXPORT_SYMBOL_GPL(tty_ldisc_deref); - -/** - * tty_ldisc_enable - allow ldisc use - * @tty: terminal to activate ldisc on - * - * Set the TTY_LDISC flag when the line discipline can be called - * again. Do neccessary wakeups for existing sleepers. - * - * Note: nobody should set this bit except via this function. Clearing - * directly is allowed. - */ - -static void tty_ldisc_enable(struct tty_struct *tty) -{ - set_bit(TTY_LDISC, &tty->flags); - wake_up(&tty_ldisc_wait); -} - -/** - * tty_set_ldisc - set line discipline - * @tty: the terminal to set - * @ldisc: the line discipline - * - * Set the discipline of a tty line. Must be called from a process - * context. 
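
The tty_ldisc_try()/tty_ldisc_deref() pair above is a hand-rolled reader reference count guarded by tty_ldisc_lock: takers succeed only while the TTY_LDISC bit is set, and the last deref wakes anyone waiting to change the discipline. A minimal user-space analogue of that protocol (the names ldisc_try/ldisc_deref are invented here, and a pthread mutex stands in for the spinlock):

    #include <pthread.h>
    #include <stdio.h>

    struct ldisc {
        pthread_mutex_t lock;
        int usable;     /* stands in for the TTY_LDISC flag */
        int refcount;
    };

    /* like tty_ldisc_try(): take a reference only if the discipline
     * is currently usable; never block */
    static int ldisc_try(struct ldisc *ld)
    {
        int ok = 0;
        pthread_mutex_lock(&ld->lock);
        if (ld->usable) {
            ld->refcount++;
            ok = 1;
        }
        pthread_mutex_unlock(&ld->lock);
        return ok;
    }

    /* like tty_ldisc_deref(): drop the reference taken above */
    static void ldisc_deref(struct ldisc *ld)
    {
        pthread_mutex_lock(&ld->lock);
        if (ld->refcount > 0)
            ld->refcount--;
        pthread_mutex_unlock(&ld->lock);
    }

    int main(void)
    {
        struct ldisc ld = { PTHREAD_MUTEX_INITIALIZER, 1, 0 };
        if (ldisc_try(&ld)) {
            /* safe to call into the discipline here */
            ldisc_deref(&ld);
        }
        printf("refcount back to %d\n", ld.refcount);
        return 0;
    }
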
- */ - -static int tty_set_ldisc(struct tty_struct *tty, int ldisc) -{ - int retval = 0; - struct tty_ldisc o_ldisc; - char buf[64]; - int work; - unsigned long flags; - struct tty_ldisc *ld; - - if ((ldisc < N_TTY) || (ldisc >= NR_LDISCS)) - return -EINVAL; - -restart: - - if (tty->ldisc.num == ldisc) - return 0; /* We are already in the desired discipline */ - - ld = tty_ldisc_get(ldisc); - /* Eduardo Blanco <ejbs@xxxxxxxxxxxx> */ - /* Cyrus Durgin <cider@xxxxxxxxxxxxx> */ - if (ld == NULL) { - request_module("tty-ldisc-%d", ldisc); - ld = tty_ldisc_get(ldisc); - } - if (ld == NULL) - return -EINVAL; - - o_ldisc = tty->ldisc; - - tty_wait_until_sent(tty, 0); - - /* - * Make sure we don't change while someone holds a - * reference to the line discipline. The TTY_LDISC bit - * prevents anyone taking a reference once it is clear. - * We need the lock to avoid racing reference takers. - */ - - spin_lock_irqsave(&tty_ldisc_lock, flags); - if(tty->ldisc.refcount) - { - /* Free the new ldisc we grabbed. Must drop the lock - first. */ - spin_unlock_irqrestore(&tty_ldisc_lock, flags); - tty_ldisc_put(ldisc); - /* - * There are several reasons we may be busy, including - * random momentary I/O traffic. We must therefore - * retry. We could distinguish between blocking ops - * and retries if we made tty_ldisc_wait() smarter. That - * is up for discussion. - */ - if(wait_event_interruptible(tty_ldisc_wait, tty->ldisc.refcount == 0) < 0) - return -ERESTARTSYS; - goto restart; - } - clear_bit(TTY_LDISC, &tty->flags); - clear_bit(TTY_DONT_FLIP, &tty->flags); - spin_unlock_irqrestore(&tty_ldisc_lock, flags); - - /* - * From this point on we know nobody has an ldisc - * usage reference, nor can they obtain one until - * we say so later on. - */ - - work = cancel_delayed_work(&tty->flip.work); - /* - * Wait for ->hangup_work and ->flip.work handlers to terminate - */ - - flush_scheduled_work(); - /* Shutdown the current discipline. */ - if (tty->ldisc.close) - (tty->ldisc.close)(tty); - - /* Now set up the new line discipline. */ - tty_ldisc_assign(tty, ld); - tty_set_termios_ldisc(tty, ldisc); - if (tty->ldisc.open) - retval = (tty->ldisc.open)(tty); - if (retval < 0) { - tty_ldisc_put(ldisc); - /* There is an outstanding reference here so this is safe */ - tty_ldisc_assign(tty, tty_ldisc_get(o_ldisc.num)); - tty_set_termios_ldisc(tty, tty->ldisc.num); - if (tty->ldisc.open && (tty->ldisc.open(tty) < 0)) { - tty_ldisc_put(o_ldisc.num); - /* This driver is always present */ - tty_ldisc_assign(tty, tty_ldisc_get(N_TTY)); - tty_set_termios_ldisc(tty, N_TTY); - if (tty->ldisc.open) { - int r = tty->ldisc.open(tty); - - if (r < 0) - panic("Couldn't open N_TTY ldisc for " - "%s --- error %d.", - tty_name(tty, buf), r); - } - } - } - /* At this point we hold a reference to the new ldisc and a - a reference to the old ldisc. If we ended up flipping back - to the existing ldisc we have two references to it */ - - if (tty->ldisc.num != o_ldisc.num && tty->driver->set_ldisc) - tty->driver->set_ldisc(tty); - - tty_ldisc_put(o_ldisc.num); - - /* - * Allow ldisc referencing to occur as soon as the driver - * ldisc callback completes. - */ - - tty_ldisc_enable(tty); - - /* Restart it in case no characters kick it off. 
Safe if - already running */ - if(work) - schedule_delayed_work(&tty->flip.work, 1); - return retval; -} - -/* - * This routine returns a tty driver structure, given a device number - */ -static struct tty_driver *get_tty_driver(dev_t device, int *index) -{ - struct tty_driver *p; - - list_for_each_entry(p, &tty_drivers, tty_drivers) { - dev_t base = MKDEV(p->major, p->minor_start); - if (device < base || device >= base + p->num) - continue; - *index = device - base; - return p; - } - return NULL; -} - -/* - * If we try to write to, or set the state of, a terminal and we're - * not in the foreground, send a SIGTTOU. If the signal is blocked or - * ignored, go ahead and perform the operation. (POSIX 7.2) - */ -int tty_check_change(struct tty_struct * tty) -{ - if (current->signal->tty != tty) - return 0; - if (tty->pgrp <= 0) { - printk(KERN_WARNING "tty_check_change: tty->pgrp <= 0!\n"); - return 0; - } - if (process_group(current) == tty->pgrp) - return 0; - if (is_ignored(SIGTTOU)) - return 0; - if (is_orphaned_pgrp(process_group(current))) - return -EIO; - (void) kill_pg(process_group(current), SIGTTOU, 1); - return -ERESTARTSYS; -} - -EXPORT_SYMBOL(tty_check_change); - -static ssize_t hung_up_tty_read(struct file * file, char __user * buf, - size_t count, loff_t *ppos) -{ - return 0; -} - -static ssize_t hung_up_tty_write(struct file * file, const char __user * buf, - size_t count, loff_t *ppos) -{ - return -EIO; -} - -/* No kernel lock held - none needed ;) */ -static unsigned int hung_up_tty_poll(struct file * filp, poll_table * wait) -{ - return POLLIN | POLLOUT | POLLERR | POLLHUP | POLLRDNORM | POLLWRNORM; -} - -static int hung_up_tty_ioctl(struct inode * inode, struct file * file, - unsigned int cmd, unsigned long arg) -{ - return cmd == TIOCSPGRP ? -ENOTTY : -EIO; -} - -static struct file_operations tty_fops = { - .llseek = no_llseek, - .read = tty_read, - .write = tty_write, - .poll = tty_poll, - .ioctl = tty_ioctl, - .open = tty_open, - .release = tty_release, - .fasync = tty_fasync, -}; - -#ifdef CONFIG_UNIX98_PTYS -static struct file_operations ptmx_fops = { - .llseek = no_llseek, - .read = tty_read, - .write = tty_write, - .poll = tty_poll, - .ioctl = tty_ioctl, - .open = ptmx_open, - .release = tty_release, - .fasync = tty_fasync, -}; -#endif - -static struct file_operations console_fops = { - .llseek = no_llseek, - .read = tty_read, - .write = redirected_tty_write, - .poll = tty_poll, - .ioctl = tty_ioctl, - .open = tty_open, - .release = tty_release, - .fasync = tty_fasync, -}; - -static struct file_operations hung_up_tty_fops = { - .llseek = no_llseek, - .read = hung_up_tty_read, - .write = hung_up_tty_write, - .poll = hung_up_tty_poll, - .ioctl = hung_up_tty_ioctl, - .release = tty_release, -}; - -static DEFINE_SPINLOCK(redirect_lock); -static struct file *redirect; - -/** - * tty_wakeup - request more data - * @tty: terminal - * - * Internal and external helper for wakeups of tty. This function - * informs the line discipline if present that the driver is ready - * to receive more output data. - */ - -void tty_wakeup(struct tty_struct *tty) -{ - struct tty_ldisc *ld; - - if (test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) { - ld = tty_ldisc_ref(tty); - if(ld) { - if(ld->write_wakeup) - ld->write_wakeup(tty); - tty_ldisc_deref(ld); - } - } - wake_up_interruptible(&tty->write_wait); -} - -EXPORT_SYMBOL_GPL(tty_wakeup); - -/** - * tty_ldisc_flush - flush line discipline queue - * @tty: tty - * - * Flush the line discipline queue (if any) for this tty. 
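
Earlier in this hunk, get_tty_driver() resolves a dev_t by testing containment in each driver's [base, base + num) minor range and derives the line index from the offset. A standalone sketch of just that arithmetic (the driver table contents here are invented for illustration):

    #include <stdio.h>

    struct drv { int major, minor_start, num; const char *name; };

    static const struct drv drivers[] = {
        { 4,   64, 32,  "ttyS" },   /* illustrative entries only */
        { 136,  0, 256, "pts"  },
    };

    static const struct drv *lookup(int major, int minor, int *index)
    {
        for (size_t i = 0; i < sizeof(drivers) / sizeof(drivers[0]); i++) {
            const struct drv *p = &drivers[i];
            if (major != p->major || minor < p->minor_start ||
                minor >= p->minor_start + p->num)
                continue;
            *index = minor - p->minor_start;  /* same as device - base */
            return p;
        }
        return NULL;
    }

    int main(void)
    {
        int idx;
        const struct drv *p = lookup(4, 65, &idx);
        if (p)
            printf("%s%d\n", p->name, idx);   /* prints ttyS1 */
        return 0;
    }
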
If there - * is no line discipline active this is a no-op. - */ - -void tty_ldisc_flush(struct tty_struct *tty) -{ - struct tty_ldisc *ld = tty_ldisc_ref(tty); - if(ld) { - if(ld->flush_buffer) - ld->flush_buffer(tty); - tty_ldisc_deref(ld); - } -} - -EXPORT_SYMBOL_GPL(tty_ldisc_flush); - -/* - * This can be called by the "eventd" kernel thread. That is process synchronous, - * but doesn't hold any locks, so we need to make sure we have the appropriate - * locks for what we're doing.. - */ -static void do_tty_hangup(void *data) -{ - struct tty_struct *tty = (struct tty_struct *) data; - struct file * cons_filp = NULL; - struct file *filp, *f = NULL; - struct task_struct *p; - struct tty_ldisc *ld; - int closecount = 0, n; - - if (!tty) - return; - - /* inuse_filps is protected by the single kernel lock */ - lock_kernel(); - - spin_lock(&redirect_lock); - if (redirect && redirect->private_data == tty) { - f = redirect; - redirect = NULL; - } - spin_unlock(&redirect_lock); - - check_tty_count(tty, "do_tty_hangup"); - file_list_lock(); - /* This breaks for file handles being sent over AF_UNIX sockets ? */ - list_for_each_entry(filp, &tty->tty_files, f_list) { - if (filp->f_op->write == redirected_tty_write) - cons_filp = filp; - if (filp->f_op->write != tty_write) - continue; - closecount++; - tty_fasync(-1, filp, 0); /* can't block */ - filp->f_op = &hung_up_tty_fops; - } - file_list_unlock(); - - /* FIXME! What are the locking issues here? This may me overdoing things.. - * this question is especially important now that we've removed the irqlock. */ - - ld = tty_ldisc_ref(tty); - if(ld != NULL) /* We may have no line discipline at this point */ - { - if (ld->flush_buffer) - ld->flush_buffer(tty); - if (tty->driver->flush_buffer) - tty->driver->flush_buffer(tty); - if ((test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) && - ld->write_wakeup) - ld->write_wakeup(tty); - if (ld->hangup) - ld->hangup(tty); - } - - /* FIXME: Once we trust the LDISC code better we can wait here for - ldisc completion and fix the driver call race */ - - wake_up_interruptible(&tty->write_wait); - wake_up_interruptible(&tty->read_wait); - - /* - * Shutdown the current line discipline, and reset it to - * N_TTY. - */ - if (tty->driver->flags & TTY_DRIVER_RESET_TERMIOS) - { - down(&tty->termios_sem); - *tty->termios = tty->driver->init_termios; - up(&tty->termios_sem); - } - - /* Defer ldisc switch */ - /* tty_deferred_ldisc_switch(N_TTY); - - This should get done automatically when the port closes and - tty_release is called */ - - read_lock(&tasklist_lock); - if (tty->session > 0) { - do_each_task_pid(tty->session, PIDTYPE_SID, p) { - if (p->signal->tty == tty) - p->signal->tty = NULL; - if (!p->signal->leader) - continue; - send_group_sig_info(SIGHUP, SEND_SIG_PRIV, p); - send_group_sig_info(SIGCONT, SEND_SIG_PRIV, p); - if (tty->pgrp > 0) - p->signal->tty_old_pgrp = tty->pgrp; - } while_each_task_pid(tty->session, PIDTYPE_SID, p); - } - read_unlock(&tasklist_lock); - - tty->flags = 0; - tty->session = 0; - tty->pgrp = -1; - tty->ctrl_status = 0; - /* - * If one of the devices matches a console pointer, we - * cannot just call hangup() because that will cause - * tty->count and state->count to go out of sync. - * So we just call close() the right number of times. 
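
do_tty_hangup() above is also reachable directly from user space: the vhangup() system call runs this path on the caller's controlling tty, which is how login(1) makes sure no stale descriptors survive onto a fresh session. A minimal invocation; expect EPERM without CAP_SYS_TTY_CONFIG:

    #define _DEFAULT_SOURCE
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        if (vhangup() != 0) {
            perror("vhangup");   /* EPERM as an ordinary user */
            return 1;
        }
        /* open descriptors on the tty now point at hung_up_tty_fops:
         * reads return 0 and writes fail with -EIO */
        return 0;
    }
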
- */ - if (cons_filp) { - if (tty->driver->close) - for (n = 0; n < closecount; n++) - tty->driver->close(tty, cons_filp); - } else if (tty->driver->hangup) - (tty->driver->hangup)(tty); - - /* We don't want to have driver/ldisc interactions beyond - the ones we did here. The driver layer expects no - calls after ->hangup() from the ldisc side. However we - can't yet guarantee all that */ - - set_bit(TTY_HUPPED, &tty->flags); - if (ld) { - tty_ldisc_enable(tty); - tty_ldisc_deref(ld); - } - unlock_kernel(); - if (f) - fput(f); -} - -void tty_hangup(struct tty_struct * tty) -{ -#ifdef TTY_DEBUG_HANGUP - char buf[64]; - - printk(KERN_DEBUG "%s hangup...\n", tty_name(tty, buf)); -#endif - schedule_work(&tty->hangup_work); -} - -EXPORT_SYMBOL(tty_hangup); - -void tty_vhangup(struct tty_struct * tty) -{ -#ifdef TTY_DEBUG_HANGUP - char buf[64]; - - printk(KERN_DEBUG "%s vhangup...\n", tty_name(tty, buf)); -#endif - do_tty_hangup((void *) tty); -} -EXPORT_SYMBOL(tty_vhangup); - -int tty_hung_up_p(struct file * filp) -{ - return (filp->f_op == &hung_up_tty_fops); -} - -EXPORT_SYMBOL(tty_hung_up_p); - -/* - * This function is typically called only by the session leader, when - * it wants to disassociate itself from its controlling tty. - * - * It performs the following functions: - * (1) Sends a SIGHUP and SIGCONT to the foreground process group - * (2) Clears the tty from being controlling the session - * (3) Clears the controlling tty for all processes in the - * session group. - * - * The argument on_exit is set to 1 if called when a process is - * exiting; it is 0 if called by the ioctl TIOCNOTTY. - */ -void disassociate_ctty(int on_exit) -{ - struct tty_struct *tty; - struct task_struct *p; - int tty_pgrp = -1; - - lock_kernel(); - - down(&tty_sem); - tty = current->signal->tty; - if (tty) { - tty_pgrp = tty->pgrp; - up(&tty_sem); - if (on_exit && tty->driver->type != TTY_DRIVER_TYPE_PTY) - tty_vhangup(tty); - } else { - if (current->signal->tty_old_pgrp) { - kill_pg(current->signal->tty_old_pgrp, SIGHUP, on_exit); - kill_pg(current->signal->tty_old_pgrp, SIGCONT, on_exit); - } - up(&tty_sem); - unlock_kernel(); - return; - } - if (tty_pgrp > 0) { - kill_pg(tty_pgrp, SIGHUP, on_exit); - if (!on_exit) - kill_pg(tty_pgrp, SIGCONT, on_exit); - } - - /* Must lock changes to tty_old_pgrp */ - down(&tty_sem); - current->signal->tty_old_pgrp = 0; - tty->session = 0; - tty->pgrp = -1; - - /* Now clear signal->tty under the lock */ - read_lock(&tasklist_lock); - do_each_task_pid(current->signal->session, PIDTYPE_SID, p) { - p->signal->tty = NULL; - } while_each_task_pid(current->signal->session, PIDTYPE_SID, p); - read_unlock(&tasklist_lock); - up(&tty_sem); - unlock_kernel(); -} - -void stop_tty(struct tty_struct *tty) -{ - if (tty->stopped) - return; - tty->stopped = 1; - if (tty->link && tty->link->packet) { - tty->ctrl_status &= ~TIOCPKT_START; - tty->ctrl_status |= TIOCPKT_STOP; - wake_up_interruptible(&tty->link->read_wait); - } - if (tty->driver->stop) - (tty->driver->stop)(tty); -} - -EXPORT_SYMBOL(stop_tty); - -void start_tty(struct tty_struct *tty) -{ - if (!tty->stopped || tty->flow_stopped) - return; - tty->stopped = 0; - if (tty->link && tty->link->packet) { - tty->ctrl_status &= ~TIOCPKT_STOP; - tty->ctrl_status |= TIOCPKT_START; - wake_up_interruptible(&tty->link->read_wait); - } - if (tty->driver->start) - (tty->driver->start)(tty); - - /* If we have a running line discipline it may need kicking */ - tty_wakeup(tty); - wake_up_interruptible(&tty->write_wait); -} - 
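
stop_tty() and start_tty() above are the output flow-control switch points; from user space they are driven by ^S/^Q or, programmatically, by tcflow(). A small demo of the round trip, assuming stdout is a terminal:

    #include <stdio.h>
    #include <termios.h>
    #include <unistd.h>

    int main(void)
    {
        if (!isatty(STDOUT_FILENO))
            return 1;
        /* TCOOFF reaches the driver as stop_tty() */
        if (tcflow(STDOUT_FILENO, TCOOFF) != 0) { perror("tcflow"); return 1; }
        printf("queued while output was suspended\n");
        /* TCOON restarts output via start_tty(); the line above appears now */
        tcflow(STDOUT_FILENO, TCOON);
        return 0;
    }
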
-EXPORT_SYMBOL(start_tty); - -static ssize_t tty_read(struct file * file, char __user * buf, size_t count, - loff_t *ppos) -{ - int i; - struct tty_struct * tty; - struct inode *inode; - struct tty_ldisc *ld; - - tty = (struct tty_struct *)file->private_data; - inode = file->f_dentry->d_inode; - if (tty_paranoia_check(tty, inode, "tty_read")) - return -EIO; - if (!tty || (test_bit(TTY_IO_ERROR, &tty->flags))) - return -EIO; - - /* We want to wait for the line discipline to sort out in this - situation */ - ld = tty_ldisc_ref_wait(tty); - lock_kernel(); - if (ld->read) - i = (ld->read)(tty,file,buf,count); - else - i = -EIO; - tty_ldisc_deref(ld); - unlock_kernel(); - if (i > 0) - inode->i_atime = current_fs_time(inode->i_sb); - return i; -} - -/* - * Split writes up in sane blocksizes to avoid - * denial-of-service type attacks - */ -static inline ssize_t do_tty_write( - ssize_t (*write)(struct tty_struct *, struct file *, const unsigned char *, size_t), - struct tty_struct *tty, - struct file *file, - const char __user *buf, - size_t count) -{ - ssize_t ret = 0, written = 0; - unsigned int chunk; - - if (down_interruptible(&tty->atomic_write)) { - return -ERESTARTSYS; - } - - /* - * We chunk up writes into a temporary buffer. This - * simplifies low-level drivers immensely, since they - * don't have locking issues and user mode accesses. - * - * But if TTY_NO_WRITE_SPLIT is set, we should use a - * big chunk-size.. - * - * The default chunk-size is 2kB, because the NTTY - * layer has problems with bigger chunks. It will - * claim to be able to handle more characters than - * it actually does. - */ - chunk = 2048; - if (test_bit(TTY_NO_WRITE_SPLIT, &tty->flags)) - chunk = 65536; - if (count < chunk) - chunk = count; - - /* write_buf/write_cnt is protected by the atomic_write semaphore */ - if (tty->write_cnt < chunk) { - unsigned char *buf; - - if (chunk < 1024) - chunk = 1024; - - buf = kmalloc(chunk, GFP_KERNEL); - if (!buf) { - up(&tty->atomic_write); - return -ENOMEM; - } - kfree(tty->write_buf); - tty->write_cnt = chunk; - tty->write_buf = buf; - } - - /* Do the write .. 
*/ - for (;;) { - size_t size = count; - if (size > chunk) - size = chunk; - ret = -EFAULT; - if (copy_from_user(tty->write_buf, buf, size)) - break; - lock_kernel(); - ret = write(tty, file, tty->write_buf, size); - unlock_kernel(); - if (ret <= 0) - break; - written += ret; - buf += ret; - count -= ret; - if (!count) - break; - ret = -ERESTARTSYS; - if (signal_pending(current)) - break; - cond_resched(); - } - if (written) { - struct inode *inode = file->f_dentry->d_inode; - inode->i_mtime = current_fs_time(inode->i_sb); - ret = written; - } - up(&tty->atomic_write); - return ret; -} - - -static ssize_t tty_write(struct file * file, const char __user * buf, size_t count, - loff_t *ppos) -{ - struct tty_struct * tty; - struct inode *inode = file->f_dentry->d_inode; - ssize_t ret; - struct tty_ldisc *ld; - - tty = (struct tty_struct *)file->private_data; - if (tty_paranoia_check(tty, inode, "tty_write")) - return -EIO; - if (!tty || !tty->driver->write || (test_bit(TTY_IO_ERROR, &tty->flags))) - return -EIO; - - ld = tty_ldisc_ref_wait(tty); - if (!ld->write) - ret = -EIO; - else - ret = do_tty_write(ld->write, tty, file, buf, count); - tty_ldisc_deref(ld); - return ret; -} - -ssize_t redirected_tty_write(struct file * file, const char __user * buf, size_t count, - loff_t *ppos) -{ - struct file *p = NULL; - - spin_lock(&redirect_lock); - if (redirect) { - get_file(redirect); - p = redirect; - } - spin_unlock(&redirect_lock); - - if (p) { - ssize_t res; - res = vfs_write(p, buf, count, &p->f_pos); - fput(p); - return res; - } - - return tty_write(file, buf, count, ppos); -} - -static char ptychar[] = "pqrstuvwxyzabcde"; - -static inline void pty_line_name(struct tty_driver *driver, int index, char *p) -{ - int i = index + driver->name_base; - /* ->name is initialized to "ttyp", but "tty" is expected */ - sprintf(p, "%s%c%x", - driver->subtype == PTY_TYPE_SLAVE ? "tty" : driver->name, - ptychar[i >> 4 & 0xf], i & 0xf); -} - -static inline void tty_line_name(struct tty_driver *driver, int index, char *p) -{ - sprintf(p, "%s%d", driver->name, index + driver->name_base); -} - -/* - * WSH 06/09/97: Rewritten to remove races and properly clean up after a - * failed open. The new code protects the open with a semaphore, so it's - * really quite straightforward. The semaphore locking can probably be - * relaxed for the (most common) case of reopening a tty. - */ -static int init_dev(struct tty_driver *driver, int idx, - struct tty_struct **ret_tty) -{ - struct tty_struct *tty, *o_tty; - struct termios *tp, **tp_loc, *o_tp, **o_tp_loc; - struct termios *ltp, **ltp_loc, *o_ltp, **o_ltp_loc; - int retval=0; - - /* check whether we're reopening an existing tty */ - if (driver->flags & TTY_DRIVER_DEVPTS_MEM) { - tty = devpts_get_tty(idx); - if (tty && driver->subtype == PTY_TYPE_MASTER) - tty = tty->link; - } else { - tty = driver->ttys[idx]; - } - if (tty) goto fast_track; - - /* - * First time open is complex, especially for PTY devices. - * This code guarantees that either everything succeeds and the - * TTY is ready for operation, or else the table slots are vacated - * and the allocated memory released. (Except that the termios - * and locked termios may be retained.) 
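
A user-space consequence of the do_tty_write() chunking above: a write() to a tty may legitimately return a short count, for example when a signal arrives mid-transfer, so portable callers wrap it in a retry loop. A sketch (write_all() is an illustrative helper name, not kernel API):

    #include <errno.h>
    #include <unistd.h>

    /* keep writing until everything is accepted or a real error occurs */
    static ssize_t write_all(int fd, const char *buf, size_t len)
    {
        size_t done = 0;
        while (done < len) {
            ssize_t n = write(fd, buf + done, len - done);
            if (n < 0) {
                if (errno == EINTR)
                    continue;   /* interrupted before any progress; retry */
                return -1;
            }
            done += n;
        }
        return (ssize_t)done;
    }

    int main(void)
    {
        static const char msg[] = "hello via write_all\n";
        return write_all(STDOUT_FILENO, msg, sizeof(msg) - 1) < 0;
    }
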
- */ - - if (!try_module_get(driver->owner)) { - retval = -ENODEV; - goto end_init; - } - - o_tty = NULL; - tp = o_tp = NULL; - ltp = o_ltp = NULL; - - tty = alloc_tty_struct(); - if(!tty) - goto fail_no_mem; - initialize_tty_struct(tty); - tty->driver = driver; - tty->index = idx; - tty_line_name(driver, idx, tty->name); - - if (driver->flags & TTY_DRIVER_DEVPTS_MEM) { - tp_loc = &tty->termios; - ltp_loc = &tty->termios_locked; - } else { - tp_loc = &driver->termios[idx]; - ltp_loc = &driver->termios_locked[idx]; - } - - if (!*tp_loc) { - tp = (struct termios *) kmalloc(sizeof(struct termios), - GFP_KERNEL); - if (!tp) - goto free_mem_out; - *tp = driver->init_termios; - } - - if (!*ltp_loc) { - ltp = (struct termios *) kmalloc(sizeof(struct termios), - GFP_KERNEL); - if (!ltp) - goto free_mem_out; - memset(ltp, 0, sizeof(struct termios)); - } - - if (driver->type == TTY_DRIVER_TYPE_PTY) { - o_tty = alloc_tty_struct(); - if (!o_tty) - goto free_mem_out; - initialize_tty_struct(o_tty); - o_tty->driver = driver->other; - o_tty->index = idx; - tty_line_name(driver->other, idx, o_tty->name); - - if (driver->flags & TTY_DRIVER_DEVPTS_MEM) { - o_tp_loc = &o_tty->termios; - o_ltp_loc = &o_tty->termios_locked; - } else { - o_tp_loc = &driver->other->termios[idx]; - o_ltp_loc = &driver->other->termios_locked[idx]; - } - - if (!*o_tp_loc) { - o_tp = (struct termios *) - kmalloc(sizeof(struct termios), GFP_KERNEL); - if (!o_tp) - goto free_mem_out; - *o_tp = driver->other->init_termios; - } - - if (!*o_ltp_loc) { - o_ltp = (struct termios *) - kmalloc(sizeof(struct termios), GFP_KERNEL); - if (!o_ltp) - goto free_mem_out; - memset(o_ltp, 0, sizeof(struct termios)); - } - - /* - * Everything allocated ... set up the o_tty structure. - */ - if (!(driver->other->flags & TTY_DRIVER_DEVPTS_MEM)) { - driver->other->ttys[idx] = o_tty; - } - if (!*o_tp_loc) - *o_tp_loc = o_tp; - if (!*o_ltp_loc) - *o_ltp_loc = o_ltp; - o_tty->termios = *o_tp_loc; - o_tty->termios_locked = *o_ltp_loc; - driver->other->refcount++; - if (driver->subtype == PTY_TYPE_MASTER) - o_tty->count++; - - /* Establish the links in both directions */ - tty->link = o_tty; - o_tty->link = tty; - } - - /* - * All structures have been allocated, so now we install them. - * Failures after this point use release_mem to clean up, so - * there's no need to null out the local pointers. - */ - if (!(driver->flags & TTY_DRIVER_DEVPTS_MEM)) { - driver->ttys[idx] = tty; - } - - if (!*tp_loc) - *tp_loc = tp; - if (!*ltp_loc) - *ltp_loc = ltp; - tty->termios = *tp_loc; - tty->termios_locked = *ltp_loc; - driver->refcount++; - tty->count++; - - /* - * Structures all installed ... call the ldisc open routines. - * If we fail here just call release_mem to clean up. No need - * to decrement the use counts, as release_mem doesn't care. - */ - - if (tty->ldisc.open) { - retval = (tty->ldisc.open)(tty); - if (retval) - goto release_mem_out; - } - if (o_tty && o_tty->ldisc.open) { - retval = (o_tty->ldisc.open)(o_tty); - if (retval) { - if (tty->ldisc.close) - (tty->ldisc.close)(tty); - goto release_mem_out; - } - tty_ldisc_enable(o_tty); - } - tty_ldisc_enable(tty); - goto success; - - /* - * This fast open can be used if the tty is already open. - * No memory is allocated, and the only failures are from - * attempting to open a closing tty or attempting multiple - * opens on a pty master. 
- */ -fast_track: - if (test_bit(TTY_CLOSING, &tty->flags)) { - retval = -EIO; - goto end_init; - } - if (driver->type == TTY_DRIVER_TYPE_PTY && - driver->subtype == PTY_TYPE_MASTER) { - /* - * special case for PTY masters: only one open permitted, - * and the slave side open count is incremented as well. - */ - if (tty->count) { - retval = -EIO; - goto end_init; - } - tty->link->count++; - } - tty->count++; - tty->driver = driver; /* N.B. why do this every time?? */ - - /* FIXME */ - if(!test_bit(TTY_LDISC, &tty->flags)) - printk(KERN_ERR "init_dev but no ldisc\n"); -success: - *ret_tty = tty; - - /* All paths come through here to release the semaphore */ -end_init: - return retval; - - /* Release locally allocated memory ... nothing placed in slots */ -free_mem_out: - if (o_tp) - kfree(o_tp); - if (o_tty) - free_tty_struct(o_tty); - if (ltp) - kfree(ltp); - if (tp) - kfree(tp); - free_tty_struct(tty); - -fail_no_mem: - module_put(driver->owner); - retval = -ENOMEM; - goto end_init; - - /* call the tty release_mem routine to clean out this slot */ -release_mem_out: - printk(KERN_INFO "init_dev: ldisc open failed, " - "clearing slot %d\n", idx); - release_mem(tty, idx); - goto end_init; -} - -/* - * Releases memory associated with a tty structure, and clears out the - * driver table slots. - */ -static void release_mem(struct tty_struct *tty, int idx) -{ - struct tty_struct *o_tty; - struct termios *tp; - int devpts = tty->driver->flags & TTY_DRIVER_DEVPTS_MEM; - - if ((o_tty = tty->link) != NULL) { - if (!devpts) - o_tty->driver->ttys[idx] = NULL; - if (o_tty->driver->flags & TTY_DRIVER_RESET_TERMIOS) { - tp = o_tty->termios; - if (!devpts) - o_tty->driver->termios[idx] = NULL; - kfree(tp); - - tp = o_tty->termios_locked; - if (!devpts) - o_tty->driver->termios_locked[idx] = NULL; - kfree(tp); - } - o_tty->magic = 0; - o_tty->driver->refcount--; - file_list_lock(); - list_del_init(&o_tty->tty_files); - file_list_unlock(); - free_tty_struct(o_tty); - } - - if (!devpts) - tty->driver->ttys[idx] = NULL; - if (tty->driver->flags & TTY_DRIVER_RESET_TERMIOS) { - tp = tty->termios; - if (!devpts) - tty->driver->termios[idx] = NULL; - kfree(tp); - - tp = tty->termios_locked; - if (!devpts) - tty->driver->termios_locked[idx] = NULL; - kfree(tp); - } - - tty->magic = 0; - tty->driver->refcount--; - file_list_lock(); - list_del_init(&tty->tty_files); - file_list_unlock(); - module_put(tty->driver->owner); - free_tty_struct(tty); -} - -/* - * Even releasing the tty structures is a tricky business.. We have - * to be very careful that the structures are all released at the - * same time, as interrupts might otherwise get the wrong pointers. - * - * WSH 09/09/97: rewritten to avoid some nasty race conditions that could - * lead to double frees or releasing memory still in use. 
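
The free_mem_out/release_mem_out exits in init_dev() above are the standard kernel goto-unwind idiom: allocate in order, and on failure jump to a label that frees everything acquired so far, in reverse. A distilled user-space version of the same shape (the struct and function names are invented for the example):

    #include <stdlib.h>

    struct pair { char *a, *b; };

    static struct pair *pair_alloc(size_t n)
    {
        struct pair *p = malloc(sizeof(*p));
        if (!p)
            goto fail;
        p->a = malloc(n);
        if (!p->a)
            goto free_pair;
        p->b = malloc(n);
        if (!p->b)
            goto free_a;
        return p;            /* success: everything allocated */

    free_a:                  /* unwind in reverse order of allocation */
        free(p->a);
    free_pair:
        free(p);
    fail:
        return NULL;
    }

    int main(void)
    {
        struct pair *p = pair_alloc(16);
        if (p) { free(p->b); free(p->a); free(p); }
        return p == NULL;
    }
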
- */ -static void release_dev(struct file * filp) -{ - struct tty_struct *tty, *o_tty; - int pty_master, tty_closing, o_tty_closing, do_sleep; - int devpts_master, devpts; - int idx; - char buf[64]; - unsigned long flags; - - tty = (struct tty_struct *)filp->private_data; - if (tty_paranoia_check(tty, filp->f_dentry->d_inode, "release_dev")) - return; - - check_tty_count(tty, "release_dev"); - - tty_fasync(-1, filp, 0); - - idx = tty->index; - pty_master = (tty->driver->type == TTY_DRIVER_TYPE_PTY && - tty->driver->subtype == PTY_TYPE_MASTER); - devpts = (tty->driver->flags & TTY_DRIVER_DEVPTS_MEM) != 0; - devpts_master = pty_master && devpts; - o_tty = tty->link; - -#ifdef TTY_PARANOIA_CHECK - if (idx < 0 || idx >= tty->driver->num) { - printk(KERN_DEBUG "release_dev: bad idx when trying to " - "free (%s)\n", tty->name); - return; - } - if (!(tty->driver->flags & TTY_DRIVER_DEVPTS_MEM)) { - if (tty != tty->driver->ttys[idx]) { - printk(KERN_DEBUG "release_dev: driver.table[%d] not tty " - "for (%s)\n", idx, tty->name); - return; - } - if (tty->termios != tty->driver->termios[idx]) { - printk(KERN_DEBUG "release_dev: driver.termios[%d] not termios " - "for (%s)\n", - idx, tty->name); - return; - } - if (tty->termios_locked != tty->driver->termios_locked[idx]) { - printk(KERN_DEBUG "release_dev: driver.termios_locked[%d] not " - "termios_locked for (%s)\n", - idx, tty->name); - return; - } - } -#endif - -#ifdef TTY_DEBUG_HANGUP - printk(KERN_DEBUG "release_dev of %s (tty count=%d)...", - tty_name(tty, buf), tty->count); -#endif - -#ifdef TTY_PARANOIA_CHECK - if (tty->driver->other && - !(tty->driver->flags & TTY_DRIVER_DEVPTS_MEM)) { - if (o_tty != tty->driver->other->ttys[idx]) { - printk(KERN_DEBUG "release_dev: other->table[%d] " - "not o_tty for (%s)\n", - idx, tty->name); - return; - } - if (o_tty->termios != tty->driver->other->termios[idx]) { - printk(KERN_DEBUG "release_dev: other->termios[%d] " - "not o_termios for (%s)\n", - idx, tty->name); - return; - } - if (o_tty->termios_locked != - tty->driver->other->termios_locked[idx]) { - printk(KERN_DEBUG "release_dev: other->termios_locked[" - "%d] not o_termios_locked for (%s)\n", - idx, tty->name); - return; - } - if (o_tty->link != tty) { - printk(KERN_DEBUG "release_dev: bad pty pointers\n"); - return; - } - } -#endif - if (tty->driver->close) - tty->driver->close(tty, filp); - - /* - * Sanity check: if tty->count is going to zero, there shouldn't be - * any waiters on tty->read_wait or tty->write_wait. We test the - * wait queues and kick everyone out _before_ actually starting to - * close. This ensures that we won't block while releasing the tty - * structure. - * - * The test for the o_tty closing is necessary, since the master and - * slave sides may close in any order. If the slave side closes out - * first, its count will be one, since the master side holds an open. - * Thus this test wouldn't be triggered at the time the slave closes, - * so we do it now. - * - * Note that it's possible for the tty to be opened again while we're - * flushing out waiters. By recalculating the closing flags before - * each iteration we avoid any problems. - */ - while (1) { - /* Guard against races with tty->count changes elsewhere and - opens on /dev/tty */ - - down(&tty_sem); - tty_closing = tty->count <= 1; - o_tty_closing = o_tty && - (o_tty->count <= (pty_master ? 
1 : 0)); - up(&tty_sem); - do_sleep = 0; - - if (tty_closing) { - if (waitqueue_active(&tty->read_wait)) { - wake_up(&tty->read_wait); - do_sleep++; - } - if (waitqueue_active(&tty->write_wait)) { - wake_up(&tty->write_wait); - do_sleep++; - } - } - if (o_tty_closing) { - if (waitqueue_active(&o_tty->read_wait)) { - wake_up(&o_tty->read_wait); - do_sleep++; - } - if (waitqueue_active(&o_tty->write_wait)) { - wake_up(&o_tty->write_wait); - do_sleep++; - } - } - if (!do_sleep) - break; - - printk(KERN_WARNING "release_dev: %s: read/write wait queue " - "active!\n", tty_name(tty, buf)); - schedule(); - } - - /* - * The closing flags are now consistent with the open counts on - * both sides, and we've completed the last operation that could - * block, so it's safe to proceed with closing. - */ - - down(&tty_sem); - if (pty_master) { - if (--o_tty->count < 0) { - printk(KERN_WARNING "release_dev: bad pty slave count " - "(%d) for %s\n", - o_tty->count, tty_name(o_tty, buf)); - o_tty->count = 0; - } - } - if (--tty->count < 0) { - printk(KERN_WARNING "release_dev: bad tty->count (%d) for %s\n", - tty->count, tty_name(tty, buf)); - tty->count = 0; - } - up(&tty_sem); - - /* - * We've decremented tty->count, so we need to remove this file - * descriptor off the tty->tty_files list; this serves two - * purposes: - * - check_tty_count sees the correct number of file descriptors - * associated with this tty. - * - do_tty_hangup no longer sees this file descriptor as - * something that needs to be handled for hangups. - */ - file_kill(filp); - filp->private_data = NULL; - - /* - * Perform some housekeeping before deciding whether to return. - * - * Set the TTY_CLOSING flag if this was the last open. In the - * case of a pty we may have to wait around for the other side - * to close, and TTY_CLOSING makes sure we can't be reopened. - */ - if(tty_closing) - set_bit(TTY_CLOSING, &tty->flags); - if(o_tty_closing) - set_bit(TTY_CLOSING, &o_tty->flags); - - /* - * If _either_ side is closing, make sure there aren't any - * processes that still think tty or o_tty is their controlling - * tty. - */ - if (tty_closing || o_tty_closing) { - struct task_struct *p; - - read_lock(&tasklist_lock); - do_each_task_pid(tty->session, PIDTYPE_SID, p) { - p->signal->tty = NULL; - } while_each_task_pid(tty->session, PIDTYPE_SID, p); - if (o_tty) - do_each_task_pid(o_tty->session, PIDTYPE_SID, p) { - p->signal->tty = NULL; - } while_each_task_pid(o_tty->session, PIDTYPE_SID, p); - read_unlock(&tasklist_lock); - } - - /* check whether both sides are closing ... */ - if (!tty_closing || (o_tty && !o_tty_closing)) - return; - -#ifdef TTY_DEBUG_HANGUP - printk(KERN_DEBUG "freeing tty structure..."); -#endif - /* - * Prevent flush_to_ldisc() from rescheduling the work for later. Then - * kill any delayed work. As this is the final close it does not - * race with the set_ldisc code path. - */ - clear_bit(TTY_LDISC, &tty->flags); - clear_bit(TTY_DONT_FLIP, &tty->flags); - cancel_delayed_work(&tty->flip.work); - - /* - * Wait for ->hangup_work and ->flip.work handlers to terminate - */ - - flush_scheduled_work(); - - /* - * Wait for any short term users (we know they are just driver - * side waiters as the file is closing so user count on the file - * side is zero. 
- */ - spin_lock_irqsave(&tty_ldisc_lock, flags); - while(tty->ldisc.refcount) - { - spin_unlock_irqrestore(&tty_ldisc_lock, flags); - wait_event(tty_ldisc_wait, tty->ldisc.refcount == 0); - spin_lock_irqsave(&tty_ldisc_lock, flags); - } - spin_unlock_irqrestore(&tty_ldisc_lock, flags); - /* - * Shutdown the current line discipline, and reset it to N_TTY. - * N.B. why reset ldisc when we're releasing the memory?? - * - * FIXME: this MUST get fixed for the new reflocking - */ - if (tty->ldisc.close) - (tty->ldisc.close)(tty); - tty_ldisc_put(tty->ldisc.num); - - /* - * Switch the line discipline back - */ - tty_ldisc_assign(tty, tty_ldisc_get(N_TTY)); - tty_set_termios_ldisc(tty,N_TTY); - if (o_tty) { - /* FIXME: could o_tty be in setldisc here ? */ - clear_bit(TTY_LDISC, &o_tty->flags); - if (o_tty->ldisc.close) - (o_tty->ldisc.close)(o_tty); - tty_ldisc_put(o_tty->ldisc.num); - tty_ldisc_assign(o_tty, tty_ldisc_get(N_TTY)); - tty_set_termios_ldisc(o_tty,N_TTY); - } - /* - * The release_mem function takes care of the details of clearing - * the slots and preserving the termios structure. - */ - release_mem(tty, idx); - -#ifdef CONFIG_UNIX98_PTYS - /* Make this pty number available for reallocation */ - if (devpts) { - down(&allocated_ptys_lock); - idr_remove(&allocated_ptys, idx); - up(&allocated_ptys_lock); - } -#endif - -} - -/* - * tty_open and tty_release keep up the tty count that contains the - * number of opens done on a tty. We cannot use the inode-count, as - * different inodes might point to the same tty. - * - * Open-counting is needed for pty masters, as well as for keeping - * track of serial lines: DTR is dropped when the last close happens. - * (This is not done solely through tty->count, now. - Ted 1/27/92) - * - * The termios state of a pty is reset on first open so that - * settings don't persist across reuse. 
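
The tail of tty_open() below is where a session leader without a controlling tty silently acquires one; passing O_NOCTTY at open time is the user-space opt-out. A sketch (the device path is illustrative):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        /* without O_NOCTTY, a session leader opening this tty could
         * become its controlling process, per tty_open() */
        int fd = open("/dev/ttyS0", O_RDWR | O_NOCTTY);
        if (fd < 0) { perror("open"); return 1; }
        printf("opened without acquiring a controlling tty\n");
        close(fd);
        return 0;
    }
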
- */ -static int tty_open(struct inode * inode, struct file * filp) -{ - struct tty_struct *tty; - int noctty, retval; - struct tty_driver *driver; - int index; - dev_t device = inode->i_rdev; - unsigned short saved_flags = filp->f_flags; - - nonseekable_open(inode, filp); - -retry_open: - noctty = filp->f_flags & O_NOCTTY; - index = -1; - retval = 0; - - down(&tty_sem); - - if (device == MKDEV(TTYAUX_MAJOR,0)) { - if (!current->signal->tty) { - up(&tty_sem); - return -ENXIO; - } - driver = current->signal->tty->driver; - index = current->signal->tty->index; - filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */ - /* noctty = 1; */ - goto got_driver; - } -#ifdef CONFIG_VT - if (console_use_vt && (device == MKDEV(TTY_MAJOR,0))) { - extern int fg_console; - extern struct tty_driver *console_driver; - driver = console_driver; - index = fg_console; - noctty = 1; - goto got_driver; - } -#endif - if (device == MKDEV(TTYAUX_MAJOR,1)) { - driver = console_device(&index); - if (driver) { - /* Don't let /dev/console block */ - filp->f_flags |= O_NONBLOCK; - noctty = 1; - goto got_driver; - } - up(&tty_sem); - return -ENODEV; - } - - driver = get_tty_driver(device, &index); - if (!driver) { - up(&tty_sem); - return -ENODEV; - } -got_driver: - retval = init_dev(driver, index, &tty); - up(&tty_sem); - if (retval) - return retval; - - filp->private_data = tty; - file_move(filp, &tty->tty_files); - check_tty_count(tty, "tty_open"); - if (tty->driver->type == TTY_DRIVER_TYPE_PTY && - tty->driver->subtype == PTY_TYPE_MASTER) - noctty = 1; -#ifdef TTY_DEBUG_HANGUP - printk(KERN_DEBUG "opening %s...", tty->name); -#endif - if (!retval) { - if (tty->driver->open) - retval = tty->driver->open(tty, filp); - else - retval = -ENODEV; - } - filp->f_flags = saved_flags; - - if (!retval && test_bit(TTY_EXCLUSIVE, &tty->flags) && !capable(CAP_SYS_ADMIN)) - retval = -EBUSY; - - if (retval) { -#ifdef TTY_DEBUG_HANGUP - printk(KERN_DEBUG "error %d in opening %s...", retval, - tty->name); -#endif - release_dev(filp); - if (retval != -ERESTARTSYS) - return retval; - if (signal_pending(current)) - return retval; - schedule(); - /* - * Need to reset f_op in case a hangup happened. - */ - if (filp->f_op == &hung_up_tty_fops) - filp->f_op = &tty_fops; - goto retry_open; - } - if (!noctty && - current->signal->leader && - !current->signal->tty && - tty->session == 0) { - task_lock(current); - current->signal->tty = tty; - task_unlock(current); - current->signal->tty_old_pgrp = 0; - tty->session = current->signal->session; - tty->pgrp = process_group(current); - } - return 0; -} - -#ifdef CONFIG_UNIX98_PTYS -static int ptmx_open(struct inode * inode, struct file * filp) -{ - struct tty_struct *tty; - int retval; - int index; - int idr_ret; - - nonseekable_open(inode, filp); - - /* find a device that is not in use. 
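
ptmx_open() here is the kernel half of the Unix98 pty handshake; the portable user-space half goes through posix_openpt()/grantpt()/unlockpt(), where unlockpt() clears the TTY_PTY_LOCK bit that ptmx_open() sets on the master. A compilable sketch:

    #define _XOPEN_SOURCE 600
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    int main(void)
    {
        int master = posix_openpt(O_RDWR | O_NOCTTY);  /* opens /dev/ptmx */
        if (master < 0) { perror("posix_openpt"); return 1; }

        /* grantpt() fixes slave ownership; unlockpt() clears the
         * TTY_PTY_LOCK flag so the slave side can be opened */
        if (grantpt(master) != 0 || unlockpt(master) != 0) {
            perror("pty setup");
            return 1;
        }
        printf("slave side is %s\n", ptsname(master));
        close(master);
        return 0;
    }
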
*/ - down(&allocated_ptys_lock); - if (!idr_pre_get(&allocated_ptys, GFP_KERNEL)) { - up(&allocated_ptys_lock); - return -ENOMEM; - } - idr_ret = idr_get_new(&allocated_ptys, NULL, &index); - if (idr_ret < 0) { - up(&allocated_ptys_lock); - if (idr_ret == -EAGAIN) - return -ENOMEM; - return -EIO; - } - if (index >= pty_limit) { - idr_remove(&allocated_ptys, index); - up(&allocated_ptys_lock); - return -EIO; - } - up(&allocated_ptys_lock); - - down(&tty_sem); - retval = init_dev(ptm_driver, index, &tty); - up(&tty_sem); - - if (retval) - goto out; - - set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */ - filp->private_data = tty; - file_move(filp, &tty->tty_files); - - retval = -ENOMEM; - if (devpts_pty_new(tty->link)) - goto out1; - - check_tty_count(tty, "tty_open"); - retval = ptm_driver->open(tty, filp); - if (!retval) - return 0; -out1: - release_dev(filp); -out: - down(&allocated_ptys_lock); - idr_remove(&allocated_ptys, index); - up(&allocated_ptys_lock); - return retval; -} -#endif - -static int tty_release(struct inode * inode, struct file * filp) -{ - lock_kernel(); - release_dev(filp); - unlock_kernel(); - return 0; -} - -/* No kernel lock held - fine */ -static unsigned int tty_poll(struct file * filp, poll_table * wait) -{ - struct tty_struct * tty; - struct tty_ldisc *ld; - int ret = 0; - - tty = (struct tty_struct *)filp->private_data; - if (tty_paranoia_check(tty, filp->f_dentry->d_inode, "tty_poll")) - return 0; - - ld = tty_ldisc_ref_wait(tty); - if (ld->poll) - ret = (ld->poll)(tty, filp, wait); - tty_ldisc_deref(ld); - return ret; -} - -static int tty_fasync(int fd, struct file * filp, int on) -{ - struct tty_struct * tty; - int retval; - - tty = (struct tty_struct *)filp->private_data; - if (tty_paranoia_check(tty, filp->f_dentry->d_inode, "tty_fasync")) - return 0; - - retval = fasync_helper(fd, filp, on, &tty->fasync); - if (retval <= 0) - return retval; - - if (on) { - if (!waitqueue_active(&tty->read_wait)) - tty->minimum_to_wake = 1; - retval = f_setown(filp, (-tty->pgrp) ? 
: current->pid, 0); - if (retval) - return retval; - } else { - if (!tty->fasync && !waitqueue_active(&tty->read_wait)) - tty->minimum_to_wake = N_TTY_BUF_SIZE; - } - return 0; -} - -static int tiocsti(struct tty_struct *tty, char __user *p) -{ - char ch, mbz = 0; - struct tty_ldisc *ld; - - if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN)) - return -EPERM; - if (get_user(ch, p)) - return -EFAULT; - ld = tty_ldisc_ref_wait(tty); - ld->receive_buf(tty, &ch, &mbz, 1); - tty_ldisc_deref(ld); - return 0; -} - -static int tiocgwinsz(struct tty_struct *tty, struct winsize __user * arg) -{ - if (copy_to_user(arg, &tty->winsize, sizeof(*arg))) - return -EFAULT; - return 0; -} - -static int tiocswinsz(struct tty_struct *tty, struct tty_struct *real_tty, - struct winsize __user * arg) -{ - struct winsize tmp_ws; - - if (copy_from_user(&tmp_ws, arg, sizeof(*arg))) - return -EFAULT; - if (!memcmp(&tmp_ws, &tty->winsize, sizeof(*arg))) - return 0; -#ifdef CONFIG_VT - if (tty->driver->type == TTY_DRIVER_TYPE_CONSOLE) { - unsigned int currcons = tty->index; - int rc; - - acquire_console_sem(); - rc = vc_resize(currcons, tmp_ws.ws_col, tmp_ws.ws_row); - release_console_sem(); - if (rc) - return -ENXIO; - } -#endif - if (tty->pgrp > 0) - kill_pg(tty->pgrp, SIGWINCH, 1); - if ((real_tty->pgrp != tty->pgrp) && (real_tty->pgrp > 0)) - kill_pg(real_tty->pgrp, SIGWINCH, 1); - tty->winsize = tmp_ws; - real_tty->winsize = tmp_ws; - return 0; -} - -static int tioccons(struct file *file) -{ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (file->f_op->write == redirected_tty_write) { - struct file *f; - spin_lock(&redirect_lock); - f = redirect; - redirect = NULL; - spin_unlock(&redirect_lock); - if (f) - fput(f); - return 0; - } - spin_lock(&redirect_lock); - if (redirect) { - spin_unlock(&redirect_lock); - return -EBUSY; - } - get_file(file); - redirect = file; - spin_unlock(&redirect_lock); - return 0; -} - - -static int fionbio(struct file *file, int __user *p) -{ - int nonblock; - - if (get_user(nonblock, p)) - return -EFAULT; - - if (nonblock) - file->f_flags |= O_NONBLOCK; - else - file->f_flags &= ~O_NONBLOCK; - return 0; -} - -static int tiocsctty(struct tty_struct *tty, int arg) -{ - task_t *p; - - if (current->signal->leader && - (current->signal->session == tty->session)) - return 0; - /* - * The process must be a session leader and - * not have a controlling tty already. - */ - if (!current->signal->leader || current->signal->tty) - return -EPERM; - if (tty->session > 0) { - /* - * This tty is already the controlling - * tty for another session group! - */ - if ((arg == 1) && capable(CAP_SYS_ADMIN)) { - /* - * Steal it away - */ - - read_lock(&tasklist_lock); - do_each_task_pid(tty->session, PIDTYPE_SID, p) { - p->signal->tty = NULL; - } while_each_task_pid(tty->session, PIDTYPE_SID, p); - read_unlock(&tasklist_lock); - } else - return -EPERM; - } - task_lock(current); - current->signal->tty = tty; - task_unlock(current); - current->signal->tty_old_pgrp = 0; - tty->session = current->signal->session; - tty->pgrp = process_group(current); - return 0; -} - -static int tiocgpgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) -{ - /* - * (tty == real_tty) is a cheap way of - * testing if the tty is NOT a master pty. 
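 *
 * (On a master, tty->link makes real_tty the slave side, so the two
 * differ and a master may query the slave's process group without the
 * slave being its controlling tty.) Userspace normally reaches this
 * through tcgetpgrp(3); an illustrative foreground-job test:
 *
 *	#include <unistd.h>
 *
 *	int in_foreground(int fd)
 *	{
 *		return tcgetpgrp(fd) == getpgrp();
 *	}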
- */ - if (tty == real_tty && current->signal->tty != real_tty) - return -ENOTTY; - return put_user(real_tty->pgrp, p); -} - -static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) -{ - pid_t pgrp; - int retval = tty_check_change(real_tty); - - if (retval == -EIO) - return -ENOTTY; - if (retval) - return retval; - if (!current->signal->tty || - (current->signal->tty != real_tty) || - (real_tty->session != current->signal->session)) - return -ENOTTY; - if (get_user(pgrp, p)) - return -EFAULT; - if (pgrp < 0) - return -EINVAL; - if (session_of_pgrp(pgrp) != current->signal->session) - return -EPERM; - real_tty->pgrp = pgrp; - return 0; -} - -static int tiocgsid(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) -{ - /* - * (tty == real_tty) is a cheap way of - * testing if the tty is NOT a master pty. - */ - if (tty == real_tty && current->signal->tty != real_tty) - return -ENOTTY; - if (real_tty->session <= 0) - return -ENOTTY; - return put_user(real_tty->session, p); -} - -static int tiocsetd(struct tty_struct *tty, int __user *p) -{ - int ldisc; - - if (get_user(ldisc, p)) - return -EFAULT; - return tty_set_ldisc(tty, ldisc); -} - -static int send_break(struct tty_struct *tty, int duration) -{ - tty->driver->break_ctl(tty, -1); - if (!signal_pending(current)) { - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(duration); - } - tty->driver->break_ctl(tty, 0); - if (signal_pending(current)) - return -EINTR; - return 0; -} - -static int -tty_tiocmget(struct tty_struct *tty, struct file *file, int __user *p) -{ - int retval = -EINVAL; - - if (tty->driver->tiocmget) { - retval = tty->driver->tiocmget(tty, file); - - if (retval >= 0) - retval = put_user(retval, p); - } - return retval; -} - -static int -tty_tiocmset(struct tty_struct *tty, struct file *file, unsigned int cmd, - unsigned __user *p) -{ - int retval = -EINVAL; - - if (tty->driver->tiocmset) { - unsigned int set, clear, val; - - retval = get_user(val, p); - if (retval) - return retval; - - set = clear = 0; - switch (cmd) { - case TIOCMBIS: - set = val; - break; - case TIOCMBIC: - clear = val; - break; - case TIOCMSET: - set = val; - clear = ~val; - break; - } - - set &= TIOCM_DTR|TIOCM_RTS|TIOCM_OUT1|TIOCM_OUT2|TIOCM_LOOP; - clear &= TIOCM_DTR|TIOCM_RTS|TIOCM_OUT1|TIOCM_OUT2|TIOCM_LOOP; - - retval = tty->driver->tiocmset(tty, file, set, clear); - } - return retval; -} - -/* - * Split this up, as gcc can choke on it otherwise.. - */ -int tty_ioctl(struct inode * inode, struct file * file, - unsigned int cmd, unsigned long arg) -{ - struct tty_struct *tty, *real_tty; - void __user *p = (void __user *)arg; - int retval; - struct tty_ldisc *ld; - - tty = (struct tty_struct *)file->private_data; - if (tty_paranoia_check(tty, inode, "tty_ioctl")) - return -EINVAL; - - real_tty = tty; - if (tty->driver->type == TTY_DRIVER_TYPE_PTY && - tty->driver->subtype == PTY_TYPE_MASTER) - real_tty = tty->link; - - /* - * Break handling by driver - */ - if (!tty->driver->break_ctl) { - switch(cmd) { - case TIOCSBRK: - case TIOCCBRK: - if (tty->driver->ioctl) - return tty->driver->ioctl(tty, file, cmd, arg); - return -EINVAL; - - /* These two ioctl's always return success; even if */ - /* the driver doesn't support them. 
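 *
 * From userspace these are usually reached via tcsendbreak(3), which
 * issues TCSBRK with a zero argument (the HZ/4 break handled below),
 * while TCSBRKP takes its duration in tenths of a second. Illustrative
 * calls, assuming an open tty on fd:
 *
 *	#include <termios.h>
 *	#include <sys/ioctl.h>
 *
 *	tcsendbreak(fd, 0);
 *	ioctl(fd, TCSBRKP, 5);
 *
 * The second form requests a break of roughly half a second.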
*/ - case TCSBRK: - case TCSBRKP: - if (!tty->driver->ioctl) - return 0; - retval = tty->driver->ioctl(tty, file, cmd, arg); - if (retval == -ENOIOCTLCMD) - retval = 0; - return retval; - } - } - - /* - * Factor out some common prep work - */ - switch (cmd) { - case TIOCSETD: - case TIOCSBRK: - case TIOCCBRK: - case TCSBRK: - case TCSBRKP: - retval = tty_check_change(tty); - if (retval) - return retval; - if (cmd != TIOCCBRK) { - tty_wait_until_sent(tty, 0); - if (signal_pending(current)) - return -EINTR; - } - break; - } - - switch (cmd) { - case TIOCSTI: - return tiocsti(tty, p); - case TIOCGWINSZ: - return tiocgwinsz(tty, p); - case TIOCSWINSZ: - return tiocswinsz(tty, real_tty, p); - case TIOCCONS: - return real_tty!=tty ? -EINVAL : tioccons(file); - case FIONBIO: - return fionbio(file, p); - case TIOCEXCL: - set_bit(TTY_EXCLUSIVE, &tty->flags); - return 0; - case TIOCNXCL: - clear_bit(TTY_EXCLUSIVE, &tty->flags); - return 0; - case TIOCNOTTY: - if (current->signal->tty != tty) - return -ENOTTY; - if (current->signal->leader) - disassociate_ctty(0); - task_lock(current); - current->signal->tty = NULL; - task_unlock(current); - return 0; - case TIOCSCTTY: - return tiocsctty(tty, arg); - case TIOCGPGRP: - return tiocgpgrp(tty, real_tty, p); - case TIOCSPGRP: - return tiocspgrp(tty, real_tty, p); - case TIOCGSID: - return tiocgsid(tty, real_tty, p); - case TIOCGETD: - /* FIXME: check this is ok */ - return put_user(tty->ldisc.num, (int __user *)p); - case TIOCSETD: - return tiocsetd(tty, p); -#ifdef CONFIG_VT - case TIOCLINUX: - return tioclinux(tty, arg); -#endif - /* - * Break handling - */ - case TIOCSBRK: /* Turn break on, unconditionally */ - tty->driver->break_ctl(tty, -1); - return 0; - - case TIOCCBRK: /* Turn break off, unconditionally */ - tty->driver->break_ctl(tty, 0); - return 0; - case TCSBRK: /* SVID version: non-zero arg --> no break */ - /* - * XXX is the above comment correct, or the - * code below correct? Is this ioctl used at - * all by anyone? - */ - if (!arg) - return send_break(tty, HZ/4); - return 0; - case TCSBRKP: /* support for POSIX tcsendbreak() */ - return send_break(tty, arg ? arg*(HZ/10) : HZ/4); - - case TIOCMGET: - return tty_tiocmget(tty, file, p); - - case TIOCMSET: - case TIOCMBIC: - case TIOCMBIS: - return tty_tiocmset(tty, file, cmd, p); - } - if (tty->driver->ioctl) { - retval = (tty->driver->ioctl)(tty, file, cmd, arg); - if (retval != -ENOIOCTLCMD) - return retval; - } - ld = tty_ldisc_ref_wait(tty); - retval = -EINVAL; - if (ld->ioctl) { - retval = ld->ioctl(tty, file, cmd, arg); - if (retval == -ENOIOCTLCMD) - retval = -EINVAL; - } - tty_ldisc_deref(ld); - return retval; -} - - -/* - * This implements the "Secure Attention Key" --- the idea is to - * prevent trojan horses by killing all processes associated with this - * tty when the user hits the "Secure Attention Key". Required for - * super-paranoid applications --- see the Orange Book for more details. - * - * This code could be nicer; ideally it should send a HUP, wait a few - * seconds, then send a INT, and then a KILL signal. But you then - * have to coordinate with the init process, since all processes associated - * with the current tty must be dead before the new getty is allowed - * to spawn. - * - * Now, if it would be correct ;-/ The current code has a nasty hole - - * it doesn't catch files in flight. We may send the descriptor to ourselves - * via AF_UNIX socket, close it and later fetch from socket. FIXME. - * - * Nasty bug: do_SAK is being called in interrupt context. 
This can - * deadlock. We punt it up to process context. AKPM - 16Mar2001 - */ -static void __do_SAK(void *arg) -{ -#ifdef TTY_SOFT_SAK - tty_hangup(tty); -#else - struct tty_struct *tty = arg; - struct task_struct *p; - int session; - int i; - struct file *filp; - struct tty_ldisc *disc; - - if (!tty) - return; - session = tty->session; - - /* We don't want an ldisc switch during this */ - disc = tty_ldisc_ref(tty); - if (disc && disc->flush_buffer) - disc->flush_buffer(tty); - tty_ldisc_deref(disc); - - if (tty->driver->flush_buffer) - tty->driver->flush_buffer(tty); - - read_lock(&tasklist_lock); - do_each_task_pid(session, PIDTYPE_SID, p) { - if (p->signal->tty == tty || session > 0) { - printk(KERN_NOTICE "SAK: killed process %d" - " (%s): p->signal->session==tty->session\n", - p->pid, p->comm); - send_sig(SIGKILL, p, 1); - continue; - } - task_lock(p); - if (p->files) { - spin_lock(&p->files->file_lock); - for (i=0; i < p->files->max_fds; i++) { - filp = fcheck_files(p->files, i); - if (!filp) - continue; - if (filp->f_op->read == tty_read && - filp->private_data == tty) { - printk(KERN_NOTICE "SAK: killed process %d" - " (%s): fd#%d opened to the tty\n", - p->pid, p->comm, i); - send_sig(SIGKILL, p, 1); - break; - } - } - spin_unlock(&p->files->file_lock); - } - task_unlock(p); - } while_each_task_pid(session, PIDTYPE_SID, p); - read_unlock(&tasklist_lock); -#endif -} - -/* - * The tq handling here is a little racy - tty->SAK_work may already be queued. - * Fortunately we don't need to worry, because if ->SAK_work is already queued, - * the values which we write to it will be identical to the values which it - * already has. --akpm - */ -void do_SAK(struct tty_struct *tty) -{ - if (!tty) - return; - PREPARE_WORK(&tty->SAK_work, __do_SAK, tty); - schedule_work(&tty->SAK_work); -} - -EXPORT_SYMBOL(do_SAK); - -/* - * This routine is called out of the software interrupt to flush data - * from the flip buffer to the line discipline. - */ - -static void flush_to_ldisc(void *private_) -{ - struct tty_struct *tty = (struct tty_struct *) private_; - unsigned char *cp; - char *fp; - int count; - unsigned long flags; - struct tty_ldisc *disc; - - disc = tty_ldisc_ref(tty); - if (disc == NULL) /* !TTY_LDISC */ - return; - - if (test_bit(TTY_DONT_FLIP, &tty->flags)) { - /* - * Do it after the next timer tick: - */ - schedule_delayed_work(&tty->flip.work, 1); - goto out; - } - spin_lock_irqsave(&tty->read_lock, flags); - if (tty->flip.buf_num) { - cp = tty->flip.char_buf + TTY_FLIPBUF_SIZE; - fp = tty->flip.flag_buf + TTY_FLIPBUF_SIZE; - tty->flip.buf_num = 0; - tty->flip.char_buf_ptr = tty->flip.char_buf; - tty->flip.flag_buf_ptr = tty->flip.flag_buf; - } else { - cp = tty->flip.char_buf; - fp = tty->flip.flag_buf; - tty->flip.buf_num = 1; - tty->flip.char_buf_ptr = tty->flip.char_buf + TTY_FLIPBUF_SIZE; - tty->flip.flag_buf_ptr = tty->flip.flag_buf + TTY_FLIPBUF_SIZE; - } - count = tty->flip.count; - tty->flip.count = 0; - spin_unlock_irqrestore(&tty->read_lock, flags); - - disc->receive_buf(tty, cp, fp, count); -out: - tty_ldisc_deref(disc); -} - -/* - * Routine which returns the baud rate of the tty - * - * Note that the baud_table needs to be kept in sync with the - * include/asm/termbits.h file. 
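 *
 * A worked example of the decode performed by tty_termios_baud_rate()
 * below, assuming the usual i386 termbits.h values (CBAUD 0010017,
 * CBAUDEX 0010000, B57600 0010001):
 *
 *	cbaud = c_cflag & CBAUD;	(B57600 -> 0010001)
 *	cbaud &= ~CBAUDEX;		(-> 1, since CBAUDEX was set)
 *	cbaud += 15;			(-> 16)
 *	baud_table[16] == 57600
 *
 * i.e. the CBAUDEX rates are appended to the table directly after the
 * sixteen classic B0..B38400 codes.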
- */ -static int baud_table[] = { - 0, 50, 75, 110, 134, 150, 200, 300, 600, 1200, 1800, 2400, 4800, - 9600, 19200, 38400, 57600, 115200, 230400, 460800, -#ifdef __sparc__ - 76800, 153600, 307200, 614400, 921600 -#else - 500000, 576000, 921600, 1000000, 1152000, 1500000, 2000000, - 2500000, 3000000, 3500000, 4000000 -#endif -}; - -static int n_baud_table = ARRAY_SIZE(baud_table); - -/** - * tty_termios_baud_rate - * @termios: termios structure - * - * Convert termios baud rate data into a speed. This should be called - * with the termios lock held if this termios is a terminal termios - * structure. May change the termios data. - */ - -int tty_termios_baud_rate(struct termios *termios) -{ - unsigned int cbaud; - - cbaud = termios->c_cflag & CBAUD; - - if (cbaud & CBAUDEX) { - cbaud &= ~CBAUDEX; - - if (cbaud < 1 || cbaud + 15 > n_baud_table) - termios->c_cflag &= ~CBAUDEX; - else - cbaud += 15; - } - return baud_table[cbaud]; -} - -EXPORT_SYMBOL(tty_termios_baud_rate); - -/** - * tty_get_baud_rate - get tty bit rates - * @tty: tty to query - * - * Returns the baud rate as an integer for this terminal. The - * termios lock must be held by the caller and the terminal bit - * flags may be updated. - */ - -int tty_get_baud_rate(struct tty_struct *tty) -{ - int baud = tty_termios_baud_rate(tty->termios); - - if (baud == 38400 && tty->alt_speed) { - if (!tty->warned) { - printk(KERN_WARNING "Use of setserial/setrocket to " - "set SPD_* flags is deprecated\n"); - tty->warned = 1; - } - baud = tty->alt_speed; - } - - return baud; -} - -EXPORT_SYMBOL(tty_get_baud_rate); - -/** - * tty_flip_buffer_push - terminal - * @tty: tty to push - * - * Queue a push of the terminal flip buffers to the line discipline. This - * function must not be called from IRQ context if tty->low_latency is set. - * - * In the event of the queue being busy for flipping the work will be - * held off and retried later. - */ - -void tty_flip_buffer_push(struct tty_struct *tty) -{ - if (tty->low_latency) - flush_to_ldisc((void *) tty); - else - schedule_delayed_work(&tty->flip.work, 1); -} - -EXPORT_SYMBOL(tty_flip_buffer_push); - -/* - * This subroutine initializes a tty structure. - */ -static void initialize_tty_struct(struct tty_struct *tty) -{ - memset(tty, 0, sizeof(struct tty_struct)); - tty->magic = TTY_MAGIC; - tty_ldisc_assign(tty, tty_ldisc_get(N_TTY)); - tty->pgrp = -1; - tty->flip.char_buf_ptr = tty->flip.char_buf; - tty->flip.flag_buf_ptr = tty->flip.flag_buf; - INIT_WORK(&tty->flip.work, flush_to_ldisc, tty); - init_MUTEX(&tty->flip.pty_sem); - init_MUTEX(&tty->termios_sem); - init_waitqueue_head(&tty->write_wait); - init_waitqueue_head(&tty->read_wait); - INIT_WORK(&tty->hangup_work, do_tty_hangup, tty); - sema_init(&tty->atomic_read, 1); - sema_init(&tty->atomic_write, 1); - spin_lock_init(&tty->read_lock); - INIT_LIST_HEAD(&tty->tty_files); - INIT_WORK(&tty->SAK_work, NULL, NULL); -} - -/* - * The default put_char routine if the driver did not define one. - */ -static void tty_default_put_char(struct tty_struct *tty, unsigned char ch) -{ - tty->driver->write(tty, &ch, 1); -} - -static struct class_simple *tty_class; - -/** - * tty_register_device - register a tty device - * @driver: the tty driver that describes the tty device - * @index: the index in the tty driver for this tty device - * @device: a struct device that is associated with this tty device. - * This field is optional, if there is no known struct device for this - * tty device it can be set to NULL safely. 
- * - * This call is required to be made to register an individual tty device if - * the tty driver's flags have the TTY_DRIVER_NO_DEVFS bit set. If that - * bit is not set, this function should not be called. - */ -void tty_register_device(struct tty_driver *driver, unsigned index, - struct device *device) -{ - char name[64]; - dev_t dev = MKDEV(driver->major, driver->minor_start) + index; - - if (index >= driver->num) { - printk(KERN_ERR "Attempt to register invalid tty line number " - " (%d).\n", index); - return; - } - - devfs_mk_cdev(dev, S_IFCHR | S_IRUSR | S_IWUSR, - "%s%d", driver->devfs_name, index + driver->name_base); - - if (driver->type == TTY_DRIVER_TYPE_PTY) - pty_line_name(driver, index, name); - else - tty_line_name(driver, index, name); - class_simple_device_add(tty_class, dev, device, name); -} - -/** - * tty_unregister_device - unregister a tty device - * @driver: the tty driver that describes the tty device - * @index: the index in the tty driver for this tty device - * - * If a tty device is registered with a call to tty_register_device() then - * this function must be made when the tty device is gone. - */ -void tty_unregister_device(struct tty_driver *driver, unsigned index) -{ - devfs_remove("%s%d", driver->devfs_name, index + driver->name_base); - class_simple_device_remove(MKDEV(driver->major, driver->minor_start) + index); -} - -EXPORT_SYMBOL(tty_register_device); -EXPORT_SYMBOL(tty_unregister_device); - -struct tty_driver *alloc_tty_driver(int lines) -{ - struct tty_driver *driver; - - driver = kmalloc(sizeof(struct tty_driver), GFP_KERNEL); - if (driver) { - memset(driver, 0, sizeof(struct tty_driver)); - driver->magic = TTY_DRIVER_MAGIC; - driver->num = lines; - /* later we'll move allocation of tables here */ - } - return driver; -} - -void put_tty_driver(struct tty_driver *driver) -{ - kfree(driver); -} - -void tty_set_operations(struct tty_driver *driver, struct tty_operations *op) -{ - driver->open = op->open; - driver->close = op->close; - driver->write = op->write; - driver->put_char = op->put_char; - driver->flush_chars = op->flush_chars; - driver->write_room = op->write_room; - driver->chars_in_buffer = op->chars_in_buffer; - driver->ioctl = op->ioctl; - driver->set_termios = op->set_termios; - driver->throttle = op->throttle; - driver->unthrottle = op->unthrottle; - driver->stop = op->stop; - driver->start = op->start; - driver->hangup = op->hangup; - driver->break_ctl = op->break_ctl; - driver->flush_buffer = op->flush_buffer; - driver->set_ldisc = op->set_ldisc; - driver->wait_until_sent = op->wait_until_sent; - driver->send_xchar = op->send_xchar; - driver->read_proc = op->read_proc; - driver->write_proc = op->write_proc; - driver->tiocmget = op->tiocmget; - driver->tiocmset = op->tiocmset; -} - - -EXPORT_SYMBOL(alloc_tty_driver); -EXPORT_SYMBOL(put_tty_driver); -EXPORT_SYMBOL(tty_set_operations); - -/* - * Called by a tty driver to register itself. 
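 *
 * Pulling the pieces together, registration from a driver combines
 * alloc_tty_driver()/tty_set_operations() above with
 * tty_register_driver() below. A minimal sketch against this
 * 2.6.11-era API -- the "foo" names are hypothetical, and error
 * handling and devfs naming are elided:
 *
 *	static struct tty_operations foo_ops = {
 *		.open	= foo_open,
 *		.close	= foo_close,
 *		.write	= foo_write,
 *	};
 *
 *	static struct tty_driver *foo_driver;
 *
 *	static int __init foo_init(void)
 *	{
 *		foo_driver = alloc_tty_driver(4);
 *		if (!foo_driver)
 *			return -ENOMEM;
 *		foo_driver->owner = THIS_MODULE;
 *		foo_driver->driver_name = "foo";
 *		foo_driver->name = "ttyFOO";
 *		foo_driver->type = TTY_DRIVER_TYPE_SERIAL;
 *		foo_driver->subtype = SERIAL_TYPE_NORMAL;
 *		foo_driver->init_termios = tty_std_termios;
 *		tty_set_operations(foo_driver, &foo_ops);
 *		return tty_register_driver(foo_driver);
 *	}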
- */ -int tty_register_driver(struct tty_driver *driver) -{ - int error; - int i; - dev_t dev; - void **p = NULL; - - if (driver->flags & TTY_DRIVER_INSTALLED) - return 0; - - if (!(driver->flags & TTY_DRIVER_DEVPTS_MEM)) { - p = kmalloc(driver->num * 3 * sizeof(void *), GFP_KERNEL); - if (!p) - return -ENOMEM; - memset(p, 0, driver->num * 3 * sizeof(void *)); - } - - if (!driver->major) { - error = alloc_chrdev_region(&dev, driver->minor_start, driver->num, - (char*)driver->name); - if (!error) { - driver->major = MAJOR(dev); - driver->minor_start = MINOR(dev); - } - } else { - dev = MKDEV(driver->major, driver->minor_start); - error = register_chrdev_region(dev, driver->num, - (char*)driver->name); - } - if (error < 0) { - kfree(p); - return error; - } - - if (p) { - driver->ttys = (struct tty_struct **)p; - driver->termios = (struct termios **)(p + driver->num); - driver->termios_locked = (struct termios **)(p + driver->num * 2); - } else { - driver->ttys = NULL; - driver->termios = NULL; - driver->termios_locked = NULL; - } - - cdev_init(&driver->cdev, &tty_fops); - driver->cdev.owner = driver->owner; - error = cdev_add(&driver->cdev, dev, driver->num); - if (error) { - cdev_del(&driver->cdev); - unregister_chrdev_region(dev, driver->num); - driver->ttys = NULL; - driver->termios = driver->termios_locked = NULL; - kfree(p); - return error; - } - - if (!driver->put_char) - driver->put_char = tty_default_put_char; - - list_add(&driver->tty_drivers, &tty_drivers); - - if ( !(driver->flags & TTY_DRIVER_NO_DEVFS) ) { - for(i = 0; i < driver->num; i++) - tty_register_device(driver, i, NULL); - } - proc_tty_register_driver(driver); - return 0; -} - -EXPORT_SYMBOL(tty_register_driver); - -/* - * Called by a tty driver to unregister itself. - */ -int tty_unregister_driver(struct tty_driver *driver) -{ - int i; - struct termios *tp; - void *p; - - if (driver->refcount) - return -EBUSY; - - unregister_chrdev_region(MKDEV(driver->major, driver->minor_start), - driver->num); - - list_del(&driver->tty_drivers); - - /* - * Free the termios and termios_locked structures because - * we don't want to get memory leaks when modular tty - * drivers are removed from the kernel. - */ - for (i = 0; i < driver->num; i++) { - tp = driver->termios[i]; - if (tp) { - driver->termios[i] = NULL; - kfree(tp); - } - tp = driver->termios_locked[i]; - if (tp) { - driver->termios_locked[i] = NULL; - kfree(tp); - } - if (!(driver->flags & TTY_DRIVER_NO_DEVFS)) - tty_unregister_device(driver, i); - } - p = driver->ttys; - proc_tty_unregister_driver(driver); - driver->ttys = NULL; - driver->termios = driver->termios_locked = NULL; - kfree(p); - cdev_del(&driver->cdev); - return 0; -} - -EXPORT_SYMBOL(tty_unregister_driver); - - -/* - * Initialize the console device. This is called *early*, so - * we can't necessarily depend on lots of kernel help here. - * Just do some early initializations, and do the complex setup - * later. - */ -void __init console_init(void) -{ - initcall_t *call; - - /* Setup the default TTY line discipline. */ - (void) tty_register_ldisc(N_TTY, &tty_ldisc_N_TTY); - - /* - * set up the console device so that later boot sequences can - * inform about problems etc.. - */ -#ifdef CONFIG_EARLY_PRINTK - disable_early_printk(); -#endif -#ifdef CONFIG_SERIAL_68360 - /* This is not a console initcall. I know not what it's doing here. - So I haven't moved it. 
dwmw2 */ - rs_360_init(); -#endif - call = __con_initcall_start; - while (call < __con_initcall_end) { - (*call)(); - call++; - } -} - -#ifdef CONFIG_VT -extern int vty_init(void); -#endif - -static int __init tty_class_init(void) -{ - tty_class = class_simple_create(THIS_MODULE, "tty"); - if (IS_ERR(tty_class)) - return PTR_ERR(tty_class); - return 0; -} - -postcore_initcall(tty_class_init); - -/* 3/2004 jmc: why do these devices exist? */ - -static struct cdev tty_cdev, console_cdev; -#ifdef CONFIG_UNIX98_PTYS -static struct cdev ptmx_cdev; -#endif -#ifdef CONFIG_VT -static struct cdev vc0_cdev; -#endif - -/* - * Ok, now we can initialize the rest of the tty devices and can count - * on memory allocations, interrupts etc.. - */ -static int __init tty_init(void) -{ - cdev_init(&tty_cdev, &tty_fops); - if (cdev_add(&tty_cdev, MKDEV(TTYAUX_MAJOR, 0), 1) || - register_chrdev_region(MKDEV(TTYAUX_MAJOR, 0), 1, "/dev/tty") < 0) - panic("Couldn't register /dev/tty driver\n"); - devfs_mk_cdev(MKDEV(TTYAUX_MAJOR, 0), S_IFCHR|S_IRUGO|S_IWUGO, "tty"); - class_simple_device_add(tty_class, MKDEV(TTYAUX_MAJOR, 0), NULL, "tty"); - - cdev_init(&console_cdev, &console_fops); - if (cdev_add(&console_cdev, MKDEV(TTYAUX_MAJOR, 1), 1) || - register_chrdev_region(MKDEV(TTYAUX_MAJOR, 1), 1, "/dev/console") < 0) - panic("Couldn't register /dev/console driver\n"); - devfs_mk_cdev(MKDEV(TTYAUX_MAJOR, 1), S_IFCHR|S_IRUSR|S_IWUSR, "console"); - class_simple_device_add(tty_class, MKDEV(TTYAUX_MAJOR, 1), NULL, "console"); - -#ifdef CONFIG_UNIX98_PTYS - cdev_init(&ptmx_cdev, &ptmx_fops); - if (cdev_add(&ptmx_cdev, MKDEV(TTYAUX_MAJOR, 2), 1) || - register_chrdev_region(MKDEV(TTYAUX_MAJOR, 2), 1, "/dev/ptmx") < 0) - panic("Couldn't register /dev/ptmx driver\n"); - devfs_mk_cdev(MKDEV(TTYAUX_MAJOR, 2), S_IFCHR|S_IRUGO|S_IWUGO, "ptmx"); - class_simple_device_add(tty_class, MKDEV(TTYAUX_MAJOR, 2), NULL, "ptmx"); -#endif - -#ifdef CONFIG_VT - if (console_use_vt) { - cdev_init(&vc0_cdev, &console_fops); - if (cdev_add(&vc0_cdev, MKDEV(TTY_MAJOR, 0), 1) || - register_chrdev_region(MKDEV(TTY_MAJOR, 0), 1, - "/dev/vc/0") < 0) - panic("Couldn't register /dev/tty0 driver\n"); - devfs_mk_cdev(MKDEV(TTY_MAJOR, 0), S_IFCHR|S_IRUSR|S_IWUSR, - "vc/0"); - class_simple_device_add(tty_class, MKDEV(TTY_MAJOR, 0), NULL, - "tty0"); - - vty_init(); - } -#endif - return 0; -} -module_init(tty_init); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/Makefile --- a/linux-2.6.11-xen-sparse/drivers/xen/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,14 +0,0 @@ - - -obj-y += console/ -obj-y += evtchn/ -obj-y += balloon/ -obj-y += privcmd/ -obj-y += xenbus/ - -obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/ -obj-$(CONFIG_XEN_NETDEV_BACKEND) += netback/ -obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += blkfront/ -obj-$(CONFIG_XEN_NETDEV_FRONTEND) += netfront/ -obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/ - diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/balloon/Makefile --- a/linux-2.6.11-xen-sparse/drivers/xen/balloon/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,2 +0,0 @@ - -obj-y += balloon.o diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/balloon/balloon.c --- a/linux-2.6.11-xen-sparse/drivers/xen/balloon/balloon.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,438 +0,0 @@ -/****************************************************************************** - * balloon.c - * - * Xen balloon driver - enables 
returning/claiming memory to/from Xen. - * - * Copyright (c) 2003, B Dragovic - * Copyright (c) 2003-2004, M Williamson, K Fraser - * - * This file may be distributed separately from the Linux kernel, or - * incorporated into other software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include <linux/config.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/errno.h> -#include <linux/mm.h> -#include <linux/mman.h> -#include <linux/smp_lock.h> -#include <linux/pagemap.h> -#include <linux/bootmem.h> -#include <linux/highmem.h> -#include <linux/vmalloc.h> -#include <asm-xen/xen_proc.h> -#include <asm-xen/hypervisor.h> -#include <asm-xen/ctrl_if.h> -#include <asm-xen/balloon.h> -#include <asm/pgalloc.h> -#include <asm/pgtable.h> -#include <asm/uaccess.h> -#include <asm/tlb.h> -#include <linux/list.h> - -static struct proc_dir_entry *balloon_pde; - -static DECLARE_MUTEX(balloon_mutex); -spinlock_t balloon_lock = SPIN_LOCK_UNLOCKED; - -/* We aim for 'current allocation' == 'target allocation'. */ -static unsigned long current_pages; -static unsigned long target_pages; - -/* We may hit the hard limit in Xen. If we do then we remember it. */ -static unsigned long hard_limit; - -/* - * Drivers may alter the memory reservation independently, but they must - * inform the balloon driver so that we can avoid hitting the hard limit. - */ -static unsigned long driver_pages; - -/* List of ballooned pages, threaded through the mem_map array. */ -static LIST_HEAD(ballooned_pages); -static unsigned long balloon_low, balloon_high; - -/* Main work function, always executed in process context. */ -static void balloon_process(void *unused); -static DECLARE_WORK(balloon_worker, balloon_process, NULL); -static struct timer_list balloon_timer; - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) -/* Use the private and mapping fields of struct page as a list. */ -#define PAGE_TO_LIST(p) ( (struct list_head *)&p->private ) -#define LIST_TO_PAGE(l) ( list_entry( ((unsigned long *)l), \ - struct page, private ) ) -#define UNLIST_PAGE(p) do { list_del(PAGE_TO_LIST(p)); \ - p->mapping = NULL; \ - p->private = 0; } while(0) -#else -/* There's a dedicated list field in struct page we can use. 
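 *
 * The 2.6 variant above works because ->private and ->mapping are
 * adjacent pointer-sized fields in struct page, so together they can
 * stand in for the two pointers of a struct list_head; UNLIST_PAGE()
 * then clears both so the page looks pristine when handed back to the
 * allocator. The trick in miniature, on a hypothetical structure:
 *
 *	struct obj {
 *		unsigned long a;	(plays list_head.next)
 *		void *b;		(plays list_head.prev)
 *	};
 *
 *	list_add((struct list_head *)&o->a, &some_list);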
*/ -#define PAGE_TO_LIST(p) ( &p->list ) -#define LIST_TO_PAGE(l) ( list_entry(l, struct page, list) ) -#define UNLIST_PAGE(p) ( list_del(&p->list) ) -#define pte_offset_kernel pte_offset -#define pud_t pgd_t -#define pud_offset(d, va) d -#define pud_none(d) 0 -#define pud_bad(d) 0 -#define subsys_initcall(_fn) __initcall(_fn) -#define pfn_to_page(_pfn) (mem_map + (_pfn)) -#endif - -#define IPRINTK(fmt, args...) \ - printk(KERN_INFO "xen_mem: " fmt, ##args) -#define WPRINTK(fmt, args...) \ - printk(KERN_WARNING "xen_mem: " fmt, ##args) - -/* balloon_append: add the given page to the balloon. */ -static void balloon_append(struct page *page) -{ - /* Low memory is re-populated first, so highmem pages go at list tail. */ - if ( PageHighMem(page) ) - { - list_add_tail(PAGE_TO_LIST(page), &ballooned_pages); - balloon_high++; - } - else - { - list_add(PAGE_TO_LIST(page), &ballooned_pages); - balloon_low++; - } -} - -/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */ -static struct page *balloon_retrieve(void) -{ - struct page *page; - - if ( list_empty(&ballooned_pages) ) - return NULL; - - page = LIST_TO_PAGE(ballooned_pages.next); - UNLIST_PAGE(page); - - if ( PageHighMem(page) ) - balloon_high--; - else - balloon_low--; - - return page; -} - -static void balloon_alarm(unsigned long unused) -{ - schedule_work(&balloon_worker); -} - -static unsigned long current_target(void) -{ - unsigned long target = min(target_pages, hard_limit); - if ( target > (current_pages + balloon_low + balloon_high) ) - target = current_pages + balloon_low + balloon_high; - return target; -} - -/* - * We avoid multiple worker processes conflicting via the balloon mutex. - * We may of course race updates of the target counts (which are protected - * by the balloon lock), or with changes to the Xen hard limit, but we will - * recover from these in time. - */ -static void balloon_process(void *unused) -{ - unsigned long *mfn_list, pfn, i, flags; - struct page *page; - long credit, debt, rc; - void *v; - - down(&balloon_mutex); - - retry: - mfn_list = NULL; - - if ( (credit = current_target() - current_pages) > 0 ) - { - mfn_list = (unsigned long *)vmalloc(credit * sizeof(*mfn_list)); - if ( mfn_list == NULL ) - goto out; - - balloon_lock(flags); - rc = HYPERVISOR_dom_mem_op( - MEMOP_increase_reservation, mfn_list, credit, 0); - balloon_unlock(flags); - if ( rc < credit ) - { - /* We hit the Xen hard limit: reprobe. */ - if ( HYPERVISOR_dom_mem_op( - MEMOP_decrease_reservation, mfn_list, rc, 0) != rc ) - BUG(); - hard_limit = current_pages + rc - driver_pages; - vfree(mfn_list); - goto retry; - } - - for ( i = 0; i < credit; i++ ) - { - if ( (page = balloon_retrieve()) == NULL ) - BUG(); - - pfn = page - mem_map; - if ( phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY ) - BUG(); - - /* Update P->M and M->P tables. */ - phys_to_machine_mapping[pfn] = mfn_list[i]; - xen_machphys_update(mfn_list[i], pfn); - - /* Link back into the page tables if it's not a highmem page. */ - if ( pfn < max_low_pfn ) - { - HYPERVISOR_update_va_mapping( - (unsigned long)__va(pfn << PAGE_SHIFT), - __pte_ma((mfn_list[i] << PAGE_SHIFT) | - pgprot_val(PAGE_KERNEL)), - 0); - } - - /* Finally, relinquish the memory back to the system allocator. 
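 *
 * (Three steps hand the page back: ClearPageReserved() makes it
 * acceptable to the allocator again, set_page_count() gives it the
 * single reference that the final free will drop, and __free_page()
 * returns it to the buddy allocator.)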
*/ - ClearPageReserved(page); - set_page_count(page, 1); - __free_page(page); - } - - current_pages += credit; - } - else if ( credit < 0 ) - { - debt = -credit; - - mfn_list = (unsigned long *)vmalloc(debt * sizeof(*mfn_list)); - if ( mfn_list == NULL ) - goto out; - - for ( i = 0; i < debt; i++ ) - { - if ( (page = alloc_page(GFP_HIGHUSER)) == NULL ) - { - debt = i; - break; - } - - pfn = page - mem_map; - mfn_list[i] = phys_to_machine_mapping[pfn]; - - if ( !PageHighMem(page) ) - { - v = phys_to_virt(pfn << PAGE_SHIFT); - scrub_pages(v, 1); - HYPERVISOR_update_va_mapping( - (unsigned long)v, __pte_ma(0), 0); - } -#ifdef CONFIG_XEN_SCRUB_PAGES - else - { - v = kmap(page); - scrub_pages(v, 1); - kunmap(page); - } -#endif - } - - /* Ensure that ballooned highmem pages don't have cached mappings. */ - kmap_flush_unused(); - flush_tlb_all(); - - /* No more mappings: invalidate pages in P2M and add to balloon. */ - for ( i = 0; i < debt; i++ ) - { - pfn = mfn_to_pfn(mfn_list[i]); - phys_to_machine_mapping[pfn] = INVALID_P2M_ENTRY; - balloon_append(pfn_to_page(pfn)); - } - - if ( HYPERVISOR_dom_mem_op( - MEMOP_decrease_reservation, mfn_list, debt, 0) != debt ) - BUG(); - - current_pages -= debt; - } - - out: - if ( mfn_list != NULL ) - vfree(mfn_list); - - /* Schedule more work if there is some still to be done. */ - if ( current_target() != current_pages ) - mod_timer(&balloon_timer, jiffies + HZ); - - up(&balloon_mutex); -} - -/* Resets the Xen limit, sets new target, and kicks off processing. */ -static void set_new_target(unsigned long target) -{ - /* No need for lock. Not read-modify-write updates. */ - hard_limit = ~0UL; - target_pages = target; - schedule_work(&balloon_worker); -} - -static void balloon_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) -{ - switch ( msg->subtype ) - { - case CMSG_MEM_REQUEST_SET: - { - mem_request_t *req = (mem_request_t *)&msg->msg[0]; - set_new_target(req->target); - req->status = 0; - } - break; - - default: - msg->length = 0; - break; - } - - ctrl_if_send_response(msg); -} - -static int balloon_write(struct file *file, const char __user *buffer, - unsigned long count, void *data) -{ - char memstring[64], *endchar; - unsigned long long target_bytes; - - if ( !capable(CAP_SYS_ADMIN) ) - return -EPERM; - - if ( count <= 1 ) - return -EBADMSG; /* runt */ - if ( count > sizeof(memstring) ) - return -EFBIG; /* too long */ - - if ( copy_from_user(memstring, buffer, count) ) - return -EFAULT; - memstring[sizeof(memstring)-1] = '\0'; - - target_bytes = memparse(memstring, &endchar); - set_new_target(target_bytes >> PAGE_SHIFT); - - return count; -} - -static int balloon_read(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - int len; - -#define K(_p) ((_p)<<(PAGE_SHIFT-10)) - len = sprintf( - page, - "Current allocation: %8lu kB\n" - "Requested target: %8lu kB\n" - "Low-mem balloon: %8lu kB\n" - "High-mem balloon: %8lu kB\n" - "Xen hard limit: ", - K(current_pages), K(target_pages), K(balloon_low), K(balloon_high)); - - if ( hard_limit != ~0UL ) - len += sprintf( - page + len, - "%8lu kB (inc. %8lu kB driver headroom)\n", - K(hard_limit), K(driver_pages)); - else - len += sprintf( - page + len, - " ??? 
kB\n"); - - *eof = 1; - return len; -} - -static int __init balloon_init(void) -{ - unsigned long pfn; - struct page *page; - - IPRINTK("Initialising balloon driver.\n"); - - current_pages = min(xen_start_info.nr_pages, max_pfn); - target_pages = current_pages; - balloon_low = 0; - balloon_high = 0; - driver_pages = 0UL; - hard_limit = ~0UL; - - init_timer(&balloon_timer); - balloon_timer.data = 0; - balloon_timer.function = balloon_alarm; - - if ( (balloon_pde = create_xen_proc_entry("balloon", 0644)) == NULL ) - { - WPRINTK("Unable to create /proc/xen/balloon.\n"); - return -1; - } - - balloon_pde->read_proc = balloon_read; - balloon_pde->write_proc = balloon_write; - - (void)ctrl_if_register_receiver(CMSG_MEM_REQUEST, balloon_ctrlif_rx, 0); - - /* Initialise the balloon with excess memory space. */ - for ( pfn = xen_start_info.nr_pages; pfn < max_pfn; pfn++ ) - { - page = &mem_map[pfn]; - if ( !PageReserved(page) ) - balloon_append(page); - } - - return 0; -} - -subsys_initcall(balloon_init); - -void balloon_update_driver_allowance(long delta) -{ - unsigned long flags; - balloon_lock(flags); - driver_pages += delta; /* non-atomic update */ - balloon_unlock(flags); -} - -void balloon_put_pages(unsigned long *mfn_list, unsigned long nr_mfns) -{ - unsigned long flags; - - balloon_lock(flags); - if ( HYPERVISOR_dom_mem_op(MEMOP_decrease_reservation, - mfn_list, nr_mfns, 0) != nr_mfns ) - BUG(); - current_pages -= nr_mfns; /* non-atomic update */ - balloon_unlock(flags); - - schedule_work(&balloon_worker); -} - -EXPORT_SYMBOL(balloon_update_driver_allowance); -EXPORT_SYMBOL(balloon_put_pages); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/blkback/Makefile --- a/linux-2.6.11-xen-sparse/drivers/xen/blkback/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,2 +0,0 @@ - -obj-y := blkback.o control.o interface.o vbd.o diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/blkback/blkback.c --- a/linux-2.6.11-xen-sparse/drivers/xen/blkback/blkback.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,756 +0,0 @@ -/****************************************************************************** - * arch/xen/drivers/blkif/backend/main.c - * - * Back-end of the driver for virtual block devices. This portion of the - * driver exports a 'unified' block-device interface that can be accessed - * by any operating system that implements a compatible front end. A - * reference front-end implementation can be found in: - * arch/xen/drivers/blkif/frontend - * - * Copyright (c) 2003-2004, Keir Fraser & Steve Hand - * Copyright (c) 2005, Christopher Clark - */ - -#include "common.h" -#include <asm-xen/evtchn.h> -#ifdef CONFIG_XEN_BLKDEV_GRANT -#include <asm-xen/xen-public/grant_table.h> -#endif - -/* - * These are rather arbitrary. They are fairly large because adjacent requests - * pulled from a communication ring are quite likely to end up being part of - * the same scatter/gather request at the disc. - * - * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW ** - * This will increase the chances of being able to write whole tracks. - * 64 should be enough to keep us competitive with Linux. 
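 *
 * To make the mapping arithmetic below concrete, assuming the
 * contemporary BLKIF_MAX_SEGMENTS_PER_REQUEST of 11 and 4kB pages:
 *
 *	MMAP_PAGES	    = 64 * 11 = 704 pages (~2.75MB of VA)
 *	MMAP_VADDR(req,seg) = mmap_vstart + ((req * 11) + seg) * 4096
 *
 * i.e. every pending request owns a fixed, contiguous 11-page window
 * into which that request's guest segments are mapped.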
- */ -#define MAX_PENDING_REQS 64 -#define BATCH_PER_DOMAIN 16 - -static unsigned long mmap_vstart; -#define MMAP_PAGES \ - (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST) -#define MMAP_VADDR(_req,_seg) \ - (mmap_vstart + \ - ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \ - ((_seg) * PAGE_SIZE)) - -/* - * Each outstanding request that we've passed to the lower device layers has a - * 'pending_req' allocated to it. Each buffer_head that completes decrements - * the pendcnt towards zero. When it hits zero, the specified domain has a - * response queued for it, with the saved 'id' passed back. - */ -typedef struct { - blkif_t *blkif; - unsigned long id; - int nr_pages; - atomic_t pendcnt; - unsigned short operation; - int status; -} pending_req_t; - -/* - * We can't allocate pending_req's in order, since they may complete out of - * order. We therefore maintain an allocation ring. This ring also indicates - * when enough work has been passed down -- at that point the allocation ring - * will be empty. - */ -static pending_req_t pending_reqs[MAX_PENDING_REQS]; -static unsigned char pending_ring[MAX_PENDING_REQS]; -static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED; -/* NB. We use a different index type to differentiate from shared blk rings. */ -typedef unsigned int PEND_RING_IDX; -#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1)) -static PEND_RING_IDX pending_prod, pending_cons; -#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons) - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) -static kmem_cache_t *buffer_head_cachep; -#else -static request_queue_t *plugged_queue; -static inline void flush_plugged_queue(void) -{ - request_queue_t *q = plugged_queue; - if ( q != NULL ) - { - if ( q->unplug_fn != NULL ) - q->unplug_fn(q); - blk_put_queue(q); - plugged_queue = NULL; - } -} -#endif - -#ifdef CONFIG_XEN_BLKDEV_GRANT -/* When using grant tables to map a frame for device access then the - * handle returned must be used to unmap the frame. This is needed to - * drop the ref count on the frame. - */ -static u16 pending_grant_handles[MMAP_PAGES]; -#define pending_handle(_idx, _i) \ - (pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)]) -#define BLKBACK_INVALID_HANDLE (0xFFFF) -#endif - -#ifdef CONFIG_XEN_BLKDEV_TAP_BE -/* - * If the tap driver is used, we may get pages belonging to either the tap - * or (more likely) the real frontend. The backend must specify which domain - * a given page belongs to in update_va_mapping though. For the moment, - * the tap rewrites the ID field of the request to contain the request index - * and the id of the real front end domain. 
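 *
 * With that rewriting the 'id' field carries two values at once; an
 * illustrative packing consistent with the ID_TO_DOM() helper below:
 *
 *	id  = ((unsigned long)frontend_domid << 16) | request_index;
 *	dom = ID_TO_DOM(id);		(recovers frontend_domid)
 *	idx = id & 0xffff;		(recovers request_index)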
- */ -#define BLKTAP_COOKIE 0xbeadfeed -static inline domid_t ID_TO_DOM(unsigned long id) { return (id >> 16); } -#endif - -static int do_block_io_op(blkif_t *blkif, int max_to_do); -static void dispatch_probe(blkif_t *blkif, blkif_request_t *req); -static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req); -static void make_response(blkif_t *blkif, unsigned long id, - unsigned short op, int st); - -static void fast_flush_area(int idx, int nr_pages) -{ -#ifdef CONFIG_XEN_BLKDEV_GRANT - struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - unsigned int i, invcount = 0; - u16 handle; - - for ( i = 0; i < nr_pages; i++ ) - { - if ( BLKBACK_INVALID_HANDLE != ( handle = pending_handle(idx, i) ) ) - { - unmap[i].host_virt_addr = MMAP_VADDR(idx, i); - unmap[i].dev_bus_addr = 0; - unmap[i].handle = handle; - pending_handle(idx, i) = BLKBACK_INVALID_HANDLE; - invcount++; - } - } - if ( unlikely(HYPERVISOR_grant_table_op( - GNTTABOP_unmap_grant_ref, unmap, invcount))) - BUG(); -#else - - multicall_entry_t mcl[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - int i; - - for ( i = 0; i < nr_pages; i++ ) - { - MULTI_update_va_mapping(mcl+i, MMAP_VADDR(idx, i), - __pte(0), 0); - } - - mcl[nr_pages-1].args[2] = UVMF_TLB_FLUSH|UVMF_ALL; - if ( unlikely(HYPERVISOR_multicall(mcl, nr_pages) != 0) ) - BUG(); -#endif -} - - -/****************************************************************** - * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE - */ - -static struct list_head blkio_schedule_list; -static spinlock_t blkio_schedule_list_lock; - -static int __on_blkdev_list(blkif_t *blkif) -{ - return blkif->blkdev_list.next != NULL; -} - -static void remove_from_blkdev_list(blkif_t *blkif) -{ - unsigned long flags; - if ( !__on_blkdev_list(blkif) ) return; - spin_lock_irqsave(&blkio_schedule_list_lock, flags); - if ( __on_blkdev_list(blkif) ) - { - list_del(&blkif->blkdev_list); - blkif->blkdev_list.next = NULL; - blkif_put(blkif); - } - spin_unlock_irqrestore(&blkio_schedule_list_lock, flags); -} - -static void add_to_blkdev_list_tail(blkif_t *blkif) -{ - unsigned long flags; - if ( __on_blkdev_list(blkif) ) return; - spin_lock_irqsave(&blkio_schedule_list_lock, flags); - if ( !__on_blkdev_list(blkif) && (blkif->status == CONNECTED) ) - { - list_add_tail(&blkif->blkdev_list, &blkio_schedule_list); - blkif_get(blkif); - } - spin_unlock_irqrestore(&blkio_schedule_list_lock, flags); -} - - -/****************************************************************** - * SCHEDULER FUNCTIONS - */ - -static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait); - -static int blkio_schedule(void *arg) -{ - DECLARE_WAITQUEUE(wq, current); - - blkif_t *blkif; - struct list_head *ent; - - daemonize( -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - "xenblkd" -#endif - ); - - for ( ; ; ) - { - /* Wait for work to do. */ - add_wait_queue(&blkio_schedule_wait, &wq); - set_current_state(TASK_INTERRUPTIBLE); - if ( (NR_PENDING_REQS == MAX_PENDING_REQS) || - list_empty(&blkio_schedule_list) ) - schedule(); - __set_current_state(TASK_RUNNING); - remove_wait_queue(&blkio_schedule_wait, &wq); - - /* Queue up a batch of requests. */ - while ( (NR_PENDING_REQS < MAX_PENDING_REQS) && - !list_empty(&blkio_schedule_list) ) - { - ent = blkio_schedule_list.next; - blkif = list_entry(ent, blkif_t, blkdev_list); - blkif_get(blkif); - remove_from_blkdev_list(blkif); - if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) ) - add_to_blkdev_list_tail(blkif); - blkif_put(blkif); - } - - /* Push the batch through to disc. 
*/ -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) - run_task_queue(&tq_disk); -#else - flush_plugged_queue(); -#endif - } -} - -static void maybe_trigger_blkio_schedule(void) -{ - /* - * Needed so that two processes, who together make the following predicate - * true, don't both read stale values and evaluate the predicate - * incorrectly. Incredibly unlikely to stall the scheduler on x86, but... - */ - smp_mb(); - - if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) && - !list_empty(&blkio_schedule_list) ) - wake_up(&blkio_schedule_wait); -} - - - -/****************************************************************** - * COMPLETION CALLBACK -- Called as bh->b_end_io() - */ - -static void __end_block_io_op(pending_req_t *pending_req, int uptodate) -{ - unsigned long flags; - - /* An error fails the entire request. */ - if ( !uptodate ) - { - DPRINTK("Buffer not up-to-date at end of operation\n"); - pending_req->status = BLKIF_RSP_ERROR; - } - - if ( atomic_dec_and_test(&pending_req->pendcnt) ) - { - int pending_idx = pending_req - pending_reqs; - fast_flush_area(pending_idx, pending_req->nr_pages); - make_response(pending_req->blkif, pending_req->id, - pending_req->operation, pending_req->status); - blkif_put(pending_req->blkif); - spin_lock_irqsave(&pend_prod_lock, flags); - pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; - spin_unlock_irqrestore(&pend_prod_lock, flags); - maybe_trigger_blkio_schedule(); - } -} - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) -static void end_block_io_op(struct buffer_head *bh, int uptodate) -{ - __end_block_io_op(bh->b_private, uptodate); - kmem_cache_free(buffer_head_cachep, bh); -} -#else -static int end_block_io_op(struct bio *bio, unsigned int done, int error) -{ - if ( bio->bi_size != 0 ) - return 1; - __end_block_io_op(bio->bi_private, !error); - bio_put(bio); - return error; -} -#endif - - -/****************************************************************************** - * NOTIFICATION FROM GUEST OS. - */ - -irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs) -{ - blkif_t *blkif = dev_id; - add_to_blkdev_list_tail(blkif); - maybe_trigger_blkio_schedule(); - return IRQ_HANDLED; -} - - - -/****************************************************************** - * DOWNWARD CALLS -- These interface with the block-device layer proper. - */ - -static int do_block_io_op(blkif_t *blkif, int max_to_do) -{ - blkif_back_ring_t *blk_ring = &blkif->blk_ring; - blkif_request_t *req; - RING_IDX i, rp; - int more_to_do = 0; - - rp = blk_ring->sring->req_prod; - rmb(); /* Ensure we see queued requests up to 'rp'. */ - - for ( i = blk_ring->req_cons; - (i != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, i); - i++ ) - { - if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) ) - { - more_to_do = 1; - break; - } - - req = RING_GET_REQUEST(blk_ring, i); - switch ( req->operation ) - { - case BLKIF_OP_READ: - case BLKIF_OP_WRITE: - dispatch_rw_block_io(blkif, req); - break; - - case BLKIF_OP_PROBE: - dispatch_probe(blkif, req); - break; - - default: - DPRINTK("error: unknown block io operation [%d]\n", - req->operation); - make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); - break; - } - } - - blk_ring->req_cons = i; - return more_to_do; -} - -static void dispatch_probe(blkif_t *blkif, blkif_request_t *req) -{ - int rsp = BLKIF_RSP_ERROR; - int pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; - - /* We expect one buffer only. 
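 *
 * (Stepping back: the request loop in do_block_io_op() above is the
 * standard shared-ring consumer pattern, distilled:
 *
 *	rp = ring->sring->req_prod;
 *	rmb();
 *	for (i = ring->req_cons; i != rp; i++)
 *		handle(RING_GET_REQUEST(ring, i));
 *	ring->req_cons = i;
 *
 * The rmb() ensures request payloads are read no earlier than the
 * producer index that advertises them; it pairs with the producer's
 * wmb() before advancing req_prod.)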
*/ - if ( unlikely(req->nr_segments != 1) ) - goto out; - - /* Make sure the buffer is page-sized. */ - if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) || - (blkif_last_sect(req->frame_and_sects[0]) != 7) ) - goto out; - -#ifdef CONFIG_XEN_BLKDEV_GRANT - { - struct gnttab_map_grant_ref map; - - map.host_virt_addr = MMAP_VADDR(pending_idx, 0); - map.flags = GNTMAP_host_map; - map.ref = blkif_gref_from_fas(req->frame_and_sects[0]); - map.dom = blkif->domid; - - if ( unlikely(HYPERVISOR_grant_table_op( - GNTTABOP_map_grant_ref, &map, 1))) - BUG(); - - if ( map.handle < 0 ) - goto out; - - pending_handle(pending_idx, 0) = map.handle; - } -#else /* else CONFIG_XEN_BLKDEV_GRANT */ - -#ifdef CONFIG_XEN_BLKDEV_TAP_BE - /* Grab the real frontend out of the probe message. */ - if (req->frame_and_sects[1] == BLKTAP_COOKIE) - blkif->is_blktap = 1; -#endif - - -#ifdef CONFIG_XEN_BLKDEV_TAP_BE - if ( HYPERVISOR_update_va_mapping_otherdomain( - MMAP_VADDR(pending_idx, 0), - (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL }, - 0, (blkif->is_blktap ? ID_TO_DOM(req->id) : blkif->domid) ) ) - - goto out; -#else - if ( HYPERVISOR_update_va_mapping_otherdomain( - MMAP_VADDR(pending_idx, 0), - (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL }, - 0, blkif->domid) ) - - goto out; -#endif -#endif /* endif CONFIG_XEN_BLKDEV_GRANT */ - - rsp = vbd_probe(blkif, (vdisk_t *)MMAP_VADDR(pending_idx, 0), - PAGE_SIZE / sizeof(vdisk_t)); - - out: - fast_flush_area(pending_idx, 1); - make_response(blkif, req->id, req->operation, rsp); -} - -static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req) -{ - extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); - int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ; - unsigned long fas = 0; - int i, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; - pending_req_t *pending_req; -#ifdef CONFIG_XEN_BLKDEV_GRANT - struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; -#else - unsigned long remap_prot; - multicall_entry_t mcl[BLKIF_MAX_SEGMENTS_PER_REQUEST]; -#endif - struct phys_req preq; - struct { - unsigned long buf; unsigned int nsec; - } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - unsigned int nseg; -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) - struct buffer_head *bh; -#else - struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - int nbio = 0; - request_queue_t *q; -#endif - - /* Check that number of segments is sane. 
*/ - nseg = req->nr_segments; - if ( unlikely(nseg == 0) || - unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) - { - DPRINTK("Bad number of segments in request (%d)\n", nseg); - goto bad_descriptor; - } - - preq.dev = req->device; - preq.sector_number = req->sector_number; - preq.nr_sects = 0; - -#ifdef CONFIG_XEN_BLKDEV_GRANT - for ( i = 0; i < nseg; i++ ) - { - fas = req->frame_and_sects[i]; - seg[i].nsec = blkif_last_sect(fas) - blkif_first_sect(fas) + 1; - - if ( seg[i].nsec <= 0 ) - goto bad_descriptor; - preq.nr_sects += seg[i].nsec; - - map[i].host_virt_addr = MMAP_VADDR(pending_idx, i); - map[i].dom = blkif->domid; - map[i].ref = blkif_gref_from_fas(fas); - map[i].flags = GNTMAP_host_map; - if ( operation == WRITE ) - map[i].flags |= GNTMAP_readonly; - } - - if ( unlikely(HYPERVISOR_grant_table_op( - GNTTABOP_map_grant_ref, map, nseg))) - BUG(); - - for ( i = 0; i < nseg; i++ ) - { - if ( unlikely(map[i].handle < 0) ) - { - DPRINTK("invalid buffer -- could not remap it\n"); - fast_flush_area(pending_idx, nseg); - goto bad_descriptor; - } - - phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] = - FOREIGN_FRAME(map[i].dev_bus_addr); - - pending_handle(pending_idx, i) = map[i].handle; - } -#endif - - for ( i = 0; i < nseg; i++ ) - { - fas = req->frame_and_sects[i]; -#ifdef CONFIG_XEN_BLKDEV_GRANT - seg[i].buf = (map[i].dev_bus_addr << PAGE_SHIFT) | - (blkif_first_sect(fas) << 9); -#else - seg[i].buf = (fas & PAGE_MASK) | (blkif_first_sect(fas) << 9); - seg[i].nsec = blkif_last_sect(fas) - blkif_first_sect(fas) + 1; - if ( seg[i].nsec <= 0 ) - goto bad_descriptor; - preq.nr_sects += seg[i].nsec; -#endif - } - - if ( vbd_translate(&preq, blkif, operation) != 0 ) - { - DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", - operation == READ ? 
"read" : "write", preq.sector_number, - preq.sector_number + preq.nr_sects, preq.dev); - goto bad_descriptor; - } - -#ifndef CONFIG_XEN_BLKDEV_GRANT - if ( operation == READ ) - remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW; - else - remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED; - - for ( i = 0; i < nseg; i++ ) - { - MULTI_update_va_mapping_otherdomain( - mcl+i, MMAP_VADDR(pending_idx, i), - pfn_pte_ma(seg[i].buf >> PAGE_SHIFT, remap_prot), - 0, blkif->domid); -#ifdef CONFIG_XEN_BLKDEV_TAP_BE - if ( blkif->is_blktap ) - mcl[i].args[3] = ID_TO_DOM(req->id); -#endif - phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] = - FOREIGN_FRAME(seg[i].buf >> PAGE_SHIFT); - } - - BUG_ON(HYPERVISOR_multicall(mcl, nseg) != 0); - - for ( i = 0; i < nseg; i++ ) - { - if ( unlikely(mcl[i].result != 0) ) - { - DPRINTK("invalid buffer -- could not remap it\n"); - fast_flush_area(pending_idx, nseg); - goto bad_descriptor; - } - } -#endif /* end ifndef CONFIG_XEN_BLKDEV_GRANT */ - - pending_req = &pending_reqs[pending_idx]; - pending_req->blkif = blkif; - pending_req->id = req->id; - pending_req->operation = operation; - pending_req->status = BLKIF_RSP_OKAY; - pending_req->nr_pages = nseg; - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) - - atomic_set(&pending_req->pendcnt, nseg); - pending_cons++; - blkif_get(blkif); - - for ( i = 0; i < nseg; i++ ) - { - bh = kmem_cache_alloc(buffer_head_cachep, GFP_KERNEL); - if ( unlikely(bh == NULL) ) - { - __end_block_io_op(pending_req, 0); - continue; - } - - memset(bh, 0, sizeof (struct buffer_head)); - - init_waitqueue_head(&bh->b_wait); - bh->b_size = seg[i].nsec << 9; - bh->b_dev = preq.dev; - bh->b_rdev = preq.dev; - bh->b_rsector = (unsigned long)preq.sector_number; - bh->b_data = (char *)MMAP_VADDR(pending_idx, i) + - (seg[i].buf & ~PAGE_MASK); - bh->b_page = virt_to_page(MMAP_VADDR(pending_idx, i)); - bh->b_end_io = end_block_io_op; - bh->b_private = pending_req; - - bh->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | - (1 << BH_Req) | (1 << BH_Launder); - if ( operation == WRITE ) - bh->b_state |= (1 << BH_JBD) | (1 << BH_Req) | (1 << BH_Uptodate); - - atomic_set(&bh->b_count, 1); - - /* Dispatch a single request. We'll flush it to disc later. 
*/ - generic_make_request(operation, bh); - - preq.sector_number += seg[i].nsec; - } - -#else - - for ( i = 0; i < nseg; i++ ) - { - if ( ((int)preq.sector_number|(int)seg[i].nsec) & - ((bdev_hardsect_size(preq.bdev) >> 9) - 1) ) - { - DPRINTK("Misaligned I/O request from domain %d", blkif->domid); - goto cleanup_and_fail; - } - - while ( (bio == NULL) || - (bio_add_page(bio, - virt_to_page(MMAP_VADDR(pending_idx, i)), - seg[i].nsec << 9, - seg[i].buf & ~PAGE_MASK) == 0) ) - { - bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i); - if ( unlikely(bio == NULL) ) - { - cleanup_and_fail: - for ( i = 0; i < (nbio-1); i++ ) - bio_put(biolist[i]); - fast_flush_area(pending_idx, nseg); - goto bad_descriptor; - } - - bio->bi_bdev = preq.bdev; - bio->bi_private = pending_req; - bio->bi_end_io = end_block_io_op; - bio->bi_sector = preq.sector_number; - } - - preq.sector_number += seg[i].nsec; - } - - if ( (q = bdev_get_queue(bio->bi_bdev)) != plugged_queue ) - { - flush_plugged_queue(); - blk_get_queue(q); - plugged_queue = q; - } - - atomic_set(&pending_req->pendcnt, nbio); - pending_cons++; - blkif_get(blkif); - - for ( i = 0; i < nbio; i++ ) - submit_bio(operation, biolist[i]); - -#endif - - return; - - bad_descriptor: - make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); -} - - - -/****************************************************************** - * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING - */ - - -static void make_response(blkif_t *blkif, unsigned long id, - unsigned short op, int st) -{ - blkif_response_t *resp; - unsigned long flags; - blkif_back_ring_t *blk_ring = &blkif->blk_ring; - - /* Place on the response ring for the relevant domain. */ - spin_lock_irqsave(&blkif->blk_ring_lock, flags); - resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt); - resp->id = id; - resp->operation = op; - resp->status = st; - wmb(); /* Ensure other side can see the response fields. */ - blk_ring->rsp_prod_pvt++; - RING_PUSH_RESPONSES(blk_ring); - spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); - - /* Kick the relevant domain. 
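The event-channel notification prompts the frontend to consume the response we just pushed onto the shared ring.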
*/ - notify_via_evtchn(blkif->evtchn); -} - -void blkif_deschedule(blkif_t *blkif) -{ - remove_from_blkdev_list(blkif); -} - -static int __init blkif_init(void) -{ - int i; - - if ( !(xen_start_info.flags & SIF_INITDOMAIN) && - !(xen_start_info.flags & SIF_BLK_BE_DOMAIN) ) - return 0; - - blkif_interface_init(); - - if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 ) - BUG(); - - pending_cons = 0; - pending_prod = MAX_PENDING_REQS; - memset(pending_reqs, 0, sizeof(pending_reqs)); - for ( i = 0; i < MAX_PENDING_REQS; i++ ) - pending_ring[i] = i; - - spin_lock_init(&blkio_schedule_list_lock); - INIT_LIST_HEAD(&blkio_schedule_list); - - if ( kernel_thread(blkio_schedule, 0, CLONE_FS | CLONE_FILES) < 0 ) - BUG(); - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) - buffer_head_cachep = kmem_cache_create( - "buffer_head_cache", sizeof(struct buffer_head), - 0, SLAB_HWCACHE_ALIGN, NULL, NULL); -#endif - - blkif_ctrlif_init(); - -#ifdef CONFIG_XEN_BLKDEV_GRANT - memset( pending_grant_handles, BLKBACK_INVALID_HANDLE, MMAP_PAGES ); - printk(KERN_ALERT "Blkif backend is using grant tables.\n"); -#endif - -#ifdef CONFIG_XEN_BLKDEV_TAP_BE - printk(KERN_ALERT "NOTE: Blkif backend is running with tap support on!\n"); -#endif - - return 0; -} - -__initcall(blkif_init); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/blkback/common.h --- a/linux-2.6.11-xen-sparse/drivers/xen/blkback/common.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,109 +0,0 @@ - -#ifndef __BLKIF__BACKEND__COMMON_H__ -#define __BLKIF__BACKEND__COMMON_H__ - -#include <linux/config.h> -#include <linux/version.h> -#include <linux/module.h> -#include <linux/rbtree.h> -#include <linux/interrupt.h> -#include <linux/slab.h> -#include <linux/blkdev.h> -#include <asm/io.h> -#include <asm/setup.h> -#include <asm/pgalloc.h> -#include <asm-xen/ctrl_if.h> -#include <asm-xen/hypervisor.h> -#include <asm-xen/xen-public/io/blkif.h> -#include <asm-xen/xen-public/io/ring.h> - -#if 0 -#define ASSERT(_p) \ - if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \ - __LINE__, __FILE__); *(int*)0=0; } -#define DPRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \ - __FILE__ , __LINE__ , ## _a ) -#else -#define ASSERT(_p) ((void)0) -#define DPRINTK(_f, _a...) ((void)0) -#endif - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) -typedef struct rb_root rb_root_t; -typedef struct rb_node rb_node_t; -#else -struct block_device; -#endif - -typedef struct blkif_st { - /* Unique identifier for this interface. */ - domid_t domid; - unsigned int handle; - /* Physical parameters of the comms window. */ - unsigned long shmem_frame; - unsigned int evtchn; - int irq; - /* Comms information. */ - blkif_back_ring_t blk_ring; - /* VBDs attached to this interface. */ - rb_root_t vbd_rb; /* Mapping from 16-bit vdevices to VBDs.*/ - spinlock_t vbd_lock; /* Protects VBD mapping. */ - /* Private fields. */ - enum { DISCONNECTED, DISCONNECTING, CONNECTED } status; - /* - * DISCONNECT response is deferred until pending requests are ack'ed. - * We therefore need to store the id from the original request. 
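- * (That deferred response, carrying this id, goes out once blkif_put() drops the last reference and blkif_disconnect_complete() runs.)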
- */ - u8 disconnect_rspid; -#ifdef CONFIG_XEN_BLKDEV_TAP_BE - /* Is this a blktap frontend */ - unsigned int is_blktap; -#endif - struct blkif_st *hash_next; - struct list_head blkdev_list; - spinlock_t blk_ring_lock; - atomic_t refcnt; - - struct work_struct work; -#ifdef CONFIG_XEN_BLKDEV_GRANT - u16 shmem_handle; - memory_t shmem_vaddr; - grant_ref_t shmem_ref; -#endif -} blkif_t; - -void blkif_create(blkif_be_create_t *create); -void blkif_destroy(blkif_be_destroy_t *destroy); -void blkif_connect(blkif_be_connect_t *connect); -int blkif_disconnect(blkif_be_disconnect_t *disconnect, u8 rsp_id); -void blkif_disconnect_complete(blkif_t *blkif); -blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle); -#define blkif_get(_b) (atomic_inc(&(_b)->refcnt)) -#define blkif_put(_b) \ - do { \ - if ( atomic_dec_and_test(&(_b)->refcnt) ) \ - blkif_disconnect_complete(_b); \ - } while (0) - -void vbd_create(blkif_be_vbd_create_t *create); -void vbd_destroy(blkif_be_vbd_destroy_t *delete); -int vbd_probe(blkif_t *blkif, vdisk_t *vbd_info, int max_vbds); -void destroy_all_vbds(blkif_t *blkif); - -struct phys_req { - unsigned short dev; - unsigned short nr_sects; - struct block_device *bdev; - blkif_sector_t sector_number; -}; - -int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation); - -void blkif_interface_init(void); -void blkif_ctrlif_init(void); - -void blkif_deschedule(blkif_t *blkif); - -irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs); - -#endif /* __BLKIF__BACKEND__COMMON_H__ */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/blkback/control.c --- a/linux-2.6.11-xen-sparse/drivers/xen/blkback/control.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,61 +0,0 @@ -/****************************************************************************** - * arch/xen/drivers/blkif/backend/control.c - * - * Routines for interfacing with the control plane. - * - * Copyright (c) 2004, Keir Fraser - */ - -#include "common.h" - -static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) -{ - DPRINTK("Received blkif backend message, subtype=%d\n", msg->subtype); - - switch ( msg->subtype ) - { - case CMSG_BLKIF_BE_CREATE: - blkif_create((blkif_be_create_t *)&msg->msg[0]); - break; - case CMSG_BLKIF_BE_DESTROY: - blkif_destroy((blkif_be_destroy_t *)&msg->msg[0]); - break; - case CMSG_BLKIF_BE_CONNECT: - blkif_connect((blkif_be_connect_t *)&msg->msg[0]); - break; - case CMSG_BLKIF_BE_DISCONNECT: - if ( !blkif_disconnect((blkif_be_disconnect_t *)&msg->msg[0],msg->id) ) - return; /* Sending the response is deferred until later. */ - break; - case CMSG_BLKIF_BE_VBD_CREATE: - vbd_create((blkif_be_vbd_create_t *)&msg->msg[0]); - break; - case CMSG_BLKIF_BE_VBD_DESTROY: - vbd_destroy((blkif_be_vbd_destroy_t *)&msg->msg[0]); - break; - default: - DPRINTK("Parse error while reading message subtype %d, len %d\n", - msg->subtype, msg->length); - msg->length = 0; - break; - } - - ctrl_if_send_response(msg); -} - -void blkif_ctrlif_init(void) -{ - ctrl_msg_t cmsg; - blkif_be_driver_status_t st; - - (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx, - CALLBACK_IN_BLOCKING_CONTEXT); - - /* Send a driver-UP notification to the domain controller. 
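This tells the domain controller that the backend driver is ready before any per-interface control messages arrive.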
*/ - cmsg.type = CMSG_BLKIF_BE; - cmsg.subtype = CMSG_BLKIF_BE_DRIVER_STATUS; - cmsg.length = sizeof(blkif_be_driver_status_t); - st.status = BLKIF_DRIVER_STATUS_UP; - memcpy(cmsg.msg, &st, sizeof(st)); - ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/blkback/vbd.c --- a/linux-2.6.11-xen-sparse/drivers/xen/blkback/vbd.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,295 +0,0 @@ -/****************************************************************************** - * blkback/vbd.c - * - * Routines for managing virtual block devices (VBDs). - * - * NOTE: vbd_lock protects updates to the rb_tree against concurrent lookups - * in vbd_translate. All other lookups are implicitly protected because the - * only caller (the control message dispatch routine) serializes the calls. - * - * Copyright (c) 2003-2005, Keir Fraser & Steve Hand - */ - -#include "common.h" - -struct vbd { - blkif_vdev_t vdevice; /* what the domain refers to this vbd as */ - unsigned char readonly; /* Non-zero -> read-only */ - unsigned char type; /* VDISK_xxx */ - blkif_pdev_t pdevice; /* phys device that this vbd maps to */ - struct block_device *bdev; - rb_node_t rb; /* for linking into R-B tree lookup struct */ -}; - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) -static inline dev_t vbd_map_devnum(blkif_pdev_t cookie) -{ return MKDEV(cookie>>8, cookie&0xff); } -#define vbd_sz(_v) ((_v)->bdev->bd_part ? \ - (_v)->bdev->bd_part->nr_sects : (_v)->bdev->bd_disk->capacity) -#define bdev_put(_b) blkdev_put(_b) -#else -#define vbd_sz(_v) (blk_size[MAJOR((_v)->pdevice)][MINOR((_v)->pdevice)]*2) -#define bdev_put(_b) ((void)0) -#define bdev_hardsect_size(_b) 512 -#endif - -void vbd_create(blkif_be_vbd_create_t *create) -{ - struct vbd *vbd; - rb_node_t **rb_p, *rb_parent = NULL; - blkif_t *blkif; - blkif_vdev_t vdevice = create->vdevice; - - blkif = blkif_find_by_handle(create->domid, create->blkif_handle); - if ( unlikely(blkif == NULL) ) - { - DPRINTK("vbd_create attempted for non-existent blkif (%u,%u)\n", - create->domid, create->blkif_handle); - create->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; - return; - } - - rb_p = &blkif->vbd_rb.rb_node; - while ( *rb_p != NULL ) - { - rb_parent = *rb_p; - vbd = rb_entry(rb_parent, struct vbd, rb); - if ( vdevice < vbd->vdevice ) - { - rb_p = &rb_parent->rb_left; - } - else if ( vdevice > vbd->vdevice ) - { - rb_p = &rb_parent->rb_right; - } - else - { - DPRINTK("vbd_create attempted for already existing vbd\n"); - create->status = BLKIF_BE_STATUS_VBD_EXISTS; - return; - } - } - - if ( unlikely((vbd = kmalloc(sizeof(struct vbd), GFP_KERNEL)) == NULL) ) - { - DPRINTK("vbd_create: out of memory\n"); - create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; - return; - } - - vbd->vdevice = vdevice; - vbd->readonly = create->readonly; - vbd->type = 0; - - /* Mask to 16-bit for compatibility with old tools */ - vbd->pdevice = create->pdevice & 0xffff; - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - vbd->bdev = open_by_devnum( - vbd_map_devnum(vbd->pdevice), - vbd->readonly ? 
FMODE_READ : FMODE_WRITE); - if ( IS_ERR(vbd->bdev) ) - { - DPRINTK("vbd_create: device %08x doesn't exist.\n", vbd->pdevice); - create->status = BLKIF_BE_STATUS_PHYSDEV_NOT_FOUND; - return; - } - - if ( (vbd->bdev->bd_disk == NULL) ) - { - DPRINTK("vbd_create: device %08x doesn't exist.\n", vbd->pdevice); - create->status = BLKIF_BE_STATUS_PHYSDEV_NOT_FOUND; - bdev_put(vbd->bdev); - return; - } - - if ( vbd->bdev->bd_disk->flags & GENHD_FL_CD ) - vbd->type |= VDISK_CDROM; - if ( vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE ) - vbd->type |= VDISK_REMOVABLE; - -#else - if ( (blk_size[MAJOR(vbd->pdevice)] == NULL) || (vbd_sz(vbd) == 0) ) - { - DPRINTK("vbd_create: device %08x doesn't exist.\n", vbd->pdevice); - create->status = BLKIF_BE_STATUS_PHYSDEV_NOT_FOUND; - return; - } -#endif - - spin_lock(&blkif->vbd_lock); - rb_link_node(&vbd->rb, rb_parent, rb_p); - rb_insert_color(&vbd->rb, &blkif->vbd_rb); - spin_unlock(&blkif->vbd_lock); - - DPRINTK("Successful creation of vdev=%04x (dom=%u)\n", - vdevice, create->domid); - create->status = BLKIF_BE_STATUS_OKAY; -} - - -void vbd_destroy(blkif_be_vbd_destroy_t *destroy) -{ - blkif_t *blkif; - struct vbd *vbd; - rb_node_t *rb; - blkif_vdev_t vdevice = destroy->vdevice; - - blkif = blkif_find_by_handle(destroy->domid, destroy->blkif_handle); - if ( unlikely(blkif == NULL) ) - { - DPRINTK("vbd_destroy attempted for non-existent blkif (%u,%u)\n", - destroy->domid, destroy->blkif_handle); - destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; - return; - } - - rb = blkif->vbd_rb.rb_node; - while ( rb != NULL ) - { - vbd = rb_entry(rb, struct vbd, rb); - if ( vdevice < vbd->vdevice ) - rb = rb->rb_left; - else if ( vdevice > vbd->vdevice ) - rb = rb->rb_right; - else - goto found; - } - - destroy->status = BLKIF_BE_STATUS_VBD_NOT_FOUND; - return; - - found: - spin_lock(&blkif->vbd_lock); - rb_erase(rb, &blkif->vbd_rb); - spin_unlock(&blkif->vbd_lock); - bdev_put(vbd->bdev); - kfree(vbd); -} - - -void destroy_all_vbds(blkif_t *blkif) -{ - struct vbd *vbd; - rb_node_t *rb; - - spin_lock(&blkif->vbd_lock); - - while ( (rb = blkif->vbd_rb.rb_node) != NULL ) - { - vbd = rb_entry(rb, struct vbd, rb); - rb_erase(rb, &blkif->vbd_rb); - spin_unlock(&blkif->vbd_lock); - bdev_put(vbd->bdev); - kfree(vbd); - spin_lock(&blkif->vbd_lock); - } - - spin_unlock(&blkif->vbd_lock); -} - - -static void vbd_probe_single( - blkif_t *blkif, vdisk_t *vbd_info, struct vbd *vbd) -{ - vbd_info->device = vbd->vdevice; - vbd_info->info = vbd->type | (vbd->readonly ? VDISK_READONLY : 0); - vbd_info->capacity = vbd_sz(vbd); - vbd_info->sector_size = bdev_hardsect_size(vbd->bdev); -} - - -int vbd_probe(blkif_t *blkif, vdisk_t *vbd_info, int max_vbds) -{ - int rc = 0, nr_vbds = 0; - rb_node_t *rb; - - spin_lock(&blkif->vbd_lock); - - if ( (rb = blkif->vbd_rb.rb_node) == NULL ) - goto out; - - new_subtree: - /* STEP 1. Find least node (it'll be left-most). */ - while ( rb->rb_left != NULL ) - rb = rb->rb_left; - - for ( ; ; ) - { - /* STEP 2. Dealt with left subtree. Now process current node. */ - vbd_probe_single(blkif, &vbd_info[nr_vbds], - rb_entry(rb, struct vbd, rb)); - if ( ++nr_vbds == max_vbds ) - goto out; - - /* STEP 3. Process right subtree, if any. */ - if ( rb->rb_right != NULL ) - { - rb = rb->rb_right; - goto new_subtree; - } - - /* STEP 4. Done both subtrees. Head back through ancestors. */ - for ( ; ; ) - { - /* We're done when we get back to the root node. */ - if ( rb->rb_parent == NULL ) - goto out; - /* If we are left of parent, then parent is next to process.
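(Its left subtree, which we have just finished walking, is exactly the set of nodes that precede it in order.)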
*/ - if ( rb->rb_parent->rb_left == rb ) - break; - /* If we are right of parent, then we climb to grandparent. */ - rb = rb->rb_parent; - } - - rb = rb->rb_parent; - } - - out: - spin_unlock(&blkif->vbd_lock); - return (rc == 0) ? nr_vbds : rc; -} - - -int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation) -{ - struct vbd *vbd; - rb_node_t *rb; - int rc = -EACCES; - - /* Take the vbd_lock because another thread could be updating the tree. */ - spin_lock(&blkif->vbd_lock); - - rb = blkif->vbd_rb.rb_node; - while ( rb != NULL ) - { - vbd = rb_entry(rb, struct vbd, rb); - if ( req->dev < vbd->vdevice ) - rb = rb->rb_left; - else if ( req->dev > vbd->vdevice ) - rb = rb->rb_right; - else - goto found; - } - - DPRINTK("vbd_translate; domain %u attempted to access " - "non-existent VBD.\n", blkif->domid); - rc = -ENODEV; - goto out; - - found: - - if ( (operation == WRITE) && vbd->readonly ) - goto out; - - if ( unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd)) ) - goto out; - - req->dev = vbd->pdevice; - req->bdev = vbd->bdev; - rc = 0; - - out: - spin_unlock(&blkif->vbd_lock); - return rc; -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/blkfront/Kconfig --- a/linux-2.6.11-xen-sparse/drivers/xen/blkfront/Kconfig Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,6 +0,0 @@ - -config XENBLOCK - tristate "Block device driver" - depends on ARCH_XEN - help - Block device driver for Xen diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/blkfront/Makefile --- a/linux-2.6.11-xen-sparse/drivers/xen/blkfront/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,3 +0,0 @@ - -obj-y := blkfront.o vbd.o - diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/blkfront/blkfront.c --- a/linux-2.6.11-xen-sparse/drivers/xen/blkfront/blkfront.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,1491 +0,0 @@ -/****************************************************************************** - * blkfront.c - * - * XenLinux virtual block-device driver. - * - * Copyright (c) 2003-2004, Keir Fraser & Steve Hand - * Modifications by Mark A. Williamson are (c) Intel Research Cambridge - * Copyright (c) 2004, Christian Limpach - * Copyright (c) 2004, Andrew Warfield - * Copyright (c) 2005, Christopher Clark - * - * This file may be distributed separately from the Linux kernel, or - * incorporated into other software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#if 1 -#define ASSERT(_p) \ - if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \ - __LINE__, __FILE__); *(int*)0=0; } -#else -#define ASSERT(_p) -#endif - -#include <linux/version.h> - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) -#include "block.h" -#else -#include "common.h" -#include <linux/blk.h> -#include <linux/tqueue.h> -#endif - -#include <linux/cdrom.h> -#include <linux/sched.h> -#include <linux/interrupt.h> -#include <scsi/scsi.h> -#include <asm-xen/ctrl_if.h> -#include <asm-xen/evtchn.h> -#ifdef CONFIG_XEN_BLKDEV_GRANT -#include <asm-xen/xen-public/grant_table.h> -#include <asm-xen/gnttab.h> -#endif - -typedef unsigned char byte; /* from linux/ide.h */ - -/* Control whether runtime update of vbds is enabled. */ -#define ENABLE_VBD_UPDATE 1 - -#if ENABLE_VBD_UPDATE -static void vbd_update(void); -#else -static void vbd_update(void){}; -#endif - -#define BLKIF_STATE_CLOSED 0 -#define BLKIF_STATE_DISCONNECTED 1 -#define BLKIF_STATE_CONNECTED 2 - -static int blkif_handle = 0; -static unsigned int blkif_state = BLKIF_STATE_CLOSED; -static unsigned int blkif_evtchn = 0; -static unsigned int blkif_irq = 0; - -static int blkif_control_rsp_valid; -static blkif_response_t blkif_control_rsp; - -static blkif_front_ring_t blk_ring; - -#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE) - -#ifdef CONFIG_XEN_BLKDEV_GRANT -static domid_t rdomid = 0; -static grant_ref_t gref_head, gref_terminal; -#define MAXIMUM_OUTSTANDING_BLOCK_REQS \ - (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLKIF_RING_SIZE) -#define GRANTREF_INVALID (1<<15) -#endif - -static struct blk_shadow { - blkif_request_t req; - unsigned long request; - unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST]; -} blk_shadow[BLK_RING_SIZE]; -unsigned long blk_shadow_free; - -static int recovery = 0; /* Recovery in progress: protected by blkif_io_lock */ - -static void kick_pending_request_queues(void); - -int __init xlblk_init(void); - -static void blkif_completion(struct blk_shadow *s); - -static inline int GET_ID_FROM_FREELIST(void) -{ - unsigned long free = blk_shadow_free; - BUG_ON(free > BLK_RING_SIZE); - blk_shadow_free = blk_shadow[free].req.id; - blk_shadow[free].req.id = 0x0fffffee; /* debug */ - return free; -} - -static inline void ADD_ID_TO_FREELIST(unsigned long id) -{ - blk_shadow[id].req.id = blk_shadow_free; - blk_shadow[id].request = 0; - blk_shadow_free = id; -} - - -/************************ COMMON CODE (inlined) ************************/ - -/* Kernel-specific definitions used in the common code */ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) -#define DISABLE_SCATTERGATHER() -#else -static int sg_operation = -1; -#define DISABLE_SCATTERGATHER() (sg_operation = -1) -#endif - -static inline void pickle_request(struct blk_shadow *s, blkif_request_t *r) -{ -#ifndef CONFIG_XEN_BLKDEV_GRANT - int i; -#endif - - s->req = *r; - -#ifndef CONFIG_XEN_BLKDEV_GRANT - for ( i = 0; i < r->nr_segments; i++ ) - s->req.frame_and_sects[i] = machine_to_phys(r->frame_and_sects[i]); -#endif -} - -static inline void unpickle_request(blkif_request_t *r, struct blk_shadow *s) -{ -#ifndef CONFIG_XEN_BLKDEV_GRANT - int i; -#endif - - *r = s->req; - -#ifndef CONFIG_XEN_BLKDEV_GRANT - for ( i = 0; i < s->req.nr_segments; i++ ) - 
r->frame_and_sects[i] = phys_to_machine(s->req.frame_and_sects[i]); -#endif -} - - -static inline void flush_requests(void) -{ - DISABLE_SCATTERGATHER(); - RING_PUSH_REQUESTS(&blk_ring); - notify_via_evtchn(blkif_evtchn); -} - - -/************************** KERNEL VERSION 2.6 **************************/ - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - -module_init(xlblk_init); - -#if ENABLE_VBD_UPDATE -static void update_vbds_task(void *unused) -{ - xlvbd_update_vbds(); -} - -static void vbd_update(void) -{ - static DECLARE_WORK(update_tq, update_vbds_task, NULL); - schedule_work(&update_tq); -} -#endif /* ENABLE_VBD_UPDATE */ - -static struct xlbd_disk_info *head_waiting = NULL; -static void kick_pending_request_queues(void) -{ - struct xlbd_disk_info *di; - while ( ((di = head_waiting) != NULL) && !RING_FULL(&blk_ring) ) - { - head_waiting = di->next_waiting; - di->next_waiting = NULL; - /* Re-enable calldowns. */ - blk_start_queue(di->rq); - /* Kick things off immediately. */ - do_blkif_request(di->rq); - } -} - -int blkif_open(struct inode *inode, struct file *filep) -{ - struct gendisk *gd = inode->i_bdev->bd_disk; - struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data; - - /* Update of usage count is protected by per-device semaphore. */ - di->mi->usage++; - - return 0; -} - - -int blkif_release(struct inode *inode, struct file *filep) -{ - struct gendisk *gd = inode->i_bdev->bd_disk; - struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data; - - /* - * When usage drops to zero it may allow more VBD updates to occur. - * Update of usage count is protected by a per-device semaphore. - */ - if ( --di->mi->usage == 0 ) - vbd_update(); - - return 0; -} - - -int blkif_ioctl(struct inode *inode, struct file *filep, - unsigned command, unsigned long argument) -{ - int i; - - DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n", - command, (long)argument, inode->i_rdev); - - switch ( command ) - { - case HDIO_GETGEO: - /* return ENOSYS to use defaults */ - return -ENOSYS; - - case CDROMMULTISESSION: - DPRINTK("FIXME: support multisession CDs later\n"); - for ( i = 0; i < sizeof(struct cdrom_multisession); i++ ) - if ( put_user(0, (byte *)(argument + i)) ) return -EFAULT; - return 0; - - default: - /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n", - command);*/ - return -EINVAL; /* same return as native Linux */ - } - - return 0; -} - - -/* - * blkif_queue_request - * - * request block io - * - * id: for guest use only. - * operation: BLKIF_OP_{READ,WRITE,PROBE} - * buffer: buffer to read/write into. this should be a - * virtual address in the guest os. - */ -static int blkif_queue_request(struct request *req) -{ - struct xlbd_disk_info *di = req->rq_disk->private_data; - unsigned long buffer_ma; - blkif_request_t *ring_req; - struct bio *bio; - struct bio_vec *bvec; - int idx; - unsigned long id; - unsigned int fsect, lsect; -#ifdef CONFIG_XEN_BLKDEV_GRANT - int ref; -#endif - - if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) ) - return 1; - - /* Fill out a communications ring structure. */ - ring_req = RING_GET_REQUEST(&blk_ring, blk_ring.req_prod_pvt); - id = GET_ID_FROM_FREELIST(); - blk_shadow[id].request = (unsigned long)req; - - ring_req->id = id; - ring_req->operation = rq_data_dir(req) ? 
BLKIF_OP_WRITE : - BLKIF_OP_READ; - ring_req->sector_number = (blkif_sector_t)req->sector; - ring_req->device = di->xd_device; - - ring_req->nr_segments = 0; - rq_for_each_bio(bio, req) - { - bio_for_each_segment(bvec, bio, idx) - { - if ( ring_req->nr_segments == BLKIF_MAX_SEGMENTS_PER_REQUEST ) - BUG(); - buffer_ma = page_to_phys(bvec->bv_page); - fsect = bvec->bv_offset >> 9; - lsect = fsect + (bvec->bv_len >> 9) - 1; -#ifdef CONFIG_XEN_BLKDEV_GRANT - /* install a grant reference. */ - ref = gnttab_claim_grant_reference(&gref_head, gref_terminal); - ASSERT( ref != -ENOSPC ); - - gnttab_grant_foreign_access_ref( - ref, - rdomid, - buffer_ma >> PAGE_SHIFT, - rq_data_dir(req) ); - - blk_shadow[id].frame[ring_req->nr_segments] = - buffer_ma >> PAGE_SHIFT; - - ring_req->frame_and_sects[ring_req->nr_segments++] = - (((u32) ref) << 16) | (fsect << 3) | lsect; - -#else - ring_req->frame_and_sects[ring_req->nr_segments++] = - buffer_ma | (fsect << 3) | lsect; -#endif - } - } - - blk_ring.req_prod_pvt++; - - /* Keep a private copy so we can reissue requests when recovering. */ - pickle_request(&blk_shadow[id], ring_req); - - return 0; -} - - -/* - * do_blkif_request - * read a block; request is in a request queue - */ -void do_blkif_request(request_queue_t *rq) -{ - struct xlbd_disk_info *di; - struct request *req; - int queued; - - DPRINTK("Entered do_blkif_request\n"); - - queued = 0; - - while ( (req = elv_next_request(rq)) != NULL ) - { - if ( !blk_fs_request(req) ) - { - end_request(req, 0); - continue; - } - - if ( RING_FULL(&blk_ring) ) - goto wait; - - DPRINTK("do_blk_req %p: cmd %p, sec %lx, (%u/%li) buffer:%p [%s]\n", - req, req->cmd, req->sector, req->current_nr_sectors, - req->nr_sectors, req->buffer, - rq_data_dir(req) ? "write" : "read"); - - blkdev_dequeue_request(req); - if ( blkif_queue_request(req) ) - { - wait: - di = req->rq_disk->private_data; - if ( di->next_waiting == NULL ) - { - di->next_waiting = head_waiting; - head_waiting = di; - /* Avoid pointless unplugs. */ - blk_stop_queue(rq); - } - break; - } - - queued++; - } - - if ( queued != 0 ) - flush_requests(); -} - - -static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs) -{ - struct request *req; - blkif_response_t *bret; - RING_IDX i, rp; - unsigned long flags; - - spin_lock_irqsave(&blkif_io_lock, flags); - - if ( unlikely(blkif_state == BLKIF_STATE_CLOSED) || - unlikely(recovery) ) - { - spin_unlock_irqrestore(&blkif_io_lock, flags); - return IRQ_HANDLED; - } - - rp = blk_ring.sring->rsp_prod; - rmb(); /* Ensure we see queued responses up to 'rp'. 
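This read barrier pairs with the wmb() the backend issues in make_response() before advancing rsp_prod.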
*/ - - for ( i = blk_ring.rsp_cons; i != rp; i++ ) - { - unsigned long id; - - bret = RING_GET_RESPONSE(&blk_ring, i); - id = bret->id; - req = (struct request *)blk_shadow[id].request; - - blkif_completion(&blk_shadow[id]); - - ADD_ID_TO_FREELIST(id); - - switch ( bret->operation ) - { - case BLKIF_OP_READ: - case BLKIF_OP_WRITE: - if ( unlikely(bret->status != BLKIF_RSP_OKAY) ) - DPRINTK("Bad return from blkdev data request: %x\n", - bret->status); - - if ( unlikely(end_that_request_first - (req, - (bret->status == BLKIF_RSP_OKAY), - req->hard_nr_sectors)) ) - BUG(); - end_that_request_last(req); - - break; - case BLKIF_OP_PROBE: - memcpy(&blkif_control_rsp, bret, sizeof(*bret)); - blkif_control_rsp_valid = 1; - break; - default: - BUG(); - } - } - - blk_ring.rsp_cons = i; - - kick_pending_request_queues(); - - spin_unlock_irqrestore(&blkif_io_lock, flags); - - return IRQ_HANDLED; -} - -#else -/************************** KERNEL VERSION 2.4 **************************/ - -static kdev_t sg_dev; -static unsigned long sg_next_sect; - -/* - * Request queues with outstanding work, but ring is currently full. - * We need no special lock here, as we always access this with the - * blkif_io_lock held. We only need a small maximum list. - */ -#define MAX_PENDING 8 -static request_queue_t *pending_queues[MAX_PENDING]; -static int nr_pending; - - -#define blkif_io_lock io_request_lock - -/*============================================================================*/ -#if ENABLE_VBD_UPDATE - -/* - * blkif_update_int/update-vbds_task - handle VBD update events. - * Schedule a task for keventd to run, which will update the VBDs and perform - * the corresponding updates to our view of VBD state. - */ -static void update_vbds_task(void *unused) -{ - xlvbd_update_vbds(); -} - -static void vbd_update(void) -{ - static struct tq_struct update_tq; - update_tq.routine = update_vbds_task; - schedule_task(&update_tq); -} - -#endif /* ENABLE_VBD_UPDATE */ -/*============================================================================*/ - -static void kick_pending_request_queues(void) -{ - /* We kick pending request queues if the ring is reasonably empty. */ - if ( (nr_pending != 0) && - (RING_PENDING_REQUESTS(&blk_ring) < (BLK_RING_SIZE >> 1)) ) - { - /* Attempt to drain the queue, but bail if the ring becomes full. */ - while ( (nr_pending != 0) && !RING_FULL(&blk_ring) ) - do_blkif_request(pending_queues[--nr_pending]); - } -} - -int blkif_open(struct inode *inode, struct file *filep) -{ - short xldev = inode->i_rdev; - struct gendisk *gd = get_gendisk(xldev); - xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev); - short minor = MINOR(xldev); - - if ( gd->part[minor].nr_sects == 0 ) - { - /* - * Device either doesn't exist, or has zero capacity; we use a few - * cheesy heuristics to return the relevant error code - */ - if ( (gd->sizes[minor >> gd->minor_shift] != 0) || - ((minor & (gd->max_p - 1)) != 0) ) - { - /* - * We have a real device, but no such partition, or we just have a - * partition number so guess this is the problem. - */ - return -ENXIO; /* no such device or address */ - } - else if ( gd->flags[minor >> gd->minor_shift] & GENHD_FL_REMOVABLE ) - { - /* This is a removable device => assume that media is missing. */ - return -ENOMEDIUM; /* media not present (this is a guess) */ - } - else - { - /* Just go for the general 'no such device' error. */ - return -ENODEV; /* no such device */ - } - } - - /* Update of usage count is protected by per-device semaphore. 
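(The VFS serialises open and release for a given device, so a plain increment suffices here.)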
*/ - disk->usage++; - - return 0; -} - - -int blkif_release(struct inode *inode, struct file *filep) -{ - xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev); - - /* - * When usage drops to zero it may allow more VBD updates to occur. - * Update of usage count is protected by a per-device semaphore. - */ - if ( --disk->usage == 0 ) { - vbd_update(); - } - - return 0; -} - - -int blkif_ioctl(struct inode *inode, struct file *filep, - unsigned command, unsigned long argument) -{ - kdev_t dev = inode->i_rdev; - struct hd_geometry *geo = (struct hd_geometry *)argument; - struct gendisk *gd; - struct hd_struct *part; - int i; - unsigned short cylinders; - byte heads, sectors; - - /* NB. No need to check permissions. That is done for us. */ - - DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n", - command, (long) argument, dev); - - gd = get_gendisk(dev); - part = &gd->part[MINOR(dev)]; - - switch ( command ) - { - case BLKGETSIZE: - DPRINTK_IOCTL(" BLKGETSIZE: %x %lx\n", BLKGETSIZE, part->nr_sects); - return put_user(part->nr_sects, (unsigned long *) argument); - - case BLKGETSIZE64: - DPRINTK_IOCTL(" BLKGETSIZE64: %x %llx\n", BLKGETSIZE64, - (u64)part->nr_sects * 512); - return put_user((u64)part->nr_sects * 512, (u64 *) argument); - - case BLKRRPART: /* re-read partition table */ - DPRINTK_IOCTL(" BLKRRPART: %x\n", BLKRRPART); - return blkif_revalidate(dev); - - case BLKSSZGET: - return hardsect_size[MAJOR(dev)][MINOR(dev)]; - - case BLKBSZGET: /* get block size */ - DPRINTK_IOCTL(" BLKBSZGET: %x\n", BLKBSZGET); - break; - - case BLKBSZSET: /* set block size */ - DPRINTK_IOCTL(" BLKBSZSET: %x\n", BLKBSZSET); - break; - - case BLKRASET: /* set read-ahead */ - DPRINTK_IOCTL(" BLKRASET: %x\n", BLKRASET); - break; - - case BLKRAGET: /* get read-ahead */ - DPRINTK_IOCTL(" BLKRAFET: %x\n", BLKRAGET); - break; - - case HDIO_GETGEO: - DPRINTK_IOCTL(" HDIO_GETGEO: %x\n", HDIO_GETGEO); - if (!argument) return -EINVAL; - - /* We don't have real geometry info, but let's at least return - values consistent with the size of the device */ - - heads = 0xff; - sectors = 0x3f; - cylinders = part->nr_sects / (heads * sectors); - - if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT; - if (put_user(heads, (byte *)&geo->heads)) return -EFAULT; - if (put_user(sectors, (byte *)&geo->sectors)) return -EFAULT; - if (put_user(cylinders, (unsigned short *)&geo->cylinders)) return -EFAULT; - - return 0; - - case HDIO_GETGEO_BIG: - DPRINTK_IOCTL(" HDIO_GETGEO_BIG: %x\n", HDIO_GETGEO_BIG); - if (!argument) return -EINVAL; - - /* We don't have real geometry info, but let's at least return - values consistent with the size of the device */ - - heads = 0xff; - sectors = 0x3f; - cylinders = part->nr_sects / (heads * sectors); - - if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT; - if (put_user(heads, (byte *)&geo->heads)) return -EFAULT; - if (put_user(sectors, (byte *)&geo->sectors)) return -EFAULT; - if (put_user(cylinders, (unsigned int *) &geo->cylinders)) return -EFAULT; - - return 0; - - case CDROMMULTISESSION: - DPRINTK("FIXME: support multisession CDs later\n"); - for ( i = 0; i < sizeof(struct cdrom_multisession); i++ ) - if ( put_user(0, (byte *)(argument + i)) ) return -EFAULT; - return 0; - - case SCSI_IOCTL_GET_BUS_NUMBER: - DPRINTK("FIXME: SCSI_IOCTL_GET_BUS_NUMBER ioctl in XL blkif"); - return -ENOSYS; - - default: - WPRINTK("ioctl %08x not supported by XL blkif\n", command); - return -ENOSYS; - } - - return 0; -} - - - -/* check media change: should probably do something 
here in some cases :-) */ -int blkif_check(kdev_t dev) -{ - DPRINTK("blkif_check\n"); - return 0; -} - -int blkif_revalidate(kdev_t dev) -{ - struct block_device *bd; - struct gendisk *gd; - xl_disk_t *disk; - unsigned long capacity; - int i, rc = 0; - - if ( (bd = bdget(dev)) == NULL ) - return -EINVAL; - - /* - * Update of partition info, and check of usage count, is protected - * by the per-block-device semaphore. - */ - down(&bd->bd_sem); - - if ( ((gd = get_gendisk(dev)) == NULL) || - ((disk = xldev_to_xldisk(dev)) == NULL) || - ((capacity = gd->part[MINOR(dev)].nr_sects) == 0) ) - { - rc = -EINVAL; - goto out; - } - - if ( disk->usage > 1 ) - { - rc = -EBUSY; - goto out; - } - - /* Only reread partition table if VBDs aren't mapped to partitions. */ - if ( !(gd->flags[MINOR(dev) >> gd->minor_shift] & GENHD_FL_VIRT_PARTNS) ) - { - for ( i = gd->max_p - 1; i >= 0; i-- ) - { - invalidate_device(dev+i, 1); - gd->part[MINOR(dev+i)].start_sect = 0; - gd->part[MINOR(dev+i)].nr_sects = 0; - gd->sizes[MINOR(dev+i)] = 0; - } - - grok_partitions(gd, MINOR(dev)>>gd->minor_shift, gd->max_p, capacity); - } - - out: - up(&bd->bd_sem); - bdput(bd); - return rc; -} - - -/* - * blkif_queue_request - * - * request block io - * - * id: for guest use only. - * operation: BLKIF_OP_{READ,WRITE,PROBE} - * buffer: buffer to read/write into. this should be a - * virtual address in the guest os. - */ -static int blkif_queue_request(unsigned long id, - int operation, - char * buffer, - unsigned long sector_number, - unsigned short nr_sectors, - kdev_t device) -{ - unsigned long buffer_ma = virt_to_bus(buffer); - unsigned long xid; - struct gendisk *gd; - blkif_request_t *req; - struct buffer_head *bh; - unsigned int fsect, lsect; -#ifdef CONFIG_XEN_BLKDEV_GRANT - int ref; -#endif - - fsect = (buffer_ma & ~PAGE_MASK) >> 9; - lsect = fsect + nr_sectors - 1; - - /* Buffer must be sector-aligned. Extent mustn't cross a page boundary. */ - if ( unlikely((buffer_ma & ((1<<9)-1)) != 0) ) - BUG(); - if ( lsect > 7 ) - BUG(); - - buffer_ma &= PAGE_MASK; - - if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) ) - return 1; - - switch ( operation ) - { - - case BLKIF_OP_READ: - case BLKIF_OP_WRITE: - gd = get_gendisk(device); - - /* - * Update the sector_number we'll pass down as appropriate; note that - * we could sanity check that resulting sector will be in this - * partition, but this will happen in driver backend anyhow. - */ - sector_number += gd->part[MINOR(device)].start_sect; - - /* - * If this unit doesn't consist of virtual partitions then we clear - * the partn bits from the device number. - */ - if ( !(gd->flags[MINOR(device)>>gd->minor_shift] & - GENHD_FL_VIRT_PARTNS) ) - device &= ~(gd->max_p - 1); - - if ( (sg_operation == operation) && - (sg_dev == device) && - (sg_next_sect == sector_number) ) - { - req = RING_GET_REQUEST(&blk_ring, - blk_ring.req_prod_pvt - 1); - bh = (struct buffer_head *)id; - - bh->b_reqnext = (struct buffer_head *)blk_shadow[req->id].request; - blk_shadow[req->id].request = (unsigned long)id; - -#ifdef CONFIG_XEN_BLKDEV_GRANT - /* install a grant reference. */ - ref = gnttab_claim_grant_reference(&gref_head, gref_terminal); - ASSERT( ref != -ENOSPC ); - - gnttab_grant_foreign_access_ref( - ref, - rdomid, - buffer_ma >> PAGE_SHIFT, - ( operation == BLKIF_OP_WRITE ? 
1 : 0 ) ); - - blk_shadow[req->id].frame[req->nr_segments] = - buffer_ma >> PAGE_SHIFT; - - req->frame_and_sects[req->nr_segments] = - (((u32) ref ) << 16) | (fsect << 3) | lsect; -#else - req->frame_and_sects[req->nr_segments] = - buffer_ma | (fsect << 3) | lsect; -#endif - if ( ++req->nr_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST ) - sg_next_sect += nr_sectors; - else - DISABLE_SCATTERGATHER(); - - /* Update the copy of the request in the recovery ring. */ - pickle_request(&blk_shadow[req->id], req ); - - return 0; - } - else if ( RING_FULL(&blk_ring) ) - { - return 1; - } - else - { - sg_operation = operation; - sg_dev = device; - sg_next_sect = sector_number + nr_sectors; - } - break; - - default: - panic("unknown op %d\n", operation); - } - - /* Fill out a communications ring structure. */ - req = RING_GET_REQUEST(&blk_ring, blk_ring.req_prod_pvt); - - xid = GET_ID_FROM_FREELIST(); - blk_shadow[xid].request = (unsigned long)id; - - req->id = xid; - req->operation = operation; - req->sector_number = (blkif_sector_t)sector_number; - req->device = device; - req->nr_segments = 1; -#ifdef CONFIG_XEN_BLKDEV_GRANT - /* install a grant reference. */ - ref = gnttab_claim_grant_reference(&gref_head, gref_terminal); - ASSERT( ref != -ENOSPC ); - - gnttab_grant_foreign_access_ref( - ref, - rdomid, - buffer_ma >> PAGE_SHIFT, - ( operation == BLKIF_OP_WRITE ? 1 : 0 ) ); - - blk_shadow[xid].frame[0] = buffer_ma >> PAGE_SHIFT; - - req->frame_and_sects[0] = (((u32) ref)<<16) | (fsect<<3) | lsect; -#else - req->frame_and_sects[0] = buffer_ma | (fsect<<3) | lsect; -#endif - - /* Keep a private copy so we can reissue requests when recovering. */ - pickle_request(&blk_shadow[xid], req); - - blk_ring.req_prod_pvt++; - - return 0; -} - - -/* - * do_blkif_request - * read a block; request is in a request queue - */ -void do_blkif_request(request_queue_t *rq) -{ - struct request *req; - struct buffer_head *bh, *next_bh; - int rw, nsect, full, queued = 0; - - DPRINTK("Entered do_blkif_request\n"); - - while ( !rq->plugged && !list_empty(&rq->queue_head)) - { - if ( (req = blkdev_entry_next_request(&rq->queue_head)) == NULL ) - goto out; - - DPRINTK("do_blkif_request %p: cmd %i, sec %lx, (%li/%li) bh:%p\n", - req, req->cmd, req->sector, - req->current_nr_sectors, req->nr_sectors, req->bh); - - rw = req->cmd; - if ( rw == READA ) - rw = READ; - if ( unlikely((rw != READ) && (rw != WRITE)) ) - panic("XenoLinux Virtual Block Device: bad cmd: %d\n", rw); - - req->errors = 0; - - bh = req->bh; - while ( bh != NULL ) - { - next_bh = bh->b_reqnext; - bh->b_reqnext = NULL; - - full = blkif_queue_request( - (unsigned long)bh, - (rw == READ) ? BLKIF_OP_READ : BLKIF_OP_WRITE, - bh->b_data, bh->b_rsector, bh->b_size>>9, bh->b_rdev); - - if ( full ) - { - bh->b_reqnext = next_bh; - pending_queues[nr_pending++] = rq; - if ( unlikely(nr_pending >= MAX_PENDING) ) - BUG(); - goto out; - } - - queued++; - - /* Dequeue the buffer head from the request. */ - nsect = bh->b_size >> 9; - bh = req->bh = next_bh; - - if ( bh != NULL ) - { - /* There's another buffer head to do. Update the request. */ - req->hard_sector += nsect; - req->hard_nr_sectors -= nsect; - req->sector = req->hard_sector; - req->nr_sectors = req->hard_nr_sectors; - req->current_nr_sectors = bh->b_size >> 9; - req->buffer = bh->b_data; - } - else - { - /* That was the last buffer head. Finalise the request. 
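Every sector has now been queued to the backend, so the request can be completed and removed from the queue.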
*/ - if ( unlikely(end_that_request_first(req, 1, "XenBlk")) ) - BUG(); - blkdev_dequeue_request(req); - end_that_request_last(req); - } - } - } - - out: - if ( queued != 0 ) - flush_requests(); -} - - -static void blkif_int(int irq, void *dev_id, struct pt_regs *ptregs) -{ - RING_IDX i, rp; - unsigned long flags; - struct buffer_head *bh, *next_bh; - - spin_lock_irqsave(&io_request_lock, flags); - - if ( unlikely(blkif_state == BLKIF_STATE_CLOSED || recovery) ) - { - spin_unlock_irqrestore(&io_request_lock, flags); - return; - } - - rp = blk_ring.sring->rsp_prod; - rmb(); /* Ensure we see queued responses up to 'rp'. */ - - for ( i = blk_ring.rsp_cons; i != rp; i++ ) - { - unsigned long id; - blkif_response_t *bret; - - bret = RING_GET_RESPONSE(&blk_ring, i); - id = bret->id; - bh = (struct buffer_head *)blk_shadow[id].request; - - blkif_completion(&blk_shadow[id]); - - ADD_ID_TO_FREELIST(id); - - switch ( bret->operation ) - { - case BLKIF_OP_READ: - case BLKIF_OP_WRITE: - if ( unlikely(bret->status != BLKIF_RSP_OKAY) ) - DPRINTK("Bad return from blkdev data request: %lx\n", - bret->status); - for ( ; bh != NULL; bh = next_bh ) - { - next_bh = bh->b_reqnext; - bh->b_reqnext = NULL; - bh->b_end_io(bh, bret->status == BLKIF_RSP_OKAY); - } - - break; - case BLKIF_OP_PROBE: - memcpy(&blkif_control_rsp, bret, sizeof(*bret)); - blkif_control_rsp_valid = 1; - break; - default: - BUG(); - } - - } - blk_ring.rsp_cons = i; - - kick_pending_request_queues(); - - spin_unlock_irqrestore(&io_request_lock, flags); -} - -#endif - -/***************************** COMMON CODE *******************************/ - -#ifdef CONFIG_XEN_BLKDEV_GRANT -void blkif_control_probe_send(blkif_request_t *req, blkif_response_t *rsp, - unsigned long address) -{ - int ref = gnttab_claim_grant_reference(&gref_head, gref_terminal); - ASSERT( ref != -ENOSPC ); - - gnttab_grant_foreign_access_ref( ref, rdomid, address >> PAGE_SHIFT, 0 ); - - req->frame_and_sects[0] = (((u32) ref) << 16) | 7; - - blkif_control_send(req, rsp); -} -#endif - -void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp) -{ - unsigned long flags, id; - blkif_request_t *req_d; - - retry: - while ( RING_FULL(&blk_ring) ) - { - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(1); - } - - spin_lock_irqsave(&blkif_io_lock, flags); - if ( RING_FULL(&blk_ring) ) - { - spin_unlock_irqrestore(&blkif_io_lock, flags); - goto retry; - } - - DISABLE_SCATTERGATHER(); - req_d = RING_GET_REQUEST(&blk_ring, blk_ring.req_prod_pvt); - *req_d = *req; - - id = GET_ID_FROM_FREELIST(); - req_d->id = id; - blk_shadow[id].request = (unsigned long)req; - - pickle_request(&blk_shadow[id], req); - - blk_ring.req_prod_pvt++; - flush_requests(); - - spin_unlock_irqrestore(&blkif_io_lock, flags); - - while ( !blkif_control_rsp_valid ) - { - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(1); - } - - memcpy(rsp, &blkif_control_rsp, sizeof(*rsp)); - blkif_control_rsp_valid = 0; -} - - -/* Send a driver status notification to the domain controller. */ -static void send_driver_status(int ok) -{ - ctrl_msg_t cmsg = { - .type = CMSG_BLKIF_FE, - .subtype = CMSG_BLKIF_FE_DRIVER_STATUS, - .length = sizeof(blkif_fe_driver_status_t), - }; - blkif_fe_driver_status_t *msg = (void*)cmsg.msg; - - msg->status = (ok ? BLKIF_DRIVER_STATUS_UP : BLKIF_DRIVER_STATUS_DOWN); - - ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); -} - -/* Tell the controller to bring up the interface. 
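The message carries the machine frame of the shared ring so that the backend can map it into its own address space.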
*/ -static void blkif_send_interface_connect(void) -{ - ctrl_msg_t cmsg = { - .type = CMSG_BLKIF_FE, - .subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT, - .length = sizeof(blkif_fe_interface_connect_t), - }; - blkif_fe_interface_connect_t *msg = (void*)cmsg.msg; - - msg->handle = 0; - msg->shmem_frame = (virt_to_machine(blk_ring.sring) >> PAGE_SHIFT); - -#ifdef CONFIG_XEN_BLKDEV_GRANT - msg->shmem_ref = gnttab_claim_grant_reference( &gref_head, gref_terminal ); - ASSERT( msg->shmem_ref != -ENOSPC ); - gnttab_grant_foreign_access_ref ( msg->shmem_ref , rdomid, msg->shmem_frame, 0 ); -#endif - - ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); -} - -static void blkif_free(void) -{ - /* Prevent new requests being issued until we fix things up. */ - spin_lock_irq(&blkif_io_lock); - recovery = 1; - blkif_state = BLKIF_STATE_DISCONNECTED; - spin_unlock_irq(&blkif_io_lock); - - /* Free resources associated with old device channel. */ - if ( blk_ring.sring != NULL ) - { - free_page((unsigned long)blk_ring.sring); - blk_ring.sring = NULL; - } - free_irq(blkif_irq, NULL); - blkif_irq = 0; - - unbind_evtchn_from_irq(blkif_evtchn); - blkif_evtchn = 0; -} - -static void blkif_close(void) -{ -} - -/* Move from CLOSED to DISCONNECTED state. */ -static void blkif_disconnect(void) -{ - blkif_sring_t *sring; - - if ( blk_ring.sring != NULL ) - free_page((unsigned long)blk_ring.sring); - - sring = (blkif_sring_t *)__get_free_page(GFP_KERNEL); - SHARED_RING_INIT(sring); - FRONT_RING_INIT(&blk_ring, sring, PAGE_SIZE); - blkif_state = BLKIF_STATE_DISCONNECTED; - blkif_send_interface_connect(); -} - -static void blkif_reset(void) -{ - blkif_free(); - blkif_disconnect(); -} - -static void blkif_recover(void) -{ - int i; - blkif_request_t *req; - struct blk_shadow *copy; -#ifdef CONFIG_XEN_BLKDEV_GRANT - int j; -#endif - - /* Stage 1: Make a safe copy of the shadow state. */ - copy = (struct blk_shadow *)kmalloc(sizeof(blk_shadow), GFP_KERNEL); - BUG_ON(copy == NULL); - memcpy(copy, blk_shadow, sizeof(blk_shadow)); - - /* Stage 2: Set up free list. */ - memset(&blk_shadow, 0, sizeof(blk_shadow)); - for ( i = 0; i < BLK_RING_SIZE; i++ ) - blk_shadow[i].req.id = i+1; - blk_shadow_free = blk_ring.req_prod_pvt; - blk_shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff; - - /* Stage 3: Find pending requests and requeue them. */ - for ( i = 0; i < BLK_RING_SIZE; i++ ) - { - /* Not in use? */ - if ( copy[i].request == 0 ) - continue; - - /* Grab a request slot and unpickle shadow state into it. */ - req = RING_GET_REQUEST( - &blk_ring, blk_ring.req_prod_pvt); - unpickle_request(req, &copy[i]); - - /* We get a new request id, and must reset the shadow state. */ - req->id = GET_ID_FROM_FREELIST(); - memcpy(&blk_shadow[req->id], &copy[i], sizeof(copy[i])); - -#ifdef CONFIG_XEN_BLKDEV_GRANT - /* Rewrite any grant references invalidated by suspend/resume. */ - for ( j = 0; j < req->nr_segments; j++ ) - { - if ( req->frame_and_sects[j] & GRANTREF_INVALID ) - gnttab_grant_foreign_access_ref( - blkif_gref_from_fas(req->frame_and_sects[j]), - rdomid, - blk_shadow[req->id].frame[j], - rq_data_dir((struct request *) - blk_shadow[req->id].request)); - req->frame_and_sects[j] &= ~GRANTREF_INVALID; - } - blk_shadow[req->id].req = *req; -#endif - - blk_ring.req_prod_pvt++; - } - - kfree(copy); - - recovery = 0; - - /* blk_ring->req_prod will be set when we flush_requests().*/ - wmb(); - - /* Kicks things back into life. */ - flush_requests(); - - /* Now safe to let other people use the interface.
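Every in-flight request has been reissued from the shadow copy, so new I/O cannot overtake a recovered one.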
*/ - blkif_state = BLKIF_STATE_CONNECTED; -} - -static void blkif_connect(blkif_fe_interface_status_t *status) -{ - int err = 0; - - blkif_evtchn = status->evtchn; - blkif_irq = bind_evtchn_to_irq(blkif_evtchn); - - err = request_irq(blkif_irq, blkif_int, SA_SAMPLE_RANDOM, "blkif", NULL); - if ( err ) - { - WPRINTK("request_irq failed (err=%d)\n", err); - return; - } - - if ( recovery ) - { - blkif_recover(); - } - else - { - /* Transition to connected in case we need to do - * a partition probe on a whole disk. */ - blkif_state = BLKIF_STATE_CONNECTED; - - /* Probe for discs attached to the interface. */ - xlvbd_init(); - } - - /* Kick pending requests. */ - spin_lock_irq(&blkif_io_lock); - kick_pending_request_queues(); - spin_unlock_irq(&blkif_io_lock); -} - -static void unexpected(blkif_fe_interface_status_t *status) -{ - DPRINTK(" Unexpected blkif status %u in state %u\n", - status->status, blkif_state); -} - -static void blkif_status(blkif_fe_interface_status_t *status) -{ -#ifdef CONFIG_XEN_BLKDEV_GRANT - rdomid = status->domid; /* need to set rdomid early */ -#endif - - if ( status->handle != blkif_handle ) - { - WPRINTK(" Invalid blkif: handle=%u\n", status->handle); - unexpected(status); - return; - } - - switch ( status->status ) - { - case BLKIF_INTERFACE_STATUS_CLOSED: - switch ( blkif_state ) - { - case BLKIF_STATE_CLOSED: - unexpected(status); - break; - case BLKIF_STATE_DISCONNECTED: - case BLKIF_STATE_CONNECTED: - unexpected(status); - blkif_close(); - break; - } - break; - - case BLKIF_INTERFACE_STATUS_DISCONNECTED: - switch ( blkif_state ) - { - case BLKIF_STATE_CLOSED: - blkif_disconnect(); - break; - case BLKIF_STATE_DISCONNECTED: - case BLKIF_STATE_CONNECTED: - /* unexpected(status); */ /* occurs during suspend/resume */ - blkif_reset(); - break; - } - break; - - case BLKIF_INTERFACE_STATUS_CONNECTED: - switch ( blkif_state ) - { - case BLKIF_STATE_CLOSED: - unexpected(status); - blkif_disconnect(); - blkif_connect(status); - break; - case BLKIF_STATE_DISCONNECTED: - blkif_connect(status); - break; - case BLKIF_STATE_CONNECTED: - unexpected(status); - blkif_connect(status); - break; - } - break; - - case BLKIF_INTERFACE_STATUS_CHANGED: - switch ( blkif_state ) - { - case BLKIF_STATE_CLOSED: - case BLKIF_STATE_DISCONNECTED: - unexpected(status); - break; - case BLKIF_STATE_CONNECTED: - vbd_update(); - break; - } - break; - - default: - WPRINTK(" Invalid blkif status: %d\n", status->status); - break; - } -} - - -static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) -{ - switch ( msg->subtype ) - { - case CMSG_BLKIF_FE_INTERFACE_STATUS: - blkif_status((blkif_fe_interface_status_t *) - &msg->msg[0]); - break; - default: - msg->length = 0; - break; - } - - ctrl_if_send_response(msg); -} - -int wait_for_blkif(void) -{ - int err = 0; - int i; - send_driver_status(1); - - /* - * We should read 'nr_interfaces' from response message and wait - * for notifications before proceeding. For now we assume that we - * will be notified of exactly one interface. 
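- * The loop below simply polls blkif_state for up to ten seconds until that notification moves us to BLKIF_STATE_CONNECTED.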
- */ - for ( i=0; (blkif_state != BLKIF_STATE_CONNECTED) && (i < 10*HZ); i++ ) - { - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(1); - } - - if ( blkif_state != BLKIF_STATE_CONNECTED ) - { - WPRINTK("Timeout connecting to device!\n"); - err = -ENOSYS; - } - return err; -} - -int __init xlblk_init(void) -{ - int i; - -#ifdef CONFIG_XEN_BLKDEV_GRANT - if ( 0 > gnttab_alloc_grant_references( MAXIMUM_OUTSTANDING_BLOCK_REQS, - &gref_head, &gref_terminal )) - return 1; - printk(KERN_ALERT "Blkif frontend is using grant tables.\n"); -#endif - - if ( (xen_start_info.flags & SIF_INITDOMAIN) || - (xen_start_info.flags & SIF_BLK_BE_DOMAIN) ) - return 0; - - IPRINTK("Initialising virtual block device driver\n"); - - blk_shadow_free = 0; - memset(blk_shadow, 0, sizeof(blk_shadow)); - for ( i = 0; i < BLK_RING_SIZE; i++ ) - blk_shadow[i].req.id = i+1; - blk_shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff; - - (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx, - CALLBACK_IN_BLOCKING_CONTEXT); - - wait_for_blkif(); - - return 0; -} - -void blkdev_suspend(void) -{ -} - -void blkdev_resume(void) -{ -#ifdef CONFIG_XEN_BLKDEV_GRANT - int i, j; - for ( i = 0; i < BLK_RING_SIZE; i++ ) - for ( j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++ ) - blk_shadow[i].req.frame_and_sects[j] |= GRANTREF_INVALID; -#endif - send_driver_status(1); -} - -static void blkif_completion(struct blk_shadow *s) -{ - int i; -#ifdef CONFIG_XEN_BLKDEV_GRANT - for ( i = 0; i < s->req.nr_segments; i++ ) - gnttab_release_grant_reference( - &gref_head, blkif_gref_from_fas(s->req.frame_and_sects[i])); -#else - /* This is a hack to get the dirty logging bits set */ - if ( s->req.operation == BLKIF_OP_READ ) - { - for ( i = 0; i < s->req.nr_segments; i++ ) - { - unsigned long pfn = s->req.frame_and_sects[i] >> PAGE_SHIFT; - unsigned long mfn = phys_to_machine_mapping[pfn]; - xen_machphys_update(mfn, pfn); - } - } -#endif -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/blkfront/block.h --- a/linux-2.6.11-xen-sparse/drivers/xen/blkfront/block.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,133 +0,0 @@ -/****************************************************************************** - * block.h - * - * Shared definitions between all levels of XenLinux Virtual block devices. - * - * Copyright (c) 2003-2004, Keir Fraser & Steve Hand - * Modifications by Mark A. Williamson are (c) Intel Research Cambridge - * Copyright (c) 2004-2005, Christian Limpach - * - * This file may be distributed separately from the Linux kernel, or - * incorporated into other software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/blkfront/block.h
--- a/linux-2.6.11-xen-sparse/drivers/xen/blkfront/block.h	Fri Jul 15 19:57:12 2005
+++ /dev/null	Sat Jul 16 14:02:54 2005
@@ -1,133 +0,0 @@
-/******************************************************************************
- * block.h
- *
- * Shared definitions between all levels of XenLinux Virtual block devices.
- *
- * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
- * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
- * Copyright (c) 2004-2005, Christian Limpach
- *
- * This file may be distributed separately from the Linux kernel, or
- * incorporated into other software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#ifndef __XEN_DRIVERS_BLOCK_H__
-#define __XEN_DRIVERS_BLOCK_H__
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/hdreg.h>
-#include <linux/blkdev.h>
-#include <linux/major.h>
-#include <linux/devfs_fs_kernel.h>
-#include <asm-xen/xen-public/xen.h>
-#include <asm-xen/xen-public/io/blkif.h>
-#include <asm-xen/xen-public/io/ring.h>
-#include <asm/io.h>
-#include <asm/atomic.h>
-#include <asm/uaccess.h>
-
-#if 1
-#define IPRINTK(fmt, args...) \
-    printk(KERN_INFO "xen_blk: " fmt, ##args)
-#else
-#define IPRINTK(fmt, args...) ((void)0)
-#endif
-
-#if 1
-#define WPRINTK(fmt, args...) \
-    printk(KERN_WARNING "xen_blk: " fmt, ##args)
-#else
-#define WPRINTK(fmt, args...) ((void)0)
-#endif
-
-#if 0
-#define DPRINTK(_f, _a...) printk ( KERN_ALERT _f , ## _a )
-#else
-#define DPRINTK(_f, _a...) ((void)0)
-#endif
-
-#if 0
-#define DPRINTK_IOCTL(_f, _a...) printk ( KERN_ALERT _f , ## _a )
-#else
-#define DPRINTK_IOCTL(_f, _a...) ((void)0)
-#endif
-
-struct xlbd_type_info {
-    int partn_shift;
-    int disks_per_major;
-    char *devname;
-    char *diskname;
-};
-
-/*
- * We have one of these per vbd, whether ide, scsi or 'other'. They
- * hang in private_data off the gendisk structure. We may end up
- * putting all kinds of interesting stuff here :-)
- */
-struct xlbd_major_info {
-    int major;
-    int index;
-    int usage;
-    struct xlbd_type_info *type;
-};
-
-struct xlbd_disk_info {
-    int xd_device;
-    struct xlbd_major_info *mi;
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
-    struct xlbd_disk_info *next_waiting;
-    request_queue_t *rq;
-#endif
-};
-
-typedef struct xen_block {
-    int usage;
-} xen_block_t;
-
-extern spinlock_t blkif_io_lock;
-
-extern int blkif_open(struct inode *inode, struct file *filep);
-extern int blkif_release(struct inode *inode, struct file *filep);
-extern int blkif_ioctl(struct inode *inode, struct file *filep,
-                       unsigned command, unsigned long argument);
-extern int blkif_check(dev_t dev);
-extern int blkif_revalidate(dev_t dev);
-extern void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp);
-#ifdef CONFIG_XEN_BLKDEV_GRANT
-extern void blkif_control_probe_send(
-    blkif_request_t *req, blkif_response_t *rsp, unsigned long address);
-#endif
-extern void do_blkif_request (request_queue_t *rq);
-
-extern void xlvbd_update_vbds(void);
-
-/* Virtual block-device subsystem. */
-extern int xlvbd_init(void);
-extern void xlvbd_cleanup(void);
-
-#endif /* __XEN_DRIVERS_BLOCK_H__ */
diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/blkfront/vbd.c
--- a/linux-2.6.11-xen-sparse/drivers/xen/blkfront/vbd.c	Fri Jul 15 19:57:12 2005
+++ /dev/null	Sat Jul 16 14:02:54 2005
@@ -1,500 +0,0 @@
-/******************************************************************************
- * vbd.c
- *
- * XenLinux virtual block-device driver (xvd).
- *
- * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
- * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
- * Copyright (c) 2004-2005, Christian Limpach
- *
- * This file may be distributed separately from the Linux kernel, or
- * incorporated into other software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "block.h"
-#include <linux/blkdev.h>
-#include <linux/list.h>
-
-/*
- * For convenience we distinguish between ide, scsi and 'other' (i.e.,
- * potentially combinations of the two) in the naming scheme and in a few other
- * places.
- */
-
-#define NUM_IDE_MAJORS 10
-#define NUM_SCSI_MAJORS 9
-#define NUM_VBD_MAJORS 1
-
-struct lvdisk
-{
-    blkif_sector_t capacity; /*  0: Size in terms of 512-byte sectors.   */
-    blkif_vdev_t   device;   /*  8: Device number (opaque 16 bit value). */
-    u16            info;
-    struct list_head list;
-};
-
-static struct xlbd_type_info xlbd_ide_type = {
-    .partn_shift = 6,
-    .disks_per_major = 2,
-    .devname = "ide",
-    .diskname = "hd",
-};
-
-static struct xlbd_type_info xlbd_scsi_type = {
-    .partn_shift = 4,
-    .disks_per_major = 16,
-    .devname = "sd",
-    .diskname = "sd",
-};
-
-static struct xlbd_type_info xlbd_vbd_type = {
-    .partn_shift = 4,
-    .disks_per_major = 16,
-    .devname = "xvd",
-    .diskname = "xvd",
-};
-
-static struct xlbd_major_info *major_info[NUM_IDE_MAJORS + NUM_SCSI_MAJORS +
-                                          NUM_VBD_MAJORS];
-
-#define XLBD_MAJOR_IDE_START    0
-#define XLBD_MAJOR_SCSI_START   (NUM_IDE_MAJORS)
-#define XLBD_MAJOR_VBD_START    (NUM_IDE_MAJORS + NUM_SCSI_MAJORS)
-
-#define XLBD_MAJOR_IDE_RANGE    XLBD_MAJOR_IDE_START ... XLBD_MAJOR_SCSI_START - 1
-#define XLBD_MAJOR_SCSI_RANGE   XLBD_MAJOR_SCSI_START ... XLBD_MAJOR_VBD_START - 1
-#define XLBD_MAJOR_VBD_RANGE    XLBD_MAJOR_VBD_START ... XLBD_MAJOR_VBD_START + NUM_VBD_MAJORS - 1
-
-/* Information about our VBDs. */
-#define MAX_VBDS 64
-struct list_head vbds_list;
-
-#define MAJOR_XEN(dev) ((dev)>>8)
-#define MINOR_XEN(dev) ((dev) & 0xff)
-
-static struct block_device_operations xlvbd_block_fops =
-{
-    .owner = THIS_MODULE,
-    .open = blkif_open,
-    .release = blkif_release,
-    .ioctl = blkif_ioctl,
-};
-
-spinlock_t blkif_io_lock = SPIN_LOCK_UNLOCKED;
-
-static struct lvdisk *xlvbd_device_alloc(void)
-{
-    struct lvdisk *disk;
-
-    disk = kmalloc(sizeof(*disk), GFP_KERNEL);
-    if (disk != NULL) {
-        memset(disk, 0, sizeof(*disk));
-        INIT_LIST_HEAD(&disk->list);
-    }
-    return disk;
-}
-
-static void xlvbd_device_free(struct lvdisk *disk)
-{
-    list_del(&disk->list);
-    kfree(disk);
-}
-
-static vdisk_t *xlvbd_probe(int *ret)
-{
-    blkif_response_t rsp;
-    blkif_request_t req;
-    vdisk_t *disk_info = NULL;
-    unsigned long buf;
-    int nr;
-
-    buf = __get_free_page(GFP_KERNEL);
-    if ((void *)buf == NULL)
-        goto out;
-
-    memset(&req, 0, sizeof(req));
-    req.operation = BLKIF_OP_PROBE;
-    req.nr_segments = 1;
-#ifdef CONFIG_XEN_BLKDEV_GRANT
-    blkif_control_probe_send(&req, &rsp,
-                             (unsigned long)(virt_to_machine(buf)));
-#else
-    req.frame_and_sects[0] = virt_to_machine(buf) | 7;
-
-    blkif_control_send(&req, &rsp);
-#endif
-    if ( rsp.status <= 0 ) {
-        WPRINTK("Could not probe disks (%d)\n", rsp.status);
-        goto out;
-    }
-    nr = rsp.status;
-    if ( nr > MAX_VBDS )
-        nr = MAX_VBDS;
-
-    disk_info = kmalloc(nr * sizeof(vdisk_t), GFP_KERNEL);
-    if (disk_info != NULL)
-        memcpy(disk_info, (void *) buf, nr * sizeof(vdisk_t));
-
-    if (ret != NULL)
-        *ret = nr;
-
-out:
-    free_page(buf);
-    return disk_info;
-}
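The non-grant probe path above packs a machine address and a sector range into the single frame_and_sects word: the page-aligned machine address sits in the upper bits, and the low bits carry the first and last sector numbers, so "| 7" means sectors 0 through 7, i.e. one whole 4KB page of 512-byte sectors. A rough decode, hedged because the exact field macros live in the blkif interface headers rather than in this patch (the grant-table build repurposes the field to carry a grant reference instead):

    /* Illustrative decode of frame_and_sects, assuming the classic layout:
     * bits 12+ = machine frame, bits 5:3 = first sector, bits 2:0 = last sector. */
    static inline unsigned long fas_frame(unsigned long fas)      { return fas & PAGE_MASK; }
    static inline unsigned int  fas_first_sect(unsigned long fas) { return (fas >> 3) & 7;  }
    static inline unsigned int  fas_last_sect(unsigned long fas)  { return fas & 7;         }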
-static struct xlbd_major_info *xlbd_alloc_major_info(
-    int major, int minor, int index)
-{
-    struct xlbd_major_info *ptr;
-
-    ptr = kmalloc(sizeof(struct xlbd_major_info), GFP_KERNEL);
-    if (ptr == NULL)
-        return NULL;
-
-    memset(ptr, 0, sizeof(struct xlbd_major_info));
-
-    ptr->major = major;
-
-    switch (index) {
-    case XLBD_MAJOR_IDE_RANGE:
-        ptr->type = &xlbd_ide_type;
-        ptr->index = index - XLBD_MAJOR_IDE_START;
-        break;
-    case XLBD_MAJOR_SCSI_RANGE:
-        ptr->type = &xlbd_scsi_type;
-        ptr->index = index - XLBD_MAJOR_SCSI_START;
-        break;
-    case XLBD_MAJOR_VBD_RANGE:
-        ptr->type = &xlbd_vbd_type;
-        ptr->index = index - XLBD_MAJOR_VBD_START;
-        break;
-    }
-
-    if (register_blkdev(ptr->major, ptr->type->devname)) {
-        WPRINTK("can't get major %d with name %s\n",
-                ptr->major, ptr->type->devname);
-        kfree(ptr);
-        return NULL;
-    }
-
-    devfs_mk_dir(ptr->type->devname);
-    major_info[index] = ptr;
-    return ptr;
-}
-
-static struct xlbd_major_info *xlbd_get_major_info(int device)
-{
-    int major, minor, index;
-
-    major = MAJOR_XEN(device);
-    minor = MINOR_XEN(device);
-
-    switch (major) {
-    case IDE0_MAJOR: index = 0; break;
-    case IDE1_MAJOR: index = 1; break;
-    case IDE2_MAJOR: index = 2; break;
-    case IDE3_MAJOR: index = 3; break;
-    case IDE4_MAJOR: index = 4; break;
-    case IDE5_MAJOR: index = 5; break;
-    case IDE6_MAJOR: index = 6; break;
-    case IDE7_MAJOR: index = 7; break;
-    case IDE8_MAJOR: index = 8; break;
-    case IDE9_MAJOR: index = 9; break;
-    case SCSI_DISK0_MAJOR: index = 10; break;
-    case SCSI_DISK1_MAJOR ... SCSI_DISK7_MAJOR:
-        index = 11 + major - SCSI_DISK1_MAJOR;
-        break;
-    case SCSI_CDROM_MAJOR: index = 18; break;
-    default: index = 19; break;
-    }
-
-    return ((major_info[index] != NULL) ? major_info[index] :
-            xlbd_alloc_major_info(major, minor, index));
-}
-
-static int xlvbd_init_blk_queue(struct gendisk *gd, vdisk_t *disk)
-{
-    request_queue_t *rq;
-
-    rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
-    if (rq == NULL)
-        return -1;
-
-    elevator_init(rq, "noop");
-
-    /* Hard sector size and max sectors impersonate the equiv. hardware. */
-    blk_queue_hardsect_size(rq, disk->sector_size);
-    blk_queue_max_sectors(rq, 512);
-
-    /* Each segment in a request is up to an aligned page in size. */
-    blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
-    blk_queue_max_segment_size(rq, PAGE_SIZE);
-
-    /* Ensure a merged request will fit in a single I/O ring slot. */
-    blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
-    blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
-
-    /* Make sure buffer addresses are sector-aligned. */
-    blk_queue_dma_alignment(rq, 511);
-
-    gd->queue = rq;
-
-    return 0;
-}
-
-struct gendisk *xlvbd_alloc_gendisk(
-    struct xlbd_major_info *mi, int minor, vdisk_t *disk)
-{
-    struct gendisk *gd;
-    struct xlbd_disk_info *di;
-    int nr_minors = 1;
-
-    di = kmalloc(sizeof(struct xlbd_disk_info), GFP_KERNEL);
-    if (di == NULL)
-        return NULL;
-    memset(di, 0, sizeof(*di));
-    di->mi = mi;
-    di->xd_device = disk->device;
-
-    if ((minor & ((1 << mi->type->partn_shift) - 1)) == 0)
-        nr_minors = 1 << mi->type->partn_shift;
-
-    gd = alloc_disk(nr_minors);
-    if (gd == NULL)
-        goto out;
-
-    if (nr_minors > 1)
-        sprintf(gd->disk_name, "%s%c", mi->type->diskname,
-                'a' + mi->index * mi->type->disks_per_major +
-                (minor >> mi->type->partn_shift));
-    else
-        sprintf(gd->disk_name, "%s%c%d", mi->type->diskname,
-                'a' + mi->index * mi->type->disks_per_major +
-                (minor >> mi->type->partn_shift),
-                minor & ((1 << mi->type->partn_shift) - 1));
-
-    gd->major = mi->major;
-    gd->first_minor = minor;
-    gd->fops = &xlvbd_block_fops;
-    gd->private_data = di;
-    set_capacity(gd, disk->capacity);
-
-    if (xlvbd_init_blk_queue(gd, disk)) {
-        del_gendisk(gd);
-        goto out;
-    }
-
-    di->rq = gd->queue;
-
-    if (disk->info & VDISK_READONLY)
-        set_disk_ro(gd, 1);
-
-    if (disk->info & VDISK_REMOVABLE)
-        gd->flags |= GENHD_FL_REMOVABLE;
-
-    if (disk->info & VDISK_CDROM)
-        gd->flags |= GENHD_FL_CD;
-
-    add_disk(gd);
-
-    return gd;
-
-out:
-    kfree(di);
-    return NULL;
-}
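xlvbd_alloc_gendisk() derives the Linux device name purely arithmetically: the disk letter is 'a' plus the disk's ordinal (the major's index within its type, times disks-per-major, plus the minor shifted down by partn_shift), and a whole-disk minor gets no partition suffix. A worked example under the IDE parameters above (partn_shift = 6, disks_per_major = 2); the function and its name are illustrative only:

    /* Name computation for the IDE type: 64 minors per disk, 2 disks per major. */
    void example_ide_name(int index /* major's index */, int minor, char *buf)
    {
        int disk = index * 2 + (minor >> 6);        /* ordinal across IDE majors */
        int part = minor & ((1 << 6) - 1);          /* partition within the disk */
        if (part == 0)
            sprintf(buf, "hd%c", 'a' + disk);       /* minor 64 -> "hdb"  */
        else
            sprintf(buf, "hd%c%d", 'a' + disk, part); /* minor 65 -> "hdb1" */
    }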
-static int xlvbd_device_add(struct list_head *list, vdisk_t *disk)
-{
-    struct lvdisk *new;
-    int minor;
-    dev_t device;
-    struct block_device *bd;
-    struct gendisk *gd;
-    struct xlbd_major_info *mi;
-
-    mi = xlbd_get_major_info(disk->device);
-    if (mi == NULL)
-        return -EPERM;
-
-    new = xlvbd_device_alloc();
-    if (new == NULL)
-        return -1;
-    new->capacity = disk->capacity;
-    new->device = disk->device;
-    new->info = disk->info;
-
-    minor = MINOR_XEN(disk->device);
-    device = MKDEV(mi->major, minor);
-
-    bd = bdget(device);
-    if (bd == NULL)
-        goto out;
-
-    gd = xlvbd_alloc_gendisk(mi, minor, disk);
-    if (gd == NULL)
-        goto out_bd;
-
-    list_add(&new->list, list);
-out_bd:
-    bdput(bd);
-out:
-    return 0;
-}
-
-static int xlvbd_device_del(struct lvdisk *disk)
-{
-    dev_t device;
-    struct block_device *bd;
-    struct gendisk *gd;
-    struct xlbd_disk_info *di;
-    int ret = 0, unused;
-    request_queue_t *rq;
-
-    device = MKDEV(MAJOR_XEN(disk->device), MINOR_XEN(disk->device));
-
-    bd = bdget(device);
-    if (bd == NULL)
-        return -1;
-
-    gd = get_gendisk(device, &unused);
-    di = gd->private_data;
-
-    if (di->mi->usage != 0) {
-        WPRINTK("disk removal failed: used [dev=%x]\n", device);
-        ret = -1;
-        goto out;
-    }
-
-    rq = gd->queue;
-    del_gendisk(gd);
-    put_disk(gd);
-    blk_cleanup_queue(rq);
-
-    xlvbd_device_free(disk);
-out:
-    bdput(bd);
-    return ret;
-}
-
-static int xlvbd_device_update(struct lvdisk *ldisk, vdisk_t *disk)
-{
-    dev_t device;
-    struct block_device *bd;
-    struct gendisk *gd;
-    int unused;
-
-    if ((ldisk->capacity == disk->capacity) && (ldisk->info == disk->info))
-        return 0;
-
-    device = MKDEV(MAJOR_XEN(ldisk->device), MINOR_XEN(ldisk->device));
-
-    bd = bdget(device);
-    if (bd == NULL)
-        return -1;
-
-    gd = get_gendisk(device, &unused);
-    set_capacity(gd, disk->capacity);
-    ldisk->capacity = disk->capacity;
-
-    bdput(bd);
-
-    return 0;
-}
-
-void xlvbd_refresh(void)
-{
-    vdisk_t *newdisks;
-    struct list_head *tmp, *tmp2;
-    struct lvdisk *disk;
-    int i, nr;
-
-    newdisks = xlvbd_probe(&nr);
-    if (newdisks == NULL) {
-        WPRINTK("failed to probe\n");
-        return;
-    }
-
-    i = 0;
-    list_for_each_safe(tmp, tmp2, &vbds_list) {
-        disk = list_entry(tmp, struct lvdisk, list);
-
-        for (i = 0; i < nr; i++) {
-            if ( !newdisks[i].device )
-                continue;
-            if ( disk->device == newdisks[i].device ) {
-                xlvbd_device_update(disk, &newdisks[i]);
-                newdisks[i].device = 0;
-                break;
-            }
-        }
-        if (i == nr) {
-            xlvbd_device_del(disk);
-            newdisks[i].device = 0;
-        }
-    }
-    for (i = 0; i < nr; i++)
-        if ( newdisks[i].device )
-            xlvbd_device_add(&vbds_list, &newdisks[i]);
-    kfree(newdisks);
-}
-
-/*
- * xlvbd_update_vbds - reprobes the VBD status and updates the driver
- * state accordingly. The VBDs need to be updated in this way when the domain is
- * initialised and also each time we receive an XLBLK_UPDATE event.
- */
-void xlvbd_update_vbds(void)
-{
-    xlvbd_refresh();
-}
-
-/*
- * Set up all the linux device goop for the virtual block devices
- * (vbd's) that we know about. Note that although from the backend
- * driver's p.o.v. VBDs are addressed simply as an opaque 16-bit device
- * number, the domain creation tools conventionally allocate these
- * numbers to correspond to those used by 'real' linux -- this is just
- * for convenience as it means e.g. that the same /etc/fstab can be
- * used when booting with or without Xen.
- */
-int xlvbd_init(void)
-{
-    int i, nr;
-    vdisk_t *disks;
-
-    INIT_LIST_HEAD(&vbds_list);
-
-    memset(major_info, 0, sizeof(major_info));
-
-    disks = xlvbd_probe(&nr);
-    if (disks == NULL) {
-        WPRINTK("failed to probe\n");
-        return -1;
-    }
-
-    for (i = 0; i < nr; i++)
-        xlvbd_device_add(&vbds_list, &disks[i]);
-
-    kfree(disks);
-    return 0;
-}
diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/blktap/Makefile
--- a/linux-2.6.11-xen-sparse/drivers/xen/blktap/Makefile	Fri Jul 15 19:57:12 2005
+++ /dev/null	Sat Jul 16 14:02:54 2005
@@ -1,3 +0,0 @@
-
-obj-y	:= blktap_userdev.o blktap_datapath.o blktap_controlmsg.o blktap.o
-
diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap.c
--- a/linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap.c	Fri Jul 15 19:57:12 2005
+++ /dev/null	Sat Jul 16 14:02:54 2005
@@ -1,87 +0,0 @@
-/******************************************************************************
- * blktap.c
- *
- * XenLinux virtual block-device tap.
- *
- * Copyright (c) 2004, Andrew Warfield
- *
- * Based on the original split block driver:
- * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
- * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
- * Copyright (c) 2004, Christian Limpach
- *
- * Note that unlike the split block driver code, this driver has been developed
- * strictly for Linux 2.6
- */
-
-#include "blktap.h"
-
-int __init xlblktap_init(void)
-{
-    ctrl_msg_t               cmsg;
-    blkif_fe_driver_status_t fe_st;
-    blkif_be_driver_status_t be_st;
-
-    printk(KERN_INFO "Initialising Xen block tap device\n");
-
-    DPRINTK("   tap - Backend connection init:\n");
-
-    (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx,
-                                    CALLBACK_IN_BLOCKING_CONTEXT);
-
-    /* Send a driver-UP notification to the domain controller. */
-    cmsg.type      = CMSG_BLKIF_FE;
-    cmsg.subtype   = CMSG_BLKIF_FE_DRIVER_STATUS;
-    cmsg.length    = sizeof(blkif_fe_driver_status_t);
-    fe_st.status   = BLKIF_DRIVER_STATUS_UP;
-    memcpy(cmsg.msg, &fe_st, sizeof(fe_st));
-    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
-
-    DPRINTK("   tap - Frontend connection init:\n");
-
-    active_reqs_init();
-    blkif_interface_init();
-    blkdev_schedule_init();
-
-    (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx,
-                                    CALLBACK_IN_BLOCKING_CONTEXT);
-
-    /* Send a driver-UP notification to the domain controller. */
-    cmsg.type      = CMSG_BLKIF_BE;
-    cmsg.subtype   = CMSG_BLKIF_BE_DRIVER_STATUS;
-    cmsg.length    = sizeof(blkif_be_driver_status_t);
-    be_st.status   = BLKIF_DRIVER_STATUS_UP;
-    memcpy(cmsg.msg, &be_st, sizeof(be_st));
-    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
-
-    DPRINTK("   tap - Userland channel init:\n");
-
-    blktap_init();
-
-    DPRINTK("Blkif tap device initialized.\n");
-
-    return 0;
-}
-
-#if 0 /* tap doesn't handle suspend/resume */
-void blkdev_suspend(void)
-{
-}
-
-void blkdev_resume(void)
-{
-    ctrl_msg_t               cmsg;
-    blkif_fe_driver_status_t st;
-
-    /* Send a driver-UP notification to the domain controller. */
-    cmsg.type      = CMSG_BLKIF_FE;
-    cmsg.subtype   = CMSG_BLKIF_FE_DRIVER_STATUS;
-    cmsg.length    = sizeof(blkif_fe_driver_status_t);
-    st.status      = BLKIF_DRIVER_STATUS_UP;
-    memcpy(cmsg.msg, &st, sizeof(st));
-    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
-}
-#endif
-
-__initcall(xlblktap_init);
diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap.h
--- a/linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap.h	Fri Jul 15 19:57:12 2005
+++ /dev/null	Sat Jul 16 14:02:54 2005
@@ -1,253 +0,0 @@
-/*
- * blktap.h
- *
- * Interfaces for the Xen block tap driver.
- *
- * (c) 2004, Andrew Warfield, University of Cambridge
- *
- */
-
-#ifndef __BLKTAP_H__
-#define __BLKTAP_H__
-
-#include <linux/version.h>
-#include <linux/blkdev.h>
-#include <linux/config.h>
-#include <linux/sched.h>
-#include <linux/interrupt.h>
-#include <asm-xen/ctrl_if.h>
-#include <linux/slab.h>
-#include <linux/blkdev.h>
-#include <asm/io.h>
-#include <asm/setup.h>
-#include <asm/pgalloc.h>
-#include <asm-xen/hypervisor.h>
-#include <asm-xen/xen-public/io/blkif.h>
-#include <asm-xen/xen-public/io/ring.h>
-
-/* Used to signal to the backend that this is a tap domain. */
-#define BLKTAP_COOKIE 0xbeadfeed
-
-/* -------[ debug / pretty printing ]--------------------------------- */
-
-#define PRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \
-                           __FILE__ , __LINE__ , ## _a )
-#if 0
-#define DPRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \
-                           __FILE__ , __LINE__ , ## _a )
-#else
-#define DPRINTK(_f, _a...) ((void)0)
-#endif
-
-#if 1
-#define ASSERT(_p) \
-    if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \
-    __LINE__, __FILE__); *(int*)0=0; }
-#else
-#define ASSERT(_p) ((void)0)
-#endif
-
-#define WPRINTK(fmt, args...) printk(KERN_WARNING "blk_tap: " fmt, ##args)
-
-
-/* -------[ state descriptors ]--------------------------------------- */
-
-#define BLKIF_STATE_CLOSED       0
-#define BLKIF_STATE_DISCONNECTED 1
-#define BLKIF_STATE_CONNECTED    2
-
-/* -------[ connection tracking ]------------------------------------- */
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
-#define VMALLOC_VMADDR(x) ((unsigned long)(x))
-#endif
-
-extern spinlock_t blkif_io_lock;
-
-typedef struct blkif_st {
-    /* Unique identifier for this interface. */
-    domid_t          domid;
-    unsigned int     handle;
-    /* Physical parameters of the comms window. */
-    unsigned long    shmem_frame;
-    unsigned int     evtchn;
-    int              irq;
-    /* Comms information. */
-    blkif_back_ring_t blk_ring;
-
-    enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
-    /*
-     * DISCONNECT response is deferred until pending requests are ack'ed.
-     * We therefore need to store the id from the original request.
-     */
-    u8               disconnect_rspid;
-    struct blkif_st *hash_next;
-    struct list_head blkdev_list;
-    spinlock_t       blk_ring_lock;
-    atomic_t         refcnt;
-    struct work_struct work;
-} blkif_t;
-
-blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle);
-void blkif_disconnect_complete(blkif_t *blkif);
-#define blkif_get(_b) (atomic_inc(&(_b)->refcnt))
-#define blkif_put(_b)                             \
-    do {                                          \
-        if ( atomic_dec_and_test(&(_b)->refcnt) ) \
-            blkif_disconnect_complete(_b);        \
-    } while (0)
-
-
-/* -------[ active request tracking ]--------------------------------- */
-
-typedef struct {
-    blkif_t       *blkif;
-    unsigned long  id;
-    int            nr_pages;
-    unsigned long  mach_fas[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-    unsigned long  virt_fas[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-    int            next_free;
-} active_req_t;
-
-typedef unsigned int ACTIVE_RING_IDX;
-
-active_req_t *lookup_active_req(ACTIVE_RING_IDX idx);
-
-extern inline unsigned int ID_TO_IDX(unsigned long id)
-{
-    return ( id & 0x0000ffff );
-}
-
-extern inline domid_t ID_TO_DOM(unsigned long id)
-{
-    return (id >> 16);
-}
-
-void active_reqs_init(void);
-
-/* -------[ interposition -> character device interface ]------------- */
-
-/* /dev/xen/blktap resides at device number major=10, minor=202 */
-#define BLKTAP_MINOR 202
-
-/* size of the extra VMA area to map in attached pages. */
-#define BLKTAP_VMA_PAGES BLKIF_RING_SIZE
-
-/* blktap IOCTLs: */
-#define BLKTAP_IOCTL_KICK_FE     1
-#define BLKTAP_IOCTL_KICK_BE     2
-#define BLKTAP_IOCTL_SETMODE     3
-#define BLKTAP_IOCTL_PRINT_IDXS  100
-
-/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE)             */
-#define BLKTAP_MODE_PASSTHROUGH  0x00000000  /* default            */
-#define BLKTAP_MODE_INTERCEPT_FE 0x00000001
-#define BLKTAP_MODE_INTERCEPT_BE 0x00000002
-#define BLKTAP_MODE_COPY_FE      0x00000004
-#define BLKTAP_MODE_COPY_BE      0x00000008
-#define BLKTAP_MODE_COPY_FE_PAGES 0x00000010
-#define BLKTAP_MODE_COPY_BE_PAGES 0x00000020
-
-#define BLKTAP_MODE_INTERPOSE \
-           (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
-
-#define BLKTAP_MODE_COPY_BOTH \
-           (BLKTAP_MODE_COPY_FE | BLKTAP_MODE_COPY_BE)
-
-#define BLKTAP_MODE_COPY_BOTH_PAGES \
-           (BLKTAP_MODE_COPY_FE_PAGES | BLKTAP_MODE_COPY_BE_PAGES)
-
-static inline int BLKTAP_MODE_VALID(unsigned long arg)
-{
-    return (
-        ( arg == BLKTAP_MODE_PASSTHROUGH  ) ||
-        ( arg == BLKTAP_MODE_INTERCEPT_FE ) ||
-        ( arg == BLKTAP_MODE_INTERCEPT_BE ) ||
-        ( arg == BLKTAP_MODE_INTERPOSE    ) ||
-        ( (arg & ~BLKTAP_MODE_COPY_FE_PAGES) == BLKTAP_MODE_COPY_FE ) ||
-        ( (arg & ~BLKTAP_MODE_COPY_BE_PAGES) == BLKTAP_MODE_COPY_BE ) ||
-        ( (arg & ~BLKTAP_MODE_COPY_BOTH_PAGES) == BLKTAP_MODE_COPY_BOTH )
-        );
-}
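BLKTAP_MODE_VALID() accepts only the combinations that make sense: pure pass-through, interception on either or both sides, or a copy mode optionally extended with its page-copy variant; anything that mixes intercept and copy flags across sides is rejected. For instance (a sketch against the macros defined above, values worked out by hand):

    /* Illustrative results of BLKTAP_MODE_VALID(). */
    static void mode_examples(void)
    {
        BLKTAP_MODE_VALID(BLKTAP_MODE_PASSTHROUGH);                         /* 1 */
        BLKTAP_MODE_VALID(BLKTAP_MODE_COPY_FE | BLKTAP_MODE_COPY_FE_PAGES); /* 1 */
        BLKTAP_MODE_VALID(BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_COPY_BE);  /* 0 */
    }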
-
-
-/* -------[ Mappings to User VMA ]------------------------------------ */
-#define MAX_PENDING_REQS 64
-#define BATCH_PER_DOMAIN 16
-extern struct vm_area_struct *blktap_vma;
-
-/* The following are from blkback.c and should probably be put in a
- * header and included from there.
- * The mmap area described here is where attached data pages will be mapped.
- */
-
-extern unsigned long mmap_vstart;
-#define MMAP_PAGES_PER_REQUEST \
-    (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)
-#define MMAP_PAGES             \
-    (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST)
-#define MMAP_VADDR(_req,_seg)                        \
-    (mmap_vstart +                                   \
-     ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
-     ((_seg) * PAGE_SIZE))
-
-/* immediately before the mmap area, we have a bunch of pages reserved
- * for shared memory rings.
- */
-
-#define RING_PAGES 3 /* Ctrl, Front, and Back */
-extern unsigned long rings_vstart;
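The user mapping is laid out as RING_PAGES of control/ring pages followed by one fixed-size slot of MMAP_PAGES_PER_REQUEST pages per outstanding request, so MMAP_VADDR() is pure arithmetic. A quick worked example; the constants are assumptions for illustration (BLKIF_MAX_SEGMENTS_PER_REQUEST comes from the blkif headers, not this patch):

    /* Assuming PAGE_SIZE 4096 and 11 segments per request (12 pages per slot). */
    unsigned long example_mmap_vaddr(unsigned long vstart, int req, int seg)
    {
        return vstart + ((req * 12 + seg) * 4096UL);
        /* e.g. MMAP_VADDR(2, 3) == vstart + (2*12 + 3)*4096 == vstart + 0x1B000 */
    }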
-
-
-/* -------[ Here be globals ]----------------------------------------- */
-extern unsigned long blktap_mode;
-
-/* Connection to a single backend domain. */
-extern blkif_front_ring_t blktap_be_ring;
-extern unsigned int blktap_be_evtchn;
-extern unsigned int blktap_be_state;
-
-/* User ring status. */
-extern unsigned long blktap_ring_ok;
-
-/* -------[ ...and function prototypes. ]----------------------------- */
-
-/* init function for character device interface.                       */
-int blktap_init(void);
-
-/* init function for the blkif cache. */
-void __init blkif_interface_init(void);
-void __init blkdev_schedule_init(void);
-void blkif_deschedule(blkif_t *blkif);
-
-/* interfaces to the char driver, passing messages to and from apps.   */
-void blktap_kick_user(void);
-
-/* user ring access functions: */
-int blktap_write_fe_ring(blkif_request_t *req);
-int blktap_write_be_ring(blkif_response_t *rsp);
-int blktap_write_ctrl_ring(ctrl_msg_t *msg);
-
-/* fe/be ring access functions: */
-int write_resp_to_fe_ring(blkif_t *blkif, blkif_response_t *rsp);
-int write_req_to_be_ring(blkif_request_t *req);
-
-/* event notification functions */
-void kick_fe_domain(blkif_t *blkif);
-void kick_be_domain(void);
-
-/* Interrupt handlers. */
-irqreturn_t blkif_ptbe_int(int irq, void *dev_id,
-                                  struct pt_regs *ptregs);
-irqreturn_t blkif_ptfe_int(int irq, void *dev_id, struct pt_regs *regs);
-
-/* Control message receiver. */
-extern void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id);
-
-/* debug */
-void print_fe_ring_idxs(void);
-void print_be_ring_idxs(void);
-
-#define __BLKINT_H__
-#endif
diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap_controlmsg.c
--- a/linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap_controlmsg.c	Fri Jul 15 19:57:12 2005
+++ /dev/null	Sat Jul 16 14:02:54 2005
@@ -1,540 +0,0 @@
-/******************************************************************************
- * blktap_controlmsg.c
- *
- * XenLinux virtual block-device tap.
- * Control interfaces to the frontend and backend drivers.
- *
- * Copyright (c) 2004, Andrew Warfield
- *
- */
-
-#include "blktap.h"
-
-static char *blkif_state_name[] = {
-    [BLKIF_STATE_CLOSED]       = "closed",
-    [BLKIF_STATE_DISCONNECTED] = "disconnected",
-    [BLKIF_STATE_CONNECTED]    = "connected",
-};
-
-static char *blkif_status_name[] = {
-    [BLKIF_INTERFACE_STATUS_CLOSED]       = "closed",
-    [BLKIF_INTERFACE_STATUS_DISCONNECTED] = "disconnected",
-    [BLKIF_INTERFACE_STATUS_CONNECTED]    = "connected",
-    [BLKIF_INTERFACE_STATUS_CHANGED]      = "changed",
-};
-
-static unsigned blktap_be_irq;
-unsigned int    blktap_be_state = BLKIF_STATE_CLOSED;
-unsigned int    blktap_be_evtchn;
-
-/*-----[ Control Messages to/from Frontend VMs ]--------------------------*/
-
-#define BLKIF_HASHSZ 1024
-#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1))
-
-static kmem_cache_t *blkif_cachep;
-static blkif_t      *blkif_hash[BLKIF_HASHSZ];
-
-blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle)
-{
-    blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)];
-    while ( (blkif != NULL) &&
-            ((blkif->domid != domid) || (blkif->handle != handle)) )
-        blkif = blkif->hash_next;
-    return blkif;
-}
-
-static void __blkif_disconnect_complete(void *arg)
-{
-    blkif_t              *blkif = (blkif_t *)arg;
-    ctrl_msg_t            cmsg;
-    blkif_be_disconnect_t disc;
-
-    /*
-     * These can't be done in blkif_disconnect() because at that point there
-     * may be outstanding requests at the disc whose asynchronous responses
-     * must still be notified to the remote driver.
-     */
-    unbind_evtchn_from_irq(blkif->evtchn);
-    vfree(blkif->blk_ring.sring);
-
-    /* Construct the deferred response message. */
-    cmsg.type         = CMSG_BLKIF_BE;
-    cmsg.subtype      = CMSG_BLKIF_BE_DISCONNECT;
-    cmsg.id           = blkif->disconnect_rspid;
-    cmsg.length       = sizeof(blkif_be_disconnect_t);
-    disc.domid        = blkif->domid;
-    disc.blkif_handle = blkif->handle;
-    disc.status       = BLKIF_BE_STATUS_OKAY;
-    memcpy(cmsg.msg, &disc, sizeof(disc));
-
-    /*
-     * Make sure message is constructed /before/ status change, because
-     * after the status change the 'blkif' structure could be deallocated at
-     * any time. Also make sure we send the response /after/ status change,
-     * as otherwise a subsequent CONNECT request could spuriously fail if
-     * another CPU doesn't see the status change yet.
-     */
-    mb();
-    if ( blkif->status != DISCONNECTING )
-        BUG();
-    blkif->status = DISCONNECTED;
-    mb();
-
-    /* Send the successful response. */
-    ctrl_if_send_response(&cmsg);
-}
-
-void blkif_disconnect_complete(blkif_t *blkif)
-{
-    INIT_WORK(&blkif->work, __blkif_disconnect_complete, (void *)blkif);
-    schedule_work(&blkif->work);
-}
-
-void blkif_ptfe_create(blkif_be_create_t *create)
-{
-    blkif_t      *blkif, **pblkif;
-    domid_t       domid  = create->domid;
-    unsigned int  handle = create->blkif_handle;
-
-
-    /* May want to store info on the connecting domain here. */
-
-    DPRINTK("PT got BE_CREATE\n");
-
-    if ( (blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL)) == NULL )
-    {
-        WPRINTK("Could not create blkif: out of memory\n");
-        create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
-        return;
-    }
-
-    /* blkif struct init code from blkback.c */
-    memset(blkif, 0, sizeof(*blkif));
-    blkif->domid  = domid;
-    blkif->handle = handle;
-    blkif->status = DISCONNECTED;
-    spin_lock_init(&blkif->blk_ring_lock);
-    atomic_set(&blkif->refcnt, 0);
-
-    pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
-    while ( *pblkif != NULL )
-    {
-        if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) )
-        {
-            WPRINTK("Could not create blkif: already exists\n");
-            create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS;
-            kmem_cache_free(blkif_cachep, blkif);
-            return;
-        }
-        pblkif = &(*pblkif)->hash_next;
-    }
-
-    blkif->hash_next = *pblkif;
-    *pblkif = blkif;
-
-    create->status = BLKIF_BE_STATUS_OKAY;
-}
-
-
-void blkif_ptfe_destroy(blkif_be_destroy_t *destroy)
-{
-    /* Clear anything that we initialized above. */
-
-    domid_t       domid  = destroy->domid;
-    unsigned int  handle = destroy->blkif_handle;
-    blkif_t     **pblkif, *blkif;
-
-    DPRINTK("PT got BE_DESTROY\n");
-
-    pblkif = &blkif_hash[BLKIF_HASH(domid, handle)];
-    while ( (blkif = *pblkif) != NULL )
-    {
-        if ( (blkif->domid == domid) && (blkif->handle == handle) )
-        {
-            if ( blkif->status != DISCONNECTED )
-                goto still_connected;
-            goto destroy;
-        }
-        pblkif = &blkif->hash_next;
-    }
-
-    destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
-    return;
-
- still_connected:
-    destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
-    return;
-
- destroy:
-    *pblkif = blkif->hash_next;
-    kmem_cache_free(blkif_cachep, blkif);
-    destroy->status = BLKIF_BE_STATUS_OKAY;
-}
-
-void blkif_ptfe_connect(blkif_be_connect_t *connect)
-{
-    domid_t        domid       = connect->domid;
-    unsigned int   handle      = connect->blkif_handle;
-    unsigned int   evtchn      = connect->evtchn;
-    unsigned long  shmem_frame = connect->shmem_frame;
-    struct vm_struct *vma;
-    pgprot_t       prot;
-    int            error;
-    blkif_t       *blkif;
-    blkif_sring_t *sring;
-
-    DPRINTK("PT got BE_CONNECT\n");
-
-    blkif = blkif_find_by_handle(domid, handle);
-    if ( unlikely(blkif == NULL) )
-    {
-        WPRINTK("blkif_connect attempted for non-existent blkif (%u,%u)\n",
-                connect->domid, connect->blkif_handle);
-        connect->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
-        return;
-    }
-
-    if ( (vma = get_vm_area(PAGE_SIZE, VM_IOREMAP)) == NULL )
-    {
-        connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
-        return;
-    }
-
-    prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED);
-    error = direct_remap_area_pages(&init_mm, VMALLOC_VMADDR(vma->addr),
-                                    shmem_frame<<PAGE_SHIFT, PAGE_SIZE,
-                                    prot, domid);
-    if ( error != 0 )
-    {
-        WPRINTK("BE_CONNECT: error! (%d)\n", error);
-        if ( error == -ENOMEM )
-            connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
-        else if ( error == -EFAULT ) {
-            connect->status = BLKIF_BE_STATUS_MAPPING_ERROR;
-            WPRINTK("BE_CONNECT: MAPPING error!\n");
-        }
-        else
-            connect->status = BLKIF_BE_STATUS_ERROR;
-        vfree(vma->addr);
-        return;
-    }
-
-    if ( blkif->status != DISCONNECTED )
-    {
-        connect->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
-        vfree(vma->addr);
-        return;
-    }
-
-    sring = (blkif_sring_t *)vma->addr;
-    SHARED_RING_INIT(sring);
-    BACK_RING_INIT(&blkif->blk_ring, sring, PAGE_SIZE);
-
-    blkif->evtchn      = evtchn;
-    blkif->irq         = bind_evtchn_to_irq(evtchn);
-    blkif->shmem_frame = shmem_frame;
-    blkif->status      = CONNECTED;
-    blkif_get(blkif);
-
-    request_irq(blkif->irq, blkif_ptfe_int, 0, "blkif-pt-backend", blkif);
-
-    connect->status = BLKIF_BE_STATUS_OKAY;
-}
-
-int blkif_ptfe_disconnect(blkif_be_disconnect_t *disconnect, u8 rsp_id)
-{
-    domid_t       domid  = disconnect->domid;
-    unsigned int  handle = disconnect->blkif_handle;
-    blkif_t      *blkif;
-
-    DPRINTK("PT got BE_DISCONNECT\n");
-
-    blkif = blkif_find_by_handle(domid, handle);
-    if ( unlikely(blkif == NULL) )
-    {
-        WPRINTK("blkif_disconnect attempted for non-existent blkif"
-                " (%u,%u)\n", disconnect->domid, disconnect->blkif_handle);
-        disconnect->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
-        return 1; /* Caller will send response error message. */
-    }
-
-    if ( blkif->status == CONNECTED )
-    {
-        blkif->status = DISCONNECTING;
-        blkif->disconnect_rspid = rsp_id;
-        wmb(); /* Let other CPUs see the status change. */
-        free_irq(blkif->irq, blkif);
-        blkif_deschedule(blkif);
-        blkif_put(blkif);
-        return 0; /* Caller should not send response message. */
-    }
-
-    disconnect->status = BLKIF_BE_STATUS_OKAY;
-    return 1;
-}
-
-/*-----[ Control Messages to/from Backend VM ]----------------------------*/
-
-/* Tell the controller to bring up the interface. */
-static void blkif_ptbe_send_interface_connect(void)
-{
-    ctrl_msg_t cmsg = {
-        .type    = CMSG_BLKIF_FE,
-        .subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT,
-        .length  = sizeof(blkif_fe_interface_connect_t),
-    };
-    blkif_fe_interface_connect_t *msg = (void*)cmsg.msg;
-    msg->handle      = 0;
-    msg->shmem_frame = virt_to_machine(blktap_be_ring.sring) >> PAGE_SHIFT;
-
-    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
-}
-
-static void blkif_ptbe_close(void)
-{
-}
-
-/* Move from CLOSED to DISCONNECTED state. */
-static void blkif_ptbe_disconnect(void)
-{
-    blkif_sring_t *sring;
-
-    sring = (blkif_sring_t *)__get_free_page(GFP_KERNEL);
-    SHARED_RING_INIT(sring);
-    FRONT_RING_INIT(&blktap_be_ring, sring, PAGE_SIZE);
-    blktap_be_state = BLKIF_STATE_DISCONNECTED;
-    DPRINTK("Blkif-Passthrough-BE is now DISCONNECTED.\n");
-    blkif_ptbe_send_interface_connect();
-}
-
-static void blkif_ptbe_connect(blkif_fe_interface_status_t *status)
-{
-    int err = 0;
-
-    blktap_be_evtchn = status->evtchn;
-    blktap_be_irq    = bind_evtchn_to_irq(blktap_be_evtchn);
-
-    err = request_irq(blktap_be_irq, blkif_ptbe_int,
-                      SA_SAMPLE_RANDOM, "blkif", NULL);
-    if ( err ) {
-        WPRINTK("blkfront request_irq failed (%d)\n", err);
-        return;
-    } else {
-        /* transition to connected in case we need to do a
-           partition probe on a whole disk */
-        blktap_be_state = BLKIF_STATE_CONNECTED;
-    }
-}
-
-static void unexpected(blkif_fe_interface_status_t *status)
-{
-    WPRINTK(" TAP: Unexpected blkif status %s in state %s\n",
-            blkif_status_name[status->status],
-            blkif_state_name[blktap_be_state]);
-}
-
-static void blkif_ptbe_status(
-    blkif_fe_interface_status_t *status)
-{
-    if ( status->handle != 0 )
-    {
-        DPRINTK("Status change on unsupported blkif %d\n",
-                status->handle);
-        return;
-    }
-
-    DPRINTK("ptbe_status: got %s\n", blkif_status_name[status->status]);
-
-    switch ( status->status )
-    {
-    case BLKIF_INTERFACE_STATUS_CLOSED:
-        switch ( blktap_be_state )
-        {
-        case BLKIF_STATE_CLOSED:
-            unexpected(status);
-            break;
-        case BLKIF_STATE_DISCONNECTED:
-        case BLKIF_STATE_CONNECTED:
-            unexpected(status);
-            blkif_ptbe_close();
-            break;
-        }
-        break;
-
-    case BLKIF_INTERFACE_STATUS_DISCONNECTED:
-        switch ( blktap_be_state )
-        {
-        case BLKIF_STATE_CLOSED:
-            blkif_ptbe_disconnect();
-            break;
-        case BLKIF_STATE_DISCONNECTED:
-        case BLKIF_STATE_CONNECTED:
-            printk(KERN_ALERT "*** add recovery code to the tap driver. ***\n");
-            unexpected(status);
-            break;
-        }
-        break;
-
-    case BLKIF_INTERFACE_STATUS_CONNECTED:
-        switch ( blktap_be_state )
-        {
-        case BLKIF_STATE_CLOSED:
-            unexpected(status);
-            blkif_ptbe_disconnect();
-            blkif_ptbe_connect(status);
-            break;
-        case BLKIF_STATE_DISCONNECTED:
-            blkif_ptbe_connect(status);
-            break;
-        case BLKIF_STATE_CONNECTED:
-            unexpected(status);
-            blkif_ptbe_connect(status);
-            break;
-        }
-        break;
-
-    case BLKIF_INTERFACE_STATUS_CHANGED:
-        switch ( blktap_be_state )
-        {
-        case BLKIF_STATE_CLOSED:
-        case BLKIF_STATE_DISCONNECTED:
-            unexpected(status);
-            break;
-        case BLKIF_STATE_CONNECTED:
-            /* vbd_update(); */
-            /* tap doesn't really get state changes... */
-            unexpected(status);
-            break;
-        }
-        break;
-
-    default:
-        DPRINTK("Status change to unknown value %d\n", status->status);
-        break;
-    }
-}
-
-/*-----[ All control messages enter here: ]-------------------------------*/
-
-void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
-{
-    switch ( msg->type )
-    {
-    case CMSG_BLKIF_FE:
-
-        switch ( msg->subtype )
-        {
-        case CMSG_BLKIF_FE_INTERFACE_STATUS:
-            blkif_ptbe_status((blkif_fe_interface_status_t *) &msg->msg[0]);
-            break;
-
-        default:
-            goto parse_error;
-        }
-
-        break;
-
-    case CMSG_BLKIF_BE:
-
-        /* send a copy of the message to user if wanted */
-
-        if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) ||
-             (blktap_mode & BLKTAP_MODE_COPY_FE) ) {
-
-            blktap_write_ctrl_ring(msg);
-            blktap_kick_user();
-        }
-
-        switch ( msg->subtype )
-        {
-        case CMSG_BLKIF_BE_CREATE:
-            blkif_ptfe_create((blkif_be_create_t *)&msg->msg[0]);
-            break;
-        case CMSG_BLKIF_BE_DESTROY:
-            blkif_ptfe_destroy((blkif_be_destroy_t *)&msg->msg[0]);
-            break;
-        case CMSG_BLKIF_BE_CONNECT:
-            blkif_ptfe_connect((blkif_be_connect_t *)&msg->msg[0]);
-            break;
-        case CMSG_BLKIF_BE_DISCONNECT:
-            if ( !blkif_ptfe_disconnect((blkif_be_disconnect_t *)&msg->msg[0],
-                                        msg->id) )
-                return;
-            break;
-
-        /* We just ignore anything to do with vbds for now. */
-
-        case CMSG_BLKIF_BE_VBD_CREATE:
-            DPRINTK("PT got VBD_CREATE\n");
-            ((blkif_be_vbd_create_t *)&msg->msg[0])->status
-                = BLKIF_BE_STATUS_OKAY;
-            break;
-        case CMSG_BLKIF_BE_VBD_DESTROY:
-            DPRINTK("PT got VBD_DESTROY\n");
-            ((blkif_be_vbd_destroy_t *)&msg->msg[0])->status
-                = BLKIF_BE_STATUS_OKAY;
-            break;
-        default:
-            goto parse_error;
-        }
-
-        break;
-    }
-
-    ctrl_if_send_response(msg);
-    return;
-
- parse_error:
-    msg->length = 0;
-    ctrl_if_send_response(msg);
-}
-
-/*-----[ Initialization ]-------------------------------------------------*/
-
-void __init blkif_interface_init(void)
-{
-    blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t),
-                                     0, 0, NULL, NULL);
-    memset(blkif_hash, 0, sizeof(blkif_hash));
-
-    blktap_be_ring.sring = NULL;
-}
-
-
-
-/* Debug : print the current ring indices. */
-
-void print_fe_ring_idxs(void)
-{
-    int i;
-    blkif_t *blkif;
-
-    WPRINTK("FE Rings: \n---------\n");
-    for ( i = 0; i < BLKIF_HASHSZ; i++) {
-        blkif = blkif_hash[i];
-        while (blkif != NULL) {
-            if (blkif->status == DISCONNECTED) {
-                WPRINTK("(%2d,%2d) DISCONNECTED\n",
-                        blkif->domid, blkif->handle);
-            } else if (blkif->status == DISCONNECTING) {
-                WPRINTK("(%2d,%2d) DISCONNECTING\n",
-                        blkif->domid, blkif->handle);
-            } else if (blkif->blk_ring.sring == NULL) {
-                WPRINTK("(%2d,%2d) CONNECTED, but null sring!\n",
-                        blkif->domid, blkif->handle);
-            } else {
-                blkif_get(blkif);
-                WPRINTK("(%2d,%2d): req_cons: %2d, rsp_prod_prv: %2d "
-                        "| req_prod: %2d, rsp_prod: %2d\n",
-                        blkif->domid, blkif->handle,
-                        blkif->blk_ring.req_cons,
-                        blkif->blk_ring.rsp_prod_pvt,
-                        blkif->blk_ring.sring->req_prod,
-                        blkif->blk_ring.sring->rsp_prod);
-                blkif_put(blkif);
-            }
-            blkif = blkif->hash_next;
-        }
-    }
-}
diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap_datapath.c
--- a/linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap_datapath.c	Fri Jul 15 19:57:12 2005
+++ /dev/null	Sat Jul 16 14:02:54 2005
@@ -1,451 +0,0 @@
-/******************************************************************************
- * blktap_datapath.c
- *
- * XenLinux virtual block-device tap.
- * Block request routing data path.
- *
- * Copyright (c) 2004, Andrew Warfield
- * -- see full header in blktap.c
- */
-
-#include "blktap.h"
-#include <asm-xen/evtchn.h>
-
-/*-----[ The data paths ]-------------------------------------------------*/
-
-/* Connection to a single backend domain. */
-blkif_front_ring_t blktap_be_ring;
-
-/*-----[ Tracking active requests ]---------------------------------------*/
-
-/* this must be the same as MAX_PENDING_REQS in blkback.c */
-#define MAX_ACTIVE_REQS ((ACTIVE_RING_IDX)64U)
-
-active_req_t     active_reqs[MAX_ACTIVE_REQS];
-ACTIVE_RING_IDX  active_req_ring[MAX_ACTIVE_REQS];
-spinlock_t       active_req_lock = SPIN_LOCK_UNLOCKED;
-ACTIVE_RING_IDX  active_prod, active_cons;
-#define MASK_ACTIVE_IDX(_i) ((_i)&(MAX_ACTIVE_REQS-1))
-#define ACTIVE_IDX(_ar) (_ar - active_reqs)
-#define NR_ACTIVE_REQS (MAX_ACTIVE_REQS - active_prod + active_cons)
-
-inline active_req_t *get_active_req(void)
-{
-    ACTIVE_RING_IDX idx;
-    active_req_t *ar;
-    unsigned long flags;
-
-    ASSERT(active_cons != active_prod);
-
-    spin_lock_irqsave(&active_req_lock, flags);
-    idx = active_req_ring[MASK_ACTIVE_IDX(active_cons++)];
-    ar  = &active_reqs[idx];
-    spin_unlock_irqrestore(&active_req_lock, flags);
-
-    return ar;
-}
-
-inline void free_active_req(active_req_t *ar)
-{
-    unsigned long flags;
-
-    spin_lock_irqsave(&active_req_lock, flags);
-    active_req_ring[MASK_ACTIVE_IDX(active_prod++)] = ACTIVE_IDX(ar);
-    spin_unlock_irqrestore(&active_req_lock, flags);
-}
-
-active_req_t *lookup_active_req(ACTIVE_RING_IDX idx)
-{
-    return &active_reqs[idx];
-}
-
-void active_reqs_init(void)
-{
-    ACTIVE_RING_IDX i;
-
-    active_cons = 0;
-    active_prod = MAX_ACTIVE_REQS;
-    memset(active_reqs, 0, sizeof(active_reqs));
-    for ( i = 0; i < MAX_ACTIVE_REQS; i++ )
-        active_req_ring[i] = i;
-}
-
-/* Requests passing through the tap to the backend hijack the id field
- * in the request message. In it we put the AR index _AND_ the fe domid.
- * the domid is used by the backend to map the pages properly.
- */
-
-static inline unsigned long MAKE_ID(domid_t fe_dom, ACTIVE_RING_IDX idx)
-{
-    return ( (fe_dom << 16) | MASK_ACTIVE_IDX(idx) );
-}
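MAKE_ID() and the ID_TO_IDX()/ID_TO_DOM() helpers from blktap.h form a round trip: the frontend domain id lives in the top 16 bits and the active-request index in the bottom 16, so the tap can recover both from the single id field that rides through the backend. A small check using the macros defined above (values are illustrative):

    /* Round trip of the id packing: domid 5, active-request index 12. */
    static void id_roundtrip_example(void)
    {
        unsigned long id = MAKE_ID(5, 12);   /* (5 << 16) | 12 == 0x5000c */
        ASSERT(ID_TO_DOM(id) == 5);          /* top 16 bits: frontend domid */
        ASSERT(ID_TO_IDX(id) == 12);         /* bottom 16 bits: AR index    */
    }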
-
-/*-----[ Ring helpers ]---------------------------------------------------*/
-
-static void maybe_trigger_blktap_schedule(void);
-
-inline int write_resp_to_fe_ring(blkif_t *blkif, blkif_response_t *rsp)
-{
-    blkif_response_t *resp_d;
-    active_req_t *ar;
-
-    ar = &active_reqs[ID_TO_IDX(rsp->id)];
-    rsp->id = ar->id;
-
-    resp_d = RING_GET_RESPONSE(&blkif->blk_ring,
-                               blkif->blk_ring.rsp_prod_pvt);
-    memcpy(resp_d, rsp, sizeof(blkif_response_t));
-    wmb();
-    blkif->blk_ring.rsp_prod_pvt++;
-
-    blkif_put(ar->blkif);
-    free_active_req(ar);
-
-    return 0;
-}
-
-inline int write_req_to_be_ring(blkif_request_t *req)
-{
-    blkif_request_t *req_d;
-
-    if ( blktap_be_state != BLKIF_STATE_CONNECTED ) {
-        WPRINTK("Tap trying to access an unconnected backend!\n");
-        return 0;
-    }
-
-    req_d = RING_GET_REQUEST(&blktap_be_ring,
-                             blktap_be_ring.req_prod_pvt);
-    memcpy(req_d, req, sizeof(blkif_request_t));
-    wmb();
-    blktap_be_ring.req_prod_pvt++;
-
-    return 0;
-}
-
-void kick_fe_domain(blkif_t *blkif)
-{
-    RING_PUSH_RESPONSES(&blkif->blk_ring);
-    notify_via_evtchn(blkif->evtchn);
-    DPRINTK("notified FE(dom %u)\n", blkif->domid);
-
-    /* We just feed up a batch of request slots... */
-    maybe_trigger_blktap_schedule();
-
-}
-
-void kick_be_domain(void)
-{
-    if ( blktap_be_state != BLKIF_STATE_CONNECTED )
-        return;
-
-    wmb(); /* Ensure that the frontend can see the requests. */
-    RING_PUSH_REQUESTS(&blktap_be_ring);
-    notify_via_evtchn(blktap_be_evtchn);
-    DPRINTK("notified BE\n");
-}
-
-/*-----[ Data to/from Frontend (client) VMs ]-----------------------------*/
-
-/*-----[ Scheduler list maint -from blkback ]--- */
-
-static struct list_head blkio_schedule_list;
-static spinlock_t blkio_schedule_list_lock;
-
-static int __on_blkdev_list(blkif_t *blkif)
-{
-    return blkif->blkdev_list.next != NULL;
-}
-
-static void remove_from_blkdev_list(blkif_t *blkif)
-{
-    unsigned long flags;
-    if ( !__on_blkdev_list(blkif) ) return;
-    spin_lock_irqsave(&blkio_schedule_list_lock, flags);
-    if ( __on_blkdev_list(blkif) )
-    {
-        list_del(&blkif->blkdev_list);
-        blkif->blkdev_list.next = NULL;
-        blkif_put(blkif);
-    }
-    spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
-}
-
-static void add_to_blkdev_list_tail(blkif_t *blkif)
-{
-    unsigned long flags;
-    if ( __on_blkdev_list(blkif) ) return;
-    spin_lock_irqsave(&blkio_schedule_list_lock, flags);
-    if ( !__on_blkdev_list(blkif) && (blkif->status == CONNECTED) )
-    {
-        list_add_tail(&blkif->blkdev_list, &blkio_schedule_list);
-        blkif_get(blkif);
-    }
-    spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
-}
-
-
-/*-----[ Scheduler functions - from blkback ]--- */
-
-static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait);
-
-static int do_block_io_op(blkif_t *blkif, int max_to_do);
-
-static int blkio_schedule(void *arg)
-{
-    DECLARE_WAITQUEUE(wq, current);
-
-    blkif_t          *blkif;
-    struct list_head *ent;
-
-    daemonize("xentapd");
-
-    for ( ; ; )
-    {
-        /* Wait for work to do. */
-        add_wait_queue(&blkio_schedule_wait, &wq);
-        set_current_state(TASK_INTERRUPTIBLE);
-        if ( (NR_ACTIVE_REQS == MAX_ACTIVE_REQS) ||
-             list_empty(&blkio_schedule_list) )
-            schedule();
-        __set_current_state(TASK_RUNNING);
-        remove_wait_queue(&blkio_schedule_wait, &wq);
-
-        /* Queue up a batch of requests. */
-        while ( (NR_ACTIVE_REQS < MAX_ACTIVE_REQS) &&
-                !list_empty(&blkio_schedule_list) )
-        {
-            ent = blkio_schedule_list.next;
-            blkif = list_entry(ent, blkif_t, blkdev_list);
-            blkif_get(blkif);
-            remove_from_blkdev_list(blkif);
-            if ( do_block_io_op(blkif, BATCH_PER_DOMAIN) )
-                add_to_blkdev_list_tail(blkif);
-            blkif_put(blkif);
-        }
-    }
-}
-
-static void maybe_trigger_blktap_schedule(void)
-{
-    /*
-     * Needed so that two processes, who together make the following predicate
-     * true, don't both read stale values and evaluate the predicate
-     * incorrectly. Incredibly unlikely to stall the scheduler on x86, but...
-     */
-    smp_mb();
-
-    if ( (NR_ACTIVE_REQS < (MAX_ACTIVE_REQS/2)) &&
-         !list_empty(&blkio_schedule_list) )
-        wake_up(&blkio_schedule_wait);
-}
-
-void blkif_deschedule(blkif_t *blkif)
-{
-    remove_from_blkdev_list(blkif);
-}
-
-void __init blkdev_schedule_init(void)
-{
-    spin_lock_init(&blkio_schedule_list_lock);
-    INIT_LIST_HEAD(&blkio_schedule_list);
-
-    if ( kernel_thread(blkio_schedule, 0, CLONE_FS | CLONE_FILES) < 0 )
-        BUG();
-}
-
-/*-----[ Interrupt entry from a frontend ]------ */
-
-irqreturn_t blkif_ptfe_int(int irq, void *dev_id, struct pt_regs *regs)
-{
-    blkif_t *blkif = dev_id;
-
-    add_to_blkdev_list_tail(blkif);
-    maybe_trigger_blktap_schedule();
-    return IRQ_HANDLED;
-}
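The schedule-list helpers above use the list node itself as the membership flag: blkdev_list.next is reset to NULL whenever an interface leaves the list, so __on_blkdev_list() is a single unlocked pointer test, and the spinlock is only taken (and the test repeated) when a transition looks plausible. A condensed sketch of the idiom with illustrative names:

    /* Sketch: NULL next-pointer marks "not queued"; re-check under the lock. */
    struct item { struct list_head link; };

    static int on_list(struct item *it)
    {
        return it->link.next != NULL;         /* cheap unlocked hint */
    }

    static void remove_item(struct item *it, spinlock_t *lock)
    {
        unsigned long flags;
        if ( !on_list(it) ) return;
        spin_lock_irqsave(lock, flags);
        if ( on_list(it) ) {                  /* recheck: may have raced */
            list_del(&it->link);
            it->link.next = NULL;             /* restore the sentinel */
        }
        spin_unlock_irqrestore(lock, flags);
    }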
-
-/*-----[ Other Frontend Ring functions ]-------- */
-
-/* irqreturn_t blkif_ptfe_int(int irq, void *dev_id, struct pt_regs *regs)*/
-static int do_block_io_op(blkif_t *blkif, int max_to_do)
-{
-    /* we have pending messages from the real frontend. */
-
-    blkif_request_t *req_s;
-    RING_IDX i, rp;
-    unsigned long flags;
-    active_req_t *ar;
-    int more_to_do = 0;
-    int notify_be = 0, notify_user = 0;
-
-    if (NR_ACTIVE_REQS == MAX_ACTIVE_REQS) return 1;
-
-    /* lock both rings */
-    spin_lock_irqsave(&blkif_io_lock, flags);
-
-    rp = blkif->blk_ring.sring->req_prod;
-    rmb();
-
-    for ( i = blkif->blk_ring.req_cons;
-          (i != rp) &&
-              !RING_REQUEST_CONS_OVERFLOW(&blkif->blk_ring, i);
-          i++ )
-    {
-
-        if ((--max_to_do == 0) || (NR_ACTIVE_REQS == MAX_ACTIVE_REQS))
-        {
-            more_to_do = 1;
-            break;
-        }
-
-        req_s = RING_GET_REQUEST(&blkif->blk_ring, i);
-        /* This is a new request:
-         * Assign an active request record, and remap the id.
-         */
-        ar = get_active_req();
-        ar->id = req_s->id;
-        ar->nr_pages = req_s->nr_segments;
-        blkif_get(blkif);
-        ar->blkif = blkif;
-        req_s->id = MAKE_ID(blkif->domid, ACTIVE_IDX(ar));
-        /* WPRINTK("%3u < %3lu\n", ID_TO_IDX(req_s->id), ar->id); */
-
-        /* FE -> BE interposition point is here. */
-
-        /* ------------------------------------------------------------- */
-        /* BLKIF_OP_PROBE_HACK:                                           */
-        /* Signal to the backend that we are a tap domain.                */
-
-        if (req_s->operation == BLKIF_OP_PROBE) {
-            DPRINTK("Adding BLKTAP_COOKIE to PROBE request.\n");
-            req_s->frame_and_sects[1] = BLKTAP_COOKIE;
-        }
-
-        /* ------------------------------------------------------------- */
-
-        /* If we are in MODE_INTERCEPT_FE or MODE_COPY_FE: */
-        if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) ||
-             (blktap_mode & BLKTAP_MODE_COPY_FE) ) {
-
-            /* Copy the response message to UFERing */
-            /* In MODE_INTERCEPT_FE, map attached pages into the app vma */
-            /* In MODE_COPY_FE_PAGES, copy attached pages into the app vma */
-
-            DPRINTK("req->UFERing\n");
-            blktap_write_fe_ring(req_s);
-            notify_user = 1;
-        }
-
-        /* If we are not in MODE_INTERCEPT_FE or MODE_INTERCEPT_BE: */
-        if ( !((blktap_mode & BLKTAP_MODE_INTERCEPT_FE) ||
-               (blktap_mode & BLKTAP_MODE_INTERCEPT_BE)) ) {
-
-            /* (be included to prevent noise from the fe when it's off) */
-            /* copy the request message to the BERing */
-
-            DPRINTK("blktap: FERing[%u] -> BERing[%u]\n",
-                    (unsigned)i & (RING_SIZE(&blktap_be_ring)-1),
-                    (unsigned)blktap_be_ring.req_prod_pvt &
-                    (RING_SIZE((&blktap_be_ring)-1)));
-
-            write_req_to_be_ring(req_s);
-            notify_be = 1;
-        }
-    }
-
-    blkif->blk_ring.req_cons = i;
-
-    /* unlock rings */
-    spin_unlock_irqrestore(&blkif_io_lock, flags);
-
-    if (notify_user)
-        blktap_kick_user();
-    if (notify_be)
-        kick_be_domain();
-
-    return more_to_do;
-}
-
-/*-----[ Data to/from Backend (server) VM ]------------------------------*/
-
-
-irqreturn_t blkif_ptbe_int(int irq, void *dev_id,
-                           struct pt_regs *ptregs)
-{
-    blkif_response_t *resp_s;
-    blkif_t *blkif;
-    RING_IDX rp, i;
-    unsigned long flags;
-
-    DPRINTK("PT got BE interrupt.\n");
-
-    /* lock both rings */
-    spin_lock_irqsave(&blkif_io_lock, flags);
-
-    rp = blktap_be_ring.sring->rsp_prod;
-    rmb();
-
-    for ( i = blktap_be_ring.rsp_cons; i != rp; i++)
-    {
-        resp_s = RING_GET_RESPONSE(&blktap_be_ring, i);
-
-        /* BE -> FE interposition point is here. */
-
-        blkif = active_reqs[ID_TO_IDX(resp_s->id)].blkif;
-
-        /* If we are in MODE_INTERCEPT_BE or MODE_COPY_BE: */
-        if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_BE) ||
-             (blktap_mode & BLKTAP_MODE_COPY_BE) ) {
-
-            /* Copy the response message to UBERing */
-            /* In MODE_INTERCEPT_BE, map attached pages into the app vma */
-            /* In MODE_COPY_BE_PAGES, copy attached pages into the app vma */
-
-            DPRINTK("rsp->UBERing\n");
-            blktap_write_be_ring(resp_s);
-            blktap_kick_user();
-
-        }
-
-        /* If we are NOT in MODE_INTERCEPT_BE or MODE_INTERCEPT_FE: */
-        if ( !((blktap_mode & BLKTAP_MODE_INTERCEPT_BE) ||
-               (blktap_mode & BLKTAP_MODE_INTERCEPT_FE)) ) {
-
-            /* (fe included to prevent random interference from the BE) */
-            /* Copy the response message to FERing */
-
-            DPRINTK("blktap: BERing[%u] -> FERing[%u]\n",
-                    (unsigned)i & (RING_SIZE(&blkif->blk_ring)-1),
-                    (unsigned)blkif->blk_ring.rsp_prod_pvt &
-                    (RING_SIZE((&blkif->blk_ring)-1)));
-
-            write_resp_to_fe_ring(blkif, resp_s);
-            kick_fe_domain(blkif);
-
-        }
-    }
-
-    blktap_be_ring.rsp_cons = i;
-
-
-    spin_unlock_irqrestore(&blkif_io_lock, flags);
-
-    return IRQ_HANDLED;
-}
-
-/* Debug : print the current ring indices. */
-
-void print_be_ring_idxs(void)
-{
-    if (blktap_be_ring.sring != NULL) {
-        WPRINTK("BE Ring: \n--------\n");
-        WPRINTK("BE: rsp_cons: %2d, req_prod_prv: %2d "
-                "| req_prod: %2d, rsp_prod: %2d\n",
-                blktap_be_ring.rsp_cons,
-                blktap_be_ring.req_prod_pvt,
-                blktap_be_ring.sring->req_prod,
-                blktap_be_ring.sring->rsp_prod);
-    }
-}
diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap_userdev.c
--- a/linux-2.6.11-xen-sparse/drivers/xen/blktap/blktap_userdev.c	Fri Jul 15 19:57:12 2005
+++ /dev/null	Sat Jul 16 14:02:54 2005
@@ -1,478 +0,0 @@
-/******************************************************************************
- * blktap_userdev.c
- *
- * XenLinux virtual block-device tap.
- * Control interface between the driver and a character device.
- *
- * Copyright (c) 2004, Andrew Warfield
- *
- */
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/miscdevice.h>
-#include <linux/errno.h>
-#include <linux/major.h>
-#include <linux/gfp.h>
-#include <linux/poll.h>
-#include <asm/pgalloc.h>
-#include <asm-xen/xen-public/io/blkif.h> /* for control ring. */
-
-#include "blktap.h"
-
-
-unsigned long blktap_mode = BLKTAP_MODE_PASSTHROUGH;
-
-/* Only one process may open /dev/xen/blktap at any time. */
-static unsigned long blktap_dev_inuse;
-unsigned long blktap_ring_ok; /* make this ring->state */
-
-/* for poll: */
-static wait_queue_head_t blktap_wait;
-
-/* Where things are inside the device mapping. */
-struct vm_area_struct *blktap_vma = NULL;
-unsigned long mmap_vstart;
-unsigned long rings_vstart;
-
-/* Rings up to user space. */
-static blkif_front_ring_t blktap_ufe_ring;
-static blkif_back_ring_t  blktap_ube_ring;
-static ctrl_front_ring_t  blktap_uctrl_ring;
-
-/* local prototypes */
-static int blktap_read_fe_ring(void);
-static int blktap_read_be_ring(void);
-
-/* -------[ blktap vm ops ]------------------------------------------- */
-
-static struct page *blktap_nopage(struct vm_area_struct *vma,
-                                  unsigned long address,
-                                  int *type)
-{
-    /*
-     * if the page has not been mapped in by the driver then generate
-     * a SIGBUS to the domain.
-     */
-
-    force_sig(SIGBUS, current);
-
-    return 0;
-}
-
-struct vm_operations_struct blktap_vm_ops = {
-    nopage: blktap_nopage,
-};
-
-/* -------[ blktap file ops ]----------------------------------------- */
-
-static int blktap_open(struct inode *inode, struct file *filp)
-{
-    blkif_sring_t *sring;
-    ctrl_sring_t *csring;
-
-    if ( test_and_set_bit(0, &blktap_dev_inuse) )
-        return -EBUSY;
-
-    printk(KERN_ALERT "blktap open.\n");
-
-    /* Allocate the ctrl ring. */
-    csring = (ctrl_sring_t *)get_zeroed_page(GFP_KERNEL);
-    if (csring == NULL)
-        goto fail_nomem;
-
-    SetPageReserved(virt_to_page(csring));
-
-    SHARED_RING_INIT(csring);
-    FRONT_RING_INIT(&blktap_uctrl_ring, csring, PAGE_SIZE);
-
-    /* Allocate the fe ring. */
-    sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
-    if (sring == NULL)
-        goto fail_free_ctrl;
-
-    SetPageReserved(virt_to_page(sring));
-
-    SHARED_RING_INIT(sring);
-    FRONT_RING_INIT(&blktap_ufe_ring, sring, PAGE_SIZE);
-
-    /* Allocate the be ring. */
-    sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL);
-    if (sring == NULL)
-        goto fail_free_fe;
-
-    SetPageReserved(virt_to_page(sring));
-
-    SHARED_RING_INIT(sring);
-    BACK_RING_INIT(&blktap_ube_ring, sring, PAGE_SIZE);
-
-    DPRINTK(KERN_ALERT "blktap open.\n");
-
-    return 0;
-
- fail_free_ctrl:
-    free_page( (unsigned long) blktap_uctrl_ring.sring);
-
- fail_free_fe:
-    free_page( (unsigned long) blktap_ufe_ring.sring);
-
- fail_nomem:
-    return -ENOMEM;
-}
-
-static int blktap_release(struct inode *inode, struct file *filp)
-{
-    blktap_dev_inuse = 0;
-    blktap_ring_ok = 0;
-
-    printk(KERN_ALERT "blktap closed.\n");
-
-    /* Free the ring pages. */
-    ClearPageReserved(virt_to_page(blktap_uctrl_ring.sring));
-    free_page((unsigned long) blktap_uctrl_ring.sring);
-
-    ClearPageReserved(virt_to_page(blktap_ufe_ring.sring));
-    free_page((unsigned long) blktap_ufe_ring.sring);
-
-    ClearPageReserved(virt_to_page(blktap_ube_ring.sring));
-    free_page((unsigned long) blktap_ube_ring.sring);
-
-    /* Clear any active mappings. */
-    if (blktap_vma != NULL) {
-        zap_page_range(blktap_vma, blktap_vma->vm_start,
-                       blktap_vma->vm_end - blktap_vma->vm_start, NULL);
-        blktap_vma = NULL;
-    }
-
-    return 0;
-}
-
-/* Note on mmap:
- * remap_pfn_range sets VM_IO on vma->vm_flags.  In trying to make libaio
- * work to do direct page access from userspace, this ended up being a
- * problem. The bigger issue seems to be that there is no way to map
- * a foreign page in to user space and have the virtual address of that
- * page map sanely down to a mfn.
- * Removing the VM_IO flag results in a loop in get_user_pages, as
- * pfn_valid() always fails on a foreign page.
- */
-static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
-{
-    int size;
-
-    printk(KERN_ALERT "blktap mmap (%lx, %lx)\n",
-           vma->vm_start, vma->vm_end);
-
-    vma->vm_ops = &blktap_vm_ops;
-
-    size = vma->vm_end - vma->vm_start;
-    if ( size != ( (MMAP_PAGES + RING_PAGES) << PAGE_SHIFT ) ) {
-        printk(KERN_INFO
-               "blktap: you _must_ map exactly %d pages!\n",
-               MMAP_PAGES + RING_PAGES);
-        return -EAGAIN;
-    }
-
-    size >>= PAGE_SHIFT;
-    printk(KERN_INFO "blktap: 2 rings + %d pages.\n", size-1);
-
-    rings_vstart = vma->vm_start;
-    mmap_vstart  = rings_vstart + (RING_PAGES << PAGE_SHIFT);
-
-    /* Map the ring pages to the start of the region and reserve it. */
-
-    /* not sure if I really need to do this... */
-    vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-
-    DPRINTK("Mapping ctrl_ring page %lx.\n", __pa(blktap_uctrl_ring.sring));
-    if (remap_pfn_range(vma, vma->vm_start,
-                        __pa(blktap_uctrl_ring.sring) >> PAGE_SHIFT,
-                        PAGE_SIZE, vma->vm_page_prot)) {
-        WPRINTK("ctrl_ring: remap_pfn_range failure!\n");
-    }
-
-
-    DPRINTK("Mapping be_ring page %lx.\n", __pa(blktap_ube_ring.sring));
-    if (remap_pfn_range(vma, vma->vm_start + PAGE_SIZE,
-                        __pa(blktap_ube_ring.sring) >> PAGE_SHIFT,
-                        PAGE_SIZE, vma->vm_page_prot)) {
-        WPRINTK("be_ring: remap_pfn_range failure!\n");
-    }
-
-    DPRINTK("Mapping fe_ring page %lx.\n", __pa(blktap_ufe_ring.sring));
-    if (remap_pfn_range(vma, vma->vm_start + ( 2 * PAGE_SIZE ),
-                        __pa(blktap_ufe_ring.sring) >> PAGE_SHIFT,
-                        PAGE_SIZE, vma->vm_page_prot)) {
-        WPRINTK("fe_ring: remap_pfn_range failure!\n");
-    }
-
-    blktap_vma = vma;
-    blktap_ring_ok = 1;
-
-    return 0;
-}
-
-static int blktap_ioctl(struct inode *inode, struct file *filp,
-                        unsigned int cmd, unsigned long arg)
-{
-    switch(cmd) {
-    case BLKTAP_IOCTL_KICK_FE: /* There are fe messages to process. */
-        return blktap_read_fe_ring();
-
-    case BLKTAP_IOCTL_KICK_BE: /* There are be messages to process. */
-        return blktap_read_be_ring();
-
-    case BLKTAP_IOCTL_SETMODE:
-        if (BLKTAP_MODE_VALID(arg)) {
-            blktap_mode = arg;
-            /* XXX: may need to flush rings here. */
-            printk(KERN_INFO "blktap: set mode to %lx\n", arg);
-            return 0;
-        }
-    case BLKTAP_IOCTL_PRINT_IDXS:
-        {
-            print_be_ring_idxs();
-            print_fe_ring_idxs();
-            WPRINTK("User Rings: \n-----------\n");
-            WPRINTK("UF: rsp_cons: %2d, req_prod_prv: %2d "
-                    "| req_prod: %2d, rsp_prod: %2d\n",
-                    blktap_ufe_ring.rsp_cons,
-                    blktap_ufe_ring.req_prod_pvt,
-                    blktap_ufe_ring.sring->req_prod,
-                    blktap_ufe_ring.sring->rsp_prod);
-            WPRINTK("UB: req_cons: %2d, rsp_prod_prv: %2d "
-                    "| req_prod: %2d, rsp_prod: %2d\n",
-                    blktap_ube_ring.req_cons,
-                    blktap_ube_ring.rsp_prod_pvt,
-                    blktap_ube_ring.sring->req_prod,
-                    blktap_ube_ring.sring->rsp_prod);
-
-        }
-    }
-    return -ENOIOCTLCMD;
-}
-
-static unsigned int blktap_poll(struct file *file, poll_table *wait)
-{
-    poll_wait(file, &blktap_wait, wait);
-
-    if ( RING_HAS_UNPUSHED_REQUESTS(&blktap_uctrl_ring) ||
-         RING_HAS_UNPUSHED_REQUESTS(&blktap_ufe_ring)   ||
-         RING_HAS_UNPUSHED_RESPONSES(&blktap_ube_ring) ) {
-
-        RING_PUSH_REQUESTS(&blktap_uctrl_ring);
-        RING_PUSH_REQUESTS(&blktap_ufe_ring);
-        RING_PUSH_RESPONSES(&blktap_ube_ring);
-        return POLLIN | POLLRDNORM;
-    }
-
-    return 0;
-}
-
-void blktap_kick_user(void)
-{
-    /* blktap_ring->req_prod = blktap_req_prod; */
-    wake_up_interruptible(&blktap_wait);
-}
-
-static struct file_operations blktap_fops = {
-    owner:    THIS_MODULE,
-    poll:     blktap_poll,
-    ioctl:    blktap_ioctl,
-    open:     blktap_open,
-    release:  blktap_release,
-    mmap:     blktap_mmap,
-};
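Taken together, the fops above define the userspace protocol: open the tap device, mmap exactly RING_PAGES + MMAP_PAGES pages (rings first, data area after), poll() for work, consume or produce ring entries in the mapped area, then ioctl() a kick back into the driver. A hypothetical userspace skeleton; the device path, page counts and the commented-out process_rings() helper are assumptions for illustration, only the ioctl numbers come from blktap.h above:

    #include <fcntl.h>
    #include <poll.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>

    #define BLKTAP_IOCTL_KICK_FE 1   /* mirrors the driver's ioctl numbers */
    #define BLKTAP_IOCTL_KICK_BE 2

    int run_tap(size_t total_pages /* MMAP_PAGES + RING_PAGES */, long page_size)
    {
        int fd = open("/dev/misc/blktap", O_RDWR);  /* path per miscdev below */
        if (fd < 0)
            return -1;

        /* Must map exactly the advertised number of pages (see blktap_mmap). */
        void *area = mmap(NULL, total_pages * page_size,
                          PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (area == MAP_FAILED)
            return -1;

        struct pollfd pfd = { .fd = fd, .events = POLLIN };
        while (poll(&pfd, 1, -1) > 0) {
            /* process_rings(area);  -- hypothetical: walk the mapped rings */
            ioctl(fd, BLKTAP_IOCTL_KICK_FE, 0);  /* tell driver fe work is done */
            ioctl(fd, BLKTAP_IOCTL_KICK_BE, 0);
        }
        return 0;
    }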
blktap_ring_ok ) {
-        DPRINTK("blktap: ufe_ring not ready for a request!\n");
-        return 0;
-    }
-
-    if ( RING_FULL(&blktap_ufe_ring) ) {
-        WPRINTK("blktap: fe_ring is full, can't add.\n");
-        return 0;
-    }
-
-    target = RING_GET_REQUEST(&blktap_ufe_ring,
-                              blktap_ufe_ring.req_prod_pvt);
-    memcpy(target, req, sizeof(*req));
-
-    /* Attempt to map the foreign pages directly into the application */
-    for (i=0; i<target->nr_segments; i++) {
-
-        error = direct_remap_area_pages(blktap_vma->vm_mm,
-                                        MMAP_VADDR(ID_TO_IDX(req->id), i),
-                                        target->frame_and_sects[i] & PAGE_MASK,
-                                        PAGE_SIZE,
-                                        blktap_vma->vm_page_prot,
-                                        ID_TO_DOM(req->id));
-        if ( error != 0 ) {
-            printk(KERN_INFO "remapping attached page failed! (%d)\n", error);
-            /* the request is now dropped on the floor. */
-            return 0;
-        }
-    }
-
-    blktap_ufe_ring.req_prod_pvt++;
-
-    return 0;
-}
-
-int blktap_write_be_ring(blkif_response_t *rsp)
-{
-    blkif_response_t *target;
-
-    /*
-     * This is called to pass a response from the real backend domain's
-     * blkif ring to the character device.
-     */
-
-    if ( ! blktap_ring_ok ) {
-        DPRINTK("blktap: be_ring not ready for a response!\n");
-        return 0;
-    }
-
-    /* No test for fullness in the response direction. */
-
-    target = RING_GET_RESPONSE(&blktap_ube_ring,
-                               blktap_ube_ring.rsp_prod_pvt);
-    memcpy(target, rsp, sizeof(*rsp));
-
-    /* no mapping -- pages were mapped in blktap_write_fe_ring() */
-
-    blktap_ube_ring.rsp_prod_pvt++;
-
-    return 0;
-}
-
-static int blktap_read_fe_ring(void)
-{
-    /* This is called to read responses from the UFE ring. */
-
-    RING_IDX i, rp;
-    blkif_response_t *resp_s;
-    blkif_t *blkif;
-    active_req_t *ar;
-
-    DPRINTK("blktap_read_fe_ring()\n");
-
-    /* if we are forwarding from the UFE ring to the FE ring */
-    if (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) {
-
-        /* for each outstanding message on the UFE ring */
-        rp = blktap_ufe_ring.sring->rsp_prod;
-        rmb();
-
-        for ( i = blktap_ufe_ring.rsp_cons; i != rp; i++ )
-        {
-            resp_s = RING_GET_RESPONSE(&blktap_ufe_ring, i);
-
-            DPRINTK("resp->fe_ring\n");
-            ar = lookup_active_req(ID_TO_IDX(resp_s->id));
-            blkif = ar->blkif;
-            zap_page_range(blktap_vma, MMAP_VADDR(ID_TO_IDX(resp_s->id), 0),
-                           ar->nr_pages << PAGE_SHIFT, NULL);
-            write_resp_to_fe_ring(blkif, resp_s);
-            blktap_ufe_ring.rsp_cons = i + 1;
-            kick_fe_domain(blkif);
-        }
-    }
-    return 0;
-}
-
-static int blktap_read_be_ring(void)
-{
-    /* This is called to read requests from the UBE ring. */
-
-    RING_IDX i, rp;
-    blkif_request_t *req_s;
-
-    DPRINTK("blktap_read_be_ring()\n");
-
-    /* if we are forwarding from the UBE ring to the BE ring */
-    if (blktap_mode & BLKTAP_MODE_INTERCEPT_BE) {
-
-        /* for each outstanding message on the UBE ring */
-        rp = blktap_ube_ring.sring->req_prod;
-        rmb();
-        for ( i = blktap_ube_ring.req_cons; i != rp; i++ )
-        {
-            req_s = RING_GET_REQUEST(&blktap_ube_ring, i);
-
-            DPRINTK("req->be_ring\n");
-            write_req_to_be_ring(req_s);
-            kick_be_domain();
-        }
-
-        blktap_ube_ring.req_cons = i;
-    }
-
-    return 0;
-}
-
-int blktap_write_ctrl_ring(ctrl_msg_t *msg)
-{
-    ctrl_msg_t *target;
-
-    if ( ! blktap_ring_ok ) {
-        DPRINTK("blktap: ctrl_ring not ready for a request!\n");
-        return 0;
-    }
-
-    /* No test for fullness in the request direction. */
-
-    target = RING_GET_REQUEST(&blktap_uctrl_ring,
-                              blktap_uctrl_ring.req_prod_pvt);
-    memcpy(target, msg, sizeof(*msg));
-
-    blktap_uctrl_ring.req_prod_pvt++;
-
-    /* currently treat the ring as unidirectional. 
*/ - blktap_uctrl_ring.rsp_cons = blktap_uctrl_ring.sring->rsp_prod; - - return 0; - -} - -/* -------[ blktap module setup ]------------------------------------- */ - -static struct miscdevice blktap_miscdev = { - .minor = BLKTAP_MINOR, - .name = "blktap", - .fops = &blktap_fops, - .devfs_name = "misc/blktap", -}; - -int blktap_init(void) -{ - int err; - - err = misc_register(&blktap_miscdev); - if ( err != 0 ) - { - printk(KERN_ALERT "Couldn't register /dev/misc/blktap (%d)\n", err); - return err; - } - - init_waitqueue_head(&blktap_wait); - - - return 0; -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/console/Makefile --- a/linux-2.6.11-xen-sparse/drivers/xen/console/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,2 +0,0 @@ - -obj-y := console.o diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/console/console.c --- a/linux-2.6.11-xen-sparse/drivers/xen/console/console.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,811 +0,0 @@ -/****************************************************************************** - * console.c - * - * Virtual console driver. - * - * Copyright (c) 2002-2004, K A Fraser. - * - * This file may be distributed separately from the Linux kernel, or - * incorporated into other software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include <linux/config.h> -#include <linux/version.h> -#include <linux/module.h> -#include <linux/errno.h> -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/interrupt.h> -#include <linux/tty.h> -#include <linux/tty_flip.h> -#include <linux/serial.h> -#include <linux/major.h> -#include <linux/ptrace.h> -#include <linux/ioport.h> -#include <linux/mm.h> -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/console.h> -#include <linux/bootmem.h> -#include <asm/io.h> -#include <asm/irq.h> -#include <asm/uaccess.h> -#include <asm-xen/xen-public/event_channel.h> -#include <asm-xen/hypervisor.h> -#include <asm-xen/evtchn.h> -#include <asm-xen/ctrl_if.h> - -/* - * Modes: - * 'xencons=off' [XC_OFF]: Console is disabled. - * 'xencons=tty' [XC_TTY]: Console attached to '/dev/tty[0-9]+'. - * 'xencons=ttyS' [XC_SERIAL]: Console attached to '/dev/ttyS[0-9]+'. - * [XC_DEFAULT]: DOM0 -> XC_SERIAL ; all others -> XC_TTY. - * - * NB. In mode XC_TTY, we create dummy consoles for tty2-63. This suppresses - * warnings from standard distro startup scripts. 
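The mode selection described above is implemented by xencons_setup() just below. As an illustrative aside, the same dispatch boils down to the following stand-alone sketch (user-space C, with strtol standing in for the kernel's simple_strtol; all names here are hypothetical, not from the driver):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    enum xc_mode { XC_OFF, XC_DEFAULT, XC_TTY, XC_SERIAL };

    /* "ttyS" must be tested before "tty": every "ttyS..." string would
     * also match the shorter "tty" prefix. */
    static enum xc_mode parse_xencons(const char *str, int *num)
    {
        char *q;
        long n;

        *num = -1;
        if (strncmp(str, "ttyS", 4) == 0) {
            n = strtol(str + 4, &q, 10);
            if (q > str + 4)
                *num = (int)n;           /* "ttyS1" -> serial console 1 */
            return XC_SERIAL;
        }
        if (strncmp(str, "tty", 3) == 0) {
            n = strtol(str + 3, &q, 10);
            if (q > str + 3)
                *num = (int)n;           /* "tty2" -> virtual console 2 */
            return XC_TTY;
        }
        if (strncmp(str, "off", 3) == 0)
            return XC_OFF;
        return XC_DEFAULT;
    }

    int main(void)
    {
        int n;
        enum xc_mode m = parse_xencons("ttyS1", &n);
        printf("mode=%d num=%d\n", (int)m, n);   /* mode=3 num=1 */
        return 0;
    }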
- */ -static enum { XC_OFF, XC_DEFAULT, XC_TTY, XC_SERIAL } xc_mode = XC_DEFAULT; -static int xc_num = -1; - -static int __init xencons_setup(char *str) -{ - char *q; - int n; - - if ( !strncmp(str, "ttyS", 4) ) - xc_mode = XC_SERIAL; - else if ( !strncmp(str, "tty", 3) ) - xc_mode = XC_TTY; - else if ( !strncmp(str, "off", 3) ) - xc_mode = XC_OFF; - - switch ( xc_mode ) - { - case XC_SERIAL: - n = simple_strtol( str+4, &q, 10 ); - if ( q > (str + 4) ) xc_num = n; - break; - case XC_TTY: - n = simple_strtol( str+3, &q, 10 ); - if ( q > (str + 3) ) xc_num = n; - break; - default: - break; - } - - return 1; -} -__setup("xencons=", xencons_setup); - -/* The kernel and user-land drivers share a common transmit buffer. */ -static unsigned int wbuf_size = 4096; -#define WBUF_MASK(_i) ((_i)&(wbuf_size-1)) -static char *wbuf; -static unsigned int wc, wp; /* write_cons, write_prod */ - -static int __init xencons_bufsz_setup(char *str) -{ - unsigned int goal; - goal = simple_strtoul(str, NULL, 0); - while ( wbuf_size < goal ) - wbuf_size <<= 1; - return 1; -} -__setup("xencons_bufsz=", xencons_bufsz_setup); - -/* This lock protects accesses to the common transmit buffer. */ -static spinlock_t xencons_lock = SPIN_LOCK_UNLOCKED; - -/* Common transmit-kick routine. */ -static void __xencons_tx_flush(void); - -/* This task is used to defer sending console data until there is space. */ -static void xencons_tx_flush_task_routine(void *data); - -static DECLARE_TQUEUE(xencons_tx_flush_task, - xencons_tx_flush_task_routine, - NULL); - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) -static struct tty_driver *xencons_driver; -#else -static struct tty_driver xencons_driver; -#endif - - -/******************** Kernel console driver ********************************/ - -static void kcons_write( - struct console *c, const char *s, unsigned int count) -{ - int i; - unsigned long flags; - - spin_lock_irqsave(&xencons_lock, flags); - - for ( i = 0; i < count; i++ ) - { - if ( (wp - wc) >= (wbuf_size - 1) ) - break; - if ( (wbuf[WBUF_MASK(wp++)] = s[i]) == '\n' ) - wbuf[WBUF_MASK(wp++)] = '\r'; - } - - __xencons_tx_flush(); - - spin_unlock_irqrestore(&xencons_lock, flags); -} - -static void kcons_write_dom0( - struct console *c, const char *s, unsigned int count) -{ - int rc; - - while ( (count > 0) && - ((rc = HYPERVISOR_console_io( - CONSOLEIO_write, count, (char *)s)) > 0) ) - { - count -= rc; - s += rc; - } -} - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) -static struct tty_driver *kcons_device(struct console *c, int *index) -{ - *index = c->index; - return xencons_driver; -} -#else -static kdev_t kcons_device(struct console *c) -{ - return MKDEV(TTY_MAJOR, (xc_mode == XC_SERIAL) ? 
64 : 1); -} -#endif - -static struct console kcons_info = { - device: kcons_device, - flags: CON_PRINTBUFFER, - index: -1 -}; - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) -#define __RETCODE 0 -static int __init xen_console_init(void) -#else -#define __RETCODE -void xen_console_init(void) -#endif -{ - if ( xen_start_info.flags & SIF_INITDOMAIN ) - { - if ( xc_mode == XC_DEFAULT ) - xc_mode = XC_SERIAL; - kcons_info.write = kcons_write_dom0; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - if ( xc_mode == XC_SERIAL ) - kcons_info.flags |= CON_ENABLED; -#endif - } - else - { - if ( xc_mode == XC_DEFAULT ) - xc_mode = XC_TTY; - kcons_info.write = kcons_write; - } - - switch ( xc_mode ) - { - case XC_SERIAL: - strcpy(kcons_info.name, "ttyS"); - if ( xc_num == -1 ) xc_num = 0; - break; - - case XC_TTY: - strcpy(kcons_info.name, "tty"); - if ( xc_num == -1 ) xc_num = 1; - break; - - default: - return __RETCODE; - } - - wbuf = alloc_bootmem(wbuf_size); - - register_console(&kcons_info); - - return __RETCODE; -} -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) -console_initcall(xen_console_init); -#endif - -/*** Useful function for console debugging -- goes straight to Xen. ***/ -asmlinkage int xprintk(const char *fmt, ...) -{ - va_list args; - int printk_len; - static char printk_buf[1024]; - - /* Emit the output into the temporary buffer */ - va_start(args, fmt); - printk_len = vsnprintf(printk_buf, sizeof(printk_buf), fmt, args); - va_end(args); - - /* Send the processed output directly to Xen. */ - kcons_write_dom0(NULL, printk_buf, printk_len); - - return 0; -} - -/*** Forcibly flush console data before dying. ***/ -void xencons_force_flush(void) -{ - ctrl_msg_t msg; - int sz; - - /* Emergency console is synchronous, so there's nothing to flush. */ - if ( xen_start_info.flags & SIF_INITDOMAIN ) - return; - - /* - * We use dangerous control-interface functions that require a quiescent - * system and no interrupts. Try to ensure this with a global cli(). - */ - local_irq_disable(); /* XXXsmp */ - - /* Spin until console data is flushed through to the domain controller. */ - while ( (wc != wp) && !ctrl_if_transmitter_empty() ) - { - /* Interrupts are disabled -- we must manually reap responses. */ - ctrl_if_discard_responses(); - - if ( (sz = wp - wc) == 0 ) - continue; - if ( sz > sizeof(msg.msg) ) - sz = sizeof(msg.msg); - if ( sz > (wbuf_size - WBUF_MASK(wc)) ) - sz = wbuf_size - WBUF_MASK(wc); - - msg.type = CMSG_CONSOLE; - msg.subtype = CMSG_CONSOLE_DATA; - msg.length = sz; - memcpy(msg.msg, &wbuf[WBUF_MASK(wc)], sz); - - if ( ctrl_if_send_message_noblock(&msg, NULL, 0) == 0 ) - wc += sz; - } -} - - -/******************** User-space console driver (/dev/console) ************/ - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) -#define DRV(_d) (_d) -#define TTY_INDEX(_tty) ((_tty)->index) -#else -static int xencons_refcount; -static struct tty_struct *xencons_table[MAX_NR_CONSOLES]; -#define DRV(_d) (&(_d)) -#define TTY_INDEX(_tty) (MINOR((_tty)->device) - xencons_driver.minor_start) -#endif - -static struct termios *xencons_termios[MAX_NR_CONSOLES]; -static struct termios *xencons_termios_locked[MAX_NR_CONSOLES]; -static struct tty_struct *xencons_tty; -static int xencons_priv_irq; -static char x_char; - -/* Non-privileged receive callback. 
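The copy loop in kcons_write() above has one subtlety worth isolating: each '\n' is expanded to "\n\r" in the shared transmit buffer, so the fullness test deliberately keeps one spare slot to guarantee room for the extra carriage return. A minimal stand-alone sketch of just that loop (names here are hypothetical):

    #include <stdio.h>

    #define WSZ 8u                           /* power of two, like wbuf_size */
    #define WMASK(i) ((i) & (WSZ - 1))

    static char buf[WSZ];
    static unsigned int wc, wp;              /* free-running cons/prod */

    static void cons_write(const char *s, unsigned int count)
    {
        unsigned int i;
        for (i = 0; i < count; i++) {
            if (wp - wc >= WSZ - 1)
                break;                       /* full, minus the spare slot */
            if ((buf[WMASK(wp++)] = s[i]) == '\n')
                buf[WMASK(wp++)] = '\r';     /* LF expands to LF+CR */
        }
    }

    int main(void)
    {
        cons_write("ab\n", 3);
        while (wc != wp)
            printf("%02x ", (unsigned char)buf[WMASK(wc++)]);
        printf("\n");                        /* prints: 61 62 0a 0d */
        return 0;
    }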
*/ -static void xencons_rx(ctrl_msg_t *msg, unsigned long id) -{ - int i; - unsigned long flags; - - spin_lock_irqsave(&xencons_lock, flags); - if ( xencons_tty != NULL ) - { - for ( i = 0; i < msg->length; i++ ) - tty_insert_flip_char(xencons_tty, msg->msg[i], 0); - tty_flip_buffer_push(xencons_tty); - } - spin_unlock_irqrestore(&xencons_lock, flags); - - msg->length = 0; - ctrl_if_send_response(msg); -} - -/* Privileged and non-privileged transmit worker. */ -static void __xencons_tx_flush(void) -{ - int sz, work_done = 0; - ctrl_msg_t msg; - - if ( xen_start_info.flags & SIF_INITDOMAIN ) - { - if ( x_char ) - { - kcons_write_dom0(NULL, &x_char, 1); - x_char = 0; - work_done = 1; - } - - while ( wc != wp ) - { - sz = wp - wc; - if ( sz > (wbuf_size - WBUF_MASK(wc)) ) - sz = wbuf_size - WBUF_MASK(wc); - kcons_write_dom0(NULL, &wbuf[WBUF_MASK(wc)], sz); - wc += sz; - work_done = 1; - } - } - else - { - while ( x_char ) - { - msg.type = CMSG_CONSOLE; - msg.subtype = CMSG_CONSOLE_DATA; - msg.length = 1; - msg.msg[0] = x_char; - - if ( ctrl_if_send_message_noblock(&msg, NULL, 0) == 0 ) - x_char = 0; - else if ( ctrl_if_enqueue_space_callback(&xencons_tx_flush_task) ) - break; - - work_done = 1; - } - - while ( wc != wp ) - { - sz = wp - wc; - if ( sz > sizeof(msg.msg) ) - sz = sizeof(msg.msg); - if ( sz > (wbuf_size - WBUF_MASK(wc)) ) - sz = wbuf_size - WBUF_MASK(wc); - - msg.type = CMSG_CONSOLE; - msg.subtype = CMSG_CONSOLE_DATA; - msg.length = sz; - memcpy(msg.msg, &wbuf[WBUF_MASK(wc)], sz); - - if ( ctrl_if_send_message_noblock(&msg, NULL, 0) == 0 ) - wc += sz; - else if ( ctrl_if_enqueue_space_callback(&xencons_tx_flush_task) ) - break; - - work_done = 1; - } - } - - if ( work_done && (xencons_tty != NULL) ) - { - wake_up_interruptible(&xencons_tty->write_wait); - if ( (xencons_tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) && - (xencons_tty->ldisc.write_wakeup != NULL) ) - (xencons_tty->ldisc.write_wakeup)(xencons_tty); - } -} - -/* Non-privileged transmit kicker. */ -static void xencons_tx_flush_task_routine(void *data) -{ - unsigned long flags; - spin_lock_irqsave(&xencons_lock, flags); - __xencons_tx_flush(); - spin_unlock_irqrestore(&xencons_lock, flags); -} - -/* Privileged receive callback and transmit kicker. */ -static irqreturn_t xencons_priv_interrupt(int irq, void *dev_id, - struct pt_regs *regs) -{ - static char rbuf[16]; - int i, l; - unsigned long flags; - - spin_lock_irqsave(&xencons_lock, flags); - - if ( xencons_tty != NULL ) - { - /* Receive work. */ - while ( (l = HYPERVISOR_console_io(CONSOLEIO_read, 16, rbuf)) > 0 ) - for ( i = 0; i < l; i++ ) - tty_insert_flip_char(xencons_tty, rbuf[i], 0); - if ( xencons_tty->flip.count != 0 ) - tty_flip_buffer_push(xencons_tty); - } - - /* Transmit work. 
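The transmit loop in __xencons_tx_flush() above sends the pending range [wc, wp) in chunks: each control message carries the minimum of the pending byte count, the message payload capacity, and the distance to the circular-buffer wrap, so every chunk is one contiguous copy. A small sketch of that clamping rule (sizes are illustrative, not the driver's):

    #include <stdio.h>

    #define WBUF_SZ  16u                     /* power of two, like wbuf_size */
    #define WBUF_MASK(i) ((i) & (WBUF_SZ - 1))
    #define MSG_CAP  5u                      /* stand-in for sizeof(msg.msg) */

    /* Chunk size = min(pending, message capacity, bytes until wrap). */
    static unsigned int next_chunk(unsigned int wc, unsigned int wp)
    {
        unsigned int sz = wp - wc;
        if (sz > MSG_CAP)
            sz = MSG_CAP;
        if (sz > WBUF_SZ - WBUF_MASK(wc))
            sz = WBUF_SZ - WBUF_MASK(wc);
        return sz;
    }

    int main(void)
    {
        unsigned int wc = 14, wp = 22;       /* 8 pending bytes, wrapping */
        while (wc != wp) {
            unsigned int sz = next_chunk(wc, wp);
            printf("send %u bytes at offset %u\n", sz, WBUF_MASK(wc));
            wc += sz;                        /* 2 bytes, then 5, then 1 */
        }
        return 0;
    }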
*/ - __xencons_tx_flush(); - - spin_unlock_irqrestore(&xencons_lock, flags); - - return IRQ_HANDLED; -} - -static int xencons_write_room(struct tty_struct *tty) -{ - return wbuf_size - (wp - wc); -} - -static int xencons_chars_in_buffer(struct tty_struct *tty) -{ - return wp - wc; -} - -static void xencons_send_xchar(struct tty_struct *tty, char ch) -{ - unsigned long flags; - - if ( TTY_INDEX(tty) != 0 ) - return; - - spin_lock_irqsave(&xencons_lock, flags); - x_char = ch; - __xencons_tx_flush(); - spin_unlock_irqrestore(&xencons_lock, flags); -} - -static void xencons_throttle(struct tty_struct *tty) -{ - if ( TTY_INDEX(tty) != 0 ) - return; - - if ( I_IXOFF(tty) ) - xencons_send_xchar(tty, STOP_CHAR(tty)); -} - -static void xencons_unthrottle(struct tty_struct *tty) -{ - if ( TTY_INDEX(tty) != 0 ) - return; - - if ( I_IXOFF(tty) ) - { - if ( x_char != 0 ) - x_char = 0; - else - xencons_send_xchar(tty, START_CHAR(tty)); - } -} - -static void xencons_flush_buffer(struct tty_struct *tty) -{ - unsigned long flags; - - if ( TTY_INDEX(tty) != 0 ) - return; - - spin_lock_irqsave(&xencons_lock, flags); - wc = wp = 0; - spin_unlock_irqrestore(&xencons_lock, flags); -} - -static inline int __xencons_put_char(int ch) -{ - char _ch = (char)ch; - if ( (wp - wc) == wbuf_size ) - return 0; - wbuf[WBUF_MASK(wp++)] = _ch; - return 1; -} - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) -static int xencons_write( - struct tty_struct *tty, - const unsigned char *buf, - int count) -{ - int i; - unsigned long flags; - - if ( TTY_INDEX(tty) != 0 ) - return count; - - spin_lock_irqsave(&xencons_lock, flags); - - for ( i = 0; i < count; i++ ) - if ( !__xencons_put_char(buf[i]) ) - break; - - if ( i != 0 ) - __xencons_tx_flush(); - - spin_unlock_irqrestore(&xencons_lock, flags); - - return i; -} -#else -static int xencons_write( - struct tty_struct *tty, - int from_user, - const u_char *buf, - int count) -{ - int i; - unsigned long flags; - - if ( from_user && verify_area(VERIFY_READ, buf, count) ) - return -EINVAL; - - if ( TTY_INDEX(tty) != 0 ) - return count; - - spin_lock_irqsave(&xencons_lock, flags); - - for ( i = 0; i < count; i++ ) - { - char ch; - if ( from_user ) - __get_user(ch, buf + i); - else - ch = buf[i]; - if ( !__xencons_put_char(ch) ) - break; - } - - if ( i != 0 ) - __xencons_tx_flush(); - - spin_unlock_irqrestore(&xencons_lock, flags); - - return i; -} -#endif - -static void xencons_put_char(struct tty_struct *tty, u_char ch) -{ - unsigned long flags; - - if ( TTY_INDEX(tty) != 0 ) - return; - - spin_lock_irqsave(&xencons_lock, flags); - (void)__xencons_put_char(ch); - spin_unlock_irqrestore(&xencons_lock, flags); -} - -static void xencons_flush_chars(struct tty_struct *tty) -{ - unsigned long flags; - - if ( TTY_INDEX(tty) != 0 ) - return; - - spin_lock_irqsave(&xencons_lock, flags); - __xencons_tx_flush(); - spin_unlock_irqrestore(&xencons_lock, flags); -} - -static void xencons_wait_until_sent(struct tty_struct *tty, int timeout) -{ - unsigned long orig_jiffies = jiffies; - - if ( TTY_INDEX(tty) != 0 ) - return; - - while ( DRV(tty->driver)->chars_in_buffer(tty) ) - { - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(1); - if ( signal_pending(current) ) - break; - if ( (timeout != 0) && time_after(jiffies, orig_jiffies + timeout) ) - break; - } - - set_current_state(TASK_RUNNING); -} - -static int xencons_open(struct tty_struct *tty, struct file *filp) -{ - unsigned long flags; - - if ( TTY_INDEX(tty) != 0 ) - return 0; - - spin_lock_irqsave(&xencons_lock, flags); - 
tty->driver_data = NULL; - if ( xencons_tty == NULL ) - xencons_tty = tty; - __xencons_tx_flush(); - spin_unlock_irqrestore(&xencons_lock, flags); - - return 0; -} - -static void xencons_close(struct tty_struct *tty, struct file *filp) -{ - unsigned long flags; - - if ( TTY_INDEX(tty) != 0 ) - return; - - if ( tty->count == 1 ) - { - tty->closing = 1; - tty_wait_until_sent(tty, 0); - if ( DRV(tty->driver)->flush_buffer != NULL ) - DRV(tty->driver)->flush_buffer(tty); - if ( tty->ldisc.flush_buffer != NULL ) - tty->ldisc.flush_buffer(tty); - tty->closing = 0; - spin_lock_irqsave(&xencons_lock, flags); - xencons_tty = NULL; - spin_unlock_irqrestore(&xencons_lock, flags); - } -} - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) -static struct tty_operations xencons_ops = { - .open = xencons_open, - .close = xencons_close, - .write = xencons_write, - .write_room = xencons_write_room, - .put_char = xencons_put_char, - .flush_chars = xencons_flush_chars, - .chars_in_buffer = xencons_chars_in_buffer, - .send_xchar = xencons_send_xchar, - .flush_buffer = xencons_flush_buffer, - .throttle = xencons_throttle, - .unthrottle = xencons_unthrottle, - .wait_until_sent = xencons_wait_until_sent, -}; - -#ifdef CONFIG_XEN_PRIVILEGED_GUEST -static const char *xennullcon_startup(void) -{ - return NULL; -} - -static int xennullcon_dummy(void) -{ - return 0; -} - -#define DUMMY (void *)xennullcon_dummy - -/* - * The console `switch' structure for the dummy console - * - * Most of the operations are dummies. - */ - -const struct consw xennull_con = { - .owner = THIS_MODULE, - .con_startup = xennullcon_startup, - .con_init = DUMMY, - .con_deinit = DUMMY, - .con_clear = DUMMY, - .con_putc = DUMMY, - .con_putcs = DUMMY, - .con_cursor = DUMMY, - .con_scroll = DUMMY, - .con_bmove = DUMMY, - .con_switch = DUMMY, - .con_blank = DUMMY, - .con_font_set = DUMMY, - .con_font_get = DUMMY, - .con_font_default = DUMMY, - .con_font_copy = DUMMY, - .con_set_palette = DUMMY, - .con_scrolldelta = DUMMY, -}; -#endif -#endif - -static int __init xencons_init(void) -{ - int rc; - - if ( xc_mode == XC_OFF ) - return 0; - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - xencons_driver = alloc_tty_driver((xc_mode == XC_SERIAL) ? - 1 : MAX_NR_CONSOLES); - if ( xencons_driver == NULL ) - return -ENOMEM; -#else - memset(&xencons_driver, 0, sizeof(struct tty_driver)); - xencons_driver.magic = TTY_DRIVER_MAGIC; - xencons_driver.refcount = &xencons_refcount; - xencons_driver.table = xencons_table; - xencons_driver.num = (xc_mode == XC_SERIAL) ? 
1 : MAX_NR_CONSOLES; -#endif - - DRV(xencons_driver)->major = TTY_MAJOR; - DRV(xencons_driver)->type = TTY_DRIVER_TYPE_SERIAL; - DRV(xencons_driver)->subtype = SERIAL_TYPE_NORMAL; - DRV(xencons_driver)->init_termios = tty_std_termios; - DRV(xencons_driver)->flags = - TTY_DRIVER_REAL_RAW | TTY_DRIVER_RESET_TERMIOS | TTY_DRIVER_NO_DEVFS; - DRV(xencons_driver)->termios = xencons_termios; - DRV(xencons_driver)->termios_locked = xencons_termios_locked; - - if ( xc_mode == XC_SERIAL ) - { - DRV(xencons_driver)->name = "ttyS"; - DRV(xencons_driver)->minor_start = 64 + xc_num; - DRV(xencons_driver)->name_base = 0 + xc_num; - } - else - { - DRV(xencons_driver)->name = "tty"; - DRV(xencons_driver)->minor_start = xc_num; - DRV(xencons_driver)->name_base = xc_num; - } - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - tty_set_operations(xencons_driver, &xencons_ops); -#else - xencons_driver.open = xencons_open; - xencons_driver.close = xencons_close; - xencons_driver.write = xencons_write; - xencons_driver.write_room = xencons_write_room; - xencons_driver.put_char = xencons_put_char; - xencons_driver.flush_chars = xencons_flush_chars; - xencons_driver.chars_in_buffer = xencons_chars_in_buffer; - xencons_driver.send_xchar = xencons_send_xchar; - xencons_driver.flush_buffer = xencons_flush_buffer; - xencons_driver.throttle = xencons_throttle; - xencons_driver.unthrottle = xencons_unthrottle; - xencons_driver.wait_until_sent = xencons_wait_until_sent; -#endif - - if ( (rc = tty_register_driver(DRV(xencons_driver))) != 0 ) - { - printk("WARNING: Failed to register Xen virtual " - "console driver as '%s%d'\n", - DRV(xencons_driver)->name, DRV(xencons_driver)->name_base); -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - put_tty_driver(xencons_driver); - xencons_driver = NULL; -#endif - return rc; - } - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - tty_register_device(xencons_driver, 0, NULL); -#endif - - if ( xen_start_info.flags & SIF_INITDOMAIN ) - { - xencons_priv_irq = bind_virq_to_irq(VIRQ_CONSOLE); - (void)request_irq(xencons_priv_irq, - xencons_priv_interrupt, 0, "console", NULL); - } - else - { - (void)ctrl_if_register_receiver(CMSG_CONSOLE, xencons_rx, 0); - } - - printk("Xen virtual console successfully installed as %s%d\n", - DRV(xencons_driver)->name, - DRV(xencons_driver)->name_base ); - - return 0; -} - -module_init(xencons_init); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/evtchn/Makefile --- a/linux-2.6.11-xen-sparse/drivers/xen/evtchn/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,2 +0,0 @@ - -obj-y := evtchn.o diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/evtchn/evtchn.c --- a/linux-2.6.11-xen-sparse/drivers/xen/evtchn/evtchn.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,430 +0,0 @@ -/****************************************************************************** - * evtchn.c - * - * Xenolinux driver for receiving and demuxing event-channel signals. 
- * - * Copyright (c) 2004, K A Fraser - * Multi-process extensions Copyright (c) 2004, Steven Smith - * - * This file may be distributed separately from the Linux kernel, or - * incorporated into other software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include <linux/config.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/errno.h> -#include <linux/miscdevice.h> -#include <linux/major.h> -#include <linux/proc_fs.h> -#include <linux/stat.h> -#include <linux/poll.h> -#include <linux/irq.h> -#include <linux/init.h> -#define XEN_EVTCHN_MASK_OPS -#include <asm-xen/evtchn.h> - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) -#include <linux/devfs_fs_kernel.h> -#define OLD_DEVFS -#else -#include <linux/gfp.h> -#endif - -#ifdef OLD_DEVFS -/* NB. This must be shared amongst drivers if more things go in /dev/xen */ -static devfs_handle_t xen_dev_dir; -#endif - -struct per_user_data { - /* Notification ring, accessed via /dev/xen/evtchn. */ -# define EVTCHN_RING_SIZE 2048 /* 2048 16-bit entries */ -# define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1)) - u16 *ring; - unsigned int ring_cons, ring_prod, ring_overflow; - - /* Processes wait on this queue when ring is empty. */ - wait_queue_head_t evtchn_wait; - struct fasync_struct *evtchn_async_queue; -}; - -/* Who's bound to each port? 
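The notification ring declared in per_user_data above is filled by the upcall path below: ports are enqueued as 16-bit values, a sticky overflow flag is set when the ring is full, and the reader is woken only on the empty-to-non-empty transition. A minimal sketch of that discipline (names and the tiny ring size are hypothetical):

    #include <stdio.h>

    #define RING_SIZE 4u                     /* driver uses 2048 */
    #define RING_MASK(i) ((i) & (RING_SIZE - 1))

    static unsigned short ring[RING_SIZE];
    static unsigned int rcons, rprod;
    static int overflow, wakeups;

    static void upcall(unsigned short port)
    {
        if (rprod - rcons < RING_SIZE) {
            ring[RING_MASK(rprod)] = port;
            if (rcons == rprod++)
                wakeups++;                   /* reader was asleep: wake it */
        } else {
            overflow = 1;                    /* sticky until reader resets */
        }
    }

    int main(void)
    {
        for (unsigned short p = 1; p <= 6; p++)
            upcall(p);
        printf("wakeups=%d overflow=%d\n", wakeups, overflow); /* 1 1 */
        return 0;
    }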
*/ -static struct per_user_data *port_user[NR_EVENT_CHANNELS]; -static spinlock_t port_user_lock; - -void evtchn_device_upcall(int port) -{ - struct per_user_data *u; - - spin_lock(&port_user_lock); - - mask_evtchn(port); - clear_evtchn(port); - - if ( (u = port_user[port]) != NULL ) - { - if ( (u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE ) - { - u->ring[EVTCHN_RING_MASK(u->ring_prod)] = (u16)port; - if ( u->ring_cons == u->ring_prod++ ) - { - wake_up_interruptible(&u->evtchn_wait); - kill_fasync(&u->evtchn_async_queue, SIGIO, POLL_IN); - } - } - else - { - u->ring_overflow = 1; - } - } - - spin_unlock(&port_user_lock); -} - -static ssize_t evtchn_read(struct file *file, char *buf, - size_t count, loff_t *ppos) -{ - int rc; - unsigned int c, p, bytes1 = 0, bytes2 = 0; - DECLARE_WAITQUEUE(wait, current); - struct per_user_data *u = file->private_data; - - add_wait_queue(&u->evtchn_wait, &wait); - - count &= ~1; /* even number of bytes */ - - if ( count == 0 ) - { - rc = 0; - goto out; - } - - if ( count > PAGE_SIZE ) - count = PAGE_SIZE; - - for ( ; ; ) - { - set_current_state(TASK_INTERRUPTIBLE); - - if ( (c = u->ring_cons) != (p = u->ring_prod) ) - break; - - if ( u->ring_overflow ) - { - rc = -EFBIG; - goto out; - } - - if ( file->f_flags & O_NONBLOCK ) - { - rc = -EAGAIN; - goto out; - } - - if ( signal_pending(current) ) - { - rc = -ERESTARTSYS; - goto out; - } - - schedule(); - } - - /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. */ - if ( ((c ^ p) & EVTCHN_RING_SIZE) != 0 ) - { - bytes1 = (EVTCHN_RING_SIZE - EVTCHN_RING_MASK(c)) * sizeof(u16); - bytes2 = EVTCHN_RING_MASK(p) * sizeof(u16); - } - else - { - bytes1 = (p - c) * sizeof(u16); - bytes2 = 0; - } - - /* Truncate chunks according to caller's maximum byte count. */ - if ( bytes1 > count ) - { - bytes1 = count; - bytes2 = 0; - } - else if ( (bytes1 + bytes2) > count ) - { - bytes2 = count - bytes1; - } - - if ( copy_to_user(buf, &u->ring[EVTCHN_RING_MASK(c)], bytes1) || - ((bytes2 != 0) && copy_to_user(&buf[bytes1], &u->ring[0], bytes2)) ) - { - rc = -EFAULT; - goto out; - } - - u->ring_cons += (bytes1 + bytes2) / sizeof(u16); - - rc = bytes1 + bytes2; - - out: - __set_current_state(TASK_RUNNING); - remove_wait_queue(&u->evtchn_wait, &wait); - return rc; -} - -static ssize_t evtchn_write(struct file *file, const char *buf, - size_t count, loff_t *ppos) -{ - int rc, i; - u16 *kbuf = (u16 *)__get_free_page(GFP_KERNEL); - struct per_user_data *u = file->private_data; - - if ( kbuf == NULL ) - return -ENOMEM; - - count &= ~1; /* even number of bytes */ - - if ( count == 0 ) - { - rc = 0; - goto out; - } - - if ( count > PAGE_SIZE ) - count = PAGE_SIZE; - - if ( copy_from_user(kbuf, buf, count) != 0 ) - { - rc = -EFAULT; - goto out; - } - - spin_lock_irq(&port_user_lock); - for ( i = 0; i < (count/2); i++ ) - if ( (kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u) ) - unmask_evtchn(kbuf[i]); - spin_unlock_irq(&port_user_lock); - - rc = count; - - out: - free_page((unsigned long)kbuf); - return rc; -} - -static int evtchn_ioctl(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg) -{ - int rc = 0; - struct per_user_data *u = file->private_data; - - spin_lock_irq(&port_user_lock); - - switch ( cmd ) - { - case EVTCHN_RESET: - /* Initialise the ring to empty. Clear errors. 
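The two-chunk copy in evtchn_read() above is driven by one test: because the indices free-run, (c ^ p) & EVTCHN_RING_SIZE is non-zero exactly when consumer and producer are on different laps, i.e. when the valid data wraps around the end of the ring. A sketch of that split, counting ring entries rather than bytes (names hypothetical):

    #include <stdio.h>

    #define EVT_RING 8u                      /* driver uses 2048 */
    #define EVT_MASK(i) ((i) & (EVT_RING - 1))

    /* Split [c, p) into at most two contiguous chunks at the wrap. */
    static void split(unsigned int c, unsigned int p,
                      unsigned int *n1, unsigned int *n2)
    {
        if ((c ^ p) & EVT_RING) {            /* lap bit differs: data wraps */
            *n1 = EVT_RING - EVT_MASK(c);
            *n2 = EVT_MASK(p);
        } else {
            *n1 = p - c;
            *n2 = 0;
        }
    }

    int main(void)
    {
        unsigned int n1, n2;
        split(6, 10, &n1, &n2);              /* wraps: prints "2 2" */
        printf("%u %u\n", n1, n2);
        split(1, 3, &n1, &n2);               /* contiguous: prints "2 0" */
        printf("%u %u\n", n1, n2);
        return 0;
    }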
*/ - u->ring_cons = u->ring_prod = u->ring_overflow = 0; - break; - - case EVTCHN_BIND: - if ( arg >= NR_EVENT_CHANNELS ) - { - rc = -EINVAL; - } - else if ( port_user[arg] != NULL ) - { - rc = -EISCONN; - } - else - { - port_user[arg] = u; - unmask_evtchn(arg); - } - break; - - case EVTCHN_UNBIND: - if ( arg >= NR_EVENT_CHANNELS ) - { - rc = -EINVAL; - } - else if ( port_user[arg] != u ) - { - rc = -ENOTCONN; - } - else - { - port_user[arg] = NULL; - mask_evtchn(arg); - } - break; - - default: - rc = -ENOSYS; - break; - } - - spin_unlock_irq(&port_user_lock); - - return rc; -} - -static unsigned int evtchn_poll(struct file *file, poll_table *wait) -{ - unsigned int mask = POLLOUT | POLLWRNORM; - struct per_user_data *u = file->private_data; - - poll_wait(file, &u->evtchn_wait, wait); - if ( u->ring_cons != u->ring_prod ) - mask |= POLLIN | POLLRDNORM; - if ( u->ring_overflow ) - mask = POLLERR; - return mask; -} - -static int evtchn_fasync(int fd, struct file *filp, int on) -{ - struct per_user_data *u = filp->private_data; - return fasync_helper(fd, filp, on, &u->evtchn_async_queue); -} - -static int evtchn_open(struct inode *inode, struct file *filp) -{ - struct per_user_data *u; - - if ( (u = kmalloc(sizeof(*u), GFP_KERNEL)) == NULL ) - return -ENOMEM; - - memset(u, 0, sizeof(*u)); - init_waitqueue_head(&u->evtchn_wait); - - if ( (u->ring = (u16 *)__get_free_page(GFP_KERNEL)) == NULL ) - { - kfree(u); - return -ENOMEM; - } - - filp->private_data = u; - - return 0; -} - -static int evtchn_release(struct inode *inode, struct file *filp) -{ - int i; - struct per_user_data *u = filp->private_data; - - spin_lock_irq(&port_user_lock); - - free_page((unsigned long)u->ring); - - for ( i = 0; i < NR_EVENT_CHANNELS; i++ ) - { - if ( port_user[i] == u ) - { - port_user[i] = NULL; - mask_evtchn(i); - } - } - - spin_unlock_irq(&port_user_lock); - - return 0; -} - -static struct file_operations evtchn_fops = { - owner: THIS_MODULE, - read: evtchn_read, - write: evtchn_write, - ioctl: evtchn_ioctl, - poll: evtchn_poll, - fasync: evtchn_fasync, - open: evtchn_open, - release: evtchn_release -}; - -static struct miscdevice evtchn_miscdev = { - .minor = EVTCHN_MINOR, - .name = "evtchn", - .fops = &evtchn_fops, -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - .devfs_name = "misc/evtchn", -#endif -}; - -static int __init evtchn_init(void) -{ -#ifdef OLD_DEVFS - devfs_handle_t symlink_handle; - int pos; - char link_dest[64]; -#endif - int err; - - spin_lock_init(&port_user_lock); - memset(port_user, 0, sizeof(port_user)); - - /* (DEVFS) create '/dev/misc/evtchn'. */ - err = misc_register(&evtchn_miscdev); - if ( err != 0 ) - { - printk(KERN_ALERT "Could not register /dev/misc/evtchn\n"); - return err; - } - -#ifdef OLD_DEVFS - /* (DEVFS) create directory '/dev/xen'. */ - xen_dev_dir = devfs_mk_dir(NULL, "xen", NULL); - - /* (DEVFS) &link_dest[pos] == '../misc/evtchn'. */ - pos = devfs_generate_path(evtchn_miscdev.devfs_handle, - &link_dest[3], - sizeof(link_dest) - 3); - if ( pos >= 0 ) - strncpy(&link_dest[pos], "../", 3); - - /* (DEVFS) symlink '/dev/xen/evtchn' -> '../misc/evtchn'. */ - (void)devfs_mk_symlink(xen_dev_dir, - "evtchn", - DEVFS_FL_DEFAULT, - &link_dest[pos], - &symlink_handle, - NULL); - - /* (DEVFS) automatically destroy the symlink with its destination. 
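For orientation, a hypothetical user-space consumer of this device, pieced together from the semantics shown above: reads return an even number of bytes (an array of u16 port numbers), writing ports back unmasks them, and binding is done via ioctl. The device path matches the symlink created below, but the numeric ioctl code is a placeholder, since the real EVTCHN_BIND definition lives in a Xen header outside this patch:

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>

    #ifndef EVTCHN_BIND
    #define EVTCHN_BIND 1                    /* placeholder, NOT the real code */
    #endif

    int main(void)
    {
        unsigned short ports[64];
        ssize_t n;
        int fd = open("/dev/xen/evtchn", O_RDWR);
        if (fd < 0) { perror("open"); return 1; }

        if (ioctl(fd, EVTCHN_BIND, 5) < 0)   /* bind event-channel port 5 */
            perror("ioctl");

        n = read(fd, ports, sizeof(ports));  /* blocks for pending ports */
        if (n > 0)
            (void)write(fd, ports, (size_t)n); /* writing back re-unmasks */

        close(fd);
        return 0;
    }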
*/ - devfs_auto_unregister(evtchn_miscdev.devfs_handle, symlink_handle); -#endif - - printk("Event-channel device installed.\n"); - - return 0; -} - -static void evtchn_cleanup(void) -{ - misc_deregister(&evtchn_miscdev); -} - -module_init(evtchn_init); -module_exit(evtchn_cleanup); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/netback/Makefile --- a/linux-2.6.11-xen-sparse/drivers/xen/netback/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,2 +0,0 @@ - -obj-y := netback.o control.o interface.o loopback.o diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/netback/common.h --- a/linux-2.6.11-xen-sparse/drivers/xen/netback/common.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,103 +0,0 @@ -/****************************************************************************** - * arch/xen/drivers/netif/backend/common.h - */ - -#ifndef __NETIF__BACKEND__COMMON_H__ -#define __NETIF__BACKEND__COMMON_H__ - -#include <linux/config.h> -#include <linux/version.h> -#include <linux/module.h> -#include <linux/interrupt.h> -#include <linux/slab.h> -#include <linux/ip.h> -#include <linux/in.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <asm-xen/ctrl_if.h> -#include <asm-xen/xen-public/io/netif.h> -#include <asm/io.h> -#include <asm/pgalloc.h> - -#if 0 -#define ASSERT(_p) \ - if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \ - __LINE__, __FILE__); *(int*)0=0; } -#define DPRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \ - __FILE__ , __LINE__ , ## _a ) -#else -#define ASSERT(_p) ((void)0) -#define DPRINTK(_f, _a...) ((void)0) -#endif - -typedef struct netif_st { - /* Unique identifier for this interface. */ - domid_t domid; - unsigned int handle; - - u8 fe_dev_addr[6]; - - /* Physical parameters of the comms window. */ - unsigned long tx_shmem_frame; - unsigned long rx_shmem_frame; - unsigned int evtchn; - int irq; - - /* The shared rings and indexes. */ - netif_tx_interface_t *tx; - netif_rx_interface_t *rx; - - /* Private indexes into shared ring. */ - NETIF_RING_IDX rx_req_cons; - NETIF_RING_IDX rx_resp_prod; /* private version of shared variable */ - NETIF_RING_IDX tx_req_cons; - NETIF_RING_IDX tx_resp_prod; /* private version of shared variable */ - - /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */ - unsigned long credit_bytes; - unsigned long credit_usec; - unsigned long remaining_credit; - struct timer_list credit_timeout; - - /* Miscellaneous private stuff. */ - enum { DISCONNECTED, DISCONNECTING, CONNECTED } status; - int active; - /* - * DISCONNECT response is deferred until pending requests are ack'ed. - * We therefore need to store the id from the original request. 
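The deferral described above is driven by reference counting: every in-flight request holds a reference on the interface, and the disconnect completion runs only when the last one is dropped, as in the netif_get()/netif_put() macros defined just below. A sketch of the pattern, with a plain int standing in for atomic_t (names hypothetical):

    #include <stdio.h>

    struct iface {
        int refcnt;
        int disconnected;
    };

    static void iface_get(struct iface *i) { i->refcnt++; }

    static void iface_put(struct iface *i)
    {
        if (--i->refcnt == 0) {
            i->disconnected = 1;             /* disconnect_complete() point */
            printf("disconnect completed\n");
        }
    }

    int main(void)
    {
        struct iface nif = { 0, 0 };
        iface_get(&nif);                     /* request 1 in flight */
        iface_get(&nif);                     /* request 2 in flight */
        iface_put(&nif);                     /* ack 1: still connected */
        iface_put(&nif);                     /* ack 2: completion fires */
        return 0;
    }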
- */ - u8 disconnect_rspid; - struct netif_st *hash_next; - struct list_head list; /* scheduling list */ - atomic_t refcnt; - struct net_device *dev; - struct net_device_stats stats; - - struct work_struct work; -} netif_t; - -void netif_create(netif_be_create_t *create); -void netif_destroy(netif_be_destroy_t *destroy); -void netif_creditlimit(netif_be_creditlimit_t *creditlimit); -void netif_connect(netif_be_connect_t *connect); -int netif_disconnect(netif_be_disconnect_t *disconnect, u8 rsp_id); -void netif_disconnect_complete(netif_t *netif); -netif_t *netif_find_by_handle(domid_t domid, unsigned int handle); -#define netif_get(_b) (atomic_inc(&(_b)->refcnt)) -#define netif_put(_b) \ - do { \ - if ( atomic_dec_and_test(&(_b)->refcnt) ) \ - netif_disconnect_complete(_b); \ - } while (0) - -void netif_interface_init(void); -void netif_ctrlif_init(void); - -void netif_schedule_work(netif_t *netif); -void netif_deschedule_work(netif_t *netif); - -int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev); -struct net_device_stats *netif_be_get_stats(struct net_device *dev); -irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs); - -#endif /* __NETIF__BACKEND__COMMON_H__ */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/netback/control.c --- a/linux-2.6.11-xen-sparse/drivers/xen/netback/control.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,58 +0,0 @@ -/****************************************************************************** - * arch/xen/drivers/netif/backend/control.c - * - * Routines for interfacing with the control plane. - * - * Copyright (c) 2004, Keir Fraser - */ - -#include "common.h" - -static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) -{ - DPRINTK("Received netif backend message, subtype=%d\n", msg->subtype); - - switch ( msg->subtype ) - { - case CMSG_NETIF_BE_CREATE: - netif_create((netif_be_create_t *)&msg->msg[0]); - break; - case CMSG_NETIF_BE_DESTROY: - netif_destroy((netif_be_destroy_t *)&msg->msg[0]); - break; - case CMSG_NETIF_BE_CREDITLIMIT: - netif_creditlimit((netif_be_creditlimit_t *)&msg->msg[0]); - break; - case CMSG_NETIF_BE_CONNECT: - netif_connect((netif_be_connect_t *)&msg->msg[0]); - break; - case CMSG_NETIF_BE_DISCONNECT: - if ( !netif_disconnect((netif_be_disconnect_t *)&msg->msg[0],msg->id) ) - return; /* Sending the response is deferred until later. */ - break; - default: - DPRINTK("Parse error while reading message subtype %d, len %d\n", - msg->subtype, msg->length); - msg->length = 0; - break; - } - - ctrl_if_send_response(msg); -} - -void netif_ctrlif_init(void) -{ - ctrl_msg_t cmsg; - netif_be_driver_status_t st; - - (void)ctrl_if_register_receiver(CMSG_NETIF_BE, netif_ctrlif_rx, - CALLBACK_IN_BLOCKING_CONTEXT); - - /* Send a driver-UP notification to the domain controller. */ - cmsg.type = CMSG_NETIF_BE; - cmsg.subtype = CMSG_NETIF_BE_DRIVER_STATUS; - cmsg.length = sizeof(netif_be_driver_status_t); - st.status = NETIF_DRIVER_STATUS_UP; - memcpy(cmsg.msg, &st, sizeof(st)); - ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/netback/interface.c --- a/linux-2.6.11-xen-sparse/drivers/xen/netback/interface.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,381 +0,0 @@ -/****************************************************************************** - * arch/xen/drivers/netif/backend/interface.c - * - * Network-device interface management. 
- * - * Copyright (c) 2004-2005, Keir Fraser - */ - -#include "common.h" -#include <linux/rtnetlink.h> - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) -#define VMALLOC_VMADDR(x) ((unsigned long)(x)) -#endif - -#define NETIF_HASHSZ 1024 -#define NETIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(NETIF_HASHSZ-1)) - -static netif_t *netif_hash[NETIF_HASHSZ]; - -netif_t *netif_find_by_handle(domid_t domid, unsigned int handle) -{ - netif_t *netif = netif_hash[NETIF_HASH(domid, handle)]; - while ( (netif != NULL) && - ((netif->domid != domid) || (netif->handle != handle)) ) - netif = netif->hash_next; - return netif; -} - -static void __netif_up(netif_t *netif) -{ - struct net_device *dev = netif->dev; - spin_lock_bh(&dev->xmit_lock); - netif->active = 1; - spin_unlock_bh(&dev->xmit_lock); - (void)request_irq(netif->irq, netif_be_int, 0, dev->name, netif); - netif_schedule_work(netif); -} - -static void __netif_down(netif_t *netif) -{ - struct net_device *dev = netif->dev; - spin_lock_bh(&dev->xmit_lock); - netif->active = 0; - spin_unlock_bh(&dev->xmit_lock); - free_irq(netif->irq, netif); - netif_deschedule_work(netif); -} - -static int net_open(struct net_device *dev) -{ - netif_t *netif = netdev_priv(dev); - if ( netif->status == CONNECTED ) - __netif_up(netif); - netif_start_queue(dev); - return 0; -} - -static int net_close(struct net_device *dev) -{ - netif_t *netif = netdev_priv(dev); - netif_stop_queue(dev); - if ( netif->status == CONNECTED ) - __netif_down(netif); - return 0; -} - -static void __netif_disconnect_complete(void *arg) -{ - netif_t *netif = (netif_t *)arg; - ctrl_msg_t cmsg; - netif_be_disconnect_t disc; - - /* - * These can't be done in netif_disconnect() because at that point there - * may be outstanding requests in the network stack whose asynchronous - * responses must still be notified to the remote driver. - */ - unbind_evtchn_from_irq(netif->evtchn); - vfree(netif->tx); /* Frees netif->rx as well. */ - - /* Construct the deferred response message. */ - cmsg.type = CMSG_NETIF_BE; - cmsg.subtype = CMSG_NETIF_BE_DISCONNECT; - cmsg.id = netif->disconnect_rspid; - cmsg.length = sizeof(netif_be_disconnect_t); - disc.domid = netif->domid; - disc.netif_handle = netif->handle; - disc.status = NETIF_BE_STATUS_OKAY; - memcpy(cmsg.msg, &disc, sizeof(disc)); - - /* - * Make sure message is constructed /before/ status change, because - * after the status change the 'netif' structure could be deallocated at - * any time. Also make sure we send the response /after/ status change, - * as otherwise a subsequent CONNECT request could spuriously fail if - * another CPU doesn't see the status change yet. - */ - mb(); - if ( netif->status != DISCONNECTING ) - BUG(); - netif->status = DISCONNECTED; - mb(); - - /* Send the successful response. 
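The lookup pattern of netif_find_by_handle() above, isolated: hash the (domid, handle) pair to a bucket with an XOR-and-mask, then walk the singly linked chain until both keys match. A stand-alone sketch with a shrunken table (names hypothetical):

    #include <stdio.h>

    #define HASHSZ 16u                       /* driver uses 1024 */
    #define HASH(d, h) (((unsigned)(d) ^ (unsigned)(h)) & (HASHSZ - 1))

    struct entry {
        unsigned domid, handle;
        struct entry *next;                  /* like netif->hash_next */
    };

    static struct entry *buckets[HASHSZ];

    static struct entry *find(unsigned domid, unsigned handle)
    {
        struct entry *e = buckets[HASH(domid, handle)];
        while (e && (e->domid != domid || e->handle != handle))
            e = e->next;
        return e;
    }

    int main(void)
    {
        static struct entry vif = { 3, 0, NULL };
        buckets[HASH(3, 0)] = &vif;          /* chain insert at bucket head */
        printf("%s\n", find(3, 0) ? "found" : "missing");
        return 0;
    }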
*/ - ctrl_if_send_response(&cmsg); -} - -void netif_disconnect_complete(netif_t *netif) -{ - INIT_WORK(&netif->work, __netif_disconnect_complete, (void *)netif); - schedule_work(&netif->work); -} - -void netif_create(netif_be_create_t *create) -{ - int err = 0; - domid_t domid = create->domid; - unsigned int handle = create->netif_handle; - struct net_device *dev; - netif_t **pnetif, *netif; - char name[IFNAMSIZ] = {}; - - snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle); - dev = alloc_netdev(sizeof(netif_t), name, ether_setup); - if ( dev == NULL ) - { - DPRINTK("Could not create netif: out of memory\n"); - create->status = NETIF_BE_STATUS_OUT_OF_MEMORY; - return; - } - - netif = netdev_priv(dev); - memset(netif, 0, sizeof(*netif)); - netif->domid = domid; - netif->handle = handle; - netif->status = DISCONNECTED; - atomic_set(&netif->refcnt, 0); - netif->dev = dev; - - netif->credit_bytes = netif->remaining_credit = ~0UL; - netif->credit_usec = 0UL; - init_timer(&netif->credit_timeout); - - pnetif = &netif_hash[NETIF_HASH(domid, handle)]; - while ( *pnetif != NULL ) - { - if ( ((*pnetif)->domid == domid) && ((*pnetif)->handle == handle) ) - { - DPRINTK("Could not create netif: already exists\n"); - create->status = NETIF_BE_STATUS_INTERFACE_EXISTS; - free_netdev(dev); - return; - } - pnetif = &(*pnetif)->hash_next; - } - - dev->hard_start_xmit = netif_be_start_xmit; - dev->get_stats = netif_be_get_stats; - dev->open = net_open; - dev->stop = net_close; - dev->features = NETIF_F_NO_CSUM; - - /* Disable queuing. */ - dev->tx_queue_len = 0; - - if ( (create->be_mac[0] == 0) && (create->be_mac[1] == 0) && - (create->be_mac[2] == 0) && (create->be_mac[3] == 0) && - (create->be_mac[4] == 0) && (create->be_mac[5] == 0) ) - { - /* - * Initialise a dummy MAC address. We choose the numerically largest - * non-broadcast address to prevent the address getting stolen by an - * Ethernet bridge for STP purposes. 
(FE:FF:FF:FF:FF:FF) - */ - memset(dev->dev_addr, 0xFF, ETH_ALEN); - dev->dev_addr[0] &= ~0x01; - } - else - { - memcpy(dev->dev_addr, create->be_mac, ETH_ALEN); - } - - memcpy(netif->fe_dev_addr, create->mac, ETH_ALEN); - - rtnl_lock(); - err = register_netdevice(dev); - rtnl_unlock(); - - if ( err != 0 ) - { - DPRINTK("Could not register new net device %s: err=%d\n", - dev->name, err); - create->status = NETIF_BE_STATUS_OUT_OF_MEMORY; - free_netdev(dev); - return; - } - - netif->hash_next = *pnetif; - *pnetif = netif; - - DPRINTK("Successfully created netif\n"); - create->status = NETIF_BE_STATUS_OKAY; -} - -void netif_destroy(netif_be_destroy_t *destroy) -{ - domid_t domid = destroy->domid; - unsigned int handle = destroy->netif_handle; - netif_t **pnetif, *netif; - - pnetif = &netif_hash[NETIF_HASH(domid, handle)]; - while ( (netif = *pnetif) != NULL ) - { - if ( (netif->domid == domid) && (netif->handle == handle) ) - { - if ( netif->status != DISCONNECTED ) - goto still_connected; - goto destroy; - } - pnetif = &netif->hash_next; - } - - destroy->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND; - return; - - still_connected: - destroy->status = NETIF_BE_STATUS_INTERFACE_CONNECTED; - return; - - destroy: - *pnetif = netif->hash_next; - unregister_netdev(netif->dev); - free_netdev(netif->dev); - destroy->status = NETIF_BE_STATUS_OKAY; -} - -void netif_creditlimit(netif_be_creditlimit_t *creditlimit) -{ - domid_t domid = creditlimit->domid; - unsigned int handle = creditlimit->netif_handle; - netif_t *netif; - - netif = netif_find_by_handle(domid, handle); - if ( unlikely(netif == NULL) ) - { - DPRINTK("netif_creditlimit attempted for non-existent netif" - " (%u,%u)\n", creditlimit->domid, creditlimit->netif_handle); - creditlimit->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND; - return; - } - - /* Set the credit limit (reset remaining credit to new limit). */ - netif->credit_bytes = netif->remaining_credit = creditlimit->credit_bytes; - netif->credit_usec = creditlimit->period_usec; - - if ( netif->status == CONNECTED ) - { - /* - * Schedule work so that any packets waiting under previous credit - * limit are dealt with (acts like a replenishment point). 
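The shaping scheme behind credit_bytes/credit_usec/remaining_credit, sketched: a packet may be sent only while the remaining credit covers it, and the credit is reset to the full limit once per period, the timer acting as the replenishment point mentioned above. This is a sketch only; the real consume/replenish code lives in netback.c, not in this hunk:

    #include <stdio.h>

    struct shaper {
        unsigned long credit_bytes;          /* budget per period */
        unsigned long remaining;
    };

    static void replenish(struct shaper *s)  /* credit_timeout callback point */
    {
        s->remaining = s->credit_bytes;
    }

    static int may_send(struct shaper *s, unsigned long len)
    {
        if (len > s->remaining)
            return 0;                        /* hold until replenishment */
        s->remaining -= len;
        return 1;
    }

    int main(void)
    {
        struct shaper s = { 1500, 1500 };
        printf("%d\n", may_send(&s, 1000));  /* 1: within credit */
        printf("%d\n", may_send(&s, 1000));  /* 0: credit exhausted */
        replenish(&s);
        printf("%d\n", may_send(&s, 1000));  /* 1: new period */
        return 0;
    }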
- */ - netif->credit_timeout.expires = jiffies; - netif_schedule_work(netif); - } - - creditlimit->status = NETIF_BE_STATUS_OKAY; -} - -void netif_connect(netif_be_connect_t *connect) -{ - domid_t domid = connect->domid; - unsigned int handle = connect->netif_handle; - unsigned int evtchn = connect->evtchn; - unsigned long tx_shmem_frame = connect->tx_shmem_frame; - unsigned long rx_shmem_frame = connect->rx_shmem_frame; - struct vm_struct *vma; - pgprot_t prot; - int error; - netif_t *netif; - - netif = netif_find_by_handle(domid, handle); - if ( unlikely(netif == NULL) ) - { - DPRINTK("netif_connect attempted for non-existent netif (%u,%u)\n", - connect->domid, connect->netif_handle); - connect->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND; - return; - } - - if ( netif->status != DISCONNECTED ) - { - connect->status = NETIF_BE_STATUS_INTERFACE_CONNECTED; - return; - } - - if ( (vma = get_vm_area(2*PAGE_SIZE, VM_IOREMAP)) == NULL ) - { - connect->status = NETIF_BE_STATUS_OUT_OF_MEMORY; - return; - } - - prot = __pgprot(_KERNPG_TABLE); - error = direct_remap_area_pages(&init_mm, - VMALLOC_VMADDR(vma->addr), - tx_shmem_frame<<PAGE_SHIFT, PAGE_SIZE, - prot, domid); - error |= direct_remap_area_pages(&init_mm, - VMALLOC_VMADDR(vma->addr) + PAGE_SIZE, - rx_shmem_frame<<PAGE_SHIFT, PAGE_SIZE, - prot, domid); - if ( error != 0 ) - { - if ( error == -ENOMEM ) - connect->status = NETIF_BE_STATUS_OUT_OF_MEMORY; - else if ( error == -EFAULT ) - connect->status = NETIF_BE_STATUS_MAPPING_ERROR; - else - connect->status = NETIF_BE_STATUS_ERROR; - vfree(vma->addr); - return; - } - - netif->evtchn = evtchn; - netif->irq = bind_evtchn_to_irq(evtchn); - netif->tx_shmem_frame = tx_shmem_frame; - netif->rx_shmem_frame = rx_shmem_frame; - netif->tx = - (netif_tx_interface_t *)vma->addr; - netif->rx = - (netif_rx_interface_t *)((char *)vma->addr + PAGE_SIZE); - netif->tx->resp_prod = netif->rx->resp_prod = 0; - netif_get(netif); - wmb(); /* Other CPUs see new state before interface is started. */ - - rtnl_lock(); - netif->status = CONNECTED; - wmb(); - if ( netif_running(netif->dev) ) - __netif_up(netif); - rtnl_unlock(); - - connect->status = NETIF_BE_STATUS_OKAY; -} - -int netif_disconnect(netif_be_disconnect_t *disconnect, u8 rsp_id) -{ - domid_t domid = disconnect->domid; - unsigned int handle = disconnect->netif_handle; - netif_t *netif; - - netif = netif_find_by_handle(domid, handle); - if ( unlikely(netif == NULL) ) - { - DPRINTK("netif_disconnect attempted for non-existent netif" - " (%u,%u)\n", disconnect->domid, disconnect->netif_handle); - disconnect->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND; - return 1; /* Caller will send response error message. */ - } - - if ( netif->status == CONNECTED ) - { - rtnl_lock(); - netif->status = DISCONNECTING; - netif->disconnect_rspid = rsp_id; - wmb(); - if ( netif_running(netif->dev) ) - __netif_down(netif); - rtnl_unlock(); - netif_put(netif); - return 0; /* Caller should not send response message. 
*/ - } - - disconnect->status = NETIF_BE_STATUS_OKAY; - return 1; -} - -void netif_interface_init(void) -{ - memset(netif_hash, 0, sizeof(netif_hash)); -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/netback/loopback.c --- a/linux-2.6.11-xen-sparse/drivers/xen/netback/loopback.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,164 +0,0 @@ -/****************************************************************************** - * netback/loopback.c - * - * A two-interface loopback device to emulate a local netfront-netback - * connection. This ensures that local packet delivery looks identical - * to inter-domain delivery. Most importantly, packets delivered locally - * originating from other domains will get *copied* when they traverse this - * driver. This prevents unbounded delays in socket-buffer queues from - * causing the netback driver to "seize up". - * - * This driver creates a symmetric pair of loopback interfaces with names - * vif0.0 and veth0. The intention is that 'vif0.0' is bound to an Ethernet - * bridge, just like a proper netback interface, while a local IP interface - * is configured on 'veth0'. - * - * As with a real netback interface, vif0.0 is configured with a suitable - * dummy MAC address. No default is provided for veth0: a reasonable strategy - * is to transfer eth0's MAC address to veth0, and give eth0 a dummy address - * (to avoid confusing the Etherbridge). - * - * Copyright (c) 2005 K A Fraser - */ - -#include <linux/config.h> -#include <linux/module.h> -#include <linux/netdevice.h> -#include <linux/inetdevice.h> -#include <linux/etherdevice.h> -#include <linux/skbuff.h> -#include <net/dst.h> - -struct net_private { - struct net_device *loopback_dev; - struct net_device_stats stats; -}; - -static int loopback_open(struct net_device *dev) -{ - struct net_private *np = netdev_priv(dev); - memset(&np->stats, 0, sizeof(np->stats)); - netif_start_queue(dev); - return 0; -} - -static int loopback_close(struct net_device *dev) -{ - netif_stop_queue(dev); - return 0; -} - -static int loopback_start_xmit(struct sk_buff *skb, struct net_device *dev) -{ - struct net_private *np = netdev_priv(dev); - - dst_release(skb->dst); - skb->dst = NULL; - - skb_orphan(skb); - - np->stats.tx_bytes += skb->len; - np->stats.tx_packets++; - - /* Switch to loopback context. */ - dev = np->loopback_dev; - np = netdev_priv(dev); - - np->stats.rx_bytes += skb->len; - np->stats.rx_packets++; - - if ( skb->ip_summed == CHECKSUM_HW ) - { - /* Defer checksum calculation. */ - skb->proto_csum_blank = 1; - /* Must be a local packet: assert its integrity. */ - skb->proto_csum_valid = 1; - } - - skb->ip_summed = skb->proto_csum_valid ? - CHECKSUM_UNNECESSARY : CHECKSUM_NONE; - - skb->pkt_type = PACKET_HOST; /* overridden by eth_type_trans() */ - skb->protocol = eth_type_trans(skb, dev); - skb->dev = dev; - dev->last_rx = jiffies; - netif_rx(skb); - - return 0; -} - -static struct net_device_stats *loopback_get_stats(struct net_device *dev) -{ - struct net_private *np = netdev_priv(dev); - return &np->stats; -} - -static void loopback_construct(struct net_device *dev, struct net_device *lo) -{ - struct net_private *np = netdev_priv(dev); - - np->loopback_dev = lo; - - dev->open = loopback_open; - dev->stop = loopback_close; - dev->hard_start_xmit = loopback_start_xmit; - dev->get_stats = loopback_get_stats; - - dev->tx_queue_len = 0; - - dev->features = NETIF_F_HIGHDMA | NETIF_F_LLTX; - - /* - * We do not set a jumbo MTU on the interface. 
Otherwise the network - * stack will try to send large packets that will get dropped by the - * Ethernet bridge (unless the physical Ethernet interface is configured - * to transfer jumbo packets). If a larger MTU is desired then the system - * administrator can specify it using the 'ifconfig' command. - */ - /*dev->mtu = 16*1024;*/ -} - -static int __init loopback_init(void) -{ - struct net_device *dev1, *dev2; - int err = -ENOMEM; - - dev1 = alloc_netdev(sizeof(struct net_private), "vif0.0", ether_setup); - dev2 = alloc_netdev(sizeof(struct net_private), "veth0", ether_setup); - if ( (dev1 == NULL) || (dev2 == NULL) ) - goto fail; - - loopback_construct(dev1, dev2); - loopback_construct(dev2, dev1); - - dev1->features |= NETIF_F_NO_CSUM; - dev2->features |= NETIF_F_IP_CSUM; - - /* - * Initialise a dummy MAC address for the 'dummy backend' interface. We - * choose the numerically largest non-broadcast address to prevent the - * address getting stolen by an Ethernet bridge for STP purposes. - */ - memset(dev1->dev_addr, 0xFF, ETH_ALEN); - dev1->dev_addr[0] &= ~0x01; - - if ( (err = register_netdev(dev1)) != 0 ) - goto fail; - - if ( (err = register_netdev(dev2)) != 0 ) - { - unregister_netdev(dev1); - goto fail; - } - - return 0; - - fail: - if ( dev1 != NULL ) - kfree(dev1); - if ( dev2 != NULL ) - kfree(dev2); - return err; -} - -module_init(loopback_init); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/netback/netback.c --- a/linux-2.6.11-xen-sparse/drivers/xen/netback/netback.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,823 +0,0 @@ -/****************************************************************************** - * drivers/xen/netback/netback.c - * - * Back-end of the driver for virtual network devices. This portion of the - * driver exports a 'unified' network-device interface that can be accessed - * by any operating system that implements a compatible front end. A - * reference front-end implementation can be found in: - * drivers/xen/netfront/netfront.c - * - * Copyright (c) 2002-2005, K A Fraser - */ - -#include "common.h" -#include <asm-xen/balloon.h> -#include <asm-xen/evtchn.h> - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) -#include <linux/delay.h> -#endif - -static void netif_idx_release(u16 pending_idx); -static void netif_page_release(struct page *page); -static void make_tx_response(netif_t *netif, - u16 id, - s8 st); -static int make_rx_response(netif_t *netif, - u16 id, - s8 st, - memory_t addr, - u16 size, - u16 csum_valid); - -static void net_tx_action(unsigned long unused); -static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0); - -static void net_rx_action(unsigned long unused); -static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0); - -static struct timer_list net_timer; - -static struct sk_buff_head rx_queue; -static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE*2+1]; -static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE]; -static struct mmuext_op rx_mmuext[NETIF_RX_RING_SIZE]; -static unsigned char rx_notify[NR_EVENT_CHANNELS]; - -/* Don't currently gate addition of an interface to the tx scheduling list. 
*/ -#define tx_work_exists(_if) (1) - -#define MAX_PENDING_REQS 256 -static unsigned long mmap_vstart; -#define MMAP_VADDR(_req) (mmap_vstart + ((_req) * PAGE_SIZE)) - -#define PKT_PROT_LEN 64 - -static struct { - netif_tx_request_t req; - netif_t *netif; -} pending_tx_info[MAX_PENDING_REQS]; -static u16 pending_ring[MAX_PENDING_REQS]; -typedef unsigned int PEND_RING_IDX; -#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1)) -static PEND_RING_IDX pending_prod, pending_cons; -#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons) - -/* Freed TX SKBs get batched on this ring before return to pending_ring. */ -static u16 dealloc_ring[MAX_PENDING_REQS]; -static PEND_RING_IDX dealloc_prod, dealloc_cons; - -static struct sk_buff_head tx_queue; -static multicall_entry_t tx_mcl[MAX_PENDING_REQS]; - -static struct list_head net_schedule_list; -static spinlock_t net_schedule_list_lock; - -#define MAX_MFN_ALLOC 64 -static unsigned long mfn_list[MAX_MFN_ALLOC]; -static unsigned int alloc_index = 0; -static spinlock_t mfn_lock = SPIN_LOCK_UNLOCKED; - -static unsigned long alloc_mfn(void) -{ - unsigned long mfn = 0, flags; - spin_lock_irqsave(&mfn_lock, flags); - if ( unlikely(alloc_index == 0) ) - alloc_index = HYPERVISOR_dom_mem_op( - MEMOP_increase_reservation, mfn_list, MAX_MFN_ALLOC, 0); - if ( alloc_index != 0 ) - mfn = mfn_list[--alloc_index]; - spin_unlock_irqrestore(&mfn_lock, flags); - return mfn; -} - -static void free_mfn(unsigned long mfn) -{ - unsigned long flags; - spin_lock_irqsave(&mfn_lock, flags); - if ( alloc_index != MAX_MFN_ALLOC ) - mfn_list[alloc_index++] = mfn; - else if ( HYPERVISOR_dom_mem_op(MEMOP_decrease_reservation, - &mfn, 1, 0) != 1 ) - BUG(); - spin_unlock_irqrestore(&mfn_lock, flags); -} - -static inline void maybe_schedule_tx_action(void) -{ - smp_mb(); - if ( (NR_PENDING_REQS < (MAX_PENDING_REQS/2)) && - !list_empty(&net_schedule_list) ) - tasklet_schedule(&net_tx_tasklet); -} - -/* - * A gross way of confirming the origin of an skb data page. The slab - * allocator abuses a field in the page struct to cache the kmem_cache_t ptr. - */ -static inline int is_xen_skb(struct sk_buff *skb) -{ - extern kmem_cache_t *skbuff_cachep; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - kmem_cache_t *cp = (kmem_cache_t *)virt_to_page(skb->head)->lru.next; -#else - kmem_cache_t *cp = (kmem_cache_t *)virt_to_page(skb->head)->list.next; -#endif - return (cp == skbuff_cachep); -} - -int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev) -{ - netif_t *netif = netdev_priv(dev); - - ASSERT(skb->dev == dev); - - /* Drop the packet if the target domain has no receive buffers. */ - if ( !netif->active || - (netif->rx_req_cons == netif->rx->req_prod) || - ((netif->rx_req_cons-netif->rx_resp_prod) == NETIF_RX_RING_SIZE) ) - goto drop; - - /* - * We do not copy the packet unless: - * 1. The data is shared; or - * 2. The data is not allocated from our special cache. - * NB. We also couldn't cope with fragmented packets, but we won't get - * any because we not advertise the NETIF_F_SG feature. 
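
[Editorial note: the pending ring declared above is a power-of-two free-slot ring. The producer and consumer counters only ever increase, MASK_PEND_IDX() folds them into the array, and NR_PENDING_REQS stays correct across counter wrap because the expression is evaluated in unsigned arithmetic. A standalone, lock-free userspace sketch of that bookkeeping:]

    #include <assert.h>
    #include <stdio.h>

    #define MAX_PENDING_REQS 256                    /* power of two */
    #define MASK_PEND_IDX(i) ((i) & (MAX_PENDING_REQS - 1))

    static unsigned short pending_ring[MAX_PENDING_REQS];
    static unsigned int pending_prod, pending_cons;

    /* In-flight count; correct across wrap thanks to unsigned math. */
    #define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)

    static unsigned int get_slot(void)
    {
        assert(NR_PENDING_REQS < MAX_PENDING_REQS); /* a free slot exists */
        return pending_ring[MASK_PEND_IDX(pending_cons++)];
    }

    static void put_slot(unsigned int idx)
    {
        pending_ring[MASK_PEND_IDX(pending_prod++)] = idx;
    }

    int main(void)
    {
        unsigned int i, s;

        /* Every slot starts free, as netback_init() sets things up. */
        pending_cons = 0;
        pending_prod = MAX_PENDING_REQS;
        for (i = 0; i < MAX_PENDING_REQS; i++)
            pending_ring[i] = i;

        s = get_slot();
        printf("slot %u, in flight %u\n", s, NR_PENDING_REQS);
        put_slot(s);
        printf("in flight %u\n", NR_PENDING_REQS);
        return 0;
    }
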
- */ - if ( skb_shared(skb) || skb_cloned(skb) || !is_xen_skb(skb) ) - { - int hlen = skb->data - skb->head; - struct sk_buff *nskb = dev_alloc_skb(hlen + skb->len); - if ( unlikely(nskb == NULL) ) - goto drop; - skb_reserve(nskb, hlen); - __skb_put(nskb, skb->len); - if (skb_copy_bits(skb, -hlen, nskb->data - hlen, skb->len + hlen)) - BUG(); - nskb->dev = skb->dev; - nskb->proto_csum_valid = skb->proto_csum_valid; - dev_kfree_skb(skb); - skb = nskb; - } - - netif->rx_req_cons++; - netif_get(netif); - - skb_queue_tail(&rx_queue, skb); - tasklet_schedule(&net_rx_tasklet); - - return 0; - - drop: - netif->stats.tx_dropped++; - dev_kfree_skb(skb); - return 0; -} - -#if 0 -static void xen_network_done_notify(void) -{ - static struct net_device *eth0_dev = NULL; - if ( unlikely(eth0_dev == NULL) ) - eth0_dev = __dev_get_by_name("eth0"); - netif_rx_schedule(eth0_dev); -} -/* - * Add following to poll() function in NAPI driver (Tigon3 is example): - * if ( xen_network_done() ) - * tg3_enable_ints(tp); - */ -int xen_network_done(void) -{ - return skb_queue_empty(&rx_queue); -} -#endif - -static void net_rx_action(unsigned long unused) -{ - netif_t *netif; - s8 status; - u16 size, id, evtchn; - multicall_entry_t *mcl; - mmu_update_t *mmu; - struct mmuext_op *mmuext; - unsigned long vdata, mdata, new_mfn; - struct sk_buff_head rxq; - struct sk_buff *skb; - u16 notify_list[NETIF_RX_RING_SIZE]; - int notify_nr = 0; - - skb_queue_head_init(&rxq); - - mcl = rx_mcl; - mmu = rx_mmu; - mmuext = rx_mmuext; - while ( (skb = skb_dequeue(&rx_queue)) != NULL ) - { - netif = netdev_priv(skb->dev); - vdata = (unsigned long)skb->data; - mdata = virt_to_machine(vdata); - - /* Memory squeeze? Back off for an arbitrary while. */ - if ( (new_mfn = alloc_mfn()) == 0 ) - { - if ( net_ratelimit() ) - printk(KERN_WARNING "Memory squeeze in netback driver.\n"); - mod_timer(&net_timer, jiffies + HZ); - skb_queue_head(&rx_queue, skb); - break; - } - - /* - * Set the new P2M table entry before reassigning the old data page. - * Heed the comment in pgtable-2level.h:pte_page(). :-) - */ - phys_to_machine_mapping[__pa(skb->data) >> PAGE_SHIFT] = new_mfn; - - MULTI_update_va_mapping(mcl, vdata, - pfn_pte_ma(new_mfn, PAGE_KERNEL), 0); - mcl++; - - mcl->op = __HYPERVISOR_mmuext_op; - mcl->args[0] = (unsigned long)mmuext; - mcl->args[1] = 1; - mcl->args[2] = 0; - mcl->args[3] = netif->domid; - mcl++; - - mmuext->cmd = MMUEXT_REASSIGN_PAGE; - mmuext->mfn = mdata >> PAGE_SHIFT; - mmuext++; - - mmu->ptr = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; - mmu->val = __pa(vdata) >> PAGE_SHIFT; - mmu++; - - __skb_queue_tail(&rxq, skb); - - /* Filled the batch queue? */ - if ( (mcl - rx_mcl) == ARRAY_SIZE(rx_mcl) ) - break; - } - - if ( mcl == rx_mcl ) - return; - - mcl->op = __HYPERVISOR_mmu_update; - mcl->args[0] = (unsigned long)rx_mmu; - mcl->args[1] = mmu - rx_mmu; - mcl->args[2] = 0; - mcl->args[3] = DOMID_SELF; - mcl++; - - mcl[-3].args[2] = UVMF_TLB_FLUSH|UVMF_ALL; - if ( unlikely(HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl) != 0) ) - BUG(); - - mcl = rx_mcl; - mmuext = rx_mmuext; - while ( (skb = __skb_dequeue(&rxq)) != NULL ) - { - netif = netdev_priv(skb->dev); - size = skb->tail - skb->data; - - /* Rederive the machine addresses. 
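
[Editorial note: net_rx_action() above accumulates per-packet operations into the rx_mcl array and issues them as one hypercall, stopping early when the batch array fills and draining the partial batch at the end. The shape of that batch-and-flush pattern, with a hypothetical flush() standing in for HYPERVISOR_multicall():]

    #include <stddef.h>

    #define BATCH_SIZE 8

    struct op { int arg; };             /* stand-in for multicall_entry_t */

    static struct op batch[BATCH_SIZE];
    static size_t batch_n;

    /* Hypothetical sink standing in for HYPERVISOR_multicall(). */
    static void flush(struct op *ops, size_t n) { (void)ops; (void)n; }

    /* Queue one op; flush when full ("Filled the batch queue?"). */
    static void queue_op(int arg)
    {
        batch[batch_n].arg = arg;
        if (++batch_n == BATCH_SIZE) {
            flush(batch, batch_n);
            batch_n = 0;
        }
    }

    int main(void)
    {
        int i;
        for (i = 0; i < 20; i++)        /* triggers two full flushes */
            queue_op(i);
        flush(batch, batch_n);          /* drain the partial batch */
        batch_n = 0;
        return 0;
    }
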
*/ - new_mfn = mcl[0].args[1] >> PAGE_SHIFT; - mdata = ((mmuext[0].mfn << PAGE_SHIFT) | - ((unsigned long)skb->data & ~PAGE_MASK)); - - atomic_set(&(skb_shinfo(skb)->dataref), 1); - skb_shinfo(skb)->nr_frags = 0; - skb_shinfo(skb)->frag_list = NULL; - - netif->stats.tx_bytes += size; - netif->stats.tx_packets++; - - /* The update_va_mapping() must not fail. */ - BUG_ON(mcl[0].result != 0); - - /* Check the reassignment error code. */ - status = NETIF_RSP_OKAY; - if ( unlikely(mcl[1].result != 0) ) - { - DPRINTK("Failed MMU update transferring to DOM%u\n", netif->domid); - free_mfn(mdata >> PAGE_SHIFT); - status = NETIF_RSP_ERROR; - } - - evtchn = netif->evtchn; - id = netif->rx->ring[MASK_NETIF_RX_IDX(netif->rx_resp_prod)].req.id; - if ( make_rx_response(netif, id, status, mdata, - size, skb->proto_csum_valid) && - (rx_notify[evtchn] == 0) ) - { - rx_notify[evtchn] = 1; - notify_list[notify_nr++] = evtchn; - } - - netif_put(netif); - dev_kfree_skb(skb); - - mcl += 2; - mmuext += 1; - } - - while ( notify_nr != 0 ) - { - evtchn = notify_list[--notify_nr]; - rx_notify[evtchn] = 0; - notify_via_evtchn(evtchn); - } - - /* More work to do? */ - if ( !skb_queue_empty(&rx_queue) && !timer_pending(&net_timer) ) - tasklet_schedule(&net_rx_tasklet); -#if 0 - else - xen_network_done_notify(); -#endif -} - -static void net_alarm(unsigned long unused) -{ - tasklet_schedule(&net_rx_tasklet); -} - -struct net_device_stats *netif_be_get_stats(struct net_device *dev) -{ - netif_t *netif = netdev_priv(dev); - return &netif->stats; -} - -static int __on_net_schedule_list(netif_t *netif) -{ - return netif->list.next != NULL; -} - -static void remove_from_net_schedule_list(netif_t *netif) -{ - spin_lock_irq(&net_schedule_list_lock); - if ( likely(__on_net_schedule_list(netif)) ) - { - list_del(&netif->list); - netif->list.next = NULL; - netif_put(netif); - } - spin_unlock_irq(&net_schedule_list_lock); -} - -static void add_to_net_schedule_list_tail(netif_t *netif) -{ - if ( __on_net_schedule_list(netif) ) - return; - - spin_lock_irq(&net_schedule_list_lock); - if ( !__on_net_schedule_list(netif) && netif->active ) - { - list_add_tail(&netif->list, &net_schedule_list); - netif_get(netif); - } - spin_unlock_irq(&net_schedule_list_lock); -} - -void netif_schedule_work(netif_t *netif) -{ - if ( (netif->tx_req_cons != netif->tx->req_prod) && - ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) ) - { - add_to_net_schedule_list_tail(netif); - maybe_schedule_tx_action(); - } -} - -void netif_deschedule_work(netif_t *netif) -{ - remove_from_net_schedule_list(netif); -} - - -static void tx_credit_callback(unsigned long data) -{ - netif_t *netif = (netif_t *)data; - netif->remaining_credit = netif->credit_bytes; - netif_schedule_work(netif); -} - -static void net_tx_action(unsigned long unused) -{ - struct list_head *ent; - struct sk_buff *skb; - netif_t *netif; - netif_tx_request_t txreq; - u16 pending_idx; - NETIF_RING_IDX i; - multicall_entry_t *mcl; - PEND_RING_IDX dc, dp; - unsigned int data_len; - - if ( (dc = dealloc_cons) == (dp = dealloc_prod) ) - goto skip_dealloc; - - mcl = tx_mcl; - while ( dc != dp ) - { - pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)]; - MULTI_update_va_mapping(mcl, MMAP_VADDR(pending_idx), - __pte(0), 0); - mcl++; - } - - mcl[-1].args[2] = UVMF_TLB_FLUSH|UVMF_ALL; - if ( unlikely(HYPERVISOR_multicall(tx_mcl, mcl - tx_mcl) != 0) ) - BUG(); - - mcl = tx_mcl; - while ( dealloc_cons != dp ) - { - /* The update_va_mapping() must not fail. 
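
[Editorial note: the notify loop above uses rx_notify[] as a seen-flag so each event channel is kicked at most once per batch, however many packets it received. The same de-duplication in isolation, with notify_via_evtchn() replaced by a printf for illustration:]

    #include <stdio.h>

    #define NR_EVENT_CHANNELS 1024

    static unsigned char pending[NR_EVENT_CHANNELS];  /* like rx_notify[] */
    static unsigned short notify_list[NR_EVENT_CHANNELS];
    static int notify_nr;

    /* Record that 'evtchn' needs a kick; duplicates collapse to one. */
    static void mark_notify(unsigned short evtchn)
    {
        if (pending[evtchn] == 0) {
            pending[evtchn] = 1;
            notify_list[notify_nr++] = evtchn;
        }
    }

    int main(void)
    {
        mark_notify(5);
        mark_notify(9);
        mark_notify(5);                 /* collapsed with the first */

        while (notify_nr != 0) {
            unsigned short e = notify_list[--notify_nr];
            pending[e] = 0;             /* re-arm for the next batch */
            printf("notify evtchn %u\n", e);  /* notify_via_evtchn() here */
        }
        return 0;
    }
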
*/ - BUG_ON(mcl[0].result != 0); - - pending_idx = dealloc_ring[MASK_PEND_IDX(dealloc_cons++)]; - - netif = pending_tx_info[pending_idx].netif; - - make_tx_response(netif, pending_tx_info[pending_idx].req.id, - NETIF_RSP_OKAY); - - pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; - - /* - * Scheduling checks must happen after the above response is posted. - * This avoids a possible race with a guest OS on another CPU if that - * guest is testing against 'resp_prod' when deciding whether to notify - * us when it queues additional packets. - */ - mb(); - if ( (netif->tx_req_cons != netif->tx->req_prod) && - ((netif->tx_req_cons-netif->tx_resp_prod) != NETIF_TX_RING_SIZE) ) - add_to_net_schedule_list_tail(netif); - - netif_put(netif); - - mcl++; - } - - skip_dealloc: - mcl = tx_mcl; - while ( (NR_PENDING_REQS < MAX_PENDING_REQS) && - !list_empty(&net_schedule_list) ) - { - /* Get a netif from the list with work to do. */ - ent = net_schedule_list.next; - netif = list_entry(ent, netif_t, list); - netif_get(netif); - remove_from_net_schedule_list(netif); - - /* Work to do? */ - i = netif->tx_req_cons; - if ( (i == netif->tx->req_prod) || - ((i-netif->tx_resp_prod) == NETIF_TX_RING_SIZE) ) - { - netif_put(netif); - continue; - } - - rmb(); /* Ensure that we see the request before we copy it. */ - memcpy(&txreq, &netif->tx->ring[MASK_NETIF_TX_IDX(i)].req, - sizeof(txreq)); - - /* Credit-based scheduling. */ - if ( txreq.size > netif->remaining_credit ) - { - unsigned long now = jiffies; - unsigned long next_credit = - netif->credit_timeout.expires + - msecs_to_jiffies(netif->credit_usec / 1000); - - /* Timer could already be pending in some rare cases. */ - if ( timer_pending(&netif->credit_timeout) ) - break; - - /* Already passed the point at which we can replenish credit? */ - if ( time_after_eq(now, next_credit) ) - { - netif->credit_timeout.expires = now; - netif->remaining_credit = netif->credit_bytes; - } - - /* Still too big to send right now? Then set a timer callback. */ - if ( txreq.size > netif->remaining_credit ) - { - netif->remaining_credit = 0; - netif->credit_timeout.expires = next_credit; - netif->credit_timeout.data = (unsigned long)netif; - netif->credit_timeout.function = tx_credit_callback; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - add_timer_on(&netif->credit_timeout, smp_processor_id()); -#else - add_timer(&netif->credit_timeout); -#endif - break; - } - } - netif->remaining_credit -= txreq.size; - - /* - * Why the barrier? It ensures that the frontend sees updated req_cons - * before we check for more work to schedule. - */ - netif->tx->req_cons = ++netif->tx_req_cons; - mb(); - - netif_schedule_work(netif); - - if ( unlikely(txreq.size < ETH_HLEN) || - unlikely(txreq.size > ETH_FRAME_LEN) ) - { - DPRINTK("Bad packet size: %d\n", txreq.size); - make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); - netif_put(netif); - continue; - } - - /* No crossing a page boundary as the payload mustn't fragment. */ - if ( unlikely(((txreq.addr & ~PAGE_MASK) + txreq.size) >= PAGE_SIZE) ) - { - DPRINTK("txreq.addr: %lx, size: %u, end: %lu\n", - txreq.addr, txreq.size, - (txreq.addr &~PAGE_MASK) + txreq.size); - make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); - netif_put(netif); - continue; - } - - pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; - - data_len = (txreq.size > PKT_PROT_LEN) ? 
PKT_PROT_LEN : txreq.size; - - if ( unlikely((skb = alloc_skb(data_len+16, GFP_ATOMIC)) == NULL) ) - { - DPRINTK("Can't allocate a skb in start_xmit.\n"); - make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); - netif_put(netif); - break; - } - - /* Packets passed to netif_rx() must have some headroom. */ - skb_reserve(skb, 16); - - MULTI_update_va_mapping_otherdomain( - mcl, MMAP_VADDR(pending_idx), - pfn_pte_ma(txreq.addr >> PAGE_SHIFT, PAGE_KERNEL), - 0, netif->domid); - mcl++; - - memcpy(&pending_tx_info[pending_idx].req, &txreq, sizeof(txreq)); - pending_tx_info[pending_idx].netif = netif; - *((u16 *)skb->data) = pending_idx; - - __skb_queue_tail(&tx_queue, skb); - - pending_cons++; - - /* Filled the batch queue? */ - if ( (mcl - tx_mcl) == ARRAY_SIZE(tx_mcl) ) - break; - } - - if ( mcl == tx_mcl ) - return; - - if ( unlikely(HYPERVISOR_multicall(tx_mcl, mcl - tx_mcl) != 0) ) - BUG(); - - mcl = tx_mcl; - while ( (skb = __skb_dequeue(&tx_queue)) != NULL ) - { - pending_idx = *((u16 *)skb->data); - netif = pending_tx_info[pending_idx].netif; - memcpy(&txreq, &pending_tx_info[pending_idx].req, sizeof(txreq)); - - /* Check the remap error code. */ - if ( unlikely(mcl[0].result != 0) ) - { - DPRINTK("Bad page frame\n"); - make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); - netif_put(netif); - kfree_skb(skb); - mcl++; - pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; - continue; - } - - phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT] = - FOREIGN_FRAME(txreq.addr >> PAGE_SHIFT); - - data_len = (txreq.size > PKT_PROT_LEN) ? PKT_PROT_LEN : txreq.size; - - __skb_put(skb, data_len); - memcpy(skb->data, - (void *)(MMAP_VADDR(pending_idx)|(txreq.addr&~PAGE_MASK)), - data_len); - - if ( data_len < txreq.size ) - { - /* Append the packet payload as a fragment. */ - skb_shinfo(skb)->frags[0].page = - virt_to_page(MMAP_VADDR(pending_idx)); - skb_shinfo(skb)->frags[0].size = txreq.size - data_len; - skb_shinfo(skb)->frags[0].page_offset = - (txreq.addr + data_len) & ~PAGE_MASK; - skb_shinfo(skb)->nr_frags = 1; - } - else - { - /* Schedule a response immediately. */ - netif_idx_release(pending_idx); - } - - skb->data_len = txreq.size - data_len; - skb->len += skb->data_len; - - skb->dev = netif->dev; - skb->protocol = eth_type_trans(skb, skb->dev); - - /* No checking needed on localhost, but remember the field is blank. */ - skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->proto_csum_valid = 1; - skb->proto_csum_blank = txreq.csum_blank; - - netif->stats.rx_bytes += txreq.size; - netif->stats.rx_packets++; - - netif_rx(skb); - netif->dev->last_rx = jiffies; - - mcl++; - } -} - -static void netif_idx_release(u16 pending_idx) -{ - static spinlock_t _lock = SPIN_LOCK_UNLOCKED; - unsigned long flags; - - spin_lock_irqsave(&_lock, flags); - dealloc_ring[MASK_PEND_IDX(dealloc_prod++)] = pending_idx; - spin_unlock_irqrestore(&_lock, flags); - - tasklet_schedule(&net_tx_tasklet); -} - -static void netif_page_release(struct page *page) -{ - u16 pending_idx = page - virt_to_page(mmap_vstart); - - /* Ready for next use. 
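
[Editorial note: the transmit completion path above copies at most PKT_PROT_LEN bytes into the skb's linear area, enough for protocol headers, and attaches the remainder of the guest page as a single fragment. A sketch of just that arithmetic, with a hypothetical buffer address and frame size:]

    #include <stdio.h>

    #define PKT_PROT_LEN 64
    #define PAGE_SIZE 4096UL
    #define PAGE_MASK (~(PAGE_SIZE - 1))

    int main(void)
    {
        unsigned long addr = 0x12345100;   /* hypothetical guest buffer */
        unsigned int  size = 1514;         /* a full Ethernet frame */

        unsigned int data_len = (size > PKT_PROT_LEN) ? PKT_PROT_LEN : size;
        unsigned int frag_len = size - data_len;
        unsigned long frag_off = (addr + data_len) & ~PAGE_MASK;

        printf("linear %u bytes, fragment %u bytes at page offset %lu\n",
               data_len, frag_len, frag_off);
        return 0;
    }
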
*/ - set_page_count(page, 1); - - netif_idx_release(pending_idx); -} - -irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs) -{ - netif_t *netif = dev_id; - if ( tx_work_exists(netif) ) - { - add_to_net_schedule_list_tail(netif); - maybe_schedule_tx_action(); - } - return IRQ_HANDLED; -} - -static void make_tx_response(netif_t *netif, - u16 id, - s8 st) -{ - NETIF_RING_IDX i = netif->tx_resp_prod; - netif_tx_response_t *resp; - - resp = &netif->tx->ring[MASK_NETIF_TX_IDX(i)].resp; - resp->id = id; - resp->status = st; - wmb(); - netif->tx->resp_prod = netif->tx_resp_prod = ++i; - - mb(); /* Update producer before checking event threshold. */ - if ( i == netif->tx->event ) - notify_via_evtchn(netif->evtchn); -} - -static int make_rx_response(netif_t *netif, - u16 id, - s8 st, - memory_t addr, - u16 size, - u16 csum_valid) -{ - NETIF_RING_IDX i = netif->rx_resp_prod; - netif_rx_response_t *resp; - - resp = &netif->rx->ring[MASK_NETIF_RX_IDX(i)].resp; - resp->addr = addr; - resp->csum_valid = csum_valid; - resp->id = id; - resp->status = (s16)size; - if ( st < 0 ) - resp->status = (s16)st; - wmb(); - netif->rx->resp_prod = netif->rx_resp_prod = ++i; - - mb(); /* Update producer before checking event threshold. */ - return (i == netif->rx->event); -} - -static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs) -{ - struct list_head *ent; - netif_t *netif; - int i = 0; - - printk(KERN_ALERT "netif_schedule_list:\n"); - spin_lock_irq(&net_schedule_list_lock); - - list_for_each ( ent, &net_schedule_list ) - { - netif = list_entry(ent, netif_t, list); - printk(KERN_ALERT " %d: private(rx_req_cons=%08x rx_resp_prod=%08x\n", - i, netif->rx_req_cons, netif->rx_resp_prod); - printk(KERN_ALERT " tx_req_cons=%08x tx_resp_prod=%08x)\n", - netif->tx_req_cons, netif->tx_resp_prod); - printk(KERN_ALERT " shared(rx_req_prod=%08x rx_resp_prod=%08x\n", - netif->rx->req_prod, netif->rx->resp_prod); - printk(KERN_ALERT " rx_event=%08x tx_req_prod=%08x\n", - netif->rx->event, netif->tx->req_prod); - printk(KERN_ALERT " tx_resp_prod=%08x, tx_event=%08x)\n", - netif->tx->resp_prod, netif->tx->event); - i++; - } - - spin_unlock_irq(&net_schedule_list_lock); - printk(KERN_ALERT " ** End of netif_schedule_list **\n"); - - return IRQ_HANDLED; -} - -static int __init netback_init(void) -{ - int i; - struct page *page; - - if ( !(xen_start_info.flags & SIF_NET_BE_DOMAIN) && - !(xen_start_info.flags & SIF_INITDOMAIN) ) - return 0; - - printk("Initialising Xen netif backend\n"); - - /* We can increase reservation by this much in net_rx_action(). 
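
[Editorial note: make_tx_response()/make_rx_response() above implement the shared-ring event threshold: the consumer writes the index at which it next wants an interrupt into 'event', and the producer notifies only when its counter reaches that mark. A reduced model, with the driver's wmb()/mb() barriers noted as comments only:]

    #include <stdio.h>

    struct ring {
        unsigned int resp_prod;    /* advanced by the response producer */
        unsigned int event;        /* consumer: "notify me at this index" */
    };

    /* Post one response; nonzero means the peer asked for an interrupt.
     * In the driver, barriers order the response body, the producer
     * update, and the threshold read; omitted here. */
    static int post_response(struct ring *r)
    {
        r->resp_prod++;
        return r->resp_prod == r->event;
    }

    int main(void)
    {
        struct ring r = { 0, 2 };      /* consumer wants a kick at 2 */
        int first  = post_response(&r);
        int second = post_response(&r);
        printf("%d %d\n", first, second);  /* 0 1: only the 2nd notifies */
        return 0;
    }
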
*/ - balloon_update_driver_allowance(NETIF_RX_RING_SIZE); - - skb_queue_head_init(&rx_queue); - skb_queue_head_init(&tx_queue); - - init_timer(&net_timer); - net_timer.data = 0; - net_timer.function = net_alarm; - - netif_interface_init(); - - mmap_vstart = allocate_empty_lowmem_region(MAX_PENDING_REQS); - BUG_ON(mmap_vstart == 0); - - for ( i = 0; i < MAX_PENDING_REQS; i++ ) - { - page = virt_to_page(MMAP_VADDR(i)); - set_page_count(page, 1); - SetPageForeign(page, netif_page_release); - } - - pending_cons = 0; - pending_prod = MAX_PENDING_REQS; - for ( i = 0; i < MAX_PENDING_REQS; i++ ) - pending_ring[i] = i; - - spin_lock_init(&net_schedule_list_lock); - INIT_LIST_HEAD(&net_schedule_list); - - netif_ctrlif_init(); - - (void)request_irq(bind_virq_to_irq(VIRQ_DEBUG), - netif_be_dbg, SA_SHIRQ, - "net-be-dbg", &netif_be_dbg); - - return 0; -} - -static void netback_cleanup(void) -{ - BUG(); -} - -module_init(netback_init); -module_exit(netback_cleanup); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/netfront/Kconfig --- a/linux-2.6.11-xen-sparse/drivers/xen/netfront/Kconfig Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,6 +0,0 @@ - -config XENNET - tristate "Xen network driver" - depends on NETDEVICES && ARCH_XEN - help - Network driver for Xen diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/netfront/Makefile --- a/linux-2.6.11-xen-sparse/drivers/xen/netfront/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,2 +0,0 @@ - -obj-y := netfront.o diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/netfront/netfront.c --- a/linux-2.6.11-xen-sparse/drivers/xen/netfront/netfront.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,1480 +0,0 @@ -/****************************************************************************** - * Virtual network driver for conversing with remote driver backends. - * - * Copyright (c) 2002-2004, K A Fraser - * - * This file may be distributed separately from the Linux kernel, or - * incorporated into other software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ - -#include <linux/config.h> -#include <linux/module.h> -#include <linux/version.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/netdevice.h> -#include <linux/inetdevice.h> -#include <linux/etherdevice.h> -#include <linux/skbuff.h> -#include <linux/init.h> -#include <linux/bitops.h> -#include <linux/proc_fs.h> -#include <linux/ethtool.h> -#include <net/sock.h> -#include <net/pkt_sched.h> -#include <net/arp.h> -#include <net/route.h> -#include <asm/io.h> -#include <asm/uaccess.h> -#include <asm-xen/evtchn.h> -#include <asm-xen/ctrl_if.h> -#include <asm-xen/xen-public/io/netif.h> -#include <asm-xen/balloon.h> -#include <asm/page.h> -#include <asm/uaccess.h> - -#ifndef __GFP_NOWARN -#define __GFP_NOWARN 0 -#endif -#define alloc_xen_skb(_l) __dev_alloc_skb((_l), GFP_ATOMIC|__GFP_NOWARN) - -#define init_skb_shinfo(_skb) \ - do { \ - atomic_set(&(skb_shinfo(_skb)->dataref), 1); \ - skb_shinfo(_skb)->nr_frags = 0; \ - skb_shinfo(_skb)->frag_list = NULL; \ - } while (0) - -/* Allow headroom on each rx pkt for Ethernet header, alignment padding, ... */ -#define RX_HEADROOM 200 - -/* - * If the backend driver is pipelining transmit requests then we can be very - * aggressive in avoiding new-packet notifications -- only need to send a - * notification if there are no outstanding unreceived responses. - * If the backend may be buffering our transmit buffers for any reason then we - * are rather more conservative. - */ -#ifdef CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER -#define TX_TEST_IDX resp_prod /* aggressive: any outstanding responses? */ -#else -#define TX_TEST_IDX req_cons /* conservative: not seen all our requests? */ -#endif - -static void network_tx_buf_gc(struct net_device *dev); -static void network_alloc_rx_buffers(struct net_device *dev); - -static unsigned long rx_pfn_array[NETIF_RX_RING_SIZE]; -static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE+1]; -static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE]; - -#ifdef CONFIG_PROC_FS -static int xennet_proc_init(void); -static int xennet_proc_addif(struct net_device *dev); -static void xennet_proc_delif(struct net_device *dev); -#else -#define xennet_proc_init() (0) -#define xennet_proc_addif(d) (0) -#define xennet_proc_delif(d) ((void)0) -#endif - -static struct list_head dev_list; - -struct net_private -{ - struct list_head list; - struct net_device *dev; - - struct net_device_stats stats; - NETIF_RING_IDX rx_resp_cons, tx_resp_cons; - unsigned int tx_full; - - netif_tx_interface_t *tx; - netif_rx_interface_t *rx; - - spinlock_t tx_lock; - spinlock_t rx_lock; - - unsigned int handle; - unsigned int evtchn; - unsigned int irq; - - /* What is the status of our connection to the remote backend? */ -#define BEST_CLOSED 0 -#define BEST_DISCONNECTED 1 -#define BEST_CONNECTED 2 - unsigned int backend_state; - - /* Is this interface open or closed (down or up)? */ -#define UST_CLOSED 0 -#define UST_OPEN 1 - unsigned int user_state; - - /* Receive-ring batched refills. */ -#define RX_MIN_TARGET 8 -#define RX_MAX_TARGET NETIF_RX_RING_SIZE - int rx_min_target, rx_max_target, rx_target; - struct sk_buff_head rx_batch; - - /* - * {tx,rx}_skbs store outstanding skbuffs. The first entry in each - * array is an index into a chain of free entries. - */ - struct sk_buff *tx_skbs[NETIF_TX_RING_SIZE+1]; - struct sk_buff *rx_skbs[NETIF_RX_RING_SIZE+1]; -}; - -/* Access macros for acquiring freeing slots in {tx,rx}_skbs[]. 
*/ -#define ADD_ID_TO_FREELIST(_list, _id) \ - (_list)[(_id)] = (_list)[0]; \ - (_list)[0] = (void *)(unsigned long)(_id); -#define GET_ID_FROM_FREELIST(_list) \ - ({ unsigned long _id = (unsigned long)(_list)[0]; \ - (_list)[0] = (_list)[_id]; \ - (unsigned short)_id; }) - -static char *status_name[] = { - [NETIF_INTERFACE_STATUS_CLOSED] = "closed", - [NETIF_INTERFACE_STATUS_DISCONNECTED] = "disconnected", - [NETIF_INTERFACE_STATUS_CONNECTED] = "connected", - [NETIF_INTERFACE_STATUS_CHANGED] = "changed", -}; - -static char *be_state_name[] = { - [BEST_CLOSED] = "closed", - [BEST_DISCONNECTED] = "disconnected", - [BEST_CONNECTED] = "connected", -}; - -#if DEBUG -#define DPRINTK(fmt, args...) \ - printk(KERN_ALERT "xen_net (%s:%d) " fmt, __FUNCTION__, __LINE__, ##args) -#else -#define DPRINTK(fmt, args...) ((void)0) -#endif -#define IPRINTK(fmt, args...) \ - printk(KERN_INFO "xen_net: " fmt, ##args) -#define WPRINTK(fmt, args...) \ - printk(KERN_WARNING "xen_net: " fmt, ##args) - -static struct net_device *find_dev_by_handle(unsigned int handle) -{ - struct list_head *ent; - struct net_private *np; - list_for_each (ent, &dev_list) { - np = list_entry(ent, struct net_private, list); - if (np->handle == handle) - return np->dev; - } - return NULL; -} - -/** Network interface info. */ -struct netif_ctrl { - /** Number of interfaces. */ - int interface_n; - /** Number of connected interfaces. */ - int connected_n; - /** Error code. */ - int err; - int up; -}; - -static struct netif_ctrl netctrl; - -static void netctrl_init(void) -{ - memset(&netctrl, 0, sizeof(netctrl)); - netctrl.up = NETIF_DRIVER_STATUS_DOWN; -} - -/** Get or set a network interface error. - */ -static int netctrl_err(int err) -{ - if ((err < 0) && !netctrl.err) - netctrl.err = err; - return netctrl.err; -} - -/** Test if all network interfaces are connected. - * - * @return 1 if all connected, 0 if not, negative error code otherwise - */ -static int netctrl_connected(void) -{ - int ok; - - if (netctrl.err) - ok = netctrl.err; - else if (netctrl.up == NETIF_DRIVER_STATUS_UP) - ok = (netctrl.connected_n == netctrl.interface_n); - else - ok = 0; - - return ok; -} - -/** Count the connected network interfaces. - * - * @return connected count - */ -static int netctrl_connected_count(void) -{ - - struct list_head *ent; - struct net_private *np; - unsigned int connected; - - connected = 0; - - list_for_each(ent, &dev_list) { - np = list_entry(ent, struct net_private, list); - if (np->backend_state == BEST_CONNECTED) - connected++; - } - - netctrl.connected_n = connected; - DPRINTK("> connected_n=%d interface_n=%d\n", - netctrl.connected_n, netctrl.interface_n); - return connected; -} - -/** Send a packet on a net device to encourage switches to learn the - * MAC. We send a fake ARP request. - * - * @param dev device - * @return 0 on success, error code otherwise - */ -static int send_fake_arp(struct net_device *dev) -{ - struct sk_buff *skb; - u32 src_ip, dst_ip; - - dst_ip = INADDR_BROADCAST; - src_ip = inet_select_addr(dev, dst_ip, RT_SCOPE_LINK); - - /* No IP? Then nothing to do. 
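
[Editorial note: ADD_ID_TO_FREELIST()/GET_ID_FROM_FREELIST() above keep the free list inside the skb pointer array itself: a free entry stores the cast index of the next free entry, and entry 0 is the head. The trick is unambiguous because small indices and real kernel pointers occupy disjoint value ranges; the recovery code later relies on exactly that, comparing against __PAGE_OFFSET. The same scheme as a standalone sketch:]

    #include <stdio.h>

    #define RING_SIZE 8

    static void *slots[RING_SIZE + 1];

    static void add_id(unsigned long id)          /* ADD_ID_TO_FREELIST */
    {
        slots[id] = slots[0];
        slots[0] = (void *)id;
    }

    static unsigned long get_id(void)             /* GET_ID_FROM_FREELIST */
    {
        unsigned long id = (unsigned long)slots[0];
        slots[0] = slots[id];
        return id;
    }

    int main(void)
    {
        unsigned long i, a, b;

        for (i = 0; i <= RING_SIZE; i++)          /* free chain: i -> i+1 */
            slots[i] = (void *)(i + 1);

        a = get_id();
        b = get_id();
        add_id(a);
        printf("%lu %lu %lu\n", a, b, get_id());  /* 1 2 1 */
        return 0;
    }
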
*/ - if (src_ip == 0) - return 0; - - skb = arp_create(ARPOP_REPLY, ETH_P_ARP, - dst_ip, dev, src_ip, - /*dst_hw*/ NULL, /*src_hw*/ NULL, - /*target_hw*/ dev->dev_addr); - if (skb == NULL) - return -ENOMEM; - - return dev_queue_xmit(skb); -} - -static int network_open(struct net_device *dev) -{ - struct net_private *np = netdev_priv(dev); - - memset(&np->stats, 0, sizeof(np->stats)); - - np->user_state = UST_OPEN; - - network_alloc_rx_buffers(dev); - np->rx->event = np->rx_resp_cons + 1; - - netif_start_queue(dev); - - return 0; -} - -static void network_tx_buf_gc(struct net_device *dev) -{ - NETIF_RING_IDX i, prod; - unsigned short id; - struct net_private *np = netdev_priv(dev); - struct sk_buff *skb; - - if (np->backend_state != BEST_CONNECTED) - return; - - do { - prod = np->tx->resp_prod; - rmb(); /* Ensure we see responses up to 'rp'. */ - - for (i = np->tx_resp_cons; i != prod; i++) { - id = np->tx->ring[MASK_NETIF_TX_IDX(i)].resp.id; - skb = np->tx_skbs[id]; - ADD_ID_TO_FREELIST(np->tx_skbs, id); - dev_kfree_skb_irq(skb); - } - - np->tx_resp_cons = prod; - - /* - * Set a new event, then check for race with update of tx_cons. Note - * that it is essential to schedule a callback, no matter how few - * buffers are pending. Even if there is space in the transmit ring, - * higher layers may be blocked because too much data is outstanding: - * in such cases notification from Xen is likely to be the only kick - * that we'll get. - */ - np->tx->event = - prod + ((np->tx->req_prod - prod) >> 1) + 1; - mb(); - } while (prod != np->tx->resp_prod); - - if (np->tx_full && ((np->tx->req_prod - prod) < NETIF_TX_RING_SIZE)) { - np->tx_full = 0; - if (np->user_state == UST_OPEN) - netif_wake_queue(dev); - } -} - - -static void network_alloc_rx_buffers(struct net_device *dev) -{ - unsigned short id; - struct net_private *np = netdev_priv(dev); - struct sk_buff *skb; - int i, batch_target; - NETIF_RING_IDX req_prod = np->rx->req_prod; - - if (unlikely(np->backend_state != BEST_CONNECTED)) - return; - - /* - * Allocate skbuffs greedily, even though we batch updates to the - * receive ring. This creates a less bursty demand on the memory allocator, - * so should reduce the chance of failed allocation requests both for - * ourself and for other kernel subsystems. - */ - batch_target = np->rx_target - (req_prod - np->rx_resp_cons); - for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) { - if (unlikely((skb = alloc_xen_skb(dev->mtu + RX_HEADROOM)) == NULL)) - break; - __skb_queue_tail(&np->rx_batch, skb); - } - - /* Is the batch large enough to be worthwhile? */ - if (i < (np->rx_target/2)) - return; - - for (i = 0; ; i++) { - if ((skb = __skb_dequeue(&np->rx_batch)) == NULL) - break; - - skb->dev = dev; - - id = GET_ID_FROM_FREELIST(np->rx_skbs); - - np->rx_skbs[id] = skb; - - np->rx->ring[MASK_NETIF_RX_IDX(req_prod + i)].req.id = id; - - rx_pfn_array[i] = virt_to_machine(skb->head) >> PAGE_SHIFT; - - /* Remove this page from pseudo phys map before passing back to Xen. */ - phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] - = INVALID_P2M_ENTRY; - - MULTI_update_va_mapping(rx_mcl+i, (unsigned long)skb->head, - __pte(0), 0); - } - - /* After all PTEs have been zapped we blow away stale TLB entries. */ - rx_mcl[i-1].args[2] = UVMF_TLB_FLUSH|UVMF_ALL; - - /* Give away a batch of pages. 
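
[Editorial note: network_tx_buf_gc() above re-arms tx->event at roughly the halfway point of the still-outstanding requests, so the frontend gets one callback mid-stream rather than one per packet, while always requesting at least one future response. The heuristic in isolation, with made-up index values:]

    #include <stdio.h>

    int main(void)
    {
        unsigned int resp_prod = 10;   /* responses processed so far */
        unsigned int req_prod  = 26;   /* requests posted so far */

        /* Wake me about halfway through what is still outstanding,
         * but never less than one response in the future. */
        unsigned int event = resp_prod + ((req_prod - resp_prod) >> 1) + 1;

        printf("request interrupt at response %u\n", event);   /* 19 */
        return 0;
    }
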
*/ - rx_mcl[i].op = __HYPERVISOR_dom_mem_op; - rx_mcl[i].args[0] = MEMOP_decrease_reservation; - rx_mcl[i].args[1] = (unsigned long)rx_pfn_array; - rx_mcl[i].args[2] = (unsigned long)i; - rx_mcl[i].args[3] = 0; - rx_mcl[i].args[4] = DOMID_SELF; - - /* Tell the ballon driver what is going on. */ - balloon_update_driver_allowance(i); - - /* Zap PTEs and give away pages in one big multicall. */ - (void)HYPERVISOR_multicall(rx_mcl, i+1); - - /* Check return status of HYPERVISOR_dom_mem_op(). */ - if (unlikely(rx_mcl[i].result != i)) - panic("Unable to reduce memory reservation\n"); - - /* Above is a suitable barrier to ensure backend will see requests. */ - np->rx->req_prod = req_prod + i; - - /* Adjust our floating fill target if we risked running out of buffers. */ - if (((req_prod - np->rx->resp_prod) < (np->rx_target / 4)) && - ((np->rx_target *= 2) > np->rx_max_target)) - np->rx_target = np->rx_max_target; -} - - -static int network_start_xmit(struct sk_buff *skb, struct net_device *dev) -{ - unsigned short id; - struct net_private *np = netdev_priv(dev); - netif_tx_request_t *tx; - NETIF_RING_IDX i; - - if (unlikely(np->tx_full)) { - printk(KERN_ALERT "%s: full queue wasn't stopped!\n", dev->name); - netif_stop_queue(dev); - goto drop; - } - - if (unlikely((((unsigned long)skb->data & ~PAGE_MASK) + skb->len) >= - PAGE_SIZE)) { - struct sk_buff *nskb; - if (unlikely((nskb = alloc_xen_skb(skb->len)) == NULL)) - goto drop; - skb_put(nskb, skb->len); - memcpy(nskb->data, skb->data, skb->len); - nskb->dev = skb->dev; - dev_kfree_skb(skb); - skb = nskb; - } - - spin_lock_irq(&np->tx_lock); - - if (np->backend_state != BEST_CONNECTED) { - spin_unlock_irq(&np->tx_lock); - goto drop; - } - - i = np->tx->req_prod; - - id = GET_ID_FROM_FREELIST(np->tx_skbs); - np->tx_skbs[id] = skb; - - tx = &np->tx->ring[MASK_NETIF_TX_IDX(i)].req; - - tx->id = id; - tx->addr = virt_to_machine(skb->data); - tx->size = skb->len; - tx->csum_blank = (skb->ip_summed == CHECKSUM_HW); - - wmb(); /* Ensure that backend will see the request. */ - np->tx->req_prod = i + 1; - - network_tx_buf_gc(dev); - - if ((i - np->tx_resp_cons) == (NETIF_TX_RING_SIZE - 1)) { - np->tx_full = 1; - netif_stop_queue(dev); - } - - spin_unlock_irq(&np->tx_lock); - - np->stats.tx_bytes += skb->len; - np->stats.tx_packets++; - - /* Only notify Xen if we really have to. 
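
[Editorial note: network_start_xmit() above bounces the packet into a fresh skb whenever its data spans a page boundary, since each transmit request hands the backend exactly one page. The boundary test on its own, assuming 4 KB pages:]

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define PAGE_MASK (~(PAGE_SIZE - 1))

    /* Nonzero if [data, data + len) does not fit within one page. */
    static int crosses_page(unsigned long data, unsigned long len)
    {
        return ((data & ~PAGE_MASK) + len) >= PAGE_SIZE;
    }

    int main(void)
    {
        printf("%d\n", crosses_page(0x1000, 100));   /* 0: fits */
        printf("%d\n", crosses_page(0x1F80, 200));   /* 1: must bounce */
        return 0;
    }
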
*/ - mb(); - if (np->tx->TX_TEST_IDX == i) - notify_via_evtchn(np->evtchn); - - return 0; - - drop: - np->stats.tx_dropped++; - dev_kfree_skb(skb); - return 0; -} - -static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs) -{ - struct net_device *dev = dev_id; - struct net_private *np = netdev_priv(dev); - unsigned long flags; - - spin_lock_irqsave(&np->tx_lock, flags); - network_tx_buf_gc(dev); - spin_unlock_irqrestore(&np->tx_lock, flags); - - if ((np->rx_resp_cons != np->rx->resp_prod) && (np->user_state == UST_OPEN)) - netif_rx_schedule(dev); - - return IRQ_HANDLED; -} - - -static int netif_poll(struct net_device *dev, int *pbudget) -{ - struct net_private *np = netdev_priv(dev); - struct sk_buff *skb, *nskb; - netif_rx_response_t *rx; - NETIF_RING_IDX i, rp; - mmu_update_t *mmu = rx_mmu; - multicall_entry_t *mcl = rx_mcl; - int work_done, budget, more_to_do = 1; - struct sk_buff_head rxq; - unsigned long flags; - - spin_lock(&np->rx_lock); - - if (np->backend_state != BEST_CONNECTED) { - spin_unlock(&np->rx_lock); - return 0; - } - - skb_queue_head_init(&rxq); - - if ((budget = *pbudget) > dev->quota) - budget = dev->quota; - - rp = np->rx->resp_prod; - rmb(); /* Ensure we see queued responses up to 'rp'. */ - - for (i = np->rx_resp_cons, work_done = 0; - (i != rp) && (work_done < budget); - i++, work_done++) { - rx = &np->rx->ring[MASK_NETIF_RX_IDX(i)].resp; - - /* - * An error here is very odd. Usually indicates a backend bug, - * low-memory condition, or that we didn't have reservation headroom. - */ - if (unlikely(rx->status <= 0)) { - if (net_ratelimit()) - printk(KERN_WARNING "Bad rx buffer (memory squeeze?).\n"); - np->rx->ring[MASK_NETIF_RX_IDX(np->rx->req_prod)].req.id = rx->id; - wmb(); - np->rx->req_prod++; - work_done--; - continue; - } - - skb = np->rx_skbs[rx->id]; - ADD_ID_TO_FREELIST(np->rx_skbs, rx->id); - - /* NB. We handle skb overflow later. */ - skb->data = skb->head + (rx->addr & ~PAGE_MASK); - skb->len = rx->status; - skb->tail = skb->data + skb->len; - - if ( rx->csum_valid ) - skb->ip_summed = CHECKSUM_UNNECESSARY; - - np->stats.rx_packets++; - np->stats.rx_bytes += rx->status; - - /* Remap the page. */ - mmu->ptr = (rx->addr & PAGE_MASK) | MMU_MACHPHYS_UPDATE; - mmu->val = __pa(skb->head) >> PAGE_SHIFT; - mmu++; - MULTI_update_va_mapping(mcl, (unsigned long)skb->head, - pfn_pte_ma(rx->addr >> PAGE_SHIFT, PAGE_KERNEL), 0); - mcl++; - - phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] = - rx->addr >> PAGE_SHIFT; - - __skb_queue_tail(&rxq, skb); - } - - /* Some pages are no longer absent... */ - balloon_update_driver_allowance(-work_done); - - /* Do all the remapping work, and M->P updates, in one big hypercall. */ - if (likely((mcl - rx_mcl) != 0)) { - mcl->op = __HYPERVISOR_mmu_update; - mcl->args[0] = (unsigned long)rx_mmu; - mcl->args[1] = mmu - rx_mmu; - mcl->args[2] = 0; - mcl->args[3] = DOMID_SELF; - mcl++; - (void)HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl); - } - - while ((skb = __skb_dequeue(&rxq)) != NULL) { - /* - * Enough room in skbuff for the data we were passed? Also, Linux - * expects at least 16 bytes headroom in each receive buffer. - */ - if (unlikely(skb->tail > skb->end) || - unlikely((skb->data - skb->head) < 16)) { - nskb = NULL; - - /* Only copy the packet if it fits in the current MTU. 
*/ - if (skb->len <= (dev->mtu + ETH_HLEN)) { - if ((skb->tail > skb->end) && net_ratelimit()) - printk(KERN_INFO "Received packet needs %zd bytes more " - "headroom.\n", skb->tail - skb->end); - - if ((nskb = alloc_xen_skb(skb->len + 2)) != NULL) { - skb_reserve(nskb, 2); - skb_put(nskb, skb->len); - memcpy(nskb->data, skb->data, skb->len); - nskb->dev = skb->dev; - } - } - else if (net_ratelimit()) - printk(KERN_INFO "Received packet too big for MTU " - "(%d > %d)\n", skb->len - ETH_HLEN, dev->mtu); - - /* Reinitialise and then destroy the old skbuff. */ - skb->len = 0; - skb->tail = skb->data; - init_skb_shinfo(skb); - dev_kfree_skb(skb); - - /* Switch old for new, if we copied the buffer. */ - if ((skb = nskb) == NULL) - continue; - } - - /* Set the shared-info area, which is hidden behind the real data. */ - init_skb_shinfo(skb); - - /* Ethernet-specific work. Delayed to here as it peeks the header. */ - skb->protocol = eth_type_trans(skb, dev); - - /* Pass it up. */ - netif_receive_skb(skb); - dev->last_rx = jiffies; - } - - np->rx_resp_cons = i; - - /* If we get a callback with very few responses, reduce fill target. */ - /* NB. Note exponential increase, linear decrease. */ - if (((np->rx->req_prod - np->rx->resp_prod) > ((3*np->rx_target) / 4)) && - (--np->rx_target < np->rx_min_target)) - np->rx_target = np->rx_min_target; - - network_alloc_rx_buffers(dev); - - *pbudget -= work_done; - dev->quota -= work_done; - - if (work_done < budget) { - local_irq_save(flags); - - np->rx->event = i + 1; - - /* Deal with hypervisor racing our resetting of rx_event. */ - mb(); - if (np->rx->resp_prod == i) { - __netif_rx_complete(dev); - more_to_do = 0; - } - - local_irq_restore(flags); - } - - spin_unlock(&np->rx_lock); - - return more_to_do; -} - - -static int network_close(struct net_device *dev) -{ - struct net_private *np = netdev_priv(dev); - np->user_state = UST_CLOSED; - netif_stop_queue(np->dev); - return 0; -} - - -static struct net_device_stats *network_get_stats(struct net_device *dev) -{ - struct net_private *np = netdev_priv(dev); - return &np->stats; -} - - -static void network_connect(struct net_device *dev, - netif_fe_interface_status_t *status) -{ - struct net_private *np; - int i, requeue_idx; - netif_tx_request_t *tx; - - np = netdev_priv(dev); - spin_lock_irq(&np->tx_lock); - spin_lock(&np->rx_lock); - - /* Recovery procedure: */ - - /* Step 1: Reinitialise variables. */ - np->rx_resp_cons = np->tx_resp_cons = np->tx_full = 0; - np->rx->event = np->tx->event = 1; - - /* Step 2: Rebuild the RX and TX ring contents. - * NB. We could just free the queued TX packets now but we hope - * that sending them out might do some good. We have to rebuild - * the RX ring because some of our pages are currently flipped out - * so we can't just free the RX skbs. - * NB2. Freelist index entries are always going to be less than - * __PAGE_OFFSET, whereas pointers to skbs will always be equal or - * greater than __PAGE_OFFSET: we use this property to distinguish - * them. - */ - - /* Rebuild the TX buffer freelist and the TX ring itself. - * NB. This reorders packets. We could keep more private state - * to avoid this but maybe it doesn't matter so much given the - * interface has been down. 
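
[Editorial note: the receive fill target adapts in both directions. network_alloc_rx_buffers() doubles rx_target when the ring nearly ran dry, while the poll loop above decrements it when callbacks bring few responses, clamping to [RX_MIN_TARGET, RX_MAX_TARGET]: exponential increase, linear decrease. A toy model of that policy:]

    #include <stdio.h>

    #define RX_MIN_TARGET 8
    #define RX_MAX_TARGET 256

    static int rx_target = RX_MIN_TARGET;

    static void ran_low(void)        /* exponential increase */
    {
        if ((rx_target *= 2) > RX_MAX_TARGET)
            rx_target = RX_MAX_TARGET;
    }

    static void light_load(void)     /* linear decrease */
    {
        if (--rx_target < RX_MIN_TARGET)
            rx_target = RX_MIN_TARGET;
    }

    int main(void)
    {
        ran_low();
        ran_low();                   /* 8 -> 16 -> 32 */
        light_load();                /* 31 */
        printf("rx_target = %d\n", rx_target);
        return 0;
    }
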
- */ - for (requeue_idx = 0, i = 1; i <= NETIF_TX_RING_SIZE; i++) { - if ((unsigned long)np->tx_skbs[i] >= __PAGE_OFFSET) { - struct sk_buff *skb = np->tx_skbs[i]; - - tx = &np->tx->ring[requeue_idx++].req; - - tx->id = i; - tx->addr = virt_to_machine(skb->data); - tx->size = skb->len; - - np->stats.tx_bytes += skb->len; - np->stats.tx_packets++; - } - } - wmb(); - np->tx->req_prod = requeue_idx; - - /* Rebuild the RX buffer freelist and the RX ring itself. */ - for (requeue_idx = 0, i = 1; i <= NETIF_RX_RING_SIZE; i++) - if ((unsigned long)np->rx_skbs[i] >= __PAGE_OFFSET) - np->rx->ring[requeue_idx++].req.id = i; - wmb(); - np->rx->req_prod = requeue_idx; - - /* Step 3: All public and private state should now be sane. Get - * ready to start sending and receiving packets and give the driver - * domain a kick because we've probably just requeued some - * packets. - */ - np->backend_state = BEST_CONNECTED; - wmb(); - notify_via_evtchn(status->evtchn); - network_tx_buf_gc(dev); - - if (np->user_state == UST_OPEN) - netif_start_queue(dev); - - spin_unlock(&np->rx_lock); - spin_unlock_irq(&np->tx_lock); -} - -static void vif_show(struct net_private *np) -{ -#if DEBUG - if (np) { - IPRINTK("<vif handle=%u %s(%s) evtchn=%u irq=%u tx=%p rx=%p>\n", - np->handle, - be_state_name[np->backend_state], - np->user_state ? "open" : "closed", - np->evtchn, - np->irq, - np->tx, - np->rx); - } else { - IPRINTK("<vif NULL>\n"); - } -#endif -} - -/* Send a connect message to xend to tell it to bring up the interface. */ -static void send_interface_connect(struct net_private *np) -{ - ctrl_msg_t cmsg = { - .type = CMSG_NETIF_FE, - .subtype = CMSG_NETIF_FE_INTERFACE_CONNECT, - .length = sizeof(netif_fe_interface_connect_t), - }; - netif_fe_interface_connect_t *msg = (void*)cmsg.msg; - - msg->handle = np->handle; - msg->tx_shmem_frame = (virt_to_machine(np->tx) >> PAGE_SHIFT); - msg->rx_shmem_frame = (virt_to_machine(np->rx) >> PAGE_SHIFT); - - ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); -} - -/* Send a driver status notification to the domain controller. */ -static int send_driver_status(int ok) -{ - int err = 0; - ctrl_msg_t cmsg = { - .type = CMSG_NETIF_FE, - .subtype = CMSG_NETIF_FE_DRIVER_STATUS, - .length = sizeof(netif_fe_driver_status_t), - }; - netif_fe_driver_status_t *msg = (void*)cmsg.msg; - - msg->status = (ok ? NETIF_DRIVER_STATUS_UP : NETIF_DRIVER_STATUS_DOWN); - err = ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); - return err; -} - -/* Stop network device and free tx/rx queues and irq. - */ -static void vif_release(struct net_private *np) -{ - /* Stop old i/f to prevent errors whilst we rebuild the state. */ - spin_lock_irq(&np->tx_lock); - spin_lock(&np->rx_lock); - netif_stop_queue(np->dev); - /* np->backend_state = BEST_DISCONNECTED; */ - spin_unlock(&np->rx_lock); - spin_unlock_irq(&np->tx_lock); - - /* Free resources. */ - if(np->tx != NULL){ - free_irq(np->irq, np->dev); - unbind_evtchn_from_irq(np->evtchn); - free_page((unsigned long)np->tx); - free_page((unsigned long)np->rx); - np->irq = 0; - np->evtchn = 0; - np->tx = NULL; - np->rx = NULL; - } -} - -/* Release vif resources and close it down completely. - */ -static void vif_close(struct net_private *np) -{ - WPRINTK("Unexpected netif-CLOSED message in state %s\n", - be_state_name[np->backend_state]); - vif_release(np); - np->backend_state = BEST_CLOSED; - /* todo: take dev down and free. */ - vif_show(np); -} - -/* Move the vif into disconnected state. - * Allocates tx/rx pages. 
- * Sends connect message to xend. - */ -static void vif_disconnect(struct net_private *np) -{ - if(np->tx) free_page((unsigned long)np->tx); - if(np->rx) free_page((unsigned long)np->rx); - // Before this np->tx and np->rx had better be null. - np->tx = (netif_tx_interface_t *)__get_free_page(GFP_KERNEL); - np->rx = (netif_rx_interface_t *)__get_free_page(GFP_KERNEL); - memset(np->tx, 0, PAGE_SIZE); - memset(np->rx, 0, PAGE_SIZE); - np->backend_state = BEST_DISCONNECTED; - send_interface_connect(np); - vif_show(np); -} - -/* Begin interface recovery. - * - * NB. Whilst we're recovering, we turn the carrier state off. We - * take measures to ensure that this device isn't used for - * anything. We also stop the queue for this device. Various - * different approaches (e.g. continuing to buffer packets) have - * been tested but don't appear to improve the overall impact on - * TCP connections. - * - * TODO: (MAW) Change the Xend<->Guest protocol so that a recovery - * is initiated by a special "RESET" message - disconnect could - * just mean we're not allowed to use this interface any more. - */ -static void vif_reset(struct net_private *np) -{ - IPRINTK("Attempting to reconnect network interface: handle=%u\n", - np->handle); - vif_release(np); - vif_disconnect(np); - vif_show(np); -} - -/* Move the vif into connected state. - * Sets the mac and event channel from the message. - * Binds the irq to the event channel. - */ -static void -vif_connect(struct net_private *np, netif_fe_interface_status_t *status) -{ - struct net_device *dev = np->dev; - memcpy(dev->dev_addr, status->mac, ETH_ALEN); - network_connect(dev, status); - np->evtchn = status->evtchn; - np->irq = bind_evtchn_to_irq(np->evtchn); - (void)request_irq(np->irq, netif_int, SA_SAMPLE_RANDOM, dev->name, dev); - netctrl_connected_count(); - (void)send_fake_arp(dev); - vif_show(np); -} - -static struct ethtool_ops network_ethtool_ops = -{ - .get_tx_csum = ethtool_op_get_tx_csum, - .set_tx_csum = ethtool_op_set_tx_csum, -}; - -/** Create a network device. - * @param handle device handle - * @param val return parameter for created device - * @return 0 on success, error code otherwise - */ -static int create_netdev(int handle, struct net_device **val) -{ - int i, err = 0; - struct net_device *dev = NULL; - struct net_private *np = NULL; - - if ((dev = alloc_etherdev(sizeof(struct net_private))) == NULL) { - printk(KERN_WARNING "%s> alloc_etherdev failed.\n", __FUNCTION__); - err = -ENOMEM; - goto exit; - } - - np = netdev_priv(dev); - np->backend_state = BEST_CLOSED; - np->user_state = UST_CLOSED; - np->handle = handle; - - spin_lock_init(&np->tx_lock); - spin_lock_init(&np->rx_lock); - - skb_queue_head_init(&np->rx_batch); - np->rx_target = RX_MIN_TARGET; - np->rx_min_target = RX_MIN_TARGET; - np->rx_max_target = RX_MAX_TARGET; - - /* Initialise {tx,rx}_skbs to be a free chain containing every entry. 
*/ - for (i = 0; i <= NETIF_TX_RING_SIZE; i++) - np->tx_skbs[i] = (void *)((unsigned long) i+1); - for (i = 0; i <= NETIF_RX_RING_SIZE; i++) - np->rx_skbs[i] = (void *)((unsigned long) i+1); - - dev->open = network_open; - dev->hard_start_xmit = network_start_xmit; - dev->stop = network_close; - dev->get_stats = network_get_stats; - dev->poll = netif_poll; - dev->weight = 64; - dev->features = NETIF_F_IP_CSUM; - - SET_ETHTOOL_OPS(dev, &network_ethtool_ops); - - if ((err = register_netdev(dev)) != 0) { - printk(KERN_WARNING "%s> register_netdev err=%d\n", __FUNCTION__, err); - goto exit; - } - - if ((err = xennet_proc_addif(dev)) != 0) { - unregister_netdev(dev); - goto exit; - } - - np->dev = dev; - list_add(&np->list, &dev_list); - - exit: - if ((err != 0) && (dev != NULL)) - kfree(dev); - else if (val != NULL) - *val = dev; - return err; -} - -/* Get the target interface for a status message. - * Creates the interface when it makes sense. - * The returned interface may be null when there is no error. - * - * @param status status message - * @param np return parameter for interface state - * @return 0 on success, error code otherwise - */ -static int -target_vif(netif_fe_interface_status_t *status, struct net_private **np) -{ - int err = 0; - struct net_device *dev; - - DPRINTK("> handle=%d\n", status->handle); - if (status->handle < 0) { - err = -EINVAL; - goto exit; - } - - if ((dev = find_dev_by_handle(status->handle)) != NULL) - goto exit; - - if (status->status == NETIF_INTERFACE_STATUS_CLOSED) - goto exit; - if (status->status == NETIF_INTERFACE_STATUS_CHANGED) - goto exit; - - /* It's a new interface in a good state - create it. */ - DPRINTK("> create device...\n"); - if ((err = create_netdev(status->handle, &dev)) != 0) - goto exit; - - netctrl.interface_n++; - - exit: - if (np != NULL) - *np = ((dev && !err) ? netdev_priv(dev) : NULL); - DPRINTK("< err=%d\n", err); - return err; -} - -/* Handle an interface status message. */ -static void netif_interface_status(netif_fe_interface_status_t *status) -{ - int err = 0; - struct net_private *np = NULL; - - DPRINTK("> status=%s handle=%d\n", - status_name[status->status], status->handle); - - if ((err = target_vif(status, &np)) != 0) { - WPRINTK("Invalid netif: handle=%u\n", status->handle); - return; - } - - if (np == NULL) { - DPRINTK("> no vif\n"); - return; - } - - switch (status->status) { - case NETIF_INTERFACE_STATUS_CLOSED: - switch (np->backend_state) { - case BEST_CLOSED: - case BEST_DISCONNECTED: - case BEST_CONNECTED: - vif_close(np); - break; - } - break; - - case NETIF_INTERFACE_STATUS_DISCONNECTED: - switch (np->backend_state) { - case BEST_CLOSED: - vif_disconnect(np); - break; - case BEST_DISCONNECTED: - case BEST_CONNECTED: - vif_reset(np); - break; - } - break; - - case NETIF_INTERFACE_STATUS_CONNECTED: - switch (np->backend_state) { - case BEST_CLOSED: - WPRINTK("Unexpected netif status %s in state %s\n", - status_name[status->status], - be_state_name[np->backend_state]); - vif_disconnect(np); - vif_connect(np, status); - break; - case BEST_DISCONNECTED: - vif_connect(np, status); - break; - } - break; - - case NETIF_INTERFACE_STATUS_CHANGED: - /* - * The domain controller is notifying us that a device has been - * added or removed. - */ - break; - - default: - WPRINTK("Invalid netif status code %d\n", status->status); - break; - } - - vif_show(np); -} - -/* - * Initialize the network control interface. 
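
[Editorial note: netif_interface_status() above is effectively a state machine over the backend states. Condensed into a transition function, with the real side effects such as vif_connect()/vif_reset() reduced to comments; a sketch of the transitions, not the driver's exact control flow:]

    #include <stdio.h>

    enum be_state  { BEST_CLOSED, BEST_DISCONNECTED, BEST_CONNECTED };
    enum fe_status { ST_CLOSED, ST_DISCONNECTED, ST_CONNECTED };

    static enum be_state handle_status(enum fe_status st, enum be_state cur)
    {
        switch (st) {
        case ST_CLOSED:
            return BEST_CLOSED;        /* vif_close() from any state */
        case ST_DISCONNECTED:
            /* vif_disconnect() when CLOSED; vif_reset() when already
             * DISCONNECTED or CONNECTED (recovery). */
            return BEST_DISCONNECTED;
        case ST_CONNECTED:
            /* From CLOSED the message is unexpected: the driver runs
             * vif_disconnect() first, then vif_connect(). */
            return BEST_CONNECTED;
        }
        return cur;
    }

    int main(void)
    {
        enum be_state s = BEST_CLOSED;
        s = handle_status(ST_DISCONNECTED, s);
        s = handle_status(ST_CONNECTED, s);
        printf("%d\n", s == BEST_CONNECTED);   /* 1 */
        return 0;
    }
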
- */ -static void netif_driver_status(netif_fe_driver_status_t *status) -{ - netctrl.up = status->status; - netctrl_connected_count(); -} - -/* Receive handler for control messages. */ -static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) -{ - - switch (msg->subtype) { - case CMSG_NETIF_FE_INTERFACE_STATUS: - netif_interface_status((netif_fe_interface_status_t *) &msg->msg[0]); - break; - - case CMSG_NETIF_FE_DRIVER_STATUS: - netif_driver_status((netif_fe_driver_status_t *) &msg->msg[0]); - break; - - default: - msg->length = 0; - break; - } - - ctrl_if_send_response(msg); -} - - -#if 1 -/* Wait for all interfaces to be connected. - * - * This works OK, but we'd like to use the probing mode (see below). - */ -static int probe_interfaces(void) -{ - int err = 0, conn = 0; - int wait_i, wait_n = 100; - - DPRINTK(">\n"); - - for (wait_i = 0; wait_i < wait_n; wait_i++) { - DPRINTK("> wait_i=%d\n", wait_i); - conn = netctrl_connected(); - if(conn) break; - DPRINTK("> schedule_timeout...\n"); - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(10); - } - - DPRINTK("> wait finished...\n"); - if (conn <= 0) { - err = netctrl_err(-ENETDOWN); - WPRINTK("Failed to connect all virtual interfaces: err=%d\n", err); - } - - DPRINTK("< err=%d\n", err); - - return err; -} -#else -/* Probe for interfaces until no more are found. - * - * This is the mode we'd like to use, but at the moment it panics the kernel. -*/ -static int probe_interfaces(void) -{ - int err = 0; - int wait_i, wait_n = 100; - ctrl_msg_t cmsg = { - .type = CMSG_NETIF_FE, - .subtype = CMSG_NETIF_FE_INTERFACE_STATUS, - .length = sizeof(netif_fe_interface_status_t), - }; - netif_fe_interface_status_t msg = {}; - ctrl_msg_t rmsg = {}; - netif_fe_interface_status_t *reply = (void*)rmsg.msg; - int state = TASK_UNINTERRUPTIBLE; - u32 query = -1; - - DPRINTK(">\n"); - - netctrl.interface_n = 0; - for (wait_i = 0; wait_i < wait_n; wait_i++) { - DPRINTK("> wait_i=%d query=%d\n", wait_i, query); - msg.handle = query; - memcpy(cmsg.msg, &msg, sizeof(msg)); - DPRINTK("> set_current_state...\n"); - set_current_state(state); - DPRINTK("> rmsg=%p msg=%p, reply=%p\n", &rmsg, rmsg.msg, reply); - DPRINTK("> sending...\n"); - err = ctrl_if_send_message_and_get_response(&cmsg, &rmsg, state); - DPRINTK("> err=%d\n", err); - if(err) goto exit; - DPRINTK("> rmsg=%p msg=%p, reply=%p\n", &rmsg, rmsg.msg, reply); - if((int)reply->handle < 0) { - // No more interfaces. - break; - } - query = -reply->handle - 2; - DPRINTK(">netif_interface_status ...\n"); - netif_interface_status(reply); - } - - exit: - if (err) { - err = netctrl_err(-ENETDOWN); - WPRINTK("Connecting virtual network interfaces failed: err=%d\n", err); - } - - DPRINTK("< err=%d\n", err); - return err; -} - -#endif - -/* - * We use this notifier to send out a fake ARP reply to reset switches and - * router ARP caches when an IP interface is brought up on a VIF. 
- */ -static int -inetdev_notify(struct notifier_block *this, unsigned long event, void *ptr) -{ - struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; - struct net_device *dev = ifa->ifa_dev->dev; - struct list_head *ent; - struct net_private *np; - - if (event != NETDEV_UP) - goto out; - - list_for_each (ent, &dev_list) { - np = list_entry(ent, struct net_private, list); - if (np->dev == dev) - (void)send_fake_arp(dev); - } - - out: - return NOTIFY_DONE; -} - -static struct notifier_block notifier_inetdev = { - .notifier_call = inetdev_notify, - .next = NULL, - .priority = 0 -}; - -static int __init netif_init(void) -{ - int err = 0; - - if (xen_start_info.flags & SIF_INITDOMAIN) - return 0; - - if ((err = xennet_proc_init()) != 0) - return err; - - IPRINTK("Initialising virtual ethernet driver.\n"); - INIT_LIST_HEAD(&dev_list); - (void)register_inetaddr_notifier(¬ifier_inetdev); - netctrl_init(); - (void)ctrl_if_register_receiver(CMSG_NETIF_FE, netif_ctrlif_rx, - CALLBACK_IN_BLOCKING_CONTEXT); - send_driver_status(1); - err = probe_interfaces(); - if (err) - ctrl_if_unregister_receiver(CMSG_NETIF_FE, netif_ctrlif_rx); - - DPRINTK("< err=%d\n", err); - return err; -} - -static void vif_suspend(struct net_private *np) -{ - /* Avoid having tx/rx stuff happen until we're ready. */ - free_irq(np->irq, np->dev); - unbind_evtchn_from_irq(np->evtchn); -} - -static void vif_resume(struct net_private *np) -{ - /* - * Connect regardless of whether IFF_UP flag set. - * Stop bad things from happening until we're back up. - */ - np->backend_state = BEST_DISCONNECTED; - memset(np->tx, 0, PAGE_SIZE); - memset(np->rx, 0, PAGE_SIZE); - - send_interface_connect(np); -} - -void netif_suspend(void) -{ - struct list_head *ent; - struct net_private *np; - - list_for_each (ent, &dev_list) { - np = list_entry(ent, struct net_private, list); - vif_suspend(np); - } -} - -void netif_resume(void) -{ - struct list_head *ent; - struct net_private *np; - - list_for_each (ent, &dev_list) { - np = list_entry(ent, struct net_private, list); - vif_resume(np); - } -} - -#ifdef CONFIG_PROC_FS - -#define TARGET_MIN 0UL -#define TARGET_MAX 1UL -#define TARGET_CUR 2UL - -static int xennet_proc_read( - char *page, char **start, off_t off, int count, int *eof, void *data) -{ - struct net_device *dev = (struct net_device *)((unsigned long)data & ~3UL); - struct net_private *np = netdev_priv(dev); - int len = 0, which_target = (long)data & 3; - - switch (which_target) - { - case TARGET_MIN: - len = sprintf(page, "%d\n", np->rx_min_target); - break; - case TARGET_MAX: - len = sprintf(page, "%d\n", np->rx_max_target); - break; - case TARGET_CUR: - len = sprintf(page, "%d\n", np->rx_target); - break; - } - - *eof = 1; - return len; -} - -static int xennet_proc_write( - struct file *file, const char __user *buffer, - unsigned long count, void *data) -{ - struct net_device *dev = (struct net_device *)((unsigned long)data & ~3UL); - struct net_private *np = netdev_priv(dev); - int which_target = (long)data & 3; - char string[64]; - long target; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (count <= 1) - return -EBADMSG; /* runt */ - if (count > sizeof(string)) - return -EFBIG; /* too long */ - - if (copy_from_user(string, buffer, count)) - return -EFAULT; - string[sizeof(string)-1] = '\0'; - - target = simple_strtol(string, NULL, 10); - if (target < RX_MIN_TARGET) - target = RX_MIN_TARGET; - if (target > RX_MAX_TARGET) - target = RX_MAX_TARGET; - - spin_lock(&np->rx_lock); - - switch (which_target) - { - case TARGET_MIN: - if 
(target > np->rx_max_target) - np->rx_max_target = target; - np->rx_min_target = target; - if (target > np->rx_target) - np->rx_target = target; - break; - case TARGET_MAX: - if (target < np->rx_min_target) - np->rx_min_target = target; - np->rx_max_target = target; - if (target < np->rx_target) - np->rx_target = target; - break; - case TARGET_CUR: - break; - } - - network_alloc_rx_buffers(dev); - - spin_unlock(&np->rx_lock); - - return count; -} - -static int xennet_proc_init(void) -{ - if (proc_mkdir("xen/net", NULL) == NULL) - return -ENOMEM; - return 0; -} - -static int xennet_proc_addif(struct net_device *dev) -{ - struct proc_dir_entry *dir, *min, *max, *cur; - char name[30]; - - sprintf(name, "xen/net/%s", dev->name); - - dir = proc_mkdir(name, NULL); - if (!dir) - goto nomem; - - min = create_proc_entry("rxbuf_min", 0644, dir); - max = create_proc_entry("rxbuf_max", 0644, dir); - cur = create_proc_entry("rxbuf_cur", 0444, dir); - if (!min || !max || !cur) - goto nomem; - - min->read_proc = xennet_proc_read; - min->write_proc = xennet_proc_write; - min->data = (void *)((unsigned long)dev | TARGET_MIN); - - max->read_proc = xennet_proc_read; - max->write_proc = xennet_proc_write; - max->data = (void *)((unsigned long)dev | TARGET_MAX); - - cur->read_proc = xennet_proc_read; - cur->write_proc = xennet_proc_write; - cur->data = (void *)((unsigned long)dev | TARGET_CUR); - - return 0; - - nomem: - xennet_proc_delif(dev); - return -ENOMEM; -} - -static void xennet_proc_delif(struct net_device *dev) -{ - char name[30]; - - sprintf(name, "xen/net/%s/rxbuf_min", dev->name); - remove_proc_entry(name, NULL); - - sprintf(name, "xen/net/%s/rxbuf_max", dev->name); - remove_proc_entry(name, NULL); - - sprintf(name, "xen/net/%s/rxbuf_cur", dev->name); - remove_proc_entry(name, NULL); - - sprintf(name, "xen/net/%s", dev->name); - remove_proc_entry(name, NULL); -} - -#endif - -module_init(netif_init); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/privcmd/Makefile --- a/linux-2.6.11-xen-sparse/drivers/xen/privcmd/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,2 +0,0 @@ - -obj-y := privcmd.o diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/privcmd/privcmd.c --- a/linux-2.6.11-xen-sparse/drivers/xen/privcmd/privcmd.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,259 +0,0 @@ -/****************************************************************************** - * privcmd.c - * - * Interface to privileged domain-0 commands. 
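The xennet proc code above multiplexes three nodes (rxbuf_min/max/cur) through one read and one write callback by tagging TARGET_MIN/MAX/CUR into the two low bits of the net_device pointer stored in each entry's data field, and masking with ~3UL to recover the pointer. A sketch of that tagging idiom, valid whenever the pointer is at least 4-byte aligned:

    /* Low-bit pointer tagging as used for the proc 'data' cookies above. */
    #define PTR_TAG_MASK 3UL

    static inline void *tag_ptr(void *p, unsigned long tag)
    {
        return (void *)((unsigned long)p | (tag & PTR_TAG_MASK));
    }

    static inline unsigned long ptr_tag(const void *p)
    {
        return (unsigned long)p & PTR_TAG_MASK;
    }

    static inline void *untag_ptr(const void *p)
    {
        return (void *)((unsigned long)p & ~PTR_TAG_MASK);
    }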
- * - * Copyright (c) 2002-2004, K A Fraser, B Dragovic - */ - -#include <linux/config.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/mm.h> -#include <linux/mman.h> -#include <linux/swap.h> -#include <linux/smp_lock.h> -#include <linux/highmem.h> -#include <linux/pagemap.h> -#include <linux/seq_file.h> - -#include <asm/pgalloc.h> -#include <asm/pgtable.h> -#include <asm/uaccess.h> -#include <asm/tlb.h> -#include <asm-xen/linux-public/privcmd.h> -#include <asm-xen/xen-public/dom0_ops.h> -#include <asm-xen/xen_proc.h> - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) -#define pud_t pgd_t -#define pud_offset(d, va) d -#endif - -static struct proc_dir_entry *privcmd_intf; - -static int privcmd_ioctl(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long data) -{ - int ret = -ENOSYS; - - switch ( cmd ) - { - case IOCTL_PRIVCMD_HYPERCALL: - { - privcmd_hypercall_t hypercall; - - if ( copy_from_user(&hypercall, (void *)data, sizeof(hypercall)) ) - return -EFAULT; - -#if defined(__i386__) - __asm__ __volatile__ ( - "pushl %%ebx; pushl %%ecx; pushl %%edx; pushl %%esi; pushl %%edi; " - "movl 4(%%eax),%%ebx ;" - "movl 8(%%eax),%%ecx ;" - "movl 12(%%eax),%%edx ;" - "movl 16(%%eax),%%esi ;" - "movl 20(%%eax),%%edi ;" - "movl (%%eax),%%eax ;" - TRAP_INSTR "; " - "popl %%edi; popl %%esi; popl %%edx; popl %%ecx; popl %%ebx" - : "=a" (ret) : "0" (&hypercall) : "memory" ); -#elif defined (__x86_64__) - __asm__ __volatile__ ( - "movq %5,%%r10; movq %6,%%r8;" TRAP_INSTR - : "=a" (ret) - : "a" ((unsigned long)hypercall.op), - "D" ((unsigned long)hypercall.arg[0]), - "S" ((unsigned long)hypercall.arg[1]), - "d" ((unsigned long)hypercall.arg[2]), - "g" ((unsigned long)hypercall.arg[3]), - "g" ((unsigned long)hypercall.arg[4]) - : "r11","rcx","r8","r10","memory"); -#endif - } - break; - - case IOCTL_PRIVCMD_INITDOMAIN_EVTCHN: - { - extern int initdom_ctrlif_domcontroller_port; - ret = initdom_ctrlif_domcontroller_port; - } - break; - -#if defined(CONFIG_XEN_PRIVILEGED_GUEST) - case IOCTL_PRIVCMD_MMAP: - { -#define PRIVCMD_MMAP_SZ 32 - privcmd_mmap_t mmapcmd; - privcmd_mmap_entry_t msg[PRIVCMD_MMAP_SZ], *p; - int i, rc; - - if ( copy_from_user(&mmapcmd, (void *)data, sizeof(mmapcmd)) ) - return -EFAULT; - - p = mmapcmd.entry; - - for (i=0; i<mmapcmd.num; i+=PRIVCMD_MMAP_SZ, p+=PRIVCMD_MMAP_SZ) - { - int j, n = ((mmapcmd.num-i)>PRIVCMD_MMAP_SZ)? 
- PRIVCMD_MMAP_SZ:(mmapcmd.num-i); - - - if ( copy_from_user(&msg, p, n*sizeof(privcmd_mmap_entry_t)) ) - return -EFAULT; - - for ( j = 0; j < n; j++ ) - { - struct vm_area_struct *vma = - find_vma( current->mm, msg[j].va ); - - if ( !vma ) - return -EINVAL; - - if ( msg[j].va > PAGE_OFFSET ) - return -EINVAL; - - if ( (msg[j].va + (msg[j].npages<<PAGE_SHIFT)) > vma->vm_end ) - return -EINVAL; - - if ( (rc = direct_remap_area_pages(vma->vm_mm, - msg[j].va&PAGE_MASK, - msg[j].mfn<<PAGE_SHIFT, - msg[j].npages<<PAGE_SHIFT, - vma->vm_page_prot, - mmapcmd.dom)) < 0 ) - return rc; - } - } - ret = 0; - } - break; - - case IOCTL_PRIVCMD_MMAPBATCH: - { - mmu_update_t u; - privcmd_mmapbatch_t m; - struct vm_area_struct *vma = NULL; - unsigned long *p, addr; - unsigned long mfn; - int i; - - if ( copy_from_user(&m, (void *)data, sizeof(m)) ) - { ret = -EFAULT; goto batch_err; } - - vma = find_vma( current->mm, m.addr ); - - if ( !vma ) - { ret = -EINVAL; goto batch_err; } - - if ( m.addr > PAGE_OFFSET ) - { ret = -EFAULT; goto batch_err; } - - if ( (m.addr + (m.num<<PAGE_SHIFT)) > vma->vm_end ) - { ret = -EFAULT; goto batch_err; } - - p = m.arr; - addr = m.addr; - for ( i = 0; i < m.num; i++, addr += PAGE_SIZE, p++ ) - { - if ( get_user(mfn, p) ) - return -EFAULT; - - u.val = (mfn << PAGE_SHIFT) | pgprot_val(vma->vm_page_prot); - - __direct_remap_area_pages(vma->vm_mm, - addr, - PAGE_SIZE, - &u); - - if ( unlikely(HYPERVISOR_mmu_update(&u, 1, NULL, m.dom) < 0) ) - put_user(0xF0000000 | mfn, p); - } - - ret = 0; - break; - - batch_err: - printk("batch_err ret=%d vma=%p addr=%lx num=%d arr=%p %lx-%lx\n", - ret, vma, m.addr, m.num, m.arr, - vma ? vma->vm_start : 0, vma ? vma->vm_end : 0); - break; - } - break; -#endif - - case IOCTL_PRIVCMD_GET_MACH2PHYS_START_MFN: - { - unsigned long m2pv = (unsigned long)machine_to_phys_mapping; - pgd_t *pgd = pgd_offset_k(m2pv); - pud_t *pud = pud_offset(pgd, m2pv); - pmd_t *pmd = pmd_offset(pud, m2pv); - unsigned long m2p_start_mfn = (*(unsigned long *)pmd) >> PAGE_SHIFT; - ret = put_user(m2p_start_mfn, (unsigned long *)data) ? -EFAULT: 0; - } - break; - - case IOCTL_PRIVCMD_INITDOMAIN_STORE: - { - extern int do_xenbus_probe(void*); - - if (xen_start_info.store_evtchn != 0) { - ret = -EINVAL; - break; - } - - /* Allocate page. */ - xen_start_info.store_page = get_zeroed_page(GFP_KERNEL); - if (!xen_start_info.store_page) { - ret = -ENOMEM; - break; - } - - /* We don't refcnt properly, so set reserved on page. - * (this allocation is permanent) */ - SetPageReserved(virt_to_page(xen_start_info.store_page)); - - /* Initial connect. Setup channel and page. */ - xen_start_info.store_evtchn = data; - ret = pfn_to_mfn(virt_to_phys((void *)xen_start_info.store_page) >> - PAGE_SHIFT); - - /* We'll return then this will wait for daemon to answer */ - // kthread_run(do_xenbus_probe, NULL, "xenbus_probe"); - } - break; - - default: - ret = -EINVAL; - break; - } - return ret; -} - -static int privcmd_mmap(struct file * file, struct vm_area_struct * vma) -{ - /* DONTCOPY is essential for Xen as copy_page_range is broken. 
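The IOCTL_PRIVCMD_MMAPBATCH handler above reports mapping failures in band: a frame that cannot be mapped gets 0xF0000000 or'ed into its slot of the user-supplied MFN array, so the caller can see exactly which frames failed without aborting the batch. A sketch of that convention, with try_map() as a hypothetical stand-in for the hypervisor update:

    /* Walk a user array of frame numbers, tagging entries that fail. */
    #include <linux/errno.h>
    #include <asm/uaccess.h>

    #define MAP_FAILED_TAG 0xF0000000UL

    static int map_batch(unsigned long __user *arr, int num,
                         int (*try_map)(unsigned long mfn))
    {
        unsigned long mfn;
        int i;

        for (i = 0; i < num; i++) {
            if (get_user(mfn, arr + i))
                return -EFAULT;
            if (try_map(mfn) < 0)
                put_user(MAP_FAILED_TAG | mfn, arr + i);
        }
        return 0;
    }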
*/ - vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY; - - return 0; -} - -static struct file_operations privcmd_file_ops = { - ioctl : privcmd_ioctl, - mmap: privcmd_mmap -}; - - -static int __init privcmd_init(void) -{ - privcmd_intf = create_xen_proc_entry("privcmd", 0400); - if ( privcmd_intf != NULL ) - privcmd_intf->proc_fops = &privcmd_file_ops; - - return 0; -} - -__initcall(privcmd_init); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/usbback/common.h --- a/linux-2.6.11-xen-sparse/drivers/xen/usbback/common.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,85 +0,0 @@ - -#ifndef __USBIF__BACKEND__COMMON_H__ -#define __USBIF__BACKEND__COMMON_H__ - -#include <linux/config.h> -#include <linux/version.h> -#include <linux/module.h> -#include <linux/rbtree.h> -#include <linux/interrupt.h> -#include <linux/slab.h> -#include <linux/blkdev.h> -#include <asm/io.h> -#include <asm/setup.h> -#include <asm/pgalloc.h> -#include <asm-xen/ctrl_if.h> -#include <asm-xen/hypervisor.h> - -#include <asm-xen/xen-public/io/usbif.h> - -#if 0 -#define ASSERT(_p) \ - if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \ - __LINE__, __FILE__); *(int*)0=0; } -#define DPRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \ - __FILE__ , __LINE__ , ## _a ) -#else -#define ASSERT(_p) ((void)0) -#define DPRINTK(_f, _a...) ((void)0) -#endif - -typedef struct usbif_priv_st usbif_priv_t; - -struct usbif_priv_st { - /* Unique identifier for this interface. */ - domid_t domid; - unsigned int handle; - /* Physical parameters of the comms window. */ - unsigned long shmem_frame; - unsigned int evtchn; - int irq; - /* Comms Information */ - usbif_back_ring_t usb_ring; - /* Private fields. */ - enum { DISCONNECTED, DISCONNECTING, CONNECTED } status; - /* - * DISCONNECT response is deferred until pending requests are ack'ed. - * We therefore need to store the id from the original request. - */ - u8 disconnect_rspid; - usbif_priv_t *hash_next; - struct list_head usbif_list; - spinlock_t usb_ring_lock; - atomic_t refcnt; - - struct work_struct work; -}; - -void usbif_create(usbif_be_create_t *create); -void usbif_destroy(usbif_be_destroy_t *destroy); -void usbif_connect(usbif_be_connect_t *connect); -int usbif_disconnect(usbif_be_disconnect_t *disconnect, u8 rsp_id); -void usbif_disconnect_complete(usbif_priv_t *up); - -void usbif_release_port(usbif_be_release_port_t *msg); -int usbif_claim_port(usbif_be_claim_port_t *msg); -void usbif_release_ports(usbif_priv_t *up); - -usbif_priv_t *usbif_find(domid_t domid); -#define usbif_get(_b) (atomic_inc(&(_b)->refcnt)) -#define usbif_put(_b) \ - do { \ - if ( atomic_dec_and_test(&(_b)->refcnt) ) \ - usbif_disconnect_complete(_b); \ - } while (0) - - -void usbif_interface_init(void); -void usbif_ctrlif_init(void); - -void usbif_deschedule(usbif_priv_t *up); -void remove_from_usbif_list(usbif_priv_t *up); - -irqreturn_t usbif_be_int(int irq, void *dev_id, struct pt_regs *regs); - -#endif /* __USBIF__BACKEND__COMMON_H__ */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/usbback/control.c --- a/linux-2.6.11-xen-sparse/drivers/xen/usbback/control.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,61 +0,0 @@ -/****************************************************************************** - * arch/xen/drivers/usbif/backend/control.c - * - * Routines for interfacing with the control plane. 
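The usbif_get()/usbif_put() macros in common.h above implement the usual last-put-runs-teardown refcount: dropping the final reference calls usbif_disconnect_complete(). The same shape with a generic object (a sketch, not the driver's types):

    #include <asm/atomic.h>

    struct obj {
        atomic_t refcnt;
    };

    static inline void obj_get(struct obj *o)
    {
        atomic_inc(&o->refcnt);
    }

    /* The last dropper runs the release callback. */
    static inline void obj_put(struct obj *o, void (*release)(struct obj *))
    {
        if (atomic_dec_and_test(&o->refcnt))
            release(o);
    }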
- * - * Copyright (c) 2004, Keir Fraser - */ - -#include "common.h" - -static void usbif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) -{ - DPRINTK("Received usbif backend message, subtype=%d\n", msg->subtype); - - switch ( msg->subtype ) - { - case CMSG_USBIF_BE_CREATE: - usbif_create((usbif_be_create_t *)&msg->msg[0]); - break; - case CMSG_USBIF_BE_DESTROY: - usbif_destroy((usbif_be_destroy_t *)&msg->msg[0]); - break; - case CMSG_USBIF_BE_CONNECT: - usbif_connect((usbif_be_connect_t *)&msg->msg[0]); - break; - case CMSG_USBIF_BE_DISCONNECT: - if ( !usbif_disconnect((usbif_be_disconnect_t *)&msg->msg[0],msg->id) ) - return; /* Sending the response is deferred until later. */ - break; - case CMSG_USBIF_BE_CLAIM_PORT: - usbif_claim_port((usbif_be_claim_port_t *)&msg->msg[0]); - break; - case CMSG_USBIF_BE_RELEASE_PORT: - usbif_release_port((usbif_be_release_port_t *)&msg->msg[0]); - break; - default: - DPRINTK("Parse error while reading message subtype %d, len %d\n", - msg->subtype, msg->length); - msg->length = 0; - break; - } - - ctrl_if_send_response(msg); -} - -void usbif_ctrlif_init(void) -{ - ctrl_msg_t cmsg; - usbif_be_driver_status_changed_t st; - - (void)ctrl_if_register_receiver(CMSG_USBIF_BE, usbif_ctrlif_rx, - CALLBACK_IN_BLOCKING_CONTEXT); - - /* Send a driver-UP notification to the domain controller. */ - cmsg.type = CMSG_USBIF_BE; - cmsg.subtype = CMSG_USBIF_BE_DRIVER_STATUS_CHANGED; - cmsg.length = sizeof(usbif_be_driver_status_changed_t); - st.status = USBIF_DRIVER_STATUS_UP; - memcpy(cmsg.msg, &st, sizeof(st)); - ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/usbback/interface.c --- a/linux-2.6.11-xen-sparse/drivers/xen/usbback/interface.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,252 +0,0 @@ -/****************************************************************************** - * arch/xen/drivers/usbif/backend/interface.c - * - * USB device interface management. - * - * by Mark Williamson, Copyright (c) 2004 - */ - - -/****************************************************************************** - * arch/xen/drivers/blkif/backend/interface.c - * - * Block-device interface management. - * - * Copyright (c) 2004, Keir Fraser - */ - -#include "common.h" - -#define USBIF_HASHSZ 1024 -#define USBIF_HASH(_d) (((int)(_d))&(USBIF_HASHSZ-1)) - -static kmem_cache_t *usbif_priv_cachep; -static usbif_priv_t *usbif_priv_hash[USBIF_HASHSZ]; - -usbif_priv_t *usbif_find(domid_t domid) -{ - usbif_priv_t *up = usbif_priv_hash[USBIF_HASH(domid)]; - while ( (up != NULL ) && ( up->domid != domid ) ) - up = up->hash_next; - return up; -} - -static void __usbif_disconnect_complete(void *arg) -{ - usbif_priv_t *usbif = (usbif_priv_t *)arg; - ctrl_msg_t cmsg; - usbif_be_disconnect_t disc; - - /* - * These can't be done in usbif_disconnect() because at that point there - * may be outstanding requests at the device whose asynchronous responses - * must still be notified to the remote driver. - */ - unbind_evtchn_from_irq(usbif->evtchn); - vfree(usbif->usb_ring.sring); - - /* Construct the deferred response message. 
*/ - cmsg.type = CMSG_USBIF_BE; - cmsg.subtype = CMSG_USBIF_BE_DISCONNECT; - cmsg.id = usbif->disconnect_rspid; - cmsg.length = sizeof(usbif_be_disconnect_t); - disc.domid = usbif->domid; - disc.status = USBIF_BE_STATUS_OKAY; - memcpy(cmsg.msg, &disc, sizeof(disc)); - - /* - * Make sure message is constructed /before/ status change, because - * after the status change the 'usbif' structure could be deallocated at - * any time. Also make sure we send the response /after/ status change, - * as otherwise a subsequent CONNECT request could spuriously fail if - * another CPU doesn't see the status change yet. - */ - mb(); - if ( usbif->status != DISCONNECTING ) - BUG(); - usbif->status = DISCONNECTED; - mb(); - - /* Send the successful response. */ - ctrl_if_send_response(&cmsg); -} - -void usbif_disconnect_complete(usbif_priv_t *up) -{ - INIT_WORK(&up->work, __usbif_disconnect_complete, (void *)up); - schedule_work(&up->work); -} - -void usbif_create(usbif_be_create_t *create) -{ - domid_t domid = create->domid; - usbif_priv_t **pup, *up; - - if ( (up = kmem_cache_alloc(usbif_priv_cachep, GFP_KERNEL)) == NULL ) - { - DPRINTK("Could not create usbif: out of memory\n"); - create->status = USBIF_BE_STATUS_OUT_OF_MEMORY; - return; - } - - memset(up, 0, sizeof(*up)); - up->domid = domid; - up->status = DISCONNECTED; - spin_lock_init(&up->usb_ring_lock); - atomic_set(&up->refcnt, 0); - - pup = &usbif_priv_hash[USBIF_HASH(domid)]; - while ( *pup != NULL ) - { - if ( (*pup)->domid == domid ) - { - create->status = USBIF_BE_STATUS_INTERFACE_EXISTS; - kmem_cache_free(usbif_priv_cachep, up); - return; - } - pup = &(*pup)->hash_next; - } - - up->hash_next = *pup; - *pup = up; - - create->status = USBIF_BE_STATUS_OKAY; -} - -void usbif_destroy(usbif_be_destroy_t *destroy) -{ - domid_t domid = destroy->domid; - usbif_priv_t **pup, *up; - - pup = &usbif_priv_hash[USBIF_HASH(domid)]; - while ( (up = *pup) != NULL ) - { - if ( up->domid == domid ) - { - if ( up->status != DISCONNECTED ) - goto still_connected; - goto destroy; - } - pup = &up->hash_next; - } - - destroy->status = USBIF_BE_STATUS_INTERFACE_NOT_FOUND; - return; - - still_connected: - destroy->status = USBIF_BE_STATUS_INTERFACE_CONNECTED; - return; - - destroy: - *pup = up->hash_next; - usbif_release_ports(up); - kmem_cache_free(usbif_priv_cachep, up); - destroy->status = USBIF_BE_STATUS_OKAY; -} - -void usbif_connect(usbif_be_connect_t *connect) -{ - domid_t domid = connect->domid; - unsigned int evtchn = connect->evtchn; - unsigned long shmem_frame = connect->shmem_frame; - struct vm_struct *vma; - pgprot_t prot; - int error; - usbif_priv_t *up; - usbif_sring_t *sring; - - up = usbif_find(domid); - if ( unlikely(up == NULL) ) - { - DPRINTK("usbif_connect attempted for non-existent usbif (%u)\n", - connect->domid); - connect->status = USBIF_BE_STATUS_INTERFACE_NOT_FOUND; - return; - } - - if ( (vma = get_vm_area(PAGE_SIZE, VM_IOREMAP)) == NULL ) - { - connect->status = USBIF_BE_STATUS_OUT_OF_MEMORY; - return; - } - - prot = __pgprot(_KERNPG_TABLE); - error = direct_remap_area_pages(&init_mm, VMALLOC_VMADDR(vma->addr), - shmem_frame<<PAGE_SHIFT, PAGE_SIZE, - prot, domid); - if ( error != 0 ) - { - if ( error == -ENOMEM ) - connect->status = USBIF_BE_STATUS_OUT_OF_MEMORY; - else if ( error == -EFAULT ) - connect->status = USBIF_BE_STATUS_MAPPING_ERROR; - else - connect->status = USBIF_BE_STATUS_ERROR; - vfree(vma->addr); - return; - } - - if ( up->status != DISCONNECTED ) - { - connect->status = USBIF_BE_STATUS_INTERFACE_CONNECTED; - 
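usbif_create() and usbif_destroy() above keep interfaces in a chained hash keyed by domid and walk each chain through a pointer-to-pointer, so insertion and unlinking need no head-node special case. A condensed sketch of the idiom with a hypothetical node type:

    #define HASHSZ  1024
    #define HASH(k) ((k) & (HASHSZ - 1))

    struct node {
        int key;
        struct node *next;
    };

    static struct node *table[HASHSZ];

    static int insert(struct node *n)
    {
        struct node **pp = &table[HASH(n->key)];

        for (; *pp != NULL; pp = &(*pp)->next)
            if ((*pp)->key == n->key)
                return -1;          /* duplicate key */
        n->next = *pp;              /* NULL at the chain tail */
        *pp = n;
        return 0;
    }

    static struct node *unlink_node(int key)
    {
        struct node **pp = &table[HASH(key)];
        struct node *n;

        for (; (n = *pp) != NULL; pp = &n->next)
            if (n->key == key) {
                *pp = n->next;      /* splice out through **pp */
                return n;
            }
        return NULL;
    }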
vfree(vma->addr); - return; - } - - sring = (usbif_sring_t *)vma->addr; - SHARED_RING_INIT(sring); - BACK_RING_INIT(&up->usb_ring, sring, PAGE_SIZE); - - up->evtchn = evtchn; - up->irq = bind_evtchn_to_irq(evtchn); - up->shmem_frame = shmem_frame; - up->status = CONNECTED; - usbif_get(up); - - request_irq(up->irq, usbif_be_int, 0, "usbif-backend", up); - - connect->status = USBIF_BE_STATUS_OKAY; -} - -/* Remove URBs for this interface before destroying it. */ -void usbif_deschedule(usbif_priv_t *up) -{ - remove_from_usbif_list(up); -} - -int usbif_disconnect(usbif_be_disconnect_t *disconnect, u8 rsp_id) -{ - domid_t domid = disconnect->domid; - usbif_priv_t *up; - - up = usbif_find(domid); - if ( unlikely(up == NULL) ) - { - DPRINTK("usbif_disconnect attempted for non-existent usbif" - " (%u)\n", disconnect->domid); - disconnect->status = USBIF_BE_STATUS_INTERFACE_NOT_FOUND; - return 1; /* Caller will send response error message. */ - } - - if ( up->status == CONNECTED ) - { - up->status = DISCONNECTING; - up->disconnect_rspid = rsp_id; - wmb(); /* Let other CPUs see the status change. */ - free_irq(up->irq, up); - usbif_deschedule(up); - usbif_put(up); - return 0; /* Caller should not send response message. */ - } - - disconnect->status = USBIF_BE_STATUS_OKAY; - return 1; -} - -void __init usbif_interface_init(void) -{ - usbif_priv_cachep = kmem_cache_create("usbif_priv_cache", - sizeof(usbif_priv_t), - 0, 0, NULL, NULL); - memset(usbif_priv_hash, 0, sizeof(usbif_priv_hash)); -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/usbback/usbback.c --- a/linux-2.6.11-xen-sparse/drivers/xen/usbback/usbback.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,1066 +0,0 @@ -/****************************************************************************** - * arch/xen/drivers/usbif/backend/main.c - * - * Backend for the Xen virtual USB driver - provides an abstraction of a - * USB host controller to the corresponding frontend driver. - * - * by Mark Williamson - * Copyright (c) 2004 Intel Research Cambridge - * Copyright (c) 2004, 2005 Mark Williamson - * - * Based on arch/xen/drivers/blkif/backend/main.c - * Copyright (c) 2003-2004, Keir Fraser & Steve Hand - */ - -#include "common.h" - - -#include <linux/list.h> -#include <linux/usb.h> -#include <linux/spinlock.h> -#include <linux/module.h> -#include <linux/tqueue.h> - -/* - * This is rather arbitrary. - */ -#define MAX_PENDING_REQS 4 -#define BATCH_PER_DOMAIN 1 - -static unsigned long mmap_vstart; - -/* Needs to be sufficiently large that we can map the (large) buffers - * the USB mass storage driver wants. */ -#define MMAP_PAGES_PER_REQUEST \ - (128) -#define MMAP_PAGES \ - (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST) - -#define MMAP_VADDR(_req,_seg) \ - (mmap_vstart + \ - ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \ - ((_seg) * PAGE_SIZE)) - - -static spinlock_t owned_ports_lock; -LIST_HEAD(owned_ports); - -/* A list of these structures is used to track ownership of physical USB - * ports. */ -typedef struct -{ - usbif_priv_t *usbif_priv; - char path[16]; - int guest_port; - int enabled; - struct list_head list; - unsigned long guest_address; /* The USB device address that has been - * assigned by the guest. */ - int dev_present; /* Is there a device present? */ - struct usb_device * dev; - unsigned long ifaces; /* What interfaces are present on this device? 
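The MMAP_VADDR() macro above carves one flat virtual region into fixed windows of MMAP_PAGES_PER_REQUEST pages, one window per pending request, so a (request, segment) pair addresses a unique page without any allocator. The arithmetic, as a sketch with illustrative constants:

    #define PG_SZ          4096UL   /* assumed page size */
    #define PAGES_PER_REQ  128UL    /* as MMAP_PAGES_PER_REQUEST above */

    static unsigned long slot_vaddr(unsigned long base,
                                    unsigned int req, unsigned int seg)
    {
        return base + (req * PAGES_PER_REQ + seg) * PG_SZ;
    }

Because windows never overlap, fast_flush_area() further down can unmap everything a request used in a single pass.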
*/ -} owned_port_t; - - -/* - * Each outstanding request that we've passed to the lower device layers has a - * 'pending_req' allocated to it. The request is complete, the specified - * domain has a response queued for it, with the saved 'id' passed back. - */ -typedef struct { - usbif_priv_t *usbif_priv; - unsigned long id; - int nr_pages; - unsigned short operation; - int status; -} pending_req_t; - -/* - * We can't allocate pending_req's in order, since they may complete out of - * order. We therefore maintain an allocation ring. This ring also indicates - * when enough work has been passed down -- at that point the allocation ring - * will be empty. - */ -static pending_req_t pending_reqs[MAX_PENDING_REQS]; -static unsigned char pending_ring[MAX_PENDING_REQS]; -static spinlock_t pend_prod_lock; - -/* NB. We use a different index type to differentiate from shared usb rings. */ -typedef unsigned int PEND_RING_IDX; -#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1)) -static PEND_RING_IDX pending_prod, pending_cons; -#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons) - -static int do_usb_io_op(usbif_priv_t *usbif, int max_to_do); -static void make_response(usbif_priv_t *usbif, unsigned long id, - unsigned short op, int st, int inband, - unsigned long actual_length); -static void dispatch_usb_probe(usbif_priv_t *up, unsigned long id, unsigned long port); -static void dispatch_usb_io(usbif_priv_t *up, usbif_request_t *req); -static void dispatch_usb_reset(usbif_priv_t *up, unsigned long portid); -static owned_port_t *usbif_find_port(char *); - -/****************************************************************** - * PRIVATE DEBUG FUNCTIONS - */ - -#undef DEBUG -#ifdef DEBUG - -static void dump_port(owned_port_t *p) -{ - printk(KERN_DEBUG "owned_port_t @ %p\n" - " usbif_priv @ %p\n" - " path: %s\n" - " guest_port: %d\n" - " guest_address: %ld\n" - " dev_present: %d\n" - " dev @ %p\n" - " ifaces: 0x%lx\n", - p, p->usbif_priv, p->path, p->guest_port, p->guest_address, - p->dev_present, p->dev, p->ifaces); -} - - -static void dump_request(usbif_request_t *req) -{ - printk(KERN_DEBUG "id = 0x%lx\n" - "devnum %d\n" - "endpoint 0x%x\n" - "direction %d\n" - "speed %d\n" - "pipe_type 0x%x\n" - "transfer_buffer 0x%lx\n" - "length 0x%lx\n" - "transfer_flags 0x%lx\n" - "setup = { 0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x }\n" - "iso_schedule = 0x%lx\n" - "num_iso %ld\n", - req->id, req->devnum, req->endpoint, req->direction, req->speed, - req->pipe_type, req->transfer_buffer, req->length, - req->transfer_flags, req->setup[0], req->setup[1], req->setup[2], - req->setup[3], req->setup[4], req->setup[5], req->setup[6], - req->setup[7], req->iso_schedule, req->num_iso); -} - -static void dump_urb(struct urb *urb) -{ - printk(KERN_DEBUG "dumping urb @ %p\n", urb); - -#define DUMP_URB_FIELD(name, format) \ - printk(KERN_DEBUG " " # name " " format "\n", urb-> name) - - DUMP_URB_FIELD(pipe, "0x%x"); - DUMP_URB_FIELD(status, "%d"); - DUMP_URB_FIELD(transfer_flags, "0x%x"); - DUMP_URB_FIELD(transfer_buffer, "%p"); - DUMP_URB_FIELD(transfer_buffer_length, "%d"); - DUMP_URB_FIELD(actual_length, "%d"); -} - -static void dump_response(usbif_response_t *resp) -{ - printk(KERN_DEBUG "usbback: Sending response:\n" - " id = 0x%x\n" - " op = %d\n" - " status = %d\n" - " data = %d\n" - " length = %d\n", - resp->id, resp->op, resp->status, resp->data, resp->length); -} - -#else /* DEBUG */ - -#define dump_port(blah) ((void)0) -#define dump_request(blah) ((void)0) -#define dump_urb(blah) 
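The comment above describes the slot allocator precisely: pending_ring is a power-of-two ring holding free slot indices, allocation consumes at the consumer end, completion returns the index at the producer end, and both indices only ever increase. A self-contained single-threaded sketch (the driver wraps the free side in pend_prod_lock):

    #define NSLOTS 4                           /* power of two */
    #define SLOT_MASK(i) ((i) & (NSLOTS - 1))

    static unsigned char free_ring[NSLOTS];
    static unsigned int slot_prod, slot_cons;  /* monotonically increasing */

    static void slots_init(void)
    {
        unsigned int i;

        for (i = 0; i < NSLOTS; i++)
            free_ring[i] = i;                  /* ring starts full */
        slot_prod = NSLOTS;
        slot_cons = 0;
    }

    static int slot_alloc(void)                /* -1: all slots in flight */
    {
        if (slot_cons == slot_prod)
            return -1;
        return free_ring[SLOT_MASK(slot_cons++)];
    }

    static void slot_free(int idx)
    {
        free_ring[SLOT_MASK(slot_prod++)] = idx;
    }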
((void)0) -#define dump_response(blah) ((void)0) - -#endif /* DEBUG */ - -/****************************************************************** - * MEMORY MANAGEMENT - */ - -static void fast_flush_area(int idx, int nr_pages) -{ - multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST]; - int i; - - for ( i = 0; i < nr_pages; i++ ) - { - MULTI_update_va_mapping(mcl+i, MMAP_VADDR(idx, i), - __pte(0), 0); - } - - mcl[nr_pages-1].args[2] = UVMF_TLB_FLUSH|UVMF_ALL; - if ( unlikely(HYPERVISOR_multicall(mcl, nr_pages) != 0) ) - BUG(); -} - - -/****************************************************************** - * USB INTERFACE SCHEDULER LIST MAINTENANCE - */ - -static struct list_head usbio_schedule_list; -static spinlock_t usbio_schedule_list_lock; - -static int __on_usbif_list(usbif_priv_t *up) -{ - return up->usbif_list.next != NULL; -} - -void remove_from_usbif_list(usbif_priv_t *up) -{ - unsigned long flags; - if ( !__on_usbif_list(up) ) return; - spin_lock_irqsave(&usbio_schedule_list_lock, flags); - if ( __on_usbif_list(up) ) - { - list_del(&up->usbif_list); - up->usbif_list.next = NULL; - usbif_put(up); - } - spin_unlock_irqrestore(&usbio_schedule_list_lock, flags); -} - -static void add_to_usbif_list_tail(usbif_priv_t *up) -{ - unsigned long flags; - if ( __on_usbif_list(up) ) return; - spin_lock_irqsave(&usbio_schedule_list_lock, flags); - if ( !__on_usbif_list(up) && (up->status == CONNECTED) ) - { - list_add_tail(&up->usbif_list, &usbio_schedule_list); - usbif_get(up); - } - spin_unlock_irqrestore(&usbio_schedule_list_lock, flags); -} - -void free_pending(int pending_idx) -{ - unsigned long flags; - - /* Free the pending request. */ - spin_lock_irqsave(&pend_prod_lock, flags); - pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; - spin_unlock_irqrestore(&pend_prod_lock, flags); -} - -/****************************************************************** - * COMPLETION CALLBACK -- Called as urb->complete() - */ - -static void maybe_trigger_usbio_schedule(void); - -static void __end_usb_io_op(struct urb *purb) -{ - pending_req_t *pending_req; - int pending_idx; - - pending_req = purb->context; - - pending_idx = pending_req - pending_reqs; - - ASSERT(purb->actual_length <= purb->transfer_buffer_length); - ASSERT(purb->actual_length <= pending_req->nr_pages * PAGE_SIZE); - - /* An error fails the entire request. */ - if ( purb->status ) - { - printk(KERN_WARNING "URB @ %p failed. Status %d\n", purb, purb->status); - } - - if ( usb_pipetype(purb->pipe) == 0 ) - { - int i; - usbif_iso_t *sched = (usbif_iso_t *)MMAP_VADDR(pending_idx, pending_req->nr_pages - 1); - - /* If we're dealing with an iso pipe, we need to copy back the schedule. */ - for ( i = 0; i < purb->number_of_packets; i++ ) - { - sched[i].length = purb->iso_frame_desc[i].actual_length; - ASSERT(sched[i].buffer_offset == - purb->iso_frame_desc[i].offset); - sched[i].status = purb->iso_frame_desc[i].status; - } - } - - fast_flush_area(pending_req - pending_reqs, pending_req->nr_pages); - - kfree(purb->setup_packet); - - make_response(pending_req->usbif_priv, pending_req->id, - pending_req->operation, pending_req->status, 0, purb->actual_length); - usbif_put(pending_req->usbif_priv); - - usb_free_urb(purb); - - free_pending(pending_idx); - - rmb(); - - /* Check for anything still waiting in the rings, having freed a request... 
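__on_usbif_list() above encodes list membership in the node itself: next == NULL means "not queued", so removal must poison the pointer after list_del(). A sketch of that convention (the driver re-tests under usbio_schedule_list_lock to close the obvious race):

    #include <linux/list.h>

    static inline int on_list(const struct list_head *n)
    {
        return n->next != NULL;   /* NULL next pointer == not queued */
    }

    static inline void take_off_list(struct list_head *n)
    {
        if (on_list(n)) {
            list_del(n);
            n->next = NULL;       /* poison so on_list() sees removal */
        }
    }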
*/ - maybe_trigger_usbio_schedule(); -} - -/****************************************************************** - * SCHEDULER FUNCTIONS - */ - -static DECLARE_WAIT_QUEUE_HEAD(usbio_schedule_wait); - -static int usbio_schedule(void *arg) -{ - DECLARE_WAITQUEUE(wq, current); - - usbif_priv_t *up; - struct list_head *ent; - - daemonize(); - - for ( ; ; ) - { - /* Wait for work to do. */ - add_wait_queue(&usbio_schedule_wait, &wq); - set_current_state(TASK_INTERRUPTIBLE); - if ( (NR_PENDING_REQS == MAX_PENDING_REQS) || - list_empty(&usbio_schedule_list) ) - schedule(); - __set_current_state(TASK_RUNNING); - remove_wait_queue(&usbio_schedule_wait, &wq); - - /* Queue up a batch of requests. */ - while ( (NR_PENDING_REQS < MAX_PENDING_REQS) && - !list_empty(&usbio_schedule_list) ) - { - ent = usbio_schedule_list.next; - up = list_entry(ent, usbif_priv_t, usbif_list); - usbif_get(up); - remove_from_usbif_list(up); - if ( do_usb_io_op(up, BATCH_PER_DOMAIN) ) - add_to_usbif_list_tail(up); - usbif_put(up); - } - } -} - -static void maybe_trigger_usbio_schedule(void) -{ - /* - * Needed so that two processes, who together make the following predicate - * true, don't both read stale values and evaluate the predicate - * incorrectly. Incredibly unlikely to stall the scheduler on x86, but... - */ - smp_mb(); - - if ( !list_empty(&usbio_schedule_list) ) - wake_up(&usbio_schedule_wait); -} - - -/****************************************************************************** - * NOTIFICATION FROM GUEST OS. - */ - -irqreturn_t usbif_be_int(int irq, void *dev_id, struct pt_regs *regs) -{ - usbif_priv_t *up = dev_id; - - smp_mb(); - - add_to_usbif_list_tail(up); - - /* Will in fact /always/ trigger an io schedule in this case. */ - maybe_trigger_usbio_schedule(); - - return IRQ_HANDLED; -} - - - -/****************************************************************** - * DOWNWARD CALLS -- These interface with the usb-device layer proper. - */ - -static int do_usb_io_op(usbif_priv_t *up, int max_to_do) -{ - usbif_back_ring_t *usb_ring = &up->usb_ring; - usbif_request_t *req; - RING_IDX i, rp; - int more_to_do = 0; - - rp = usb_ring->sring->req_prod; - rmb(); /* Ensure we see queued requests up to 'rp'. */ - - /* Take items off the comms ring, taking care not to overflow. */ - for ( i = usb_ring->req_cons; - (i != rp) && !RING_REQUEST_CONS_OVERFLOW(usb_ring, i); - i++ ) - { - if ( (max_to_do-- == 0) || (NR_PENDING_REQS == MAX_PENDING_REQS) ) - { - more_to_do = 1; - break; - } - - req = RING_GET_REQUEST(usb_ring, i); - - switch ( req->operation ) - { - case USBIF_OP_PROBE: - dispatch_usb_probe(up, req->id, req->port); - break; - - case USBIF_OP_IO: - /* Assemble an appropriate URB. 
*/ - dispatch_usb_io(up, req); - break; - - case USBIF_OP_RESET: - dispatch_usb_reset(up, req->port); - break; - - default: - DPRINTK("error: unknown USB io operation [%d]\n", - req->operation); - make_response(up, req->id, req->operation, -EINVAL, 0, 0); - break; - } - } - - usb_ring->req_cons = i; - - return more_to_do; -} - -static owned_port_t *find_guest_port(usbif_priv_t *up, int port) -{ - unsigned long flags; - struct list_head *l; - - spin_lock_irqsave(&owned_ports_lock, flags); - list_for_each(l, &owned_ports) - { - owned_port_t *p = list_entry(l, owned_port_t, list); - if(p->usbif_priv == up && p->guest_port == port) - { - spin_unlock_irqrestore(&owned_ports_lock, flags); - return p; - } - } - spin_unlock_irqrestore(&owned_ports_lock, flags); - - return NULL; -} - -static void dispatch_usb_reset(usbif_priv_t *up, unsigned long portid) -{ - owned_port_t *port = find_guest_port(up, portid); - int ret = 0; - - - /* Allowing the guest to actually reset the device causes more problems - * than it's worth. We just fake it out in software but we will do a real - * reset when the interface is destroyed. */ - - dump_port(port); - - port->guest_address = 0; - /* If there's an attached device then the port is now enabled. */ - if ( port->dev_present ) - port->enabled = 1; - else - port->enabled = 0; - - make_response(up, 0, USBIF_OP_RESET, ret, 0, 0); -} - -static void dispatch_usb_probe(usbif_priv_t *up, unsigned long id, unsigned long portid) -{ - owned_port_t *port = find_guest_port(up, portid); - int ret; - - if ( port != NULL ) - ret = port->dev_present; - else - { - ret = -EINVAL; - printk(KERN_INFO "dispatch_usb_probe(): invalid port probe request " - "(port %ld)\n", portid); - } - - /* Probe result is sent back in-band. Probes don't have an associated id - * right now... */ - make_response(up, id, USBIF_OP_PROBE, ret, portid, 0); -} - -/** - * check_iso_schedule - safety check the isochronous schedule for an URB - * @purb : the URB in question - */ -static int check_iso_schedule(struct urb *purb) -{ - int i; - unsigned long total_length = 0; - - for ( i = 0; i < purb->number_of_packets; i++ ) - { - struct usb_iso_packet_descriptor *desc = &purb->iso_frame_desc[i]; - - if ( desc->offset >= purb->transfer_buffer_length - || ( desc->offset + desc->length) > purb->transfer_buffer_length ) - return -EINVAL; - - total_length += desc->length; - - if ( total_length > purb->transfer_buffer_length ) - return -EINVAL; - } - - return 0; -} - -owned_port_t *find_port_for_request(usbif_priv_t *up, usbif_request_t *req); - -static void dispatch_usb_io(usbif_priv_t *up, usbif_request_t *req) -{ - unsigned long buffer_mach; - int i = 0, offset = 0, - pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; - pending_req_t *pending_req; - unsigned long remap_prot; - multicall_entry_t mcl[MMAP_PAGES_PER_REQUEST]; - struct urb *purb = NULL; - owned_port_t *port; - unsigned char *setup; - - dump_request(req); - - if ( NR_PENDING_REQS == MAX_PENDING_REQS ) - { - printk(KERN_WARNING "usbback: Max requests already queued. " - "Giving up!\n"); - - return; - } - - port = find_port_for_request(up, req); - - if ( port == NULL ) - { - printk(KERN_WARNING "No such device! (%d)\n", req->devnum); - dump_request(req); - - make_response(up, req->id, req->operation, -ENODEV, 0, 0); - return; - } - else if ( !port->dev_present ) - { - /* In normal operation, we'll only get here if a device is unplugged - * and the frontend hasn't noticed yet. 
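check_iso_schedule() above validates an isochronous schedule before trusting it: every packet must start and end inside the transfer buffer, and the summed packet lengths must not exceed it either. The same check in standalone form, with offsets and lengths as plain arrays for clarity:

    #include <linux/errno.h>

    static int validate_packets(const unsigned int *off,
                                const unsigned int *len,
                                int n, unsigned int buflen)
    {
        unsigned long total = 0;
        int i;

        for (i = 0; i < n; i++) {
            if (off[i] >= buflen || off[i] + len[i] > buflen)
                return -EINVAL;
            total += len[i];
            if (total > buflen)   /* packets may not sum past the buffer */
                return -EINVAL;
        }
        return 0;
    }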
*/ - make_response(up, req->id, req->operation, -ENODEV, 0, 0); - return; - } - - - setup = kmalloc(8, GFP_KERNEL); - - if ( setup == NULL ) - goto no_mem; - - /* Copy request out for safety. */ - memcpy(setup, req->setup, 8); - - if( setup[0] == 0x0 && setup[1] == 0x5) - { - /* To virtualise the USB address space, we need to intercept - * set_address messages and emulate. From the USB specification: - * bmRequestType = 0x0; - * Brequest = SET_ADDRESS (i.e. 0x5) - * wValue = device address - * wIndex = 0 - * wLength = 0 - * data = None - */ - /* Store into the guest transfer buffer using cpu_to_le16 */ - port->guest_address = le16_to_cpu(*(u16 *)(setup + 2)); - /* Make a successful response. That was easy! */ - - make_response(up, req->id, req->operation, 0, 0, 0); - - kfree(setup); - return; - } - else if ( setup[0] == 0x0 && setup[1] == 0x9 ) - { - /* The host kernel needs to know what device configuration is in use - * because various error checks get confused otherwise. We just do - * configuration settings here, under controlled conditions. - */ - - /* Ignore configuration setting and hope that the host kernel - did it right. */ - /* usb_set_configuration(port->dev, setup[2]); */ - - make_response(up, req->id, req->operation, 0, 0, 0); - - kfree(setup); - return; - } - else if ( setup[0] == 0x1 && setup[1] == 0xB ) - { - /* The host kernel needs to know what device interface is in use - * because various error checks get confused otherwise. We just do - * configuration settings here, under controlled conditions. - */ - usb_set_interface(port->dev, (setup[4] | setup[5] << 8), - (setup[2] | setup[3] << 8) ); - - make_response(up, req->id, req->operation, 0, 0, 0); - - kfree(setup); - return; - } - - if ( ( req->transfer_buffer - (req->transfer_buffer & PAGE_MASK) - + req->length ) - > MMAP_PAGES_PER_REQUEST * PAGE_SIZE ) - { - printk(KERN_WARNING "usbback: request of %lu bytes too large\n", - req->length); - make_response(up, req->id, req->operation, -EINVAL, 0, 0); - kfree(setup); - return; - } - - buffer_mach = req->transfer_buffer; - - if( buffer_mach == 0 ) - goto no_remap; - - ASSERT((req->length >> PAGE_SHIFT) <= MMAP_PAGES_PER_REQUEST); - ASSERT(buffer_mach); - - /* Always map writeable for now. */ - remap_prot = _KERNPG_TABLE; - - for ( i = 0, offset = 0; offset < req->length; - i++, offset += PAGE_SIZE ) - { - MULTI_update_va_mapping_otherdomain( - mcl+i, MMAP_VADDR(pending_idx, i), - pfn_pte_ma(buffer_mach >> PAGE_SHIFT, remap_prot), - 0, up->domid); - - phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] = - FOREIGN_FRAME((buffer_mach + offset) >> PAGE_SHIFT); - - ASSERT(virt_to_machine(MMAP_VADDR(pending_idx, i)) - == buffer_mach + i << PAGE_SHIFT); - } - - if ( req->pipe_type == 0 && req->num_iso > 0 ) /* Maybe schedule ISO... */ - { - /* Map in ISO schedule, if necessary. 
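The SET_ADDRESS interception above keys off the first two SETUP bytes (bmRequestType 0x00, bRequest 0x05) and takes the guest-assigned address from wValue, which is little-endian on the wire. A sketch of the decode; the helper is illustrative, not a driver function:

    /* Recognise SET_ADDRESS in an 8-byte SETUP packet. */
    static int is_set_address(const unsigned char *setup, unsigned int *addr)
    {
        if (setup[0] == 0x00 && setup[1] == 0x05) {
            *addr = setup[2] | (setup[3] << 8);   /* wValue, LE bytes 2-3 */
            return 1;
        }
        return 0;
    }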
*/ - MULTI_update_va_mapping_otherdomain( - mcl+i, MMAP_VADDR(pending_idx, i), - pfn_pte_ma(req->iso_schedule >> PAGE_SHIFT, remap_prot), - 0, up->domid); - - phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] = - FOREIGN_FRAME(req->iso_schedule >> PAGE_SHIFT); - - i++; - } - - if ( unlikely(HYPERVISOR_multicall(mcl, i) != 0) ) - BUG(); - - { - int j; - for ( j = 0; j < i; j++ ) - { - if ( unlikely(mcl[j].result != 0) ) - { - printk(KERN_WARNING - "invalid buffer %d -- could not remap it\n", j); - fast_flush_area(pending_idx, i); - goto bad_descriptor; - } - } - } - - no_remap: - - ASSERT(i <= MMAP_PAGES_PER_REQUEST); - ASSERT(i * PAGE_SIZE >= req->length); - - /* We have to do this because some things might complete out of order. */ - pending_req = &pending_reqs[pending_idx]; - pending_req->usbif_priv= up; - pending_req->id = req->id; - pending_req->operation = req->operation; - pending_req->nr_pages = i; - - pending_cons++; - - usbif_get(up); - - /* Fill out an actual request for the USB layer. */ - purb = usb_alloc_urb(req->num_iso); - - if ( purb == NULL ) - { - usbif_put(up); - free_pending(pending_idx); - goto no_mem; - } - - purb->dev = port->dev; - purb->context = pending_req; - purb->transfer_buffer = - (void *)(MMAP_VADDR(pending_idx, 0) + (buffer_mach & ~PAGE_MASK)); - if(buffer_mach == 0) - purb->transfer_buffer = NULL; - purb->complete = __end_usb_io_op; - purb->transfer_buffer_length = req->length; - purb->transfer_flags = req->transfer_flags; - - purb->pipe = 0; - purb->pipe |= req->direction << 7; - purb->pipe |= port->dev->devnum << 8; - purb->pipe |= req->speed << 26; - purb->pipe |= req->pipe_type << 30; - purb->pipe |= req->endpoint << 15; - - purb->number_of_packets = req->num_iso; - - if ( purb->number_of_packets * sizeof(usbif_iso_t) > PAGE_SIZE ) - goto urb_error; - - /* Make sure there's always some kind of timeout. */ - purb->timeout = ( req->timeout > 0 ) ? (req->timeout * HZ) / 1000 - : 1000; - - purb->setup_packet = setup; - - if ( req->pipe_type == 0 ) /* ISO */ - { - int j; - usbif_iso_t *iso_sched = (usbif_iso_t *)MMAP_VADDR(pending_idx, i - 1); - - /* If we're dealing with an iso pipe, we need to copy in a schedule. */ - for ( j = 0; j < purb->number_of_packets; j++ ) - { - purb->iso_frame_desc[j].length = iso_sched[j].length; - purb->iso_frame_desc[j].offset = iso_sched[j].buffer_offset; - iso_sched[j].status = 0; - } - } - - if ( check_iso_schedule(purb) != 0 ) - goto urb_error; - - if ( usb_submit_urb(purb) != 0 ) - goto urb_error; - - return; - - urb_error: - dump_urb(purb); - usbif_put(up); - free_pending(pending_idx); - - bad_descriptor: - kfree ( setup ); - if ( purb != NULL ) - usb_free_urb(purb); - make_response(up, req->id, req->operation, -EINVAL, 0, 0); - return; - - no_mem: - if ( setup != NULL ) - kfree(setup); - make_response(up, req->id, req->operation, -ENOMEM, 0, 0); - return; -} - - - -/****************************************************************** - * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING - */ - - -static void make_response(usbif_priv_t *up, unsigned long id, - unsigned short op, int st, int inband, - unsigned long length) -{ - usbif_response_t *resp; - unsigned long flags; - usbif_back_ring_t *usb_ring = &up->usb_ring; - - /* Place on the response ring for the relevant domain. 
*/ - spin_lock_irqsave(&up->usb_ring_lock, flags); - resp = RING_GET_RESPONSE(usb_ring, usb_ring->rsp_prod_pvt); - resp->id = id; - resp->operation = op; - resp->status = st; - resp->data = inband; - resp->length = length; - wmb(); /* Ensure other side can see the response fields. */ - - dump_response(resp); - - usb_ring->rsp_prod_pvt++; - RING_PUSH_RESPONSES(usb_ring); - spin_unlock_irqrestore(&up->usb_ring_lock, flags); - - /* Kick the relevant domain. */ - notify_via_evtchn(up->evtchn); -} - -/** - * usbif_claim_port - claim devices on a port on behalf of guest - * - * Once completed, this will ensure that any device attached to that - * port is claimed by this driver for use by the guest. - */ -int usbif_claim_port(usbif_be_claim_port_t *msg) -{ - owned_port_t *o_p; - - /* Sanity... */ - if ( usbif_find_port(msg->path) != NULL ) - { - printk(KERN_WARNING "usbback: Attempted to claim USB port " - "we already own!\n"); - return -EINVAL; - } - - /* No need for a slab cache - this should be infrequent. */ - o_p = kmalloc(sizeof(owned_port_t), GFP_KERNEL); - - if ( o_p == NULL ) - return -ENOMEM; - - o_p->enabled = 0; - o_p->usbif_priv = usbif_find(msg->domid); - o_p->guest_port = msg->usbif_port; - o_p->dev_present = 0; - o_p->guest_address = 0; /* Default address. */ - - strcpy(o_p->path, msg->path); - - spin_lock_irq(&owned_ports_lock); - - list_add(&o_p->list, &owned_ports); - - spin_unlock_irq(&owned_ports_lock); - - printk(KERN_INFO "usbback: Claimed USB port (%s) for %d.%d\n", o_p->path, - msg->domid, msg->usbif_port); - - /* Force a reprobe for unclaimed devices. */ - usb_scan_devices(); - - return 0; -} - -owned_port_t *find_port_for_request(usbif_priv_t *up, usbif_request_t *req) -{ - unsigned long flags; - struct list_head *port; - - /* I'm assuming this is not called from IRQ context - correct? I think - * it's probably only called in response to control messages or plug events - * in the USB hub kernel thread, so should be OK. */ - spin_lock_irqsave(&owned_ports_lock, flags); - list_for_each(port, &owned_ports) - { - owned_port_t *p = list_entry(port, owned_port_t, list); - if(p->usbif_priv == up && p->guest_address == req->devnum && p->enabled ) - { - dump_port(p); - - spin_unlock_irqrestore(&owned_ports_lock, flags); - return p; - } - } - spin_unlock_irqrestore(&owned_ports_lock, flags); - - return NULL; -} - -owned_port_t *__usbif_find_port(char *path) -{ - struct list_head *port; - - list_for_each(port, &owned_ports) - { - owned_port_t *p = list_entry(port, owned_port_t, list); - if(!strcmp(path, p->path)) - { - return p; - } - } - - return NULL; -} - -owned_port_t *usbif_find_port(char *path) -{ - owned_port_t *ret; - unsigned long flags; - - spin_lock_irqsave(&owned_ports_lock, flags); - ret = __usbif_find_port(path); - spin_unlock_irqrestore(&owned_ports_lock, flags); - - return ret; -} - - -static void *probe(struct usb_device *dev, unsigned iface, - const struct usb_device_id *id) -{ - owned_port_t *p; - - /* We don't care what the device is - if we own the port, we want it. 
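make_response() above follows the standard produce-and-kick sequence for a shared ring: fill the private producer slot, write-barrier so the peer sees the payload before the index, publish, then notify the event channel. Condensed into one helper using the same ring macros the driver uses (a sketch; the driver also holds usb_ring_lock around this):

    static void push_response(usbif_back_ring_t *ring, unsigned int evtchn,
                              const usbif_response_t *src)
    {
        usbif_response_t *resp;

        resp = RING_GET_RESPONSE(ring, ring->rsp_prod_pvt);
        *resp = *src;
        wmb();                      /* payload visible before index moves */
        ring->rsp_prod_pvt++;
        RING_PUSH_RESPONSES(ring);
        notify_via_evtchn(evtchn);  /* kick the peer domain */
    }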
We - * don't deal with device-specifics in this driver, so we don't care what - * the device actually is ;-) */ - if ( ( p = usbif_find_port(dev->devpath) ) != NULL ) - { - printk(KERN_INFO "usbback: claimed device attached to owned port\n"); - - p->dev_present = 1; - p->dev = dev; - set_bit(iface, &p->ifaces); - - return p->usbif_priv; - } - else - printk(KERN_INFO "usbback: hotplug for non-owned port (%s), ignoring\n", - dev->devpath); - - - return NULL; -} - -static void disconnect(struct usb_device *dev, void *usbif) -{ - /* Note the device is removed so we can tell the guest when it probes. */ - owned_port_t *port = usbif_find_port(dev->devpath); - port->dev_present = 0; - port->dev = NULL; - port->ifaces = 0; -} - - -struct usb_driver driver = -{ - .owner = THIS_MODULE, - .name = "Xen USB Backend", - .probe = probe, - .disconnect = disconnect, - .id_table = NULL, -}; - -/* __usbif_release_port - internal mechanics for releasing a port */ -void __usbif_release_port(owned_port_t *p) -{ - int i; - - for ( i = 0; p->ifaces != 0; i++) - if ( p->ifaces & 1 << i ) - { - usb_driver_release_interface(&driver, usb_ifnum_to_if(p->dev, i)); - clear_bit(i, &p->ifaces); - } - list_del(&p->list); - - /* Reset the real device. We don't simulate disconnect / probe for other - * drivers in this kernel because we assume the device is completely under - * the control of ourselves (i.e. the guest!). This should ensure that the - * device is in a sane state for the next customer ;-) */ - - /* MAW NB: we're not resetting the real device here. This looks perfectly - * valid to me but it causes memory corruption. We seem to get away with not - * resetting for now, although it'd be nice to have this tracked down. */ -/* if ( p->dev != NULL) */ -/* usb_reset_device(p->dev); */ - - kfree(p); -} - - -/** - * usbif_release_port - stop claiming devices on a port on behalf of guest - */ -void usbif_release_port(usbif_be_release_port_t *msg) -{ - owned_port_t *p; - - spin_lock_irq(&owned_ports_lock); - p = __usbif_find_port(msg->path); - __usbif_release_port(p); - spin_unlock_irq(&owned_ports_lock); -} - -void usbif_release_ports(usbif_priv_t *up) -{ - struct list_head *port, *tmp; - unsigned long flags; - - spin_lock_irqsave(&owned_ports_lock, flags); - list_for_each_safe(port, tmp, &owned_ports) - { - owned_port_t *p = list_entry(port, owned_port_t, list); - if ( p->usbif_priv == up ) - __usbif_release_port(p); - } - spin_unlock_irqrestore(&owned_ports_lock, flags); -} - -static int __init usbif_init(void) -{ - int i; - - if ( !(xen_start_info.flags & SIF_INITDOMAIN) && - !(xen_start_info.flags & SIF_USB_BE_DOMAIN) ) - return 0; - - if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 ) - BUG(); - - pending_cons = 0; - pending_prod = MAX_PENDING_REQS; - memset(pending_reqs, 0, sizeof(pending_reqs)); - for ( i = 0; i < MAX_PENDING_REQS; i++ ) - pending_ring[i] = i; - - spin_lock_init(&pend_prod_lock); - - spin_lock_init(&owned_ports_lock); - INIT_LIST_HEAD(&owned_ports); - - spin_lock_init(&usbio_schedule_list_lock); - INIT_LIST_HEAD(&usbio_schedule_list); - - if ( kernel_thread(usbio_schedule, 0, CLONE_FS | CLONE_FILES) < 0 ) - BUG(); - - usbif_interface_init(); - - usbif_ctrlif_init(); - - usb_register(&driver); - - printk(KERN_INFO "Xen USB Backend Initialised"); - - return 0; -} - -__initcall(usbif_init); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/usbfront/usbfront.c --- a/linux-2.6.11-xen-sparse/drivers/xen/usbfront/usbfront.c Fri Jul 15 19:57:12 2005 +++ /dev/null 
Sat Jul 16 14:02:54 2005 @@ -1,1738 +0,0 @@ -/* - * Xen Virtual USB Frontend Driver - * - * This file contains the first version of the Xen virtual USB hub - * that I've managed not to delete by mistake (3rd time lucky!). - * - * Based on Linux's uhci.c, original copyright notices are displayed - * below. Portions also (c) 2004 Intel Research Cambridge - * and (c) 2004, 2005 Mark Williamson - * - * Contact <mark.williamson@xxxxxxxxxxxx> or - * <xen-devel@xxxxxxxxxxxxxxxxxxxxx> regarding this code. - * - * Still to be (maybe) implemented: - * - migration / backend restart support? - * - support for building / using as a module - */ - -/* - * Universal Host Controller Interface driver for USB. - * - * Maintainer: Johannes Erdfelt <johannes@xxxxxxxxxxx> - * - * (C) Copyright 1999 Linus Torvalds - * (C) Copyright 1999-2002 Johannes Erdfelt, johannes@xxxxxxxxxxx - * (C) Copyright 1999 Randy Dunlap - * (C) Copyright 1999 Georg Acher, acher@xxxxxxxxx - * (C) Copyright 1999 Deti Fliegl, deti@xxxxxxxxx - * (C) Copyright 1999 Thomas Sailer, sailer@xxxxxxxxxxxxxx - * (C) Copyright 1999 Roman Weissgaerber, weissg@xxxxxxxxx - * (C) Copyright 2000 Yggdrasil Computing, Inc. (port of new PCI interface - * support from usb-ohci.c by Adam Richter, adam@xxxxxxxxxxxxx). - * (C) Copyright 1999 Gregory P. Smith (from usb-ohci.c) - * - * Intel documents this fairly well, and as far as I know there - * are no royalties or anything like that, but even so there are - * people who decided that they want to do the same thing in a - * completely different way. - * - * WARNING! The USB documentation is downright evil. Most of it - * is just crap, written by a committee. You're better off ignoring - * most of it, the important stuff is: - * - the low-level protocol (fairly simple but lots of small details) - * - working around the horridness of the rest - */ - -#include <linux/config.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/sched.h> -#include <linux/delay.h> -#include <linux/slab.h> -#include <linux/smp_lock.h> -#include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/spinlock.h> -#ifdef CONFIG_USB_DEBUG -#define DEBUG -#else -#undef DEBUG -#endif -#include <linux/usb.h> - -#include <asm/irq.h> -#include <asm/system.h> - -#include "xhci.h" - -#include "../../../../../drivers/usb/hcd.h" - -#include <asm-xen/xen-public/io/usbif.h> -#include <asm/ctrl_if.h> -#include <asm/xen-public/io/domain_controller.h> - -/* - * Version Information - */ -#define DRIVER_VERSION "v1.0" -#define DRIVER_AUTHOR "Linus 'Frodo Rabbit' Torvalds, Johannes Erdfelt, " \ - "Randy Dunlap, Georg Acher, Deti Fliegl, " \ - "Thomas Sailer, Roman Weissgaerber, Mark Williamson" -#define DRIVER_DESC "Xen Virtual USB Host Controller Interface" - -/* - * debug = 0, no debugging messages - * debug = 1, dump failed URB's except for stalls - * debug = 2, dump all failed URB's (including stalls) - */ -#ifdef DEBUG -static int debug = 1; -#else -static int debug = 0; -#endif -MODULE_PARM(debug, "i"); -MODULE_PARM_DESC(debug, "Debug level"); -static char *errbuf; -#define ERRBUF_LEN (PAGE_SIZE * 8) - -static int rh_submit_urb(struct urb *urb); -static int rh_unlink_urb(struct urb *urb); -static int xhci_unlink_urb(struct urb *urb); -static void xhci_call_completion(struct urb *urb); -static void xhci_drain_ring(void); -static void xhci_transfer_result(struct xhci *xhci, struct urb *urb); -static void xhci_finish_completion(void); - -#define MAX_URB_LOOP 2048 /* Maximum number of linked URB's 
*/ - -static kmem_cache_t *xhci_up_cachep; /* urb_priv cache */ -static struct xhci *xhci; /* XHCI structure for the interface */ - -/****************************************************************************** - * DEBUGGING - */ - -#ifdef DEBUG - -static void dump_urb(struct urb *urb) -{ - printk(KERN_DEBUG "dumping urb @ %p\n" - " hcpriv = %p\n" - " next = %p\n" - " dev = %p\n" - " pipe = 0x%lx\n" - " status = %d\n" - " transfer_flags = 0x%lx\n" - " transfer_buffer = %p\n" - " transfer_buffer_length = %d\n" - " actual_length = %d\n" - " bandwidth = %d\n" - " setup_packet = %p\n", - urb, urb->hcpriv, urb->next, urb->dev, urb->pipe, urb->status, - urb->transfer_flags, urb->transfer_buffer, - urb->transfer_buffer_length, urb->actual_length, urb->bandwidth, - urb->setup_packet); - if ( urb->setup_packet != NULL ) - printk(KERN_DEBUG - "setup = { 0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x, 0x%x }\n", - urb->setup_packet[0], urb->setup_packet[1], - urb->setup_packet[2], urb->setup_packet[3], - urb->setup_packet[4], urb->setup_packet[5], - urb->setup_packet[6], urb->setup_packet[7]); - printk(KERN_DEBUG "complete = %p\n" - "interval = %d\n", urb->complete, urb->interval); - -} - -static void xhci_show_resp(usbif_response_t *r) -{ - printk(KERN_DEBUG "dumping response @ %p\n" - " id=0x%lx\n" - " op=0x%x\n" - " data=0x%x\n" - " status=0x%x\n" - " length=0x%lx\n", - r->id, r->operation, r->data, r->status, r->length); -} - -#define DPRINK(...) printk(KERN_DEBUG __VA_ARGS__) - -#else /* DEBUG */ - -#define dump_urb(blah) ((void)0) -#define xhci_show_resp(blah) ((void)0) -#define DPRINTK(blah,...) ((void)0) - -#endif /* DEBUG */ - -/****************************************************************************** - * RING REQUEST HANDLING - */ - -#define RING_PLUGGED(_hc) ( RING_FULL(&_hc->usb_ring) || _hc->recovery ) - -/** - * xhci_construct_isoc - add isochronous information to a request - */ -static int xhci_construct_isoc(usbif_request_t *req, struct urb *urb) -{ - usbif_iso_t *schedule; - int i; - struct urb_priv *urb_priv = urb->hcpriv; - - req->num_iso = urb->number_of_packets; - schedule = (usbif_iso_t *)__get_free_page(GFP_KERNEL); - - if ( schedule == NULL ) - return -ENOMEM; - - for ( i = 0; i < req->num_iso; i++ ) - { - schedule[i].buffer_offset = urb->iso_frame_desc[i].offset; - schedule[i].length = urb->iso_frame_desc[i].length; - } - - urb_priv->schedule = schedule; - req->iso_schedule = virt_to_machine(schedule); - - return 0; -} - -/** - * xhci_queue_req - construct and queue request for an URB - */ -static int xhci_queue_req(struct urb *urb) -{ - unsigned long flags; - usbif_request_t *req; - usbif_front_ring_t *usb_ring = &xhci->usb_ring; - -#if DEBUG - printk(KERN_DEBUG - "usbif = %p, req_prod = %d (@ 0x%lx), resp_prod = %d, resp_cons = %d\n", - usbif, usbif->req_prod, virt_to_machine(&usbif->req_prod), - usbif->resp_prod, xhci->usb_resp_cons); -#endif - - spin_lock_irqsave(&xhci->ring_lock, flags); - - if ( RING_PLUGGED(xhci) ) - { - printk(KERN_WARNING - "xhci_queue_req(): USB ring plugged, not queuing request\n"); - spin_unlock_irqrestore(&xhci->ring_lock, flags); - return -ENOBUFS; - } - - /* Stick something in the shared communications ring. */ - req = RING_GET_REQUEST(usb_ring, usb_ring->req_prod_pvt); - - req->operation = USBIF_OP_IO; - req->port = 0; /* We don't care what the port is. 
*/ - req->id = (unsigned long) urb->hcpriv; - req->transfer_buffer = virt_to_machine(urb->transfer_buffer); - req->devnum = usb_pipedevice(urb->pipe); - req->direction = usb_pipein(urb->pipe); - req->speed = usb_pipeslow(urb->pipe); - req->pipe_type = usb_pipetype(urb->pipe); - req->length = urb->transfer_buffer_length; - req->transfer_flags = urb->transfer_flags; - req->endpoint = usb_pipeendpoint(urb->pipe); - req->speed = usb_pipeslow(urb->pipe); - req->timeout = urb->timeout * (1000 / HZ); - - if ( usb_pipetype(urb->pipe) == 0 ) /* ISO */ - { - int ret = xhci_construct_isoc(req, urb); - if ( ret != 0 ) - return ret; - } - - if(urb->setup_packet != NULL) - memcpy(req->setup, urb->setup_packet, 8); - else - memset(req->setup, 0, 8); - - usb_ring->req_prod_pvt++; - RING_PUSH_REQUESTS(usb_ring); - - spin_unlock_irqrestore(&xhci->ring_lock, flags); - - notify_via_evtchn(xhci->evtchn); - - DPRINTK("Queued request for an URB.\n"); - dump_urb(urb); - - return -EINPROGRESS; -} - -/** - * xhci_queue_probe - queue a probe request for a particular port - */ -static inline usbif_request_t *xhci_queue_probe(usbif_vdev_t port) -{ - usbif_request_t *req; - usbif_front_ring_t *usb_ring = &xhci->usb_ring; - -#if DEBUG - printk(KERN_DEBUG - "queuing probe: req_prod = %d (@ 0x%lx), resp_prod = %d, " - "resp_cons = %d\n", usbif->req_prod, - virt_to_machine(&usbif->req_prod), - usbif->resp_prod, xhci->usb_resp_cons); -#endif - - /* This is always called from the timer interrupt. */ - spin_lock(&xhci->ring_lock); - - if ( RING_PLUGGED(xhci) ) - { - printk(KERN_WARNING - "xhci_queue_probe(): ring full, not queuing request\n"); - spin_unlock(&xhci->ring_lock); - return NULL; - } - - /* Stick something in the shared communications ring. */ - req = RING_GET_REQUEST(usb_ring, usb_ring->req_prod_pvt); - - memset(req, 0, sizeof(*req)); - - req->operation = USBIF_OP_PROBE; - req->port = port; - - usb_ring->req_prod_pvt++; - RING_PUSH_REQUESTS(usb_ring); - - spin_unlock(&xhci->ring_lock); - - notify_via_evtchn(xhci->evtchn); - - return req; -} - -/** - * xhci_port_reset - queue a reset request for a particular port - */ -static int xhci_port_reset(usbif_vdev_t port) -{ - usbif_request_t *req; - usbif_front_ring_t *usb_ring = &xhci->usb_ring; - - /* Only ever happens from process context (hub thread). */ - spin_lock_irq(&xhci->ring_lock); - - if ( RING_PLUGGED(xhci) ) - { - printk(KERN_WARNING - "xhci_port_reset(): ring plugged, not queuing request\n"); - spin_unlock_irq(&xhci->ring_lock); - return -ENOBUFS; - } - - /* We only reset one port at a time, so we only need one variable per - * hub. */ - xhci->awaiting_reset = 1; - - /* Stick something in the shared communications ring. 
-/**
- * xhci_port_reset - queue a reset request for a particular port
- */
-static int xhci_port_reset(usbif_vdev_t port)
-{
-    usbif_request_t *req;
-    usbif_front_ring_t *usb_ring = &xhci->usb_ring;
-
-    /* Only ever happens from process context (hub thread). */
-    spin_lock_irq(&xhci->ring_lock);
-
-    if ( RING_PLUGGED(xhci) )
-    {
-        printk(KERN_WARNING
-               "xhci_port_reset(): ring plugged, not queuing request\n");
-        spin_unlock_irq(&xhci->ring_lock);
-        return -ENOBUFS;
-    }
-
-    /* We only reset one port at a time, so we only need one variable per
-     * hub. */
-    xhci->awaiting_reset = 1;
-
-    /* Stick something in the shared communications ring. */
-    req = RING_GET_REQUEST(usb_ring, usb_ring->req_prod_pvt);
-
-    memset(req, 0, sizeof(*req));
-
-    req->operation = USBIF_OP_RESET;
-    req->port      = port;
-
-    usb_ring->req_prod_pvt++;
-    RING_PUSH_REQUESTS(usb_ring);
-
-    spin_unlock_irq(&xhci->ring_lock);
-
-    notify_via_evtchn(xhci->evtchn);
-
-    while ( xhci->awaiting_reset > 0 )
-    {
-        mdelay(1);
-        xhci_drain_ring();
-    }
-
-    xhci->rh.ports[port].pe = 1;
-    xhci->rh.ports[port].pe_chg = 1;
-
-    return xhci->awaiting_reset;
-}
-
-
-/******************************************************************************
- * RING RESPONSE HANDLING
- */
-
-static void receive_usb_reset(usbif_response_t *resp)
-{
-    xhci->awaiting_reset = resp->status;
-    rmb();
-}
-
-static void receive_usb_probe(usbif_response_t *resp)
-{
-    spin_lock(&xhci->rh.port_state_lock);
-
-    if ( resp->status >= 0 )
-    {
-        if ( resp->status == 1 )
-        {
-            /* If there's a device there and there wasn't one before, there
-             * must have been a connection status change. */
-            if ( xhci->rh.ports[resp->data].cs == 0 )
-            {
-                xhci->rh.ports[resp->data].cs = 1;
-                xhci->rh.ports[resp->data].cs_chg = 1;
-            }
-        }
-        else if ( resp->status == 0 )
-        {
-            if ( xhci->rh.ports[resp->data].cs == 1 )
-            {
-                xhci->rh.ports[resp->data].cs = 0;
-                xhci->rh.ports[resp->data].cs_chg = 1;
-                xhci->rh.ports[resp->data].pe = 0;
-                /* According to USB Spec v2.0, 11.24.2.7.2.2, we don't need
-                 * to set pe_chg since an error has not occurred. */
-            }
-        }
-        else
-            printk(KERN_WARNING "receive_usb_probe(): unexpected status %d "
-                   "for port %d\n", resp->status, resp->data);
-    }
-    else if ( resp->status < 0 )
-        printk(KERN_WARNING "receive_usb_probe(): got error status %d\n",
-               resp->status);
-
-    spin_unlock(&xhci->rh.port_state_lock);
-}
-
-static void receive_usb_io(usbif_response_t *resp)
-{
-    struct urb_priv *urbp = (struct urb_priv *)resp->id;
-    struct urb *urb = urbp->urb;
-
-    urb->actual_length = resp->length;
-    urbp->in_progress = 0;
-
-    if ( usb_pipetype(urb->pipe) == 0 ) /* ISO */
-    {
-        int i;
-
-        /* Copy ISO schedule results back in. */
-        for ( i = 0; i < urb->number_of_packets; i++ )
-        {
-            urb->iso_frame_desc[i].status
-                = urbp->schedule[i].status;
-            urb->iso_frame_desc[i].actual_length
-                = urbp->schedule[i].length;
-        }
-        free_page((unsigned long)urbp->schedule);
-    }
-
-    /* Only set status if it's not been changed since submission.  It might
-     * have been changed if the URB has been unlinked asynchronously, for
-     * instance. */
-    if ( urb->status == -EINPROGRESS )
-        urbp->status = urb->status = resp->status;
-}
-
-/**
- * xhci_drain_ring - drain responses from the ring, calling handlers
- *
- * This may be called from interrupt context when an event is received from the
- * backend domain, or sometimes in process context whilst waiting for a port
- * reset or URB completion.
- */
-static void xhci_drain_ring(void)
-{
-    struct list_head *tmp, *head;
-    usbif_front_ring_t *usb_ring = &xhci->usb_ring;
-    usbif_response_t *resp;
-    RING_IDX i, rp;
-
-    /* Walk the ring here to get responses, updating URBs to show what
-     * completed. */
-
-    rp = usb_ring->sring->rsp_prod;
-    rmb(); /* Ensure we see queued responses up to 'rp'. */
-
-    /* Take items off the comms ring, taking care not to overflow.
*/ - for ( i = usb_ring->rsp_cons; i != rp; i++ ) - { - resp = RING_GET_RESPONSE(usb_ring, i); - - /* May need to deal with batching and with putting a ceiling on - the number dispatched for performance and anti-dos reasons */ - - xhci_show_resp(resp); - - switch ( resp->operation ) - { - case USBIF_OP_PROBE: - receive_usb_probe(resp); - break; - - case USBIF_OP_IO: - receive_usb_io(resp); - break; - - case USBIF_OP_RESET: - receive_usb_reset(resp); - break; - - default: - printk(KERN_WARNING - "error: unknown USB io operation response [%d]\n", - resp->operation); - break; - } - } - - usb_ring->rsp_cons = i; - - /* Walk the list of pending URB's to see which ones completed and do - * callbacks, etc. */ - spin_lock(&xhci->urb_list_lock); - head = &xhci->urb_list; - tmp = head->next; - while (tmp != head) { - struct urb *urb = list_entry(tmp, struct urb, urb_list); - - tmp = tmp->next; - - /* Checks the status and does all of the magic necessary */ - xhci_transfer_result(xhci, urb); - } - spin_unlock(&xhci->urb_list_lock); - - xhci_finish_completion(); -} - - -static void xhci_interrupt(int irq, void *__xhci, struct pt_regs *regs) -{ - xhci_drain_ring(); -} - -/****************************************************************************** - * HOST CONTROLLER FUNCTIONALITY - */ - -/** - * no-op implementation of private device alloc / free routines - */ -static int xhci_do_nothing_dev(struct usb_device *dev) -{ - return 0; -} - -static inline void xhci_add_complete(struct urb *urb) -{ - struct urb_priv *urbp = (struct urb_priv *)urb->hcpriv; - unsigned long flags; - - spin_lock_irqsave(&xhci->complete_list_lock, flags); - list_add_tail(&urbp->complete_list, &xhci->complete_list); - spin_unlock_irqrestore(&xhci->complete_list_lock, flags); -} - -/* When this returns, the owner of the URB may free its - * storage. - * - * We spin and wait for the URB to complete before returning. - * - * Call with urb->lock acquired. - */ -static void xhci_delete_urb(struct urb *urb) -{ - struct urb_priv *urbp; - - urbp = urb->hcpriv; - - /* If there's no urb_priv structure for this URB then it can't have - * been submitted at all. */ - if ( urbp == NULL ) - return; - - /* For now we just spin until the URB completes. It shouldn't take too - * long and we don't expect to have to do this very often. */ - while ( urb->status == -EINPROGRESS ) - { - xhci_drain_ring(); - mdelay(1); - } - - /* Now we know that further transfers to the buffer won't - * occur, so we can safely return. */ -} - -static struct urb_priv *xhci_alloc_urb_priv(struct urb *urb) -{ - struct urb_priv *urbp; - - urbp = kmem_cache_alloc(xhci_up_cachep, SLAB_ATOMIC); - if (!urbp) { - err("xhci_alloc_urb_priv: couldn't allocate memory for urb_priv\n"); - return NULL; - } - - memset((void *)urbp, 0, sizeof(*urbp)); - - urbp->inserttime = jiffies; - urbp->urb = urb; - urbp->dev = urb->dev; - - INIT_LIST_HEAD(&urbp->complete_list); - - urb->hcpriv = urbp; - - return urbp; -} - -/* - * MUST be called with urb->lock acquired - */ -/* When is this called? Do we need to stop the transfer (as we - * currently do)? 
*/ -static void xhci_destroy_urb_priv(struct urb *urb) -{ - struct urb_priv *urbp; - - urbp = (struct urb_priv *)urb->hcpriv; - if (!urbp) - return; - - if (!list_empty(&urb->urb_list)) - warn("xhci_destroy_urb_priv: urb %p still on xhci->urb_list", urb); - - if (!list_empty(&urbp->complete_list)) - warn("xhci_destroy_urb_priv: urb %p still on xhci->complete_list", urb); - - kmem_cache_free(xhci_up_cachep, urb->hcpriv); - - urb->hcpriv = NULL; -} - -/** - * Try to find URBs in progress on the same pipe to the same device. - * - * MUST be called with xhci->urb_list_lock acquired - */ -static struct urb *xhci_find_urb_ep(struct xhci *xhci, struct urb *urb) -{ - struct list_head *tmp, *head; - - /* We don't match Isoc transfers since they are special */ - if (usb_pipeisoc(urb->pipe)) - return NULL; - - head = &xhci->urb_list; - tmp = head->next; - while (tmp != head) { - struct urb *u = list_entry(tmp, struct urb, urb_list); - - tmp = tmp->next; - - if (u->dev == urb->dev && u->pipe == urb->pipe && - u->status == -EINPROGRESS) - return u; - } - - return NULL; -} - -static int xhci_submit_urb(struct urb *urb) -{ - int ret = -EINVAL; - unsigned long flags; - struct urb *eurb; - int bustime; - - DPRINTK("URB submitted to XHCI driver.\n"); - dump_urb(urb); - - if (!urb) - return -EINVAL; - - if (!urb->dev || !urb->dev->bus || !urb->dev->bus->hcpriv) { - warn("xhci_submit_urb: urb %p belongs to disconnected device or bus?", urb); - return -ENODEV; - } - - if ( urb->dev->devpath == NULL ) - BUG(); - - usb_inc_dev_use(urb->dev); - - spin_lock_irqsave(&xhci->urb_list_lock, flags); - spin_lock(&urb->lock); - - if (urb->status == -EINPROGRESS || urb->status == -ECONNRESET || - urb->status == -ECONNABORTED) { - dbg("xhci_submit_urb: urb not available to submit (status = %d)", urb->status); - /* Since we can have problems on the out path */ - spin_unlock(&urb->lock); - spin_unlock_irqrestore(&xhci->urb_list_lock, flags); - usb_dec_dev_use(urb->dev); - - return ret; - } - - INIT_LIST_HEAD(&urb->urb_list); - if (!xhci_alloc_urb_priv(urb)) { - ret = -ENOMEM; - - goto out; - } - - ( (struct urb_priv *)urb->hcpriv )->in_progress = 1; - - eurb = xhci_find_urb_ep(xhci, urb); - if (eurb && !(urb->transfer_flags & USB_QUEUE_BULK)) { - ret = -ENXIO; - - goto out; - } - - /* Short circuit the virtual root hub */ - if (urb->dev == xhci->rh.dev) { - ret = rh_submit_urb(urb); - - goto out; - } - - switch (usb_pipetype(urb->pipe)) { - case PIPE_CONTROL: - case PIPE_BULK: - ret = xhci_queue_req(urb); - break; - - case PIPE_INTERRUPT: - if (urb->bandwidth == 0) { /* not yet checked/allocated */ - bustime = usb_check_bandwidth(urb->dev, urb); - if (bustime < 0) - ret = bustime; - else { - ret = xhci_queue_req(urb); - if (ret == -EINPROGRESS) - usb_claim_bandwidth(urb->dev, urb, - bustime, 0); - } - } else /* bandwidth is already set */ - ret = xhci_queue_req(urb); - break; - - case PIPE_ISOCHRONOUS: - if (urb->bandwidth == 0) { /* not yet checked/allocated */ - if (urb->number_of_packets <= 0) { - ret = -EINVAL; - break; - } - bustime = usb_check_bandwidth(urb->dev, urb); - if (bustime < 0) { - ret = bustime; - break; - } - - ret = xhci_queue_req(urb); - if (ret == -EINPROGRESS) - usb_claim_bandwidth(urb->dev, urb, bustime, 1); - } else /* bandwidth is already set */ - ret = xhci_queue_req(urb); - break; - } -out: - urb->status = ret; - - if (ret == -EINPROGRESS) { - /* We use _tail to make find_urb_ep more efficient */ - list_add_tail(&urb->urb_list, &xhci->urb_list); - - spin_unlock(&urb->lock); - 
        spin_unlock_irqrestore(&xhci->urb_list_lock, flags);
-
-        return 0;
-    }
-
-    xhci_delete_urb(urb);
-
-    spin_unlock(&urb->lock);
-    spin_unlock_irqrestore(&xhci->urb_list_lock, flags);
-
-    /* Only call completion if it was successful. */
-    if (!ret)
-        xhci_call_completion(urb);
-
-    return ret;
-}
-
-/*
- * Return the result of a transfer
- *
- * MUST be called with urb_list_lock acquired
- */
-static void xhci_transfer_result(struct xhci *xhci, struct urb *urb)
-{
-    int ret = 0;
-    unsigned long flags;
-    struct urb_priv *urbp;
-
-    /* The root hub is special */
-    if (urb->dev == xhci->rh.dev)
-        return;
-
-    spin_lock_irqsave(&urb->lock, flags);
-
-    urbp = (struct urb_priv *)urb->hcpriv;
-
-    if (urbp->in_progress)
-        ret = -EINPROGRESS;
-
-    if (urb->actual_length < urb->transfer_buffer_length) {
-        if (urb->transfer_flags & USB_DISABLE_SPD) {
-            ret = -EREMOTEIO;
-        }
-    }
-
-    if (urb->status == -EPIPE)
-    {
-        ret = urb->status;
-        /* endpoint has stalled - mark it halted */
-        usb_endpoint_halt(urb->dev, usb_pipeendpoint(urb->pipe),
-                          usb_pipeout(urb->pipe));
-    }
-
-    if ((debug == 1 && ret != 0 && ret != -EPIPE) ||
-        (ret != 0 && debug > 1)) {
-        /* Some debugging code */
-        dbg("xhci_result_interrupt/bulk() failed with status %x",
-            ret);
-    }
-
-    if (ret == -EINPROGRESS)
-        goto out;
-
-    switch (usb_pipetype(urb->pipe)) {
-    case PIPE_CONTROL:
-    case PIPE_BULK:
-    case PIPE_ISOCHRONOUS:
-        /* Release bandwidth for Interrupt or Isoc. transfers */
-        /* Spinlock needed ? */
-        if (urb->bandwidth)
-            usb_release_bandwidth(urb->dev, urb, 1);
-        xhci_delete_urb(urb);
-        break;
-    case PIPE_INTERRUPT:
-        /* Interrupts are an exception */
-        if (urb->interval)
-            goto out_complete;
-
-        /* Release bandwidth for Interrupt or Isoc. transfers */
-        /* Spinlock needed ? */
-        if (urb->bandwidth)
-            usb_release_bandwidth(urb->dev, urb, 0);
-        xhci_delete_urb(urb);
-        break;
-    default:
-        info("xhci_transfer_result: unknown pipe type %d for urb %p\n",
-             usb_pipetype(urb->pipe), urb);
-    }
-
-    /* Remove it from xhci->urb_list */
-    list_del_init(&urb->urb_list);
-
-out_complete:
-    xhci_add_complete(urb);
-
-out:
-    spin_unlock_irqrestore(&urb->lock, flags);
-}
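
xhci_transfer_result() above boils down to a small status-resolution rule: a stall (-EPIPE) wins, then a short transfer is an error only when USB_DISABLE_SPD asked for one, then a still-in-progress URB is left alone. That rule in isolation, as a pure function (the flag value and errno choices mirror the code above; this is an illustration, not a drop-in replacement):

    #include <errno.h>
    #include <stdio.h>

    #define USB_DISABLE_SPD 0x0001  /* "short packets are errors" transfer flag */

    static int resolve_status(int in_progress, int status,
                              int actual_len, int wanted_len,
                              unsigned int flags)
    {
        if (status == -EPIPE)
            return -EPIPE;          /* endpoint stalled */
        if (actual_len < wanted_len && (flags & USB_DISABLE_SPD))
            return -EREMOTEIO;      /* short packet and caller disallowed it */
        if (in_progress)
            return -EINPROGRESS;    /* leave the URB alone for now */
        return status;
    }

    int main(void)
    {
        printf("%d\n", resolve_status(0, 0, 256, 512, USB_DISABLE_SPD)); /* -EREMOTEIO */
        printf("%d\n", resolve_status(0, 0, 256, 512, 0));               /* 0 */
        return 0;
    }
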
-static int xhci_unlink_urb(struct urb *urb)
-{
-    unsigned long flags;
-    struct urb_priv *urbp;
-
-    if (!urb)
-        return -EINVAL;
-
-    if (!urb->dev || !urb->dev->bus || !urb->dev->bus->hcpriv)
-        return -ENODEV;
-
-    urbp = urb->hcpriv;
-
-    spin_lock_irqsave(&xhci->urb_list_lock, flags);
-    spin_lock(&urb->lock);
-
-    /* Release bandwidth for Interrupt or Isoc. transfers */
-    /* Spinlock needed ? */
-    if (urb->bandwidth) {
-        switch (usb_pipetype(urb->pipe)) {
-        case PIPE_INTERRUPT:
-            usb_release_bandwidth(urb->dev, urb, 0);
-            break;
-        case PIPE_ISOCHRONOUS:
-            usb_release_bandwidth(urb->dev, urb, 1);
-            break;
-        default:
-            break;
-        }
-    }
-
-    if (urb->status != -EINPROGRESS) {
-        spin_unlock(&urb->lock);
-        spin_unlock_irqrestore(&xhci->urb_list_lock, flags);
-        return 0;
-    }
-
-    list_del_init(&urb->urb_list);
-
-    /* Short circuit the virtual root hub */
-    if (urb->dev == xhci->rh.dev) {
-        rh_unlink_urb(urb);
-
-        spin_unlock(&urb->lock);
-        spin_unlock_irqrestore(&xhci->urb_list_lock, flags);
-
-        xhci_call_completion(urb);
-    } else {
-        if (urb->transfer_flags & USB_ASYNC_UNLINK) {
-            /* We don't currently attempt to cancel URBs that have been
-             * queued in the ring.  We handle async unlinked URBs when
-             * they complete. */
-            urbp->status = urb->status = -ECONNABORTED;
-            spin_unlock(&urb->lock);
-            spin_unlock_irqrestore(&xhci->urb_list_lock, flags);
-        } else {
-            urb->status = -ENOENT;
-
-            spin_unlock(&urb->lock);
-            spin_unlock_irqrestore(&xhci->urb_list_lock, flags);
-
-            if (in_interrupt()) { /* wait at least 1 frame */
-                static int errorcount = 10;
-
-                if (errorcount--)
-                    dbg("xhci_unlink_urb called from interrupt for urb %p", urb);
-                udelay(1000);
-            } else
-                schedule_timeout(1+1*HZ/1000);
-
-            xhci_delete_urb(urb);
-
-            xhci_call_completion(urb);
-        }
-    }
-
-    return 0;
-}
-
-static void xhci_call_completion(struct urb *urb)
-{
-    struct urb_priv *urbp;
-    struct usb_device *dev = urb->dev;
-    int is_ring = 0, killed, resubmit_interrupt, status;
-    struct urb *nurb;
-    unsigned long flags;
-
-    spin_lock_irqsave(&urb->lock, flags);
-
-    urbp = (struct urb_priv *)urb->hcpriv;
-    if (!urbp || !urb->dev) {
-        spin_unlock_irqrestore(&urb->lock, flags);
-        return;
-    }
-
-    killed = (urb->status == -ENOENT || urb->status == -ECONNABORTED ||
-              urb->status == -ECONNRESET);
-    resubmit_interrupt = (usb_pipetype(urb->pipe) == PIPE_INTERRUPT &&
-                          urb->interval);
-
-    nurb = urb->next;
-    if (nurb && !killed) {
-        int count = 0;
-
-        while (nurb && nurb != urb && count < MAX_URB_LOOP) {
-            if (nurb->status == -ENOENT ||
-                nurb->status == -ECONNABORTED ||
-                nurb->status == -ECONNRESET) {
-                killed = 1;
-                break;
-            }
-
-            nurb = nurb->next;
-            count++;
-        }
-
-        if (count == MAX_URB_LOOP)
-            err("xhci_call_completion: too many linked URBs, loop? (first loop)");
-
-        /* Check to see if chain is a ring */
-        is_ring = (nurb == urb);
-    }
-
-    status = urbp->status;
-    if (!resubmit_interrupt || killed)
-        /* We don't need urb_priv anymore */
-        xhci_destroy_urb_priv(urb);
-
-    if (!killed)
-        urb->status = status;
-
-    spin_unlock_irqrestore(&urb->lock, flags);
-
-    if (urb->complete)
-        urb->complete(urb);
-
-    if (resubmit_interrupt)
-        /* Recheck the status.  The completion handler may have
-         * unlinked the resubmitting interrupt URB. */
-        killed = (urb->status == -ENOENT ||
-                  urb->status == -ECONNABORTED ||
-                  urb->status == -ECONNRESET);
-
-    if (resubmit_interrupt && !killed) {
-        if ( urb->dev != xhci->rh.dev )
-            xhci_queue_req(urb); /* XXX What if this fails? */
-        /* Don't need to resubmit URBs for the virtual root dev. */
-    } else {
-        if (is_ring && !killed) {
-            urb->dev = dev;
-            xhci_submit_urb(urb);
-        } else {
-            /* We decrement the usage count after we're done
-             * with everything. */
-            usb_dec_dev_use(dev);
-        }
-    }
-}
-
-static void xhci_finish_completion(void)
-{
-    struct list_head *tmp, *head;
-    unsigned long flags;
-
-    spin_lock_irqsave(&xhci->complete_list_lock, flags);
-    head = &xhci->complete_list;
-    tmp = head->next;
-    while (tmp != head) {
-        struct urb_priv *urbp = list_entry(tmp, struct urb_priv,
-                                           complete_list);
-        struct urb *urb = urbp->urb;
-
-        list_del_init(&urbp->complete_list);
-        spin_unlock_irqrestore(&xhci->complete_list_lock, flags);
-
-        xhci_call_completion(urb);
-
-        spin_lock_irqsave(&xhci->complete_list_lock, flags);
-        head = &xhci->complete_list;
-        tmp = head->next;
-    }
-    spin_unlock_irqrestore(&xhci->complete_list_lock, flags);
-}
-
-static struct usb_operations xhci_device_operations = {
-    .allocate = xhci_do_nothing_dev,
-    .deallocate = xhci_do_nothing_dev,
-    /* It doesn't look like any drivers actually care what the frame number
-     * is at the moment!  If necessary, we could approximate the current
-     * frame number by passing it from the backend in response messages.
*/ - .get_frame_number = NULL, - .submit_urb = xhci_submit_urb, - .unlink_urb = xhci_unlink_urb -}; - -/****************************************************************************** - * VIRTUAL ROOT HUB EMULATION - */ - -static __u8 root_hub_dev_des[] = -{ - 0x12, /* __u8 bLength; */ - 0x01, /* __u8 bDescriptorType; Device */ - 0x00, /* __u16 bcdUSB; v1.0 */ - 0x01, - 0x09, /* __u8 bDeviceClass; HUB_CLASSCODE */ - 0x00, /* __u8 bDeviceSubClass; */ - 0x00, /* __u8 bDeviceProtocol; */ - 0x08, /* __u8 bMaxPacketSize0; 8 Bytes */ - 0x00, /* __u16 idVendor; */ - 0x00, - 0x00, /* __u16 idProduct; */ - 0x00, - 0x00, /* __u16 bcdDevice; */ - 0x00, - 0x00, /* __u8 iManufacturer; */ - 0x02, /* __u8 iProduct; */ - 0x01, /* __u8 iSerialNumber; */ - 0x01 /* __u8 bNumConfigurations; */ -}; - - -/* Configuration descriptor */ -static __u8 root_hub_config_des[] = -{ - 0x09, /* __u8 bLength; */ - 0x02, /* __u8 bDescriptorType; Configuration */ - 0x19, /* __u16 wTotalLength; */ - 0x00, - 0x01, /* __u8 bNumInterfaces; */ - 0x01, /* __u8 bConfigurationValue; */ - 0x00, /* __u8 iConfiguration; */ - 0x40, /* __u8 bmAttributes; - Bit 7: Bus-powered, 6: Self-powered, - Bit 5 Remote-wakeup, 4..0: resvd */ - 0x00, /* __u8 MaxPower; */ - - /* interface */ - 0x09, /* __u8 if_bLength; */ - 0x04, /* __u8 if_bDescriptorType; Interface */ - 0x00, /* __u8 if_bInterfaceNumber; */ - 0x00, /* __u8 if_bAlternateSetting; */ - 0x01, /* __u8 if_bNumEndpoints; */ - 0x09, /* __u8 if_bInterfaceClass; HUB_CLASSCODE */ - 0x00, /* __u8 if_bInterfaceSubClass; */ - 0x00, /* __u8 if_bInterfaceProtocol; */ - 0x00, /* __u8 if_iInterface; */ - - /* endpoint */ - 0x07, /* __u8 ep_bLength; */ - 0x05, /* __u8 ep_bDescriptorType; Endpoint */ - 0x81, /* __u8 ep_bEndpointAddress; IN Endpoint 1 */ - 0x03, /* __u8 ep_bmAttributes; Interrupt */ - 0x08, /* __u16 ep_wMaxPacketSize; 8 Bytes */ - 0x00, - 0xff /* __u8 ep_bInterval; 255 ms */ -}; - -static __u8 root_hub_hub_des[] = -{ - 0x09, /* __u8 bLength; */ - 0x29, /* __u8 bDescriptorType; Hub-descriptor */ - 0x02, /* __u8 bNbrPorts; */ - 0x00, /* __u16 wHubCharacteristics; */ - 0x00, - 0x01, /* __u8 bPwrOn2pwrGood; 2ms */ - 0x00, /* __u8 bHubContrCurrent; 0 mA */ - 0x00, /* __u8 DeviceRemovable; *** 7 Ports max *** */ - 0xff /* __u8 PortPwrCtrlMask; *** 7 ports max *** */ -}; - -/* prepare Interrupt pipe transaction data; HUB INTERRUPT ENDPOINT */ -static int rh_send_irq(struct urb *urb) -{ - struct urb_priv *urbp = (struct urb_priv *)urb->hcpriv; - xhci_port_t *ports = xhci->rh.ports; - unsigned long flags; - int i, len = 1; - __u16 data = 0; - - spin_lock_irqsave(&urb->lock, flags); - for (i = 0; i < xhci->rh.numports; i++) { - /* Set a bit if anything at all has changed on the port, as per - * USB spec 11.12 */ - data |= (ports[i].cs_chg || ports[i].pe_chg ) - ? 
(1 << (i + 1)) - : 0; - - len = (i + 1) / 8 + 1; - } - - *(__u16 *) urb->transfer_buffer = cpu_to_le16(data); - urb->actual_length = len; - urbp->status = 0; - - spin_unlock_irqrestore(&urb->lock, flags); - - if ((data > 0) && (xhci->rh.send != 0)) { - dbg("root-hub INT complete: data: %x", data); - xhci_call_completion(urb); - } - - return 0; -} - -/* Virtual Root Hub INTs are polled by this timer every "interval" ms */ -static int rh_init_int_timer(struct urb *urb); - -static void rh_int_timer_do(unsigned long ptr) -{ - struct urb *urb = (struct urb *)ptr; - struct list_head list, *tmp, *head; - unsigned long flags; - int i; - - for ( i = 0; i < xhci->rh.numports; i++) - xhci_queue_probe(i); - - if (xhci->rh.send) - rh_send_irq(urb); - - INIT_LIST_HEAD(&list); - - spin_lock_irqsave(&xhci->urb_list_lock, flags); - head = &xhci->urb_list; - tmp = head->next; - while (tmp != head) { - struct urb *u = list_entry(tmp, struct urb, urb_list); - struct urb_priv *up = (struct urb_priv *)u->hcpriv; - - tmp = tmp->next; - - spin_lock(&u->lock); - - /* Check if the URB timed out */ - if (u->timeout && time_after_eq(jiffies, - up->inserttime + u->timeout)) { - list_del(&u->urb_list); - list_add_tail(&u->urb_list, &list); - } - - spin_unlock(&u->lock); - } - spin_unlock_irqrestore(&xhci->urb_list_lock, flags); - - head = &list; - tmp = head->next; - while (tmp != head) { - struct urb *u = list_entry(tmp, struct urb, urb_list); - - tmp = tmp->next; - - u->transfer_flags |= USB_ASYNC_UNLINK | USB_TIMEOUT_KILLED; - xhci_unlink_urb(u); - } - - rh_init_int_timer(urb); -} - -/* Root Hub INTs are polled by this timer */ -static int rh_init_int_timer(struct urb *urb) -{ - xhci->rh.interval = urb->interval; - init_timer(&xhci->rh.rh_int_timer); - xhci->rh.rh_int_timer.function = rh_int_timer_do; - xhci->rh.rh_int_timer.data = (unsigned long)urb; - xhci->rh.rh_int_timer.expires = jiffies - + (HZ * (urb->interval < 30 ? 
30 : urb->interval)) / 1000; - add_timer(&xhci->rh.rh_int_timer); - - return 0; -} - -#define OK(x) len = (x); break - -/* Root Hub Control Pipe */ -static int rh_submit_urb(struct urb *urb) -{ - unsigned int pipe = urb->pipe; - struct usb_ctrlrequest *cmd = - (struct usb_ctrlrequest *)urb->setup_packet; - void *data = urb->transfer_buffer; - int leni = urb->transfer_buffer_length; - int len = 0; - xhci_port_t *status; - int stat = 0; - int i; - int retstatus; - unsigned long flags; - - __u16 cstatus; - __u16 bmRType_bReq; - __u16 wValue; - __u16 wIndex; - __u16 wLength; - - if (usb_pipetype(pipe) == PIPE_INTERRUPT) { - xhci->rh.urb = urb; - xhci->rh.send = 1; - xhci->rh.interval = urb->interval; - rh_init_int_timer(urb); - - return -EINPROGRESS; - } - - bmRType_bReq = cmd->bRequestType | cmd->bRequest << 8; - wValue = le16_to_cpu(cmd->wValue); - wIndex = le16_to_cpu(cmd->wIndex); - wLength = le16_to_cpu(cmd->wLength); - - for (i = 0; i < 8; i++) - xhci->rh.c_p_r[i] = 0; - - status = &xhci->rh.ports[wIndex - 1]; - - spin_lock_irqsave(&xhci->rh.port_state_lock, flags); - - switch (bmRType_bReq) { - /* Request Destination: - without flags: Device, - RH_INTERFACE: interface, - RH_ENDPOINT: endpoint, - RH_CLASS means HUB here, - RH_OTHER | RH_CLASS almost ever means HUB_PORT here - */ - - case RH_GET_STATUS: - *(__u16 *)data = cpu_to_le16(1); - OK(2); - case RH_GET_STATUS | RH_INTERFACE: - *(__u16 *)data = cpu_to_le16(0); - OK(2); - case RH_GET_STATUS | RH_ENDPOINT: - *(__u16 *)data = cpu_to_le16(0); - OK(2); - case RH_GET_STATUS | RH_CLASS: - *(__u32 *)data = cpu_to_le32(0); - OK(4); /* hub power */ - case RH_GET_STATUS | RH_OTHER | RH_CLASS: - cstatus = (status->cs_chg) | - (status->pe_chg << 1) | - (xhci->rh.c_p_r[wIndex - 1] << 4); - retstatus = (status->cs) | - (status->pe << 1) | - (status->susp << 2) | - (1 << 8) | /* power on */ - (status->lsda << 9); - *(__u16 *)data = cpu_to_le16(retstatus); - *(__u16 *)(data + 2) = cpu_to_le16(cstatus); - OK(4); - case RH_CLEAR_FEATURE | RH_ENDPOINT: - switch (wValue) { - case RH_ENDPOINT_STALL: - OK(0); - } - break; - case RH_CLEAR_FEATURE | RH_CLASS: - switch (wValue) { - case RH_C_HUB_OVER_CURRENT: - OK(0); /* hub power over current */ - } - break; - case RH_CLEAR_FEATURE | RH_OTHER | RH_CLASS: - switch (wValue) { - case RH_PORT_ENABLE: - status->pe = 0; - OK(0); - case RH_PORT_SUSPEND: - status->susp = 0; - OK(0); - case RH_PORT_POWER: - OK(0); /* port power */ - case RH_C_PORT_CONNECTION: - status->cs_chg = 0; - OK(0); - case RH_C_PORT_ENABLE: - status->pe_chg = 0; - OK(0); - case RH_C_PORT_SUSPEND: - /*** WR_RH_PORTSTAT(RH_PS_PSSC); */ - OK(0); - case RH_C_PORT_OVER_CURRENT: - OK(0); /* port power over current */ - case RH_C_PORT_RESET: - xhci->rh.c_p_r[wIndex - 1] = 0; - OK(0); - } - break; - case RH_SET_FEATURE | RH_OTHER | RH_CLASS: - switch (wValue) { - case RH_PORT_SUSPEND: - status->susp = 1; - OK(0); - case RH_PORT_RESET: - { - int ret; - xhci->rh.c_p_r[wIndex - 1] = 1; - status->pr = 0; - status->pe = 1; - ret = xhci_port_reset(wIndex - 1); - /* XXX MAW: should probably cancel queued transfers during reset... 
 */
-                if ( ret == 0 ) { OK(0); }
-                else { return ret; }
-            }
-            break;
-        case RH_PORT_POWER:
-            OK(0); /* port power ** */
-        case RH_PORT_ENABLE:
-            status->pe = 1;
-            OK(0);
-        }
-        break;
-    case RH_SET_ADDRESS:
-        xhci->rh.devnum = wValue;
-        OK(0);
-    case RH_GET_DESCRIPTOR:
-        switch ((wValue & 0xff00) >> 8) {
-        case 0x01: /* device descriptor */
-            len = min_t(unsigned int, leni,
-                        min_t(unsigned int,
-                              sizeof(root_hub_dev_des), wLength));
-            memcpy(data, root_hub_dev_des, len);
-            OK(len);
-        case 0x02: /* configuration descriptor */
-            len = min_t(unsigned int, leni,
-                        min_t(unsigned int,
-                              sizeof(root_hub_config_des), wLength));
-            memcpy(data, root_hub_config_des, len);
-            OK(len);
-        case 0x03: /* string descriptors */
-            len = usb_root_hub_string(wValue & 0xff,
-                                      0, "XHCI-alt",
-                                      data, wLength);
-            if (len > 0) {
-                OK(min_t(int, leni, len));
-            } else
-                stat = -EPIPE;
-        }
-        break;
-    case RH_GET_DESCRIPTOR | RH_CLASS:
-        root_hub_hub_des[2] = xhci->rh.numports;
-        len = min_t(unsigned int, leni,
-                    min_t(unsigned int, sizeof(root_hub_hub_des), wLength));
-        memcpy(data, root_hub_hub_des, len);
-        OK(len);
-    case RH_GET_CONFIGURATION:
-        *(__u8 *)data = 0x01;
-        OK(1);
-    case RH_SET_CONFIGURATION:
-        OK(0);
-    case RH_GET_INTERFACE | RH_INTERFACE:
-        *(__u8 *)data = 0x00;
-        OK(1);
-    case RH_SET_INTERFACE | RH_INTERFACE:
-        OK(0);
-    default:
-        stat = -EPIPE;
-    }
-
-    spin_unlock_irqrestore(&xhci->rh.port_state_lock, flags);
-
-    urb->actual_length = len;
-
-    return stat;
-}
-
-/*
- * MUST be called with urb->lock acquired
- */
-static int rh_unlink_urb(struct urb *urb)
-{
-    if (xhci->rh.urb == urb) {
-        urb->status = -ENOENT;
-        xhci->rh.send = 0;
-        xhci->rh.urb = NULL;
-        del_timer(&xhci->rh.rh_int_timer);
-    }
-    return 0;
-}
-
-/******************************************************************************
- * CONTROL PLANE FUNCTIONALITY
- */
-
-/**
- * alloc_xhci - initialise a new virtual root hub for a new USB device channel
- */
-static int alloc_xhci(void)
-{
-    int retval;
-    struct usb_bus *bus;
-
-    retval = -EBUSY;
-
-    xhci = kmalloc(sizeof(*xhci), GFP_KERNEL);
-    if (!xhci) {
-        err("couldn't allocate xhci structure");
-        retval = -ENOMEM;
-        goto err_alloc_xhci;
-    }
-
-    xhci->state = USBIF_STATE_CLOSED;
-
-    spin_lock_init(&xhci->urb_list_lock);
-    INIT_LIST_HEAD(&xhci->urb_list);
-
-    spin_lock_init(&xhci->complete_list_lock);
-    INIT_LIST_HEAD(&xhci->complete_list);
-
-    spin_lock_init(&xhci->frame_list_lock);
-
-    bus = usb_alloc_bus(&xhci_device_operations);
-
-    if (!bus) {
-        err("unable to allocate bus");
-        goto err_alloc_bus;
-    }
-
-    xhci->bus = bus;
-    bus->bus_name = "XHCI";
-    bus->hcpriv = xhci;
-
-    usb_register_bus(xhci->bus);
-
-    /* Initialize the root hub */
-
-    xhci->rh.numports = 0;
-
-    xhci->bus->root_hub = xhci->rh.dev = usb_alloc_dev(NULL, xhci->bus);
-    if (!xhci->rh.dev) {
-        err("unable to allocate root hub");
-        goto err_alloc_root_hub;
-    }
-
-    xhci->state = 0;
-
-    return 0;
-
-/*
- * error exits:
- */
-err_alloc_root_hub:
-    usb_deregister_bus(xhci->bus);
-    usb_free_bus(xhci->bus);
-    xhci->bus = NULL;
-
-err_alloc_bus:
-    kfree(xhci);
-
-err_alloc_xhci:
-    return retval;
-}
-
-/**
- * usbif_status_change - deal with an incoming USB_INTERFACE_STATUS_ message
- */
-static void usbif_status_change(usbif_fe_interface_status_changed_t *status)
-{
-    ctrl_msg_t cmsg;
-    usbif_fe_interface_connect_t up;
-    long rc;
-    usbif_sring_t *sring;
-
-    switch ( status->status )
-    {
-    case USBIF_INTERFACE_STATUS_DESTROYED:
-        printk(KERN_WARNING "Unexpected usbif-DESTROYED message in state %d\n",
-               xhci->state);
-        break;
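
/*
 * (Illustrative summary, not part of the original file: the cases below
 * implement the interface's forward transitions, using the states defined
 * in xhci.h:
 *
 *   USBIF_STATE_CLOSED        -- DISCONNECTED message -->
 *       allocate and share the request ring, then ask the domain
 *       controller to connect the interface;
 *
 *   USBIF_STATE_DISCONNECTED  -- CONNECTED message -->
 *       bind the event channel, allocate root-hub port state, register
 *       the root hub and start probing ports.
 *
 * A DESTROYED message is never expected while the frontend holds the
 * interface, hence the warning above.)
 */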
- - case USBIF_INTERFACE_STATUS_DISCONNECTED: - if ( xhci->state != USBIF_STATE_CLOSED ) - { - printk(KERN_WARNING "Unexpected usbif-DISCONNECTED message" - " in state %d\n", xhci->state); - break; - /* Not bothering to do recovery here for now. Keep things - * simple. */ - - spin_lock_irq(&xhci->ring_lock); - - /* Clean up resources. */ - free_page((unsigned long)xhci->usb_ring.sring); - free_irq(xhci->irq, xhci); - unbind_evtchn_from_irq(xhci->evtchn); - - /* Plug the ring. */ - xhci->recovery = 1; - wmb(); - - spin_unlock_irq(&xhci->ring_lock); - } - - /* Move from CLOSED to DISCONNECTED state. */ - sring = (usbif_sring_t *)__get_free_page(GFP_KERNEL); - SHARED_RING_INIT(sring); - FRONT_RING_INIT(&xhci->usb_ring, sring, PAGE_SIZE); - xhci->state = USBIF_STATE_DISCONNECTED; - - /* Construct an interface-CONNECT message for the domain controller. */ - cmsg.type = CMSG_USBIF_FE; - cmsg.subtype = CMSG_USBIF_FE_INTERFACE_CONNECT; - cmsg.length = sizeof(usbif_fe_interface_connect_t); - up.shmem_frame = virt_to_machine(sring) >> PAGE_SHIFT; - memcpy(cmsg.msg, &up, sizeof(up)); - - /* Tell the controller to bring up the interface. */ - ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); - break; - - case USBIF_INTERFACE_STATUS_CONNECTED: - if ( xhci->state == USBIF_STATE_CLOSED ) - { - printk(KERN_WARNING "Unexpected usbif-CONNECTED message" - " in state %d\n", xhci->state); - break; - } - - xhci->evtchn = status->evtchn; - xhci->irq = bind_evtchn_to_irq(xhci->evtchn); - xhci->bandwidth = status->bandwidth; - xhci->rh.numports = status->num_ports; - - xhci->rh.ports = kmalloc (sizeof(xhci_port_t) * xhci->rh.numports, GFP_KERNEL); - - if ( xhci->rh.ports == NULL ) - goto alloc_ports_nomem; - - memset(xhci->rh.ports, 0, sizeof(xhci_port_t) * xhci->rh.numports); - - usb_connect(xhci->rh.dev); - - if (usb_new_device(xhci->rh.dev) != 0) { - err("unable to start root hub"); - } - - /* Allocate the appropriate USB bandwidth here... Need to - * somehow know what the total available is thought to be so we - * can calculate the reservation correctly. */ - usb_claim_bandwidth(xhci->rh.dev, xhci->rh.urb, - 1000 - xhci->bandwidth, 0); - - if ( (rc = request_irq(xhci->irq, xhci_interrupt, - SA_SAMPLE_RANDOM, "usbif", xhci)) ) - printk(KERN_ALERT"usbfront request_irq failed (%ld)\n",rc); - - DPRINTK(KERN_INFO __FILE__ - ": USB XHCI: SHM at %p (0x%lx), EVTCHN %d IRQ %d\n", - xhci->usb_ring.sring, virt_to_machine(xhci->usbif), - xhci->evtchn, xhci->irq); - - xhci->state = USBIF_STATE_CONNECTED; - - break; - - default: - printk(KERN_WARNING "Status change to unknown value %d\n", - status->status); - break; - } - - return; - - alloc_ports_nomem: - printk(KERN_WARNING "Failed to allocate port memory, XHCI failed to connect.\n"); - return; -} - -/** - * usbif_ctrlif_rx - demux control messages by subtype - */ -static void usbif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) -{ - switch ( msg->subtype ) - { - case CMSG_USBIF_FE_INTERFACE_STATUS_CHANGED: - usbif_status_change((usbif_fe_interface_status_changed_t *) - &msg->msg[0]); - break; - - /* New interface...? */ - default: - msg->length = 0; - break; - } - - ctrl_if_send_response(msg); -} - -static void send_driver_up(void) -{ - control_msg_t cmsg; - usbif_fe_interface_status_changed_t st; - - /* Send a driver-UP notification to the domain controller. 
*/ - cmsg.type = CMSG_USBIF_FE; - cmsg.subtype = CMSG_USBIF_FE_DRIVER_STATUS_CHANGED; - cmsg.length = sizeof(usbif_fe_driver_status_changed_t); - st.status = USBIF_DRIVER_STATUS_UP; - memcpy(cmsg.msg, &st, sizeof(st)); - ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); -} - -void usbif_resume(void) -{ - int i; - - /* Fake disconnection on all virtual USB ports (suspending / migrating - * will destroy hard state associated will the USB devices anyhow). */ - /* No need to lock here. */ - for ( i = 0; i < xhci->rh.numports; i++ ) - { - xhci->rh.ports[i].cs = 0; - xhci->rh.ports[i].cs_chg = 1; - xhci->rh.ports[i].pe = 0; - } - - send_driver_up(); -} - -static int __init xhci_hcd_init(void) -{ - int retval = -ENOMEM, i; - - if ( (xen_start_info.flags & SIF_INITDOMAIN) - || (xen_start_info.flags & SIF_USB_BE_DOMAIN) ) - return 0; - - info(DRIVER_DESC " " DRIVER_VERSION); - - if (debug) { - errbuf = kmalloc(ERRBUF_LEN, GFP_KERNEL); - if (!errbuf) - goto errbuf_failed; - } - - xhci_up_cachep = kmem_cache_create("xhci_urb_priv", - sizeof(struct urb_priv), 0, 0, NULL, NULL); - if (!xhci_up_cachep) - goto up_failed; - - /* Let the domain controller know we're here. For now we wait until - * connection, as for the block and net drivers. This is only strictly - * necessary if we're going to boot off a USB device. */ - printk(KERN_INFO "Initialising Xen virtual USB hub\n"); - - (void)ctrl_if_register_receiver(CMSG_USBIF_FE, usbif_ctrlif_rx, - CALLBACK_IN_BLOCKING_CONTEXT); - - alloc_xhci(); - - send_driver_up(); - - /* - * We should read 'nr_interfaces' from response message and wait - * for notifications before proceeding. For now we assume that we - * will be notified of exactly one interface. - */ - for ( i=0; (xhci->state != USBIF_STATE_CONNECTED) && (i < 10*HZ); i++ ) - { - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(1); - } - - if (xhci->state != USBIF_STATE_CONNECTED) - printk(KERN_WARNING "Timeout connecting USB frontend driver!\n"); - - return 0; - -up_failed: - if (errbuf) - kfree(errbuf); - -errbuf_failed: - return retval; -} - -module_init(xhci_hcd_init); - -MODULE_AUTHOR(DRIVER_AUTHOR); -MODULE_DESCRIPTION(DRIVER_DESC); -MODULE_LICENSE("GPL"); - diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/usbfront/xhci.h --- a/linux-2.6.11-xen-sparse/drivers/xen/usbfront/xhci.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,183 +0,0 @@ -/****************************************************************************** - * xhci.h - * - * Private definitions for the Xen Virtual USB Controller. Based on - * drivers/usb/host/uhci.h from Linux. Copyright for the imported content is - * retained by the original authors. - * - * Modifications are: - * Copyright (C) 2004 Intel Research Cambridge - * Copyright (C) 2004, 2005 Mark Williamson - */ - -#ifndef __LINUX_XHCI_H -#define __LINUX_XHCI_H - -#include <linux/list.h> -#include <linux/usb.h> -#include <asm-xen/xen-public/io/usbif.h> -#include <linux/spinlock.h> - -/* xhci_port_t - current known state of a virtual hub ports */ -typedef struct { - unsigned int cs :1; /* Connection status. */ - unsigned int cs_chg :1; /* Connection status change. */ - unsigned int pe :1; /* Port enable. */ - unsigned int pe_chg :1; /* Port enable change. */ - unsigned int susp :1; /* Suspended. */ - unsigned int lsda :1; /* Low speed device attached. */ - unsigned int pr :1; /* Port reset. 
*/ -} xhci_port_t; - -/* struct virt_root_hub - state related to the virtual root hub */ -struct virt_root_hub { - struct usb_device *dev; - int devnum; /* Address of Root Hub endpoint */ - struct urb *urb; - void *int_addr; - int send; - int interval; - int numports; - int c_p_r[8]; - struct timer_list rh_int_timer; - spinlock_t port_state_lock; - xhci_port_t *ports; -}; - -/* struct xhci - contains the state associated with a single USB interface */ -struct xhci { - -#ifdef CONFIG_PROC_FS - /* procfs */ - int num; - struct proc_dir_entry *proc_entry; -#endif - - int evtchn; /* Interdom channel to backend */ - int irq; /* Bound to evtchn */ - enum { - USBIF_STATE_CONNECTED = 2, - USBIF_STATE_DISCONNECTED = 1, - USBIF_STATE_CLOSED = 0 - } state; /* State of this USB interface */ - unsigned long recovery; /* boolean recovery in progress flag */ - - unsigned long bandwidth; - - struct usb_bus *bus; - - /* Main list of URB's currently controlled by this HC */ - spinlock_t urb_list_lock; - struct list_head urb_list; /* P: xhci->urb_list_lock */ - - /* List of URB's awaiting completion callback */ - spinlock_t complete_list_lock; - struct list_head complete_list; /* P: xhci->complete_list_lock */ - - struct virt_root_hub rh; /* private data of the virtual root hub */ - - spinlock_t ring_lock; - usbif_front_ring_t usb_ring; - - int awaiting_reset; -}; - -/* per-URB private data structure for the host controller */ -struct urb_priv { - struct urb *urb; - usbif_iso_t *schedule; - struct usb_device *dev; - - int in_progress : 1; /* QH was queued (not linked in) */ - int short_control_packet : 1; /* If we get a short packet during */ - /* a control transfer, retrigger */ - /* the status phase */ - - int status; /* Final status */ - - unsigned long inserttime; /* In jiffies */ - - struct list_head complete_list; /* P: xhci->complete_list_lock */ -}; - -/* - * Locking in xhci.c - * - * spinlocks are used extensively to protect the many lists and data - * structures we have. It's not that pretty, but it's necessary. We - * need to be done with all of the locks (except complete_list_lock) when - * we call urb->complete. I've tried to make it simple enough so I don't - * have to spend hours racking my brain trying to figure out if the - * locking is safe. - * - * Here's the safe locking order to prevent deadlocks: - * - * #1 xhci->urb_list_lock - * #2 urb->lock - * #3 xhci->urb_remove_list_lock - * #4 xhci->complete_list_lock - * - * If you're going to grab 2 or more locks at once, ALWAYS grab the lock - * at the lowest level FIRST and NEVER grab locks at the same level at the - * same time. 
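 *
 * (Illustrative example, not from the original file: the driver's submit
 * path follows exactly this order -
 *
 *     spin_lock_irqsave(&xhci->urb_list_lock, flags);      lock #1
 *     spin_lock(&urb->lock);                               lock #2
 *     ...
 *     spin_unlock(&urb->lock);
 *     spin_unlock_irqrestore(&xhci->urb_list_lock, flags);
 *
 * - releasing in the reverse order of acquisition.)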
- * - * So, if you need xhci->urb_list_lock, grab it before you grab urb->lock - */ - -/* ------------------------------------------------------------------------- - Virtual Root HUB - ------------------------------------------------------------------------- */ -/* destination of request */ -#define RH_DEVICE 0x00 -#define RH_INTERFACE 0x01 -#define RH_ENDPOINT 0x02 -#define RH_OTHER 0x03 - -#define RH_CLASS 0x20 -#define RH_VENDOR 0x40 - -/* Requests: bRequest << 8 | bmRequestType */ -#define RH_GET_STATUS 0x0080 -#define RH_CLEAR_FEATURE 0x0100 -#define RH_SET_FEATURE 0x0300 -#define RH_SET_ADDRESS 0x0500 -#define RH_GET_DESCRIPTOR 0x0680 -#define RH_SET_DESCRIPTOR 0x0700 -#define RH_GET_CONFIGURATION 0x0880 -#define RH_SET_CONFIGURATION 0x0900 -#define RH_GET_STATE 0x0280 -#define RH_GET_INTERFACE 0x0A80 -#define RH_SET_INTERFACE 0x0B00 -#define RH_SYNC_FRAME 0x0C80 -/* Our Vendor Specific Request */ -#define RH_SET_EP 0x2000 - -/* Hub port features */ -#define RH_PORT_CONNECTION 0x00 -#define RH_PORT_ENABLE 0x01 -#define RH_PORT_SUSPEND 0x02 -#define RH_PORT_OVER_CURRENT 0x03 -#define RH_PORT_RESET 0x04 -#define RH_PORT_POWER 0x08 -#define RH_PORT_LOW_SPEED 0x09 -#define RH_C_PORT_CONNECTION 0x10 -#define RH_C_PORT_ENABLE 0x11 -#define RH_C_PORT_SUSPEND 0x12 -#define RH_C_PORT_OVER_CURRENT 0x13 -#define RH_C_PORT_RESET 0x14 - -/* Hub features */ -#define RH_C_HUB_LOCAL_POWER 0x00 -#define RH_C_HUB_OVER_CURRENT 0x01 -#define RH_DEVICE_REMOTE_WAKEUP 0x00 -#define RH_ENDPOINT_STALL 0x01 - -/* Our Vendor Specific feature */ -#define RH_REMOVE_EP 0x00 - -#define RH_ACK 0x01 -#define RH_REQ_ERR -1 -#define RH_NACK 0x00 - -#endif - diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/xenbus/Makefile --- a/linux-2.6.11-xen-sparse/drivers/xen/xenbus/Makefile Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,10 +0,0 @@ -obj-y += xenbus.o - -xenbus-objs = -xenbus-objs += xenbus_comms.o -xenbus-objs += xenbus_xs.o -xenbus-objs += xenbus_probe.o - -XEN_TOOLS_DIR := "../tools" -vpath %.h $(XEN_TOOLS_DIR) -EXTRA_CFLAGS += -I $(XEN_TOOLS_DIR) diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/xenbus/xenbus_comms.c --- a/linux-2.6.11-xen-sparse/drivers/xen/xenbus/xenbus_comms.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,208 +0,0 @@ -/****************************************************************************** - * xenbus_comms.c - * - * Low level code to talks to Xen Store: ringbuffer and event channel. - * - * Copyright (C) 2005 Rusty Russell, IBM Corporation - * - * This file may be distributed separately from the Linux kernel, or - * incorporated into other software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ -//#define DEBUG - -#include <asm-xen/hypervisor.h> -#include <asm-xen/evtchn.h> -#include <linux/wait.h> -#include <linux/interrupt.h> -#include <linux/sched.h> -#include <linux/err.h> -#include "xenbus_comms.h" - -#define RINGBUF_DATASIZE ((PAGE_SIZE / 2) - sizeof(struct ringbuf_head)) -struct ringbuf_head -{ - u32 write; /* Next place to write to */ - u32 read; /* Next place to read from */ - u8 flags; - char buf[0]; -} __attribute__((packed)); - -DECLARE_WAIT_QUEUE_HEAD(xb_waitq); - -static irqreturn_t wake_waiting(int irq, void *unused, struct pt_regs *regs) -{ - wake_up(&xb_waitq); - return IRQ_HANDLED; -} - -static int check_buffer(const struct ringbuf_head *h) -{ - return (h->write < RINGBUF_DATASIZE && h->read < RINGBUF_DATASIZE); -} - -/* We can't fill last byte: would look like empty buffer. */ -static void *get_output_chunk(const struct ringbuf_head *h, - void *buf, u32 *len) -{ - u32 read_mark; - - if (h->read == 0) - read_mark = RINGBUF_DATASIZE - 1; - else - read_mark = h->read - 1; - - /* Here to the end of buffer, unless they haven't read some out. */ - *len = RINGBUF_DATASIZE - h->write; - if (read_mark >= h->write) - *len = read_mark - h->write; - return buf + h->write; -} - -static const void *get_input_chunk(const struct ringbuf_head *h, - const void *buf, u32 *len) -{ - /* Here to the end of buffer, unless they haven't written some. */ - *len = RINGBUF_DATASIZE - h->read; - if (h->write >= h->read) - *len = h->write - h->read; - return buf + h->read; -} - -static void update_output_chunk(struct ringbuf_head *h, u32 len) -{ - h->write += len; - if (h->write == RINGBUF_DATASIZE) - h->write = 0; -} - -static void update_input_chunk(struct ringbuf_head *h, u32 len) -{ - h->read += len; - if (h->read == RINGBUF_DATASIZE) - h->read = 0; -} - -static int output_avail(struct ringbuf_head *out) -{ - unsigned int avail; - - get_output_chunk(out, out->buf, &avail); - return avail != 0; -} - -int xb_write(struct ringbuf_head *out, const void *data, unsigned len) -{ - struct ringbuf_head h; - - do { - void *dst; - unsigned int avail; - - wait_event(xb_waitq, output_avail(out)); - - /* Read, then check: not that we don't trust store. - * Hell, some of my best friends are daemons. But, - * in this post-911 world... */ - h = *out; - mb(); - if (!check_buffer(&h)) { - set_current_state(TASK_RUNNING); - return -EIO; /* ETERRORIST! 
*/ - } - - dst = get_output_chunk(&h, out->buf, &avail); - if (avail > len) - avail = len; - memcpy(dst, data, avail); - data += avail; - len -= avail; - update_output_chunk(out, avail); - notify_via_evtchn(xen_start_info.store_evtchn); - } while (len != 0); - - return 0; -} - -int xs_input_avail(struct ringbuf_head *in) -{ - unsigned int avail; - - get_input_chunk(in, in->buf, &avail); - return avail != 0; -} - -int xb_read(struct ringbuf_head *in, void *data, unsigned len) -{ - struct ringbuf_head h; - int was_full; - - while (len != 0) { - unsigned int avail; - const char *src; - - wait_event(xb_waitq, xs_input_avail(in)); - h = *in; - mb(); - if (!check_buffer(&h)) { - set_current_state(TASK_RUNNING); - return -EIO; - } - - src = get_input_chunk(&h, in->buf, &avail); - if (avail > len) - avail = len; - was_full = !output_avail(&h); - - memcpy(data, src, avail); - data += avail; - len -= avail; - update_input_chunk(in, avail); - pr_debug("Finished read of %i bytes (%i to go)\n", avail, len); - /* If it was full, tell them we've taken some. */ - if (was_full) - notify_via_evtchn(xen_start_info.store_evtchn); - } - - /* If we left something, wake watch thread to deal with it. */ - if (xs_input_avail(in)) - wake_up(&xb_waitq); - - return 0; -} - -/* Set up interrpt handler off store event channel. */ -int xb_init_comms(void **in, void **out) -{ - int err, irq; - - irq = bind_evtchn_to_irq(xen_start_info.store_evtchn); - - err = request_irq(irq, wake_waiting, SA_SHIRQ, "xenbus", &xb_waitq); - if (err) { - printk(KERN_ERR "XENBUS request irq failed %i\n", err); - unbind_evtchn_from_irq(xen_start_info.store_evtchn); - return err; - } - - *out = (void *)xen_start_info.store_page; - *in = (void *)xen_start_info.store_page + PAGE_SIZE/2; - return 0; -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/xenbus/xenbus_comms.h --- a/linux-2.6.11-xen-sparse/drivers/xen/xenbus/xenbus_comms.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,14 +0,0 @@ -/* Private include for xenbus communications. */ -#ifndef _XENBUS_COMMS_H -#define _XENBUS_COMMS_H -int xs_init(void); -int xb_init_comms(void **in, void **out); - -/* Low level routines. */ -struct ringbuf_head; -int xb_write(struct ringbuf_head *out, const void *data, unsigned len); -int xb_read(struct ringbuf_head *in, void *data, unsigned len); -int xs_input_avail(struct ringbuf_head *in); -extern wait_queue_head_t xb_waitq; - -#endif /* _XENBUS_COMMS_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/xenbus/xenbus_probe.c --- a/linux-2.6.11-xen-sparse/drivers/xen/xenbus/xenbus_probe.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,858 +0,0 @@ -/****************************************************************************** - * Talks to Xen Store to figure out what devices we have. - * Currently experiment code, but when I grow up I'll be a bus driver! 
- * - * Copyright (C) 2005 Rusty Russell, IBM Corporation - * Copyright (C) 2005 Mike Wray, Hewlett-Packard - * - * This file may be distributed separately from the Linux kernel, or - * incorporated into other software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ -#include <asm-xen/hypervisor.h> -#include <asm-xen/xenbus.h> -#include <linux/kernel.h> -#include <linux/err.h> -#include <linux/string.h> -#include <linux/ctype.h> -#include <linux/fcntl.h> -#include <stdarg.h> -#include "xenbus_comms.h" - -/* Directory inside a domain containing devices. */ -#define XENBUS_DEVICE_DIR "device" - -/* Directory inside a domain containing backends. */ -#define XENBUS_BACKEND_DIR "backend" - -/* Name of field containing device id. */ -#define XENBUS_DEVICE_ID "id" - -/* Name of field containing device type. */ -#define XENBUS_DEVICE_TYPE "type" - -//#define DEBUG - -#ifdef DEBUG -#define dprintf(_fmt, _args...) \ -printk(KERN_INFO __stringify(KBUILD_MODNAME) " [DBG] %s" _fmt, __FUNCTION__, ##_args) -#else -#define dprintf(_fmt, _args...) do { } while(0) -#endif - -static int xs_init_done = 0; - -/* Return the path to dir with /name appended. - * If name is null or empty returns a copy of dir. - */ -char *xenbus_path(const char *dir, const char *name) -{ - char *ret; - int len; - - len = strlen(dir) + 1; - if (name) - len += strlen(name) + 1; - ret = kmalloc(len, GFP_KERNEL); - if (ret == NULL) - return NULL; - strcpy(ret, dir); - if (name) { - strcat(ret, "/"); - strcat(ret, name); - } - return ret; -} - -#define streq(a, b) (strcmp((a), (b)) == 0) - -char *xenbus_read(const char *dir, const char *name, unsigned int *data_n) -{ - int err = 0; - char *data = NULL; - char *path = xenbus_path(dir, name); - int n = 0; - - if (!path) { - err = -ENOMEM; - goto out; - } - data = xs_read(path, &n); - if (IS_ERR(data)) { - err = PTR_ERR(data); - if (err == -EISDIR) - err = -ENOENT; - } else if (n == 0) { - err = -ENOENT; - kfree(data); - } - kfree(path); - out: - if (data_n) - *data_n = n; - return (err ? 
ERR_PTR(err) : data); -} - -int xenbus_write(const char *dir, const char *name, const char *data, int data_n) -{ - int err = 0; - char *path = xenbus_path(dir, name); - - if (!path) - return -ENOMEM; - err = xs_write(path, data, data_n, O_CREAT); - kfree(path); - return err; -} - -int xenbus_read_string(const char *dir, const char *name, char **val) -{ - int err = 0; - - *val = xenbus_read(dir, name, NULL); - if (IS_ERR(*val)) { - err = PTR_ERR(*val); - *val = NULL; - } - return err; -} - -int xenbus_write_string(const char *dir, const char *name, const char *val) -{ - return xenbus_write(dir, name, val, strlen(val)); -} - -int xenbus_read_ulong(const char *dir, const char *name, unsigned long *val) -{ - int err = 0; - char *data = NULL, *end = NULL; - unsigned int data_n = 0; - - data = xenbus_read(dir, name, &data_n); - if (IS_ERR(data)) { - err = PTR_ERR(data); - goto out; - } - if (data_n <= 1) { - err = -ENOENT; - goto free_data; - } - *val = simple_strtoul(data, &end, 10); - if (end != data + data_n) { - printk("XENBUS: Path %s/%s, bad parse of '%s' as ulong\n", - dir, name, data); - err = -EINVAL; - } - free_data: - kfree(data); - out: - if (err) - *val = 0; - return err; -} - -int xenbus_write_ulong(const char *dir, const char *name, unsigned long val) -{ - char data[32] = {}; - - snprintf(data, sizeof(data), "%lu", val); - return xenbus_write(dir, name, data, strlen(data)); -} - -int xenbus_read_long(const char *dir, const char *name, long *val) -{ - int err = 0; - char *data = NULL, *end = NULL; - unsigned int data_n = 0; - - data = xenbus_read(dir, name, &data_n); - if (IS_ERR(data)) { - err = PTR_ERR(data); - goto out; - } - if (data_n <= 1) { - err = -ENOENT; - goto free_data; - } - *val = simple_strtol(data, &end, 10); - if (end != data + data_n) { - printk("XENBUS: Path %s/%s, bad parse of '%s' as long\n", - dir, name, data); - err = -EINVAL; - } - free_data: - kfree(data); - out: - if (err) - *val = 0; - return err; -} - -int xenbus_write_long(const char *dir, const char *name, long val) -{ - char data[32] = {}; - - snprintf(data, sizeof(data), "%li", val); - return xenbus_write(dir, name, data, strlen(data)); -} - -/* Number of characters in string form of a MAC address. */ -#define MAC_LENGTH 17 - -/** Convert a mac address from a string of the form - * XX:XX:XX:XX:XX:XX to numerical form (an array of 6 unsigned chars). - * Each X denotes a hex digit: 0..9, a..f, A..F. - * Also supports using '-' as the separator instead of ':'. 
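 *
 * (Worked example, added for illustration: mac_aton("00:16:3e:2a:b3:4f", 17, mac)
 * returns 0 and fills mac[] with { 0x00, 0x16, 0x3e, 0x2a, 0xb3, 0x4f };
 * a string of any other length, or one mixing ':' and '-' separators,
 * returns -EINVAL.)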
- */
-static int mac_aton(const char *macstr, unsigned int n, unsigned char mac[6])
-{
-    int err = -EINVAL;
-    int i, j;
-    const char *p;
-    char sep = 0;
-
-    if (!macstr || n != MAC_LENGTH)
-        goto exit;
-    for (i = 0, p = macstr; i < 6; i++) {
-        unsigned char d = 0;
-        if (i) {
-            if (!sep && (*p == ':' || *p == '-'))
-                sep = *p;
-            if (sep && *p == sep)
-                p++;
-            else
-                goto exit;
-        }
-        for (j = 0; j < 2; j++, p++) {
-            if (j)
-                d <<= 4;
-            if (isdigit(*p))
-                d += *p - '0';
-            else if (isxdigit(*p))
-                d += toupper(*p) - 'A' + 10;
-            else
-                goto exit;
-        }
-        mac[i] = d;
-    }
-    err = 0;
- exit:
-    return err;
-}
-
-int xenbus_read_mac(const char *dir, const char *name, unsigned char mac[6])
-{
-    int err = 0;
-    char *data = 0;
-    unsigned int data_n = 0;
-
-    data = xenbus_read(dir, name, &data_n);
-    if (IS_ERR(data)) {
-        err = PTR_ERR(data);
-        goto out;
-    }
-    if (data_n <= 1) {
-        err = -ENOENT;
-        goto free_data;
-    }
-    err = mac_aton(data, data_n, mac);
-    if (err) {
-        printk("XENBUS: Path %s/%s, bad parse of '%s' as mac\n",
-               dir, name, data);
-        err = -EINVAL;
-    }
- free_data:
-    kfree(data);
- out:
-    if (err)
-        memset(mac, 0, 6);
-    return err;
-}
-
-int xenbus_write_mac(const char *dir, const char *name, const unsigned char mac[6])
-{
-    char buf[MAC_LENGTH] = {};
-    int buf_n = sizeof(buf);
-
-    snprintf(buf, buf_n, "%02x:%02x:%02x:%02x:%02x:%02x",
-             mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
-    return xenbus_write(dir, name, buf, buf_n);
-}
-
-/* Read event channel information from xenstore.
- *
- * Event channel xenstore fields:
- *  dom1  - backend domain id (int)
- *  port1 - backend port (int)
- *  dom2  - frontend domain id (int)
- *  port2 - frontend port (int)
- */
-int xenbus_read_evtchn(const char *dir, const char *name, struct xenbus_evtchn *evtchn)
-{
-    int err = 0;
-    char *evtchn_path = xenbus_path(dir, name);
-
-    if (!evtchn_path) {
-        err = -ENOMEM;
-        goto out;
-    }
-    err = xenbus_read_ulong(evtchn_path, "dom1", &evtchn->dom1);
-    if (err)
-        goto free_evtchn_path;
-    err = xenbus_read_ulong(evtchn_path, "port1", &evtchn->port1);
-    if (err)
-        goto free_evtchn_path;
-    err = xenbus_read_ulong(evtchn_path, "dom2", &evtchn->dom2);
-    if (err)
-        goto free_evtchn_path;
-    err = xenbus_read_ulong(evtchn_path, "port2", &evtchn->port2);
-
- free_evtchn_path:
-    kfree(evtchn_path);
- out:
-    if (err)
-        *evtchn = (struct xenbus_evtchn){};
-    return err;
-}
-
-/* Write a message to 'dir'.
- * The data is 'val' followed by parameter names and values,
- * terminated by NULL.
- */
-int xenbus_message(const char *dir, const char *val, ...)
-{
-    static const char *mid_name = "@mid";
-    va_list args;
-    int err = 0;
-    char *mid_path = NULL;
-    char *msg_path = NULL;
-    char mid_str[32] = {};
-    long mid = 0;
-    int i;
-
-    va_start(args, val);
-    mid_path = xenbus_path(dir, mid_name);
-    if (!mid_path) {
-        err = -ENOMEM;
-        goto out;
-    }
-    err = xenbus_read_long(dir, mid_name, &mid);
-    if (err && err != -ENOENT)
-        goto out;
-    mid++;
-    err = xenbus_write_long(dir, mid_name, mid);
-    if (err)
-        goto out;
-    sprintf(mid_str, "%li", mid);
-    msg_path = xenbus_path(dir, mid_str);
-    if (!msg_path) {
-        err = -ENOMEM;
-        goto out;
-    }
-
-    for (i = 0; i < 16; i++) {
-        char *k, *v;
-        k = va_arg(args, char *);
-        if (!k)
-            break;
-        v = va_arg(args, char *);
-        if (!v)
-            break;
-        err = xenbus_write_string(msg_path, k, v);
-        if (err)
-            goto out;
-    }
-    err = xenbus_write_string(msg_path, NULL, val);
-
- out:
-    kfree(msg_path);
-    kfree(mid_path);
-    va_end(args);
-    return err;
-}
-
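
The helpers above give the rest of the kernel a small typed veneer over xenstore. A hypothetical caller might look like this (the path and values are invented for illustration, and the snippet compiles only in the context of the declarations above; error handling is trimmed to the essentials):

    /* Hypothetical example: publish a frontend's MAC and read back an id. */
    static int example_publish(void)
    {
        unsigned char mac[6] = { 0x00, 0x16, 0x3e, 0x2a, 0xb3, 0x4f };
        unsigned long id;
        int err;

        err = xenbus_write_mac("device/vif/0", "mac", mac);
        if (err)
            return err;
        err = xenbus_read_ulong("device/vif/0", "id", &id);
        if (err)
            return err;
        return xenbus_message("device/vif/0", "connect",
                              "state", "ready", NULL); /* NULL-terminated pairs */
    }
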
-/* If something in array of ids matches this device, return it. */
-static const struct xenbus_device_id *
-match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev)
-{
-    for (; !streq(arr->devicetype, ""); arr++) {
-        if (!streq(arr->devicetype, dev->devicetype))
-            continue;
-
-        if (streq(arr->subtype, "") ||
-            streq(arr->subtype, dev->subtype)) {
-            return arr;
-        }
-    }
-    return NULL;
-}
-
-static int xenbus_match(struct device *_dev, struct device_driver *_drv)
-{
-    struct xenbus_driver *drv = to_xenbus_driver(_drv);
-
-    if (!drv->ids)
-        return 0;
-
-    return match_device(drv->ids, to_xenbus_device(_dev)) != NULL;
-}
-
-/* Bus type for frontend drivers. */
-static struct bus_type xenbus_type = {
-    .name  = "xenbus",
-    .match = xenbus_match,
-};
-
-
-/* Bus type for backend drivers. */
-static struct bus_type xenback_type = {
-    .name  = "xenback",
-    .match = xenbus_match,
-};
-
-struct xenbus_for_dev {
-    int (*fn)(struct xenbus_device *, void *);
-    void *data;
-};
-
-static int for_dev(struct device *_dev, void *_data)
-{
-    struct xenbus_device *dev = to_xenbus_device(_dev);
-    struct xenbus_for_dev *data = _data;
-    return data->fn(dev, data->data);
-}
-
-int xenbus_for_each_dev(struct xenbus_device * start, void * data,
-                        int (*fn)(struct xenbus_device *, void *))
-{
-    struct xenbus_for_dev for_data = {
-        .fn = fn,
-        .data = data,
-    };
-    if (!fn)
-        return -EINVAL;
-    printk("%s> data=%p fn=%p for_data=%p\n", __FUNCTION__,
-           data, fn, &for_data);
-    return bus_for_each_dev(&xenbus_type,
-                            (start ? &start->dev : NULL),
-                            &for_data, for_dev);
-}
-
-struct xenbus_for_drv {
-    int (*fn)(struct xenbus_driver *, void *);
-    void *data;
-};
-
-static int for_drv(struct device_driver *_drv, void *_data)
-{
-    struct xenbus_driver *drv = to_xenbus_driver(_drv);
-    struct xenbus_for_drv *data = _data;
-    return data->fn(drv, data->data);
-}
-
-int xenbus_for_each_drv(struct xenbus_driver * start, void * data,
-                        int (*fn)(struct xenbus_driver *, void *))
-{
-    struct xenbus_for_drv for_data = {
-        .fn = fn,
-        .data = data,
-    };
-    if (!fn)
-        return -EINVAL;
-    return bus_for_each_drv(&xenbus_type,
-                            (start ?
&start->driver: NULL), - &for_data, for_drv); -} - -static int xenbus_dev_probe(struct device *_dev) -{ - struct xenbus_device *dev = to_xenbus_device(_dev); - struct xenbus_driver *drv = to_xenbus_driver(_dev->driver); - const struct xenbus_device_id *id; - - if (!drv->probe) - return -ENODEV; - - id = match_device(drv->ids, dev); - if (!id) - return -ENODEV; - return drv->probe(dev, id); -} - -static int xenbus_dev_remove(struct device *_dev) -{ - struct xenbus_device *dev = to_xenbus_device(_dev); - struct xenbus_driver *drv = to_xenbus_driver(_dev->driver); - - if (!drv->remove) - return 0; - return drv->remove(dev); -} - -int xenbus_register_driver(struct xenbus_driver *drv) -{ - int err = 0; - - printk("%s> frontend driver %p %s\n", __FUNCTION__, - drv, drv->name); - drv->driver.name = drv->name; - drv->driver.bus = &xenbus_type; - drv->driver.owner = drv->owner; - drv->driver.probe = xenbus_dev_probe; - drv->driver.remove = xenbus_dev_remove; - - err = driver_register(&drv->driver); - if (err == 0 && xs_init_done && drv->connect) { - printk("%s> connecting driver %p %s\n", __FUNCTION__, - drv, drv->name); - drv->connect(drv); - } - return err; -} - -void xenbus_unregister_driver(struct xenbus_driver *drv) -{ - driver_unregister(&drv->driver); -} - -static int xenbus_probe_device(const char *dir, const char *name) -{ - int err; - struct xenbus_device *xendev; - unsigned int xendev_n; - long id; - char *nodename, *devicetype; - unsigned int devicetype_n; - - dprintf("> dir=%s name=%s\n", dir, name); - nodename = xenbus_path(dir, name); - if (!nodename) - return -ENOMEM; - - devicetype = xenbus_read(nodename, XENBUS_DEVICE_TYPE, &devicetype_n); - if (IS_ERR(devicetype)) { - err = PTR_ERR(devicetype); - goto free_nodename; - } - - err = xenbus_read_long(nodename, XENBUS_DEVICE_ID, &id); - if (err == -ENOENT) - id = 0; - else if (err != 0) - goto free_devicetype; - - dprintf("> devicetype='%s' name='%s' id=%ld\n", devicetype, name, id); - /* FIXME: This could be a rescan. Don't re-register existing devices. */ - - /* Add space for the strings. */ - xendev_n = sizeof(*xendev) + strlen(nodename) + strlen(devicetype) + 2; - xendev = kmalloc(xendev_n, GFP_KERNEL); - if (!xendev) { - err = -ENOMEM; - goto free_devicetype; - } - memset(xendev, 0, xendev_n); - - snprintf(xendev->dev.bus_id, BUS_ID_SIZE, "%s-%s", devicetype, name); - xendev->dev.bus = &xenbus_type; - - xendev->id = id; - - /* Copy the strings into the extra space. */ - xendev->nodename = (char *)(xendev + 1); - strcpy(xendev->nodename, nodename); - xendev->devicetype = xendev->nodename + strlen(xendev->nodename) + 1; - strcpy(xendev->devicetype, devicetype); - - /* Register with generic device framework. 
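- * (Editorial note, not in the original: the device structure and both
- * strings share a single kmalloc'ed block, laid out as
- *
- *     [struct xenbus_device][nodename\0][devicetype\0]
- *
- * which is why xendev_n adds both string lengths plus two NUL bytes; a
- * single kfree(xendev) then releases everything.)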
*/ - printk("XENBUS: Registering device %s\n", xendev->dev.bus_id); - err = device_register(&xendev->dev); - if (err) { - printk("XENBUS: Registering device %s: error %i\n", - xendev->dev.bus_id, err); - kfree(xendev); - } - -free_devicetype: - kfree(devicetype); -free_nodename: - kfree(nodename); - dprintf("< err=%i\n", err); - return err; -} - -static int xenbus_probe_device_type(const char *dirpath, const char *typename) -{ - int err = 0; - char **dir; - char *path; - unsigned int dir_n = 0; - int i; - - dprintf("> dirpath=%s typename=%s\n", dirpath, typename); - path = xenbus_path(dirpath, typename); - if (!path) - return -ENOMEM; - - dir = xs_directory(path, &dir_n); - if (IS_ERR(dir)) { - err = PTR_ERR(dir); - goto out; - } - - for (i = 0; i < dir_n; i++) { - err = xenbus_probe_device(path, dir[i]); - if (err) - break; - } - kfree(dir); -out: - kfree(path); - dprintf("< err=%i\n", err); - return err; -} - -static int xenbus_probe_devices(const char *path) -{ - int err = 0; - char **dir; - unsigned int i, dir_n; - - dprintf("> path=%s\n", path); - down(&xs_lock); - dir = xs_directory(path, &dir_n); - if (IS_ERR(dir)) { - err = PTR_ERR(dir); - goto unlock; - } - for (i = 0; i < dir_n; i++) { - err = xenbus_probe_device_type(path, dir[i]); - if (err) - break; - } - kfree(dir); -unlock: - up(&xs_lock); - dprintf("< err=%i\n", err); - return err; -} - - -static int xenbus_probe_backend(const char *dir, const char *name) -{ - int err = 0; - struct xenbus_device *xendev = NULL; - unsigned int xendev_n = 0; - char *nodename = NULL, *devicetype = NULL; - unsigned int devicetype_n = 0; - - dprintf("> dir=%s name=%s\n", dir, name); - nodename = xenbus_path(dir, name); - if (!nodename) - return -ENOMEM; - - devicetype = xenbus_read(nodename, XENBUS_DEVICE_TYPE, &devicetype_n); - if (IS_ERR(devicetype)) { - err = PTR_ERR(devicetype); - goto free_nodename; - } - - dprintf("> devicetype='%s'\n", devicetype); - /* FIXME: This could be a rescan. Don't re-register existing devices. */ - - /* Add space for the strings. */ - xendev_n = sizeof(*xendev) + strlen(nodename) + strlen(devicetype) + 2; - xendev = kmalloc(xendev_n, GFP_KERNEL); - if (!xendev) { - err = -ENOMEM; - goto free_devicetype; - } - memset(xendev, 0, xendev_n); - - snprintf(xendev->dev.bus_id, BUS_ID_SIZE, "%s", devicetype); - xendev->dev.bus = &xenback_type; - - /* Copy the strings into the extra space. */ - xendev->nodename = (char *)(xendev + 1); - strcpy(xendev->nodename, nodename); - xendev->devicetype = xendev->nodename + strlen(xendev->nodename) + 1; - strcpy(xendev->devicetype, devicetype); - - /* Register with generic device framework. 
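- * (Editorial note: the frontend path above builds a bus_id of the form
- * "<devicetype>-<name>", e.g. "vif-0", while this backend path uses the
- * devicetype alone, so only one backend device per type can be
- * registered here. The "vif-0" example is illustrative.)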
*/ - printk("XENBUS: Registering backend %s\n", xendev->dev.bus_id); - err = device_register(&xendev->dev); - if (err) { - printk("XENBUS: Registering device %s: error %i\n", - xendev->dev.bus_id, err); - kfree(xendev); - } - -free_devicetype: - kfree(devicetype); -free_nodename: - kfree(nodename); - dprintf("< err=%i\n", err); - return err; -} - -static int xenbus_probe_backends(const char *path) -{ - int err = 0; - char **dir; - unsigned int i, dir_n; - - dprintf("> path=%s\n", path); - down(&xs_lock); - dir = xs_directory(path, &dir_n); - if (IS_ERR(dir)) { - err = PTR_ERR(dir); - goto unlock; - } - for (i = 0; i < dir_n; i++) { - err = xenbus_probe_backend(path, dir[i]); - if (err) - break; - } - kfree(dir); -unlock: - up(&xs_lock); - dprintf("< err=%i\n", err); - return err; -} - -int xenbus_register_backend(struct xenbus_driver *drv) -{ - int err = 0; - - printk("%s> backend driver %p %s\n", __FUNCTION__, - drv, drv->name); - drv->driver.name = drv->name; - drv->driver.bus = &xenback_type; - drv->driver.owner = drv->owner; - drv->driver.probe = xenbus_dev_probe; - drv->driver.remove = xenbus_dev_remove; - - err = driver_register(&drv->driver); - if (err == 0 && xs_init_done && drv->connect) { - printk("%s> connecting driver %p %s\n", __FUNCTION__, - drv, drv->name); - drv->connect(drv); - } - return err; -} - -void xenbus_unregister_backend(struct xenbus_driver *drv) -{ - driver_unregister(&drv->driver); -} - -int xenbus_for_each_backend(struct xenbus_driver * start, void * data, - int (*fn)(struct xenbus_driver *, void *)) -{ - struct xenbus_for_drv for_data = { - .fn = fn, - .data = data, - }; - if (!fn) - return -EINVAL; - return bus_for_each_drv(&xenback_type, - (start ? &start->driver: NULL), - &for_data, for_drv); -} - -static void test_callback(struct xenbus_watch *w, const char *node) -{ - printk("test_callback: got watch hit for %s\n", node); -} - -static void test_watch(void) -{ - static int init_done = 0; - static struct xenbus_watch watch = { .node = "/", - .priority = 0, - .callback = test_callback }; - - if (init_done) - return; - printk("registering watch %lX = %i\n", - (long)&watch, - register_xenbus_watch(&watch)); - init_done = 1; -} - -static int xenbus_driver_connect(struct xenbus_driver *drv, void *data) -{ - printk("%s> driver %p %s\n", __FUNCTION__, drv, drv->name); - if (drv->connect) { - printk("%s> connecting driver %p %s\n", __FUNCTION__, - drv, drv->name); - drv->connect(drv); - } - printk("%s< driver %p %s\n", __FUNCTION__, drv, drv->name); - return 0; -} - -int do_xenbus_connect(void *unused) -{ - int err = 0; - - printk("%s> xs_init_done=%d\n", __FUNCTION__, xs_init_done); - if (xs_init_done) - goto exit; - /* Initialize xenstore comms unless already done. */ - printk("store_evtchn = %i\n", xen_start_info.store_evtchn); - err = xs_init(); - if (err) { - printk("XENBUS: Error initializing xenstore comms:" - " %i\n", err); - goto exit; - } - xs_init_done = 1; - - /* Notify drivers that xenstore has connected. */ - test_watch(); - printk("%s> connect drivers...\n", __FUNCTION__); - xenbus_for_each_drv(NULL, NULL, xenbus_driver_connect); - printk("%s> connect backends...\n", __FUNCTION__); - xenbus_for_each_backend(NULL, NULL, xenbus_driver_connect); - - /* Enumerate devices and backends in xenstore. 
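- * (Editorial illustration: per the walks below, frontends live two
- * levels deep, <XENBUS_DEVICE_DIR>/<devicetype>/<name>, while backends
- * sit one level deep, <XENBUS_BACKEND_DIR>/<name>, with the devicetype
- * read from the node itself; concrete roots such as "device" and
- * "backend" are assumptions, not taken from this file.)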
- */
-    xenbus_probe_devices(XENBUS_DEVICE_DIR);
-    xenbus_probe_backends(XENBUS_BACKEND_DIR);
-
-exit:
-    printk("%s< err=%d\n", __FUNCTION__, err);
-    return err;
-}
-
-static int __init xenbus_probe_init(void)
-{
-    bus_register(&xenbus_type);
-    bus_register(&xenback_type);
-
-    if (!xen_start_info.store_evtchn)
-        return 0;
-
-    do_xenbus_connect(NULL);
-    return 0;
-}
-
-postcore_initcall(xenbus_probe_init);
diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/drivers/xen/xenbus/xenbus_xs.c
--- a/linux-2.6.11-xen-sparse/drivers/xen/xenbus/xenbus_xs.c    Fri Jul 15 19:57:12 2005
+++ /dev/null   Sat Jul 16 14:02:54 2005
@@ -1,488 +0,0 @@
-/******************************************************************************
- * xenbus_xs.c
- *
- * This is the kernel equivalent of the "xs" library. We don't need everything
- * and we use xenbus_comms for communication.
- *
- * Copyright (C) 2005 Rusty Russell, IBM Corporation
- *
- * This file may be distributed separately from the Linux kernel, or
- * incorporated into other software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include <linux/errno.h>
-#include <linux/types.h>
-#include "xenstore/xenstored.h"
-#include <linux/uio.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-#include <linux/fcntl.h>
-#include <linux/kthread.h>
-#include <asm-xen/xenbus.h>
-#include "xenbus_comms.h"
-
-#define streq(a, b) (strcmp((a), (b)) == 0)
-
-static void *xs_in, *xs_out;
-static LIST_HEAD(watches);
-static DECLARE_MUTEX(watches_lock);
-DECLARE_MUTEX(xs_lock);
-
-static int get_error(const char *errorstring)
-{
-    unsigned int i;
-
-    for (i = 0; !streq(errorstring, xsd_errors[i].errstring); i++) {
-        if (i == ARRAY_SIZE(xsd_errors) - 1) {
-            printk(KERN_WARNING
-                   "XENBUS xen store gave: unknown error %s",
-                   errorstring);
-            return EINVAL;
-        }
-    }
-    return xsd_errors[i].errnum;
-}
-
-static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len)
-{
-    struct xsd_sockmsg msg;
-    void *ret;
-    int err;
-
-    err = xb_read(xs_in, &msg, sizeof(msg));
-    if (err)
-        return ERR_PTR(err);
-
-    ret = kmalloc(msg.len + 1, GFP_KERNEL);
-    if (!ret)
-        return ERR_PTR(-ENOMEM);
-
-    err = xb_read(xs_in, ret, msg.len);
-    if (err) {
-        kfree(ret);
-        return ERR_PTR(err);
-    }
-    ((char*)ret)[msg.len] = '\0';
-
-    *type = msg.type;
-    if (len)
-        *len = msg.len;
-    return ret;
-}
-
-/* Send message to xs, get kmalloc'ed reply. ERR_PTR() on error.
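- * (Editorial sketch of the framing used below: each request is a
- * struct xsd_sockmsg header carrying { type, len } followed by len
- * bytes of payload, e.g. for a write roughly
- *
- *     | xsd_sockmsg | path\0 | flags\0 | data |
- *
- * and any XS_WATCH_EVENT that arrives before the real reply is
- * discarded, since the daemon re-transmits watch events.)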
*/ -static void *xs_talkv(enum xsd_sockmsg_type type, - const struct kvec *iovec, - unsigned int num_vecs, - unsigned int *len) -{ - struct xsd_sockmsg msg; - void *ret = NULL; - unsigned int i; - int err; - - WARN_ON(down_trylock(&xs_lock) == 0); - - msg.type = type; - msg.len = 0; - for (i = 0; i < num_vecs; i++) - msg.len += iovec[i].iov_len; - - err = xb_write(xs_out, &msg, sizeof(msg)); - if (err) - return ERR_PTR(err); - - for (i = 0; i < num_vecs; i++) { - err = xb_write(xs_out, iovec[i].iov_base, iovec[i].iov_len);; - if (err) - return ERR_PTR(err); - } - - /* Watches can have fired before reply comes: daemon detects - * and re-transmits, so we can ignore this. */ - do { - kfree(ret); - ret = read_reply(&msg.type, len); - if (IS_ERR(ret)) - return ret; - } while (msg.type == XS_WATCH_EVENT); - - if (msg.type == XS_ERROR) { - err = get_error(ret); - kfree(ret); - return ERR_PTR(-err); - } - - BUG_ON(msg.type != type); - return ret; -} - -/* Simplified version of xs_talkv: single message. */ -static void *xs_single(enum xsd_sockmsg_type type, - const char *string, unsigned int *len) -{ - struct kvec iovec; - - iovec.iov_base = (void *)string; - iovec.iov_len = strlen(string) + 1; - return xs_talkv(type, &iovec, 1, len); -} - -/* Many commands only need an ack, don't care what it says. */ -static int xs_error(char *reply) -{ - if (IS_ERR(reply)) - return PTR_ERR(reply); - kfree(reply); - return 0; -} - -static unsigned int count_strings(const char *strings, unsigned int len) -{ - unsigned int num; - const char *p; - - for (p = strings, num = 0; p < strings + len; p += strlen(p) + 1) - num++; - - return num; -} - -char **xs_directory(const char *path, unsigned int *num) -{ - char *strings, *p, **ret; - unsigned int len; - - strings = xs_single(XS_DIRECTORY, path, &len); - if (IS_ERR(strings)) - return (char **)strings; - - /* Count the strings. */ - *num = count_strings(strings, len); - - /* Transfer to one big alloc for easy freeing. */ - ret = kmalloc(*num * sizeof(char *) + len, GFP_ATOMIC); - if (!ret) { - kfree(strings); - return ERR_PTR(-ENOMEM); - } - memcpy(&ret[*num], strings, len); - kfree(strings); - - strings = (char *)&ret[*num]; - for (p = strings, *num = 0; p < strings + len; p += strlen(p) + 1) - ret[(*num)++] = p; - return ret; -} - -/* Check if a path exists. Return 1 if it does. */ -int xs_exists(const char *path) -{ - char **dir; - int dir_n; - - dir = xs_directory(path, &dir_n); - if (IS_ERR(dir)) - return 0; - kfree(dir); - return 1; -} - -/* Make a directory, creating dirs on the path to it if necessary. - * Return 0 on success, error code otherwise. - */ -int xs_mkdirs(const char *path) -{ - int err = 0; - char s[strlen(path) + 1], *p = s; - - if (xs_exists(path)) - goto out; - strcpy(p, path); - if (*p == '/') - p++; - for (;;) { - p = strchr(p, '/'); - if (p) - *p = '\0'; - if (!xs_exists(s)) { - err = xs_mkdir(s); - if (err) - goto out; - } - if (!p) - break; - *p++ = '/'; - } - out: - return err; -} - - -/* Get the value of a single file. - * Returns a kmalloced value: call free() on it after use. - * len indicates length in bytes. - */ -void *xs_read(const char *path, unsigned int *len) -{ - return xs_single(XS_READ, path, len); -} - -/* Write the value of a single file. - * Returns -err on failure. createflags can be 0, O_CREAT, or O_CREAT|O_EXCL. - */ -int xs_write(const char *path, - const void *data, unsigned int len, int createflags) -{ - const char *flags; - struct kvec iovec[3]; - - /* Format: Flags (as string), path, data. 
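- * (The iovec below is actually assembled as path, then flags, then
- * data. An illustrative call, with a made-up path:
- *
- *     err = xs_write("/example/node", "value", 5, O_CREAT);
- *
- * which maps createflags O_CREAT to the XS_WRITE_CREATE flag string.)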
*/ - if (createflags == 0) - flags = XS_WRITE_NONE; - else if (createflags == O_CREAT) - flags = XS_WRITE_CREATE; - else if (createflags == (O_CREAT|O_EXCL)) - flags = XS_WRITE_CREATE_EXCL; - else - return -EINVAL; - - iovec[0].iov_base = (void *)path; - iovec[0].iov_len = strlen(path) + 1; - iovec[1].iov_base = (void *)flags; - iovec[1].iov_len = strlen(flags) + 1; - iovec[2].iov_base = (void *)data; - iovec[2].iov_len = len; - - return xs_error(xs_talkv(XS_WRITE, iovec, ARRAY_SIZE(iovec), NULL)); -} - -/* Create a new directory. */ -int xs_mkdir(const char *path) -{ - return xs_error(xs_single(XS_MKDIR, path, NULL)); -} - -/* Destroy a file or directory (directories must be empty). */ -int xs_rm(const char *path) -{ - return xs_error(xs_single(XS_RM, path, NULL)); -} - -/* Start a transaction: changes by others will not be seen during this - * transaction, and changes will not be visible to others until end. - * Transaction only applies to the given subtree. - * You can only have one transaction at any time. - */ -int xs_transaction_start(const char *subtree) -{ - return xs_error(xs_single(XS_TRANSACTION_START, subtree, NULL)); -} - -/* End a transaction. - * If abandon is true, transaction is discarded instead of committed. - */ -int xs_transaction_end(int abort) -{ - char abortstr[2]; - - if (abort) - strcpy(abortstr, "F"); - else - strcpy(abortstr, "T"); - return xs_error(xs_single(XS_TRANSACTION_END, abortstr, NULL)); -} - -char *xs_get_domain_path(domid_t domid) -{ - char domid_str[32]; - - sprintf(domid_str, "%u", domid); - return xs_single(XS_GETDOMAINPATH, domid_str, NULL); -} - -static int xs_watch(const char *path, const char *token, unsigned int priority) -{ - char prio[32]; - struct kvec iov[3]; - - sprintf(prio, "%u", priority); - iov[0].iov_base = (void *)path; - iov[0].iov_len = strlen(path) + 1; - iov[1].iov_base = (void *)token; - iov[1].iov_len = strlen(token) + 1; - iov[2].iov_base = prio; - iov[2].iov_len = strlen(prio) + 1; - - return xs_error(xs_talkv(XS_WATCH, iov, ARRAY_SIZE(iov), NULL)); -} - -static char *xs_read_watch(char **token) -{ - enum xsd_sockmsg_type type; - char *ret; - - ret = read_reply(&type, NULL); - if (IS_ERR(ret)) - return ret; - - BUG_ON(type != XS_WATCH_EVENT); - *token = ret + strlen(ret) + 1; - return ret; -} - -static int xs_acknowledge_watch(const char *token) -{ - return xs_error(xs_single(XS_WATCH_ACK, token, NULL)); -} - -static int xs_unwatch(const char *path, const char *token) -{ - struct kvec iov[2]; - - iov[0].iov_base = (char *)path; - iov[0].iov_len = strlen(path) + 1; - iov[1].iov_base = (char *)token; - iov[1].iov_len = strlen(token) + 1; - - return xs_error(xs_talkv(XS_UNWATCH, iov, ARRAY_SIZE(iov), NULL)); -} - -/* A little paranoia: we don't just trust token. */ -static struct xenbus_watch *find_watch(const char *token) -{ - struct xenbus_watch *i, *cmp; - - cmp = (void *)simple_strtoul(token, NULL, 16); - - list_for_each_entry(i, &watches, list) - if (i == cmp) - return i; - return NULL; -} - -/* Register callback to watch this node. */ -int register_xenbus_watch(struct xenbus_watch *watch) -{ - /* Pointer in ascii is the token. 
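- * (Editorial example with a made-up address: a watch at 0xc12345f0 is
- * registered under the token "C12345F0"; find_watch() above parses the
- * token back with simple_strtoul() and only trusts it if it still names
- * a watch on the list.)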
*/ - char token[sizeof(watch) * 2 + 1]; - int err; - - sprintf(token, "%lX", (long)watch); - down(&watches_lock); - BUG_ON(find_watch(token)); - - down(&xs_lock); - err = xs_watch(watch->node, token, watch->priority); - up(&xs_lock); - if (!err) - list_add(&watch->list, &watches); - up(&watches_lock); - return err; -} - -void unregister_xenbus_watch(struct xenbus_watch *watch) -{ - char token[sizeof(watch) * 2 + 1]; - int err; - - sprintf(token, "%lX", (long)watch); - down(&watches_lock); - BUG_ON(!find_watch(token)); - - down(&xs_lock); - err = xs_unwatch(watch->node, token); - up(&xs_lock); - list_del(&watch->list); - up(&watches_lock); - - if (err) - printk(KERN_WARNING "XENBUS Failed to release watch %s: %i\n", - watch->node, err); -} - -static int watch_thread(void *unused) -{ - int err; - unsigned long mtu; - - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ*10); - printk("watch_thread, doing read\n"); - down(&xs_lock); - err = xenbus_read_long("", "mtu", &mtu); - up(&xs_lock); - printk("fake field read: %i (%lu)\n", err, mtu); - - for (;;) { - char *token; - char *node = NULL; - - wait_event(xb_waitq, xs_input_avail(xs_in)); - - /* If this is a spurious wakeup caused by someone - * doing an op, they'll hold the lock and the buffer - * will be empty by the time we get there. - */ - down(&xs_lock); - if (xs_input_avail(xs_in)) - node = xs_read_watch(&token); - /* Release lock before calling callback. */ - up(&xs_lock); - if (node && !IS_ERR(node)) { - struct xenbus_watch *w; - int err; - - down(&watches_lock); - w = find_watch(token); - BUG_ON(!w); - w->callback(w, node); - up(&watches_lock); - down(&xs_lock); - err = xs_acknowledge_watch(token); - if (err) - printk(KERN_WARNING - "XENBUS acknowledge %s failed %i\n", - node, err); - up(&xs_lock); - kfree(node); - } else - printk(KERN_WARNING "XENBUS xs_read_watch: %li\n", - PTR_ERR(node)); - } -} - -int xs_init(void) -{ - int err; - struct task_struct *watcher; - - err = xb_init_comms(&xs_in, &xs_out); - if (err) - return err; - - watcher = kthread_run(watch_thread, NULL, "kxbwatch"); - if (IS_ERR(watcher)) - return PTR_ERR(watcher); - return 0; -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-generic/pgtable.h --- a/linux-2.6.11-xen-sparse/include/asm-generic/pgtable.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,147 +0,0 @@ -#ifndef _ASM_GENERIC_PGTABLE_H -#define _ASM_GENERIC_PGTABLE_H - -#ifndef __HAVE_ARCH_PTEP_ESTABLISH -/* - * Establish a new mapping: - * - flush the old one - * - update the page tables - * - inform the TLB about the new one - * - * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock. - * - * Note: the old pte is known to not be writable, so we don't need to - * worry about dirty bits etc getting lost. - */ -#ifndef __HAVE_ARCH_SET_PTE_ATOMIC -#define ptep_establish(__vma, __address, __ptep, __entry) \ -do { \ - set_pte(__ptep, __entry); \ - flush_tlb_page(__vma, __address); \ -} while (0) -#else /* __HAVE_ARCH_SET_PTE_ATOMIC */ -#define ptep_establish(__vma, __address, __ptep, __entry) \ -do { \ - set_pte_atomic(__ptep, __entry); \ - flush_tlb_page(__vma, __address); \ -} while (0) -#endif /* __HAVE_ARCH_SET_PTE_ATOMIC */ -#endif - -#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS -/* - * Largely same as above, but only sets the access flags (dirty, - * accessed, and writable). Furthermore, we know it always gets set - * to a "more permissive" setting, which allows most architectures - * to optimize this. 
- */ -#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \ -do { \ - set_pte(__ptep, __entry); \ - flush_tlb_page(__vma, __address); \ -} while (0) -#endif - -#ifndef __HAVE_ARCH_PTEP_ESTABLISH_NEW -/* - * Establish a mapping where none previously existed - */ -#define ptep_establish_new(__vma, __address, __ptep, __entry) \ -do { \ - set_pte(__ptep, __entry); \ -} while (0) -#endif - -#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static inline int ptep_test_and_clear_young(pte_t *ptep) -{ - pte_t pte = *ptep; - if (!pte_young(pte)) - return 0; - set_pte(ptep, pte_mkold(pte)); - return 1; -} -#endif - -#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -#define ptep_clear_flush_young(__vma, __address, __ptep) \ -({ \ - int __young = ptep_test_and_clear_young(__ptep); \ - if (__young) \ - flush_tlb_page(__vma, __address); \ - __young; \ -}) -#endif - -#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY -static inline int ptep_test_and_clear_dirty(pte_t *ptep) -{ - pte_t pte = *ptep; - if (!pte_dirty(pte)) - return 0; - set_pte(ptep, pte_mkclean(pte)); - return 1; -} -#endif - -#ifndef __HAVE_ARCH_PTEP_CLEAR_DIRTY_FLUSH -#define ptep_clear_flush_dirty(__vma, __address, __ptep) \ -({ \ - int __dirty = ptep_test_and_clear_dirty(__ptep); \ - if (__dirty) \ - flush_tlb_page(__vma, __address); \ - __dirty; \ -}) -#endif - -#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR -static inline pte_t ptep_get_and_clear(pte_t *ptep) -{ - pte_t pte = *ptep; - pte_clear(ptep); - return pte; -} -#endif - -#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH -#define ptep_clear_flush(__vma, __address, __ptep) \ -({ \ - pte_t __pte = ptep_get_and_clear(__ptep); \ - flush_tlb_page(__vma, __address); \ - __pte; \ -}) -#endif - -#ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT -static inline void ptep_set_wrprotect(pte_t *ptep) -{ - pte_t old_pte = *ptep; - set_pte(ptep, pte_wrprotect(old_pte)); -} -#endif - -#ifndef __HAVE_ARCH_PTEP_MKDIRTY -static inline void ptep_mkdirty(pte_t *ptep) -{ - pte_t old_pte = *ptep; - set_pte(ptep, pte_mkdirty(old_pte)); -} -#endif - -#ifndef __HAVE_ARCH_PTE_SAME -#define pte_same(A,B) (pte_val(A) == pte_val(B)) -#endif - -#ifndef __HAVE_ARCH_PAGE_TEST_AND_CLEAR_DIRTY -#define page_test_and_clear_dirty(page) (0) -#endif - -#ifndef __HAVE_ARCH_PAGE_TEST_AND_CLEAR_YOUNG -#define page_test_and_clear_young(page) (0) -#endif - -#ifndef __HAVE_ARCH_PGD_OFFSET_GATE -#define pgd_offset_gate(mm, addr) pgd_offset(mm, addr) -#endif - -#endif /* _ASM_GENERIC_PGTABLE_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/agp.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/agp.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,37 +0,0 @@ -#ifndef AGP_H -#define AGP_H 1 - -#include <asm/pgtable.h> -#include <asm/cacheflush.h> -#include <asm/system.h> - -/* - * Functions to keep the agpgart mappings coherent with the MMU. - * The GART gives the CPU a physical alias of pages in memory. The alias region is - * mapped uncacheable. Make sure there are no conflicting mappings - * with different cachability attributes for the same page. This avoids - * data corruption on some CPUs. - */ - -int map_page_into_agp(struct page *page); -int unmap_page_from_agp(struct page *page); -#define flush_agp_mappings() global_flush_tlb() - -/* Could use CLFLUSH here if the cpu supports it. But then it would - need to be called for each cacheline of the whole page so it may not be - worth it. Would need a page for it. 
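- * (Editorial sketch, assuming a CLFLUSH-capable CPU were detected: the
- * per-page flush would be a loop over cache lines, roughly
- *
- *     char *p;
- *     for (p = addr; p < addr + PAGE_SIZE; p += boot_cpu_data.x86_clflush_size)
- *         asm volatile("clflush %0" : "+m" (*p));
- *
- * with addr the start of the page; wbinvd() below writes back and
- * invalidates the whole cache instead.)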
*/ -#define flush_agp_cache() wbinvd() - -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) phys_to_machine(x) -#define gart_to_phys(x) machine_to_phys(x) - -/* GATT allocation. Returns/accepts GATT kernel virtual address. */ -#define alloc_gatt_pages(order) ({ \ - char *_t; dma_addr_t _d; \ - _t = dma_alloc_coherent(NULL,PAGE_SIZE<<(order),&_d,GFP_KERNEL); \ - _t; }) -#define free_gatt_pages(table, order) \ - dma_free_coherent(NULL,PAGE_SIZE<<(order),(table),virt_to_bus(table)) - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/desc.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/desc.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,142 +0,0 @@ -#ifndef __ARCH_DESC_H -#define __ARCH_DESC_H - -#include <asm/ldt.h> -#include <asm/segment.h> - -#ifndef __ASSEMBLY__ - -#include <linux/preempt.h> -#include <linux/smp.h> - -#include <asm/mmu.h> - -extern struct desc_struct cpu_gdt_table[NR_CPUS][GDT_ENTRIES]; - -struct Xgt_desc_struct { - unsigned short size; - unsigned long address __attribute__((packed)); - unsigned short pad; -} __attribute__ ((packed)); - -extern struct Xgt_desc_struct idt_descr, cpu_gdt_descr[NR_CPUS]; - -#define load_TR_desc() __asm__ __volatile__("ltr %%ax"::"a" (GDT_ENTRY_TSS*8)) -#define load_LDT_desc() __asm__ __volatile__("lldt %%ax"::"a" (GDT_ENTRY_LDT*8)) - -#define get_cpu_gdt_table(_cpu) ((struct desc_struct *)cpu_gdt_descr[(_cpu)].address) - -/* - * This is the ldt that every process will get unless we need - * something other than this. - */ -extern struct desc_struct default_ldt[]; -extern void set_intr_gate(unsigned int irq, void * addr); - -#define _set_tssldt_desc(n,addr,limit,type) \ -__asm__ __volatile__ ("movw %w3,0(%2)\n\t" \ - "movw %%ax,2(%2)\n\t" \ - "rorl $16,%%eax\n\t" \ - "movb %%al,4(%2)\n\t" \ - "movb %4,5(%2)\n\t" \ - "movb $0,6(%2)\n\t" \ - "movb %%ah,7(%2)\n\t" \ - "rorl $16,%%eax" \ - : "=m"(*(n)) : "a" (addr), "r"(n), "ir"(limit), "i"(type)) - -static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, void *addr) -{ - _set_tssldt_desc(&get_cpu_gdt_table(cpu)[entry], (int)addr, - offsetof(struct tss_struct, __cacheline_filler) - 1, 0x89); -} - -#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr) - -static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size) -{ - _set_tssldt_desc(&get_cpu_gdt_table(cpu)[GDT_ENTRY_LDT], - (int)addr, ((size << 3)-1), 0x82); -} - -#define LDT_entry_a(info) \ - ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff)) - -#define LDT_entry_b(info) \ - (((info)->base_addr & 0xff000000) | \ - (((info)->base_addr & 0x00ff0000) >> 16) | \ - ((info)->limit & 0xf0000) | \ - (((info)->read_exec_only ^ 1) << 9) | \ - ((info)->contents << 10) | \ - (((info)->seg_not_present ^ 1) << 15) | \ - ((info)->seg_32bit << 22) | \ - ((info)->limit_in_pages << 23) | \ - ((info)->useable << 20) | \ - 0x7000) - -#define LDT_empty(info) (\ - (info)->base_addr == 0 && \ - (info)->limit == 0 && \ - (info)->contents == 0 && \ - (info)->read_exec_only == 1 && \ - (info)->seg_32bit == 0 && \ - (info)->limit_in_pages == 0 && \ - (info)->seg_not_present == 1 && \ - (info)->useable == 0 ) - -#if TLS_SIZE != 24 -# error update this code. 
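-/* TLS_SIZE == 24 corresponds to three 8-byte TLS descriptors; load_TLS()
- * below hard-codes exactly three updates, C(0)..C(2). (Editorial note:
- * the entry count is conventionally named GDT_ENTRY_TLS_ENTRIES, which
- * is assumed here.) */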
-#endif - -static inline void load_TLS(struct thread_struct *t, unsigned int cpu) -{ -#define C(i) HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), ((u32 *)&t->tls_array[i])[0], ((u32 *)&t->tls_array[i])[1]) - C(0); C(1); C(2); -#undef C -} - -static inline void clear_LDT(void) -{ - int cpu = get_cpu(); - - /* - * NB. We load the default_ldt for lcall7/27 handling on demand, as - * it slows down context switching. Noone uses it anyway. - */ - cpu = cpu; /* XXX avoid compiler warning */ - xen_set_ldt(0UL, 0); - put_cpu(); -} - -/* - * load one particular LDT into the current CPU - */ -static inline void load_LDT_nolock(mm_context_t *pc, int cpu) -{ - void *segments = pc->ldt; - int count = pc->size; - - if (likely(!count)) - segments = NULL; - - xen_set_ldt((unsigned long)segments, count); -} - -static inline void load_LDT(mm_context_t *pc) -{ - int cpu = get_cpu(); - load_LDT_nolock(pc, cpu); - put_cpu(); -} - -static inline unsigned long get_desc_base(unsigned long *desc) -{ - unsigned long base; - base = ((desc[0] >> 16) & 0x0000ffff) | - ((desc[1] << 16) & 0x00ff0000) | - (desc[1] & 0xff000000); - return base; -} - -#endif /* !__ASSEMBLY__ */ - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/dma-mapping.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/dma-mapping.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,177 +0,0 @@ -#ifndef _ASM_I386_DMA_MAPPING_H -#define _ASM_I386_DMA_MAPPING_H - -#include <linux/mm.h> - -#include <asm/cache.h> -#include <asm/io.h> -#include <asm/scatterlist.h> - -#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) -#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) - -void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, int flag); - -void dma_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle); - -static inline dma_addr_t -dma_map_single(struct device *dev, void *ptr, size_t size, - enum dma_data_direction direction) -{ - BUG_ON(direction == DMA_NONE); - flush_write_buffers(); - return virt_to_bus(ptr); -} - -static inline void -dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, - enum dma_data_direction direction) -{ - BUG_ON(direction == DMA_NONE); -} - -static inline int -dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction direction) -{ - int i; - - BUG_ON(direction == DMA_NONE); - - for (i = 0; i < nents; i++ ) { - BUG_ON(!sg[i].page); - - sg[i].dma_address = page_to_phys(sg[i].page) + sg[i].offset; - } - - flush_write_buffers(); - return nents; -} - -static inline dma_addr_t -dma_map_page(struct device *dev, struct page *page, unsigned long offset, - size_t size, enum dma_data_direction direction) -{ - BUG_ON(direction == DMA_NONE); - return page_to_phys(page) + offset; -} - -static inline void -dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, - enum dma_data_direction direction) -{ - BUG_ON(direction == DMA_NONE); -} - - -static inline void -dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries, - enum dma_data_direction direction) -{ - BUG_ON(direction == DMA_NONE); -} - -static inline void -dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ -} - -static inline void -dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, - enum 
dma_data_direction direction) -{ - flush_write_buffers(); -} - -static inline void -dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle, - unsigned long offset, size_t size, - enum dma_data_direction direction) -{ -} - -static inline void -dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle, - unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - flush_write_buffers(); -} - -static inline void -dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, - enum dma_data_direction direction) -{ -} - -static inline void -dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, - enum dma_data_direction direction) -{ - flush_write_buffers(); -} - -static inline int -dma_mapping_error(dma_addr_t dma_addr) -{ - return 0; -} - -static inline int -dma_supported(struct device *dev, u64 mask) -{ - /* - * we fall back to GFP_DMA when the mask isn't all 1s, - * so we can't guarantee allocations that must be - * within a tighter range than GFP_DMA.. - */ - if(mask < 0x00ffffff) - return 0; - - return 1; -} - -static inline int -dma_set_mask(struct device *dev, u64 mask) -{ - if(!dev->dma_mask || !dma_supported(dev, mask)) - return -EIO; - - *dev->dma_mask = mask; - - return 0; -} - -static inline int -dma_get_cache_alignment(void) -{ - /* no easy way to get cache size on all x86, so return the - * maximum possible, to be safe */ - return (1 << L1_CACHE_SHIFT_MAX); -} - -#define dma_is_consistent(d) (1) - -static inline void -dma_cache_sync(void *vaddr, size_t size, - enum dma_data_direction direction) -{ - flush_write_buffers(); -} - -#define ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY -extern int -dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, - dma_addr_t device_addr, size_t size, int flags); - -extern void -dma_release_declared_memory(struct device *dev); - -extern void * -dma_mark_declared_memory_occupied(struct device *dev, - dma_addr_t device_addr, size_t size); - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/fixmap.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/fixmap.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,168 +0,0 @@ -/* - * fixmap.h: compile-time virtual memory allocation - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 1998 Ingo Molnar - * - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 - */ - -#ifndef _ASM_FIXMAP_H -#define _ASM_FIXMAP_H - -#include <linux/config.h> - -/* used by vmalloc.c, vsyscall.lds.S. - * - * Leave one empty page between vmalloc'ed areas and - * the start of the fixmap. - */ -#define __FIXADDR_TOP (HYPERVISOR_VIRT_START - 2 * PAGE_SIZE) - -#ifndef __ASSEMBLY__ -#include <linux/kernel.h> -#include <asm/acpi.h> -#include <asm/apicdef.h> -#include <asm/page.h> -#include <asm-xen/gnttab.h> -#ifdef CONFIG_HIGHMEM -#include <linux/threads.h> -#include <asm/kmap_types.h> -#endif - -/* - * Here we define all the compile-time 'special' virtual - * addresses. The point is to have a constant address at - * compile time, but to set the physical address only - * in the boot process. We allocate these special addresses - * from the end of virtual memory (0xfffff000) backwards. 
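- * (Editorial example: index 0, FIX_HOLE, maps highest, and each
- * successive enum value sits one page lower, per
- * __fix_to_virt(x) = FIXADDR_TOP - (x << PAGE_SHIFT) further down.)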
- * Also this lets us do fail-safe vmalloc(), we - * can guarantee that these special addresses and - * vmalloc()-ed addresses never overlap. - * - * these 'compile-time allocated' memory buffers are - * fixed-size 4k pages. (or larger if used with an increment - * highger than 1) use fixmap_set(idx,phys) to associate - * physical memory with fixmap indices. - * - * TLB entries of such buffers will not be flushed across - * task switches. - */ -enum fixed_addresses { - FIX_HOLE, - FIX_VSYSCALL, -#ifdef CONFIG_X86_LOCAL_APIC - FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ -#endif -#ifdef CONFIG_X86_IO_APIC - FIX_IO_APIC_BASE_0, - FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1, -#endif -#ifdef CONFIG_X86_VISWS_APIC - FIX_CO_CPU, /* Cobalt timer */ - FIX_CO_APIC, /* Cobalt APIC Redirection Table */ - FIX_LI_PCIA, /* Lithium PCI Bridge A */ - FIX_LI_PCIB, /* Lithium PCI Bridge B */ -#endif -#ifdef CONFIG_X86_F00F_BUG - FIX_F00F_IDT, /* Virtual mapping for IDT */ -#endif -#ifdef CONFIG_X86_CYCLONE_TIMER - FIX_CYCLONE_TIMER, /*cyclone timer register*/ -#endif -#ifdef CONFIG_HIGHMEM - FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ - FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, -#endif -#ifdef CONFIG_ACPI_BOOT - FIX_ACPI_BEGIN, - FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, - FIX_ACPI_RSDP_PAGE, -#endif -#ifdef CONFIG_PCI_MMCONFIG - FIX_PCIE_MCFG, -#endif - FIX_SHARED_INFO, - FIX_GNTTAB_BEGIN, - FIX_GNTTAB_END = FIX_GNTTAB_BEGIN + NR_GRANT_FRAMES - 1, -#ifdef CONFIG_XEN_PHYSDEV_ACCESS -#define NR_FIX_ISAMAPS 256 - FIX_ISAMAP_END, - FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1, -#endif - __end_of_permanent_fixed_addresses, - /* temporary boot-time mappings, used before ioremap() is functional */ -#define NR_FIX_BTMAPS 16 - FIX_BTMAP_END = __end_of_permanent_fixed_addresses, - FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS - 1, - FIX_WP_TEST, - __end_of_fixed_addresses -}; - -extern void __set_fixmap (enum fixed_addresses idx, - unsigned long phys, pgprot_t flags); - -#define set_fixmap(idx, phys) \ - __set_fixmap(idx, phys, PAGE_KERNEL) -/* - * Some hardware wants to get fixmapped without caching. - */ -#define set_fixmap_nocache(idx, phys) \ - __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE) - -#define clear_fixmap(idx) \ - __set_fixmap(idx, 0, __pgprot(0)) - -#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP) - -#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) -#define __FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) -#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE) -#define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE) - -#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) -#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT) - -/* - * This is the range that is readable by user mode, and things - * acting like user mode such as get_user_pages. - */ -#define FIXADDR_USER_START (__fix_to_virt(FIX_VSYSCALL)) -#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE) - - -extern void __this_fixmap_does_not_exist(void); - -/* - * 'index to address' translation. If anyone tries to use the idx - * directly without tranlation, we catch the bug with a NULL-deference - * kernel oops. Illegal ranges of incoming indices are caught too. - */ -static __always_inline unsigned long fix_to_virt(const unsigned int idx) -{ - /* - * this branch gets completely eliminated after inlining, - * except when someone tries to use fixaddr indices in an - * illegal way. 
(such as mixing up address types or using - * out-of-range indices). - * - * If it doesn't get removed, the linker will complain - * loudly with a reasonably clear error message.. - */ - if (idx >= __end_of_fixed_addresses) - __this_fixmap_does_not_exist(); - - return __fix_to_virt(idx); -} - -static inline unsigned long virt_to_fix(const unsigned long vaddr) -{ - BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); - return __virt_to_fix(vaddr); -} - -#endif /* !__ASSEMBLY__ */ -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/floppy.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/floppy.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,147 +0,0 @@ -/* - * Architecture specific parts of the Floppy driver - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 1995 - * - * Modifications for Xen are Copyright (c) 2004, Keir Fraser. - */ -#ifndef __ASM_XEN_I386_FLOPPY_H -#define __ASM_XEN_I386_FLOPPY_H - -#include <linux/vmalloc.h> - -/* XEN: Hit DMA paths on the head. This trick from asm-m68k/floppy.h. */ -#include <asm/dma.h> -#undef MAX_DMA_ADDRESS -#define MAX_DMA_ADDRESS 0 -#define CROSS_64KB(a,s) (0) - -#define fd_inb(port) inb_p(port) -#define fd_outb(value,port) outb_p(value,port) - -#define fd_request_dma() (0) -#define fd_free_dma() ((void)0) -#define fd_enable_irq() enable_irq(FLOPPY_IRQ) -#define fd_disable_irq() disable_irq(FLOPPY_IRQ) -#define fd_free_irq() free_irq(FLOPPY_IRQ, NULL) -#define fd_get_dma_residue() (virtual_dma_count + virtual_dma_residue) -#define fd_dma_setup(addr, size, mode, io) vdma_dma_setup(addr, size, mode, io) -/* - * Do not use vmalloc/vfree: floppy_release_irq_and_dma() gets called from - * softirq context via motor_off_callback. A generic bug we happen to trigger. 
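- * (Editorial note: hence the __get_free_pages()/free_pages() pair just
- * below, which is safe in softirq context, unlike vfree().)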
- */ -#define fd_dma_mem_alloc(size) __get_free_pages(GFP_KERNEL, get_order(size)) -#define fd_dma_mem_free(addr, size) free_pages(addr, get_order(size)) - -static int virtual_dma_count; -static int virtual_dma_residue; -static char *virtual_dma_addr; -static int virtual_dma_mode; -static int doing_pdma; - -static irqreturn_t floppy_hardint(int irq, void *dev_id, struct pt_regs * regs) -{ - register unsigned char st; - register int lcount; - register char *lptr; - - if (!doing_pdma) - return floppy_interrupt(irq, dev_id, regs); - - st = 1; - for(lcount=virtual_dma_count, lptr=virtual_dma_addr; - lcount; lcount--, lptr++) { - st=inb(virtual_dma_port+4) & 0xa0 ; - if(st != 0xa0) - break; - if(virtual_dma_mode) - outb_p(*lptr, virtual_dma_port+5); - else - *lptr = inb_p(virtual_dma_port+5); - } - virtual_dma_count = lcount; - virtual_dma_addr = lptr; - st = inb(virtual_dma_port+4); - - if(st == 0x20) - return IRQ_HANDLED; - if(!(st & 0x20)) { - virtual_dma_residue += virtual_dma_count; - virtual_dma_count=0; - doing_pdma = 0; - floppy_interrupt(irq, dev_id, regs); - return IRQ_HANDLED; - } - return IRQ_HANDLED; -} - -static void fd_disable_dma(void) -{ - doing_pdma = 0; - virtual_dma_residue += virtual_dma_count; - virtual_dma_count=0; -} - -static int fd_request_irq(void) -{ - return request_irq(FLOPPY_IRQ, floppy_hardint,SA_INTERRUPT, - "floppy", NULL); -} - -static int vdma_dma_setup(char *addr, unsigned long size, int mode, int io) -{ - doing_pdma = 1; - virtual_dma_port = io; - virtual_dma_mode = (mode == DMA_MODE_WRITE); - virtual_dma_addr = addr; - virtual_dma_count = size; - virtual_dma_residue = 0; - return 0; -} - -/* XEN: This trick to force 'virtual DMA' is from include/asm-m68k/floppy.h. */ -#define FDC1 xen_floppy_init() -static int FDC2 = -1; - -static int xen_floppy_init(void) -{ - use_virtual_dma = 1; - can_use_virtual_dma = 1; - return 0x3f0; -} - -/* - * Floppy types are stored in the rtc's CMOS RAM and so rtc_lock - * is needed to prevent corrupted CMOS RAM in case "insmod floppy" - * coincides with another rtc CMOS user. Paul G. - */ -#define FLOPPY0_TYPE ({ \ - unsigned long flags; \ - unsigned char val; \ - spin_lock_irqsave(&rtc_lock, flags); \ - val = (CMOS_READ(0x10) >> 4) & 15; \ - spin_unlock_irqrestore(&rtc_lock, flags); \ - val; \ -}) - -#define FLOPPY1_TYPE ({ \ - unsigned long flags; \ - unsigned char val; \ - spin_lock_irqsave(&rtc_lock, flags); \ - val = CMOS_READ(0x10) & 15; \ - spin_unlock_irqrestore(&rtc_lock, flags); \ - val; \ -}) - -#define N_FDC 2 -#define N_DRIVE 8 - -#define FLOPPY_MOTOR_MASK 0xf0 - -#define EXTRA_FLOPPY_PARAMS - -#endif /* __ASM_XEN_I386_FLOPPY_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/highmem.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/highmem.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,82 +0,0 @@ -/* - * highmem.h: virtual kernel memory mappings for high memory - * - * Used in CONFIG_HIGHMEM systems for memory pages which - * are not addressable by direct kernel virtual addresses. - * - * Copyright (C) 1999 Gerhard Wichert, Siemens AG - * Gerhard.Wichert@xxxxxxxxxxxxxx - * - * - * Redesigned the x86 32-bit VM architecture to deal with - * up to 16 Terabyte physical memory. With current x86 CPUs - * we now support up to 64 Gigabytes physical RAM. 
- * - * Copyright (C) 1999 Ingo Molnar <mingo@xxxxxxxxxx> - */ - -#ifndef _ASM_HIGHMEM_H -#define _ASM_HIGHMEM_H - -#ifdef __KERNEL__ - -#include <linux/config.h> -#include <linux/interrupt.h> -#include <linux/threads.h> -#include <asm/kmap_types.h> -#include <asm/tlbflush.h> - -/* declarations for highmem.c */ -extern unsigned long highstart_pfn, highend_pfn; - -extern pte_t *kmap_pte; -extern pgprot_t kmap_prot; -extern pte_t *pkmap_page_table; - -extern void kmap_init(void); - -/* - * Right now we initialize only a single pte table. It can be extended - * easily, subsequent pte tables have to be allocated in one physical - * chunk of RAM. - */ -#ifdef CONFIG_X86_PAE -#define LAST_PKMAP 512 -#else -#define LAST_PKMAP 1024 -#endif -/* - * Ordering is: - * - * FIXADDR_TOP - * fixed_addresses - * FIXADDR_START - * temp fixed addresses - * FIXADDR_BOOT_START - * Persistent kmap area - * PKMAP_BASE - * VMALLOC_END - * Vmalloc area - * VMALLOC_START - * high_memory - */ -#define PKMAP_BASE ( (FIXADDR_BOOT_START - PAGE_SIZE*(LAST_PKMAP + 1)) & PMD_MASK ) -#define LAST_PKMAP_MASK (LAST_PKMAP-1) -#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) -#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) - -extern void * FASTCALL(kmap_high(struct page *page)); -extern void FASTCALL(kunmap_high(struct page *page)); - -void *kmap(struct page *page); -void kunmap(struct page *page); -void *kmap_atomic(struct page *page, enum km_type type); -void *kmap_atomic_pte(struct page *page, enum km_type type); -void kunmap_atomic(void *kvaddr, enum km_type type); -struct page *kmap_atomic_to_page(void *ptr); - -#define flush_cache_kmaps() do { } while (0) - -#endif /* __KERNEL__ */ - -#endif /* _ASM_HIGHMEM_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/hypercall.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/hypercall.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,564 +0,0 @@ -/****************************************************************************** - * hypercall.h - * - * Linux-specific hypervisor handling. - * - * Copyright (c) 2002-2004, K A Fraser - * - * This file may be distributed separately from the Linux kernel, or - * incorporated into other software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#ifndef __HYPERCALL_H__ -#define __HYPERCALL_H__ -#include <asm-xen/xen-public/xen.h> - -/* - * Assembler stubs for hyper-calls. 
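- * (Editorial summary inferred from the stubs below: the hypercall
- * number goes in %eax and up to five arguments in %ebx, %ecx, %edx,
- * %esi and %edi; TRAP_INSTR traps into the hypervisor and the result
- * comes back in %eax. The dummy "ign" outputs mark the argument
- * registers as clobbered so gcc does not reuse them afterwards.)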
- */ - -static inline int -HYPERVISOR_set_trap_table( - trap_info_t *table) -{ - int ret; - unsigned long ignore; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ignore) - : "0" (__HYPERVISOR_set_trap_table), "1" (table) - : "memory" ); - - return ret; -} - -static inline int -HYPERVISOR_mmu_update( - mmu_update_t *req, int count, int *success_count, domid_t domid) -{ - int ret; - unsigned long ign1, ign2, ign3, ign4; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4) - : "0" (__HYPERVISOR_mmu_update), "1" (req), "2" (count), - "3" (success_count), "4" (domid) - : "memory" ); - - return ret; -} - -static inline int -HYPERVISOR_mmuext_op( - struct mmuext_op *op, int count, int *success_count, domid_t domid) -{ - int ret; - unsigned long ign1, ign2, ign3, ign4; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4) - : "0" (__HYPERVISOR_mmuext_op), "1" (op), "2" (count), - "3" (success_count), "4" (domid) - : "memory" ); - - return ret; -} - -static inline int -HYPERVISOR_set_gdt( - unsigned long *frame_list, int entries) -{ - int ret; - unsigned long ign1, ign2; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2) - : "0" (__HYPERVISOR_set_gdt), "1" (frame_list), "2" (entries) - : "memory" ); - - - return ret; -} - -static inline int -HYPERVISOR_stack_switch( - unsigned long ss, unsigned long esp) -{ - int ret; - unsigned long ign1, ign2; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2) - : "0" (__HYPERVISOR_stack_switch), "1" (ss), "2" (esp) - : "memory" ); - - return ret; -} - -static inline int -HYPERVISOR_set_callbacks( - unsigned long event_selector, unsigned long event_address, - unsigned long failsafe_selector, unsigned long failsafe_address) -{ - int ret; - unsigned long ign1, ign2, ign3, ign4; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4) - : "0" (__HYPERVISOR_set_callbacks), "1" (event_selector), - "2" (event_address), "3" (failsafe_selector), "4" (failsafe_address) - : "memory" ); - - return ret; -} - -static inline int -HYPERVISOR_fpu_taskswitch( - int set) -{ - int ret; - unsigned long ign; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign) - : "0" (__HYPERVISOR_fpu_taskswitch), "1" (set) - : "memory" ); - - return ret; -} - -static inline int -HYPERVISOR_yield( - void) -{ - int ret; - unsigned long ign; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign) - : "0" (__HYPERVISOR_sched_op), "1" (SCHEDOP_yield) - : "memory" ); - - return ret; -} - -static inline int -HYPERVISOR_block( - void) -{ - int ret; - unsigned long ign1; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1) - : "0" (__HYPERVISOR_sched_op), "1" (SCHEDOP_block) - : "memory" ); - - return ret; -} - -static inline int -HYPERVISOR_shutdown( - void) -{ - int ret; - unsigned long ign1; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1) - : "0" (__HYPERVISOR_sched_op), - "1" (SCHEDOP_shutdown | (SHUTDOWN_poweroff << SCHEDOP_reasonshift)) - : "memory" ); - - return ret; -} - -static inline int -HYPERVISOR_reboot( - void) -{ - int ret; - unsigned long ign1; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1) - : "0" (__HYPERVISOR_sched_op), - "1" (SCHEDOP_shutdown | (SHUTDOWN_reboot << SCHEDOP_reasonshift)) - : "memory" ); - - return ret; -} - -static inline int -HYPERVISOR_suspend( - unsigned long srec) -{ - 
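-    /* (Editorial note: 'srec' locates the suspend record; as the comment
-     * below says, it must end up in %esi, hence the explicit "S"
-     * constraints in this stub.) */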
int ret; - unsigned long ign1, ign2; - - /* NB. On suspend, control software expects a suspend record in %esi. */ - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=S" (ign2) - : "0" (__HYPERVISOR_sched_op), - "b" (SCHEDOP_shutdown | (SHUTDOWN_suspend << SCHEDOP_reasonshift)), - "S" (srec) : "memory"); - - return ret; -} - -static inline int -HYPERVISOR_crash( - void) -{ - int ret; - unsigned long ign1; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1) - : "0" (__HYPERVISOR_sched_op), - "1" (SCHEDOP_shutdown | (SHUTDOWN_crash << SCHEDOP_reasonshift)) - : "memory" ); - - return ret; -} - -static inline long -HYPERVISOR_set_timer_op( - u64 timeout) -{ - int ret; - unsigned long timeout_hi = (unsigned long)(timeout>>32); - unsigned long timeout_lo = (unsigned long)timeout; - unsigned long ign1, ign2; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2) - : "0" (__HYPERVISOR_set_timer_op), "b" (timeout_lo), "c" (timeout_hi) - : "memory"); - - return ret; -} - -static inline int -HYPERVISOR_dom0_op( - dom0_op_t *dom0_op) -{ - int ret; - unsigned long ign1; - - dom0_op->interface_version = DOM0_INTERFACE_VERSION; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1) - : "0" (__HYPERVISOR_dom0_op), "1" (dom0_op) - : "memory"); - - return ret; -} - -static inline int -HYPERVISOR_set_debugreg( - int reg, unsigned long value) -{ - int ret; - unsigned long ign1, ign2; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2) - : "0" (__HYPERVISOR_set_debugreg), "1" (reg), "2" (value) - : "memory" ); - - return ret; -} - -static inline unsigned long -HYPERVISOR_get_debugreg( - int reg) -{ - unsigned long ret; - unsigned long ign; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign) - : "0" (__HYPERVISOR_get_debugreg), "1" (reg) - : "memory" ); - - return ret; -} - -static inline int -HYPERVISOR_update_descriptor( - unsigned long ma, unsigned long word1, unsigned long word2) -{ - int ret; - unsigned long ign1, ign2, ign3; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3) - : "0" (__HYPERVISOR_update_descriptor), "1" (ma), "2" (word1), - "3" (word2) - : "memory" ); - - return ret; -} - -static inline int -HYPERVISOR_dom_mem_op( - unsigned int op, unsigned long *extent_list, - unsigned long nr_extents, unsigned int extent_order) -{ - int ret; - unsigned long ign1, ign2, ign3, ign4, ign5; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4), - "=D" (ign5) - : "0" (__HYPERVISOR_dom_mem_op), "1" (op), "2" (extent_list), - "3" (nr_extents), "4" (extent_order), "5" (DOMID_SELF) - : "memory" ); - - return ret; -} - -static inline int -HYPERVISOR_multicall( - void *call_list, int nr_calls) -{ - int ret; - unsigned long ign1, ign2; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2) - : "0" (__HYPERVISOR_multicall), "1" (call_list), "2" (nr_calls) - : "memory" ); - - return ret; -} - -static inline int -HYPERVISOR_update_va_mapping( - unsigned long va, pte_t new_val, unsigned long flags) -{ - int ret; - unsigned long ign1, ign2, ign3, ign4; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4) - : "0" (__HYPERVISOR_update_va_mapping), - "1" (va), "2" ((new_val).pte_low), -#ifdef CONFIG_X86_PAE - "3" ((new_val).pte_high), -#else - "3" (0), -#endif - "4" (flags) - : "memory" ); - - if ( unlikely(ret < 0) ) - { - printk(KERN_ALERT 
"Failed update VA mapping: %08lx, %08lx, %08lx\n", - va, (new_val).pte_low, flags); - BUG(); - } - - return ret; -} - -static inline int -HYPERVISOR_event_channel_op( - void *op) -{ - int ret; - unsigned long ignore; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ignore) - : "0" (__HYPERVISOR_event_channel_op), "1" (op) - : "memory" ); - - return ret; -} - -static inline int -HYPERVISOR_xen_version( - int cmd) -{ - int ret; - unsigned long ignore; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ignore) - : "0" (__HYPERVISOR_xen_version), "1" (cmd) - : "memory" ); - - return ret; -} - -static inline int -HYPERVISOR_console_io( - int cmd, int count, char *str) -{ - int ret; - unsigned long ign1, ign2, ign3; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3) - : "0" (__HYPERVISOR_console_io), "1" (cmd), "2" (count), "3" (str) - : "memory" ); - - return ret; -} - -static inline int -HYPERVISOR_physdev_op( - void *physdev_op) -{ - int ret; - unsigned long ign; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign) - : "0" (__HYPERVISOR_physdev_op), "1" (physdev_op) - : "memory" ); - - return ret; -} - -static inline int -HYPERVISOR_grant_table_op( - unsigned int cmd, void *uop, unsigned int count) -{ - int ret; - unsigned long ign1, ign2, ign3; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3) - : "0" (__HYPERVISOR_grant_table_op), "1" (cmd), "2" (uop), "3" (count) - : "memory" ); - - return ret; -} - -static inline int -HYPERVISOR_update_va_mapping_otherdomain( - unsigned long va, pte_t new_val, unsigned long flags, domid_t domid) -{ - int ret; - unsigned long ign1, ign2, ign3, ign4, ign5; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), - "=S" (ign4), "=D" (ign5) - : "0" (__HYPERVISOR_update_va_mapping_otherdomain), - "1" (va), "2" ((new_val).pte_low), -#ifdef CONFIG_X86_PAE - "3" ((new_val).pte_high), -#else - "3" (0), -#endif - "4" (flags), "5" (domid) : - "memory" ); - - return ret; -} - -static inline int -HYPERVISOR_vm_assist( - unsigned int cmd, unsigned int type) -{ - int ret; - unsigned long ign1, ign2; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2) - : "0" (__HYPERVISOR_vm_assist), "1" (cmd), "2" (type) - : "memory" ); - - return ret; -} - -static inline int -HYPERVISOR_boot_vcpu( - unsigned long vcpu, vcpu_guest_context_t *ctxt) -{ - int ret; - unsigned long ign1, ign2; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2) - : "0" (__HYPERVISOR_boot_vcpu), "1" (vcpu), "2" (ctxt) - : "memory"); - - return ret; -} - -static inline int -HYPERVISOR_vcpu_down( - int vcpu) -{ - int ret; - unsigned long ign1; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1) - : "0" (__HYPERVISOR_sched_op), - "1" (SCHEDOP_vcpu_down | (vcpu << SCHEDOP_vcpushift)) - : "memory" ); - - return ret; -} - -static inline int -HYPERVISOR_vcpu_up( - int vcpu) -{ - int ret; - unsigned long ign1; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1) - : "0" (__HYPERVISOR_sched_op), - "1" (SCHEDOP_vcpu_up | (vcpu << SCHEDOP_vcpushift)) - : "memory" ); - - return ret; -} -#endif /* __HYPERCALL_H__ */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/io.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/io.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,425 +0,0 @@ -#ifndef _ASM_IO_H -#define _ASM_IO_H 
- -#include <linux/config.h> -#include <linux/string.h> -#include <linux/compiler.h> - -/* - * This file contains the definitions for the x86 IO instructions - * inb/inw/inl/outb/outw/outl and the "string versions" of the same - * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing" - * versions of the single-IO instructions (inb_p/inw_p/..). - * - * This file is not meant to be obfuscating: it's just complicated - * to (a) handle it all in a way that makes gcc able to optimize it - * as well as possible and (b) trying to avoid writing the same thing - * over and over again with slight variations and possibly making a - * mistake somewhere. - */ - -/* - * Thanks to James van Artsdalen for a better timing-fix than - * the two short jumps: using outb's to a nonexistent port seems - * to guarantee better timings even on fast machines. - * - * On the other hand, I'd like to be sure of a non-existent port: - * I feel a bit unsafe about using 0x80 (should be safe, though) - * - * Linus - */ - - /* - * Bit simplified and optimized by Jan Hubicka - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999. - * - * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added, - * isa_read[wl] and isa_write[wl] fixed - * - Arnaldo Carvalho de Melo <acme@xxxxxxxxxxxxxxxx> - */ - -#define IO_SPACE_LIMIT 0xffff - -#define XQUAD_PORTIO_BASE 0xfe400000 -#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */ - -#ifdef __KERNEL__ - -#include <asm-generic/iomap.h> - -#include <linux/vmalloc.h> -#include <asm/fixmap.h> - -/** - * virt_to_phys - map virtual addresses to physical - * @address: address to remap - * - * The returned physical address is the physical (CPU) mapping for - * the memory address given. It is only valid to use this function on - * addresses directly mapped or allocated via kmalloc. - * - * This function does not give bus mappings for DMA transfers. In - * almost all conceivable cases a device driver should not be using - * this function - */ - -static inline unsigned long virt_to_phys(volatile void * address) -{ - return __pa(address); -} - -/** - * phys_to_virt - map physical address to virtual - * @address: address to remap - * - * The returned virtual address is a current CPU mapping for - * the memory address given. It is only valid to use this function on - * addresses that have a kernel mapping - * - * This function does not handle bus mappings for DMA transfers. In - * almost all conceivable cases a device driver should not be using - * this function - */ - -static inline void * phys_to_virt(unsigned long address) -{ - return __va(address); -} - -/* - * Change "struct page" to physical address. 
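virt_to_phys()/phys_to_virt() above are pure arithmetic because lowmem is direct-mapped at PAGE_OFFSET; no table walk is involved. A worked example, assuming the default 3G/1G split (__PAGE_OFFSET 0xC0000000, as defined in the page.h hunk further down):

    void *v = (void *)0xC0123000UL;     /* hypothetical lowmem address      */
    unsigned long p = virt_to_phys(v);  /* 0xC0123000 - PAGE_OFFSET
                                           = 0x00123000                     */
    void *back = phys_to_virt(p);       /* adds PAGE_OFFSET back: == v      */

Note that "physical" here means Xen's pseudo-physical space; getting a machine address takes the extra phys_to_machine() hop used just below.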
- */ -#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) -#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page))) - -#define bio_to_pseudophys(bio) (page_to_pseudophys(bio_page((bio))) + \ - (unsigned long) bio_offset((bio))) -#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \ - (unsigned long) (bv)->bv_offset) - -#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ - (((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) && \ - ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \ - bvec_to_pseudophys((vec2)))) - -extern void __iomem * __ioremap(unsigned long offset, unsigned long size, unsigned long flags); - -/** - * ioremap - map bus memory into CPU space - * @offset: bus address of the memory - * @size: size of the resource to map - * - * ioremap performs a platform specific sequence of operations to - * make bus memory CPU accessible via the readb/readw/readl/writeb/ - * writew/writel functions and the other mmio helpers. The returned - * address is not guaranteed to be usable directly as a virtual - * address. - */ - -static inline void __iomem * ioremap(unsigned long offset, unsigned long size) -{ - return __ioremap(offset, size, 0); -} - -extern void __iomem * ioremap_nocache(unsigned long offset, unsigned long size); -extern void iounmap(volatile void __iomem *addr); - -/* - * bt_ioremap() and bt_iounmap() are for temporary early boot-time - * mappings, before the real ioremap() is functional. - * A boot-time mapping is currently limited to at most 16 pages. - */ -extern void *bt_ioremap(unsigned long offset, unsigned long size); -extern void bt_iounmap(void *addr, unsigned long size); - -/* - * ISA I/O bus memory addresses are 1:1 with the physical address. - */ -#define isa_virt_to_bus(_x) isa_virt_to_bus_is_UNSUPPORTED->x -#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->x -#ifdef CONFIG_XEN_PHYSDEV_ACCESS -#define isa_bus_to_virt(_x) (void *)(__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x)) -#else -#define isa_bus_to_virt(_x) isa_bus_to_virt_needs_PRIVILEGED_BUILD -#endif - -/* - * However PCI ones are not necessarily 1:1 and therefore these interfaces - * are forbidden in portable PCI drivers. - * - * Allow them on x86 for legacy drivers, though. - */ -#define virt_to_bus(_x) phys_to_machine(__pa(_x)) -#define bus_to_virt(_x) __va(machine_to_phys(_x)) - -/* - * readX/writeX() are used to access memory mapped devices. On some - * architectures the memory mapped IO stuff needs to be accessed - * differently. On the x86 architecture, we just read/write the - * memory location directly. 
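Unlike native i386, virt_to_bus() in this header is not an identity on physical addresses: it composes __pa() (virtual to pseudo-physical) with phys_to_machine() (pseudo-physical frame to machine frame via the p2m table). Spelled out as a sketch, using the helpers defined in the page.h hunk later in this patch:

    unsigned long bus_addr_of(void *v)
    {
        unsigned long phys = __pa(v);                        /* virt -> pseudo-phys */
        unsigned long mfn  = pfn_to_mfn(phys >> PAGE_SHIFT); /* p2m lookup          */
        /* reattach the in-page offset: this is phys_to_machine(phys) */
        return (mfn << PAGE_SHIFT) | (phys & ~PAGE_MASK);
    }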
- */ - -static inline unsigned char readb(const volatile void __iomem *addr) -{ - return *(volatile unsigned char __force *) addr; -} -static inline unsigned short readw(const volatile void __iomem *addr) -{ - return *(volatile unsigned short __force *) addr; -} -static inline unsigned int readl(const volatile void __iomem *addr) -{ - return *(volatile unsigned int __force *) addr; -} -#define readb_relaxed(addr) readb(addr) -#define readw_relaxed(addr) readw(addr) -#define readl_relaxed(addr) readl(addr) -#define __raw_readb readb -#define __raw_readw readw -#define __raw_readl readl - -static inline void writeb(unsigned char b, volatile void __iomem *addr) -{ - *(volatile unsigned char __force *) addr = b; -} -static inline void writew(unsigned short b, volatile void __iomem *addr) -{ - *(volatile unsigned short __force *) addr = b; -} -static inline void writel(unsigned int b, volatile void __iomem *addr) -{ - *(volatile unsigned int __force *) addr = b; -} -#define __raw_writeb writeb -#define __raw_writew writew -#define __raw_writel writel - -#define mmiowb() - -static inline void memset_io(volatile void __iomem *addr, unsigned char val, int count) -{ - memset((void __force *) addr, val, count); -} -static inline void memcpy_fromio(void *dst, const volatile void __iomem *src, int count) -{ - __memcpy(dst, (void __force *) src, count); -} -static inline void memcpy_toio(volatile void __iomem *dst, const void *src, int count) -{ - __memcpy((void __force *) dst, src, count); -} - -/* - * ISA space is 'always mapped' on a typical x86 system, no need to - * explicitly ioremap() it. The fact that the ISA IO space is mapped - * to PAGE_OFFSET is pure coincidence - it does not mean ISA values - * are physical addresses. The following constant pointer can be - * used as the IO-area pointer (it can be iounmapped as well, so the - * analogy with PCI is quite large): - */ -#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET)) - -#define isa_readb(a) readb(__ISA_IO_base + (a)) -#define isa_readw(a) readw(__ISA_IO_base + (a)) -#define isa_readl(a) readl(__ISA_IO_base + (a)) -#define isa_writeb(b,a) writeb(b,__ISA_IO_base + (a)) -#define isa_writew(w,a) writew(w,__ISA_IO_base + (a)) -#define isa_writel(l,a) writel(l,__ISA_IO_base + (a)) -#define isa_memset_io(a,b,c) memset_io(__ISA_IO_base + (a),(b),(c)) -#define isa_memcpy_fromio(a,b,c) memcpy_fromio((a),__ISA_IO_base + (b),(c)) -#define isa_memcpy_toio(a,b,c) memcpy_toio(__ISA_IO_base + (a),(b),(c)) - - -/* - * Again, i386 does not require mem IO specific function. - */ - -#define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void __force *)(b),(c),(d)) -#define isa_eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void __force *)(__ISA_IO_base + (b)),(c),(d)) - -/** - * check_signature - find BIOS signatures - * @io_addr: mmio address to check - * @signature: signature block - * @length: length of signature - * - * Perform a signature comparison with the mmio address io_addr. This - * address should have been obtained by ioremap. - * Returns 1 on a match. - */ - -static inline int check_signature(volatile void __iomem * io_addr, - const unsigned char *signature, int length) -{ - int retval = 0; - do { - if (readb(io_addr) != *signature) - goto out; - io_addr++; - signature++; - length--; - } while (length); - retval = 1; -out: - return retval; -} - -/* - * Cache management - * - * This needed for two cases - * 1. Out of order aware processors - * 2. 
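The readX()/writeX() accessors above compile to plain loads and stores through the __iomem cookie returned by ioremap(). A hypothetical driver fragment (the bus address and register offsets are made up for illustration):

    void __iomem *regs = ioremap(0xFEBF0000UL, 0x1000); /* hypothetical BAR  */
    if (regs) {
        unsigned int id = readl(regs + 0x00);           /* 32-bit reg read   */
        writel(0x1, regs + 0x04);                       /* hypothetical
                                                           doorbell write    */
        iounmap(regs);
    }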
Accidentally out of order processors (PPro errata #51) - */ - -#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE) - -static inline void flush_write_buffers(void) -{ - __asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory"); -} - -#define dma_cache_inv(_start,_size) flush_write_buffers() -#define dma_cache_wback(_start,_size) flush_write_buffers() -#define dma_cache_wback_inv(_start,_size) flush_write_buffers() - -#else - -/* Nothing to do */ - -#define dma_cache_inv(_start,_size) do { } while (0) -#define dma_cache_wback(_start,_size) do { } while (0) -#define dma_cache_wback_inv(_start,_size) do { } while (0) -#define flush_write_buffers() - -#endif - -#endif /* __KERNEL__ */ - -#ifdef SLOW_IO_BY_JUMPING -#define __SLOW_DOWN_IO "jmp 1f; 1: jmp 1f; 1:" -#elif defined(__UNSAFE_IO__) -#define __SLOW_DOWN_IO "outb %%al,$0x80;" -#else -#define __SLOW_DOWN_IO "\n1: outb %%al,$0x80\n" \ - "2:\n" \ - ".section __ex_table,\"a\"\n\t" \ - ".align 4\n\t" \ - ".long 1b,2b\n" \ - ".previous" -#endif - -static inline void slow_down_io(void) { - __asm__ __volatile__( - __SLOW_DOWN_IO -#ifdef REALLY_SLOW_IO - __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO -#endif - : : ); -} - -#ifdef CONFIG_X86_NUMAQ -extern void *xquad_portio; /* Where the IO area was mapped */ -#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port) -#define __BUILDIO(bwl,bw,type) \ -static inline void out##bwl##_quad(unsigned type value, int port, int quad) { \ - if (xquad_portio) \ - write##bwl(value, XQUAD_PORT_ADDR(port, quad)); \ - else \ - out##bwl##_local(value, port); \ -} \ -static inline void out##bwl(unsigned type value, int port) { \ - out##bwl##_quad(value, port, 0); \ -} \ -static inline unsigned type in##bwl##_quad(int port, int quad) { \ - if (xquad_portio) \ - return read##bwl(XQUAD_PORT_ADDR(port, quad)); \ - else \ - return in##bwl##_local(port); \ -} \ -static inline unsigned type in##bwl(int port) { \ - return in##bwl##_quad(port, 0); \ -} -#else -#define __BUILDIO(bwl,bw,type) \ -static inline void out##bwl(unsigned type value, int port) { \ - out##bwl##_local(value, port); \ -} \ -static inline unsigned type in##bwl(int port) { \ - return in##bwl##_local(port); \ -} -#endif - - -#if __UNSAFE_IO__ -#define ____BUILDIO(bwl,bw,type) \ -static inline void out##bwl##_local(unsigned type value, int port) { \ - __asm__ __volatile__("out" #bwl " %" #bw "0, %w1" : : "a"(value), "Nd"(port)); \ -} \ -static inline unsigned type in##bwl##_local(int port) { \ - unsigned type value; \ - __asm__ __volatile__("in" #bwl " %w1, %" #bw "0" : "=a"(value) : "Nd"(port)); \ - return value; \ -} -#else -#define ____BUILDIO(bwl,bw,type) \ -static inline void out##bwl##_local(unsigned type value, int port) { \ - __asm__ __volatile__("1: out" #bwl " %" #bw "0, %w1\n" \ - "2:\n" \ - ".section __ex_table,\"a\"\n\t" \ - ".align 4\n\t" \ - ".long 1b,2b\n" \ - ".previous" : : "a"(value), "Nd"(port)); \ -} \ -static inline unsigned type in##bwl##_local(int port) { \ - unsigned type value; \ - __asm__ __volatile__("1:in" #bwl " %w1, %" #bw "0\n" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3: mov" #bwl " $~0,%" #bw "0\n\t" \ - "jmp 2b\n" \ - ".previous\n" \ - ".section __ex_table,\"a\"\n\t" \ - ".align 4\n\t" \ - ".long 1b,3b\n" \ - ".previous" : "=a"(value) : "Nd"(port)); \ - return value; \ -} -#endif - -#define BUILDIO(bwl,bw,type) \ -____BUILDIO(bwl,bw,type) \ -static inline void out##bwl##_local_p(unsigned type value, int port) { \ - out##bwl##_local(value, port); \ - slow_down_io(); \ -} \ 
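/*
 * For orientation while reading the macro soup above and below: once
 * BUILDIO(b,b,char) is instantiated at the end of this file, token
 * pasting has stamped out the whole single-byte port-I/O family.
 * Roughly (a sketch of the generated signatures, not the literal
 * preprocessor output):
 *
 *   static inline void outb(unsigned char value, int port);
 *   static inline void outb_p(unsigned char value, int port);  // + slow_down_io()
 *   static inline unsigned char inb(int port);
 *   static inline unsigned char inb_p(int port);               // + slow_down_io()
 *   static inline void outsb(int port, const void *addr, unsigned long count);
 *   static inline void insb(int port, void *addr, unsigned long count);
 *
 * BUILDIO(w,w,short) and BUILDIO(l,,int) do the same for 16- and
 * 32-bit accesses.
 */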
-static inline unsigned type in##bwl##_local_p(int port) { \ - unsigned type value = in##bwl##_local(port); \ - slow_down_io(); \ - return value; \ -} \ -__BUILDIO(bwl,bw,type) \ -static inline void out##bwl##_p(unsigned type value, int port) { \ - out##bwl(value, port); \ - slow_down_io(); \ -} \ -static inline unsigned type in##bwl##_p(int port) { \ - unsigned type value = in##bwl(port); \ - slow_down_io(); \ - return value; \ -} \ -static inline void outs##bwl(int port, const void *addr, unsigned long count) { \ - __asm__ __volatile__("rep; outs" #bwl : "+S"(addr), "+c"(count) : "d"(port)); \ -} \ -static inline void ins##bwl(int port, void *addr, unsigned long count) { \ - __asm__ __volatile__("rep; ins" #bwl : "+D"(addr), "+c"(count) : "d"(port)); \ -} - -BUILDIO(b,b,char) -BUILDIO(w,w,short) -BUILDIO(l,,int) - -/* We will be supplying our own /dev/mem implementation */ -#define ARCH_HAS_DEV_MEM - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mach-xen/setup_arch_post.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mach-xen/setup_arch_post.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,51 +0,0 @@ -/** - * machine_specific_memory_setup - Hook for machine specific memory setup. - * - * Description: - * This is included late in kernel/setup.c so that it can make - * use of all of the static functions. - **/ - -static char * __init machine_specific_memory_setup(void) -{ - char *who; - unsigned long start_pfn, max_pfn; - - who = "Xen"; - - /* In dom0, we have to start the fake e820 map above the first - * 1MB, in other domains, it can start at 0. */ - if (xen_start_info.flags & SIF_INITDOMAIN) - start_pfn = 0x100; - else - start_pfn = 0; - max_pfn = xen_start_info.nr_pages; - - e820.nr_map = 0; - add_memory_region(PFN_PHYS(start_pfn), PFN_PHYS(max_pfn) - PFN_PHYS(start_pfn), E820_RAM); - - return who; -} - -void __init machine_specific_modify_cpu_capabilities(struct cpuinfo_x86 *c) -{ - clear_bit(X86_FEATURE_VME, c->x86_capability); - clear_bit(X86_FEATURE_DE, c->x86_capability); - clear_bit(X86_FEATURE_PSE, c->x86_capability); - clear_bit(X86_FEATURE_PGE, c->x86_capability); - clear_bit(X86_FEATURE_SEP, c->x86_capability); - if (!(xen_start_info.flags & SIF_PRIVILEGED)) - clear_bit(X86_FEATURE_MTRR, c->x86_capability); -} - -extern void hypervisor_callback(void); -extern void failsafe_callback(void); - -static void __init machine_specific_arch_setup(void) -{ - HYPERVISOR_set_callbacks( - __KERNEL_CS, (unsigned long)hypervisor_callback, - __KERNEL_CS, (unsigned long)failsafe_callback); - - machine_specific_modify_cpu_capabilities(&boot_cpu_data); -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mach-xen/setup_arch_pre.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mach-xen/setup_arch_pre.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,5 +0,0 @@ -/* Hook to call BIOS initialisation function */ - -#define ARCH_SETUP machine_specific_arch_setup(); - -static void __init machine_specific_arch_setup(void); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mach-xen/smpboot_hooks.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mach-xen/smpboot_hooks.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,55 +0,0 @@ -/* two abstractions specific to kernel/smpboot.c, mainly to cater to visws - * which needs to alter them. 
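machine_specific_memory_setup() in the setup_arch_post.h hunk above fabricates a single-entry e820 map spanning the domain's pseudo-physical memory: dom0 starts it at PFN 0x100 (the 1 MB mark) so the fake map stays clear of the legacy/BIOS area, while unprivileged domains start at 0. The one region it builds, spelled out (a sketch; initial_domain and nr_pages stand in for the xen_start_info fields tested above):

    unsigned long start_pfn = initial_domain ? 0x100 : 0; /* SIF_INITDOMAIN     */
    unsigned long max_pfn   = nr_pages;                   /* from xen_start_info */

    /* one E820_RAM region covering [start_pfn, max_pfn) */
    add_memory_region(PFN_PHYS(start_pfn),
                      PFN_PHYS(max_pfn) - PFN_PHYS(start_pfn),
                      E820_RAM);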
*/ - -static inline void smpboot_clear_io_apic_irqs(void) -{ -#ifdef CONFIG_X86_IO_APIC - io_apic_irqs = 0; -#endif -} - -static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) -{ -#if 1 - printk("smpboot_setup_warm_reset_vector\n"); -#else - CMOS_WRITE(0xa, 0xf); - local_flush_tlb(); - Dprintk("1.\n"); - *((volatile unsigned short *) TRAMPOLINE_HIGH) = start_eip >> 4; - Dprintk("2.\n"); - *((volatile unsigned short *) TRAMPOLINE_LOW) = start_eip & 0xf; - Dprintk("3.\n"); -#endif -} - -static inline void smpboot_restore_warm_reset_vector(void) -{ - /* - * Install writable page 0 entry to set BIOS data area. - */ - local_flush_tlb(); - - /* - * Paranoid: Set warm reset code and vector here back - * to default values. - */ - CMOS_WRITE(0, 0xf); - - *((volatile long *) phys_to_virt(0x467)) = 0; -} - -static inline void smpboot_setup_io_apic(void) -{ -#ifdef CONFIG_X86_IO_APIC - /* - * Here we can be sure that there is an IO-APIC in the system. Let's - * go and set it up: - */ - if (!skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); -#endif -} - - -#define smp_found_config (HYPERVISOR_shared_info->n_vcpu > 1) diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,26 +0,0 @@ -#ifndef __i386_MMU_H -#define __i386_MMU_H - -#include <asm/semaphore.h> -/* - * The i386 doesn't have a mmu context, but - * we put the segment information here. - * - * cpu_vm_mask is used to optimize ldt flushing. - */ -typedef struct { - int size; - struct semaphore sem; - void *ldt; - unsigned pinned:1; - struct list_head unpinned; -} mm_context_t; - -extern struct list_head mm_unpinned; -extern spinlock_t mm_unpinned_lock; - -/* mm/memory.c:exit_mmap hook */ -extern void _arch_exit_mmap(struct mm_struct *mm); -#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm) - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu_context.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/mmu_context.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,108 +0,0 @@ -#ifndef __I386_SCHED_H -#define __I386_SCHED_H - -#include <linux/config.h> -#include <asm/desc.h> -#include <asm/atomic.h> -#include <asm/pgalloc.h> -#include <asm/tlbflush.h> - -/* - * Used for LDT copy/destruction. - */ -int init_new_context(struct task_struct *tsk, struct mm_struct *mm); -void destroy_context(struct mm_struct *mm); - - -static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) -{ -#if 0 /* XEN: no lazy tlb */ - unsigned cpu = smp_processor_id(); - if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) - per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY; -#endif -} - -#define prepare_arch_switch(rq,next) __prepare_arch_switch() -#define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) -#define task_running(rq, p) ((rq)->curr == (p)) - -static inline void __prepare_arch_switch(void) -{ - /* - * Save away %fs and %gs. No need to save %es and %ds, as those - * are always kernel segments while inside the kernel. Must - * happen before reload of cr3/ldt (i.e., not in __switch_to). 
- */ - __asm__ __volatile__ ( "movl %%fs,%0 ; movl %%gs,%1" - : "=m" (*(int *)¤t->thread.fs), - "=m" (*(int *)¤t->thread.gs)); - __asm__ __volatile__ ( "movl %0,%%fs ; movl %0,%%gs" - : : "r" (0) ); -} - -extern void mm_pin(struct mm_struct *mm); -extern void mm_unpin(struct mm_struct *mm); -void mm_pin_all(void); - -static inline void switch_mm(struct mm_struct *prev, - struct mm_struct *next, - struct task_struct *tsk) -{ - int cpu = smp_processor_id(); - struct mmuext_op _op[2], *op = _op; - - if (likely(prev != next)) { - if (!next->context.pinned) - mm_pin(next); - - /* stop flush ipis for the previous mm */ - cpu_clear(cpu, prev->cpu_vm_mask); -#if 0 /* XEN: no lazy tlb */ - per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; - per_cpu(cpu_tlbstate, cpu).active_mm = next; -#endif - cpu_set(cpu, next->cpu_vm_mask); - - /* Re-load page tables: load_cr3(next->pgd) */ - per_cpu(cur_pgd, cpu) = next->pgd; - op->cmd = MMUEXT_NEW_BASEPTR; - op->mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT); - op++; - - /* - * load the LDT, if the LDT is different: - */ - if (unlikely(prev->context.ldt != next->context.ldt)) { - /* load_LDT_nolock(&next->context, cpu) */ - op->cmd = MMUEXT_SET_LDT; - op->linear_addr = (unsigned long)next->context.ldt; - op->nr_ents = next->context.size; - op++; - } - - BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF)); - } -#if 0 /* XEN: no lazy tlb */ - else { - per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK; - BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next); - - if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { - /* We were in lazy tlb mode and leave_mm disabled - * tlb flush IPI delivery. We must reload %cr3. - */ - load_cr3(next->pgd); - load_LDT_nolock(&next->context, cpu); - } - } -#endif -} - -#define deactivate_mm(tsk, mm) \ - asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0)) - -#define activate_mm(prev, next) \ - switch_mm((prev),(next),NULL) - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,203 +0,0 @@ -#ifndef _I386_PAGE_H -#define _I386_PAGE_H - -/* PAGE_SHIFT determines the page size */ -#define PAGE_SHIFT 12 -#define PAGE_SIZE (1UL << PAGE_SHIFT) -#define PAGE_MASK (~(PAGE_SIZE-1)) - -#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) -#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT) - -#ifdef __KERNEL__ -#ifndef __ASSEMBLY__ - -#include <linux/config.h> -#include <linux/string.h> -#include <linux/types.h> -#include <asm-xen/xen-public/xen.h> -#include <asm-xen/foreign_page.h> - -#define arch_free_page(_page,_order) \ -({ int foreign = PageForeign(_page); \ - if (foreign) \ - (PageForeignDestructor(_page))(_page); \ - foreign; \ -}) -#define HAVE_ARCH_FREE_PAGE - -#ifdef CONFIG_XEN_SCRUB_PAGES -#define scrub_pages(_p,_n) memset((void *)(_p), 0, (_n) << PAGE_SHIFT) -#else -#define scrub_pages(_p,_n) ((void)0) -#endif - -#ifdef CONFIG_X86_USE_3DNOW - -#include <asm/mmx.h> - -#define clear_page(page) mmx_clear_page((void *)(page)) -#define copy_page(to,from) mmx_copy_page(to,from) - -#else - -#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE - -/* - * On older X86 processors it's not a win to use MMX here it seems. - * Maybe the K6-III ? 
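The interesting part of switch_mm() above is the batching: the CR3 reload and the LDT switch are queued into one mmuext_op array and submitted with a single HYPERVISOR_mmuext_op() call, so a full address-space switch costs one hypercall instead of two. The pattern in isolation (a sketch; next_pgd, ldt_changed, ldt_va and ldt_entries stand in for the mm_struct fields used above):

    struct mmuext_op ops[2], *op = ops;

    op->cmd = MMUEXT_NEW_BASEPTR;           /* "load_cr3" by proxy            */
    op->mfn = pfn_to_mfn(__pa(next_pgd) >> PAGE_SHIFT);
    op++;

    if (ldt_changed) {                      /* only pay for the LDT if needed */
        op->cmd = MMUEXT_SET_LDT;
        op->linear_addr = ldt_va;
        op->nr_ents = ldt_entries;
        op++;
    }

    BUG_ON(HYPERVISOR_mmuext_op(ops, op - ops, NULL, DOMID_SELF));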
- */ - -#define clear_page(page) memset((void *)(page), 0, PAGE_SIZE) -#define copy_page(to,from) memcpy((void *)(to), (void *)(from), PAGE_SIZE) - -#endif - -#define clear_user_page(page, vaddr, pg) clear_page(page) -#define copy_user_page(to, from, vaddr, pg) copy_page(to, from) - -/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ -extern unsigned int *phys_to_machine_mapping; -#define pfn_to_mfn(_pfn) ((unsigned long)(phys_to_machine_mapping[(_pfn)])) -#define mfn_to_pfn(_mfn) ((unsigned long)(machine_to_phys_mapping[(_mfn)])) -static inline unsigned long phys_to_machine(unsigned long phys) -{ - unsigned long machine = pfn_to_mfn(phys >> PAGE_SHIFT); - machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK); - return machine; -} -static inline unsigned long machine_to_phys(unsigned long machine) -{ - unsigned long phys = mfn_to_pfn(machine >> PAGE_SHIFT); - phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK); - return phys; -} - -/* - * These are used to make use of C type-checking.. - */ -extern int nx_enabled; -#ifdef CONFIG_X86_PAE -extern unsigned long long __supported_pte_mask; -typedef struct { unsigned long pte_low, pte_high; } pte_t; -typedef struct { unsigned long long pmd; } pmd_t; -typedef struct { unsigned long long pgd; } pgd_t; -typedef struct { unsigned long long pgprot; } pgprot_t; -#define pmd_val(x) ((x).pmd) -#define pte_val(x) ((x).pte_low | ((unsigned long long)(x).pte_high << 32)) -#define __pmd(x) ((pmd_t) { (x) } ) -#define HPAGE_SHIFT 21 -#else -typedef struct { unsigned long pte_low; } pte_t; -typedef struct { unsigned long pgd; } pgd_t; -typedef struct { unsigned long pgprot; } pgprot_t; -#define boot_pte_t pte_t /* or would you rather have a typedef */ -#define pte_val(x) (((x).pte_low & 1) ? machine_to_phys((x).pte_low) : \ - (x).pte_low) -#define pte_val_ma(x) ((x).pte_low) -#define HPAGE_SHIFT 22 -#endif -#define PTE_MASK PAGE_MASK - -#ifdef CONFIG_HUGETLB_PAGE -#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT) -#define HPAGE_MASK (~(HPAGE_SIZE - 1)) -#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) -#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA -#endif - - -static inline unsigned long pgd_val(pgd_t x) -{ - unsigned long ret = x.pgd; - if (ret) ret = machine_to_phys(ret); - return ret; -} -#define pgprot_val(x) ((x).pgprot) - -#define __pte(x) ({ unsigned long _x = (x); \ - (((_x)&1) ? ((pte_t) {phys_to_machine(_x)}) : ((pte_t) {(_x)})); }) -#define __pte_ma(x) ((pte_t) { (x) } ) -#define __pgd(x) ({ unsigned long _x = (x); \ - (((_x)&1) ? ((pgd_t) {phys_to_machine(_x)}) : ((pgd_t) {(_x)})); }) -#define __pgprot(x) ((pgprot_t) { (x) } ) - -#endif /* !__ASSEMBLY__ */ - -/* to align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) - -/* - * This handles the memory map.. We could make this a config - * option, but too many people screw it up, and too few need - * it. - * - * A __PAGE_OFFSET of 0xC0000000 means that the kernel has - * a virtual address space of one gigabyte, which limits the - * amount of physical memory you can use to about 950MB. - * - * If you want more physical memory than this then see the CONFIG_HIGHMEM4G - * and CONFIG_HIGHMEM64G options in the kernel configuration. - */ - -#ifndef __ASSEMBLY__ - -/* - * This much address space is reserved for vmalloc() and iomap() - * as well as fixmap mappings. 
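phys_to_machine()/machine_to_phys() above translate only the frame number and carry the in-page offset across unchanged. A worked example with a made-up p2m entry:

    /* suppose phys_to_machine_mapping[0x123] == 0x4567 (hypothetical) */
    unsigned long phys    = 0x00123ABCUL;
    unsigned long mfn     = pfn_to_mfn(phys >> PAGE_SHIFT);       /* 0x4567     */
    unsigned long machine = (mfn << PAGE_SHIFT) | (phys & ~PAGE_MASK);
                                                                  /* 0x04567ABC */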
- */ -extern unsigned int __VMALLOC_RESERVE; - -/* Pure 2^n version of get_order */ -static __inline__ int get_order(unsigned long size) -{ - int order; - - size = (size-1) >> (PAGE_SHIFT-1); - order = -1; - do { - size >>= 1; - order++; - } while (size); - return order; -} - -extern int sysctl_legacy_va_layout; - -#endif /* __ASSEMBLY__ */ - -#ifdef __ASSEMBLY__ -#define __PAGE_OFFSET (0xC0000000) -#else -#define __PAGE_OFFSET (0xC0000000UL) -#endif - - -#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) -#define VMALLOC_RESERVE ((unsigned long)__VMALLOC_RESERVE) -#define MAXMEM (HYPERVISOR_VIRT_START-__PAGE_OFFSET-__VMALLOC_RESERVE) -#define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) -#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) -#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) -#ifndef CONFIG_DISCONTIGMEM -#define pfn_to_page(pfn) (mem_map + (pfn)) -#define page_to_pfn(page) ((unsigned long)((page) - mem_map)) -#define pfn_valid(pfn) ((pfn) < max_mapnr) -#endif /* !CONFIG_DISCONTIGMEM */ -#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) - -#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) - -#define VM_DATA_DEFAULT_FLAGS \ - (VM_READ | VM_WRITE | \ - ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) - -/* VIRT <-> MACHINE conversion */ -#define virt_to_machine(_a) (phys_to_machine(__pa(_a))) -#define machine_to_virt(_m) (__va(machine_to_phys(_m))) - -#endif /* __KERNEL__ */ - -#endif /* _I386_PAGE_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/param.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/param.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,23 +0,0 @@ -#ifndef _ASMi386_PARAM_H -#define _ASMi386_PARAM_H - -#ifdef __KERNEL__ -# define HZ 100 /* Internal kernel timer frequency */ -# define USER_HZ 100 /* .. some user interfaces are in "ticks" */ -# define CLOCKS_PER_SEC (USER_HZ) /* like times() */ -#endif - -#ifndef HZ -#define HZ 100 -#endif - -#define EXEC_PAGESIZE 4096 - -#ifndef NOGROUP -#define NOGROUP (-1) -#endif - -#define MAXHOSTNAMELEN 64 /* max length of hostname */ -#define COMMAND_LINE_SIZE 256 - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pci.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pci.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,117 +0,0 @@ -#ifndef __i386_PCI_H -#define __i386_PCI_H - -#include <linux/config.h> - -#ifdef __KERNEL__ -#include <linux/mm.h> /* for struct page */ - -/* Can be used to override the logic in pci_scan_bus for skipping - already-configured bus numbers - to be used for buggy BIOSes - or architectures with incomplete PCI setup by the loader */ - -#ifdef CONFIG_PCI -extern unsigned int pcibios_assign_all_busses(void); -#else -#define pcibios_assign_all_busses() 0 -#endif -#define pcibios_scan_all_fns(a, b) 0 - -extern unsigned long pci_mem_start; -#define PCIBIOS_MIN_IO 0x1000 -#define PCIBIOS_MIN_MEM (pci_mem_start) - -#define PCIBIOS_MIN_CARDBUS_IO 0x4000 - -void pcibios_config_init(void); -struct pci_bus * pcibios_scan_root(int bus); - -void pcibios_set_master(struct pci_dev *dev); -void pcibios_penalize_isa_irq(int irq); -struct irq_routing_table *pcibios_get_irq_routing_table(void); -int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); - -/* Dynamic DMA mapping stuff. - * i386 has everything mapped statically. 
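get_order() above computes the smallest order n such that 2^n pages cover the requested size. A few sample values, assuming the 4 KB PAGE_SIZE defined earlier in this header:

    get_order(1);       /* -> 0: one 4 KB page          */
    get_order(4096);    /* -> 0                          */
    get_order(4097);    /* -> 1: needs two pages (8 KB)  */
    get_order(32768);   /* -> 3: eight pages             */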
- */ - -#include <linux/types.h> -#include <linux/slab.h> -#include <asm/scatterlist.h> -#include <linux/string.h> -#include <asm/io.h> - -struct pci_dev; - -/* The PCI address space does equal the physical memory - * address space. The networking and block device layers use - * this boolean for bounce buffer decisions. - */ -#define PCI_DMA_BUS_IS_PHYS (1) - -/* pci_unmap_{page,single} is a nop so... */ -#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) -#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) -#define pci_unmap_addr(PTR, ADDR_NAME) (0) -#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0) -#define pci_unmap_len(PTR, LEN_NAME) (0) -#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0) - -/* This is always fine. */ -#define pci_dac_dma_supported(pci_dev, mask) (1) - -static inline dma64_addr_t -pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction) -{ - return ((dma64_addr_t) page_to_phys(page) + - (dma64_addr_t) offset); -} - -static inline struct page * -pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr) -{ - return pfn_to_page(dma_addr >> PAGE_SHIFT); -} - -static inline unsigned long -pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr) -{ - return (dma_addr & ~PAGE_MASK); -} - -static inline void -pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction) -{ -} - -static inline void -pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction) -{ - flush_write_buffers(); -} - -#define HAVE_PCI_MMAP -extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, - enum pci_mmap_state mmap_state, int write_combine); - - -static inline void pcibios_add_platform_entries(struct pci_dev *dev) -{ -} - -#endif /* __KERNEL__ */ - -/* implement the pci_ DMA API in terms of the generic device dma_ one */ -#include <asm-generic/pci-dma-compat.h> - -/* generic pci stuff */ -#include <asm-generic/pci.h> - -/* On Xen we have to scan all functions since Xen hides bridges from - * us. If a bridge is at fn=0 and that slot has a multifunction - * device, we won't find the additional devices without scanning all - * functions. */ -#undef pcibios_scan_all_fns -#define pcibios_scan_all_fns(a, b) 1 - -#endif /* __i386_PCI_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgalloc.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgalloc.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,62 +0,0 @@ -#ifndef _I386_PGALLOC_H -#define _I386_PGALLOC_H - -#include <linux/config.h> -#include <asm/processor.h> -#include <asm/fixmap.h> -#include <linux/threads.h> -#include <linux/mm.h> /* for struct page */ -#include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */ - -#define pmd_populate_kernel(mm, pmd, pte) \ - set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))) - -#define pmd_populate(mm, pmd, pte) \ -do { \ - if (unlikely((mm)->context.pinned)) { \ - if (!PageHighMem(pte)) \ - HYPERVISOR_update_va_mapping( \ - (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT),\ - pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0);\ - set_pmd(pmd, __pmd(_PAGE_TABLE + \ - ((unsigned long long)page_to_pfn(pte) << \ - (unsigned long long) PAGE_SHIFT))); \ - } else { \ - *(pmd) = __pmd(_PAGE_TABLE + \ - ((unsigned long long)page_to_pfn(pte) << \ - (unsigned long long) PAGE_SHIFT)); \ - } \ -} while (0) - -/* - * Allocate and free page tables. 
- */ -extern pgd_t *pgd_alloc(struct mm_struct *); -extern void pgd_free(pgd_t *pgd); - -extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long); -extern struct page *pte_alloc_one(struct mm_struct *, unsigned long); - -static inline void pte_free_kernel(pte_t *pte) -{ - free_page((unsigned long)pte); - make_page_writable(pte); -} - -extern void pte_free(struct page *pte); - -#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) - -#ifdef CONFIG_X86_PAE -/* - * In the PAE case we free the pmds as part of the pgd. - */ -#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) -#define pmd_free(x) do { } while (0) -#define __pmd_free_tlb(tlb,x) do { } while (0) -#define pud_populate(mm, pmd, pte) BUG() -#endif - -#define check_pgt_cache() do { } while (0) - -#endif /* _I386_PGALLOC_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable-2level-defs.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable-2level-defs.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,19 +0,0 @@ -#ifndef _I386_PGTABLE_2LEVEL_DEFS_H -#define _I386_PGTABLE_2LEVEL_DEFS_H - -/* - * traditional i386 two-level paging structure: - */ - -#define PGDIR_SHIFT 22 -#define PTRS_PER_PGD 1024 -#define PTRS_PER_PGD_NO_HV (HYPERVISOR_VIRT_START >> PGDIR_SHIFT) - -/* - * the i386 is two-level, so we don't really have any - * PMD directory physically. - */ - -#define PTRS_PER_PTE 1024 - -#endif /* _I386_PGTABLE_2LEVEL_DEFS_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable-2level.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable-2level.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,106 +0,0 @@ -#ifndef _I386_PGTABLE_2LEVEL_H -#define _I386_PGTABLE_2LEVEL_H - -#include <asm-generic/pgtable-nopmd.h> - -#define pte_ERROR(e) \ - printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low) -#define pgd_ERROR(e) \ - printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) - -/* - * Certain architectures need to do special things when PTEs - * within a page table are directly modified. Thus, the following - * hook is made available. - */ -#define set_pte(pteptr, pteval) (*(pteptr) = pteval) -#define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval) - -#ifndef CONFIG_XEN_SHADOW_MODE -#define set_pmd(pmdptr, pmdval) xen_l2_entry_update((pmdptr), (pmdval)) -#else -#define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval)) -#endif - -#define ptep_get_and_clear(xp) __pte_ma(xchg(&(xp)->pte_low, 0)) -#define pte_same(a, b) ((a).pte_low == (b).pte_low) -/* - * We detect special mappings in one of two ways: - * 1. If the MFN is an I/O page then Xen will set the m2p entry - * to be outside our maximum possible pseudophys range. - * 2. If the MFN belongs to a different domain then we will certainly - * not have MFN in our p2m table. Conversely, if the page is ours, - * then we'll have p2m(m2p(MFN))==MFN. - * If we detect a special mapping then it doesn't have a 'struct page'. - * We force !pfn_valid() by returning an out-of-range pointer. - * - * NB. These checks require that, for any MFN that is not in our reservation, - * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if - * we are foreign-mapping the MFN, and the other domain as m2p(MFN) == PFN. - * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety. - * - * NB2. When deliberately mapping foreign pages into the p2m table, you *must* - * use FOREIGN_FRAME(). 
This will cause pte_pfn() to choke on it, as we - * require. In all the cases we care about, the high bit gets shifted out - * (e.g., phys_to_machine()) so behaviour there is correct. - */ -#define INVALID_P2M_ENTRY (~0U) -#define FOREIGN_FRAME(_m) ((_m) | (1UL<<((sizeof(unsigned long)*8)-1))) -#define pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT) -#define pte_pfn(_pte) \ -({ \ - unsigned long mfn = pte_mfn(_pte); \ - unsigned long pfn = mfn_to_pfn(mfn); \ - if ((pfn >= max_mapnr) || (pfn_to_mfn(pfn) != mfn)) \ - pfn = max_mapnr; /* special: force !pfn_valid() */ \ - pfn; \ -}) - -#define pte_page(_pte) pfn_to_page(pte_pfn(_pte)) - -#define pte_none(x) (!(x).pte_low) -#define pfn_pte(pfn, prot) __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) -#define pfn_pte_ma(pfn, prot) __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) -#define pfn_pmd(pfn, prot) __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) - -#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) - -#define pmd_page_kernel(pmd) \ -((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) - -/* - * All present user pages are user-executable: - */ -static inline int pte_exec(pte_t pte) -{ - return pte_user(pte); -} - -/* - * All present pages are kernel-executable: - */ -static inline int pte_exec_kernel(pte_t pte) -{ - return 1; -} - -/* - * Bits 0, 6 and 7 are taken, split up the 29 bits of offset - * into this range: - */ -#define PTE_FILE_MAX_BITS 29 - -#define pte_to_pgoff(pte) \ - ((((pte).pte_low >> 1) & 0x1f ) + (((pte).pte_low >> 8) << 5 )) - -#define pgoff_to_pte(off) \ - ((pte_t) { (((off) & 0x1f) << 1) + (((off) >> 5) << 8) + _PAGE_FILE }) - -/* Encode and de-code a swap entry */ -#define __swp_type(x) (((x).val >> 1) & 0x1f) -#define __swp_offset(x) ((x).val >> 8) -#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) }) -#define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low }) -#define __swp_entry_to_pte(x) ((pte_t) { (x).val }) - -#endif /* _I386_PGTABLE_2LEVEL_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,494 +0,0 @@ -#ifndef _I386_PGTABLE_H -#define _I386_PGTABLE_H - -#include <linux/config.h> -#include <asm-xen/hypervisor.h> - -/* - * The Linux memory management assumes a three-level page table setup. On - * the i386, we use that, but "fold" the mid level into the top-level page - * table, so that we physically have the same two-level page table as the - * i386 mmu expects. - * - * This file contains the functions and defines necessary to modify and use - * the i386 page table tree. - */ -#ifndef __ASSEMBLY__ -#include <asm/processor.h> -#include <asm/fixmap.h> -#include <linux/threads.h> - -#ifndef _I386_BITOPS_H -#include <asm/bitops.h> -#endif - -#include <linux/slab.h> -#include <linux/list.h> -#include <linux/spinlock.h> - -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. 
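The pte_pfn() macro above implements the ownership test that the long comment describes: an MFN belongs to this domain only if the m2p and p2m tables round-trip. As a standalone predicate (a sketch built from the page.h helpers in this patch):

    static int mfn_is_ours(unsigned long mfn)
    {
        unsigned long pfn = mfn_to_pfn(mfn);    /* m2p lookup                */
        /* an out-of-range pfn, or a p2m entry naming a different MFN,
         * means an I/O page or another domain's memory */
        return (pfn < max_mapnr) && (pfn_to_mfn(pfn) == mfn);
    }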
- */ -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) -extern unsigned long empty_zero_page[1024]; -extern pgd_t swapper_pg_dir[1024]; -extern kmem_cache_t *pgd_cache; -extern kmem_cache_t *pmd_cache; -extern spinlock_t pgd_lock; -extern struct page *pgd_list; - -void pmd_ctor(void *, kmem_cache_t *, unsigned long); -void pgd_ctor(void *, kmem_cache_t *, unsigned long); -void pgd_dtor(void *, kmem_cache_t *, unsigned long); -void pgtable_cache_init(void); -void paging_init(void); - -/* - * The Linux x86 paging architecture is 'compile-time dual-mode', it - * implements both the traditional 2-level x86 page tables and the - * newer 3-level PAE-mode page tables. - */ -#ifdef CONFIG_X86_PAE -# include <asm/pgtable-3level-defs.h> -# define PMD_SIZE (1UL << PMD_SHIFT) -# define PMD_MASK (~(PMD_SIZE-1)) -#else -# include <asm/pgtable-2level-defs.h> -#endif - -#define PGDIR_SIZE (1UL << PGDIR_SHIFT) -#define PGDIR_MASK (~(PGDIR_SIZE-1)) - -#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) -#define FIRST_USER_PGD_NR 0 - -#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) -#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS) - -#define TWOLEVEL_PGDIR_SHIFT 22 -#define BOOT_USER_PGD_PTRS (__PAGE_OFFSET >> TWOLEVEL_PGDIR_SHIFT) -#define BOOT_KERNEL_PGD_PTRS (1024-BOOT_USER_PGD_PTRS) - -/* Just any arbitrary offset to the start of the vmalloc VM area: the - * current 8MB value just means that there will be a 8MB "hole" after the - * physical memory until the kernel virtual memory starts. That means that - * any out-of-bounds memory accesses will hopefully be caught. - * The vmalloc() routines leaves a hole of 4kB between each vmalloced - * area for the same reason. ;) - */ -#define VMALLOC_OFFSET (8*1024*1024) -#define VMALLOC_START (((unsigned long) high_memory + vmalloc_earlyreserve + \ - 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1)) -#ifdef CONFIG_HIGHMEM -# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE) -#else -# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE) -#endif - -/* - * The 4MB page is guessing.. Detailed in the infamous "Chapter H" - * of the Pentium details, but assuming intel did the straightforward - * thing, this bit set in the page directory entry just means that - * the page directory entry points directly to a 4MB-aligned block of - * memory. - */ -#define _PAGE_BIT_PRESENT 0 -#define _PAGE_BIT_RW 1 -#define _PAGE_BIT_USER 2 -#define _PAGE_BIT_PWT 3 -#define _PAGE_BIT_PCD 4 -#define _PAGE_BIT_ACCESSED 5 -#define _PAGE_BIT_DIRTY 6 -#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page, Pentium+, if present.. */ -#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ -#define _PAGE_BIT_UNUSED1 9 /* available for programmer */ -#define _PAGE_BIT_UNUSED2 10 -#define _PAGE_BIT_UNUSED3 11 -#define _PAGE_BIT_NX 63 - -#define _PAGE_PRESENT 0x001 -#define _PAGE_RW 0x002 -#define _PAGE_USER 0x004 -#define _PAGE_PWT 0x008 -#define _PAGE_PCD 0x010 -#define _PAGE_ACCESSED 0x020 -#define _PAGE_DIRTY 0x040 -#define _PAGE_PSE 0x080 /* 4 MB (or 2MB) page, Pentium+, if present.. 
*/ -#define _PAGE_GLOBAL 0x100 /* Global TLB entry PPro+ */ -#define _PAGE_UNUSED1 0x200 /* available for programmer */ -#define _PAGE_UNUSED2 0x400 -#define _PAGE_UNUSED3 0x800 - -#define _PAGE_FILE 0x040 /* set:pagecache unset:swap */ -#define _PAGE_PROTNONE 0x080 /* If not present */ -#ifdef CONFIG_X86_PAE -#define _PAGE_NX (1ULL<<_PAGE_BIT_NX) -#else -#define _PAGE_NX 0 -#endif - -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) -#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) -#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY) - -#define PAGE_NONE \ - __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) -#define PAGE_SHARED \ - __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED) - -#define PAGE_SHARED_EXEC \ - __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED) -#define PAGE_COPY_NOEXEC \ - __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) -#define PAGE_COPY_EXEC \ - __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) -#define PAGE_COPY \ - PAGE_COPY_NOEXEC -#define PAGE_READONLY \ - __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) -#define PAGE_READONLY_EXEC \ - __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) - -#define _PAGE_KERNEL \ - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX) -#define _PAGE_KERNEL_EXEC \ - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED) - -extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC; -#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) -#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD) -#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) -#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) - -#define PAGE_KERNEL __pgprot(__PAGE_KERNEL) -#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) -#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) -#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) -#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) -#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) - -/* - * The i386 can't do page protection for execute, and considers that - * the same are read. Also, write permissions imply read permissions. - * This is the closest we can get.. - */ -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY -#define __P100 PAGE_READONLY_EXEC -#define __P101 PAGE_READONLY_EXEC -#define __P110 PAGE_COPY_EXEC -#define __P111 PAGE_COPY_EXEC - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED -#define __S100 PAGE_READONLY_EXEC -#define __S101 PAGE_READONLY_EXEC -#define __S110 PAGE_SHARED_EXEC -#define __S111 PAGE_SHARED_EXEC - -/* - * Define this if things work differently on an i386 and an i486: - * it will (on an i486) warn about kernel memory accesses that are - * done without a 'verify_area(VERIFY_WRITE,..)' - */ -#undef TEST_VERIFY_AREA - -/* The boot page tables (all created as a single array) */ -extern unsigned long pg0[]; - -#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE)) -#define pte_clear(xp) do { set_pte(xp, __pte(0)); } while (0) - -#define pmd_none(x) (!pmd_val(x)) -/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t. - can temporarily clear it. 
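The composite protections defined here are plain ORs of the single-bit flags above; for instance _PAGE_TABLE works out as:

    /* _PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY  */
    /*     0x001     |  0x002   |   0x004    |     0x020      |    0x040     */
    /* == 0x067: the flags a pgd/pmd entry pointing at a user page table
     * carries                                                               */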
*/ -#define pmd_present(x) (pmd_val(x)) -#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) -#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT)) - - -#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) - -/* - * The following only work if pte_present() is true. - * Undefined behaviour if not.. - */ -static inline int pte_user(pte_t pte) { return (pte).pte_low & _PAGE_USER; } -static inline int pte_read(pte_t pte) { return (pte).pte_low & _PAGE_USER; } -static inline int pte_dirty(pte_t pte) { return (pte).pte_low & _PAGE_DIRTY; } -static inline int pte_young(pte_t pte) { return (pte).pte_low & _PAGE_ACCESSED; } -static inline int pte_write(pte_t pte) { return (pte).pte_low & _PAGE_RW; } - -/* - * The following only works if pte_present() is not true. - */ -static inline int pte_file(pte_t pte) { return (pte).pte_low & _PAGE_FILE; } - -static inline pte_t pte_rdprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_USER; return pte; } -static inline pte_t pte_exprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_USER; return pte; } -static inline pte_t pte_mkclean(pte_t pte) { (pte).pte_low &= ~_PAGE_DIRTY; return pte; } -static inline pte_t pte_mkold(pte_t pte) { (pte).pte_low &= ~_PAGE_ACCESSED; return pte; } -static inline pte_t pte_wrprotect(pte_t pte) { (pte).pte_low &= ~_PAGE_RW; return pte; } -static inline pte_t pte_mkread(pte_t pte) { (pte).pte_low |= _PAGE_USER; return pte; } -static inline pte_t pte_mkexec(pte_t pte) { (pte).pte_low |= _PAGE_USER; return pte; } -static inline pte_t pte_mkdirty(pte_t pte) { (pte).pte_low |= _PAGE_DIRTY; return pte; } -static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; return pte; } -static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; } - -#ifdef CONFIG_X86_PAE -# include <asm/pgtable-3level.h> -#else -# include <asm/pgtable-2level.h> -#endif - -static inline int ptep_test_and_clear_dirty(pte_t *ptep) -{ - if (!pte_dirty(*ptep)) - return 0; - return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte_low); -} - -static inline int ptep_test_and_clear_young(pte_t *ptep) -{ - if (!pte_young(*ptep)) - return 0; - return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte_low); -} - -static inline void ptep_set_wrprotect(pte_t *ptep) -{ - if (pte_write(*ptep)) - clear_bit(_PAGE_BIT_RW, &ptep->pte_low); -} - -static inline void ptep_mkdirty(pte_t *ptep) -{ - if (!pte_dirty(*ptep)) - set_bit(_PAGE_BIT_DIRTY, &ptep->pte_low); -} - -/* - * Macro to mark a page protection value as "uncacheable". On processors which do not support - * it, this is a no-op. - */ -#define pgprot_noncached(prot) ((boot_cpu_data.x86 > 3) \ - ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) : (prot)) - -/* - * Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. 
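The pte_mk*/pte_*protect helpers above are pure value transformers: each edits the flag bits of a pte_t copy and returns it, and nothing reaches the live page tables until the result is written back through set_pte() (which Xen validates). A write-protect step, of the kind used for copy-on-write, could look like:

    pte_t pte = *ptep;          /* snapshot the live entry  */
    pte = pte_mkclean(pte);     /* clear _PAGE_DIRTY        */
    pte = pte_wrprotect(pte);   /* clear _PAGE_RW           */
    set_pte(ptep, pte);         /* one visible update       */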
- */ - -#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) -#define mk_pte_huge(entry) ((entry).pte_low |= _PAGE_PRESENT | _PAGE_PSE) - -static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) -{ - pte.pte_low &= _PAGE_CHG_MASK; - pte.pte_low |= pgprot_val(newprot); -#ifdef CONFIG_X86_PAE - /* - * Chop off the NX bit (if present), and add the NX portion of - * the newprot (if present): - */ - pte.pte_high &= ~(1 << (_PAGE_BIT_NX - 32)); - pte.pte_high |= (pgprot_val(newprot) >> 32) & \ - (__supported_pte_mask >> 32); -#endif - return pte; -} - -#define page_pte(page) page_pte_prot(page, __pgprot(0)) - -#define pmd_large(pmd) \ -((pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT)) - -/* - * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD] - * - * this macro returns the index of the entry in the pgd page which would - * control the given virtual address - */ -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) -#define pgd_index_k(addr) pgd_index(addr) - -/* - * pgd_offset() returns a (pgd_t *) - * pgd_index() is used get the offset into the pgd page's array of pgd_t's; - */ -#define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address)) - -/* - * a shortcut which implies the use of the kernel's pgd, instead - * of a process's - */ -#define pgd_offset_k(address) pgd_offset(&init_mm, address) - -/* - * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] - * - * this macro returns the index of the entry in the pmd page which would - * control the given virtual address - */ -#define pmd_index(address) \ - (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) - -/* - * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE] - * - * this macro returns the index of the entry in the pte page which would - * control the given virtual address - */ -#define pte_index(address) \ - (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) -#define pte_offset_kernel(dir, address) \ - ((pte_t *) pmd_page_kernel(*(dir)) + pte_index(address)) - -/* - * Helper function that returns the kernel pagetable entry controlling - * the virtual address 'address'. NULL means no pagetable entry present. - * NOTE: the return type is pte_t but if the pmd is PSE then we return it - * as a pte too. - */ -extern pte_t *lookup_address(unsigned long address); - -/* - * Make a given kernel text page executable/non-executable. - * Returns the previous executability setting of that page (which - * is used to restore the previous state). Used by the SMP bootup code. - * NOTE: this is an __init function for security reasons. 
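With the two-level layout (PGDIR_SHIFT 22, PTRS_PER_PGD and PTRS_PER_PTE both 1024), the index macros above just slice a virtual address into fields. For a hypothetical kernel address:

    unsigned long addr = 0xC0101234UL;

    pgd_index(addr);   /* addr >> 22          = 0x300 (first kernel pgd slot) */
    pte_index(addr);   /* (addr >> 12) & 1023 = 0x101                         */
    /* the low 12 bits, 0x234, are the byte offset inside the 4 KB page */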
- */ -#ifdef CONFIG_X86_PAE - extern int set_kernel_exec(unsigned long vaddr, int enable); -#else - static inline int set_kernel_exec(unsigned long vaddr, int enable) { return 0;} -#endif - -extern void noexec_setup(const char *str); - -#if defined(CONFIG_HIGHPTE) -#define pte_offset_map(dir, address) \ - ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + \ - pte_index(address)) -#define pte_offset_map_nested(dir, address) \ - ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + \ - pte_index(address)) -#define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) -#define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1) -#else -#define pte_offset_map(dir, address) \ - ((pte_t *)page_address(pmd_page(*(dir))) + pte_index(address)) -#define pte_offset_map_nested(dir, address) pte_offset_map(dir, address) -#define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) -#endif - -/* - * The i386 doesn't have any external MMU info: the kernel page - * tables contain all the necessary information. - * - * Also, we only update the dirty/accessed state if we set - * the dirty bit by hand in the kernel, since the hardware - * will do the accessed bit for us, and we don't want to - * race with other CPU's that might be updating the dirty - * bit at the same time. - */ -#define update_mmu_cache(vma,address,pte) do { } while (0) -#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS -#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \ - do { \ - if (__dirty) { \ - if ( likely((__vma)->vm_mm == current->mm) ) { \ - HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG|UVMF_MULTI|(unsigned long)((__vma)->vm_mm->cpu_vm_mask.bits)); \ - } else { \ - xen_l1_entry_update((__ptep), (__entry).pte_low); \ - flush_tlb_page((__vma), (__address)); \ - } \ - } \ - } while (0) - -#define __HAVE_ARCH_PTEP_ESTABLISH -#define ptep_establish(__vma, __address, __ptep, __entry) \ -do { \ - ptep_set_access_flags(__vma, __address, __ptep, __entry, 1); \ -} while (0) - -#define __HAVE_ARCH_PTEP_ESTABLISH_NEW -#define ptep_establish_new(__vma, __address, __ptep, __entry) \ -do { \ - if (likely((__vma)->vm_mm == current->mm)) { \ - HYPERVISOR_update_va_mapping((__address), \ - __entry, 0); \ - } else { \ - xen_l1_entry_update((__ptep), (__entry).pte_low); \ - } \ -} while (0) - -#ifndef CONFIG_XEN_SHADOW_MODE -void make_lowmem_page_readonly(void *va); -void make_lowmem_page_writable(void *va); -void make_page_readonly(void *va); -void make_page_writable(void *va); -void make_pages_readonly(void *va, unsigned int nr); -void make_pages_writable(void *va, unsigned int nr); -#else -#define make_lowmem_page_readonly(_va) ((void)0) -#define make_lowmem_page_writable(_va) ((void)0) -#define make_page_readonly(_va) ((void)0) -#define make_page_writable(_va) ((void)0) -#define make_pages_readonly(_va, _nr) ((void)0) -#define make_pages_writable(_va, _nr) ((void)0) -#endif - -#define virt_to_ptep(__va) \ -({ \ - pgd_t *__pgd = pgd_offset_k((unsigned long)(__va)); \ - pud_t *__pud = pud_offset(__pgd, (unsigned long)(__va)); \ - pmd_t *__pmd = pmd_offset(__pud, (unsigned long)(__va)); \ - pte_offset_kernel(__pmd, (unsigned long)(__va)); \ -}) - -#define arbitrary_virt_to_machine(__va) \ -({ \ - pte_t *__pte = virt_to_ptep(__va); \ - unsigned long __pa = (*(unsigned long *)__pte) & PAGE_MASK; \ - __pa | ((unsigned long)(__va) & (PAGE_SIZE-1)); \ -}) - -#endif /* !__ASSEMBLY__ */ - -#ifndef CONFIG_DISCONTIGMEM -#define kern_addr_valid(addr) (1) -#endif /* !CONFIG_DISCONTIGMEM */ - -int 
direct_remap_area_pages(struct mm_struct *mm, - unsigned long address, - unsigned long machine_addr, - unsigned long size, - pgprot_t prot, - domid_t domid); -int __direct_remap_area_pages(struct mm_struct *mm, - unsigned long address, - unsigned long size, - mmu_update_t *v); - -#define io_remap_page_range(vma,from,phys,size,prot) \ -direct_remap_area_pages(vma->vm_mm,from,phys,size,prot,DOMID_IO) - -#define io_remap_pfn_range(vma,from,pfn,size,prot) \ -direct_remap_area_pages(vma->vm_mm,from,pfn<<PAGE_SHIFT,size,prot,DOMID_IO) - -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR -#define __HAVE_ARCH_PTEP_SET_WRPROTECT -#define __HAVE_ARCH_PTEP_MKDIRTY -#define __HAVE_ARCH_PTE_SAME -#include <asm-generic/pgtable.h> - -#endif /* _I386_PGTABLE_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/processor.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/processor.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,684 +0,0 @@ -/* - * include/asm-i386/processor.h - * - * Copyright (C) 1994 Linus Torvalds - */ - -#ifndef __ASM_I386_PROCESSOR_H -#define __ASM_I386_PROCESSOR_H - -#include <asm/vm86.h> -#include <asm/math_emu.h> -#include <asm/segment.h> -#include <asm/page.h> -#include <asm/types.h> -#include <asm/sigcontext.h> -#include <asm/cpufeature.h> -#include <asm/msr.h> -#include <asm/system.h> -#include <linux/cache.h> -#include <linux/config.h> -#include <linux/threads.h> -#include <asm/percpu.h> - -/* flag for disabling the tsc */ -extern int tsc_disable; - -struct desc_struct { - unsigned long a,b; -}; - -#define desc_empty(desc) \ - (!((desc)->a + (desc)->b)) - -#define desc_equal(desc1, desc2) \ - (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b)) -/* - * Default implementation of macro that returns current - * instruction pointer ("program counter"). - */ -#define current_text_addr() ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; }) - -/* - * CPU type and hardware bug flags. Kept separately for each CPU. - * Members of this structure are referenced in head.S, so think twice - * before touching them. 
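Under Xen, io_remap_pfn_range() above funnels into direct_remap_area_pages() with DOMID_IO, so mappings of machine I/O frames are installed via the hypervisor rather than by writing ptes directly. A hypothetical driver mmap() handler uses it in the usual way (the device frame number is made up):

    static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
    {
        unsigned long pfn = 0xFEBF0;        /* hypothetical MMIO frame */

        return io_remap_pfn_range(vma, vma->vm_start, pfn,
                                  vma->vm_end - vma->vm_start,
                                  vma->vm_page_prot);
    }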
[mj] - */ - -struct cpuinfo_x86 { - __u8 x86; /* CPU family */ - __u8 x86_vendor; /* CPU vendor */ - __u8 x86_model; - __u8 x86_mask; - char wp_works_ok; /* It doesn't on 386's */ - char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */ - char hard_math; - char rfu; - int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */ - unsigned long x86_capability[NCAPINTS]; - char x86_vendor_id[16]; - char x86_model_id[64]; - int x86_cache_size; /* in KB - valid for CPUs which support this - call */ - int x86_cache_alignment; /* In bytes */ - int fdiv_bug; - int f00f_bug; - int coma_bug; - unsigned long loops_per_jiffy; - unsigned char x86_num_cores; -} __attribute__((__aligned__(SMP_CACHE_BYTES))); - -#define X86_VENDOR_INTEL 0 -#define X86_VENDOR_CYRIX 1 -#define X86_VENDOR_AMD 2 -#define X86_VENDOR_UMC 3 -#define X86_VENDOR_NEXGEN 4 -#define X86_VENDOR_CENTAUR 5 -#define X86_VENDOR_RISE 6 -#define X86_VENDOR_TRANSMETA 7 -#define X86_VENDOR_NSC 8 -#define X86_VENDOR_NUM 9 -#define X86_VENDOR_UNKNOWN 0xff - -/* - * capabilities of CPUs - */ - -extern struct cpuinfo_x86 boot_cpu_data; -extern struct cpuinfo_x86 new_cpu_data; -extern struct tss_struct doublefault_tss; -DECLARE_PER_CPU(struct tss_struct, init_tss); -DECLARE_PER_CPU(pgd_t *, cur_pgd); - -#ifdef CONFIG_SMP -extern struct cpuinfo_x86 cpu_data[]; -#define current_cpu_data cpu_data[smp_processor_id()] -#else -#define cpu_data (&boot_cpu_data) -#define current_cpu_data boot_cpu_data -#endif - -extern int phys_proc_id[NR_CPUS]; -extern char ignore_fpu_irq; - -extern void identify_cpu(struct cpuinfo_x86 *); -extern void print_cpu_info(struct cpuinfo_x86 *); -extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); -extern void dodgy_tsc(void); - -#ifdef CONFIG_X86_HT -extern void detect_ht(struct cpuinfo_x86 *c); -#else -static inline void detect_ht(struct cpuinfo_x86 *c) {} -#endif - -/* - * EFLAGS bits - */ -#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ -#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ -#define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */ -#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ -#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ -#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ -#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */ -#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */ -#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */ -#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */ -#define X86_EFLAGS_NT 0x00004000 /* Nested Task */ -#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */ -#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */ -#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */ -#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */ -#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ -#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ - -/* - * Generic CPUID function - * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx - * resulting in stale register contents being returned.
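A minimal user-space sketch of the cpuid() helper defined just below, reading the vendor string that leaf 0 returns in EBX:EDX:ECX (the explicit ECX-clearing input matters on the Cyrix MII, as the comment notes). This is an illustration, not code from the tree:

#include <stdio.h>
#include <string.h>

static void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx)
{
    __asm__("cpuid"
            : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
            : "0" (op), "c" (0));   /* clear %ecx up front */
}

int main(void)
{
    int eax, ebx, ecx, edx;
    char vendor[13];

    cpuid(0, &eax, &ebx, &ecx, &edx);
    memcpy(vendor + 0, &ebx, 4);
    memcpy(vendor + 4, &edx, 4);
    memcpy(vendor + 8, &ecx, 4);
    vendor[12] = '\0';
    printf("vendor: %s\n", vendor);  /* e.g. "GenuineIntel" */
    return 0;
}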
- */ -static inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx) -{ - __asm__("cpuid" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (op), "c"(0)); -} - -/* - * CPUID functions returning a single datum - */ -static inline unsigned int cpuid_eax(unsigned int op) -{ - unsigned int eax; - - __asm__("cpuid" - : "=a" (eax) - : "0" (op) - : "bx", "cx", "dx"); - return eax; -} -static inline unsigned int cpuid_ebx(unsigned int op) -{ - unsigned int eax, ebx; - - __asm__("cpuid" - : "=a" (eax), "=b" (ebx) - : "0" (op) - : "cx", "dx" ); - return ebx; -} -static inline unsigned int cpuid_ecx(unsigned int op) -{ - unsigned int eax, ecx; - - __asm__("cpuid" - : "=a" (eax), "=c" (ecx) - : "0" (op) - : "bx", "dx" ); - return ecx; -} -static inline unsigned int cpuid_edx(unsigned int op) -{ - unsigned int eax, edx; - - __asm__("cpuid" - : "=a" (eax), "=d" (edx) - : "0" (op) - : "bx", "cx"); - return edx; -} - -#define load_cr3(pgdir) do { \ - xen_pt_switch(__pa(pgdir)); \ - per_cpu(cur_pgd, smp_processor_id()) = pgdir; \ -} while (/* CONSTCOND */0) - - -/* - * Intel CPU features in CR4 - */ -#define X86_CR4_VME 0x0001 /* enable vm86 extensions */ -#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */ -#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */ -#define X86_CR4_DE 0x0008 /* enable debugging extensions */ -#define X86_CR4_PSE 0x0010 /* enable page size extensions */ -#define X86_CR4_PAE 0x0020 /* enable physical address extensions */ -#define X86_CR4_MCE 0x0040 /* Machine check enable */ -#define X86_CR4_PGE 0x0080 /* enable global pages */ -#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */ -#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */ -#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */ - -/* - * Save the cr4 feature set we're using (ie - * Pentium 4MB enable and PPro Global page - * enable), so that any CPU's that boot up - * after us can get the correct flags. 
- */ -extern unsigned long mmu_cr4_features; - -static inline void set_in_cr4 (unsigned long mask) -{ - mmu_cr4_features |= mask; - switch (mask) { - case X86_CR4_OSFXSR: - case X86_CR4_OSXMMEXCPT: - break; - default: - do { - const char *msg = "Xen unsupported cr4 update\n"; - (void)HYPERVISOR_console_io( - CONSOLEIO_write, __builtin_strlen(msg), - (char *)msg); - BUG(); - } while (0); - } -} - -static inline void clear_in_cr4 (unsigned long mask) -{ - mmu_cr4_features &= ~mask; - __asm__("movl %%cr4,%%eax\n\t" - "andl %0,%%eax\n\t" - "movl %%eax,%%cr4\n" - : : "irg" (~mask) - :"ax"); -} - -/* - * NSC/Cyrix CPU configuration register indexes - */ - -#define CX86_PCR0 0x20 -#define CX86_GCR 0xb8 -#define CX86_CCR0 0xc0 -#define CX86_CCR1 0xc1 -#define CX86_CCR2 0xc2 -#define CX86_CCR3 0xc3 -#define CX86_CCR4 0xe8 -#define CX86_CCR5 0xe9 -#define CX86_CCR6 0xea -#define CX86_CCR7 0xeb -#define CX86_PCR1 0xf0 -#define CX86_DIR0 0xfe -#define CX86_DIR1 0xff -#define CX86_ARR_BASE 0xc4 -#define CX86_RCR_BASE 0xdc - -/* - * NSC/Cyrix CPU indexed register access macros - */ - -#define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); }) - -#define setCx86(reg, data) do { \ - outb((reg), 0x22); \ - outb((data), 0x23); \ -} while (0) - -static inline void __monitor(const void *eax, unsigned long ecx, - unsigned long edx) -{ - /* "monitor %eax,%ecx,%edx;" */ - asm volatile( - ".byte 0x0f,0x01,0xc8;" - : :"a" (eax), "c" (ecx), "d"(edx)); -} - -static inline void __mwait(unsigned long eax, unsigned long ecx) -{ - /* "mwait %eax,%ecx;" */ - asm volatile( - ".byte 0x0f,0x01,0xc9;" - : :"a" (eax), "c" (ecx)); -} - -/* from system description table in BIOS. Mostly for MCA use, but -others may find it useful. */ -extern unsigned int machine_id; -extern unsigned int machine_submodel_id; -extern unsigned int BIOS_revision; -extern unsigned int mca_pentium_flag; - -/* Boot loader type from the setup header */ -extern int bootloader_type; - -/* - * User space process size: 3GB (default). - */ -#define TASK_SIZE (PAGE_OFFSET) - -/* This decides where the kernel will search for a free chunk of vm - * space during mmap's. - */ -#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) - -#define HAVE_ARCH_PICK_MMAP_LAYOUT - -/* - * Size of io_bitmap. 
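Stepping back to the __monitor()/__mwait() helpers defined a little above: they are normally used in the canonical arm/re-check/sleep loop. A sketch of that pattern follows (it omits the CPUID feature test a real caller would do first; mwait_until_set is an invented name):

static void mwait_until_set(volatile int *flag)
{
    while (!*flag) {
        __monitor((const void *)flag, 0, 0); /* arm the address monitor */
        if (*flag)                           /* re-check to avoid a lost wakeup */
            break;
        __mwait(0, 0);                       /* doze until the line is written */
    }
}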
- */ -#define IO_BITMAP_BITS 65536 -#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) -#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) -#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap) -#define INVALID_IO_BITMAP_OFFSET 0x8000 -#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000 - -struct i387_fsave_struct { - long cwd; - long swd; - long twd; - long fip; - long fcs; - long foo; - long fos; - long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ - long status; /* software status information */ -}; - -struct i387_fxsave_struct { - unsigned short cwd; - unsigned short swd; - unsigned short twd; - unsigned short fop; - long fip; - long fcs; - long foo; - long fos; - long mxcsr; - long mxcsr_mask; - long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ - long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ - long padding[56]; -} __attribute__ ((aligned (16))); - -struct i387_soft_struct { - long cwd; - long swd; - long twd; - long fip; - long fcs; - long foo; - long fos; - long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */ - unsigned char ftop, changed, lookahead, no_update, rm, alimit; - struct info *info; - unsigned long entry_eip; -}; - -union i387_union { - struct i387_fsave_struct fsave; - struct i387_fxsave_struct fxsave; - struct i387_soft_struct soft; -}; - -typedef struct { - unsigned long seg; -} mm_segment_t; - -struct thread_struct; - -struct tss_struct { - unsigned short back_link,__blh; - unsigned long esp0; - unsigned short ss0,__ss0h; - unsigned long esp1; - unsigned short ss1,__ss1h; /* ss1 is used to cache MSR_IA32_SYSENTER_CS */ - unsigned long esp2; - unsigned short ss2,__ss2h; - unsigned long __cr3; - unsigned long eip; - unsigned long eflags; - unsigned long eax,ecx,edx,ebx; - unsigned long esp; - unsigned long ebp; - unsigned long esi; - unsigned long edi; - unsigned short es, __esh; - unsigned short cs, __csh; - unsigned short ss, __ssh; - unsigned short ds, __dsh; - unsigned short fs, __fsh; - unsigned short gs, __gsh; - unsigned short ldt, __ldth; - unsigned short trace, io_bitmap_base; - /* - * The extra 1 is there because the CPU will access an - * additional byte beyond the end of the IO permission - * bitmap. The extra byte must be all 1 bits, and must - * be within the limit. - */ - unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; - /* - * Cache the current maximum and the last task that used the bitmap: - */ - unsigned long io_bitmap_max; - struct thread_struct *io_bitmap_owner; - /* - * pads the TSS to be cacheline-aligned (size is 0x100) - */ - unsigned long __cacheline_filler[35]; - /* - * .. and then another 0x100 bytes for emergency kernel stack - */ - unsigned long stack[64]; -} __attribute__((packed)); - -#define ARCH_MIN_TASKALIGN 16 - -struct thread_struct { -/* cached TLS descriptors. 
*/ - struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; - unsigned long esp0; - unsigned long sysenter_cs; - unsigned long eip; - unsigned long esp; - unsigned long fs; - unsigned long gs; - unsigned int io_pl; -/* Hardware debugging registers */ - unsigned long debugreg[8]; /* %%db0-7 debug registers */ -/* fault info */ - unsigned long cr2, trap_no, error_code; -/* floating point info */ - union i387_union i387; -/* virtual 86 mode info */ - struct vm86_struct __user * vm86_info; - unsigned long screen_bitmap; - unsigned long v86flags, v86mask, saved_esp0; - unsigned int saved_fs, saved_gs; -/* IO permissions */ - unsigned long *io_bitmap_ptr; -/* max allowed port in the bitmap, in bytes: */ - unsigned long io_bitmap_max; -}; - -#define INIT_THREAD { \ - .vm86_info = NULL, \ - .sysenter_cs = __KERNEL_CS, \ - .io_bitmap_ptr = NULL, \ -} - -/* - * Note that the .io_bitmap member must be extra-big. This is because - * the CPU will access an additional byte beyond the end of the IO - * permission bitmap. The extra byte must be all 1 bits, and must - * be within the limit. - */ -#define INIT_TSS { \ - .esp0 = sizeof(init_stack) + (long)&init_stack, \ - .ss0 = __KERNEL_DS, \ - .ss1 = __KERNEL_CS, \ - .ldt = GDT_ENTRY_LDT, \ - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ - .io_bitmap = { [ 0 ... IO_BITMAP_LONGS] = ~0 }, \ -} - -static inline void load_esp0(struct tss_struct *tss, struct thread_struct *thread) -{ - tss->esp0 = thread->esp0; - /* This can only happen when SEP is enabled, no need to test "SEP"arately */ - if (unlikely(tss->ss1 != thread->sysenter_cs)) { - tss->ss1 = thread->sysenter_cs; - wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); - } - HYPERVISOR_stack_switch(tss->ss0, tss->esp0); -} - -#define start_thread(regs, new_eip, new_esp) do { \ - __asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0)); \ - set_fs(USER_DS); \ - regs->xds = __USER_DS; \ - regs->xes = __USER_DS; \ - regs->xss = __USER_DS; \ - regs->xcs = __USER_CS; \ - regs->eip = new_eip; \ - regs->esp = new_esp; \ -} while (0) - -/* Forward declaration, a strange C thing */ -struct task_struct; -struct mm_struct; - -/* Free all resources held by a thread. 
*/ -extern void release_thread(struct task_struct *); - -/* Prepare to copy thread state - unlazy all lazy status */ -extern void prepare_to_copy(struct task_struct *tsk); - -/* - * create a kernel thread without removing it from tasklists - */ -extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); - -extern unsigned long thread_saved_pc(struct task_struct *tsk); -void show_trace(struct task_struct *task, unsigned long *stack); - -unsigned long get_wchan(struct task_struct *p); - -#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long)) -#define KSTK_TOP(info) \ -({ \ - unsigned long *__ptr = (unsigned long *)(info); \ - (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \ -}) - -#define task_pt_regs(task) \ -({ \ - struct pt_regs *__regs__; \ - __regs__ = (struct pt_regs *)KSTK_TOP((task)->thread_info); \ - __regs__ - 1; \ -}) - -#define KSTK_EIP(task) (task_pt_regs(task)->eip) -#define KSTK_ESP(task) (task_pt_regs(task)->esp) - - -struct microcode_header { - unsigned int hdrver; - unsigned int rev; - unsigned int date; - unsigned int sig; - unsigned int cksum; - unsigned int ldrver; - unsigned int pf; - unsigned int datasize; - unsigned int totalsize; - unsigned int reserved[3]; -}; - -struct microcode { - struct microcode_header hdr; - unsigned int bits[0]; -}; - -typedef struct microcode microcode_t; -typedef struct microcode_header microcode_header_t; - -/* microcode format is extended from prescott processors */ -struct extended_signature { - unsigned int sig; - unsigned int pf; - unsigned int cksum; -}; - -struct extended_sigtable { - unsigned int count; - unsigned int cksum; - unsigned int reserved[3]; - struct extended_signature sigs[0]; -}; -/* '6' because it used to be for P6 only (but now covers Pentium 4 as well) */ -#define MICROCODE_IOCFREE _IO('6',0) - -/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. 
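For instance, a polling loop built on the cpu_relax() wrapper defined immediately below might look like this (sketch only; poll_until_set is an invented name):

static void poll_until_set(volatile int *flag)
{
    while (!*flag)
        cpu_relax();   /* REP;NOP a.k.a. PAUSE: be kind to the sibling thread */
}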
*/ -static inline void rep_nop(void) -{ - __asm__ __volatile__("rep;nop": : :"memory"); -} - -#define cpu_relax() rep_nop() - -/* generic versions from gas */ -#define GENERIC_NOP1 ".byte 0x90\n" -#define GENERIC_NOP2 ".byte 0x89,0xf6\n" -#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n" -#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n" -#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4 -#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n" -#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n" -#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7 - -/* Opteron nops */ -#define K8_NOP1 GENERIC_NOP1 -#define K8_NOP2 ".byte 0x66,0x90\n" -#define K8_NOP3 ".byte 0x66,0x66,0x90\n" -#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n" -#define K8_NOP5 K8_NOP3 K8_NOP2 -#define K8_NOP6 K8_NOP3 K8_NOP3 -#define K8_NOP7 K8_NOP4 K8_NOP3 -#define K8_NOP8 K8_NOP4 K8_NOP4 - -/* K7 nops */ -/* uses eax dependencies (arbitrary choice) */ -#define K7_NOP1 GENERIC_NOP1 -#define K7_NOP2 ".byte 0x8b,0xc0\n" -#define K7_NOP3 ".byte 0x8d,0x04,0x20\n" -#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n" -#define K7_NOP5 K7_NOP4 ASM_NOP1 -#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n" -#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n" -#define K7_NOP8 K7_NOP7 ASM_NOP1 - -#ifdef CONFIG_MK8 -#define ASM_NOP1 K8_NOP1 -#define ASM_NOP2 K8_NOP2 -#define ASM_NOP3 K8_NOP3 -#define ASM_NOP4 K8_NOP4 -#define ASM_NOP5 K8_NOP5 -#define ASM_NOP6 K8_NOP6 -#define ASM_NOP7 K8_NOP7 -#define ASM_NOP8 K8_NOP8 -#elif defined(CONFIG_MK7) -#define ASM_NOP1 K7_NOP1 -#define ASM_NOP2 K7_NOP2 -#define ASM_NOP3 K7_NOP3 -#define ASM_NOP4 K7_NOP4 -#define ASM_NOP5 K7_NOP5 -#define ASM_NOP6 K7_NOP6 -#define ASM_NOP7 K7_NOP7 -#define ASM_NOP8 K7_NOP8 -#else -#define ASM_NOP1 GENERIC_NOP1 -#define ASM_NOP2 GENERIC_NOP2 -#define ASM_NOP3 GENERIC_NOP3 -#define ASM_NOP4 GENERIC_NOP4 -#define ASM_NOP5 GENERIC_NOP5 -#define ASM_NOP6 GENERIC_NOP6 -#define ASM_NOP7 GENERIC_NOP7 -#define ASM_NOP8 GENERIC_NOP8 -#endif - -#define ASM_NOP_MAX 8 - -/* Prefetch instructions for Pentium III and AMD Athlon */ -/* It's not worth caring about 3dnow! prefetches for the K6, - because they are microcoded there and very slow. - However, we don't currently do prefetches for pre-XP Athlons; - that should be fixed. */ -#define ARCH_HAS_PREFETCH -extern inline void prefetch(const void *x) -{ - alternative_input(ASM_NOP4, - "prefetchnta (%1)", - X86_FEATURE_XMM, - "r" (x)); -} - -#define ARCH_HAS_PREFETCH -#define ARCH_HAS_PREFETCHW -#define ARCH_HAS_SPINLOCK_PREFETCH - -/* 3dnow! prefetch to get an exclusive cache line. Useful for - spinlocks to avoid one state transition in the cache coherency protocol.
*/ -extern inline void prefetchw(const void *x) -{ - alternative_input(ASM_NOP4, - "prefetchw (%1)", - X86_FEATURE_3DNOW, - "r" (x)); -} -#define spin_lock_prefetch(x) prefetchw(x) - -extern void select_idle_routine(const struct cpuinfo_x86 *c); - -#define cache_line_size() (boot_cpu_data.x86_cache_alignment) - -extern unsigned long boot_option_idle_override; - -#endif /* __ASM_I386_PROCESSOR_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/ptrace.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/ptrace.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,69 +0,0 @@ -#ifndef _I386_PTRACE_H -#define _I386_PTRACE_H - -#define EBX 0 -#define ECX 1 -#define EDX 2 -#define ESI 3 -#define EDI 4 -#define EBP 5 -#define EAX 6 -#define DS 7 -#define ES 8 -#define FS 9 -#define GS 10 -#define ORIG_EAX 11 -#define EIP 12 -#define CS 13 -#define EFL 14 -#define UESP 15 -#define SS 16 -#define FRAME_SIZE 17 - -/* this struct defines the way the registers are stored on the - stack during a system call. */ - -struct pt_regs { - long ebx; - long ecx; - long edx; - long esi; - long edi; - long ebp; - long eax; - int xds; - int xes; - long orig_eax; - long eip; - int xcs; - long eflags; - long esp; - int xss; -}; - -/* Arbitrarily choose the same ptrace numbers as used by the Sparc code. */ -#define PTRACE_GETREGS 12 -#define PTRACE_SETREGS 13 -#define PTRACE_GETFPREGS 14 -#define PTRACE_SETFPREGS 15 -#define PTRACE_GETFPXREGS 18 -#define PTRACE_SETFPXREGS 19 - -#define PTRACE_OLDSETOPTIONS 21 - -#define PTRACE_GET_THREAD_AREA 25 -#define PTRACE_SET_THREAD_AREA 26 - -#ifdef __KERNEL__ -struct task_struct; -extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code); -#define user_mode(regs) ((VM_MASK & (regs)->eflags) || (2 & (regs)->xcs)) -#define instruction_pointer(regs) ((regs)->eip) -#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) -extern unsigned long profile_pc(struct pt_regs *regs); -#else -#define profile_pc(regs) instruction_pointer(regs) -#endif -#endif - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/segment.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/segment.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,96 +0,0 @@ -#ifndef _ASM_SEGMENT_H -#define _ASM_SEGMENT_H - -/* - * The layout of the per-CPU GDT under Linux: - * - * 0 - null - * 1 - reserved - * 2 - reserved - * 3 - reserved - * - * 4 - unused <==== new cacheline - * 5 - unused - * - * ------- start of TLS (Thread-Local Storage) segments: - * - * 6 - TLS segment #1 [ glibc's TLS segment ] - * 7 - TLS segment #2 [ Wine's %fs Win32 segment ] - * 8 - TLS segment #3 - * 9 - reserved - * 10 - reserved - * 11 - reserved - * - * ------- start of kernel segments: - * - * 12 - kernel code segment <==== new cacheline - * 13 - kernel data segment - * 14 - default user CS - * 15 - default user DS - * 16 - TSS - * 17 - LDT - * 18 - PNPBIOS support (16->32 gate) - * 19 - PNPBIOS support - * 20 - PNPBIOS support - * 21 - PNPBIOS support - * 22 - PNPBIOS support - * 23 - APM BIOS support - * 24 - APM BIOS support - * 25 - APM BIOS support - * - * 26 - unused - * 27 - unused - * 28 - unused - * 29 - unused - * 30 - unused - * 31 - TSS for double fault handler - */ -#define GDT_ENTRY_TLS_ENTRIES 3 -#define GDT_ENTRY_TLS_MIN 6 -#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) - -#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8) - -#define 
GDT_ENTRY_DEFAULT_USER_CS 14 -#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS * 8 + 3) - -#define GDT_ENTRY_DEFAULT_USER_DS 15 -#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS * 8 + 3) - -#define GDT_ENTRY_KERNEL_BASE 12 - -#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0) -#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8 + 1) - -#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1) -#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8 + 1) - -#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4) -#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5) - -#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6) -#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11) - -#define GDT_ENTRY_DOUBLEFAULT_TSS 31 - -/* - * The GDT has 32 entries - */ -#define GDT_ENTRIES 32 - -#define GDT_SIZE (GDT_ENTRIES * 8) - -/* Simple and small GDT entries for booting only */ - -#define __BOOT_CS FLAT_KERNEL_CS - -#define __BOOT_DS FLAT_KERNEL_DS - -/* - * The interrupt descriptor table has room for 256 idt's, - * the global descriptor table is dependent on the number - * of tasks we can have.. - */ -#define IDT_ENTRIES 256 - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/setup.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/setup.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,66 +0,0 @@ -/* - * Just a place holder. We don't want to have to test x86 before - * we include stuff - */ - -#ifndef _i386_SETUP_H -#define _i386_SETUP_H - -#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) -#define PFN_DOWN(x) ((x) >> PAGE_SHIFT) -#define PFN_PHYS(x) ((x) << PAGE_SHIFT) - -/* - * Reserved space for vmalloc and iomap - defined in asm/page.h - */ -#define MAXMEM_PFN PFN_DOWN(MAXMEM) -#define MAX_NONPAE_PFN (1 << 20) - -#define PARAM_SIZE 2048 -#define COMMAND_LINE_SIZE 256 - -#define OLD_CL_MAGIC_ADDR 0x90020 -#define OLD_CL_MAGIC 0xA33F -#define OLD_CL_BASE_ADDR 0x90000 -#define OLD_CL_OFFSET 0x90022 -#define NEW_CL_POINTER 0x228 /* Relative to real mode data */ - -#ifndef __ASSEMBLY__ -/* - * This is set up by the setup-routine at boot-time - */ -extern unsigned char boot_params[PARAM_SIZE]; - -#define PARAM (boot_params) -#define SCREEN_INFO (*(struct screen_info *) (PARAM+0)) -#define EXT_MEM_K (*(unsigned short *) (PARAM+2)) -#define ALT_MEM_K (*(unsigned long *) (PARAM+0x1e0)) -#define E820_MAP_NR (*(char*) (PARAM+E820NR)) -#define E820_MAP ((struct e820entry *) (PARAM+E820MAP)) -#define APM_BIOS_INFO (*(struct apm_bios_info *) (PARAM+0x40)) -#define IST_INFO (*(struct ist_info *) (PARAM+0x60)) -#define DRIVE_INFO (*(struct drive_info_struct *) (PARAM+0x80)) -#define SYS_DESC_TABLE (*(struct sys_desc_table_struct*)(PARAM+0xa0)) -#define EFI_SYSTAB ((efi_system_table_t *) *((unsigned long *)(PARAM+0x1c4))) -#define EFI_MEMDESC_SIZE (*((unsigned long *) (PARAM+0x1c8))) -#define EFI_MEMDESC_VERSION (*((unsigned long *) (PARAM+0x1cc))) -#define EFI_MEMMAP ((efi_memory_desc_t *) *((unsigned long *)(PARAM+0x1d0))) -#define EFI_MEMMAP_SIZE (*((unsigned long *) (PARAM+0x1d4))) -#define MOUNT_ROOT_RDONLY (*(unsigned short *) (PARAM+0x1F2)) -#define RAMDISK_FLAGS (*(unsigned short *) (PARAM+0x1F8)) -#define VIDEO_MODE (*(unsigned short *) (PARAM+0x1FA)) -#define ORIG_ROOT_DEV (*(unsigned short *) (PARAM+0x1FC)) -#define AUX_DEVICE_INFO (*(unsigned char *) (PARAM+0x1FF)) -#define LOADER_TYPE (*(unsigned char *) (PARAM+0x210)) -#define KERNEL_START (*(unsigned long *) (PARAM+0x214)) -#define INITRD_START (__pa(xen_start_info.mod_start)) -#define 
INITRD_SIZE (xen_start_info.mod_len) -#define EDID_INFO (*(struct edid_info *) (PARAM+0x440)) -#define EDD_NR (*(unsigned char *) (PARAM+EDDNR)) -#define EDD_MBR_SIG_NR (*(unsigned char *) (PARAM+EDD_MBR_SIG_NR_BUF)) -#define EDD_MBR_SIGNATURE ((unsigned int *) (PARAM+EDD_MBR_SIG_BUF)) -#define EDD_BUF ((struct edd_info *) (PARAM+EDDBUF)) - -#endif /* __ASSEMBLY__ */ - -#endif /* _i386_SETUP_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/spinlock.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/spinlock.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,250 +0,0 @@ -#ifndef __ASM_SPINLOCK_H -#define __ASM_SPINLOCK_H - -#include <asm/atomic.h> -#include <asm/rwlock.h> -#include <asm/page.h> -#include <linux/config.h> -#include <linux/compiler.h> - -asmlinkage int printk(const char * fmt, ...) - __attribute__ ((format (printf, 1, 2))); - -/* - * Your basic SMP spinlocks, allowing only a single CPU anywhere - */ - -typedef struct { - volatile unsigned int slock; -#ifdef CONFIG_DEBUG_SPINLOCK - unsigned magic; -#endif -#ifdef CONFIG_PREEMPT - unsigned int break_lock; -#endif -} spinlock_t; - -#define SPINLOCK_MAGIC 0xdead4ead - -#ifdef CONFIG_DEBUG_SPINLOCK -#define SPINLOCK_MAGIC_INIT , SPINLOCK_MAGIC -#else -#define SPINLOCK_MAGIC_INIT /* */ -#endif - -#define SPIN_LOCK_UNLOCKED (spinlock_t) { 1 SPINLOCK_MAGIC_INIT } - -#define spin_lock_init(x) do { *(x) = SPIN_LOCK_UNLOCKED; } while(0) - -/* - * Simple spin lock operations. There are two variants, one clears IRQ's - * on the local processor, one does not. - * - * We make no fairness assumptions. They have a cost. - */ - -#define spin_is_locked(x) (*(volatile signed char *)(&(x)->slock) <= 0) -#define spin_unlock_wait(x) do { barrier(); } while(spin_is_locked(x)) - -#define spin_lock_string \ - "\n1:\t" \ - "lock ; decb %0\n\t" \ - "jns 3f\n" \ - "2:\t" \ - "rep;nop\n\t" \ - "cmpb $0,%0\n\t" \ - "jle 2b\n\t" \ - "jmp 1b\n" \ - "3:\n\t" - -#define spin_lock_string_flags \ - "\n1:\t" \ - "lock ; decb %0\n\t" \ - "jns 4f\n\t" \ - "2:\t" \ - "testl $0x200, %1\n\t" \ - "jz 3f\n\t" \ - "#sti\n\t" \ - "3:\t" \ - "rep;nop\n\t" \ - "cmpb $0, %0\n\t" \ - "jle 3b\n\t" \ - "#cli\n\t" \ - "jmp 1b\n" \ - "4:\n\t" - -/* - * This works. Despite all the confusion. 
- * (except on PPro SMP or if we are using OOSTORE) - * (PPro errata 66, 92) - */ - -#if !defined(CONFIG_X86_OOSTORE) && !defined(CONFIG_X86_PPRO_FENCE) - -#define spin_unlock_string \ - "movb $1,%0" \ - :"=m" (lock->slock) : : "memory" - - -static inline void _raw_spin_unlock(spinlock_t *lock) -{ -#ifdef CONFIG_DEBUG_SPINLOCK - BUG_ON(lock->magic != SPINLOCK_MAGIC); - BUG_ON(!spin_is_locked(lock)); -#endif - __asm__ __volatile__( - spin_unlock_string - ); -} - -#else - -#define spin_unlock_string \ - "xchgb %b0, %1" \ - :"=q" (oldval), "=m" (lock->slock) \ - :"0" (oldval) : "memory" - -static inline void _raw_spin_unlock(spinlock_t *lock) -{ - char oldval = 1; -#ifdef CONFIG_DEBUG_SPINLOCK - BUG_ON(lock->magic != SPINLOCK_MAGIC); - BUG_ON(!spin_is_locked(lock)); -#endif - __asm__ __volatile__( - spin_unlock_string - ); -} - -#endif - -static inline int _raw_spin_trylock(spinlock_t *lock) -{ - char oldval; - __asm__ __volatile__( - "xchgb %b0,%1" - :"=q" (oldval), "=m" (lock->slock) - :"0" (0) : "memory"); - return oldval > 0; -} - -static inline void _raw_spin_lock(spinlock_t *lock) -{ -#ifdef CONFIG_DEBUG_SPINLOCK - if (unlikely(lock->magic != SPINLOCK_MAGIC)) { - printk("eip: %p\n", __builtin_return_address(0)); - BUG(); - } -#endif - __asm__ __volatile__( - spin_lock_string - :"=m" (lock->slock) : : "memory"); -} - -static inline void _raw_spin_lock_flags (spinlock_t *lock, unsigned long flags) -{ -#ifdef CONFIG_DEBUG_SPINLOCK - if (unlikely(lock->magic != SPINLOCK_MAGIC)) { - printk("eip: %p\n", __builtin_return_address(0)); - BUG(); - } -#endif - __asm__ __volatile__( - spin_lock_string_flags - :"=m" (lock->slock) : "r" (flags) : "memory"); -} - -/* - * Read-write spinlocks, allowing multiple readers - * but only one writer. - * - * NOTE! it is quite common to have readers in interrupts - * but no interrupt writers. For those circumstances we - * can "mix" irq-safe locks - any writer needs to get a - * irq-safe write-lock, but readers can get non-irqsafe - * read-locks. - */ -typedef struct { - volatile unsigned int lock; -#ifdef CONFIG_DEBUG_SPINLOCK - unsigned magic; -#endif -#ifdef CONFIG_PREEMPT - unsigned int break_lock; -#endif -} rwlock_t; - -#define RWLOCK_MAGIC 0xdeaf1eed - -#ifdef CONFIG_DEBUG_SPINLOCK -#define RWLOCK_MAGIC_INIT , RWLOCK_MAGIC -#else -#define RWLOCK_MAGIC_INIT /* */ -#endif - -#define RW_LOCK_UNLOCKED (rwlock_t) { RW_LOCK_BIAS RWLOCK_MAGIC_INIT } - -#define rwlock_init(x) do { *(x) = RW_LOCK_UNLOCKED; } while(0) - -/** - * read_can_lock - would read_trylock() succeed? - * @lock: the rwlock in question. - */ -#define read_can_lock(x) ((int)(x)->lock > 0) - -/** - * write_can_lock - would write_trylock() succeed? - * @lock: the rwlock in question. - */ -#define write_can_lock(x) ((x)->lock == RW_LOCK_BIAS) - -/* - * On x86, we implement read-write locks as a 32-bit counter - * with the high bit (sign) being the "contended" bit. - * - * The inline assembly is non-obvious. Think about it. - * - * Changed to use the same technique as rw semaphores. See - * semaphore.h for details. 
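As a way to see the bias trick that _raw_read_trylock()/_raw_write_trylock() below rely on, here is a stand-alone C model of the counter, using GCC __sync builtins in place of the locked instructions (RW_BIAS mirrors RW_LOCK_BIAS, 0x01000000 in the i386 headers). It is a close model, not the kernel code itself:

#define RW_BIAS 0x01000000

static int model_read_trylock(volatile int *count)
{
    if (__sync_sub_and_fetch(count, 1) >= 0)
        return 1;                      /* no writer held the lock */
    __sync_add_and_fetch(count, 1);    /* contended: undo and fail */
    return 0;
}

static int model_write_trylock(volatile int *count)
{
    if (__sync_sub_and_fetch(count, RW_BIAS) == 0)
        return 1;                      /* lock was entirely free */
    __sync_add_and_fetch(count, RW_BIAS); /* readers or a writer present: undo */
    return 0;
}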
-ben - */ -/* the spinlock helpers are in arch/i386/kernel/semaphore.c */ - -static inline void _raw_read_lock(rwlock_t *rw) -{ -#ifdef CONFIG_DEBUG_SPINLOCK - BUG_ON(rw->magic != RWLOCK_MAGIC); -#endif - __build_read_lock(rw, "__read_lock_failed"); -} - -static inline void _raw_write_lock(rwlock_t *rw) -{ -#ifdef CONFIG_DEBUG_SPINLOCK - BUG_ON(rw->magic != RWLOCK_MAGIC); -#endif - __build_write_lock(rw, "__write_lock_failed"); -} - -#define _raw_read_unlock(rw) asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory") -#define _raw_write_unlock(rw) asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory") - -static inline int _raw_read_trylock(rwlock_t *lock) -{ - atomic_t *count = (atomic_t *)lock; - atomic_dec(count); - if (atomic_read(count) >= 0) - return 1; - atomic_inc(count); - return 0; -} - -static inline int _raw_write_trylock(rwlock_t *lock) -{ - atomic_t *count = (atomic_t *)lock; - if (atomic_sub_and_test(RW_LOCK_BIAS, count)) - return 1; - atomic_add(RW_LOCK_BIAS, count); - return 0; -} - -#endif /* __ASM_SPINLOCK_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/synch_bitops.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/synch_bitops.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,140 +0,0 @@ -#ifndef __XEN_SYNCH_BITOPS_H__ -#define __XEN_SYNCH_BITOPS_H__ - -/* - * Copyright 1992, Linus Torvalds. - * Heavily modified to provide guaranteed strong synchronisation - * when communicating with Xen or other guest OSes running on other CPUs. - */ - -#include <linux/config.h> - -#define ADDR (*(volatile long *) addr) - -static __inline__ void synch_set_bit(int nr, volatile void * addr) -{ - __asm__ __volatile__ ( - "lock btsl %1,%0" - : "=m" (ADDR) : "Ir" (nr) : "memory" ); -} - -static __inline__ void synch_clear_bit(int nr, volatile void * addr) -{ - __asm__ __volatile__ ( - "lock btrl %1,%0" - : "=m" (ADDR) : "Ir" (nr) : "memory" ); -} - -static __inline__ void synch_change_bit(int nr, volatile void * addr) -{ - __asm__ __volatile__ ( - "lock btcl %1,%0" - : "=m" (ADDR) : "Ir" (nr) : "memory" ); -} - -static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr) -{ - int oldbit; - __asm__ __volatile__ ( - "lock btsl %2,%1\n\tsbbl %0,%0" - : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory"); - return oldbit; -} - -static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr) -{ - int oldbit; - __asm__ __volatile__ ( - "lock btrl %2,%1\n\tsbbl %0,%0" - : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory"); - return oldbit; -} - -static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr) -{ - int oldbit; - - __asm__ __volatile__ ( - "lock btcl %2,%1\n\tsbbl %0,%0" - : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory"); - return oldbit; -} - -struct __synch_xchg_dummy { unsigned long a[100]; }; -#define __synch_xg(x) ((struct __synch_xchg_dummy *)(x)) - -#define synch_cmpxchg(ptr, old, new) \ -((__typeof__(*(ptr)))__synch_cmpxchg((ptr),\ - (unsigned long)(old), \ - (unsigned long)(new), \ - sizeof(*(ptr)))) - -static inline unsigned long __synch_cmpxchg(volatile void *ptr, - unsigned long old, - unsigned long new, int size) -{ - unsigned long prev; - switch (size) { - case 1: - __asm__ __volatile__("lock; cmpxchgb %b1,%2" - : "=a"(prev) - : "q"(new), "m"(*__synch_xg(ptr)), - "0"(old) - : "memory"); - return prev; - case 2: - __asm__ __volatile__("lock; cmpxchgw %w1,%2" - : "=a"(prev) - : "q"(new), "m"(*__synch_xg(ptr)), - "0"(old) - 
: "memory"); - return prev; -#ifdef CONFIG_X86_64 - case 4: - __asm__ __volatile__("lock; cmpxchgl %k1,%2" - : "=a"(prev) - : "q"(new), "m"(*__synch_xg(ptr)), - "0"(old) - : "memory"); - return prev; - case 8: - __asm__ __volatile__("lock; cmpxchgq %1,%2" - : "=a"(prev) - : "q"(new), "m"(*__synch_xg(ptr)), - "0"(old) - : "memory"); - return prev; -#else - case 4: - __asm__ __volatile__("lock; cmpxchgl %1,%2" - : "=a"(prev) - : "q"(new), "m"(*__synch_xg(ptr)), - "0"(old) - : "memory"); - return prev; -#endif - } - return old; -} - -static __inline__ int synch_const_test_bit(int nr, const volatile void * addr) -{ - return ((1UL << (nr & 31)) & - (((const volatile unsigned int *) addr)[nr >> 5])) != 0; -} - -static __inline__ int synch_var_test_bit(int nr, volatile void * addr) -{ - int oldbit; - __asm__ __volatile__ ( - "btl %2,%1\n\tsbbl %0,%0" - : "=r" (oldbit) : "m" (ADDR), "Ir" (nr) ); - return oldbit; -} - -#define synch_test_bit(nr,addr) \ -(__builtin_constant_p(nr) ? \ - synch_const_test_bit((nr),(addr)) : \ - synch_var_test_bit((nr),(addr))) - -#endif /* __XEN_SYNCH_BITOPS_H__ */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/system.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/system.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,577 +0,0 @@ -#ifndef __ASM_SYSTEM_H -#define __ASM_SYSTEM_H - -#include <linux/config.h> -#include <linux/kernel.h> -#include <linux/bitops.h> -#include <asm-xen/synch_bitops.h> -#include <asm/segment.h> -#include <asm/cpufeature.h> -#include <asm-xen/hypervisor.h> -#include <asm/smp_alt.h> - -#ifdef __KERNEL__ - -struct task_struct; /* one of the stranger aspects of C forward declarations.. */ -extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next)); - -#define switch_to(prev,next,last) do { \ - unsigned long esi,edi; \ - asm volatile("pushfl\n\t" \ - "pushl %%ebp\n\t" \ - "movl %%esp,%0\n\t" /* save ESP */ \ - "movl %5,%%esp\n\t" /* restore ESP */ \ - "movl $1f,%1\n\t" /* save EIP */ \ - "pushl %6\n\t" /* restore EIP */ \ - "jmp __switch_to\n" \ - "1:\t" \ - "popl %%ebp\n\t" \ - "popfl" \ - :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \ - "=a" (last),"=S" (esi),"=D" (edi) \ - :"m" (next->thread.esp),"m" (next->thread.eip), \ - "2" (prev), "d" (next)); \ -} while (0) - -#define _set_base(addr,base) do { unsigned long __pr; \ -__asm__ __volatile__ ("movw %%dx,%1\n\t" \ - "rorl $16,%%edx\n\t" \ - "movb %%dl,%2\n\t" \ - "movb %%dh,%3" \ - :"=&d" (__pr) \ - :"m" (*((addr)+2)), \ - "m" (*((addr)+4)), \ - "m" (*((addr)+7)), \ - "0" (base) \ - ); } while(0) - -#define _set_limit(addr,limit) do { unsigned long __lr; \ -__asm__ __volatile__ ("movw %%dx,%1\n\t" \ - "rorl $16,%%edx\n\t" \ - "movb %2,%%dh\n\t" \ - "andb $0xf0,%%dh\n\t" \ - "orb %%dh,%%dl\n\t" \ - "movb %%dl,%2" \ - :"=&d" (__lr) \ - :"m" (*(addr)), \ - "m" (*((addr)+6)), \ - "0" (limit) \ - ); } while(0) - -#define set_base(ldt,base) _set_base( ((char *)&(ldt)) , (base) ) -#define set_limit(ldt,limit) _set_limit( ((char *)&(ldt)) , ((limit)-1)>>12 ) - -static inline unsigned long _get_base(char * addr) -{ - unsigned long __base; - __asm__("movb %3,%%dh\n\t" - "movb %2,%%dl\n\t" - "shll $16,%%edx\n\t" - "movw %1,%%dx" - :"=&d" (__base) - :"m" (*((addr)+2)), - "m" (*((addr)+4)), - "m" (*((addr)+7))); - return __base; -} - -#define get_base(ldt) _get_base( ((char *)&(ldt)) ) - -/* - * Load a segment. Fall back on loading the zero - * segment if something goes wrong.. 
- */ -#define loadsegment(seg,value) \ - asm volatile("\n" \ - "1:\t" \ - "movl %0,%%" #seg "\n" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3:\t" \ - "pushl $0\n\t" \ - "popl %%" #seg "\n\t" \ - "jmp 2b\n" \ - ".previous\n" \ - ".section __ex_table,\"a\"\n\t" \ - ".align 4\n\t" \ - ".long 1b,3b\n" \ - ".previous" \ - : :"m" (*(unsigned int *)&(value))) - -/* - * Save a segment register away - */ -#define savesegment(seg, value) \ - asm volatile("movl %%" #seg ",%0":"=m" (*(int *)&(value))) - -/* - * Clear and set 'TS' bit respectively - */ -#define clts() (HYPERVISOR_fpu_taskswitch(0)) -#define read_cr0() ({ \ - unsigned int __dummy; \ - __asm__( \ - "movl %%cr0,%0\n\t" \ - :"=r" (__dummy)); \ - __dummy; \ -}) -#define write_cr0(x) \ - __asm__("movl %0,%%cr0": :"r" (x)); - -#define read_cr4() ({ \ - unsigned int __dummy; \ - __asm__( \ - "movl %%cr4,%0\n\t" \ - :"=r" (__dummy)); \ - __dummy; \ -}) -#define write_cr4(x) \ - __asm__("movl %0,%%cr4": :"r" (x)); -#define stts() (HYPERVISOR_fpu_taskswitch(1)) - -#endif /* __KERNEL__ */ - -#define wbinvd() \ - __asm__ __volatile__ ("wbinvd": : :"memory"); - -static inline unsigned long get_limit(unsigned long segment) -{ - unsigned long __limit; - __asm__("lsll %1,%0" - :"=r" (__limit):"r" (segment)); - return __limit+1; -} - -#define nop() __asm__ __volatile__ ("nop") - -#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr)))) - -#define tas(ptr) (xchg((ptr),1)) - -struct __xchg_dummy { unsigned long a[100]; }; -#define __xg(x) ((struct __xchg_dummy *)(x)) - - -/* - * The semantics of XCHGCMP8B are a bit strange, this is why - * there is a loop and the loading of %%eax and %%edx has to - * be inside. This inlines well in most cases, the cached - * cost is around ~38 cycles. (in the future we might want - * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that - * might have an implicit FPU-save as a cost, so it's not - * clear which path to go.) - * - * cmpxchg8b must be used with the lock prefix here to allow - * the instruction to be executed atomically, see page 3-102 - * of the instruction set reference 24319102.pdf. We need - * the reader side to see the coherent 64bit value. - */ -static inline void __set_64bit (unsigned long long * ptr, - unsigned int low, unsigned int high) -{ - __asm__ __volatile__ ( - "\n1:\t" - "movl (%0), %%eax\n\t" - "movl 4(%0), %%edx\n\t" - "lock cmpxchg8b (%0)\n\t" - "jnz 1b" - : /* no outputs */ - : "D"(ptr), - "b"(low), - "c"(high) - : "ax","dx","memory"); -} - -static inline void __set_64bit_constant (unsigned long long *ptr, - unsigned long long value) -{ - __set_64bit(ptr,(unsigned int)(value), (unsigned int)((value)>>32ULL)); -} -#define ll_low(x) *(((unsigned int*)&(x))+0) -#define ll_high(x) *(((unsigned int*)&(x))+1) - -static inline void __set_64bit_var (unsigned long long *ptr, - unsigned long long value) -{ - __set_64bit(ptr,ll_low(value), ll_high(value)); -} - -#define set_64bit(ptr,value) \ -(__builtin_constant_p(value) ? \ - __set_64bit_constant(ptr, value) : \ - __set_64bit_var(ptr, value) ) - -#define _set_64bit(ptr,value) \ -(__builtin_constant_p(value) ? \ - __set_64bit(ptr, (unsigned int)(value), (unsigned int)((value)>>32ULL) ) : \ - __set_64bit(ptr, ll_low(value), ll_high(value)) ) - -/* - * Note: no "lock" prefix even on SMP: xchg always implies lock anyway - * Note 2: xchg has side effect, so that attribute volatile is necessary, - * but generally the primitive is invalid, *ptr is output argument. 
--ANK - */ -static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size) -{ - switch (size) { - case 1: - __asm__ __volatile__("xchgb %b0,%1" - :"=q" (x) - :"m" (*__xg(ptr)), "0" (x) - :"memory"); - break; - case 2: - __asm__ __volatile__("xchgw %w0,%1" - :"=r" (x) - :"m" (*__xg(ptr)), "0" (x) - :"memory"); - break; - case 4: - __asm__ __volatile__("xchgl %0,%1" - :"=r" (x) - :"m" (*__xg(ptr)), "0" (x) - :"memory"); - break; - } - return x; -} - -/* - * Atomic compare and exchange. Compare OLD with MEM, if identical, - * store NEW in MEM. Return the initial value in MEM. Success is - * indicated by comparing RETURN with OLD. - */ - -#ifdef CONFIG_X86_CMPXCHG -#define __HAVE_ARCH_CMPXCHG 1 -#endif - -static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, - unsigned long new, int size) -{ - unsigned long prev; - switch (size) { - case 1: - __asm__ __volatile__(LOCK "cmpxchgb %b1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 2: - __asm__ __volatile__(LOCK "cmpxchgw %w1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 4: - __asm__ __volatile__(LOCK "cmpxchgl %1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - } - return old; -} - -#define cmpxchg(ptr,o,n)\ - ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\ - (unsigned long)(n),sizeof(*(ptr)))) - -#ifdef __KERNEL__ -struct alt_instr { - __u8 *instr; /* original instruction */ - __u8 *replacement; - __u8 cpuid; /* cpuid bit set for replacement */ - __u8 instrlen; /* length of original instruction */ - __u8 replacementlen; /* length of new instruction, <= instrlen */ - __u8 pad; -}; -#endif - -/* - * Alternative instructions for different CPU types or capabilities. - * - * This allows the use of optimized instructions even on generic binary - * kernels. - * - * The length of oldinstr must be longer than or equal to the length of newinstr. - * It can be padded with nops as needed. - * - * For non-barrier-like inlines please define new variants - * without volatile and memory clobber. - */ -#define alternative(oldinstr, newinstr, feature) \ - asm volatile ("661:\n\t" oldinstr "\n662:\n" \ - ".section .altinstructions,\"a\"\n" \ - " .align 4\n" \ - " .long 661b\n" /* label */ \ - " .long 663f\n" /* new instruction */ \ - " .byte %c0\n" /* feature bit */ \ - " .byte 662b-661b\n" /* sourcelen */ \ - " .byte 664f-663f\n" /* replacementlen */ \ - ".previous\n" \ - ".section .altinstr_replacement,\"ax\"\n" \ - "663:\n\t" newinstr "\n664:\n" /* replacement */ \ - ".previous" :: "i" (feature) : "memory") - -/* - * Alternative inline assembly with input. - * - * Peculiarities: - * No memory clobber here. - * Argument numbers start with 1. - * Best is to use constraints that are fixed size (like (%1) ... "r") - * If you use variable-sized constraints like "m" or "g" in the - * replacement, make sure to pad to the worst-case length. - */ -#define alternative_input(oldinstr, newinstr, feature, input...)
\ - asm volatile ("661:\n\t" oldinstr "\n662:\n" \ - ".section .altinstructions,\"a\"\n" \ - " .align 4\n" \ - " .long 661b\n" /* label */ \ - " .long 663f\n" /* new instruction */ \ - " .byte %c0\n" /* feature bit */ \ - " .byte 662b-661b\n" /* sourcelen */ \ - " .byte 664f-663f\n" /* replacementlen */ \ - ".previous\n" \ - ".section .altinstr_replacement,\"ax\"\n" \ - "663:\n\t" newinstr "\n664:\n" /* replacement */ \ - ".previous" :: "i" (feature), ##input) - -/* - * Force strict CPU ordering. - * And yes, this is required on UP too when we're talking - * to devices. - * - * For now, "wmb()" doesn't actually do anything, as all - * Intel CPU's follow what Intel calls a *Processor Order*, - * in which all writes are seen in the program order even - * outside the CPU. - * - * I expect future Intel CPU's to have a weaker ordering, - * but I'd also expect them to finally get their act together - * and add some real memory barriers if so. - * - * Some non-Intel clones support out-of-order store. wmb() ceases to be a - * nop for these. - */ - - -/* - * Actually only lfence would be needed for mb() because all stores done - * by the kernel should be already ordered. But keep a full barrier for now. - */ - -#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) -#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) - -/** - * read_barrier_depends - Flush all pending reads that subsequent reads - * depend on. - * - * No data-dependent reads from memory-like regions are ever reordered - * over this barrier. All reads preceding this primitive are guaranteed - * to access memory (but not necessarily other CPUs' caches) before any - * reads following this primitive that depend on the data returned by - * any of the preceding reads. This primitive is much lighter weight than - * rmb() on most CPUs, and is never heavier weight than is - * rmb(). - * - * These ordering constraints are respected by both the local CPU - * and the compiler. - * - * Ordering is not guaranteed by anything other than these primitives, - * not even by data dependencies. See the documentation for - * memory_barrier() for examples and URLs to more information. - * - * For example, the following code would force ordering (the initial - * value of "a" is zero, "b" is one, and "p" is "&a"): - * - * <programlisting> - * CPU 0 CPU 1 - * - * b = 2; - * memory_barrier(); - * p = &b; q = p; - * read_barrier_depends(); - * d = *q; - * </programlisting> - * - * because the read of "*q" depends on the read of "p" and these - * two reads are separated by a read_barrier_depends(). However, - * the following code, with the same initial values for "a" and "b": - * - * <programlisting> - * CPU 0 CPU 1 - * - * a = 2; - * memory_barrier(); - * b = 3; y = b; - * read_barrier_depends(); - * x = a; - * </programlisting> - * - * does not enforce ordering, since there is no data dependency between - * the read of "a" and the read of "b". Therefore, on some CPUs, such - * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() - * in cases like this where there are no data dependencies. - **/ - -#define read_barrier_depends() do { } while(0) - -#ifdef CONFIG_X86_OOSTORE -/* Actually there are no OOO store capable CPUs for now that do SSE, - but make it a possibility already.
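The two <programlisting> fragments in the read_barrier_depends() description above compress easily into real C. A sketch of the pointer-publication idiom they describe, using the barrier names defined just below (publisher/consumer are invented names):

int a = 0, b = 1;
int *p = &a;

void publisher(void)             /* runs on CPU 0 */
{
    b = 2;
    smp_wmb();                   /* order the store to b before the store to p */
    p = &b;
}

int consumer(void)               /* runs on CPU 1 */
{
    int *q = p;
    smp_read_barrier_depends();  /* a no-op everywhere but Alpha */
    return *q;                   /* sees 2 once it observes q == &b */
}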
*/ -#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) -#else -#define wmb() __asm__ __volatile__ ("": : :"memory") -#endif - -#ifdef CONFIG_SMP -#define smp_wmb() wmb() -#if defined(CONFIG_SMP_ALTERNATIVES) && !defined(MODULE) -#define smp_alt_mb(instr) \ -__asm__ __volatile__("6667:\nnop\nnop\nnop\nnop\nnop\nnop\n6668:\n" \ - ".section __smp_alternatives,\"a\"\n" \ - ".long 6667b\n" \ - ".long 6673f\n" \ - ".previous\n" \ - ".section __smp_replacements,\"a\"\n" \ - "6673:.byte 6668b-6667b\n" \ - ".byte 6670f-6669f\n" \ - ".byte 6671f-6670f\n" \ - ".byte 0\n" \ - ".byte %c0\n" \ - "6669:lock;addl $0,0(%%esp)\n" \ - "6670:" instr "\n" \ - "6671:\n" \ - ".previous\n" \ - : \ - : "i" (X86_FEATURE_XMM2) \ - : "memory") -#define smp_rmb() smp_alt_mb("lfence") -#define smp_mb() smp_alt_mb("mfence") -#define set_mb(var, value) do { \ -unsigned long __set_mb_temp; \ -__asm__ __volatile__("6667:movl %1, %0\n6668:\n" \ - ".section __smp_alternatives,\"a\"\n" \ - ".long 6667b\n" \ - ".long 6673f\n" \ - ".previous\n" \ - ".section __smp_replacements,\"a\"\n" \ - "6673: .byte 6668b-6667b\n" \ - ".byte 6670f-6669f\n" \ - ".byte 0\n" \ - ".byte 6671f-6670f\n" \ - ".byte -1\n" \ - "6669: xchg %1, %0\n" \ - "6670:movl %1, %0\n" \ - "6671:\n" \ - ".previous\n" \ - : "=m" (var), "=r" (__set_mb_temp) \ - : "1" (value) \ - : "memory"); } while (0) -#else -#define smp_rmb() rmb() -#define smp_mb() mb() -#define set_mb(var, value) do { xchg(&var, value); } while (0) -#endif -#define smp_read_barrier_depends() read_barrier_depends() -#else -#define smp_mb() barrier() -#define smp_rmb() barrier() -#define smp_wmb() barrier() -#define smp_read_barrier_depends() do { } while(0) -#define set_mb(var, value) do { var = value; barrier(); } while (0) -#endif - -#define set_wmb(var, value) do { var = value; wmb(); } while (0) - -/* interrupt control.. */ - -/* - * The use of 'barrier' in the following reflects their use as local-lock - * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following - * critical operations are executed. All critical operations must complete - * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also - * includes these barriers, for example. 
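A sketch of how the event-channel primitives defined just below get used: the standard critical-section pattern is unchanged from native Linux; only the implementation behind local_irq_save()/local_irq_restore() differs (update_shared_state is an invented name):

static void update_shared_state(void)
{
    unsigned long flags;

    local_irq_save(flags);    /* __save_and_cli: mask upcalls, saving the old mask */
    /* ... touch state also used by the event-channel upcall path ... */
    local_irq_restore(flags); /* __restore_flags: unmask and replay pending events */
}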
- */ - -#define __cli() \ -do { \ - vcpu_info_t *_vcpu; \ - preempt_disable(); \ - _vcpu = &HYPERVISOR_shared_info->vcpu_data[smp_processor_id()]; \ - _vcpu->evtchn_upcall_mask = 1; \ - preempt_enable_no_resched(); \ - barrier(); \ -} while (0) - -#define __sti() \ -do { \ - vcpu_info_t *_vcpu; \ - barrier(); \ - preempt_disable(); \ - _vcpu = &HYPERVISOR_shared_info->vcpu_data[smp_processor_id()]; \ - _vcpu->evtchn_upcall_mask = 0; \ - barrier(); /* unmask then check (avoid races) */ \ - if ( unlikely(_vcpu->evtchn_upcall_pending) ) \ - force_evtchn_callback(); \ - preempt_enable(); \ -} while (0) - -#define __save_flags(x) \ -do { \ - vcpu_info_t *_vcpu; \ - _vcpu = &HYPERVISOR_shared_info->vcpu_data[smp_processor_id()]; \ - (x) = _vcpu->evtchn_upcall_mask; \ -} while (0) - -#define __restore_flags(x) \ -do { \ - vcpu_info_t *_vcpu; \ - barrier(); \ - preempt_disable(); \ - _vcpu = &HYPERVISOR_shared_info->vcpu_data[smp_processor_id()]; \ - if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \ - barrier(); /* unmask then check (avoid races) */ \ - if ( unlikely(_vcpu->evtchn_upcall_pending) ) \ - force_evtchn_callback(); \ - preempt_enable(); \ - } else \ - preempt_enable_no_resched(); \ -} while (0) - -#define safe_halt() ((void)0) - -#define __save_and_cli(x) \ -do { \ - vcpu_info_t *_vcpu; \ - preempt_disable(); \ - _vcpu = &HYPERVISOR_shared_info->vcpu_data[smp_processor_id()]; \ - (x) = _vcpu->evtchn_upcall_mask; \ - _vcpu->evtchn_upcall_mask = 1; \ - preempt_enable_no_resched(); \ - barrier(); \ -} while (0) - -#define local_irq_save(x) __save_and_cli(x) -#define local_irq_restore(x) __restore_flags(x) -#define local_save_flags(x) __save_flags(x) -#define local_irq_disable() __cli() -#define local_irq_enable() __sti() - -#define irqs_disabled() \ - HYPERVISOR_shared_info->vcpu_data[smp_processor_id()].evtchn_upcall_mask - -/* - * disable hlt during certain critical i/o operations - */ -#define HAVE_DISABLE_HLT -void disable_hlt(void); -void enable_hlt(void); - -extern int es7000_plat; -void cpu_idle_wait(void); - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/tlbflush.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/tlbflush.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,102 +0,0 @@ -#ifndef _I386_TLBFLUSH_H -#define _I386_TLBFLUSH_H - -#include <linux/config.h> -#include <linux/mm.h> -#include <asm/processor.h> - -#define __flush_tlb() xen_tlb_flush() -#define __flush_tlb_global() xen_tlb_flush() -#define __flush_tlb_all() xen_tlb_flush() - -extern unsigned long pgkern_mask; - -#define cpu_has_invlpg (boot_cpu_data.x86 > 3) - -#define __flush_tlb_single(addr) xen_invlpg(addr) - -#define __flush_tlb_one(addr) __flush_tlb_single(addr) - -/* - * TLB flushing: - * - * - flush_tlb() flushes the current mm struct TLBs - * - flush_tlb_all() flushes all processes TLBs - * - flush_tlb_mm(mm) flushes the specified mm context TLB's - * - flush_tlb_page(vma, vmaddr) flushes one page - * - flush_tlb_range(vma, start, end) flushes a range of pages - * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages - * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables - * - * ..but the i386 has somewhat limited tlb flushing capabilities, - * and page-granular flushes are available only on i486 and up. 
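To make that division of labour concrete, a hedged sketch of the usual pairing of a PTE update with the matching single-page flush, using the flush_tlb_page() wrapper declared below (remap_one is an invented helper; set_pte is assumed from the pgtable headers of this tree):

static void remap_one(struct vm_area_struct *vma, unsigned long addr,
                      pte_t *ptep, pte_t newpte)
{
    set_pte(ptep, newpte);      /* update the PTE (helper assumed from pgtable.h) */
    flush_tlb_page(vma, addr);  /* xen_invlpg() locally, IPIs on SMP */
}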
- */ - -#ifndef CONFIG_SMP - -#define flush_tlb() __flush_tlb() -#define flush_tlb_all() __flush_tlb_all() -#define local_flush_tlb() __flush_tlb() - -static inline void flush_tlb_mm(struct mm_struct *mm) -{ - if (mm == current->active_mm) - __flush_tlb(); -} - -static inline void flush_tlb_page(struct vm_area_struct *vma, - unsigned long addr) -{ - if (vma->vm_mm == current->active_mm) - __flush_tlb_one(addr); -} - -static inline void flush_tlb_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - if (vma->vm_mm == current->active_mm) - __flush_tlb(); -} - -#else - -#include <asm/smp.h> - -#define local_flush_tlb() \ - __flush_tlb() - -extern void flush_tlb_all(void); -extern void flush_tlb_current_task(void); -extern void flush_tlb_mm(struct mm_struct *); -extern void flush_tlb_page(struct vm_area_struct *, unsigned long); - -#define flush_tlb() flush_tlb_current_task() - -static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end) -{ - flush_tlb_mm(vma->vm_mm); -} - -#define TLBSTATE_OK 1 -#define TLBSTATE_LAZY 2 - -struct tlb_state -{ - struct mm_struct *active_mm; - int state; - char __cacheline_padding[L1_CACHE_BYTES-8]; -}; -DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate); - - -#endif - -#define flush_tlb_kernel_range(start, end) flush_tlb_all() - -static inline void flush_tlb_pgtables(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - /* i386 does not keep any page table caches in TLB */ -} - -#endif /* _I386_TLBFLUSH_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/vga.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/vga.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,20 +0,0 @@ -/* - * Access to VGA videoram - * - * (c) 1998 Martin Mares <mj@xxxxxx> - */ - -#ifndef _LINUX_ASM_VGA_H_ -#define _LINUX_ASM_VGA_H_ - -/* - * On the PC, we can just recalculate addresses and then - * access the videoram directly without any black magic. 
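For example, using the accessors defined just below, writing one character cell to the top-left of the text console is a two-byte store (sketch only; assumes text mode at the usual 0xb8000):

static void vga_put_topleft(char c)
{
    char *vram = (char *)VGA_MAP_MEM(0xb8000);

    vga_writeb(c, vram);         /* character byte */
    vga_writeb(0x07, vram + 1);  /* attribute: light grey on black */
}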
- */ - -#define VGA_MAP_MEM(x) (unsigned long)isa_bus_to_virt(x) - -#define vga_readb(x) (*(x)) -#define vga_writeb(x,y) (*(y) = (x)) - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/arch_hooks.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/arch_hooks.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,27 +0,0 @@ -#ifndef _ASM_ARCH_HOOKS_H -#define _ASM_ARCH_HOOKS_H - -#include <linux/interrupt.h> - -/* - * linux/include/asm/arch_hooks.h - * - * define the architecture specific hooks - */ - -/* these aren't arch hooks, they are generic routines - * that can be used by the hooks */ -extern void init_ISA_irqs(void); -extern void apic_intr_init(void); -extern void smp_intr_init(void); -extern irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs); - -/* these are the defined hooks */ -extern void intr_init_hook(void); -extern void pre_intr_init_hook(void); -extern void pre_setup_arch_hook(void); -extern void trap_init_hook(void); -extern void time_init_hook(void); -extern void mca_nmi_hook(void); - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/bootsetup.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/bootsetup.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,41 +0,0 @@ - -#ifndef _X86_64_BOOTSETUP_H -#define _X86_64_BOOTSETUP_H 1 - -extern char x86_boot_params[2048]; - -/* - * This is set up by the setup-routine at boot-time - */ -#define PARAM ((unsigned char *)x86_boot_params) -#define SCREEN_INFO (*(struct screen_info *) (PARAM+0)) -#define EXT_MEM_K (*(unsigned short *) (PARAM+2)) -#define ALT_MEM_K (*(unsigned int *) (PARAM+0x1e0)) -#define E820_MAP_NR (*(char*) (PARAM+E820NR)) -#define E820_MAP ((struct e820entry *) (PARAM+E820MAP)) -#define APM_BIOS_INFO (*(struct apm_bios_info *) (PARAM+0x40)) -#define DRIVE_INFO (*(struct drive_info_struct *) (PARAM+0x80)) -#define SYS_DESC_TABLE (*(struct sys_desc_table_struct*)(PARAM+0xa0)) -#define MOUNT_ROOT_RDONLY (*(unsigned short *) (PARAM+0x1F2)) -#define RAMDISK_FLAGS (*(unsigned short *) (PARAM+0x1F8)) -#define SAVED_VIDEO_MODE (*(unsigned short *) (PARAM+0x1FA)) -#define ORIG_ROOT_DEV (*(unsigned short *) (PARAM+0x1FC)) -#define AUX_DEVICE_INFO (*(unsigned char *) (PARAM+0x1FF)) -#define LOADER_TYPE (*(unsigned char *) (PARAM+0x210)) -#define KERNEL_START (*(unsigned int *) (PARAM+0x214)) - -#define INITRD_START (__pa(xen_start_info.mod_start)) -#define INITRD_SIZE (xen_start_info.mod_len) -#define EDID_INFO (*(struct edid_info *) (PARAM+0x440)) - -#define EDD_NR (*(unsigned char *) (PARAM+EDDNR)) -#define EDD_MBR_SIG_NR (*(unsigned char *) (PARAM+EDD_MBR_SIG_NR_BUF)) -#define EDD_MBR_SIGNATURE ((unsigned int *) (PARAM+EDD_MBR_SIG_BUF)) -#define EDD_BUF ((struct edd_info *) (PARAM+EDDBUF)) -#define COMMAND_LINE saved_command_line - -#define RAMDISK_IMAGE_START_MASK 0x07FF -#define RAMDISK_PROMPT_FLAG 0x8000 -#define RAMDISK_LOAD_FLAG 0x4000 - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/desc.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/desc.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,240 +0,0 @@ -/* Written 2000 by Andi Kleen */ -#ifndef __ARCH_DESC_H -#define __ARCH_DESC_H - -#include <linux/threads.h> -#include <asm/ldt.h> - -#ifndef __ASSEMBLY__ - -#include <linux/string.h> -#include <asm/segment.h> -#include <asm/mmu.h> - -// 8 byte segment descriptor -struct 
desc_struct { - u16 limit0; - u16 base0; - unsigned base1 : 8, type : 4, s : 1, dpl : 2, p : 1; - unsigned limit : 4, avl : 1, l : 1, d : 1, g : 1, base2 : 8; -} __attribute__((packed)); - -struct n_desc_struct { - unsigned int a,b; -}; - -enum { - GATE_INTERRUPT = 0xE, - GATE_TRAP = 0xF, - GATE_CALL = 0xC, -}; - -// 16byte gate -struct gate_struct { - u16 offset_low; - u16 segment; - unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1; - u16 offset_middle; - u32 offset_high; - u32 zero1; -} __attribute__((packed)); - -#define PTR_LOW(x) ((unsigned long)(x) & 0xFFFF) -#define PTR_MIDDLE(x) (((unsigned long)(x) >> 16) & 0xFFFF) -#define PTR_HIGH(x) ((unsigned long)(x) >> 32) - -enum { - DESC_TSS = 0x9, - DESC_LDT = 0x2, -}; - -// LDT or TSS descriptor in the GDT. 16 bytes. -struct ldttss_desc { - u16 limit0; - u16 base0; - unsigned base1 : 8, type : 5, dpl : 2, p : 1; - unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8; - u32 base3; - u32 zero1; -} __attribute__((packed)); - -struct desc_ptr { - unsigned short size; - unsigned long address; -} __attribute__((packed)) ; - -extern struct desc_ptr idt_descr, cpu_gdt_descr[NR_CPUS]; - -extern struct desc_struct cpu_gdt_table[NR_CPUS][GDT_ENTRIES]; - -#define get_cpu_gdt_table(_cpu) ((struct desc_struct *)(cpu_gdt_descr[(_cpu)].address)) - -#define load_TR_desc() asm volatile("ltr %w0"::"r" (GDT_ENTRY_TSS*8)) -#define load_LDT_desc() asm volatile("lldt %w0"::"r" (GDT_ENTRY_LDT*8)) - -static inline void clear_LDT(void) -{ - int cpu = get_cpu(); - - /* - * NB. We load the default_ldt for lcall7/27 handling on demand, as - * it slows down context switching. Noone uses it anyway. - */ - cpu = cpu; /* XXX avoid compiler warning */ - xen_set_ldt(0UL, 0); - put_cpu(); -} - -/* - * This is the ldt that every process will get unless we need - * something other than this. 
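/* Illustrative sketch (not from the patch): how the PTR_LOW, PTR_MIDDLE
 * and PTR_HIGH macros above scatter a 64-bit handler address across the
 * offset_low/offset_middle/offset_high fields of a 16-byte gate; the
 * handler value below is hypothetical. */
#include <stdint.h>
#include <assert.h>

static void demo_gate_packing(void)
{
        uint64_t func = 0xffffffff80101234ULL;      /* hypothetical handler */
        uint16_t lo   = func & 0xFFFF;              /* PTR_LOW    */
        uint16_t mid  = (func >> 16) & 0xFFFF;      /* PTR_MIDDLE */
        uint32_t hi   = func >> 32;                 /* PTR_HIGH   */

        /* the CPU reassembles the entry point from the three fields */
        assert((((uint64_t)hi << 32) | ((uint64_t)mid << 16) | lo) == func);
}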
- */ -extern struct desc_struct default_ldt[]; -extern struct gate_struct idt_table[]; - -static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsigned dpl, unsigned ist) -{ - struct gate_struct s; - s.offset_low = PTR_LOW(func); - s.segment = __KERNEL_CS; - s.ist = ist; - s.p = 1; - s.dpl = dpl; - s.zero0 = 0; - s.zero1 = 0; - s.type = type; - s.offset_middle = PTR_MIDDLE(func); - s.offset_high = PTR_HIGH(func); - /* does not need to be atomic because it is only done once at setup time */ - memcpy(adr, &s, 16); -} - -static inline void set_intr_gate(int nr, void *func) -{ - _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0); -} - -static inline void set_intr_gate_ist(int nr, void *func, unsigned ist) -{ - _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist); -} - -static inline void set_system_gate(int nr, void *func) -{ - _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0); -} - -static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned type, - unsigned size) -{ - struct ldttss_desc d; - memset(&d,0,sizeof(d)); - d.limit0 = size & 0xFFFF; - d.base0 = PTR_LOW(tss); - d.base1 = PTR_MIDDLE(tss) & 0xFF; - d.type = type; - d.p = 1; - d.limit1 = (size >> 16) & 0xF; - d.base2 = (PTR_MIDDLE(tss) >> 8) & 0xFF; - d.base3 = PTR_HIGH(tss); - memcpy(ptr, &d, 16); -} - -static inline void set_tss_desc(unsigned cpu, void *addr) -{ - set_tssldt_descriptor((struct ldttss_desc *)&get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS], - (unsigned long)addr, - DESC_TSS, - sizeof(struct tss_struct) - 1); -} - -static inline void set_ldt_desc(unsigned cpu, void *addr, int size) -{ - set_tssldt_descriptor((struct ldttss_desc *)&get_cpu_gdt_table(cpu)[GDT_ENTRY_LDT], - (unsigned long)addr, - DESC_LDT, size * 8 - 1); -} - -static inline void set_seg_base(unsigned cpu, int entry, void *base) -{ - struct desc_struct *d = (struct desc_struct *)&get_cpu_gdt_table(cpu)[entry]; - u32 addr = (u32)(u64)base; - BUG_ON((u64)base >> 32); - d->base0 = addr & 0xffff; - d->base1 = (addr >> 16) & 0xff; - d->base2 = (addr >> 24) & 0xff; -} - -#define LDT_entry_a(info) \ - ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff)) -/* Don't allow setting of the lm bit. It is useless anyways because - 64bit system calls require __USER_CS. */ -#define LDT_entry_b(info) \ - (((info)->base_addr & 0xff000000) | \ - (((info)->base_addr & 0x00ff0000) >> 16) | \ - ((info)->limit & 0xf0000) | \ - (((info)->read_exec_only ^ 1) << 9) | \ - ((info)->contents << 10) | \ - (((info)->seg_not_present ^ 1) << 15) | \ - ((info)->seg_32bit << 22) | \ - ((info)->limit_in_pages << 23) | \ - ((info)->useable << 20) | \ - /* ((info)->lm << 21) | */ \ - 0x7000) - -#define LDT_empty(info) (\ - (info)->base_addr == 0 && \ - (info)->limit == 0 && \ - (info)->contents == 0 && \ - (info)->read_exec_only == 1 && \ - (info)->seg_32bit == 0 && \ - (info)->limit_in_pages == 0 && \ - (info)->seg_not_present == 1 && \ - (info)->useable == 0 && \ - (info)->lm == 0) - -#if TLS_SIZE != 24 -# error update this code. 
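/* Illustrative sketch (not from the patch): the two descriptor words
 * LDT_entry_a/LDT_entry_b above would produce for a flat 32-bit data
 * segment; struct ldt_info is a hypothetical stand-in carrying only the
 * fields the macros consume. */
#include <stdint.h>

struct ldt_info {
        uint32_t base_addr, limit;
        unsigned contents:2, read_exec_only:1, seg_32bit:1,
                 limit_in_pages:1, seg_not_present:1, useable:1;
};

static void demo_ldt_entry(uint32_t *a, uint32_t *b)
{
        struct ldt_info i = { .base_addr = 0, .limit = 0xfffff,
                              .seg_32bit = 1, .limit_in_pages = 1 };

        *a = ((i.base_addr & 0x0000ffff) << 16) | (i.limit & 0x0ffff);
        *b = (i.base_addr & 0xff000000)         |
             ((i.base_addr & 0x00ff0000) >> 16) |
             (i.limit & 0xf0000)                |
             ((i.read_exec_only ^ 1) << 9)      |
             (i.contents << 10)                 |
             ((i.seg_not_present ^ 1) << 15)    |
             (i.seg_32bit << 22)                |
             (i.limit_in_pages << 23)           |
             (i.useable << 20)                  |
             0x7000;   /* S=1 (code/data segment), DPL=3, as in the macro */
}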
-#endif - -static inline void load_TLS(struct thread_struct *t, unsigned int cpu) -{ -#if 0 - u64 *gdt = (u64 *)(cpu_gdt_table[cpu] + GDT_ENTRY_TLS_MIN); - gdt[0] = t->tls_array[0]; - gdt[1] = t->tls_array[1]; - gdt[2] = t->tls_array[2]; -#endif -#define C(i) \ - HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), t->tls_array[i]) - - C(0); C(1); C(2); -#undef C -} - -/* - * load one particular LDT into the current CPU - */ -extern inline void load_LDT_nolock (mm_context_t *pc, int cpu) -{ - void *segments = pc->ldt; - int count = pc->size; - - if (likely(!count)) - segments = NULL; - - xen_set_ldt((unsigned long)segments, count); -} - -static inline void load_LDT(mm_context_t *pc) -{ - int cpu = get_cpu(); - load_LDT_nolock(pc, cpu); - put_cpu(); -} - -extern struct desc_ptr idt_descr; - -#endif /* !__ASSEMBLY__ */ - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/dma-mapping.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/dma-mapping.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,136 +0,0 @@ -#ifndef _X8664_DMA_MAPPING_H -#define _X8664_DMA_MAPPING_H 1 - -/* - * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for - * documentation. - */ - -#include <linux/config.h> - -#include <asm/scatterlist.h> -#include <asm/io.h> -#include <asm/swiotlb.h> - -extern dma_addr_t bad_dma_address; -#define dma_mapping_error(x) \ - (swiotlb ? swiotlb_dma_mapping_error(x) : ((x) == bad_dma_address)) - -void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, - unsigned gfp); -void dma_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle); - -#ifdef CONFIG_GART_IOMMU - -extern dma_addr_t dma_map_single(struct device *hwdev, void *ptr, size_t size, - int direction); -extern void dma_unmap_single(struct device *dev, dma_addr_t addr,size_t size, - int direction); - -#else - -/* No IOMMU */ - -static inline dma_addr_t dma_map_single(struct device *hwdev, void *ptr, - size_t size, int direction) -{ - dma_addr_t addr; - - if (direction == DMA_NONE) - out_of_line_bug(); - addr = virt_to_machine(ptr); - - if ((addr+size) & ~*hwdev->dma_mask) - out_of_line_bug(); - return addr; -} - -static inline void dma_unmap_single(struct device *hwdev, dma_addr_t dma_addr, - size_t size, int direction) -{ - if (direction == DMA_NONE) - out_of_line_bug(); - /* Nothing to do */ -} -#endif - -#define dma_map_page(dev,page,offset,size,dir) \ - dma_map_single((dev), page_address(page)+(offset), (size), (dir)) - -static inline void dma_sync_single_for_cpu(struct device *hwdev, - dma_addr_t dma_handle, - size_t size, int direction) -{ - if (direction == DMA_NONE) - out_of_line_bug(); - - if (swiotlb) - return swiotlb_sync_single_for_cpu(hwdev,dma_handle,size,direction); - - flush_write_buffers(); -} - -static inline void dma_sync_single_for_device(struct device *hwdev, - dma_addr_t dma_handle, - size_t size, int direction) -{ - if (direction == DMA_NONE) - out_of_line_bug(); - - if (swiotlb) - return swiotlb_sync_single_for_device(hwdev,dma_handle,size,direction); - - flush_write_buffers(); -} - -static inline void dma_sync_sg_for_cpu(struct device *hwdev, - struct scatterlist *sg, - int nelems, int direction) -{ - if (direction == DMA_NONE) - out_of_line_bug(); - - if (swiotlb) - return swiotlb_sync_sg_for_cpu(hwdev,sg,nelems,direction); - - flush_write_buffers(); -} - -static inline void dma_sync_sg_for_device(struct device *hwdev, - 
struct scatterlist *sg, - int nelems, int direction) -{ - if (direction == DMA_NONE) - out_of_line_bug(); - - if (swiotlb) - return swiotlb_sync_sg_for_device(hwdev,sg,nelems,direction); - - flush_write_buffers(); -} - -extern int dma_map_sg(struct device *hwdev, struct scatterlist *sg, - int nents, int direction); -extern void dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, - int nents, int direction); - -#define dma_unmap_page dma_unmap_single - -extern int dma_supported(struct device *hwdev, u64 mask); -extern int dma_get_cache_alignment(void); -#define dma_is_consistent(h) 1 - -static inline int dma_set_mask(struct device *dev, u64 mask) -{ - if (!dev->dma_mask || !dma_supported(dev, mask)) - return -EIO; - *dev->dma_mask = mask; - return 0; -} - -static inline void dma_cache_sync(void *vaddr, size_t size, enum dma_data_direction dir) -{ - flush_write_buffers(); -} -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/fixmap.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/fixmap.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,114 +0,0 @@ -/* - * fixmap.h: compile-time virtual memory allocation - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 1998 Ingo Molnar - */ - -#ifndef _ASM_FIXMAP_H -#define _ASM_FIXMAP_H - -#include <linux/config.h> -#include <linux/kernel.h> -#include <asm/apicdef.h> -#include <asm-xen/gnttab.h> -#include <asm/page.h> -#include <asm/vsyscall.h> -#include <asm/vsyscall32.h> -#include <asm/acpi.h> - -/* - * Here we define all the compile-time 'special' virtual - * addresses. The point is to have a constant address at - * compile time, but to set the physical address only - * in the boot process. - * - * these 'compile-time allocated' memory buffers are - * fixed-size 4k pages. (or larger if used with an increment - * highger than 1) use fixmap_set(idx,phys) to associate - * physical memory with fixmap indices. - * - * TLB entries of such buffers will not be flushed across - * task switches. - */ - -enum fixed_addresses { - VSYSCALL_LAST_PAGE, - VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1, - VSYSCALL_HPET, - FIX_HPET_BASE, -#ifdef CONFIG_X86_LOCAL_APIC - FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ -#endif -#ifdef CONFIG_X86_IO_APIC - FIX_IO_APIC_BASE_0, - FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1, -#endif -#ifdef CONFIG_ACPI_BOOT - FIX_ACPI_BEGIN, - FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, - FIX_ACPI_RSDP_PAGE, -#endif - FIX_SHARED_INFO, - FIX_GNTTAB_BEGIN, - FIX_GNTTAB_END = FIX_GNTTAB_BEGIN + NR_GRANT_FRAMES - 1, -#ifdef CONFIG_XEN_PHYSDEV_ACCESS -#define NR_FIX_ISAMAPS 256 - FIX_ISAMAP_END, - FIX_ISAMAP_BEGIN = FIX_ISAMAP_END + NR_FIX_ISAMAPS - 1, -#endif - __end_of_fixed_addresses -}; - -extern void __set_fixmap (enum fixed_addresses idx, - unsigned long phys, pgprot_t flags); - -#define set_fixmap(idx, phys) \ - __set_fixmap(idx, phys, PAGE_KERNEL) -/* - * Some hardware wants to get fixmapped without caching. 
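/* Illustrative sketch (not from the patch): every index in the enum
 * above maps to a constant virtual address counting down from the top
 * of the fixmap area, i.e. __fix_to_virt(idx) = FIXADDR_TOP -
 * (idx << PAGE_SHIFT). The real FIXADDR_TOP definition follows further
 * below; the value used here is hypothetical. */
#define DEMO_PAGE_SHIFT  12
#define DEMO_FIXADDR_TOP 0xffffffffff5ff000UL   /* hypothetical */

static unsigned long demo_fix_to_virt(unsigned int idx)
{
        return DEMO_FIXADDR_TOP - ((unsigned long)idx << DEMO_PAGE_SHIFT);
}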
- */ -#define set_fixmap_nocache(idx, phys) \ - __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE) - -#define clear_fixmap(idx) \ - __set_fixmap(idx, 0, __pgprot(0)) - -#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE) -#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) -#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) - -/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */ -#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL) -#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE) - -#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) - -extern void __this_fixmap_does_not_exist(void); - -/* - * 'index to address' translation. If anyone tries to use the idx - * directly without translation, we catch the bug with a NULL-deference - * kernel oops. Illegal ranges of incoming indices are caught too. - */ -extern inline unsigned long fix_to_virt(const unsigned int idx) -{ - /* - * this branch gets completely eliminated after inlining, - * except when someone tries to use fixaddr indices in an - * illegal way. (such as mixing up address types or using - * out-of-range indices). - * - * If it doesn't get removed, the linker will complain - * loudly with a reasonably clear error message.. - */ - if (idx >= __end_of_fixed_addresses) - __this_fixmap_does_not_exist(); - - return __fix_to_virt(idx); -} - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/floppy.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/floppy.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,204 +0,0 @@ -/* - * Architecture specific parts of the Floppy driver - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 1995 - * - * Modifications for Xen are Copyright (c) 2004, Keir Fraser. - */ -#ifndef __ASM_XEN_X86_64_FLOPPY_H -#define __ASM_XEN_X86_64_FLOPPY_H - -#include <linux/vmalloc.h> - - -/* - * The DMA channel used by the floppy controller cannot access data at - * addresses >= 16MB - * - * Went back to the 1MB limit, as some people had problems with the floppy - * driver otherwise. It doesn't matter much for performance anyway, as most - * floppy accesses go through the track buffer. - */ -#define _CROSS_64KB(a,s,vdma) \ -(!(vdma) && ((unsigned long)(a)/K_64 != ((unsigned long)(a) + (s) - 1) / K_64)) - -#include <linux/vmalloc.h> - -/* XEN: Hit DMA paths on the head. This trick from asm-m68k/floppy.h. 
*/ -#include <asm/dma.h> -#undef MAX_DMA_ADDRESS -#define MAX_DMA_ADDRESS 0 -#define CROSS_64KB(a,s) (0) - -#define fd_inb(port) inb_p(port) -#define fd_outb(value,port) outb_p(value,port) - -#define fd_request_dma() (0) -#define fd_free_dma() ((void)0) -#define fd_enable_irq() enable_irq(FLOPPY_IRQ) -#define fd_disable_irq() disable_irq(FLOPPY_IRQ) -#define fd_free_irq() free_irq(FLOPPY_IRQ, NULL) -#define fd_get_dma_residue() vdma_get_dma_residue(FLOPPY_DMA) -#define fd_dma_mem_alloc(size) vdma_mem_alloc(size) -#define fd_dma_mem_free(addr, size) vdma_mem_free(addr, size) -#define fd_dma_setup(addr, size, mode, io) vdma_dma_setup(addr, size, mode, io) - -static int virtual_dma_count; -static int virtual_dma_residue; -static char *virtual_dma_addr; -static int virtual_dma_mode; -static int doing_pdma; - -static irqreturn_t floppy_hardint(int irq, void *dev_id, struct pt_regs * regs) -{ - register unsigned char st; - -#undef TRACE_FLPY_INT - -#ifdef TRACE_FLPY_INT - static int calls=0; - static int bytes=0; - static int dma_wait=0; -#endif - if (!doing_pdma) - return floppy_interrupt(irq, dev_id, regs); - -#ifdef TRACE_FLPY_INT - if(!calls) - bytes = virtual_dma_count; -#endif - - { - register int lcount; - register char *lptr; - - st = 1; - for(lcount=virtual_dma_count, lptr=virtual_dma_addr; - lcount; lcount--, lptr++) { - st=inb(virtual_dma_port+4) & 0xa0 ; - if(st != 0xa0) - break; - if(virtual_dma_mode) - outb_p(*lptr, virtual_dma_port+5); - else - *lptr = inb_p(virtual_dma_port+5); - } - virtual_dma_count = lcount; - virtual_dma_addr = lptr; - st = inb(virtual_dma_port+4); - } - -#ifdef TRACE_FLPY_INT - calls++; -#endif - if(st == 0x20) - return IRQ_HANDLED; - if(!(st & 0x20)) { - virtual_dma_residue += virtual_dma_count; - virtual_dma_count=0; -#ifdef TRACE_FLPY_INT - printk("count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n", - virtual_dma_count, virtual_dma_residue, calls, bytes, - dma_wait); - calls = 0; - dma_wait=0; -#endif - doing_pdma = 0; - floppy_interrupt(irq, dev_id, regs); - return IRQ_HANDLED; - } -#ifdef TRACE_FLPY_INT - if(!virtual_dma_count) - dma_wait++; -#endif - return IRQ_HANDLED; -} - -static void fd_disable_dma(void) -{ - doing_pdma = 0; - virtual_dma_residue += virtual_dma_count; - virtual_dma_count=0; -} - -static int vdma_get_dma_residue(unsigned int dummy) -{ - return virtual_dma_count + virtual_dma_residue; -} - - -static int fd_request_irq(void) -{ - return request_irq(FLOPPY_IRQ, floppy_hardint,SA_INTERRUPT, - "floppy", NULL); -} - - -static unsigned long vdma_mem_alloc(unsigned long size) -{ - return (unsigned long) vmalloc(size); - -} - -static void vdma_mem_free(unsigned long addr, unsigned long size) -{ - vfree((void *)addr); -} - -static int vdma_dma_setup(char *addr, unsigned long size, int mode, int io) -{ - doing_pdma = 1; - virtual_dma_port = io; - virtual_dma_mode = (mode == DMA_MODE_WRITE); - virtual_dma_addr = addr; - virtual_dma_count = size; - virtual_dma_residue = 0; - return 0; -} - -/* XEN: This trick to force 'virtual DMA' is from include/asm-m68k/floppy.h. */ -#define FDC1 xen_floppy_init() -static int FDC2 = -1; - -static int xen_floppy_init(void) -{ - use_virtual_dma = 1; - can_use_virtual_dma = 1; - return 0x340; -} - -/* - * Floppy types are stored in the rtc's CMOS RAM and so rtc_lock - * is needed to prevent corrupted CMOS RAM in case "insmod floppy" - * coincides with another rtc CMOS user. Paul G. 
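/* Illustrative sketch (not from the patch): the point of redefining
 * MAX_DMA_ADDRESS to 0 above is that any "does this buffer sit below
 * the DMA limit?" test in the generic floppy driver now fails, steering
 * every transfer to the PIO path serviced by floppy_hardint().
 * Roughly, with a hypothetical helper name: */
static int demo_buffer_dma_reachable(void *addr, unsigned long size)
{
        unsigned long max_dma_address = 0;   /* as forced above */

        return ((unsigned long)addr + size) <= max_dma_address; /* never true */
}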
- */ -#define FLOPPY0_TYPE ({ \ - unsigned long flags; \ - unsigned char val; \ - spin_lock_irqsave(&rtc_lock, flags); \ - val = (CMOS_READ(0x10) >> 4) & 15; \ - spin_unlock_irqrestore(&rtc_lock, flags); \ - val; \ -}) - -#define FLOPPY1_TYPE ({ \ - unsigned long flags; \ - unsigned char val; \ - spin_lock_irqsave(&rtc_lock, flags); \ - val = CMOS_READ(0x10) & 15; \ - spin_unlock_irqrestore(&rtc_lock, flags); \ - val; \ -}) - -#define N_FDC 2 -#define N_DRIVE 8 - -#define FLOPPY_MOTOR_MASK 0xf0 - -#define EXTRA_FLOPPY_PARAMS - -#endif /* __ASM_XEN_X86_64_FLOPPY_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/hypercall.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/hypercall.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,505 +0,0 @@ -/****************************************************************************** - * hypercall.h - * - * Linux-specific hypervisor handling. - * - * Copyright (c) 2002-2004, K A Fraser - * - * This file may be distributed separately from the Linux kernel, or - * incorporated into other software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ -/* - * Benjamin Liu <benjamin.liu@xxxxxxxxx> - * Jun Nakajima <jun.nakajima@xxxxxxxxx> - * Ported to x86-64. - * - */ - -#ifndef __HYPERCALL_H__ -#define __HYPERCALL_H__ -#include <asm-xen/xen-public/xen.h> - -#define __syscall_clobber "r11","rcx","memory" - -/* - * Assembler stubs for hyper-calls. 
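/* Illustrative sketch (not from the patch): the register convention
 * shared by all the stubs below -- hypercall number in %rax, arguments
 * in %rdi/%rsi/%rdx, result returned in %rax, with %rcx and %r11
 * clobbered. Written here as a generic three-argument call, assuming
 * TRAP_INSTR expands to the syscall instruction. */
static inline long demo_hypercall3(unsigned long op, unsigned long a1,
                                   unsigned long a2, unsigned long a3)
{
        long ret;

        __asm__ __volatile__ (
                "syscall"
                : "=a" (ret)
                : "0" (op), "D" (a1), "S" (a2), "d" (a3)
                : "r11", "rcx", "memory");
        return ret;
}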
- */ -static inline int -HYPERVISOR_set_trap_table( - trap_info_t *table) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_set_trap_table), "D" (table) - : __syscall_clobber ); - - return ret; -} - -static inline int -HYPERVISOR_mmu_update( - mmu_update_t *req, int count, int *success_count, domid_t domid) -{ - int ret; - - __asm__ __volatile__ ( - "movq %5, %%r10;" TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_mmu_update), "D" (req), "S" ((long)count), - "d" (success_count), "g" ((unsigned long)domid) - : __syscall_clobber, "r10" ); - - return ret; -} - -static inline int -HYPERVISOR_mmuext_op( - struct mmuext_op *op, int count, int *success_count, domid_t domid) -{ - int ret; - - __asm__ __volatile__ ( - "movq %5, %%r10;" TRAP_INSTR - : "=a" (ret) - : "0" (__HYPERVISOR_mmuext_op), "D" (op), "S" ((long)count), - "d" (success_count), "g" ((unsigned long)domid) - : __syscall_clobber, "r10" ); - - return ret; -} - -static inline int -HYPERVISOR_set_gdt( - unsigned long *frame_list, int entries) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_set_gdt), "D" (frame_list), "S" ((long)entries) - : __syscall_clobber ); - - - return ret; -} -static inline int -HYPERVISOR_stack_switch( - unsigned long ss, unsigned long esp) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_stack_switch), "D" (ss), "S" (esp) - : __syscall_clobber ); - - return ret; -} - -static inline int -HYPERVISOR_set_callbacks( - unsigned long event_address, unsigned long failsafe_address, - unsigned long syscall_address) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_set_callbacks), "D" (event_address), - "S" (failsafe_address), "d" (syscall_address) - : __syscall_clobber ); - - return ret; -} - -static inline int -HYPERVISOR_fpu_taskswitch( - int set) -{ - int ret; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) : "0" ((unsigned long)__HYPERVISOR_fpu_taskswitch), - "D" ((unsigned long) set) : __syscall_clobber ); - - return ret; -} - -static inline int -HYPERVISOR_yield( - void) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_sched_op), "D" ((unsigned long)SCHEDOP_yield) - : __syscall_clobber ); - - return ret; -} - -static inline int -HYPERVISOR_block( - void) -{ - int ret; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_sched_op), "D" ((unsigned long)SCHEDOP_block) - : __syscall_clobber ); - - return ret; -} - -static inline int -HYPERVISOR_shutdown( - void) -{ - int ret; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_sched_op), - "D" ((unsigned long)(SCHEDOP_shutdown | (SHUTDOWN_poweroff << SCHEDOP_reasonshift))) - : __syscall_clobber ); - - return ret; -} - -static inline int -HYPERVISOR_reboot( - void) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_sched_op), - "D" ((unsigned long)(SCHEDOP_shutdown | (SHUTDOWN_reboot << SCHEDOP_reasonshift))) - : __syscall_clobber ); - - return ret; -} - -static inline int -HYPERVISOR_suspend( - unsigned long srec) -{ - int ret; - - /* NB. On suspend, control software expects a suspend record in %esi. 
*/ - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_sched_op), - "D" ((unsigned long)(SCHEDOP_shutdown | (SHUTDOWN_suspend << SCHEDOP_reasonshift))), - "S" (srec) - : __syscall_clobber ); - - return ret; -} - -/* - * We can have the timeout value in a single argument for the hypercall, but - * that will break the common code. - */ -static inline long -HYPERVISOR_set_timer_op( - u64 timeout) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_set_timer_op), - "D" (timeout) - : __syscall_clobber ); - - return ret; -} - -static inline int -HYPERVISOR_dom0_op( - dom0_op_t *dom0_op) -{ - int ret; - - dom0_op->interface_version = DOM0_INTERFACE_VERSION; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_dom0_op), "D" (dom0_op) - : __syscall_clobber ); - - return ret; -} - -static inline int -HYPERVISOR_set_debugreg( - int reg, unsigned long value) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_set_debugreg), "D" ((unsigned long)reg), "S" (value) - : __syscall_clobber ); - - return ret; -} - -static inline unsigned long -HYPERVISOR_get_debugreg( - int reg) -{ - unsigned long ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_get_debugreg), "D" ((unsigned long)reg) - : __syscall_clobber ); - - return ret; -} - -static inline int -HYPERVISOR_update_descriptor( - unsigned long ma, unsigned long word) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_update_descriptor), "D" (ma), - "S" (word) - : __syscall_clobber ); - - return ret; -} - -static inline int -HYPERVISOR_dom_mem_op( - unsigned int op, unsigned long *extent_list, - unsigned long nr_extents, unsigned int extent_order) -{ - int ret; - - __asm__ __volatile__ ( - "movq %5,%%r10; movq %6,%%r8;" TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_dom_mem_op), "D" ((unsigned long)op), "S" (extent_list), - "d" (nr_extents), "g" ((unsigned long) extent_order), "g" ((unsigned long) DOMID_SELF) - : __syscall_clobber,"r8","r10"); - - return ret; -} - -static inline int -HYPERVISOR_multicall( - void *call_list, int nr_calls) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_multicall), "D" (call_list), "S" ((unsigned long)nr_calls) - : __syscall_clobber); - - return ret; -} - -static inline int -HYPERVISOR_update_va_mapping( - unsigned long page_nr, pte_t new_val, unsigned long flags) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_update_va_mapping), - "D" (page_nr), "S" (new_val.pte), "d" (flags) - : __syscall_clobber); - - return ret; -} - -static inline int -HYPERVISOR_event_channel_op( - void *op) -{ - int ret; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_event_channel_op), "D" (op) - : __syscall_clobber); - - return ret; -} - -static inline int -HYPERVISOR_xen_version( - int cmd) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_xen_version), "D" ((unsigned long)cmd) - : __syscall_clobber); - - return ret; -} - -static inline int -HYPERVISOR_console_io( - int cmd, int count, char *str) -{ - int ret; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_console_io), "D" ((unsigned long)cmd), "S" 
((unsigned long)count), "d" (str) - : __syscall_clobber); - - return ret; -} - -static inline int -HYPERVISOR_physdev_op( - void *physdev_op) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_physdev_op), "D" (physdev_op) - : __syscall_clobber); - - return ret; -} - -static inline int -HYPERVISOR_grant_table_op( - unsigned int cmd, void *uop, unsigned int count) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_grant_table_op), "D" ((unsigned long)cmd), "S" ((unsigned long)uop), "d" (count) - : __syscall_clobber); - - return ret; -} - -static inline int -HYPERVISOR_update_va_mapping_otherdomain( - unsigned long page_nr, pte_t new_val, unsigned long flags, domid_t domid) -{ - int ret; - - __asm__ __volatile__ ( - "movq %5, %%r10;" TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_update_va_mapping_otherdomain), - "D" (page_nr), "S" (new_val.pte), "d" (flags), "g" ((unsigned long)domid) - : __syscall_clobber,"r10"); - - return ret; -} - -static inline int -HYPERVISOR_vm_assist( - unsigned int cmd, unsigned int type) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_vm_assist), "D" ((unsigned long)cmd), "S" ((unsigned long)type) - : __syscall_clobber); - - return ret; -} - -static inline int -HYPERVISOR_switch_to_user(void) -{ - int ret; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) : "0" ((unsigned long)__HYPERVISOR_switch_to_user) : __syscall_clobber ); - - return ret; -} - -static inline int -HYPERVISOR_boot_vcpu( - unsigned long vcpu, vcpu_guest_context_t *ctxt) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" (__HYPERVISOR_boot_vcpu), "D" (vcpu), "S" (ctxt) - : __syscall_clobber); - - return ret; -} - -static inline int -HYPERVISOR_set_segment_base( - int reg, unsigned long value) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_set_segment_base), "D" ((unsigned long)reg), "S" (value) - : __syscall_clobber ); - - return ret; -} - -#endif /* __HYPERCALL_H__ */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/io.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/io.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,365 +0,0 @@ -#ifndef _ASM_IO_H -#define _ASM_IO_H - -#include <linux/config.h> -#include <asm/fixmap.h> -/* - * This file contains the definitions for the x86 IO instructions - * inb/inw/inl/outb/outw/outl and the "string versions" of the same - * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing" - * versions of the single-IO instructions (inb_p/inw_p/..). - * - * This file is not meant to be obfuscating: it's just complicated - * to (a) handle it all in a way that makes gcc able to optimize it - * as well as possible and (b) trying to avoid writing the same thing - * over and over again with slight variations and possibly making a - * mistake somewhere. - */ - -/* - * Thanks to James van Artsdalen for a better timing-fix than - * the two short jumps: using outb's to a nonexistent port seems - * to guarantee better timings even on fast machines. - * - * On the other hand, I'd like to be sure of a non-existent port: - * I feel a bit unsafe about using 0x80 (should be safe, though) - * - * Linus - */ - - /* - * Bit simplified and optimized by Jan Hubicka - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999. 
- * - * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added, - * isa_read[wl] and isa_write[wl] fixed - * - Arnaldo Carvalho de Melo <acme@xxxxxxxxxxxxxxxx> - */ - -#ifdef SLOW_IO_BY_JUMPING -#define __SLOW_DOWN_IO "\njmp 1f\n1:\tjmp 1f\n1:" -#else -#define __SLOW_DOWN_IO "\noutb %%al,$0x80" -#endif - -#ifdef REALLY_SLOW_IO -#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO -#else -#define __FULL_SLOW_DOWN_IO __SLOW_DOWN_IO -#endif - -/* - * Talk about misusing macros.. - */ -#define __OUT1(s,x) \ -extern inline void out##s(unsigned x value, unsigned short port) { - -#define __OUT2(s,s1,s2) \ -__asm__ __volatile__ ("out" #s " %" s1 "0,%" s2 "1" - -#define __OUT(s,s1,x) \ -__OUT1(s,x) __OUT2(s,s1,"w") : : "a" (value), "Nd" (port)); } \ -__OUT1(s##_p,x) __OUT2(s,s1,"w") __FULL_SLOW_DOWN_IO : : "a" (value), "Nd" (port));} \ - -#define __IN1(s) \ -extern inline RETURN_TYPE in##s(unsigned short port) { RETURN_TYPE _v; - -#define __IN2(s,s1,s2) \ -__asm__ __volatile__ ("in" #s " %" s2 "1,%" s1 "0" - -#define __IN(s,s1,i...) \ -__IN1(s) __IN2(s,s1,"w") : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \ -__IN1(s##_p) __IN2(s,s1,"w") __FULL_SLOW_DOWN_IO : "=a" (_v) : "Nd" (port) ,##i ); return _v; } \ - -#define __INS(s) \ -extern inline void ins##s(unsigned short port, void * addr, unsigned long count) \ -{ __asm__ __volatile__ ("rep ; ins" #s \ -: "=D" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); } - -#define __OUTS(s) \ -extern inline void outs##s(unsigned short port, const void * addr, unsigned long count) \ -{ __asm__ __volatile__ ("rep ; outs" #s \ -: "=S" (addr), "=c" (count) : "d" (port),"0" (addr),"1" (count)); } - -#define RETURN_TYPE unsigned char -__IN(b,"") -#undef RETURN_TYPE -#define RETURN_TYPE unsigned short -__IN(w,"") -#undef RETURN_TYPE -#define RETURN_TYPE unsigned int -__IN(l,"") -#undef RETURN_TYPE - -__OUT(b,"b",char) -__OUT(w,"w",short) -__OUT(l,,int) - -__INS(b) -__INS(w) -__INS(l) - -__OUTS(b) -__OUTS(w) -__OUTS(l) - -#define IO_SPACE_LIMIT 0xffff - -#if defined(__KERNEL__) && __x86_64__ - -#include <linux/vmalloc.h> - -#ifndef __i386__ -/* - * Change virtual addresses to physical addresses and vv. - * These are pretty trivial - */ -extern inline unsigned long virt_to_phys(volatile void * address) -{ - return __pa(address); -} - -extern inline void * phys_to_virt(unsigned long address) -{ - return __va(address); -} - - -#define virt_to_bus(_x) phys_to_machine(__pa(_x)) -#define bus_to_virt(_x) __va(machine_to_phys(_x)) -#endif - -/* - * Change "struct page" to physical address. 
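/* Illustrative sketch (not from the patch): what one instantiation of
 * the macro machinery above, __OUT(b,"b",char), boils down to -- the
 * plain outb() port write. */
static inline void demo_outb(unsigned char value, unsigned short port)
{
        __asm__ __volatile__ ("outb %b0,%w1" : : "a" (value), "Nd" (port));
}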
- */ -#ifdef CONFIG_DISCONTIGMEM -#include <asm/mmzone.h> -#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) -#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page))) - -#define bio_to_pseudophys(bio) (page_to_pseudophys(bio_page((bio))) + \ - (unsigned long) bio_offset((bio))) -#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \ - (unsigned long) (bv)->bv_offset) - -#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ - (((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) && \ - ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \ - bvec_to_pseudophys((vec2)))) -#else -// #define page_to_phys(page) ((page - mem_map) << PAGE_SHIFT) -#define page_to_pseudophys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT) -#define page_to_phys(page) (phys_to_machine(page_to_pseudophys(page))) - -#define bio_to_pseudophys(bio) (page_to_pseudophys(bio_page((bio))) + \ - (unsigned long) bio_offset((bio))) -#define bvec_to_pseudophys(bv) (page_to_pseudophys((bv)->bv_page) + \ - (unsigned long) (bv)->bv_offset) - -#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ - (((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) && \ - ((bvec_to_pseudophys((vec1)) + (vec1)->bv_len) == \ - bvec_to_pseudophys((vec2)))) -#endif - -#include <asm-generic/iomap.h> - -extern void __iomem *__ioremap(unsigned long offset, unsigned long size, unsigned long flags); - -extern inline void __iomem * ioremap (unsigned long offset, unsigned long size) -{ - return __ioremap(offset, size, 0); -} - -/* - * This one maps high address device memory and turns off caching for that area. - * it's useful if some control registers are in such an area and write combining - * or read caching is not desirable: - */ -extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size); -extern void iounmap(volatile void __iomem *addr); - -/* - * ISA I/O bus memory addresses are 1:1 with the physical address. - */ - -#define isa_virt_to_bus(_x) isa_virt_to_bus_is_UNSUPPORTED->x -#define isa_page_to_bus(_x) isa_page_to_bus_is_UNSUPPORTED->x -#ifdef CONFIG_XEN_PHYSDEV_ACCESS -#define isa_bus_to_virt(_x) (void *)(__fix_to_virt(FIX_ISAMAP_BEGIN) + (_x)) -#else -#define isa_bus_to_virt(_x) isa_bus_to_virt_needs_PRIVILEGED_BUILD -#endif -/* - * However PCI ones are not necessarily 1:1 and therefore these interfaces - * are forbidden in portable PCI drivers. - * - * Allow them on x86 for legacy drivers, though. - */ -#define virt_to_bus(_x) phys_to_machine(__pa(_x)) -#define bus_to_virt(_x) __va(machine_to_phys(_x)) - -/* - * readX/writeX() are used to access memory mapped devices. On some - * architectures the memory mapped IO stuff needs to be accessed - * differently. On the x86 architecture, we just read/write the - * memory location directly. 
- */ - -static inline __u8 __readb(const volatile void __iomem *addr) -{ - return *(__force volatile __u8 *)addr; -} -static inline __u16 __readw(const volatile void __iomem *addr) -{ - return *(__force volatile __u16 *)addr; -} -static inline __u32 __readl(const volatile void __iomem *addr) -{ - return *(__force volatile __u32 *)addr; -} -static inline __u64 __readq(const volatile void __iomem *addr) -{ - return *(__force volatile __u64 *)addr; -} -#define readb(x) __readb(x) -#define readw(x) __readw(x) -#define readl(x) __readl(x) -#define readq(x) __readq(x) -#define readb_relaxed(a) readb(a) -#define readw_relaxed(a) readw(a) -#define readl_relaxed(a) readl(a) -#define readq_relaxed(a) readq(a) -#define __raw_readb readb -#define __raw_readw readw -#define __raw_readl readl -#define __raw_readq readq - -#define mmiowb() - -#ifdef CONFIG_UNORDERED_IO -static inline void __writel(__u32 val, volatile void __iomem *addr) -{ - volatile __u32 __iomem *target = addr; - asm volatile("movnti %1,%0" - : "=m" (*target) - : "r" (val) : "memory"); -} - -static inline void __writeq(__u64 val, volatile void __iomem *addr) -{ - volatile __u64 __iomem *target = addr; - asm volatile("movnti %1,%0" - : "=m" (*target) - : "r" (val) : "memory"); -} -#else -static inline void __writel(__u32 b, volatile void __iomem *addr) -{ - *(__force volatile __u32 *)addr = b; -} -static inline void __writeq(__u64 b, volatile void __iomem *addr) -{ - *(__force volatile __u64 *)addr = b; -} -#endif -static inline void __writeb(__u8 b, volatile void __iomem *addr) -{ - *(__force volatile __u8 *)addr = b; -} -static inline void __writew(__u16 b, volatile void __iomem *addr) -{ - *(__force volatile __u16 *)addr = b; -} -#define writeq(val,addr) __writeq((val),(addr)) -#define writel(val,addr) __writel((val),(addr)) -#define writew(val,addr) __writew((val),(addr)) -#define writeb(val,addr) __writeb((val),(addr)) -#define __raw_writeb writeb -#define __raw_writew writew -#define __raw_writel writel -#define __raw_writeq writeq - -void __memcpy_fromio(void*,unsigned long,unsigned); -void __memcpy_toio(unsigned long,const void*,unsigned); - -static inline void memcpy_fromio(void *to, const volatile void __iomem *from, unsigned len) -{ - __memcpy_fromio(to,(unsigned long)from,len); -} -static inline void memcpy_toio(volatile void __iomem *to, const void *from, unsigned len) -{ - __memcpy_toio((unsigned long)to,from,len); -} - -void memset_io(volatile void __iomem *a, int b, size_t c); - -/* - * ISA space is 'always mapped' on a typical x86 system, no need to - * explicitly ioremap() it. The fact that the ISA IO space is mapped - * to PAGE_OFFSET is pure coincidence - it does not mean ISA values - * are physical addresses. The following constant pointer can be - * used as the IO-area pointer (it can be iounmapped as well, so the - * analogy with PCI is quite large): - */ -#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET)) - -#define isa_readb(a) readb(__ISA_IO_base + (a)) -#define isa_readw(a) readw(__ISA_IO_base + (a)) -#define isa_readl(a) readl(__ISA_IO_base + (a)) -#define isa_writeb(b,a) writeb(b,__ISA_IO_base + (a)) -#define isa_writew(w,a) writew(w,__ISA_IO_base + (a)) -#define isa_writel(l,a) writel(l,__ISA_IO_base + (a)) -#define isa_memset_io(a,b,c) memset_io(__ISA_IO_base + (a),(b),(c)) -#define isa_memcpy_fromio(a,b,c) memcpy_fromio((a),__ISA_IO_base + (b),(c)) -#define isa_memcpy_toio(a,b,c) memcpy_toio(__ISA_IO_base + (a),(b),(c)) - - -/* - * Again, x86-64 does not require mem IO specific function. 
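/* Illustrative sketch (not from the patch): typical use of the
 * ioremap()/readl()/writel() accessors defined above; the device
 * address and register offsets are hypothetical. */
static void demo_mmio(void)
{
        void __iomem *regs = ioremap(0xfebf0000UL, 0x1000); /* hypothetical BAR */

        if (!regs)
                return;
        writel(0x1, regs + 0x04);   /* hypothetical control register */
        (void)readl(regs + 0x08);   /* hypothetical status register  */
        iounmap(regs);
}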
- */ - -#define eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void *)(b),(c),(d)) -#define isa_eth_io_copy_and_sum(a,b,c,d) eth_copy_and_sum((a),(void *)(__ISA_IO_base + (b)),(c),(d)) - -/** - * check_signature - find BIOS signatures - * @io_addr: mmio address to check - * @signature: signature block - * @length: length of signature - * - * Perform a signature comparison with the mmio address io_addr. This - * address should have been obtained by ioremap. - * Returns 1 on a match. - */ - -static inline int check_signature(void __iomem *io_addr, - const unsigned char *signature, int length) -{ - int retval = 0; - do { - if (readb(io_addr) != *signature) - goto out; - io_addr++; - signature++; - length--; - } while (length); - retval = 1; -out: - return retval; -} - -/* Nothing to do */ - -#define dma_cache_inv(_start,_size) do { } while (0) -#define dma_cache_wback(_start,_size) do { } while (0) -#define dma_cache_wback_inv(_start,_size) do { } while (0) - -#define flush_write_buffers() - -extern int iommu_bio_merge; -#define BIO_VMERGE_BOUNDARY iommu_bio_merge - -#endif /* __KERNEL__ */ - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/irq.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/irq.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,36 +0,0 @@ -#ifndef _ASM_IRQ_H -#define _ASM_IRQ_H - -/* - * linux/include/asm/irq.h - * - * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar - * - * IRQ/IPI changes taken from work by Thomas Radke - * <tomsoft@xxxxxxxxxxxxxxxxxxxxxxxxx> - */ - -#include <linux/config.h> -#include <linux/sched.h> -/* include comes from machine specific directory */ -#include "irq_vectors.h" -#include <asm/thread_info.h> - -static __inline__ int irq_canonicalize(int irq) -{ - return ((irq == 2) ? 9 : irq); -} - -#ifdef CONFIG_X86_LOCAL_APIC -#define ARCH_HAS_NMI_WATCHDOG /* See include/linux/nmi.h */ -#endif - -#define KDB_VECTOR 0xf9 - -# define irq_ctx_init(cpu) do { } while (0) - -struct irqaction; -struct pt_regs; -int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *); - -#endif /* _ASM_IRQ_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/mach-xen/io_ports.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/mach-xen/io_ports.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,30 +0,0 @@ -/* - * arch/i386/mach-generic/io_ports.h - * - * Machine specific IO port address definition for generic. 
- * Written by Osamu Tomita <tomita@xxxxxxxxxxx> - */ -#ifndef _MACH_IO_PORTS_H -#define _MACH_IO_PORTS_H - -/* i8253A PIT registers */ -#define PIT_MODE 0x43 -#define PIT_CH0 0x40 -#define PIT_CH2 0x42 - -/* i8259A PIC registers */ -#define PIC_MASTER_CMD 0x20 -#define PIC_MASTER_IMR 0x21 -#define PIC_MASTER_ISR PIC_MASTER_CMD -#define PIC_MASTER_POLL PIC_MASTER_ISR -#define PIC_MASTER_OCW3 PIC_MASTER_ISR -#define PIC_SLAVE_CMD 0xa0 -#define PIC_SLAVE_IMR 0xa1 - -/* i8259A PIC related value */ -#define PIC_CASCADE_IR 2 -#define MASTER_ICW4_DEFAULT 0x01 -#define SLAVE_ICW4_DEFAULT 0x01 -#define PIC_ICW4_AEOI 2 - -#endif /* !_MACH_IO_PORTS_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/mach-xen/mach_time.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/mach-xen/mach_time.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,122 +0,0 @@ -/* - * include/asm-i386/mach-default/mach_time.h - * - * Machine specific set RTC function for generic. - * Split out from time.c by Osamu Tomita <tomita@xxxxxxxxxxx> - */ -#ifndef _MACH_TIME_H -#define _MACH_TIME_H - -#include <linux/mc146818rtc.h> - -/* for check timing call set_rtc_mmss() 500ms */ -/* used in arch/i386/time.c::do_timer_interrupt() */ -#define USEC_AFTER 500000 -#define USEC_BEFORE 500000 - -/* - * In order to set the CMOS clock precisely, set_rtc_mmss has to be - * called 500 ms after the second nowtime has started, because when - * nowtime is written into the registers of the CMOS clock, it will - * jump to the next second precisely 500 ms later. Check the Motorola - * MC146818A or Dallas DS12887 data sheet for details. - * - * BUG: This routine does not handle hour overflow properly; it just - * sets the minutes. Usually you'll only notice that after reboot! - */ -static inline int mach_set_rtc_mmss(unsigned long nowtime) -{ - int retval = 0; - int real_seconds, real_minutes, cmos_minutes; - unsigned char save_control, save_freq_select; - - save_control = CMOS_READ(RTC_CONTROL); /* tell the clock it's being set */ - CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL); - - save_freq_select = CMOS_READ(RTC_FREQ_SELECT); /* stop and reset prescaler */ - CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); - - cmos_minutes = CMOS_READ(RTC_MINUTES); - if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) - BCD_TO_BIN(cmos_minutes); - - /* - * since we're only adjusting minutes and seconds, - * don't interfere with hour overflow. This avoids - * messing with unknown time zones but requires your - * RTC not to be off by more than 15 minutes - */ - real_seconds = nowtime % 60; - real_minutes = nowtime / 60; - if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1) - real_minutes += 30; /* correct for half hour time zone */ - real_minutes %= 60; - - if (abs(real_minutes - cmos_minutes) < 30) { - if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - BIN_TO_BCD(real_seconds); - BIN_TO_BCD(real_minutes); - } - CMOS_WRITE(real_seconds,RTC_SECONDS); - CMOS_WRITE(real_minutes,RTC_MINUTES); - } else { - printk(KERN_WARNING - "set_rtc_mmss: can't update from %d to %d\n", - cmos_minutes, real_minutes); - retval = -1; - } - - /* The following flags have to be released exactly in this order, - * otherwise the DS12887 (popular MC146818A clone with integrated - * battery and quartz) will not reset the oscillator and will not - * update precisely 500 ms later. 
You won't find this mentioned in - * the Dallas Semiconductor data sheets, but who believes data - * sheets anyway ... -- Markus Kuhn - */ - CMOS_WRITE(save_control, RTC_CONTROL); - CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); - - return retval; -} - -static inline unsigned long mach_get_cmos_time(void) -{ - unsigned int year, mon, day, hour, min, sec; - int i; - - /* The Linux interpretation of the CMOS clock register contents: - * When the Update-In-Progress (UIP) flag goes from 1 to 0, the - * RTC registers show the second which has precisely just started. - * Let's hope other operating systems interpret the RTC the same way. - */ - /* read RTC exactly on falling edge of update flag */ - for (i = 0 ; i < 1000000 ; i++) /* may take up to 1 second... */ - if (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) - break; - for (i = 0 ; i < 1000000 ; i++) /* must try at least 2.228 ms */ - if (!(CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP)) - break; - do { /* Isn't this overkill ? UIP above should guarantee consistency */ - sec = CMOS_READ(RTC_SECONDS); - min = CMOS_READ(RTC_MINUTES); - hour = CMOS_READ(RTC_HOURS); - day = CMOS_READ(RTC_DAY_OF_MONTH); - mon = CMOS_READ(RTC_MONTH); - year = CMOS_READ(RTC_YEAR); - } while (sec != CMOS_READ(RTC_SECONDS)); - if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) - { - BCD_TO_BIN(sec); - BCD_TO_BIN(min); - BCD_TO_BIN(hour); - BCD_TO_BIN(day); - BCD_TO_BIN(mon); - BCD_TO_BIN(year); - } - if ((year += 1900) < 1970) - year += 100; - - return mktime(year, mon, day, hour, min, sec); -} - -#endif /* !_MACH_TIME_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/mach-xen/mach_timer.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/mach-xen/mach_timer.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,48 +0,0 @@ -/* - * include/asm-i386/mach-default/mach_timer.h - * - * Machine specific calibrate_tsc() for generic. - * Split out from timer_tsc.c by Osamu Tomita <tomita@xxxxxxxxxxx> - */ -/* ------ Calibrate the TSC ------- - * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset(). - * Too much 64-bit arithmetic here to do this cleanly in C, and for - * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2) - * output busy loop as low as possible. We avoid reading the CTC registers - * directly because of the awkward 8-bit access mechanism of the 82C54 - * device. - */ -#ifndef _MACH_TIMER_H -#define _MACH_TIMER_H - -#define CALIBRATE_LATCH (5 * LATCH) - -static inline void mach_prepare_counter(void) -{ - /* Set the Gate high, disable speaker */ - outb((inb(0x61) & ~0x02) | 0x01, 0x61); - - /* - * Now let's take care of CTC channel 2 - * - * Set the Gate high, program CTC channel 2 for mode 0, - * (interrupt on terminal count mode), binary count, - * load 5 * LATCH count, (LSB and MSB) to begin countdown. - * - * Some devices need a delay here. 
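/* Illustrative sketch (not from the patch): the BCD_TO_BIN/BIN_TO_BCD
 * conversions that mach_set_rtc_mmss() and mach_get_cmos_time() above
 * rely on -- the RTC stores, e.g., 59 seconds as 0x59 unless it runs in
 * binary mode. */
static unsigned int demo_bcd_to_bin(unsigned int v)
{
        return (v & 0x0f) + (v >> 4) * 10;
}

static unsigned int demo_bin_to_bcd(unsigned int v)
{
        return ((v / 10) << 4) | (v % 10);
}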
- */ - outb(0xb0, 0x43); /* binary, mode 0, LSB/MSB, Ch 2 */ - outb_p(CALIBRATE_LATCH & 0xff, 0x42); /* LSB of count */ - outb_p(CALIBRATE_LATCH >> 8, 0x42); /* MSB of count */ -} - -static inline void mach_countup(unsigned long *count_p) -{ - unsigned long count = 0; - do { - count++; - } while ((inb_p(0x61) & 0x20) == 0); - *count_p = count; -} - -#endif /* !_MACH_TIMER_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/mach-xen/setup_arch_post.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/mach-xen/setup_arch_post.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,47 +0,0 @@ -/** - * machine_specific_memory_setup - Hook for machine specific memory setup. - * - * Description: - * This is included late in kernel/setup.c so that it can make - * use of all of the static functions. - **/ - -static char * __init machine_specific_memory_setup(void) -{ - char *who; - unsigned long start_pfn, max_pfn; - - who = "Xen"; - - start_pfn = 0; - max_pfn = xen_start_info.nr_pages; - - e820.nr_map = 0; - add_memory_region(PFN_PHYS(start_pfn), PFN_PHYS(max_pfn) - PFN_PHYS(start_pfn), E820_RAM); - - return who; -} - -void __init machine_specific_modify_cpu_capabilities(struct cpuinfo_x86 *c) -{ - clear_bit(X86_FEATURE_VME, c->x86_capability); - clear_bit(X86_FEATURE_DE, c->x86_capability); - clear_bit(X86_FEATURE_PSE, c->x86_capability); - clear_bit(X86_FEATURE_PGE, c->x86_capability); - clear_bit(X86_FEATURE_SEP, c->x86_capability); - if (!(xen_start_info.flags & SIF_PRIVILEGED)) - clear_bit(X86_FEATURE_MTRR, c->x86_capability); -} - -extern void hypervisor_callback(void); -extern void failsafe_callback(void); - -static void __init machine_specific_arch_setup(void) -{ - HYPERVISOR_set_callbacks( - (unsigned long) hypervisor_callback, - (unsigned long) failsafe_callback, - (unsigned long) system_call); - - machine_specific_modify_cpu_capabilities(&boot_cpu_data); -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/mach-xen/setup_arch_pre.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/mach-xen/setup_arch_pre.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,5 +0,0 @@ -/* Hook to call BIOS initialisation function */ - -#define ARCH_SETUP machine_specific_arch_setup(); - -static void __init machine_specific_arch_setup(void); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/mach-xen/smpboot_hooks.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/mach-xen/smpboot_hooks.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,55 +0,0 @@ -/* two abstractions specific to kernel/smpboot.c, mainly to cater to visws - * which needs to alter them. */ - -static inline void smpboot_clear_io_apic_irqs(void) -{ -#ifdef CONFIG_X86_IO_APIC - io_apic_irqs = 0; -#endif -} - -static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) -{ -#if 1 - printk("smpboot_setup_warm_reset_vector\n"); -#else - CMOS_WRITE(0xa, 0xf); - local_flush_tlb(); - Dprintk("1.\n"); - *((volatile unsigned short *) TRAMPOLINE_HIGH) = start_eip >> 4; - Dprintk("2.\n"); - *((volatile unsigned short *) TRAMPOLINE_LOW) = start_eip & 0xf; - Dprintk("3.\n"); -#endif -} - -static inline void smpboot_restore_warm_reset_vector(void) -{ - /* - * Install writable page 0 entry to set BIOS data area. - */ - local_flush_tlb(); - - /* - * Paranoid: Set warm reset code and vector here back - * to default values. 
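/* Illustrative sketch (not from the patch): how mach_prepare_counter()
 * and mach_countup() above pair up when calibrating the TSC;
 * rdtsc_demo() stands in for a TSC read and is hypothetical. */
static unsigned long demo_calibrate_tsc(void)
{
        unsigned long start, end, count;

        mach_prepare_counter();   /* arm CTC channel 2 with CALIBRATE_LATCH */
        start = rdtsc_demo();
        mach_countup(&count);     /* spin until the terminal count is reached */
        end = rdtsc_demo();

        return end - start;       /* TSC ticks elapsed over 5*LATCH PIT ticks */
}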
- */ - CMOS_WRITE(0, 0xf); - - *((volatile long *) phys_to_virt(0x467)) = 0; -} - -static inline void smpboot_setup_io_apic(void) -{ -#ifdef CONFIG_X86_IO_APIC - /* - * Here we can be sure that there is an IO-APIC in the system. Let's - * go and set it up: - */ - if (!skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); -#endif -} - - -#define smp_found_config (HYPERVISOR_shared_info->n_vcpu > 1) diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/mmu_context.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/mmu_context.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,126 +0,0 @@ -#ifndef __X86_64_MMU_CONTEXT_H -#define __X86_64_MMU_CONTEXT_H - -#include <linux/config.h> -#include <asm/desc.h> -#include <asm/atomic.h> -#include <asm/pgalloc.h> -#include <asm/page.h> -#include <asm/pda.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> - -/* - * possibly do the LDT unload here? - */ -int init_new_context(struct task_struct *tsk, struct mm_struct *mm); -void destroy_context(struct mm_struct *mm); - -static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) -{ -#if 0 /* XEN: no lazy tlb */ - if (read_pda(mmu_state) == TLBSTATE_OK) - write_pda(mmu_state, TLBSTATE_LAZY); -#endif -} - -#define prepare_arch_switch(rq,next) __prepare_arch_switch() -#define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) -#define task_running(rq, p) ((rq)->curr == (p)) - -static inline void __prepare_arch_switch(void) -{ - /* - * Save away %es, %ds, %fs and %gs. Must happen before reload - * of cr3/ldt (i.e., not in __switch_to). - */ - __asm__ __volatile__ ( - "movl %%es,%0 ; movl %%ds,%1 ; movl %%fs,%2 ; movl %%gs,%3" - : "=m" (current->thread.es), - "=m" (current->thread.ds), - "=m" (current->thread.fsindex), - "=m" (current->thread.gsindex) ); - - if (current->thread.ds) - __asm__ __volatile__ ( "movl %0,%%ds" : : "r" (0) ); - - if (current->thread.es) - __asm__ __volatile__ ( "movl %0,%%es" : : "r" (0) ); - - if (current->thread.fsindex) { - __asm__ __volatile__ ( "movl %0,%%fs" : : "r" (0) ); - current->thread.fs = 0; - } - - if (current->thread.gsindex) { - load_gs_index(0); - current->thread.gs = 0; - } -} - - -static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, - struct task_struct *tsk) -{ - unsigned cpu = smp_processor_id(); - struct mmuext_op _op[3], *op = _op; - - if (likely(prev != next)) { - /* stop flush ipis for the previous mm */ - clear_bit(cpu, &prev->cpu_vm_mask); -#if 0 /* XEN: no lazy tlb */ - write_pda(mmu_state, TLBSTATE_OK); - write_pda(active_mm, next); -#endif - set_bit(cpu, &next->cpu_vm_mask); - - /* load_cr3(next->pgd) */ - per_cpu(cur_pgd, smp_processor_id()) = next->pgd; - op->cmd = MMUEXT_NEW_BASEPTR; - op->mfn = pfn_to_mfn(__pa(next->pgd) >> PAGE_SHIFT); - op++; - - /* xen_new_user_pt(__pa(__user_pgd(next->pgd))) */ - op->cmd = MMUEXT_NEW_USER_BASEPTR; - op->mfn = pfn_to_mfn(__pa(__user_pgd(next->pgd)) >> PAGE_SHIFT); - op++; - - if (unlikely(next->context.ldt != prev->context.ldt)) { - /* load_LDT_nolock(&next->context, cpu) */ - op->cmd = MMUEXT_SET_LDT; - op->linear_addr = (unsigned long)next->context.ldt; - op->nr_ents = next->context.size; - op++; - } - - BUG_ON(HYPERVISOR_mmuext_op(_op, op-_op, NULL, DOMID_SELF)); - } - -#if 0 /* XEN: no lazy tlb */ - else { - write_pda(mmu_state, TLBSTATE_OK); - if (read_pda(active_mm) != next) - out_of_line_bug(); - if(!test_and_set_bit(cpu, &next->cpu_vm_mask)) { - /* We were in lazy tlb mode and leave_mm 
disabled - * tlb flush IPI delivery. We must reload CR3 - * to make sure to use no freed page tables. - */ - load_cr3(next->pgd); - xen_new_user_pt(__pa(__user_pgd(next->pgd))); - load_LDT_nolock(&next->context, cpu); - } - } -#endif -} - -#define deactivate_mm(tsk,mm) do { \ - load_gs_index(0); \ - asm volatile("movl %0,%%fs"::"r"(0)); \ -} while(0) - -#define activate_mm(prev, next) do { \ - switch_mm((prev),(next),NULL); \ -} while (0) - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/page.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/page.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,229 +0,0 @@ -#ifndef _X86_64_PAGE_H -#define _X86_64_PAGE_H - -#include <linux/config.h> -/* #include <linux/string.h> */ -#ifndef __ASSEMBLY__ -#include <linux/types.h> -#endif -#include <asm-xen/xen-public/xen.h> -#include <asm-xen/foreign_page.h> - -#define arch_free_page(_page,_order) \ -({ int foreign = PageForeign(_page); \ - if (foreign) \ - (PageForeignDestructor(_page))(_page); \ - foreign; \ -}) -#define HAVE_ARCH_FREE_PAGE - -#ifdef CONFIG_XEN_SCRUB_PAGES -#define scrub_pages(_p,_n) memset((void *)(_p), 0, (_n) << PAGE_SHIFT) -#else -#define scrub_pages(_p,_n) ((void)0) -#endif - -/* PAGE_SHIFT determines the page size */ -#define PAGE_SHIFT 12 -#ifdef __ASSEMBLY__ -#define PAGE_SIZE (0x1 << PAGE_SHIFT) -#else -#define PAGE_SIZE (1UL << PAGE_SHIFT) -#endif -#define PAGE_MASK (~(PAGE_SIZE-1)) -#define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & (__PHYSICAL_MASK << PAGE_SHIFT)) - -#define THREAD_ORDER 1 -#ifdef __ASSEMBLY__ -#define THREAD_SIZE (1 << (PAGE_SHIFT + THREAD_ORDER)) -#else -#define THREAD_SIZE (1UL << (PAGE_SHIFT + THREAD_ORDER)) -#endif -#define CURRENT_MASK (~(THREAD_SIZE-1)) - -#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) -#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT) - -#define HPAGE_SHIFT PMD_SHIFT -#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT) -#define HPAGE_MASK (~(HPAGE_SIZE - 1)) -#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) - -#ifdef __KERNEL__ -#ifndef __ASSEMBLY__ - -void clear_page(void *); -void copy_page(void *, void *); - -#define clear_user_page(page, vaddr, pg) clear_page(page) -#define copy_user_page(to, from, vaddr, pg) copy_page(to, from) - -#define alloc_zeroed_user_highpage(vma, vaddr) alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO, vma, vaddr) -#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE - -/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ -extern u32 *phys_to_machine_mapping; -#define pfn_to_mfn(_pfn) ((unsigned long) phys_to_machine_mapping[(unsigned int)(_pfn)]) -#define mfn_to_pfn(_mfn) ((unsigned long) machine_to_phys_mapping[(unsigned int)(_mfn)]) -static inline unsigned long phys_to_machine(unsigned long phys) -{ - unsigned long machine = pfn_to_mfn(phys >> PAGE_SHIFT); - machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK); - return machine; -} - -static inline unsigned long machine_to_phys(unsigned long machine) -{ - unsigned long phys = mfn_to_pfn(machine >> PAGE_SHIFT); - phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK); - return phys; -} - -/* - * These are used to make use of C type-checking.. - */ -typedef struct { unsigned long pte; } pte_t; -typedef struct { unsigned long pmd; } pmd_t; -typedef struct { unsigned long pud; } pud_t; -typedef struct { unsigned long pgd; } pgd_t; -#define PTE_MASK PHYSICAL_PAGE_MASK - -typedef struct { unsigned long pgprot; } pgprot_t; - -#define pte_val(x) (((x).pte & 1) ? 
machine_to_phys((x).pte) : \ - (x).pte) -#define pte_val_ma(x) ((x).pte) - -static inline unsigned long pmd_val(pmd_t x) -{ - unsigned long ret = x.pmd; - if (ret) ret = machine_to_phys(ret); - return ret; -} - -static inline unsigned long pud_val(pud_t x) -{ - unsigned long ret = x.pud; - if (ret) ret = machine_to_phys(ret); - return ret; -} - -static inline unsigned long pgd_val(pgd_t x) -{ - unsigned long ret = x.pgd; - if (ret) ret = machine_to_phys(ret); - return ret; -} - -#define pgprot_val(x) ((x).pgprot) - -#define __pte_ma(x) ((pte_t) { (x) } ) - -static inline pte_t __pte(unsigned long x) -{ - if (x & 1) x = phys_to_machine(x); - return ((pte_t) { (x) }); -} - -static inline pmd_t __pmd(unsigned long x) -{ - if ((x & 1)) x = phys_to_machine(x); - return ((pmd_t) { (x) }); -} - -static inline pud_t __pud(unsigned long x) -{ - if ((x & 1)) x = phys_to_machine(x); - return ((pud_t) { (x) }); -} - -static inline pgd_t __pgd(unsigned long x) -{ - if ((x & 1)) x = phys_to_machine(x); - return ((pgd_t) { (x) }); -} - -#define __pgprot(x) ((pgprot_t) { (x) } ) - -extern unsigned long vm_stack_flags, vm_stack_flags32; -extern unsigned long vm_data_default_flags, vm_data_default_flags32; -extern unsigned long vm_force_exec32; - -#define __START_KERNEL 0xffffffff80100000UL -#define __START_KERNEL_map 0xffffffff80000000UL -#define __PAGE_OFFSET 0xffff880000000000UL - -#else -#define __START_KERNEL 0xffffffff80100000 -#define __START_KERNEL_map 0xffffffff80000000 -#define __PAGE_OFFSET 0xffff880000000000 -#endif /* !__ASSEMBLY__ */ - -/* to align the pointer to the (next) page boundary */ -#define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) - -/* See Documentation/x86_64/mm.txt for a description of the memory map. */ -#define __PHYSICAL_MASK_SHIFT 46 -#define __PHYSICAL_MASK ((1UL << __PHYSICAL_MASK_SHIFT) - 1) -#define __VIRTUAL_MASK_SHIFT 48 -#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) - -#define KERNEL_TEXT_SIZE (40UL*1024*1024) -#define KERNEL_TEXT_START 0xffffffff80000000UL - -#ifndef __ASSEMBLY__ - -#include <asm/bug.h> - -/* Pure 2^n version of get_order */ -extern __inline__ int get_order(unsigned long size) -{ - int order; - - size = (size-1) >> (PAGE_SHIFT-1); - order = -1; - do { - size >>= 1; - order++; - } while (size); - return order; -} - -#endif /* __ASSEMBLY__ */ - -#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) - -/* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol. - Otherwise you risk miscompilation. */ -#define __pa(x) (((unsigned long)(x)>=__START_KERNEL_map)?(unsigned long)(x) - (unsigned long)__START_KERNEL_map:(unsigned long)(x) - PAGE_OFFSET) -/* __pa_symbol should be used for C visible symbols. - This seems to be the official gcc blessed way to do such arithmetic. 
*/ -#define __pa_symbol(x) \ - ({unsigned long v; \ - asm("" : "=r" (v) : "0" (x)); \ - __pa(v); }) - -#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) -#ifndef CONFIG_DISCONTIGMEM -#define pfn_to_page(pfn) (mem_map + (pfn)) -#define page_to_pfn(page) ((unsigned long)((page) - mem_map)) -#define pfn_valid(pfn) ((pfn) < max_mapnr) -#endif - -#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) -#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) -#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) - -/* VIRT <-> MACHINE conversion */ -#define virt_to_machine(_a) (phys_to_machine(__pa(_a))) -#define machine_to_virt(_m) (__va(machine_to_phys(_m))) - -#define VM_DATA_DEFAULT_FLAGS \ - (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \ - VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) - -#define __HAVE_ARCH_GATE_AREA 1 - -#endif /* __KERNEL__ */ - -#endif /* _X86_64_PAGE_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/param.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/param.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,22 +0,0 @@ -#ifndef _ASMx86_64_PARAM_H -#define _ASMx86_64_PARAM_H - -#ifdef __KERNEL__ -# define HZ 100 /* Internal kernel timer frequency */ -# define USER_HZ 100 /* .. some user interfaces are in "ticks" */ -# define CLOCKS_PER_SEC (USER_HZ) /* like times() */ -#endif - -#ifndef HZ -#define HZ 100 -#endif - -#define EXEC_PAGESIZE 4096 - -#ifndef NOGROUP -#define NOGROUP (-1) -#endif - -#define MAXHOSTNAMELEN 64 /* max length of hostname */ - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/pci.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/pci.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,148 +0,0 @@ -#ifndef __x8664_PCI_H -#define __x8664_PCI_H - -#include <linux/config.h> -#include <asm/io.h> - -#ifdef __KERNEL__ - -#include <linux/mm.h> /* for struct page */ - -/* Can be used to override the logic in pci_scan_bus for skipping - already-configured bus numbers - to be used for buggy BIOSes - or architectures with incomplete PCI setup by the loader */ - -#ifdef CONFIG_PCI -extern unsigned int pcibios_assign_all_busses(void); -#else -#define pcibios_assign_all_busses() 0 -#endif -#define pcibios_scan_all_fns(a, b) 0 - -extern int no_iommu, force_iommu; - -extern unsigned long pci_mem_start; -#define PCIBIOS_MIN_IO 0x1000 -#define PCIBIOS_MIN_MEM (pci_mem_start) - -#define PCIBIOS_MIN_CARDBUS_IO 0x4000 - -void pcibios_config_init(void); -struct pci_bus * pcibios_scan_root(int bus); -extern int (*pci_config_read)(int seg, int bus, int dev, int fn, int reg, int len, u32 *value); -extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value); - -void pcibios_set_master(struct pci_dev *dev); -void pcibios_penalize_isa_irq(int irq); -struct irq_routing_table *pcibios_get_irq_routing_table(void); -int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); - -#include <linux/types.h> -#include <linux/slab.h> -#include <asm/scatterlist.h> -#include <linux/string.h> -#include <asm/page.h> - -extern int iommu_setup(char *opt); - -#ifdef CONFIG_GART_IOMMU -/* The PCI address space does equal the physical memory - * address space. 
The networking and block device layers use - * this boolean for bounce buffer decisions - * - * On AMD64 it mostly equals, but we set it to zero to tell some subsystems - * that an IOMMU is available. - */ -#define PCI_DMA_BUS_IS_PHYS (no_iommu ? 1 : 0) - -/* - * x86-64 always supports DAC, but sometimes it is useful to force - * devices through the IOMMU to get automatic sg list merging. - * Optional right now. - */ -extern int iommu_sac_force; -#define pci_dac_dma_supported(pci_dev, mask) (!iommu_sac_force) - -#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \ - dma_addr_t ADDR_NAME; -#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \ - __u32 LEN_NAME; -#define pci_unmap_addr(PTR, ADDR_NAME) \ - ((PTR)->ADDR_NAME) -#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \ - (((PTR)->ADDR_NAME) = (VAL)) -#define pci_unmap_len(PTR, LEN_NAME) \ - ((PTR)->LEN_NAME) -#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \ - (((PTR)->LEN_NAME) = (VAL)) - -#else -/* No IOMMU */ - -#define PCI_DMA_BUS_IS_PHYS 1 -#define pci_dac_dma_supported(pci_dev, mask) 1 - -#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) -#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) -#define pci_unmap_addr(PTR, ADDR_NAME) (0) -#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0) -#define pci_unmap_len(PTR, LEN_NAME) (0) -#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0) - -#endif - -#include <asm-generic/pci-dma-compat.h> - -static inline dma64_addr_t -pci_dac_page_to_dma(struct pci_dev *pdev, struct page *page, unsigned long offset, int direction) -{ - return ((dma64_addr_t) page_to_phys(page) + - (dma64_addr_t) offset); -} - -static inline struct page * -pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr) -{ - return virt_to_page(__va(dma_addr)); -} - -static inline unsigned long -pci_dac_dma_to_offset(struct pci_dev *pdev, dma64_addr_t dma_addr) -{ - return (dma_addr & ~PAGE_MASK); -} - -static inline void -pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction) -{ -} - -static inline void -pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr, size_t len, int direction) -{ - flush_write_buffers(); -} - -#define HAVE_PCI_MMAP -extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, - enum pci_mmap_state mmap_state, int write_combine); - -static inline void pcibios_add_platform_entries(struct pci_dev *dev) -{ -} - -#endif /* __KERNEL__ */ - -/* generic pci stuff */ -#ifdef CONFIG_PCI -#include <asm-generic/pci.h> -#endif - -/* On Xen we have to scan all functions since Xen hides bridges from - * us. If a bridge is at fn=0 and that slot has a multifunction - * device, we won't find the additional devices without scanning all - * functions. */ -#undef pcibios_scan_all_fns -#define pcibios_scan_all_fns(a, b) 1 - -#endif /* __x8664_PCI_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/pda.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/pda.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,85 +0,0 @@ -#ifndef X86_64_PDA_H -#define X86_64_PDA_H - -#ifndef __ASSEMBLY__ -#include <linux/stddef.h> -#include <linux/types.h> -#include <linux/cache.h> - -/* Per processor datastructure. 
%gs points to it while the kernel runs */
-struct x8664_pda {
-	struct task_struct *pcurrent;	/* Current process */
-	unsigned long data_offset;	/* Per cpu data offset from linker address */
-	struct x8664_pda *me;		/* Pointer to itself */
-	unsigned long kernelstack;	/* top of kernel stack for current */
-	unsigned long oldrsp;		/* user rsp for system call */
-	unsigned long irqrsp;		/* Old rsp for interrupts. */
-	int irqcount;			/* Irq nesting counter. Starts with -1 */
-	int cpunumber;			/* Logical CPU number */
-	char *irqstackptr;		/* top of irqstack */
-	unsigned int __softirq_pending;
-	unsigned int __nmi_count;	/* number of NMIs on this CPU */
-	unsigned long idle_timestamp;
-	struct mm_struct *active_mm;
-	int mmu_state;
-	unsigned apic_timer_irqs;
-	int kernel_mode;		/* kernel or user mode */
-} ____cacheline_aligned;
-
-
-#define IRQSTACK_ORDER 2
-#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER)
-
-extern struct x8664_pda cpu_pda[];
-
-/*
- * There is no fast way to get the base address of the PDA, all the accesses
- * have to mention %fs/%gs. So it needs to be done this Torvaldian way.
- */
-#define sizeof_field(type,field)  (sizeof(((type *)0)->field))
-#define typeof_field(type,field)  typeof(((type *)0)->field)
-
-extern void __bad_pda_field(void);
-
-#define pda_offset(field) offsetof(struct x8664_pda, field)
-
-#define pda_to_op(op,field,val) do { \
-	switch (sizeof_field(struct x8664_pda, field)) { \
-case 2: \
-asm volatile(op "w %0,%%gs:%P1"::"r" (val),"i"(pda_offset(field)):"memory"); break; \
-case 4: \
-asm volatile(op "l %0,%%gs:%P1"::"r" (val),"i"(pda_offset(field)):"memory"); break; \
-case 8: \
-asm volatile(op "q %0,%%gs:%P1"::"r" (val),"i"(pda_offset(field)):"memory"); break; \
-	default: __bad_pda_field(); \
-	} \
-	} while (0)
-
-/*
- * AK: PDA read accesses should be neither volatile nor have a memory clobber.
- * Unfortunately removing them causes all hell to break loose currently.
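- *
- * A minimal usage sketch (illustration only, not part of the original
- * header); each accessor expands to a single %gs-relative move sized by
- * the field, e.g.:
- *
- *	int cpu = read_pda(cpunumber);		movl %gs:off,%eax
- *	write_pda(kernelstack, stack_top);	movq %rdi,%gs:off
- *
- * (stack_top is a made-up variable name for the example.)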
- */
-#define pda_from_op(op,field) ({ \
-	typedef typeof_field(struct x8664_pda, field) T__; T__ ret__; \
-	switch (sizeof_field(struct x8664_pda, field)) { \
-case 2: \
-asm volatile(op "w %%gs:%P1,%0":"=r" (ret__):"i"(pda_offset(field)):"memory"); break;\
-case 4: \
-asm volatile(op "l %%gs:%P1,%0":"=r" (ret__):"i"(pda_offset(field)):"memory"); break;\
-case 8: \
-asm volatile(op "q %%gs:%P1,%0":"=r" (ret__):"i"(pda_offset(field)):"memory"); break;\
-	default: __bad_pda_field(); \
-	} \
-	ret__; })
-
-
-#define read_pda(field) pda_from_op("mov",field)
-#define write_pda(field,val) pda_to_op("mov",field,val)
-#define add_pda(field,val) pda_to_op("add",field,val)
-#define sub_pda(field,val) pda_to_op("sub",field,val)
-
-#endif
-
-#define PDA_STACKOFFSET (5*8)
-
-#endif
diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/pgalloc.h
--- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/pgalloc.h	Fri Jul 15 19:57:12 2005
+++ /dev/null	Sat Jul 16 14:02:54 2005
@@ -1,171 +0,0 @@
-#ifndef _X86_64_PGALLOC_H
-#define _X86_64_PGALLOC_H
-
-#include <asm/processor.h>
-#include <asm/fixmap.h>
-#include <asm/pda.h>
-#include <linux/threads.h>
-#include <linux/mm.h>
-#include <asm/io.h>	/* for phys_to_virt and page_to_pseudophys */
-
-void make_page_readonly(void *va);
-void make_page_writable(void *va);
-void make_pages_readonly(void *va, unsigned int nr);
-void make_pages_writable(void *va, unsigned int nr);
-
-#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD)
-
-static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
-{
-	set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)));
-}
-
-static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
-{
-	set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
-}
-
-static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
-{
-	set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
-}
-
-/*
- * We need to use the batch mode here, but pgd_populate() won't
- * be called frequently.
- */
-static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
-{
-	set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
-	set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
-}
-
-extern __inline__ pmd_t *get_pmd(void)
-{
-	pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
-	if (!pmd)
-		return NULL;
-	make_page_readonly(pmd);
-	xen_pmd_pin(__pa(pmd));
-	return pmd;
-}
-
-extern __inline__ void pmd_free(pmd_t *pmd)
-{
-	BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
-	xen_pmd_unpin(__pa(pmd));
-	make_page_writable(pmd);
-	free_page((unsigned long)pmd);
-}
-
-static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
-	pmd_t *pmd = (pmd_t *) get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
-	if (!pmd)
-		return NULL;
-	make_page_readonly(pmd);
-	xen_pmd_pin(__pa(pmd));
-	return pmd;
-}
-
-static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
-	pud_t *pud = (pud_t *) get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
-	if (!pud)
-		return NULL;
-	make_page_readonly(pud);
-	xen_pud_pin(__pa(pud));
-	return pud;
-}
-
-static inline void pud_free(pud_t *pud)
-{
-	BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
-	xen_pud_unpin(__pa(pud));
-	make_page_writable(pud);
-	free_page((unsigned long)pud);
-}
-
-static inline pgd_t *pgd_alloc(struct mm_struct *mm)
-{
-	/*
-	 * We allocate two contiguous pages for kernel and user.
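-	 * As a sketch of the layout (not in the original comment):
-	 *
-	 *	pgd              -> page 0: kernel page directory
-	 *	__user_pgd(pgd)  -> page 1: user page directory
-	 *	                    (== pgd + PTRS_PER_PGD entries)
-	 *
-	 * which is why the pin/readonly calls below operate on 2 pages.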
- */ - unsigned boundary; - pgd_t *pgd = (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_REPEAT, 1); - - if (!pgd) - return NULL; - /* - * Copy kernel pointers in from init. - * Could keep a freelist or slab cache of those because the kernel - * part never changes. - */ - boundary = pgd_index(__PAGE_OFFSET); - memset(pgd, 0, boundary * sizeof(pgd_t)); - memcpy(pgd + boundary, - init_level4_pgt + boundary, - (PTRS_PER_PGD - boundary) * sizeof(pgd_t)); - - memset(__user_pgd(pgd), 0, PAGE_SIZE); /* clean up user pgd */ - make_pages_readonly(pgd, 2); - - xen_pgd_pin(__pa(pgd)); /* kernel */ - xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */ - /* - * Set level3_user_pgt for vsyscall area - */ - set_pgd(__user_pgd(pgd) + pgd_index(VSYSCALL_START), - mk_kernel_pgd(__pa_symbol(level3_user_pgt))); - return pgd; -} - -static inline void pgd_free(pgd_t *pgd) -{ - BUG_ON((unsigned long)pgd & (PAGE_SIZE-1)); - xen_pgd_unpin(__pa(pgd)); - xen_pgd_unpin(__pa(__user_pgd(pgd))); - make_pages_writable(pgd, 2); - free_pages((unsigned long)pgd, 1); -} - -static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) -{ - pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); - if (!pte) - return NULL; - make_page_readonly(pte); - xen_pte_pin(__pa(pte)); - return pte; -} - -static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) -{ - pte_t *pte = (void *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); - if (!pte) - return NULL; - make_page_readonly(pte); - xen_pte_pin(__pa(pte)); - return virt_to_page((unsigned long)pte); -} - -/* Should really implement gc for free page table pages. This could be - done with a reference count in struct page. */ - -extern __inline__ void pte_free_kernel(pte_t *pte) -{ - BUG_ON((unsigned long)pte & (PAGE_SIZE-1)); - xen_pte_unpin(__pa(pte)); - make_page_writable(pte); - free_page((unsigned long)pte); -} - -extern void pte_free(struct page *pte); - -//#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) - -#define __pte_free_tlb(tlb,x) pte_free((x)) -#define __pmd_free_tlb(tlb,x) pmd_free((x)) -#define __pud_free_tlb(tlb,x) pud_free((x)) - -#endif /* _X86_64_PGALLOC_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/pgtable.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/pgtable.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,541 +0,0 @@ -#ifndef _X86_64_PGTABLE_H -#define _X86_64_PGTABLE_H - -/* - * This file contains the functions and defines necessary to modify and use - * the x86-64 page table tree. - * - * x86-64 has a 4 level table setup. Generic linux MM only supports - * three levels. The fourth level is currently a single static page that - * is shared by everybody and just contains a pointer to the current - * three level page setup on the beginning and some kernel mappings at - * the end. 
For more details see Documentation/x86_64/mm.txt - */ -#include <asm/processor.h> -#include <asm/fixmap.h> -#include <asm/bitops.h> -#include <linux/threads.h> -#include <asm/pda.h> -#include <asm-xen/hypervisor.h> -extern pud_t level3_user_pgt[512]; -extern pud_t init_level4_pgt[]; -extern pud_t init_level4_user_pgt[]; -extern unsigned long __supported_pte_mask; - -#define swapper_pg_dir NULL - -extern int nonx_setup(char *str); -extern void paging_init(void); -extern void clear_kernel_mapping(unsigned long addr, unsigned long size); - -extern unsigned long pgkern_mask; - -#define virt_to_ptep(__va) \ -({ \ - pgd_t *__pgd = pgd_offset_k((unsigned long)(__va)); \ - pud_t *__pud = pud_offset(__pgd, (unsigned long)(__va)); \ - pmd_t *__pmd = pmd_offset(__pud, (unsigned long)(__va)); \ - pte_offset_kernel(__pmd, (unsigned long)(__va)); \ -}) - -#define arbitrary_virt_to_machine(__va) \ -({ \ - pte_t *__pte = virt_to_ptep(__va); \ - unsigned long __pa = (*(unsigned long *)__pte) & PAGE_MASK; \ - __pa | ((unsigned long)(__va) & (PAGE_SIZE-1)); \ -}) - -/* - * ZERO_PAGE is a global shared page that is always zero: used - * for zero-mapped memory areas etc.. - */ -extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)]; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) - -#define PGDIR_SHIFT 39 -#define PTRS_PER_PGD 512 - -/* - * PUDIR_SHIFT determines what a top-level page table entry can map - */ -#define PUD_SHIFT 30 -#define PTRS_PER_PUD 512 - -/* - * PMD_SHIFT determines the size of the area a middle-level - * page table can map - */ -#define PMD_SHIFT 21 -#define PTRS_PER_PMD 512 - -/* - * entries per page directory level - */ -#define PTRS_PER_PTE 512 - -#define pte_ERROR(e) \ - printk("%s:%d: bad pte %p(%016lx).\n", __FILE__, __LINE__, &(e), pte_val(e)) -#define pmd_ERROR(e) \ - printk("%s:%d: bad pmd %p(%016lx).\n", __FILE__, __LINE__, &(e), pmd_val(e)) -#define pud_ERROR(e) \ - printk("%s:%d: bad pud %p(%016lx).\n", __FILE__, __LINE__, &(e), pud_val(e)) -#define pgd_ERROR(e) \ - printk("%s:%d: bad pgd %p(%016lx).\n", __FILE__, __LINE__, &(e), pgd_val(e)) - -#define pgd_none(x) (!pgd_val(x)) -#define pud_none(x) (!pud_val(x)) - -#define set_pte_batched(pteptr, pteval) \ - queue_l1_entry_update(pteptr, (pteval)) - -extern inline int pud_present(pud_t pud) { return !pud_none(pud); } - -#ifdef CONFIG_SMP -#define set_pte(pteptr, pteval) xen_l1_entry_update(pteptr, (pteval).pte) - -#else -#define set_pte(pteptr, pteval) xen_l1_entry_update(pteptr, (pteval.pte)) -#if 0 -static inline void set_pte(pte_t *dst, pte_t val) -{ - *dst = val; -} -#endif -#endif - -#define set_pmd(pmdptr, pmdval) xen_l2_entry_update(pmdptr, (pmdval)) -#define set_pud(pudptr, pudval) xen_l3_entry_update(pudptr, (pudval)) -#define set_pgd(pgdptr, pgdval) xen_l4_entry_update(pgdptr, (pgdval)) - -extern inline void pud_clear (pud_t * pud) -{ - set_pud(pud, __pud(0)); -} - -#define __user_pgd(pgd) ((pgd) + PTRS_PER_PGD) - -extern inline void pgd_clear (pgd_t * pgd) -{ - set_pgd(pgd, __pgd(0)); - set_pgd(__user_pgd(pgd), __pgd(0)); -} - -#define pud_page(pud) \ - ((unsigned long) __va(pud_val(pud) & PHYSICAL_PAGE_MASK)) - -/* - * A note on implementation of this atomic 'get-and-clear' operation. - * This is actually very simple because Xen Linux can only run on a single - * processor. Therefore, we cannot race other processors setting the 'accessed' - * or 'dirty' bits on a page-table entry. 
- * Even if pages are shared between domains, that is not a problem because - * each domain will have separate page tables, with their own versions of - * accessed & dirty state. - */ -static inline pte_t ptep_get_and_clear(pte_t *xp) -{ - pte_t pte = *xp; - if (pte.pte) - set_pte(xp, __pte_ma(0)); - return pte; -} - -#define pte_same(a, b) ((a).pte == (b).pte) - -#define PMD_SIZE (1UL << PMD_SHIFT) -#define PMD_MASK (~(PMD_SIZE-1)) -#define PUD_SIZE (1UL << PUD_SHIFT) -#define PUD_MASK (~(PUD_SIZE-1)) -#define PGDIR_SIZE (1UL << PGDIR_SHIFT) -#define PGDIR_MASK (~(PGDIR_SIZE-1)) - -#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) -#define FIRST_USER_PGD_NR 0 - -#ifndef __ASSEMBLY__ -#define MAXMEM 0x3fffffffffffUL -#define VMALLOC_START 0xffffc20000000000UL -#define VMALLOC_END 0xffffe1ffffffffffUL -#define MODULES_VADDR 0xffffffff88000000UL -#define MODULES_END 0xfffffffffff00000UL -#define MODULES_LEN (MODULES_END - MODULES_VADDR) - -#define _PAGE_BIT_PRESENT 0 -#define _PAGE_BIT_RW 1 -#define _PAGE_BIT_USER 2 -#define _PAGE_BIT_PWT 3 -#define _PAGE_BIT_PCD 4 -#define _PAGE_BIT_ACCESSED 5 -#define _PAGE_BIT_DIRTY 6 -#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ -#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ -#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ - -#define _PAGE_PRESENT 0x001 -#define _PAGE_RW 0x002 -#define _PAGE_USER 0x004 -#define _PAGE_PWT 0x008 -#define _PAGE_PCD 0x010 -#define _PAGE_ACCESSED 0x020 -#define _PAGE_DIRTY 0x040 -#define _PAGE_PSE 0x080 /* 2MB page */ -#define _PAGE_FILE 0x040 /* set:pagecache, unset:swap */ -#define _PAGE_GLOBAL 0x100 /* Global TLB entry */ - -#define _PAGE_PROTNONE 0x080 /* If not present */ -#define _PAGE_NX (1UL<<_PAGE_BIT_NX) - -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) -#define _KERNPG_TABLE _PAGE_TABLE - -#define _PAGE_CHG_MASK (PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY) - -#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) -#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) -#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED) -#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) -#define PAGE_COPY PAGE_COPY_NOEXEC -#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) -#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX) -#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) -#define __PAGE_KERNEL \ - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | _PAGE_USER ) -#define __PAGE_KERNEL_EXEC \ - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER ) -#define __PAGE_KERNEL_NOCACHE \ - (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX | _PAGE_USER ) -#define __PAGE_KERNEL_RO \ - (_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | _PAGE_USER ) -#define __PAGE_KERNEL_VSYSCALL \ - (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_USER ) -#define __PAGE_KERNEL_VSYSCALL_NOCACHE \ - (_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_PCD | _PAGE_USER ) -#define __PAGE_KERNEL_LARGE \ - (__PAGE_KERNEL | _PAGE_PSE | _PAGE_USER ) - - -/* - * We don't support GLOBAL page in xenolinux64 - */ -#define MAKE_GLOBAL(x) __pgprot((x)) - -#define PAGE_KERNEL MAKE_GLOBAL(__PAGE_KERNEL) -#define PAGE_KERNEL_EXEC MAKE_GLOBAL(__PAGE_KERNEL_EXEC) -#define PAGE_KERNEL_RO 
MAKE_GLOBAL(__PAGE_KERNEL_RO) -#define PAGE_KERNEL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_NOCACHE) -#define PAGE_KERNEL_VSYSCALL32 __pgprot(__PAGE_KERNEL_VSYSCALL) -#define PAGE_KERNEL_VSYSCALL MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL) -#define PAGE_KERNEL_LARGE MAKE_GLOBAL(__PAGE_KERNEL_LARGE) -#define PAGE_KERNEL_VSYSCALL_NOCACHE MAKE_GLOBAL(__PAGE_KERNEL_VSYSCALL_NOCACHE) - -/* xwr */ -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY -#define __P100 PAGE_READONLY_EXEC -#define __P101 PAGE_READONLY_EXEC -#define __P110 PAGE_COPY_EXEC -#define __P111 PAGE_COPY_EXEC - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED -#define __S100 PAGE_READONLY_EXEC -#define __S101 PAGE_READONLY_EXEC -#define __S110 PAGE_SHARED_EXEC -#define __S111 PAGE_SHARED_EXEC - -static inline unsigned long pgd_bad(pgd_t pgd) -{ - unsigned long val = pgd_val(pgd); - val &= ~PTE_MASK; - val &= ~(_PAGE_USER | _PAGE_DIRTY); - return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED); -} - -static inline unsigned long pud_bad(pud_t pud) -{ - unsigned long val = pud_val(pud); - val &= ~PTE_MASK; - val &= ~(_PAGE_USER | _PAGE_DIRTY); - return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED); -} - -#define pte_none(x) (!(x).pte) -#define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE)) -#define pte_clear(xp) do { set_pte(xp, __pte(0)); } while (0) - -#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) - -/* - * We detect special mappings in one of two ways: - * 1. If the MFN is an I/O page then Xen will set the m2p entry - * to be outside our maximum possible pseudophys range. - * 2. If the MFN belongs to a different domain then we will certainly - * not have MFN in our p2m table. Conversely, if the page is ours, - * then we'll have p2m(m2p(MFN))==MFN. - * If we detect a special mapping then it doesn't have a 'struct page'. - * We force !pfn_valid() by returning an out-of-range pointer. - * - * NB. These checks require that, for any MFN that is not in our reservation, - * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if - * we are foreign-mapping the MFN, and the other domain as m2p(MFN) == PFN. - * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety. - * - * NB2. When deliberately mapping foreign pages into the p2m table, you *must* - * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we - * require. In all the cases we care about, the high bit gets shifted out - * (e.g., phys_to_machine()) so behaviour there is correct. - */ -#define INVALID_P2M_ENTRY (~0U) -#define FOREIGN_FRAME(_m) ((_m) | (1UL<<((sizeof(unsigned long)*8)-1))) -#define pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT) -#define pte_pfn(_pte) \ -({ \ - unsigned long mfn = pte_mfn(_pte); \ - unsigned pfn = mfn_to_pfn(mfn); \ - if ((pfn >= max_mapnr) || (pfn_to_mfn(pfn) != mfn)) \ - pfn = max_mapnr; /* special: force !pfn_valid() */ \ - pfn; \ -}) - -#define pte_page(x) pfn_to_page(pte_pfn(x)) - -static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) -{ - pte_t pte; - - (pte).pte = (pfn_to_mfn(page_nr) << PAGE_SHIFT); - (pte).pte |= pgprot_val(pgprot); - (pte).pte &= __supported_pte_mask; - return pte; -} - -#define pfn_pte_ma(pfn, prot) __pte_ma((((pfn) << PAGE_SHIFT) | pgprot_val(prot)) & __supported_pte_mask) -/* - * The following only work if pte_present() is true. - * Undefined behaviour if not.. 
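- *
- * A typical guarded use looks like this (hypothetical sketch, not part
- * of the original header):
- *
- *	pte_t pte = *ptep;
- *	if (pte_present(pte) && pte_dirty(pte))
- *		pte = pte_mkclean(pte);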
- */ -#define __pte_val(x) ((x).pte) - -static inline int pte_user(pte_t pte) { return __pte_val(pte) & _PAGE_USER; } -extern inline int pte_read(pte_t pte) { return __pte_val(pte) & _PAGE_USER; } -extern inline int pte_exec(pte_t pte) { return __pte_val(pte) & _PAGE_USER; } -extern inline int pte_dirty(pte_t pte) { return __pte_val(pte) & _PAGE_DIRTY; } -extern inline int pte_young(pte_t pte) { return __pte_val(pte) & _PAGE_ACCESSED; } -extern inline int pte_write(pte_t pte) { return __pte_val(pte) & _PAGE_RW; } -static inline int pte_file(pte_t pte) { return __pte_val(pte) & _PAGE_FILE; } - -extern inline pte_t pte_rdprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_USER; return pte; } -extern inline pte_t pte_exprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_USER; return pte; } -extern inline pte_t pte_mkclean(pte_t pte) { __pte_val(pte) &= ~_PAGE_DIRTY; return pte; } -extern inline pte_t pte_mkold(pte_t pte) { __pte_val(pte) &= ~_PAGE_ACCESSED; return pte; } -extern inline pte_t pte_wrprotect(pte_t pte) { __pte_val(pte) &= ~_PAGE_RW; return pte; } -extern inline pte_t pte_mkread(pte_t pte) { __pte_val(pte) |= _PAGE_USER; return pte; } -extern inline pte_t pte_mkexec(pte_t pte) { __pte_val(pte) |= _PAGE_USER; return pte; } -extern inline pte_t pte_mkdirty(pte_t pte) { __pte_val(pte) |= _PAGE_DIRTY; return pte; } -extern inline pte_t pte_mkyoung(pte_t pte) { __pte_val(pte) |= _PAGE_ACCESSED; return pte; } -extern inline pte_t pte_mkwrite(pte_t pte) { __pte_val(pte) |= _PAGE_RW; return pte; } - -static inline int ptep_test_and_clear_dirty(pte_t *ptep) -{ - pte_t pte = *ptep; - int ret = pte_dirty(pte); - if (ret) - xen_l1_entry_update(ptep, pte_mkclean(pte).pte); - return ret; -} - -static inline int ptep_test_and_clear_young(pte_t *ptep) -{ - pte_t pte = *ptep; - int ret = pte_young(pte); - if (ret) - xen_l1_entry_update(ptep, pte_mkold(pte).pte); - return ret; -} - -static inline void ptep_set_wrprotect(pte_t *ptep) -{ - pte_t pte = *ptep; - if (pte_write(pte)) - set_pte(ptep, pte_wrprotect(pte)); -} -static inline void ptep_mkdirty(pte_t *ptep) -{ - pte_t pte = *ptep; - if (!pte_dirty(pte)) - xen_l1_entry_update(ptep, pte_mkdirty(pte).pte); -} - -/* - * Macro to mark a page protection value as "uncacheable". - */ -#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) - -#define __LARGE_PTE (_PAGE_PSE|_PAGE_PRESENT) -static inline int pmd_large(pmd_t pte) { - return (pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE; -} - - -/* - * Conversion functions: convert a page and protection to a page entry, - * and a page entry and page directory to the page they refer to. - */ - -#define page_pte(page) page_pte_prot(page, __pgprot(0)) - -/* - * Level 4 access. - * Never use these in the common code. - */ -#define pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & PTE_MASK)) -#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) -#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr)) -#define pgd_offset_k(address) (pgd_t *)(init_level4_pgt + pgd_index(address)) -#define pgd_present(pgd) (pgd_val(pgd) & _PAGE_PRESENT) -#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE) - -/* PUD - Level3 access */ -/* to find an entry in a page-table-directory. 
*/ -#define pud_index(address) ((address >> PUD_SHIFT) & (PTRS_PER_PUD-1)) -#define pud_offset(pgd, address) ((pud_t *) pgd_page(*(pgd)) + pud_index(address)) -static inline pud_t *__pud_offset_k(pud_t *pud, unsigned long address) -{ - return pud + pud_index(address); -} - -/* Find correct pud via the hidden fourth level page level: */ - -/* This accesses the reference page table of the boot cpu. - Other CPUs get synced lazily via the page fault handler. */ -static inline pud_t *pud_offset_k(unsigned long address) -{ - unsigned long addr; - - addr = pud_val(init_level4_pgt[pud_index(address)]); - addr &= PHYSICAL_PAGE_MASK; /* machine physical */ - addr = machine_to_phys(addr); - return __pud_offset_k((pud_t *)__va(addr), address); -} - -/* PMD - Level 2 access */ -#define pmd_page_kernel(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK)) -#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) - -#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) -#define pmd_offset(dir, address) ((pmd_t *) pud_page(*(dir)) + \ - pmd_index(address)) -#define pmd_none(x) (!pmd_val(x)) -#define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) -#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) -#define pmd_bad(x) ((pmd_val(x) & ~PTE_MASK) != _KERNPG_TABLE ) -#define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot))) -#define pmd_pfn(x) ((pmd_val(x) >> PAGE_SHIFT) & __PHYSICAL_MASK) - -#define pte_to_pgoff(pte) ((pte_val(pte) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT) -#define pgoff_to_pte(off) ((pte_t) { ((off) << PAGE_SHIFT) | _PAGE_FILE }) -#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT - -/* PTE - Level 1 access. */ - -/* page, protection -> pte */ -#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) -#define mk_pte_huge(entry) (pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE) - -/* physical address -> PTE */ -static inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot) -{ - pte_t pte; - (pte).pte = physpage | pgprot_val(pgprot); - return pte; -} - -/* Change flags of a PTE */ -extern inline pte_t pte_modify(pte_t pte, pgprot_t newprot) -{ - (pte).pte &= _PAGE_CHG_MASK; - (pte).pte |= pgprot_val(newprot); - (pte).pte &= __supported_pte_mask; - return pte; -} - -#define pte_index(address) \ - ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) -#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_kernel(*(dir)) + \ - pte_index(address)) - -/* x86-64 always has all page tables mapped. */ -#define pte_offset_map(dir,address) pte_offset_kernel(dir,address) -#define pte_offset_map_nested(dir,address) pte_offset_kernel(dir,address) -#define pte_unmap(pte) /* NOP */ -#define pte_unmap_nested(pte) /* NOP */ - -#define update_mmu_cache(vma,address,pte) do { } while (0) - -/* We only update the dirty/accessed state if we set - * the dirty bit by hand in the kernel, since the hardware - * will do the accessed bit for us, and we don't want to - * race with other CPU's that might be updating the dirty - * bit at the same time. 
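- *
- * Net effect, as an informal sketch of the macro defined just below:
- *
- *	ptep_set_access_flags(vma, addr, ptep, entry, 1);
- *		== set_pte(ptep, entry); flush_tlb_page(vma, addr);
- *
- * while a call with __dirty == 0 deliberately does nothing.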
*/ -#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS -#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \ - do { \ - if (__dirty) { \ - set_pte(__ptep, __entry); \ - flush_tlb_page(__vma, __address); \ - } \ - } while (0) - -/* Encode and de-code a swap entry */ -#define __swp_type(x) (((x).val >> 1) & 0x3f) -#define __swp_offset(x) ((x).val >> 8) -#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | ((offset) << 8) }) -#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) -#define __swp_entry_to_pte(x) ((pte_t) { (x).val }) - -#endif /* !__ASSEMBLY__ */ - -extern int kern_addr_valid(unsigned long addr); - -#define DOMID_LOCAL (0xFFFFU) - -int direct_remap_area_pages(struct mm_struct *mm, - unsigned long address, - unsigned long machine_addr, - unsigned long size, - pgprot_t prot, - domid_t domid); -int __direct_remap_area_pages(struct mm_struct *mm, - unsigned long address, - unsigned long size, - mmu_update_t *v); - -#define io_remap_page_range(vma, vaddr, paddr, size, prot) \ - remap_pfn_range(vma, vaddr, (paddr) >> PAGE_SHIFT, size, prot) - -#define HAVE_ARCH_UNMAPPED_AREA - -#define pgtable_cache_init() do { } while (0) -#define check_pgt_cache() do { } while (0) - -#define PAGE_AGP PAGE_KERNEL_NOCACHE -#define HAVE_PAGE_AGP 1 - -/* fs/proc/kcore.c */ -#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK) -#define kc_offset_to_vaddr(o) \ - (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? ((o) | (~__VIRTUAL_MASK)) : (o)) - -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR -#define __HAVE_ARCH_PTEP_SET_WRPROTECT -#define __HAVE_ARCH_PTEP_MKDIRTY -#define __HAVE_ARCH_PTE_SAME -#include <asm-generic/pgtable.h> - -#endif /* _X86_64_PGTABLE_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/processor.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/processor.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,474 +0,0 @@ -/* - * include/asm-x86_64/processor.h - * - * Copyright (C) 1994 Linus Torvalds - */ - -#ifndef __ASM_X86_64_PROCESSOR_H -#define __ASM_X86_64_PROCESSOR_H - -#include <asm/segment.h> -#include <asm/page.h> -#include <asm/types.h> -#include <asm/sigcontext.h> -#include <asm/cpufeature.h> -#include <linux/config.h> -#include <linux/threads.h> -#include <asm/msr.h> -#include <asm/current.h> -#include <asm/system.h> -#include <asm/mmsegment.h> -#include <asm/percpu.h> -#include <linux/personality.h> - -#define TF_MASK 0x00000100 -#define IF_MASK 0x00000200 -#define IOPL_MASK 0x00003000 -#define NT_MASK 0x00004000 -#define VM_MASK 0x00020000 -#define AC_MASK 0x00040000 -#define VIF_MASK 0x00080000 /* virtual interrupt flag */ -#define VIP_MASK 0x00100000 /* virtual interrupt pending */ -#define ID_MASK 0x00200000 - -#define desc_empty(desc) \ - (!((desc)->a + (desc)->b)) - -#define desc_equal(desc1, desc2) \ - (((desc1)->a == (desc2)->a) && ((desc1)->b == (desc2)->b)) - -/* - * Default implementation of macro that returns current - * instruction pointer ("program counter"). - */ -#define current_text_addr() ({ void *pc; asm volatile("leaq 1f(%%rip),%0\n1:":"=r"(pc)); pc; }) - -/* - * CPU type and hardware bug flags. Kept separately for each CPU. 
- */ - -struct cpuinfo_x86 { - __u8 x86; /* CPU family */ - __u8 x86_vendor; /* CPU vendor */ - __u8 x86_model; - __u8 x86_mask; - int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */ - __u32 x86_capability[NCAPINTS]; - char x86_vendor_id[16]; - char x86_model_id[64]; - int x86_cache_size; /* in KB */ - int x86_clflush_size; - int x86_cache_alignment; - int x86_tlbsize; /* number of 4K pages in DTLB/ITLB combined(in pages)*/ - __u8 x86_virt_bits, x86_phys_bits; - __u8 x86_num_cores; - __u8 x86_apicid; - __u32 x86_power; - __u32 x86_cpuid_level; /* Max CPUID function supported */ - unsigned long loops_per_jiffy; -} ____cacheline_aligned; - -#define X86_VENDOR_INTEL 0 -#define X86_VENDOR_CYRIX 1 -#define X86_VENDOR_AMD 2 -#define X86_VENDOR_UMC 3 -#define X86_VENDOR_NEXGEN 4 -#define X86_VENDOR_CENTAUR 5 -#define X86_VENDOR_RISE 6 -#define X86_VENDOR_TRANSMETA 7 -#define X86_VENDOR_NUM 8 -#define X86_VENDOR_UNKNOWN 0xff - -#ifdef CONFIG_SMP -extern struct cpuinfo_x86 cpu_data[]; -#define current_cpu_data cpu_data[smp_processor_id()] -#else -#define cpu_data (&boot_cpu_data) -#define current_cpu_data boot_cpu_data -#endif - -extern char ignore_irq13; - -extern void identify_cpu(struct cpuinfo_x86 *); -extern void print_cpu_info(struct cpuinfo_x86 *); -extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); -extern void dodgy_tsc(void); - -/* - * EFLAGS bits - */ -#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ -#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ -#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */ -#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ -#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ -#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ -#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */ -#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */ -#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */ -#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */ -#define X86_EFLAGS_NT 0x00004000 /* Nested Task */ -#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */ -#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */ -#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */ -#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */ -#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ -#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ - -/* - * Intel CPU features in CR4 - */ -#define X86_CR4_VME 0x0001 /* enable vm86 extensions */ -#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */ -#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */ -#define X86_CR4_DE 0x0008 /* enable debugging extensions */ -#define X86_CR4_PSE 0x0010 /* enable page size extensions */ -#define X86_CR4_PAE 0x0020 /* enable physical address extensions */ -#define X86_CR4_MCE 0x0040 /* Machine check enable */ -#define X86_CR4_PGE 0x0080 /* enable global pages */ -#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */ -#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */ -#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */ - -/* - * Save the cr4 feature set we're using (ie - * Pentium 4MB enable and PPro Global page - * enable), so that any CPU's that boot up - * after us can get the correct flags. 
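- *
- * Note that under Xen only the FPU-related bits may actually be
- * changed; as a sketch of the function below:
- *
- *	set_in_cr4(X86_CR4_OSFXSR);	accepted
- *	set_in_cr4(X86_CR4_PSE);	hits the unsupported-update BUG()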
- */ -extern unsigned long mmu_cr4_features; - -static inline void set_in_cr4 (unsigned long mask) -{ - mmu_cr4_features |= mask; - switch (mask) { - case X86_CR4_OSFXSR: - case X86_CR4_OSXMMEXCPT: - break; - default: - do { - const char *msg = "Xen unsupported cr4 update\n"; - (void)HYPERVISOR_console_io( - CONSOLEIO_write, __builtin_strlen(msg), - (char *)msg); - BUG(); - } while (0); - } -} - -#define load_cr3(pgdir) do { \ - xen_pt_switch(__pa(pgdir)); \ - per_cpu(cur_pgd, smp_processor_id()) = pgdir; \ -} while (/* CONSTCOND */0) - -/* - * Bus types - */ -#define MCA_bus 0 -#define MCA_bus__is_a_macro - - -/* - * User space process size. 47bits. - */ -#define TASK_SIZE (0x800000000000UL) - -/* This decides where the kernel will search for a free chunk of vm - * space during mmap's. - */ -#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? 0xc0000000 : 0xFFFFe000) -#define TASK_UNMAPPED_32 PAGE_ALIGN(IA32_PAGE_OFFSET/3) -#define TASK_UNMAPPED_64 PAGE_ALIGN(TASK_SIZE/3) -#define TASK_UNMAPPED_BASE \ - (test_thread_flag(TIF_IA32) ? TASK_UNMAPPED_32 : TASK_UNMAPPED_64) - -/* - * Size of io_bitmap. - */ -#define IO_BITMAP_BITS 65536 -#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) -#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) -#define IO_BITMAP_OFFSET offsetof(struct tss_struct,io_bitmap) -#define INVALID_IO_BITMAP_OFFSET 0x8000 - -struct i387_fxsave_struct { - u16 cwd; - u16 swd; - u16 twd; - u16 fop; - u64 rip; - u64 rdp; - u32 mxcsr; - u32 mxcsr_mask; - u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ - u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 128 bytes */ - u32 padding[24]; -} __attribute__ ((aligned (16))); - -union i387_union { - struct i387_fxsave_struct fxsave; -}; - -struct tss_struct { - u32 reserved1; - u64 rsp0; - u64 rsp1; - u64 rsp2; - u64 reserved2; - u64 ist[7]; - u32 reserved3; - u32 reserved4; - u16 reserved5; - u16 io_bitmap_base; - /* - * The extra 1 is there because the CPU will access an - * additional byte beyond the end of the IO permission - * bitmap. The extra byte must be all 1 bits, and must - * be within the limit. Thus we have: - * - * 128 bytes, the bitmap itself, for ports 0..0x3ff - * 8 bytes, for an extra "long" of ~0UL - */ - unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; -} __attribute__((packed)) ____cacheline_aligned; - -extern struct cpuinfo_x86 boot_cpu_data; -DECLARE_PER_CPU(struct tss_struct,init_tss); -DECLARE_PER_CPU(pgd_t *, cur_pgd); - -#define ARCH_MIN_TASKALIGN 16 - -struct thread_struct { - unsigned long rsp0; - unsigned long rsp; - unsigned long userrsp; /* Copy from PDA */ - unsigned long fs; - unsigned long gs; - unsigned int io_pl; - unsigned short es, ds, fsindex, gsindex; -/* Hardware debugging registers */ - unsigned long debugreg0; - unsigned long debugreg1; - unsigned long debugreg2; - unsigned long debugreg3; - unsigned long debugreg6; - unsigned long debugreg7; -/* fault info */ - unsigned long cr2, trap_no, error_code; -/* floating point info */ - union i387_union i387 __attribute__((aligned(16))); -/* IO permissions. the bitmap could be moved into the GDT, that would make - switch faster for a limited number of ioperm using tasks. -AK */ - int ioperm; - unsigned long *io_bitmap_ptr; - unsigned io_bitmap_max; -/* cached TLS descriptors. 
*/ - u64 tls_array[GDT_ENTRY_TLS_ENTRIES]; -} __attribute__((aligned(16))); - -#define INIT_THREAD {} - -#define INIT_MMAP \ -{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL } - -#define STACKFAULT_STACK 1 -#define DOUBLEFAULT_STACK 2 -#define NMI_STACK 3 -#define DEBUG_STACK 4 -#define MCE_STACK 5 -#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ -#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER) -#define EXCEPTION_STACK_ORDER 0 - -#define start_thread(regs,new_rip,new_rsp) do { \ - asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \ - load_gs_index(0); \ - (regs)->rip = (new_rip); \ - (regs)->rsp = (new_rsp); \ - write_pda(oldrsp, (new_rsp)); \ - (regs)->cs = __USER_CS; \ - (regs)->ss = __USER_DS; \ - (regs)->eflags = 0x200; \ - set_fs(USER_DS); \ -} while(0) - -struct task_struct; -struct mm_struct; - -/* Free all resources held by a thread. */ -extern void release_thread(struct task_struct *); - -/* Prepare to copy thread state - unlazy all lazy status */ -extern void prepare_to_copy(struct task_struct *tsk); - -/* - * create a kernel thread without removing it from tasklists - */ -extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); - -/* - * Return saved PC of a blocked thread. - * What is this good for? it will be always the scheduler or ret_from_fork. - */ -#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.rsp - 8)) - -extern unsigned long get_wchan(struct task_struct *p); -#define KSTK_EIP(tsk) \ - (((struct pt_regs *)(tsk->thread.rsp0 - sizeof(struct pt_regs)))->rip) -#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */ - - -struct microcode_header { - unsigned int hdrver; - unsigned int rev; - unsigned int date; - unsigned int sig; - unsigned int cksum; - unsigned int ldrver; - unsigned int pf; - unsigned int datasize; - unsigned int totalsize; - unsigned int reserved[3]; -}; - -struct microcode { - struct microcode_header hdr; - unsigned int bits[0]; -}; - -typedef struct microcode microcode_t; -typedef struct microcode_header microcode_header_t; - -/* microcode format is extended from prescott processors */ -struct extended_signature { - unsigned int sig; - unsigned int pf; - unsigned int cksum; -}; - -struct extended_sigtable { - unsigned int count; - unsigned int cksum; - unsigned int reserved[3]; - struct extended_signature sigs[0]; -}; - -/* '6' because it used to be for P6 only (but now covers Pentium 4 as well) */ -#define MICROCODE_IOCFREE _IO('6',0) - - -#define ASM_NOP1 K8_NOP1 -#define ASM_NOP2 K8_NOP2 -#define ASM_NOP3 K8_NOP3 -#define ASM_NOP4 K8_NOP4 -#define ASM_NOP5 K8_NOP5 -#define ASM_NOP6 K8_NOP6 -#define ASM_NOP7 K8_NOP7 -#define ASM_NOP8 K8_NOP8 - -/* Opteron nops */ -#define K8_NOP1 ".byte 0x90\n" -#define K8_NOP2 ".byte 0x66,0x90\n" -#define K8_NOP3 ".byte 0x66,0x66,0x90\n" -#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n" -#define K8_NOP5 K8_NOP3 K8_NOP2 -#define K8_NOP6 K8_NOP3 K8_NOP3 -#define K8_NOP7 K8_NOP4 K8_NOP3 -#define K8_NOP8 K8_NOP4 K8_NOP4 - -#define ASM_NOP_MAX 8 - -/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. 
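- *
- * Canonical busy-wait sketch (illustration only; "ready" is a made-up
- * flag):
- *
- *	while (!ready)
- *		cpu_relax();	defined below as rep_nop()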
*/ -extern inline void rep_nop(void) -{ - __asm__ __volatile__("rep;nop": : :"memory"); -} - -/* Stop speculative execution */ -extern inline void sync_core(void) -{ - int tmp; - asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory"); -} - -#define cpu_has_fpu 1 - -#define ARCH_HAS_PREFETCH -static inline void prefetch(void *x) -{ - asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x)); -} - -#define ARCH_HAS_PREFETCHW 1 -static inline void prefetchw(void *x) -{ - alternative_input(ASM_NOP5, - "prefetchw (%1)", - X86_FEATURE_3DNOW, - "r" (x)); -} - -#define ARCH_HAS_SPINLOCK_PREFETCH 1 - -#define spin_lock_prefetch(x) prefetchw(x) - -#define cpu_relax() rep_nop() - -/* - * NSC/Cyrix CPU configuration register indexes - */ -#define CX86_CCR0 0xc0 -#define CX86_CCR1 0xc1 -#define CX86_CCR2 0xc2 -#define CX86_CCR3 0xc3 -#define CX86_CCR4 0xe8 -#define CX86_CCR5 0xe9 -#define CX86_CCR6 0xea -#define CX86_CCR7 0xeb -#define CX86_DIR0 0xfe -#define CX86_DIR1 0xff -#define CX86_ARR_BASE 0xc4 -#define CX86_RCR_BASE 0xdc - -/* - * NSC/Cyrix CPU indexed register access macros - */ - -#define getCx86(reg) ({ outb((reg), 0x22); inb(0x23); }) - -#define setCx86(reg, data) do { \ - outb((reg), 0x22); \ - outb((data), 0x23); \ -} while (0) - -static inline void __monitor(const void *eax, unsigned long ecx, - unsigned long edx) -{ - /* "monitor %eax,%ecx,%edx;" */ - asm volatile( - ".byte 0x0f,0x01,0xc8;" - : :"a" (eax), "c" (ecx), "d"(edx)); -} - -static inline void __mwait(unsigned long eax, unsigned long ecx) -{ - /* "mwait %eax,%ecx;" */ - asm volatile( - ".byte 0x0f,0x01,0xc9;" - : :"a" (eax), "c" (ecx)); -} - -#define stack_current() \ -({ \ - struct thread_info *ti; \ - asm("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \ - ti->task; \ -}) - -#define cache_line_size() (boot_cpu_data.x86_cache_alignment) - -extern unsigned long boot_option_idle_override; -/* Boot loader type from the setup header */ -extern int bootloader_type; - -#endif /* __ASM_X86_64_PROCESSOR_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/ptrace.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/ptrace.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,119 +0,0 @@ -#ifndef _X86_64_PTRACE_H -#define _X86_64_PTRACE_H - -#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS) -#define R15 0 -#define R14 8 -#define R13 16 -#define R12 24 -#define RBP 32 -#define RBX 40 -/* arguments: interrupts/non tracing syscalls only save upto here*/ -#define R11 48 -#define R10 56 -#define R9 64 -#define R8 72 -#define RAX 80 -#define RCX 88 -#define RDX 96 -#define RSI 104 -#define RDI 112 -#define ORIG_RAX 120 /* = ERROR */ -/* end of arguments */ -/* cpu exception frame or undefined in case of fast syscall. 
*/ -#define RIP 128 -#define CS 136 -#define EFLAGS 144 -#define RSP 152 -#define SS 160 -#define ARGOFFSET R11 -#endif /* __ASSEMBLY__ */ - -/* top of stack page */ -#define FRAME_SIZE 168 - -#define PTRACE_OLDSETOPTIONS 21 - -#ifndef __ASSEMBLY__ - -struct pt_regs { - unsigned long r15; - unsigned long r14; - unsigned long r13; - unsigned long r12; - unsigned long rbp; - unsigned long rbx; -/* arguments: non interrupts/non tracing syscalls only save upto here*/ - unsigned long r11; - unsigned long r10; - unsigned long r9; - unsigned long r8; - unsigned long rax; - unsigned long rcx; - unsigned long rdx; - unsigned long rsi; - unsigned long rdi; - unsigned long orig_rax; -/* end of arguments */ -/* cpu exception frame or undefined */ - unsigned long rip; - unsigned long cs; - unsigned long eflags; - unsigned long rsp; - unsigned long ss; -/* top of stack page */ -}; - -#endif - -/* Arbitrarily choose the same ptrace numbers as used by the Sparc code. */ -#define PTRACE_GETREGS 12 -#define PTRACE_SETREGS 13 -#define PTRACE_GETFPREGS 14 -#define PTRACE_SETFPREGS 15 -#define PTRACE_GETFPXREGS 18 -#define PTRACE_SETFPXREGS 19 - -/* only useful for access 32bit programs */ -#define PTRACE_GET_THREAD_AREA 25 -#define PTRACE_SET_THREAD_AREA 26 - -#define PTRACE_ARCH_PRCTL 30 /* arch_prctl for child */ - -#if defined(__KERNEL__) && !defined(__ASSEMBLY__) -#define user_mode(regs) (!!((regs)->cs & 3)) -#define instruction_pointer(regs) ((regs)->rip) -#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) -extern unsigned long profile_pc(struct pt_regs *regs); -#else -#define profile_pc(regs) instruction_pointer(regs) -#endif - -void signal_fault(struct pt_regs *regs, void __user *frame, char *where); - -enum { - EF_CF = 0x00000001, - EF_PF = 0x00000004, - EF_AF = 0x00000010, - EF_ZF = 0x00000040, - EF_SF = 0x00000080, - EF_TF = 0x00000100, - EF_IE = 0x00000200, - EF_DF = 0x00000400, - EF_OF = 0x00000800, - EF_IOPL = 0x00003000, - EF_IOPL_RING0 = 0x00000000, - EF_IOPL_RING1 = 0x00001000, - EF_IOPL_RING2 = 0x00002000, - EF_NT = 0x00004000, /* nested task */ - EF_RF = 0x00010000, /* resume */ - EF_VM = 0x00020000, /* virtual mode */ - EF_AC = 0x00040000, /* alignment */ - EF_VIF = 0x00080000, /* virtual interrupt */ - EF_VIP = 0x00100000, /* virtual interrupt pending */ - EF_ID = 0x00200000, /* id */ -}; - -#endif - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/segment.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/segment.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,47 +0,0 @@ -#ifndef _ASM_SEGMENT_H -#define _ASM_SEGMENT_H - -#include <asm/cache.h> - -#define __KERNEL_CS 0x10 -#define __KERNEL_DS 0x1b - -#define __KERNEL32_CS 0x3b - -/* - * we cannot use the same code segment descriptor for user and kernel - * -- not even in the long flat mode, because of different DPL /kkeil - * The segment offset needs to contain a RPL. Grr. 
-AK - * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets) - */ - -#define __USER32_CS 0x23 /* 4*8+3 */ -#define __USER_DS 0x2b /* 5*8+3 */ -#define __USER_CS 0x33 /* 6*8+3 */ -#define __USER32_DS __USER_DS -#define __KERNEL16_CS (GDT_ENTRY_KERNELCS16 * 8) -#define __KERNEL_COMPAT32_CS 0x8 - -#define GDT_ENTRY_TLS 1 -#define GDT_ENTRY_TSS 8 /* needs two entries */ -#define GDT_ENTRY_LDT 10 -#define GDT_ENTRY_TLS_MIN 11 -#define GDT_ENTRY_TLS_MAX 13 -/* 14 free */ -#define GDT_ENTRY_KERNELCS16 15 - -#define GDT_ENTRY_TLS_ENTRIES 3 - -/* TLS indexes for 64bit - hardcoded in arch_prctl */ -#define FS_TLS 0 -#define GS_TLS 1 - -#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3) -#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3) - -#define IDT_ENTRIES 256 -#define GDT_ENTRIES 16 -#define GDT_SIZE (GDT_ENTRIES * 8) -#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8) - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/smp.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/smp.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,154 +0,0 @@ -#ifndef __ASM_SMP_H -#define __ASM_SMP_H - -/* - * We need the APIC definitions automatically as part of 'smp.h' - */ -#ifndef __ASSEMBLY__ -#include <linux/config.h> -#include <linux/threads.h> -#include <linux/cpumask.h> -#include <linux/bitops.h> -extern int disable_apic; -#endif - -#ifdef CONFIG_X86_LOCAL_APIC -#ifndef __ASSEMBLY__ -#include <asm/fixmap.h> -#include <asm/mpspec.h> -#ifdef CONFIG_X86_IO_APIC -#include <asm/io_apic.h> -#endif -#include <asm/apic.h> -#include <asm/thread_info.h> -#endif -#endif - -#ifdef CONFIG_SMP -#ifndef ASSEMBLY - -#include <asm/pda.h> - -struct pt_regs; - -/* - * Private routines/data - */ - -extern void smp_alloc_memory(void); -extern cpumask_t cpu_online_map; -extern volatile unsigned long smp_invalidate_needed; -extern int pic_mode; -extern int smp_num_siblings; -extern void smp_flush_tlb(void); -extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); -extern void smp_send_reschedule(int cpu); -extern void smp_invalidate_rcv(void); /* Process an NMI */ -extern void (*mtrr_hook) (void); -extern void zap_low_mappings(void); -void smp_stop_cpu(void); -extern cpumask_t cpu_sibling_map[NR_CPUS]; -extern u8 phys_proc_id[NR_CPUS]; - -#define SMP_TRAMPOLINE_BASE 0x6000 - -/* - * On x86 all CPUs are mapped 1:1 to the APIC space. - * This simplifies scheduling and IPI sending and - * compresses data structures. - */ - -extern cpumask_t cpu_callout_map; -extern cpumask_t cpu_callin_map; -#define cpu_possible_map cpu_callout_map - -static inline int num_booting_cpus(void) -{ - return cpus_weight(cpu_callout_map); -} - -#define __smp_processor_id() read_pda(cpunumber) - -#ifdef CONFIG_X86_LOCAL_APIC -extern __inline int hard_smp_processor_id(void) -{ - /* we don't want to mark this access volatile - bad code generation */ - return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID)); -} -#endif - -#define safe_smp_processor_id() (disable_apic ? 0 : x86_apicid_to_cpu(hard_smp_processor_id())) - -#endif /* !ASSEMBLY */ - -#define NO_PROC_ID 0xFF /* No processor magic marker */ - -#endif - -#ifndef ASSEMBLY -/* - * Some lowlevel functions might want to know about - * the real APIC ID <-> CPU # mapping. 
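- *
- * A hypothetical round trip through the tables declared below:
- *
- *	u8 apicid = x86_cpu_to_apicid[cpu];
- *	int same  = x86_apicid_to_cpu(apicid);	inverse lookup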
- */ -extern u8 x86_cpu_to_apicid[NR_CPUS]; /* physical ID */ -extern u8 x86_cpu_to_log_apicid[NR_CPUS]; -extern u8 bios_cpu_apicid[]; -#ifdef CONFIG_X86_LOCAL_APIC -static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) -{ - return cpus_addr(cpumask)[0]; -} - -static inline int x86_apicid_to_cpu(u8 apicid) -{ - int i; - - for (i = 0; i < NR_CPUS; ++i) - if (x86_cpu_to_apicid[i] == apicid) - return i; - - /* No entries in x86_cpu_to_apicid? Either no MPS|ACPI, - * or called too early. Either way, we must be CPU 0. */ - if (x86_cpu_to_apicid[0] == BAD_APICID) - return 0; - - return -1; -} - -static inline int cpu_present_to_apicid(int mps_cpu) -{ - if (mps_cpu < NR_CPUS) - return (int)bios_cpu_apicid[mps_cpu]; - else - return BAD_APICID; -} -#endif - -#endif /* !ASSEMBLY */ - -#ifndef CONFIG_SMP -#define stack_smp_processor_id() 0 -#define safe_smp_processor_id() 0 -#define cpu_logical_map(x) (x) -#else -#include <asm/thread_info.h> -#define stack_smp_processor_id() \ -({ \ - struct thread_info *ti; \ - __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \ - ti->cpu; \ -}) -#endif - -#ifndef __ASSEMBLY__ -#ifdef CONFIG_X86_LOCAL_APIC -static __inline int logical_smp_processor_id(void) -{ - /* we don't want to mark this access volatile - bad code generation */ - return GET_APIC_LOGICAL_ID(*(unsigned long *)(APIC_BASE+APIC_LDR)); -} -#endif -#endif - -#endif - diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/system.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/system.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,403 +0,0 @@ -#ifndef __ASM_SYSTEM_H -#define __ASM_SYSTEM_H - -#include <linux/config.h> -#include <linux/kernel.h> -#include <asm/segment.h> -#include <asm-xen/synch_bitops.h> -#include <asm-xen/hypervisor.h> -#include <asm-xen/xen-public/arch-x86_64.h> - -#ifdef __KERNEL__ - -#ifdef CONFIG_SMP -#define LOCK_PREFIX "lock ; " -#else -#define LOCK_PREFIX "" -#endif - -#define __STR(x) #x -#define STR(x) __STR(x) - -#define __SAVE(reg,offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t" -#define __RESTORE(reg,offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t" - -/* frame pointer must be last for get_wchan */ -#define SAVE_CONTEXT "pushfq ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" -#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popfq\n\t" - -#define __EXTRA_CLOBBER \ - ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15" - -#define switch_to(prev,next,last) \ - asm volatile(SAVE_CONTEXT \ - "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ - "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ - "call __switch_to\n\t" \ - ".globl thread_return\n" \ - "thread_return:\n\t" \ - "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \ - "movq %P[thread_info](%%rsi),%%r8\n\t" \ - LOCK "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \ - "movq %%rax,%%rdi\n\t" \ - "jc ret_from_fork\n\t" \ - RESTORE_CONTEXT \ - : "=a" (last) \ - : [next] "S" (next), [prev] "D" (prev), \ - [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \ - [ti_flags] "i" (offsetof(struct thread_info, flags)),\ - [tif_fork] "i" (TIF_FORK), \ - [thread_info] "i" (offsetof(struct task_struct, thread_info)), \ - [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \ - : "memory", "cc" __EXTRA_CLOBBER) - - -extern void load_gs_index(unsigned); - -/* - * Load a segment. Fall back on loading the zero - * segment if something goes wrong.. 
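- *
- * Usage sketch (illustration only): a faulting selector load is caught
- * by the fixup entry and retried with selector 0, e.g.
- *
- *	loadsegment(fs, next->thread.fsindex);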
- */ -#define loadsegment(seg,value) \ - asm volatile("\n" \ - "1:\t" \ - "movl %k0,%%" #seg "\n" \ - "2:\n" \ - ".section .fixup,\"ax\"\n" \ - "3:\t" \ - "movl %1,%%" #seg "\n\t" \ - "jmp 2b\n" \ - ".previous\n" \ - ".section __ex_table,\"a\"\n\t" \ - ".align 8\n\t" \ - ".quad 1b,3b\n" \ - ".previous" \ - : :"r" (value), "r" (0)) - -#define set_debug(value,register) \ - __asm__("movq %0,%%db" #register \ - : /* no output */ \ - :"r" ((unsigned long) value)) - - -#ifdef __KERNEL__ -struct alt_instr { - __u8 *instr; /* original instruction */ - __u8 *replacement; - __u8 cpuid; /* cpuid bit set for replacement */ - __u8 instrlen; /* length of original instruction */ - __u8 replacementlen; /* length of new instruction, <= instrlen */ - __u8 pad[5]; -}; -#endif - -/* - * Alternative instructions for different CPU types or capabilities. - * - * This allows the use of optimized instructions even on generic binary - * kernels. - * - * The length of oldinstr must be greater than or equal to the length of - * newinstr. It can be padded with nops as needed. - * - * For non-barrier-like inlines please define new variants - * without volatile and memory clobber. - */ -#define alternative(oldinstr, newinstr, feature) \ - asm volatile ("661:\n\t" oldinstr "\n662:\n" \ - ".section .altinstructions,\"a\"\n" \ - " .align 8\n" \ - " .quad 661b\n" /* label */ \ - " .quad 663f\n" /* new instruction */ \ - " .byte %c0\n" /* feature bit */ \ - " .byte 662b-661b\n" /* sourcelen */ \ - " .byte 664f-663f\n" /* replacementlen */ \ - ".previous\n" \ - ".section .altinstr_replacement,\"ax\"\n" \ - "663:\n\t" newinstr "\n664:\n" /* replacement */ \ - ".previous" :: "i" (feature) : "memory") - -/* - * Alternative inline assembly with input. - * - * Peculiarities: - * No memory clobber here. - * Argument numbers start with 1. - * Best is to use constraints that are fixed size (like (%1) ... "r") - * If you use variable sized constraints like "m" or "g" in the - * replacement make sure to pad to the worst case length. - */ -#define alternative_input(oldinstr, newinstr, feature, input...)
\ - asm volatile ("661:\n\t" oldinstr "\n662:\n" \ - ".section .altinstructions,\"a\"\n" \ - " .align 8\n" \ - " .quad 661b\n" /* label */ \ - " .quad 663f\n" /* new instruction */ \ - " .byte %c0\n" /* feature bit */ \ - " .byte 662b-661b\n" /* sourcelen */ \ - " .byte 664f-663f\n" /* replacementlen */ \ - ".previous\n" \ - ".section .altinstr_replacement,\"ax\"\n" \ - "663:\n\t" newinstr "\n664:\n" /* replacement */ \ - ".previous" :: "i" (feature), ##input) - -/* - * Clear and set 'TS' bit respectively - */ -#define clts() (HYPERVISOR_fpu_taskswitch(0)) - -static inline unsigned long read_cr0(void) -{ - unsigned long cr0; - asm volatile("movq %%cr0,%0" : "=r" (cr0)); - return cr0; -} - -static inline void write_cr0(unsigned long val) -{ - asm volatile("movq %0,%%cr0" :: "r" (val)); -} - -static inline unsigned long read_cr3(void) -{ - unsigned long cr3; - asm("movq %%cr3,%0" : "=r" (cr3)); - return cr3; -} - -static inline unsigned long read_cr4(void) -{ - unsigned long cr4; - asm("movq %%cr4,%0" : "=r" (cr4)); - return cr4; -} - -static inline void write_cr4(unsigned long val) -{ - asm volatile("movq %0,%%cr4" :: "r" (val)); -} - -#define stts() (HYPERVISOR_fpu_taskswitch(1)) - -#define wbinvd() \ - __asm__ __volatile__ ("wbinvd": : :"memory"); - -#endif /* __KERNEL__ */ - -#define nop() __asm__ __volatile__ ("nop") - -#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr)))) - -#define tas(ptr) (xchg((ptr),1)) - -#define __xg(x) ((volatile long *)(x)) - -extern inline void set_64bit(volatile unsigned long *ptr, unsigned long val) -{ - *ptr = val; -} - -#define _set_64bit set_64bit - -/* - * Note: no "lock" prefix even on SMP: xchg always implies lock anyway - * Note 2: xchg has side effect, so that attribute volatile is necessary, - * but generally the primitive is invalid, *ptr is output argument. --ANK - */ -static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size) -{ - switch (size) { - case 1: - __asm__ __volatile__("xchgb %b0,%1" - :"=q" (x) - :"m" (*__xg(ptr)), "0" (x) - :"memory"); - break; - case 2: - __asm__ __volatile__("xchgw %w0,%1" - :"=r" (x) - :"m" (*__xg(ptr)), "0" (x) - :"memory"); - break; - case 4: - __asm__ __volatile__("xchgl %k0,%1" - :"=r" (x) - :"m" (*__xg(ptr)), "0" (x) - :"memory"); - break; - case 8: - __asm__ __volatile__("xchgq %0,%1" - :"=r" (x) - :"m" (*__xg(ptr)), "0" (x) - :"memory"); - break; - } - return x; -} - -/* - * Atomic compare and exchange. Compare OLD with MEM, if identical, - * store NEW in MEM. Return the initial value in MEM. Success is - * indicated by comparing RETURN with OLD. 
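/*
 * A minimal sketch (not from this tree) of the usual retry idiom built
 * on the cmpxchg() wrapper defined just below: re-read, compute, and
 * loop until the value returned equals the value we started from.
 */
static inline void atomic_add_ulong(volatile unsigned long *p, unsigned long n)
{
	unsigned long old;
	do {
		old = *p;
	} while (cmpxchg(p, old, old + n) != old);	/* raced: retry */
}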
- */ - -#define __HAVE_ARCH_CMPXCHG 1 - -static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, - unsigned long new, int size) -{ - unsigned long prev; - switch (size) { - case 1: - __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 2: - __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 4: - __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %k1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - case 8: - __asm__ __volatile__(LOCK_PREFIX "cmpxchgq %1,%2" - : "=a"(prev) - : "q"(new), "m"(*__xg(ptr)), "0"(old) - : "memory"); - return prev; - } - return old; -} - -#define cmpxchg(ptr,o,n)\ - ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\ - (unsigned long)(n),sizeof(*(ptr)))) - -#ifdef CONFIG_SMP -#define smp_mb() mb() -#define smp_rmb() rmb() -#define smp_wmb() wmb() -#define smp_read_barrier_depends() do {} while(0) -#else -#define smp_mb() barrier() -#define smp_rmb() barrier() -#define smp_wmb() barrier() -#define smp_read_barrier_depends() do {} while(0) -#endif - - -/* - * Force strict CPU ordering. - * And yes, this is required on UP too when we're talking - * to devices. - */ -#define mb() asm volatile("mfence":::"memory") -#define rmb() asm volatile("lfence":::"memory") - -#ifdef CONFIG_UNORDERED_IO -#define wmb() asm volatile("sfence" ::: "memory") -#else -#define wmb() asm volatile("" ::: "memory") -#endif -#define read_barrier_depends() do {} while(0) -#define set_mb(var, value) do { xchg(&var, value); } while (0) -#define set_wmb(var, value) do { var = value; wmb(); } while (0) - -#define warn_if_not_ulong(x) do { unsigned long foo; (void) (&(x) == &foo); } while (0) - - -/* - * The use of 'barrier' in the following reflects their use as local-lock - * operations. Reentrancy must be prevented (e.g., __cli()) /before/ following - * critical operations are executed. All critical operations must complete - * /before/ reentrancy is permitted (e.g., __sti()). Alpha architecture also - * includes these barriers, for example. 
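/*
 * Illustrative pairing for the barrier macros above; the flag and
 * payload variables are hypothetical. The writer orders its two stores
 * with wmb(), the reader orders its two loads with rmb().
 */
static unsigned long payload;
static int payload_ready;

static void publish(unsigned long v)
{
	payload = v;
	wmb();			/* payload visible before the flag */
	payload_ready = 1;
}

static int try_consume(unsigned long *out)
{
	if (!payload_ready)
		return 0;
	rmb();			/* read the flag before the payload */
	*out = payload;
	return 1;
}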
- */ - -#define __cli() \ -do { \ - vcpu_info_t *_vcpu; \ - preempt_disable(); \ - _vcpu = &HYPERVISOR_shared_info->vcpu_data[smp_processor_id()]; \ - _vcpu->evtchn_upcall_mask = 1; \ - preempt_enable_no_resched(); \ - barrier(); \ -} while (0) - -#define __sti() \ -do { \ - vcpu_info_t *_vcpu; \ - barrier(); \ - preempt_disable(); \ - _vcpu = &HYPERVISOR_shared_info->vcpu_data[smp_processor_id()]; \ - _vcpu->evtchn_upcall_mask = 0; \ - barrier(); /* unmask then check (avoid races) */ \ - if ( unlikely(_vcpu->evtchn_upcall_pending) ) \ - force_evtchn_callback(); \ - preempt_enable(); \ -} while (0) - -#define __save_flags(x) \ -do { \ - vcpu_info_t *_vcpu; \ - _vcpu = &HYPERVISOR_shared_info->vcpu_data[smp_processor_id()]; \ - (x) = _vcpu->evtchn_upcall_mask; \ -} while (0) - -#define __restore_flags(x) \ -do { \ - vcpu_info_t *_vcpu; \ - barrier(); \ - preempt_disable(); \ - _vcpu = &HYPERVISOR_shared_info->vcpu_data[smp_processor_id()]; \ - if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \ - barrier(); /* unmask then check (avoid races) */ \ - if ( unlikely(_vcpu->evtchn_upcall_pending) ) \ - force_evtchn_callback(); \ - preempt_enable(); \ - } else \ - preempt_enable_no_resched(); \ -} while (0) - -#define safe_halt() ((void)0) - -#define __save_and_cli(x) \ -do { \ - vcpu_info_t *_vcpu; \ - preempt_disable(); \ - _vcpu = &HYPERVISOR_shared_info->vcpu_data[smp_processor_id()]; \ - (x) = _vcpu->evtchn_upcall_mask; \ - _vcpu->evtchn_upcall_mask = 1; \ - preempt_enable_no_resched(); \ - barrier(); \ -} while (0) - -void cpu_idle_wait(void); - -#define local_irq_save(x) __save_and_cli(x) -#define local_irq_restore(x) __restore_flags(x) -#define local_save_flags(x) __save_flags(x) -#define local_irq_disable() __cli() -#define local_irq_enable() __sti() - -#define irqs_disabled() \ - HYPERVISOR_shared_info->vcpu_data[smp_processor_id()].evtchn_upcall_mask - -/* - * disable hlt during certain critical i/o operations - */ -#define HAVE_DISABLE_HLT -void disable_hlt(void); -void enable_hlt(void); - -#define HAVE_EAT_KEY -void eat_key(void); - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/timer.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/timer.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,64 +0,0 @@ -#ifndef _ASMi386_TIMER_H -#define _ASMi386_TIMER_H -#include <linux/init.h> - -/** - * struct timer_opts - used to define a timer source - * - * @name: name of the timer. - * @init: Probes and initializes the timer. Takes clock= override - * string as an argument. Returns 0 on success, anything else - * on failure. - * @mark_offset: called by the timer interrupt. - * @get_offset: called by gettimeofday(). Returns the number of microseconds - * since the last timer interrupt. - * @monotonic_clock: returns the number of nanoseconds since the init of the - * timer. - * @delay: delays this many clock cycles.
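/*
 * A hypothetical timer source wired into the ops table documented above
 * (struct timer_opts itself is defined just below); no such source
 * exists in this tree.
 */
static void null_mark_offset(void) { }
static unsigned long null_get_offset(void) { return 0; }
static unsigned long long null_monotonic_clock(void) { return 0; }
static void null_delay(unsigned long loops) { while (loops--) cpu_relax(); }

static struct timer_opts timer_null_example = {
	.name            = "null-example",
	.mark_offset     = null_mark_offset,
	.get_offset      = null_get_offset,
	.monotonic_clock = null_monotonic_clock,
	.delay           = null_delay,
};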
- */ -struct timer_opts { - char* name; - void (*mark_offset)(void); - unsigned long (*get_offset)(void); - unsigned long long (*monotonic_clock)(void); - void (*delay)(unsigned long); -}; - -struct init_timer_opts { - int (*init)(char *override); - struct timer_opts *opts; -}; - -#define TICK_SIZE (tick_nsec / 1000) - -extern struct timer_opts* __init select_timer(void); -extern void clock_fallback(void); -void setup_pit_timer(void); - -/* Modifiers for buggy PIT handling */ - -extern int pit_latch_buggy; - -extern struct timer_opts *cur_timer; -extern int timer_ack; - -/* list of externed timers */ -extern struct timer_opts timer_none; -extern struct timer_opts timer_pit; -extern struct init_timer_opts timer_pit_init; -extern struct init_timer_opts timer_tsc_init; -#ifdef CONFIG_X86_CYCLONE_TIMER -extern struct init_timer_opts timer_cyclone_init; -#endif - -extern unsigned long calibrate_tsc(void); -extern void init_cpu_khz(void); -#ifdef CONFIG_HPET_TIMER -extern struct init_timer_opts timer_hpet_init; -extern unsigned long calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr); -#endif - -#ifdef CONFIG_X86_PM_TIMER -extern struct init_timer_opts timer_pmtmr_init; -#endif -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,97 +0,0 @@ -#ifndef _X8664_TLBFLUSH_H -#define _X8664_TLBFLUSH_H - -#include <linux/config.h> -#include <linux/mm.h> -#include <asm/processor.h> - -#define __flush_tlb() xen_tlb_flush() - -/* - * Global pages have to be flushed a bit differently. Not a real - * performance problem because this does not happen often. - */ -#define __flush_tlb_global() xen_tlb_flush() - - -extern unsigned long pgkern_mask; - -#define __flush_tlb_all() __flush_tlb_global() - -#define __flush_tlb_one(addr) xen_invlpg(addr) - - -/* - * TLB flushing: - * - * - flush_tlb() flushes the current mm struct TLBs - * - flush_tlb_all() flushes all processes TLBs - * - flush_tlb_mm(mm) flushes the specified mm context TLB's - * - flush_tlb_page(vma, vmaddr) flushes one page - * - flush_tlb_range(vma, start, end) flushes a range of pages - * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages - * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables - * - * ..but the x86_64 has somewhat limited tlb flushing capabilities, - * and page-granular flushes are available only on i486 and up. 
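/*
 * Usage sketch (hypothetical helper): after rewriting a single user
 * PTE, the page-granular flush below is the cheap choice; a rewrite of
 * a whole range falls back to flushing the entire mm.
 */
static inline void after_one_pte_change(struct vm_area_struct *vma,
                                        unsigned long addr)
{
	flush_tlb_page(vma, addr);	/* one entry, not the whole TLB */
}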
- */ - -#ifndef CONFIG_SMP - -#define flush_tlb() __flush_tlb() -#define flush_tlb_all() __flush_tlb_all() -#define local_flush_tlb() __flush_tlb() - -static inline void flush_tlb_mm(struct mm_struct *mm) -{ - if (mm == current->active_mm) - __flush_tlb(); -} - -static inline void flush_tlb_page(struct vm_area_struct *vma, - unsigned long addr) -{ - if (vma->vm_mm == current->active_mm) - __flush_tlb_one(addr); -} - -static inline void flush_tlb_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - if (vma->vm_mm == current->active_mm) - __flush_tlb(); -} - -#else - -#include <asm/smp.h> - -#define local_flush_tlb() \ - __flush_tlb() - -extern void flush_tlb_all(void); -extern void flush_tlb_current_task(void); -extern void flush_tlb_mm(struct mm_struct *); -extern void flush_tlb_page(struct vm_area_struct *, unsigned long); - -#define flush_tlb() flush_tlb_current_task() - -static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end) -{ - flush_tlb_mm(vma->vm_mm); -} - -#define TLBSTATE_OK 1 -#define TLBSTATE_LAZY 2 - -#endif - -#define flush_tlb_kernel_range(start, end) flush_tlb_all() - -static inline void flush_tlb_pgtables(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - /* x86_64 does not keep any page table caches in TLB */ -} - -#endif /* _X8664_TLBFLUSH_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/vga.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/vga.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,20 +0,0 @@ -/* - * Access to VGA videoram - * - * (c) 1998 Martin Mares <mj@xxxxxx> - */ - -#ifndef _LINUX_ASM_VGA_H_ -#define _LINUX_ASM_VGA_H_ - -/* - * On the PC, we can just recalculate addresses and then - * access the videoram directly without any black magic. - */ - -#define VGA_MAP_MEM(x) (unsigned long)isa_bus_to_virt(x) - -#define vga_readb(x) (*(x)) -#define vga_writeb(x,y) (*(y) = (x)) - -#endif diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/xor.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-x86_64/xor.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,328 +0,0 @@ -/* - * x86-64 changes / gcc fixes from Andi Kleen. - * Copyright 2002 Andi Kleen, SuSE Labs. - * - * This hasn't been optimized for the hammer yet, but there are likely - * no advantages to be gotten from x86-64 here anyways. - */ - -typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t; - -/* Doesn't use gcc to save the XMM registers, because there is no easy way to - tell it to do a clts before the register saving. 
*/ -#define XMMS_SAVE do { \ - preempt_disable(); \ - if (!(current_thread_info()->status & TS_USEDFPU)) \ - clts(); \ - __asm__ __volatile__ ( \ - "movups %%xmm0,(%1) ;\n\t" \ - "movups %%xmm1,0x10(%1) ;\n\t" \ - "movups %%xmm2,0x20(%1) ;\n\t" \ - "movups %%xmm3,0x30(%1) ;\n\t" \ - : "=&r" (cr0) \ - : "r" (xmm_save) \ - : "memory"); \ -} while(0) - -#define XMMS_RESTORE do { \ - asm volatile ( \ - "sfence ;\n\t" \ - "movups (%1),%%xmm0 ;\n\t" \ - "movups 0x10(%1),%%xmm1 ;\n\t" \ - "movups 0x20(%1),%%xmm2 ;\n\t" \ - "movups 0x30(%1),%%xmm3 ;\n\t" \ - : \ - : "r" (cr0), "r" (xmm_save) \ - : "memory"); \ - if (!(current_thread_info()->status & TS_USEDFPU)) \ - stts(); \ - preempt_enable(); \ -} while(0) - -#define OFFS(x) "16*("#x")" -#define PF_OFFS(x) "256+16*("#x")" -#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n" -#define LD(x,y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n" -#define ST(x,y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n" -#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n" -#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n" -#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n" -#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n" -#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n" -#define XO1(x,y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n" -#define XO2(x,y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n" -#define XO3(x,y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n" -#define XO4(x,y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n" -#define XO5(x,y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n" - - -static void -xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) -{ - unsigned int lines = bytes >> 8; - unsigned long cr0; - xmm_store_t xmm_save[4]; - - XMMS_SAVE; - - asm volatile ( -#undef BLOCK -#define BLOCK(i) \ - LD(i,0) \ - LD(i+1,1) \ - PF1(i) \ - PF1(i+2) \ - LD(i+2,2) \ - LD(i+3,3) \ - PF0(i+4) \ - PF0(i+6) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - ST(i,0) \ - ST(i+1,1) \ - ST(i+2,2) \ - ST(i+3,3) \ - - - PF0(0) - PF0(2) - - " .align 32 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addq %[inc], %[p1] ;\n" - " addq %[inc], %[p2] ;\n" - " decl %[cnt] ; jnz 1b" - : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines) - : [inc] "r" (256UL) - : "memory"); - - XMMS_RESTORE; -} - -static void -xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3) -{ - unsigned int lines = bytes >> 8; - xmm_store_t xmm_save[4]; - unsigned long cr0; - - XMMS_SAVE; - - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - PF1(i) \ - PF1(i+2) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - PF2(i) \ - PF2(i+2) \ - PF0(i+4) \ - PF0(i+6) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - XO2(i,0) \ - XO2(i+1,1) \ - XO2(i+2,2) \ - XO2(i+3,3) \ - ST(i,0) \ - ST(i+1,1) \ - ST(i+2,2) \ - ST(i+3,3) \ - - - PF0(0) - PF0(2) - - " .align 32 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addq %[inc], %[p1] ;\n" - " addq %[inc], %[p2] ;\n" - " addq %[inc], %[p3] ;\n" - " decl %[cnt] ; jnz 1b" - : [cnt] "+r" (lines), - [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) - : [inc] "r" (256UL) - : "memory"); - XMMS_RESTORE; -} - -static void -xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4) -{ - unsigned int lines = bytes >> 8; - xmm_store_t xmm_save[4]; - unsigned long cr0; - - XMMS_SAVE; - - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - PF1(i) \ - PF1(i+2) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - 
PF2(i) \ - PF2(i+2) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - PF3(i) \ - PF3(i+2) \ - PF0(i+4) \ - PF0(i+6) \ - XO2(i,0) \ - XO2(i+1,1) \ - XO2(i+2,2) \ - XO2(i+3,3) \ - XO3(i,0) \ - XO3(i+1,1) \ - XO3(i+2,2) \ - XO3(i+3,3) \ - ST(i,0) \ - ST(i+1,1) \ - ST(i+2,2) \ - ST(i+3,3) \ - - - PF0(0) - PF0(2) - - " .align 32 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addq %[inc], %[p1] ;\n" - " addq %[inc], %[p2] ;\n" - " addq %[inc], %[p3] ;\n" - " addq %[inc], %[p4] ;\n" - " decl %[cnt] ; jnz 1b" - : [cnt] "+c" (lines), - [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) - : [inc] "r" (256UL) - : "memory" ); - - XMMS_RESTORE; -} - -static void -xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, - unsigned long *p3, unsigned long *p4, unsigned long *p5) -{ - unsigned int lines = bytes >> 8; - xmm_store_t xmm_save[4]; - unsigned long cr0; - - XMMS_SAVE; - - __asm__ __volatile__ ( -#undef BLOCK -#define BLOCK(i) \ - PF1(i) \ - PF1(i+2) \ - LD(i,0) \ - LD(i+1,1) \ - LD(i+2,2) \ - LD(i+3,3) \ - PF2(i) \ - PF2(i+2) \ - XO1(i,0) \ - XO1(i+1,1) \ - XO1(i+2,2) \ - XO1(i+3,3) \ - PF3(i) \ - PF3(i+2) \ - XO2(i,0) \ - XO2(i+1,1) \ - XO2(i+2,2) \ - XO2(i+3,3) \ - PF4(i) \ - PF4(i+2) \ - PF0(i+4) \ - PF0(i+6) \ - XO3(i,0) \ - XO3(i+1,1) \ - XO3(i+2,2) \ - XO3(i+3,3) \ - XO4(i,0) \ - XO4(i+1,1) \ - XO4(i+2,2) \ - XO4(i+3,3) \ - ST(i,0) \ - ST(i+1,1) \ - ST(i+2,2) \ - ST(i+3,3) \ - - - PF0(0) - PF0(2) - - " .align 32 ;\n" - " 1: ;\n" - - BLOCK(0) - BLOCK(4) - BLOCK(8) - BLOCK(12) - - " addq %[inc], %[p1] ;\n" - " addq %[inc], %[p2] ;\n" - " addq %[inc], %[p3] ;\n" - " addq %[inc], %[p4] ;\n" - " addq %[inc], %[p5] ;\n" - " decl %[cnt] ; jnz 1b" - : [cnt] "+c" (lines), - [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4), - [p5] "+r" (p5) - : [inc] "r" (256UL) - : "memory"); - - XMMS_RESTORE; -} - -static struct xor_block_template xor_block_sse = { - .name = "generic_sse", - .do_2 = xor_sse_2, - .do_3 = xor_sse_3, - .do_4 = xor_sse_4, - .do_5 = xor_sse_5, -}; - -#undef XOR_TRY_TEMPLATES -#define XOR_TRY_TEMPLATES \ - do { \ - xor_speed(&xor_block_sse); \ - } while (0) - -/* We force the use of the SSE xor block because it can write around L2. - We may also be able to load into the L1 only depending on how the cpu - deals with a load to a line that is being prefetched. */ -#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse) diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/balloon.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/balloon.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,51 +0,0 @@ -/****************************************************************************** - * balloon.h - * - * Xen balloon driver - enables returning/claiming memory to/from Xen. 
- * - * Copyright (c) 2003, B Dragovic - * Copyright (c) 2003-2004, M Williamson, K Fraser - * - * This file may be distributed separately from the Linux kernel, or - * incorporated into other software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#ifndef __ASM_BALLOON_H__ -#define __ASM_BALLOON_H__ - -/* - * Inform the balloon driver that it should allow some slop for device-driver - * memory activities. - */ -extern void balloon_update_driver_allowance(long delta); - -/* Give up unmapped pages to the balloon driver. */ -extern void balloon_put_pages(unsigned long *mfn_list, unsigned long nr_mfns); - -/* - * Prevent the balloon driver from changing the memory reservation during - * a driver critical region. - */ -extern spinlock_t balloon_lock; -#define balloon_lock(__flags) spin_lock_irqsave(&balloon_lock, __flags) -#define balloon_unlock(__flags) spin_unlock_irqrestore(&balloon_lock, __flags) - -#endif /* __ASM_BALLOON_H__ */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/ctrl_if.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/ctrl_if.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,160 +0,0 @@ -/****************************************************************************** - * ctrl_if.h - * - * Management functions for special interface to the domain controller. - * - * Copyright (c) 2004, K A Fraser - * - * This file may be distributed separately from the Linux kernel, or - * incorporated into other software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#ifndef __ASM_XEN__CTRL_IF_H__ -#define __ASM_XEN__CTRL_IF_H__ - -#include <asm-xen/hypervisor.h> -#include <asm-xen/queues.h> - -typedef control_msg_t ctrl_msg_t; - -/* - * Callback function type. Called for asynchronous processing of received - * request messages, and responses to previously-transmitted request messages. - * The parameters are (@msg, @id). - * @msg: Original request/response message (not a copy). The message can be - * modified in-place by the handler (e.g., a response callback can - * turn a request message into a response message in place). The message - * is no longer accessible after the callback handler returns -- if the - * message is required to persist for longer then it must be copied. - * @id: (Response callbacks only) The 'id' that was specified when the - * original request message was queued for transmission. - */ -typedef void (*ctrl_msg_handler_t)(ctrl_msg_t *, unsigned long); - -/* - * Send @msg to the domain controller. Execute @hnd when a response is - * received, passing the response message and the specified @id. This - * operation will not block: it will return -EAGAIN if there is no space. - * Notes: - * 1. The @msg is copied if it is transmitted and so can be freed after this - * function returns. - * 2. If @hnd is NULL then no callback is executed. - */ -int -ctrl_if_send_message_noblock( - ctrl_msg_t *msg, - ctrl_msg_handler_t hnd, - unsigned long id); - -/* - * Send @msg to the domain controller. Execute @hnd when a response is - * received, passing the response message and the specified @id. This - * operation will block until the message is sent, or a signal is received - * for the calling process (unless @wait_state is TASK_UNINTERRUPTIBLE). - * Notes: - * 1. The @msg is copied if it is transmitted and so can be freed after this - * function returns. - * 2. If @hnd is NULL then no callback is executed. - */ -int -ctrl_if_send_message_block( - ctrl_msg_t *msg, - ctrl_msg_handler_t hnd, - unsigned long id, - long wait_state); - -/* - * Send @msg to the domain controller. Block until the response is received, - * and then copy it into the provided buffer, @rmsg. - */ -int -ctrl_if_send_message_and_get_response( - ctrl_msg_t *msg, - ctrl_msg_t *rmsg, - long wait_state); - -/* - * Request a callback when there is /possibly/ space to immediately send a - * message to the domain controller. This function returns 0 if there is - * already space to transmit a message --- in this case the callback task /may/ - * still be executed. If this function returns 1 then the callback /will/ be - * executed when space becomes available. - */ -int -ctrl_if_enqueue_space_callback( - struct tq_struct *task); - -/* - * Send a response (@msg) to a message from the domain controller. This will - * never block. - * Notes: - * 1. The @msg is copied and so can be freed after this function returns. - * 2. The @msg may be the original request message, modified in-place. - */ -void -ctrl_if_send_response( - ctrl_msg_t *msg); - -/* - * Register a receiver for typed messages from the domain controller. The - * handler (@hnd) is called for every received message of specified @type. - * Returns TRUE (non-zero) if the handler was successfully registered.
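/*
 * Request/response sketch for the interface above; the handler name and
 * zeroed message type are illustrative only.
 */
static void probe_rsp(ctrl_msg_t *msg, unsigned long id)
{
	/* msg is only valid inside the callback; copy what must persist */
}

static int send_probe(void)
{
	ctrl_msg_t msg;

	memset(&msg, 0, sizeof(msg));
	return ctrl_if_send_message_noblock(&msg, probe_rsp, 0);
	/* -EAGAIN means the transmit ring was full; try again later */
}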
- * If CALLBACK_IN_BLOCKING_CONTEXT is specified in @flags then callbacks will - * occur in a context in which it is safe to yield (i.e., process context). - */ -#define CALLBACK_IN_BLOCKING_CONTEXT 1 -int ctrl_if_register_receiver( - u8 type, - ctrl_msg_handler_t hnd, - unsigned int flags); - -/* - * Unregister a receiver for typed messages from the domain controller. The - * handler (@hnd) will not be executed after this function returns. - */ -void -ctrl_if_unregister_receiver( - u8 type, ctrl_msg_handler_t hnd); - -/* Suspend/resume notifications. */ -void ctrl_if_suspend(void); -void ctrl_if_resume(void); - -/* Start-of-day setup. */ -void ctrl_if_init(void); - -/* - * Returns TRUE if there are no outstanding message requests at the domain - * controller. This can be used to ensure that messages have really flushed - * through when it is not possible to use the response-callback interface. - * WARNING: If other subsystems are using the control interface then this - * function might never return TRUE! - */ -int ctrl_if_transmitter_empty(void); /* !! DANGEROUS FUNCTION !! */ - -/* - * Manually discard response messages from the domain controller. - * WARNING: This is usually done automatically -- this function should only - * be called when normal interrupt mechanisms are disabled! - */ -void ctrl_if_discard_responses(void); /* !! DANGEROUS FUNCTION !! */ - -#endif /* __ASM_XEN__CONTROL_IF_H__ */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/evtchn.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/evtchn.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,106 +0,0 @@ -/****************************************************************************** - * evtchn.h - * - * Communication via Xen event channels. - * Also definitions for the device that demuxes notifications to userspace. - * - * Copyright (c) 2004, K A Fraser - * - * This file may be distributed separately from the Linux kernel, or - * incorporated into other software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#ifndef __ASM_EVTCHN_H__ -#define __ASM_EVTCHN_H__ - -#include <linux/config.h> -#include <asm-xen/hypervisor.h> -#include <asm/ptrace.h> -#include <asm-xen/synch_bitops.h> -#include <asm-xen/xen-public/event_channel.h> -#include <linux/smp.h> - -/* - * LOW-LEVEL DEFINITIONS - */ - -/* Entry point for notifications into Linux subsystems.
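/*
 * Sketch only: the usual pattern for a split driver finishing a batch
 * of work and kicking its peer over an already-bound event channel
 * `port` (notify_via_evtchn() is declared below).
 */
static void kick_peer(int port)
{
	/* ... place responses on the shared ring first ... */
	wmb();		/* ring updates visible before the notification */
	notify_via_evtchn(port);
}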
*/ -asmlinkage void evtchn_do_upcall(struct pt_regs *regs); - -/* Entry point for notifications into the userland character device. */ -void evtchn_device_upcall(int port); - -static inline void mask_evtchn(int port) -{ - shared_info_t *s = HYPERVISOR_shared_info; - synch_set_bit(port, &s->evtchn_mask[0]); -} - -static inline void unmask_evtchn(int port) -{ - shared_info_t *s = HYPERVISOR_shared_info; - vcpu_info_t *vcpu_info = &s->vcpu_data[smp_processor_id()]; - - synch_clear_bit(port, &s->evtchn_mask[0]); - - /* - * The following is basically the equivalent of 'hw_resend_irq'. Just like - * a real IO-APIC we 'lose the interrupt edge' if the channel is masked. - */ - if ( synch_test_bit (port, &s->evtchn_pending[0]) && - !synch_test_and_set_bit(port>>5, &vcpu_info->evtchn_pending_sel) ) - { - vcpu_info->evtchn_upcall_pending = 1; - if ( !vcpu_info->evtchn_upcall_mask ) - force_evtchn_callback(); - } -} - -static inline void clear_evtchn(int port) -{ - shared_info_t *s = HYPERVISOR_shared_info; - synch_clear_bit(port, &s->evtchn_pending[0]); -} - -static inline int notify_via_evtchn(int port) -{ - evtchn_op_t op; - op.cmd = EVTCHNOP_send; - op.u.send.local_port = port; - return HYPERVISOR_event_channel_op(&op); -} - -/* - * CHARACTER-DEVICE DEFINITIONS - */ - -/* /dev/xen/evtchn resides at device number major=10, minor=201 */ -#define EVTCHN_MINOR 201 - -/* /dev/xen/evtchn ioctls: */ -/* EVTCHN_RESET: Clear and reinit the event buffer. Clear error condition. */ -#define EVTCHN_RESET _IO('E', 1) -/* EVTCHN_BIND: Bind to the specified event-channel port. */ -#define EVTCHN_BIND _IO('E', 2) -/* EVTCHN_UNBIND: Unbind from the specified event-channel port. */ -#define EVTCHN_UNBIND _IO('E', 3) - -#endif /* __ASM_EVTCHN_H__ */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/foreign_page.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/foreign_page.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,30 +0,0 @@ -/****************************************************************************** - * foreign_page.h - * - * Provide a "foreign" page type, that is owned by a foreign allocator and - * not the normal buddy allocator in page_alloc.c - * - * Copyright (c) 2004, K A Fraser - */ - -#ifndef __ASM_XEN_FOREIGN_PAGE_H__ -#define __ASM_XEN_FOREIGN_PAGE_H__ - -#define PG_foreign PG_arch_1 - -#define PageForeign(page) test_bit(PG_foreign, &(page)->flags) - -#define SetPageForeign(page, dtor) do { \ - set_bit(PG_foreign, &(page)->flags); \ - (page)->mapping = (void *)dtor; \ -} while (0) - -#define ClearPageForeign(page) do { \ - clear_bit(PG_foreign, &(page)->flags); \ - (page)->mapping = NULL; \ -} while (0) - -#define PageForeignDestructor(page) \ - ( (void (*) (struct page *)) (page)->mapping ) - -#endif /* __ASM_XEN_FOREIGN_PAGE_H__ */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/gnttab.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/gnttab.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,72 +0,0 @@ -/****************************************************************************** - * gnttab.h - * - * Two sets of functionality: - * 1. Granting foreign access to our memory reservation. - * 2. Accessing others' memory reservations via grant references.
- * (i.e., mechanisms for both sender and recipient of grant references) - * - * Copyright (c) 2004, K A Fraser - * Copyright (c) 2005, Christopher Clark - */ - -#ifndef __ASM_GNTTAB_H__ -#define __ASM_GNTTAB_H__ - -#include <linux/config.h> -#include <asm-xen/hypervisor.h> -#include <asm-xen/xen-public/grant_table.h> - -/* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */ -#define NR_GRANT_FRAMES 4 -#define NR_GRANT_ENTRIES (NR_GRANT_FRAMES * PAGE_SIZE / sizeof(grant_entry_t)) - -int -gnttab_grant_foreign_access( - domid_t domid, unsigned long frame, int readonly); - -void -gnttab_end_foreign_access( - grant_ref_t ref, int readonly); - -int -gnttab_grant_foreign_transfer( - domid_t domid, unsigned long pfn); - -unsigned long -gnttab_end_foreign_transfer( - grant_ref_t ref); - -int -gnttab_query_foreign_access( - grant_ref_t ref ); - -/* - * operations on reserved batches of grant references - */ -int -gnttab_alloc_grant_references( - u16 count, grant_ref_t *pprivate_head, grant_ref_t *private_terminal ); - -void -gnttab_free_grant_references( - u16 count, grant_ref_t private_head ); - -int -gnttab_claim_grant_reference( grant_ref_t *pprivate_head, grant_ref_t terminal -); - -void -gnttab_release_grant_reference( - grant_ref_t *private_head, grant_ref_t release ); - -void -gnttab_grant_foreign_access_ref( - grant_ref_t ref, domid_t domid, unsigned long frame, int readonly); - -void -gnttab_grant_foreign_transfer_ref( - grant_ref_t, domid_t domid, unsigned long pfn); - - -#endif /* __ASM_GNTTAB_H__ */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/hypervisor.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/hypervisor.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,187 +0,0 @@ -/****************************************************************************** - * hypervisor.h - * - * Linux-specific hypervisor handling. - * - * Copyright (c) 2002-2004, K A Fraser - * - * This file may be distributed separately from the Linux kernel, or - * incorporated into other software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
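/*
 * Grant-table sketch built on the declarations above; the peer domid
 * and frame number are illustrative.
 */
static int share_frame_readonly(domid_t peer, unsigned long frame)
{
	int ref = gnttab_grant_foreign_access(peer, frame, 1 /* readonly */);
	if (ref < 0)
		return ref;
	/* ... advertise `ref` to the peer and wait for it to finish ... */
	gnttab_end_foreign_access(ref, 1);
	return 0;
}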
- */ - -#ifndef __HYPERVISOR_H__ -#define __HYPERVISOR_H__ - -#include <linux/config.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/version.h> -#include <asm-xen/xen-public/xen.h> -#include <asm-xen/xen-public/dom0_ops.h> -#include <asm-xen/xen-public/io/domain_controller.h> -#include <asm/ptrace.h> -#include <asm/page.h> -#if defined(__i386__) -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) -#include <asm-generic/pgtable-nopmd.h> -#endif -#endif - -/* arch/xen/i386/kernel/setup.c */ -union xen_start_info_union -{ - start_info_t xen_start_info; - char padding[2048]; -}; -extern union xen_start_info_union xen_start_info_union; -#define xen_start_info (xen_start_info_union.xen_start_info) - -/* arch/xen/kernel/evtchn.c */ -/* Force a proper event-channel callback from Xen. */ -void force_evtchn_callback(void); - -/* arch/xen/kernel/process.c */ -void xen_cpu_idle (void); - -/* arch/xen/i386/kernel/hypervisor.c */ -void do_hypervisor_callback(struct pt_regs *regs); - -/* arch/xen/i386/kernel/head.S */ -void lgdt_finish(void); - -/* arch/xen/i386/mm/hypervisor.c */ -/* - * NB. ptr values should be PHYSICAL, not MACHINE. 'vals' should be already - * be MACHINE addresses. - */ - -void xen_pt_switch(unsigned long ptr); -void xen_new_user_pt(unsigned long ptr); /* x86_64 only */ -void xen_load_gs(unsigned int selector); /* x86_64 only */ -void xen_tlb_flush(void); -void xen_invlpg(unsigned long ptr); - -#ifndef CONFIG_XEN_SHADOW_MODE -void xen_l1_entry_update(pte_t *ptr, unsigned long val); -void xen_l2_entry_update(pmd_t *ptr, pmd_t val); -#ifdef __x86_64__ -void xen_l3_entry_update(pud_t *ptr, pud_t val); /* x86_64 only */ -#endif -void xen_l4_entry_update(pgd_t *ptr, pgd_t val); /* x86_64 only */ -void xen_pgd_pin(unsigned long ptr); -void xen_pgd_unpin(unsigned long ptr); -void xen_pud_pin(unsigned long ptr); /* x86_64 only */ -void xen_pud_unpin(unsigned long ptr); /* x86_64 only */ -void xen_pmd_pin(unsigned long ptr); /* x86_64 only */ -void xen_pmd_unpin(unsigned long ptr); /* x86_64 only */ -void xen_pte_pin(unsigned long ptr); -void xen_pte_unpin(unsigned long ptr); -#else -#define xen_l1_entry_update(_p, _v) set_pte((_p), (pte_t){(_v)}) -#define xen_l2_entry_update(_p, _v) set_pgd((_p), (pgd_t){(_v)}) -#define xen_pgd_pin(_p) ((void)0) -#define xen_pgd_unpin(_p) ((void)0) -#define xen_pte_pin(_p) ((void)0) -#define xen_pte_unpin(_p) ((void)0) -#endif - -void xen_set_ldt(unsigned long ptr, unsigned long bytes); -void xen_machphys_update(unsigned long mfn, unsigned long pfn); - -#ifdef CONFIG_SMP -#include <linux/cpumask.h> -void xen_tlb_flush_all(void); -void xen_invlpg_all(unsigned long ptr); -void xen_tlb_flush_mask(cpumask_t *mask); -void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr); -#endif - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) -/* -** XXX SMH: 2.4 doesn't have percpu.h (or support SMP guests) so just -** include sufficient #defines to allow the below to build. -*/ -#define DEFINE_PER_CPU(type, name) \ - __typeof__(type) per_cpu__##name - -#define per_cpu(var, cpu) (*((void)cpu, &per_cpu__##var)) -#define __get_cpu_var(var) per_cpu__##var -#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name - -#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var) -#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var) -#endif /* linux < 2.6.0 */ - -void xen_contig_memory(unsigned long vstart, unsigned int order); - -#ifdef CONFIG_XEN_PHYSDEV_ACCESS -/* Allocate a contiguous empty region of low memory. 
Return virtual start. */ -unsigned long allocate_empty_lowmem_region(unsigned long pages); -#endif - -#include <asm/hypercall.h> - -static inline void -MULTI_update_va_mapping( - multicall_entry_t *mcl, unsigned long va, - pte_t new_val, unsigned long flags) -{ - mcl->op = __HYPERVISOR_update_va_mapping; - mcl->args[0] = va; -#if defined(CONFIG_X86_64) - mcl->args[1] = new_val.pte; - mcl->args[2] = flags; -#elif defined(CONFIG_X86_PAE) - mcl->args[1] = new_val.pte_low; - mcl->args[2] = new_val.pte_high; - mcl->args[3] = flags; -#else - mcl->args[1] = new_val.pte_low; - mcl->args[2] = 0; - mcl->args[3] = flags; -#endif -} - -static inline void -MULTI_update_va_mapping_otherdomain( - multicall_entry_t *mcl, unsigned long va, - pte_t new_val, unsigned long flags, domid_t domid) -{ - mcl->op = __HYPERVISOR_update_va_mapping_otherdomain; - mcl->args[0] = va; -#if defined(CONFIG_X86_64) - mcl->args[1] = new_val.pte; - mcl->args[2] = flags; - mcl->args[3] = domid; -#elif defined(CONFIG_X86_PAE) - mcl->args[1] = new_val.pte_low; - mcl->args[2] = new_val.pte_high; - mcl->args[3] = flags; - mcl->args[4] = domid; -#else - mcl->args[1] = new_val.pte_low; - mcl->args[2] = 0; - mcl->args[3] = flags; - mcl->args[4] = domid; -#endif -} - -#endif /* __HYPERVISOR_H__ */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/linux-public/privcmd.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/linux-public/privcmd.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,90 +0,0 @@ -/****************************************************************************** - * privcmd.h - * - * Interface to /proc/xen/privcmd. - * - * Copyright (c) 2003-2004, K A Fraser - * - * This file may be distributed separately from the Linux kernel, or - * incorporated into other software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
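/*
 * Batching sketch: each MULTI_update_va_mapping() above only fills in
 * one multicall_entry_t; callers typically queue several and enter the
 * hypervisor once. HYPERVISOR_multicall (from <asm/hypercall.h>) and
 * the UVMF_INVLPG flag are assumed from headers outside this patch.
 */
static int remap_pair(unsigned long va0, pte_t pte0,
                      unsigned long va1, pte_t pte1)
{
	multicall_entry_t mcl[2];

	MULTI_update_va_mapping(&mcl[0], va0, pte0, UVMF_INVLPG);
	MULTI_update_va_mapping(&mcl[1], va1, pte1, UVMF_INVLPG);
	return HYPERVISOR_multicall(mcl, 2);	/* one trap, two updates */
}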
- */ - -#ifndef __PRIVCMD_H__ -#define __PRIVCMD_H__ - -typedef struct privcmd_hypercall -{ - unsigned long op; - unsigned long arg[5]; -} privcmd_hypercall_t; - -typedef struct privcmd_mmap_entry { - unsigned long va; - unsigned long mfn; - unsigned long npages; -} privcmd_mmap_entry_t; - -typedef struct privcmd_mmap { - int num; - domid_t dom; /* target domain */ - privcmd_mmap_entry_t *entry; -} privcmd_mmap_t; - -typedef struct privcmd_mmapbatch { - int num; /* number of pages to populate */ - domid_t dom; /* target domain */ - unsigned long addr; /* virtual address */ - unsigned long *arr; /* array of mfns - top nibble set on err */ -} privcmd_mmapbatch_t; - -typedef struct privcmd_blkmsg -{ - unsigned long op; - void *buf; - int buf_size; -} privcmd_blkmsg_t; - -/* - * @cmd: IOCTL_PRIVCMD_HYPERCALL - * @arg: &privcmd_hypercall_t - * Return: Value returned from execution of the specified hypercall. - */ -#define IOCTL_PRIVCMD_HYPERCALL \ - _IOC(_IOC_NONE, 'P', 0, sizeof(privcmd_hypercall_t)) - -/* - * @cmd: IOCTL_PRIVCMD_INITDOMAIN_EVTCHN - * @arg: n/a - * Return: Port associated with domain-controller end of control event channel - * for the initial domain. - */ -#define IOCTL_PRIVCMD_INITDOMAIN_EVTCHN \ - _IOC(_IOC_NONE, 'P', 1, 0) -#define IOCTL_PRIVCMD_MMAP \ - _IOC(_IOC_NONE, 'P', 2, sizeof(privcmd_mmap_t)) -#define IOCTL_PRIVCMD_MMAPBATCH \ - _IOC(_IOC_NONE, 'P', 3, sizeof(privcmd_mmapbatch_t)) -#define IOCTL_PRIVCMD_GET_MACH2PHYS_START_MFN \ - _IOC(_IOC_READ, 'P', 4, sizeof(unsigned long)) -#define IOCTL_PRIVCMD_INITDOMAIN_STORE \ - _IOC(_IOC_READ, 'P', 5, 0) - -#endif /* __PRIVCMD_H__ */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/linux-public/suspend.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/linux-public/suspend.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,43 +0,0 @@ -/****************************************************************************** - * suspend.h - * - * Copyright (c) 2003-2004, K A Fraser - * - * This file may be distributed separately from the Linux kernel, or - * incorporated into other software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#ifndef __ASM_XEN_SUSPEND_H__ -#define __ASM_XEN_SUSPEND_H__ - -typedef struct suspend_record_st { - /* To be filled in before resume. 
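/*
 * Userspace sketch for the hypercall ioctl defined above; the hypercall
 * number 17 (historically __HYPERVISOR_xen_version) is illustrative.
 */
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

static long privcmd_xen_version(void)
{
	privcmd_hypercall_t call = { .op = 17 };	/* args default to 0 */
	int fd = open("/proc/xen/privcmd", O_RDWR);
	long rc;

	if (fd < 0)
		return -1;
	rc = ioctl(fd, IOCTL_PRIVCMD_HYPERCALL, &call);
	close(fd);
	return rc;
}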
*/ - start_info_t resume_info; - /* - * The number of a machine frame containing, in sequence, the number of - * each machine frame that contains PFN -> MFN translation table data. - */ - unsigned long pfn_to_mfn_frame_list; - /* Number of entries in the PFN -> MFN translation table. */ - unsigned long nr_pfns; -} suspend_record_t; - -#endif /* __ASM_XEN_SUSPEND_H__ */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/queues.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/queues.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,81 +0,0 @@ - -/* - * Oh dear. Task queues were removed from Linux 2.6 and replaced by work - * queues. Unfortunately the semantics is not the same. With task queues we - * can defer work until a particular event occurs -- this is not - * straightforwardly done with work queues (queued work is performed asap, or - * after some fixed timeout). Conversely, work queues are a (slightly) neater - * way of deferring work to a process context than using task queues in 2.4. - * - * This is a bit of a needless reimplementation -- should have just pulled - * the code from 2.4, but I tried leveraging work queues to simplify things. - * They didn't help. :-( - */ - -#ifndef __QUEUES_H__ -#define __QUEUES_H__ - -#include <linux/version.h> -#include <linux/list.h> -#include <linux/workqueue.h> - -struct tq_struct { - void (*fn)(void *); - void *arg; - struct list_head list; - unsigned long pending; -}; -#define INIT_TQUEUE(_name, _fn, _arg) \ - do { \ - INIT_LIST_HEAD(&(_name)->list); \ - (_name)->pending = 0; \ - (_name)->fn = (_fn); (_name)->arg = (_arg); \ - } while ( 0 ) -#define DECLARE_TQUEUE(_name, _fn, _arg) \ - struct tq_struct _name = { (_fn), (_arg), LIST_HEAD_INIT((_name).list), 0 } - -typedef struct { - struct list_head list; - spinlock_t lock; -} task_queue; -#define DECLARE_TASK_QUEUE(_name) \ - task_queue _name = { LIST_HEAD_INIT((_name).list), SPIN_LOCK_UNLOCKED } - -static inline int queue_task(struct tq_struct *tqe, task_queue *tql) -{ - unsigned long flags; - if ( test_and_set_bit(0, &tqe->pending) ) - return 0; - spin_lock_irqsave(&tql->lock, flags); - list_add_tail(&tqe->list, &tql->list); - spin_unlock_irqrestore(&tql->lock, flags); - return 1; -} - -static inline void run_task_queue(task_queue *tql) -{ - struct list_head head, *ent; - struct tq_struct *tqe; - unsigned long flags; - void (*fn)(void *); - void *arg; - - spin_lock_irqsave(&tql->lock, flags); - list_add(&head, &tql->list); - list_del_init(&tql->list); - spin_unlock_irqrestore(&tql->lock, flags); - - while ( !list_empty(&head) ) - { - ent = head.next; - list_del_init(ent); - tqe = list_entry(ent, struct tq_struct, list); - fn = tqe->fn; - arg = tqe->arg; - wmb(); - tqe->pending = 0; - fn(arg); - } -} - -#endif /* __QUEUES_H__ */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/synch_bitops.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/synch_bitops.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,2 +0,0 @@ - -#include <asm-i386/synch_bitops.h> diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/xen_proc.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/xen_proc.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,13 +0,0 @@ - -#ifndef __ASM_XEN_PROC_H__ -#define __ASM_XEN_PROC_H__ - -#include <linux/config.h> -#include <linux/proc_fs.h> - -extern struct proc_dir_entry *create_xen_proc_entry( - const char *name, mode_t mode); -extern void 
remove_xen_proc_entry( - const char *name); - -#endif /* __ASM_XEN_PROC_H__ */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/asm-xen/xenbus.h --- a/linux-2.6.11-xen-sparse/include/asm-xen/xenbus.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,143 +0,0 @@ -#ifndef _ASM_XEN_XENBUS_H -#define _ASM_XEN_XENBUS_H -/****************************************************************************** - * xenbus.h - * - * Talks to Xen Store to figure out what devices we have. - * - * Copyright (C) 2005 Rusty Russell, IBM Corporation - * - * This file may be distributed separately from the Linux kernel, or - * incorporated into other software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ -#include <linux/device.h> -#include <asm/semaphore.h> - -/* A xenbus device. */ -struct xenbus_device { - char *devicetype; - char *subtype; - char *nodename; - int id; - struct device dev; -}; - -static inline struct xenbus_device *to_xenbus_device(struct device *dev) -{ - return container_of(dev, struct xenbus_device, dev); -} - -struct xenbus_device_id -{ - /* .../device/<device_type>/<identifier> */ - char devicetype[32]; /* General class of device. */ - char subtype[32]; /* Contents of "subtype" for this device */ -}; - -/* A xenbus driver. */ -struct xenbus_driver { - char *name; - struct module *owner; - const struct xenbus_device_id *ids; - /* Called when xenstore is connected. */ - int (*connect) (struct xenbus_driver * drv); - - int (*probe) (struct xenbus_device * dev, const struct xenbus_device_id * id); - int (*remove) (struct xenbus_device * dev); - int (*configure)(struct xenbus_device * dev); - - struct device_driver driver; -}; - -struct xenbus_evtchn { - unsigned long dom1; - unsigned long port1; - unsigned long dom2; - unsigned long port2; -}; - -static inline struct xenbus_driver *to_xenbus_driver(struct device_driver *drv) -{ - return container_of(drv, struct xenbus_driver, driver); -} - -int xenbus_register_driver(struct xenbus_driver *drv); -void xenbus_unregister_driver(struct xenbus_driver *drv); - -int xenbus_register_backend(struct xenbus_driver *drv); -void xenbus_unregister_backend(struct xenbus_driver *drv); - -/* Iterator over xenbus devices (frontend). */ -int xenbus_for_each_dev(struct xenbus_device * start, void * data, - int (*fn)(struct xenbus_device *, void *)); - -/* Iterator over xenbus drivers (frontend).
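/*
 * Registration sketch for the structures above; the "vbd" class and the
 * probe/remove bodies are hypothetical (THIS_MODULE comes from
 * <linux/module.h>).
 */
static int example_probe(struct xenbus_device *dev,
                         const struct xenbus_device_id *id)
{
	return 0;		/* claim the device */
}

static int example_remove(struct xenbus_device *dev)
{
	return 0;
}

static const struct xenbus_device_id example_ids[] = {
	{ "vbd", "" },		/* any subtype of the "vbd" class */
	{ "", "" },		/* terminator */
};

static struct xenbus_driver example_driver = {
	.name   = "example-front",
	.owner  = THIS_MODULE,
	.ids    = example_ids,
	.probe  = example_probe,
	.remove = example_remove,
};
/* call xenbus_register_driver(&example_driver) from module init */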
*/ -int xenbus_for_each_drv(struct xenbus_driver * start, void * data, - int (*fn)(struct xenbus_driver *, void *)); - -/* Iterator over xenbus drivers (backend). */ -int xenbus_for_each_backend(struct xenbus_driver * start, void * data, - int (*fn)(struct xenbus_driver *, void *)); - -/* Caller must hold this lock to call these functions. */ -extern struct semaphore xs_lock; - -char **xs_directory(const char *path, unsigned int *num); -void *xs_read(const char *path, unsigned int *len); -int xs_write(const char *path, - const void *data, unsigned int len, int createflags); -int xs_mkdir(const char *path); -int xs_exists(const char *path); -int xs_mkdirs(const char *path); -int xs_rm(const char *path); -int xs_transaction_start(const char *subtree); -int xs_transaction_end(int abort); -char *xs_get_domain_path(domid_t domid); - -/* Register callback to watch this node. */ -struct xenbus_watch -{ - struct list_head list; - char *node; - unsigned int priority; - void (*callback)(struct xenbus_watch *, const char *node); -}; - -int register_xenbus_watch(struct xenbus_watch *watch); -void unregister_xenbus_watch(struct xenbus_watch *watch); - -char *xenbus_path(const char *dir, const char *name); -char *xenbus_read(const char *dir, const char *name, unsigned int *data_n); -int xenbus_write(const char *dir, const char *name, - const char *data, int data_n); - -int xenbus_read_string(const char *dir, const char *name, char **val); -int xenbus_write_string(const char *dir, const char *name, const char *val); -int xenbus_read_ulong(const char *dir, const char *name, unsigned long *val); -int xenbus_write_ulong(const char *dir, const char *name, unsigned long val); -int xenbus_read_long(const char *dir, const char *name, long *val); -int xenbus_write_long(const char *dir, const char *name, long val); -int xenbus_read_mac(const char *dir, const char *name, unsigned char mac[6]); -int xenbus_write_mac(const char *dir, const char *name, const unsigned char mac[6]); -int xenbus_read_evtchn(const char *dir, const char *name, struct xenbus_evtchn *evtchn); -int xenbus_message(const char *dir, const char *val, ...); - -#endif /* _ASM_XEN_XENBUS_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/linux/gfp.h --- a/linux-2.6.11-xen-sparse/include/linux/gfp.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,138 +0,0 @@ -#ifndef __LINUX_GFP_H -#define __LINUX_GFP_H - -#include <linux/mmzone.h> -#include <linux/stddef.h> -#include <linux/linkage.h> -#include <linux/config.h> - -struct vm_area_struct; - -/* - * GFP bitmasks.. - */ -/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low two bits) */ -#define __GFP_DMA 0x01 -#define __GFP_HIGHMEM 0x02 - -/* - * Action modifiers - doesn't change the zoning - * - * __GFP_REPEAT: Try hard to allocate the memory, but the allocation attempt - * _might_ fail. This depends upon the particular VM implementation. - * - * __GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller - * cannot handle allocation failures. - * - * __GFP_NORETRY: The VM implementation must not retry indefinitely. - */ -#define __GFP_WAIT 0x10 /* Can wait and reschedule? */ -#define __GFP_HIGH 0x20 /* Should access emergency pools? */ -#define __GFP_IO 0x40 /* Can start physical IO? */ -#define __GFP_FS 0x80 /* Can call down to low-level FS? */ -#define __GFP_COLD 0x100 /* Cache-cold page required */ -#define __GFP_NOWARN 0x200 /* Suppress page allocation failure warning */ -#define __GFP_REPEAT 0x400 /* Retry the allocation. 
Might fail */ -#define __GFP_NOFAIL 0x800 /* Retry for ever. Cannot fail */ -#define __GFP_NORETRY 0x1000 /* Do not retry. Might fail */ -#define __GFP_NO_GROW 0x2000 /* Slab internal usage */ -#define __GFP_COMP 0x4000 /* Add compound page metadata */ -#define __GFP_ZERO 0x8000 /* Return zeroed page on success */ - -#define __GFP_BITS_SHIFT 16 /* Room for 16 __GFP_FOO bits */ -#define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1) - -/* if you forget to add the bitmask here kernel will crash, period */ -#define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \ - __GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \ - __GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP) - -#define GFP_ATOMIC (__GFP_HIGH) -#define GFP_NOIO (__GFP_WAIT) -#define GFP_NOFS (__GFP_WAIT | __GFP_IO) -#define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS) -#define GFP_USER (__GFP_WAIT | __GFP_IO | __GFP_FS) -#define GFP_HIGHUSER (__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HIGHMEM) - -/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some - platforms, used as appropriate on others */ - -#define GFP_DMA __GFP_DMA - - -/* - * There is only one page-allocator function, and two main namespaces to - * it. The alloc_page*() variants return 'struct page *' and as such - * can allocate highmem pages, the *get*page*() variants return - * virtual kernel addresses to the allocated page(s). - */ - -/* - * We get the zone list from the current node and the gfp_mask. - * This zone list contains a maximum of MAXNODES*MAX_NR_ZONES zones. - * - * For the normal case of non-DISCONTIGMEM systems the NODE_DATA() gets - * optimized to &contig_page_data at compile-time. - */ - -/* - * If arch_free_page returns non-zero then the generic free_page code can - * immediately bail: the arch-specific function has done all the work. 
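
As a side note on how the composite masks above get used: a minimal sketch, with a hypothetical function name, relying only on the allocator entry points declared later in this header:

#include <linux/gfp.h>

static void gfp_usage_sketch(void)
{
	struct page *pg;
	unsigned long addr;

	/* Interrupt context: must not sleep, may tap the emergency pools. */
	pg = alloc_page(GFP_ATOMIC);
	if (pg)
		__free_page(pg);

	/* Process context: may sleep and start I/O; memory comes back zeroed. */
	addr = get_zeroed_page(GFP_KERNEL);
	if (addr)
		free_page(addr);
}
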
- */ -#ifndef HAVE_ARCH_FREE_PAGE -#define arch_free_page(page, order) 0 -#endif - -extern struct page * -FASTCALL(__alloc_pages(unsigned int, unsigned int, struct zonelist *)); - -static inline struct page *alloc_pages_node(int nid, unsigned int gfp_mask, - unsigned int order) -{ - if (unlikely(order >= MAX_ORDER)) - return NULL; - - return __alloc_pages(gfp_mask, order, - NODE_DATA(nid)->node_zonelists + (gfp_mask & GFP_ZONEMASK)); -} - -#ifdef CONFIG_NUMA -extern struct page *alloc_pages_current(unsigned gfp_mask, unsigned order); - -static inline struct page * -alloc_pages(unsigned int gfp_mask, unsigned int order) -{ - if (unlikely(order >= MAX_ORDER)) - return NULL; - - return alloc_pages_current(gfp_mask, order); -} -extern struct page *alloc_page_vma(unsigned gfp_mask, - struct vm_area_struct *vma, unsigned long addr); -#else -#define alloc_pages(gfp_mask, order) \ - alloc_pages_node(numa_node_id(), gfp_mask, order) -#define alloc_page_vma(gfp_mask, vma, addr) alloc_pages(gfp_mask, 0) -#endif -#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) - -extern unsigned long FASTCALL(__get_free_pages(unsigned int gfp_mask, unsigned int order)); -extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask)); - -#define __get_free_page(gfp_mask) \ - __get_free_pages((gfp_mask),0) - -#define __get_dma_pages(gfp_mask, order) \ - __get_free_pages((gfp_mask) | GFP_DMA,(order)) - -extern void FASTCALL(__free_pages(struct page *page, unsigned int order)); -extern void FASTCALL(free_pages(unsigned long addr, unsigned int order)); -extern void FASTCALL(free_hot_page(struct page *page)); -extern void FASTCALL(free_cold_page(struct page *page)); - -#define __free_page(page) __free_pages((page), 0) -#define free_page(addr) free_pages((addr),0) - -void page_alloc_init(void); - -#endif /* __LINUX_GFP_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/linux/highmem.h --- a/linux-2.6.11-xen-sparse/include/linux/highmem.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,106 +0,0 @@ -#ifndef _LINUX_HIGHMEM_H -#define _LINUX_HIGHMEM_H - -#include <linux/config.h> -#include <linux/fs.h> -#include <linux/mm.h> - -#include <asm/cacheflush.h> - -#ifdef CONFIG_HIGHMEM - -#include <asm/highmem.h> - -/* declarations for linux/mm/highmem.c */ -unsigned int nr_free_highpages(void); -void kmap_flush_unused(void); - -#else /* CONFIG_HIGHMEM */ - -static inline unsigned int nr_free_highpages(void) { return 0; } -static inline void kmap_flush_unused(void) { } - -static inline void *kmap(struct page *page) -{ - might_sleep(); - return page_address(page); -} - -#define kunmap(page) do { (void) (page); } while (0) - -#define kmap_atomic(page, idx) page_address(page) -#define kunmap_atomic(addr, idx) do { } while (0) -#define kmap_atomic_to_page(ptr) virt_to_page(ptr) - -#endif /* CONFIG_HIGHMEM */ - -/* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ -static inline void clear_user_highpage(struct page *page, unsigned long vaddr) -{ - void *addr = kmap_atomic(page, KM_USER0); - clear_user_page(addr, vaddr, page); - kunmap_atomic(addr, KM_USER0); - /* Make sure this page is cleared on other CPU's too before using it */ - smp_wmb(); -} - -#ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE -static inline struct page * -alloc_zeroed_user_highpage(struct vm_area_struct *vma, unsigned long vaddr) -{ - struct page *page = alloc_page_vma(GFP_HIGHUSER, vma, vaddr); - - if (page) - clear_user_highpage(page, vaddr); - - return page; -} -#endif - -static inline void 
clear_highpage(struct page *page) -{ - void *kaddr = kmap_atomic(page, KM_USER0); - clear_page(kaddr); - kunmap_atomic(kaddr, KM_USER0); -} - -/* - * Same but also flushes aliased cache contents to RAM. - */ -static inline void memclear_highpage_flush(struct page *page, unsigned int offset, unsigned int size) -{ - void *kaddr; - - BUG_ON(offset + size > PAGE_SIZE); - - kaddr = kmap_atomic(page, KM_USER0); - memset((char *)kaddr + offset, 0, size); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); -} - -static inline void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr) -{ - char *vfrom, *vto; - - vfrom = kmap_atomic(from, KM_USER0); - vto = kmap_atomic(to, KM_USER1); - copy_user_page(vto, vfrom, vaddr, to); - kunmap_atomic(vfrom, KM_USER0); - kunmap_atomic(vto, KM_USER1); - /* Make sure this page is cleared on other CPU's too before using it */ - smp_wmb(); -} - -static inline void copy_highpage(struct page *to, struct page *from) -{ - char *vfrom, *vto; - - vfrom = kmap_atomic(from, KM_USER0); - vto = kmap_atomic(to, KM_USER1); - copy_page(vto, vfrom); - kunmap_atomic(vfrom, KM_USER0); - kunmap_atomic(vto, KM_USER1); -} - -#endif /* _LINUX_HIGHMEM_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/linux/irq.h --- a/linux-2.6.11-xen-sparse/include/linux/irq.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,98 +0,0 @@ -#ifndef __irq_h -#define __irq_h - -/* - * Please do not include this file in generic code. There is currently - * no requirement for any architecture to implement anything held - * within this file. - * - * Thanks. --rmk - */ - -#include <linux/config.h> - -#if !defined(CONFIG_ARCH_S390) - -#include <linux/linkage.h> -#include <linux/cache.h> -#include <linux/spinlock.h> -#include <linux/cpumask.h> - -#include <asm/irq.h> -#include <asm/ptrace.h> - -/* - * IRQ line status. - */ -#define IRQ_INPROGRESS 1 /* IRQ handler active - do not enter! */ -#define IRQ_DISABLED 2 /* IRQ disabled - do not enter! */ -#define IRQ_PENDING 4 /* IRQ pending - replay on enable */ -#define IRQ_REPLAY 8 /* IRQ has been replayed but not acked yet */ -#define IRQ_AUTODETECT 16 /* IRQ is being autodetected */ -#define IRQ_WAITING 32 /* IRQ not yet seen - for autodetection */ -#define IRQ_LEVEL 64 /* IRQ level triggered */ -#define IRQ_MASKED 128 /* IRQ masked - shouldn't be seen again */ -#define IRQ_PER_CPU 256 /* IRQ is per CPU */ - -/* - * Interrupt controller descriptor. This is all we need - * to describe about the low-level hardware. - */ -struct hw_interrupt_type { - const char * typename; - unsigned int (*startup)(unsigned int irq); - void (*shutdown)(unsigned int irq); - void (*enable)(unsigned int irq); - void (*disable)(unsigned int irq); - void (*ack)(unsigned int irq); - void (*end)(unsigned int irq); - void (*set_affinity)(unsigned int irq, cpumask_t dest); -}; - -typedef struct hw_interrupt_type hw_irq_controller; - -/* - * This is the "IRQ descriptor", which contains various information - * about the irq, including what kind of hardware handling it has, - * whether it is disabled etc etc. - * - * Pad this out to 32 bytes for cache and indexing reasons. 
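
A minimal sketch of how the hw_interrupt_type descriptor above might be filled in for a controller that needs no special handling; all names here are hypothetical:

#include <linux/irq.h>

static unsigned int noop_startup(unsigned int irq)
{
	return 0;	/* no interrupt pending at startup */
}

static void noop_hook(unsigned int irq)
{
}

static struct hw_interrupt_type dummy_pic = {
	.typename	= "dummy-pic",
	.startup	= noop_startup,
	.shutdown	= noop_hook,
	.enable		= noop_hook,
	.disable	= noop_hook,
	.ack		= noop_hook,
	.end		= noop_hook,
	/* .set_affinity left NULL: this controller cannot route IRQs */
};
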
- */ -typedef struct irq_desc { - hw_irq_controller *handler; - void *handler_data; - struct irqaction *action; /* IRQ action list */ - unsigned int status; /* IRQ status */ - unsigned int depth; /* nested irq disables */ - unsigned int irq_count; /* For detecting broken interrupts */ - unsigned int irqs_unhandled; - spinlock_t lock; -} ____cacheline_aligned irq_desc_t; - -extern irq_desc_t irq_desc [NR_IRQS]; - -#include <asm/hw_irq.h> /* the arch dependent stuff */ - -extern int setup_irq(unsigned int irq, struct irqaction * new); -extern int teardown_irq(unsigned int irq, struct irqaction * old); - -#ifdef CONFIG_GENERIC_HARDIRQS -extern cpumask_t irq_affinity[NR_IRQS]; -extern int no_irq_affinity; -extern int noirqdebug_setup(char *str); - -extern fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, - struct irqaction *action); -extern fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs); -extern void note_interrupt(unsigned int irq, irq_desc_t *desc, int action_ret); -extern void report_bad_irq(unsigned int irq, irq_desc_t *desc, int action_ret); -extern int can_request_irq(unsigned int irq, unsigned long irqflags); - -extern void init_irq_proc(void); -#endif - -extern hw_irq_controller no_irq_type; /* needed in every arch ? */ - -#endif - -#endif /* __irq_h */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/linux/mm.h --- a/linux-2.6.11-xen-sparse/include/linux/mm.h Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,865 +0,0 @@ -#ifndef _LINUX_MM_H -#define _LINUX_MM_H - -#include <linux/sched.h> -#include <linux/errno.h> - -#ifdef __KERNEL__ - -#include <linux/config.h> -#include <linux/gfp.h> -#include <linux/list.h> -#include <linux/mmzone.h> -#include <linux/rbtree.h> -#include <linux/prio_tree.h> -#include <linux/fs.h> - -struct mempolicy; -struct anon_vma; - -#ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */ -extern unsigned long max_mapnr; -#endif - -extern unsigned long num_physpages; -extern void * high_memory; -extern unsigned long vmalloc_earlyreserve; -extern int page_cluster; - -#ifdef CONFIG_SYSCTL -extern int sysctl_legacy_va_layout; -#else -#define sysctl_legacy_va_layout 0 -#endif - -#include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/processor.h> -#include <asm/atomic.h> - -#ifndef MM_VM_SIZE -#define MM_VM_SIZE(mm) ((TASK_SIZE + PGDIR_SIZE - 1) & PGDIR_MASK) -#endif - -#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) - -/* - * Linux kernel virtual memory manager primitives. - * The idea being to have a "virtual" mm in the same way - * we have a virtual fs - giving a cleaner interface to the - * mm details, and allowing different kinds of memory mappings - * (from shared memory to executable loading to arbitrary - * mmap() functions). - */ - -/* - * This struct defines a memory VMM memory area. There is one of these - * per VM-area/task. A VM area is any part of the process virtual memory - * space that has a special rule for the page-fault handlers (ie a shared - * library, the executable area etc). - */ -struct vm_area_struct { - struct mm_struct * vm_mm; /* The address space we belong to. */ - unsigned long vm_start; /* Our start address within vm_mm. */ - unsigned long vm_end; /* The first byte after our end address - within vm_mm. */ - - /* linked list of VM areas per task, sorted by address */ - struct vm_area_struct *vm_next; - - pgprot_t vm_page_prot; /* Access permissions of this VMA. */ - unsigned long vm_flags; /* Flags, listed below. 
*/ - - struct rb_node vm_rb; - - /* - * For areas with an address space and backing store, - * linkage into the address_space->i_mmap prio tree, or - * linkage to the list of like vmas hanging off its node, or - * linkage of vma in the address_space->i_mmap_nonlinear list. - */ - union { - struct { - struct list_head list; - void *parent; /* aligns with prio_tree_node parent */ - struct vm_area_struct *head; - } vm_set; - - struct raw_prio_tree_node prio_tree_node; - } shared; - - /* - * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma - * list, after a COW of one of the file pages. A MAP_SHARED vma - * can only be in the i_mmap tree. An anonymous MAP_PRIVATE, stack - * or brk vma (with NULL file) can only be in an anon_vma list. - */ - struct list_head anon_vma_node; /* Serialized by anon_vma->lock */ - struct anon_vma *anon_vma; /* Serialized by page_table_lock */ - - /* Function pointers to deal with this struct. */ - struct vm_operations_struct * vm_ops; - - /* Information about our backing store: */ - unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE - units, *not* PAGE_CACHE_SIZE */ - struct file * vm_file; /* File we map to (can be NULL). */ - void * vm_private_data; /* was vm_pte (shared mem) */ - unsigned long vm_truncate_count;/* truncate_count or restart_addr */ - -#ifndef CONFIG_MMU - atomic_t vm_usage; /* refcount (VMAs shared if !MMU) */ -#endif -#ifdef CONFIG_NUMA - struct mempolicy *vm_policy; /* NUMA policy for the VMA */ -#endif -}; - -/* - * This struct defines the per-mm list of VMAs for uClinux. If CONFIG_MMU is - * disabled, then there's a single shared list of VMAs maintained by the - * system, and mm's subscribe to these individually - */ -struct vm_list_struct { - struct vm_list_struct *next; - struct vm_area_struct *vma; -}; - -#ifndef CONFIG_MMU -extern struct rb_root nommu_vma_tree; -extern struct rw_semaphore nommu_vma_sem; - -extern unsigned int kobjsize(const void *objp); -#endif - -/* - * vm_flags.. - */ -#define VM_READ 0x00000001 /* currently active flags */ -#define VM_WRITE 0x00000002 -#define VM_EXEC 0x00000004 -#define VM_SHARED 0x00000008 - -#define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */ -#define VM_MAYWRITE 0x00000020 -#define VM_MAYEXEC 0x00000040 -#define VM_MAYSHARE 0x00000080 - -#define VM_GROWSDOWN 0x00000100 /* general info on the segment */ -#define VM_GROWSUP 0x00000200 -#define VM_SHM 0x00000400 /* shared memory area, don't swap out */ -#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. 
*/ - -#define VM_EXECUTABLE 0x00001000 -#define VM_LOCKED 0x00002000 -#define VM_IO 0x00004000 /* Memory mapped I/O or similar */ - - /* Used by sys_madvise() */ -#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ -#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ - -#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ -#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ -#define VM_RESERVED 0x00080000 /* Don't unmap it from swap_out */ -#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ -#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ -#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ -#define VM_FOREIGN 0x01000000 /* Has pages belonging to another VM */ - -#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ -#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS -#endif - -#ifdef CONFIG_STACK_GROWSUP -#define VM_STACK_FLAGS (VM_GROWSUP | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) -#else -#define VM_STACK_FLAGS (VM_GROWSDOWN | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) -#endif - -#define VM_READHINTMASK (VM_SEQ_READ | VM_RAND_READ) -#define VM_ClearReadHint(v) (v)->vm_flags &= ~VM_READHINTMASK -#define VM_NormalReadHint(v) (!((v)->vm_flags & VM_READHINTMASK)) -#define VM_SequentialReadHint(v) ((v)->vm_flags & VM_SEQ_READ) -#define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ) - -/* - * mapping from the currently active vm_flags protection bits (the - * low four bits) to a page protection mask.. - */ -extern pgprot_t protection_map[16]; - - -/* - * These are the virtual MM functions - opening of an area, closing and - * unmapping it (needed to keep files on disk up-to-date etc), pointer - * to the functions called when a no-page or a wp-page exception occurs. - */ -struct vm_operations_struct { - void (*open)(struct vm_area_struct * area); - void (*close)(struct vm_area_struct * area); - struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int *type); - int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock); -#ifdef CONFIG_NUMA - int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); - struct mempolicy *(*get_policy)(struct vm_area_struct *vma, - unsigned long addr); -#endif -}; - -struct mmu_gather; -struct inode; - -#ifdef ARCH_HAS_ATOMIC_UNSIGNED -typedef unsigned page_flags_t; -#else -typedef unsigned long page_flags_t; -#endif - -/* - * Each physical page in the system has a struct page associated with - * it to keep track of whatever it is we are using the page for at the - * moment. Note that we have no way to track which tasks are using - * a page. - */ -struct page { - page_flags_t flags; /* Atomic flags, some possibly - * updated asynchronously */ - atomic_t _count; /* Usage count, see below. */ - atomic_t _mapcount; /* Count of ptes mapped in mms, - * to show when page is mapped - * & limit reverse map searches. - */ - unsigned long private; /* Mapping-private opaque data: - * usually used for buffer_heads - * if PagePrivate set; used for - * swp_entry_t if PageSwapCache - * When page is free, this indicates - * order in the buddy system. - */ - struct address_space *mapping; /* If low bit clear, points to - * inode address_space, or NULL. - * If page mapped as anonymous - * memory, low bit is set, and - * it points to anon_vma object: - * see PAGE_MAPPING_ANON below. - */ - pgoff_t index; /* Our offset within mapping. 
*/ - struct list_head lru; /* Pageout list, eg. active_list - * protected by zone->lru_lock ! - */ - /* - * On machines where all RAM is mapped into kernel address space, - * we can simply calculate the virtual address. On machines with - * highmem some memory is mapped into kernel virtual memory - * dynamically, so we need a place to store that address. - * Note that this field could be 16 bits on x86 ... ;) - * - * Architectures with slow multiplication can define - * WANT_PAGE_VIRTUAL in asm/page.h - */ -#if defined(WANT_PAGE_VIRTUAL) - void *virtual; /* Kernel virtual address (NULL if - not kmapped, ie. highmem) */ -#endif /* WANT_PAGE_VIRTUAL */ -}; - -/* - * FIXME: take this include out, include page-flags.h in - * files which need it (119 of them) - */ -#include <linux/page-flags.h> - -/* - * Methods to modify the page usage count. - * - * What counts for a page usage: - * - cache mapping (page->mapping) - * - private data (page->private) - * - page mapped in a task's page tables, each mapping - * is counted separately - * - * Also, many kernel routines increase the page count before a critical - * routine so they can be sure the page doesn't go away from under them. - * - * Since 2.6.6 (approx), a free page has ->_count = -1. This is so that we - * can use atomic_add_negative(-1, page->_count) to detect when the page - * becomes free and so that we can also use atomic_inc_and_test to atomically - * detect when we just tried to grab a ref on a page which some other CPU has - * already deemed to be freeable. - * - * NO code should make assumptions about this internal detail! Use the provided - * macros which retain the old rules: page_count(page) == 0 is a free page. - */ - -/* - * Drop a ref, return true if the logical refcount fell to zero (the page has - * no users) - */ -#define put_page_testzero(p) \ - ({ \ - BUG_ON(page_count(p) == 0); \ - atomic_add_negative(-1, &(p)->_count); \ - }) - -/* - * Grab a ref, return true if the page previously had a logical refcount of - * zero. ie: returns true if we just grabbed an already-deemed-to-be-free page - */ -#define get_page_testone(p) atomic_inc_and_test(&(p)->_count) - -#define set_page_count(p,v) atomic_set(&(p)->_count, v - 1) -#define __put_page(p) atomic_dec(&(p)->_count) - -extern void FASTCALL(__page_cache_release(struct page *)); - -#ifdef CONFIG_HUGETLB_PAGE - -static inline int page_count(struct page *p) -{ - if (PageCompound(p)) - p = (struct page *)p->private; - return atomic_read(&(p)->_count) + 1; -} - -static inline void get_page(struct page *page) -{ - if (unlikely(PageCompound(page))) - page = (struct page *)page->private; - atomic_inc(&page->_count); -} - -void put_page(struct page *page); - -#else /* CONFIG_HUGETLB_PAGE */ - -#define page_count(p) (atomic_read(&(p)->_count) + 1) - -static inline void get_page(struct page *page) -{ - atomic_inc(&page->_count); -} - -static inline void put_page(struct page *page) -{ - if (!PageReserved(page) && put_page_testzero(page)) - __page_cache_release(page); -} - -#endif /* CONFIG_HUGETLB_PAGE */ - -/* - * Multiple processes may "see" the same page. E.g. for untouched - * mappings of /dev/null, all processes see the same page full of - * zeroes, and text pages of executables and shared libraries have - * only one copy in memory, at most, normally. - * - * For the non-reserved pages, page_count(page) denotes a reference count. - * page_count() == 0 means the page is free. - * page_count() == 1 means the page is used for exactly one purpose - * (e.g. 
a private data page of one process). - * - * A page may be used for kmalloc() or anyone else who does a - * __get_free_page(). In this case the page_count() is at least 1, and - * all other fields are unused but should be 0 or NULL. The - * management of this page is the responsibility of the one who uses - * it. - * - * The other pages (we may call them "process pages") are completely - * managed by the Linux memory manager: I/O, buffers, swapping etc. - * The following discussion applies only to them. - * - * A page may belong to an inode's memory mapping. In this case, - * page->mapping is the pointer to the inode, and page->index is the - * file offset of the page, in units of PAGE_CACHE_SIZE. - * - * A page contains an opaque `private' member, which belongs to the - * page's address_space. Usually, this is the address of a circular - * list of the page's disk buffers. - * - * For pages belonging to inodes, the page_count() is the number of - * attaches, plus 1 if `private' contains something, plus one for - * the page cache itself. - * - * All pages belonging to an inode are in these doubly linked lists: - * mapping->clean_pages, mapping->dirty_pages and mapping->locked_pages; - * using the page->list list_head. These fields are also used for - * freelist managemet (when page_count()==0). - * - * There is also a per-mapping radix tree mapping index to the page - * in memory if present. The tree is rooted at mapping->root. - * - * All process pages can do I/O: - * - inode pages may need to be read from disk, - * - inode pages which have been modified and are MAP_SHARED may need - * to be written to disk, - * - private pages which have been modified may need to be swapped out - * to swap space and (later) to be read back into memory. - */ - -/* - * The zone field is never updated after free_area_init_core() - * sets it, so none of the operations on it need to be atomic. - * We'll have up to (MAX_NUMNODES * MAX_NR_ZONES) zones total, - * so we use (MAX_NODES_SHIFT + MAX_ZONES_SHIFT) here to get enough bits. 
- */ -#define NODEZONE_SHIFT (sizeof(page_flags_t)*8 - MAX_NODES_SHIFT - MAX_ZONES_SHIFT) -#define NODEZONE(node, zone) ((node << ZONES_SHIFT) | zone) - -static inline unsigned long page_zonenum(struct page *page) -{ - return (page->flags >> NODEZONE_SHIFT) & (~(~0UL << ZONES_SHIFT)); -} -static inline unsigned long page_to_nid(struct page *page) -{ - return (page->flags >> (NODEZONE_SHIFT + ZONES_SHIFT)); -} - -struct zone; -extern struct zone *zone_table[]; - -static inline struct zone *page_zone(struct page *page) -{ - return zone_table[page->flags >> NODEZONE_SHIFT]; -} - -static inline void set_page_zone(struct page *page, unsigned long nodezone_num) -{ - page->flags &= ~(~0UL << NODEZONE_SHIFT); - page->flags |= nodezone_num << NODEZONE_SHIFT; -} - -#ifndef CONFIG_DISCONTIGMEM -/* The array of struct pages - for discontigmem use pgdat->lmem_map */ -extern struct page *mem_map; -#endif - -static inline void *lowmem_page_address(struct page *page) -{ - return __va(page_to_pfn(page) << PAGE_SHIFT); -} - -#if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) -#define HASHED_PAGE_VIRTUAL -#endif - -#if defined(WANT_PAGE_VIRTUAL) -#define page_address(page) ((page)->virtual) -#define set_page_address(page, address) \ - do { \ - (page)->virtual = (address); \ - } while(0) -#define page_address_init() do { } while(0) -#endif - -#if defined(HASHED_PAGE_VIRTUAL) -void *page_address(struct page *page); -void set_page_address(struct page *page, void *virtual); -void page_address_init(void); -#endif - -#if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) -#define page_address(page) lowmem_page_address(page) -#define set_page_address(page, address) do { } while(0) -#define page_address_init() do { } while(0) -#endif - -/* - * On an anonymous page mapped into a user virtual memory area, - * page->mapping points to its anon_vma, not to a struct address_space; - * with the PAGE_MAPPING_ANON bit set to distinguish it. - * - * Please note that, confusingly, "page_mapping" refers to the inode - * address_space which maps the page from disk; whereas "page_mapped" - * refers to user virtual address space into which the page is mapped. - */ -#define PAGE_MAPPING_ANON 1 - -extern struct address_space swapper_space; -static inline struct address_space *page_mapping(struct page *page) -{ - struct address_space *mapping = page->mapping; - - if (unlikely(PageSwapCache(page))) - mapping = &swapper_space; - else if (unlikely((unsigned long)mapping & PAGE_MAPPING_ANON)) - mapping = NULL; - return mapping; -} - -static inline int PageAnon(struct page *page) -{ - return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; -} - -/* - * Return the pagecache index of the passed page. Regular pagecache pages - * use ->index whereas swapcache pages use ->private - */ -static inline pgoff_t page_index(struct page *page) -{ - if (unlikely(PageSwapCache(page))) - return page->private; - return page->index; -} - -/* - * The atomic page->_mapcount, like _count, starts from -1: - * so that transitions both from it and to it can be tracked, - * using atomic_inc_and_test and atomic_add_negative(-1). - */ -static inline void reset_page_mapcount(struct page *page) -{ - atomic_set(&(page)->_mapcount, -1); -} - -static inline int page_mapcount(struct page *page) -{ - return atomic_read(&(page)->_mapcount) + 1; -} - -/* - * Return true if this page is mapped into pagetables. 
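
Taken together, the helpers above let a caller classify a page without touching the raw mapping pointer; a small sketch (hypothetical function name):

#include <linux/mm.h>

static const char *page_kind(struct page *page)
{
	if (PageAnon(page))
		return "anonymous";	/* low bit set: points at an anon_vma */
	if (page_mapping(page))
		return "page cache";	/* inode or swapper address_space */
	return "other";			/* e.g. slab, or a free page */
}
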
- */ -static inline int page_mapped(struct page *page) -{ - return atomic_read(&(page)->_mapcount) >= 0; -} - -/* - * Error return values for the *_nopage functions - */ -#define NOPAGE_SIGBUS (NULL) -#define NOPAGE_OOM ((struct page *) (-1)) - -/* - * Different kinds of faults, as returned by handle_mm_fault(). - * Used to decide whether a process gets delivered SIGBUS or - * just gets major/minor fault counters bumped up. - */ -#define VM_FAULT_OOM (-1) -#define VM_FAULT_SIGBUS 0 -#define VM_FAULT_MINOR 1 -#define VM_FAULT_MAJOR 2 - -#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) - -extern void show_free_areas(void); - -#ifdef CONFIG_SHMEM -struct page *shmem_nopage(struct vm_area_struct *vma, - unsigned long address, int *type); -int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new); -struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, - unsigned long addr); -int shmem_lock(struct file *file, int lock, struct user_struct *user); -#else -#define shmem_nopage filemap_nopage -#define shmem_lock(a, b, c) ({0;}) /* always in memory, no need to lock */ -#define shmem_set_policy(a, b) (0) -#define shmem_get_policy(a, b) (NULL) -#endif -struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags); - -int shmem_zero_setup(struct vm_area_struct *); - -static inline int can_do_mlock(void) -{ - if (capable(CAP_IPC_LOCK)) - return 1; - if (current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur != 0) - return 1; - return 0; -} -extern int user_shm_lock(size_t, struct user_struct *); -extern void user_shm_unlock(size_t, struct user_struct *); - -/* - * Parameter block passed down to zap_pte_range in exceptional cases. - */ -struct zap_details { - struct vm_area_struct *nonlinear_vma; /* Check page->index if set */ - struct address_space *check_mapping; /* Check page->mapping if set */ - pgoff_t first_index; /* Lowest page->index to unmap */ - pgoff_t last_index; /* Highest page->index to unmap */ - spinlock_t *i_mmap_lock; /* For unmap_mapping_range: */ - unsigned long break_addr; /* Where unmap_vmas stopped */ - unsigned long truncate_count; /* Compare vm_truncate_count */ -}; - -void zap_page_range(struct vm_area_struct *vma, unsigned long address, - unsigned long size, struct zap_details *); -int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, - struct vm_area_struct *start_vma, unsigned long start_addr, - unsigned long end_addr, unsigned long *nr_accounted, - struct zap_details *); -void clear_page_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end); -int copy_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma); -int zeromap_page_range(struct vm_area_struct *vma, unsigned long from, - unsigned long size, pgprot_t prot); -void unmap_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen, int even_cows); - -static inline void unmap_shared_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen) -{ - unmap_mapping_range(mapping, holebegin, holelen, 0); -} - -extern int vmtruncate(struct inode * inode, loff_t offset); -extern pud_t *FASTCALL(__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)); -extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)); -extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); -extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); -extern int 
install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot);
-extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot);
-extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
-extern int make_pages_present(unsigned long addr, unsigned long end);
-extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
-void install_arg_page(struct vm_area_struct *, struct page *, unsigned long);
-
-int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
-		int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
-
-int __set_page_dirty_buffers(struct page *page);
-int __set_page_dirty_nobuffers(struct page *page);
-int redirty_page_for_writepage(struct writeback_control *wbc,
-				struct page *page);
-int FASTCALL(set_page_dirty(struct page *page));
-int set_page_dirty_lock(struct page *page);
-int clear_page_dirty_for_io(struct page *page);
-
-extern unsigned long do_mremap(unsigned long addr,
-			       unsigned long old_len, unsigned long new_len,
-			       unsigned long flags, unsigned long new_addr);
-
-/*
- * Prototype to add a shrinker callback for ageable caches.
- *
- * These functions are passed a count `nr_to_scan' and a gfpmask.  They should
- * scan `nr_to_scan' objects, attempting to free them.
- *
- * The callback must return the number of objects which remain in the cache.
- *
- * The callback will be passed nr_to_scan == 0 when the VM is querying the
- * cache size, so a fastpath for that case is appropriate.
- */
-typedef int (*shrinker_t)(int nr_to_scan, unsigned int gfp_mask);
-
-/*
- * Add an aging callback.  The int is the number of 'seeks' it takes
- * to recreate one of the objects that these functions age.
- */
-
-#define DEFAULT_SEEKS 2
-struct shrinker;
-extern struct shrinker *set_shrinker(int, shrinker_t);
-extern void remove_shrinker(struct shrinker *shrinker);
-
-/*
- * On a two-level or three-level page table, this ends up being trivial. Thus
- * the inlining and the symmetry break with pte_alloc_map() that does all
- * of this out-of-line.
- */
-/*
- * The following ifdef is needed to get the 4level-fixup.h header to work.
- * Remove it when 4level-fixup.h has been removed.
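
Returning to the shrinker interface declared above, a minimal registration sketch; the demo_* names are hypothetical and the actual freeing logic is elided:

#include <linux/kernel.h>
#include <linux/mm.h>

static int demo_cache_count;	/* hypothetical number of cached objects */

static int demo_shrink(int nr_to_scan, unsigned int gfp_mask)
{
	if (nr_to_scan == 0)
		return demo_cache_count;	/* query path: report size only */
	/* ... free up to nr_to_scan objects here ... */
	demo_cache_count -= min(nr_to_scan, demo_cache_count);
	return demo_cache_count;		/* objects remaining in the cache */
}

static struct shrinker *demo_shrinker;

static void demo_cache_init(void)
{
	demo_shrinker = set_shrinker(DEFAULT_SEEKS, demo_shrink);
}

static void demo_cache_exit(void)
{
	remove_shrinker(demo_shrinker);
}
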
- */ -#ifdef CONFIG_MMU -#ifndef __ARCH_HAS_4LEVEL_HACK -static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) -{ - if (pgd_none(*pgd)) - return __pud_alloc(mm, pgd, address); - return pud_offset(pgd, address); -} - -static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) -{ - if (pud_none(*pud)) - return __pmd_alloc(mm, pud, address); - return pmd_offset(pud, address); -} -#endif -#endif /* CONFIG_MMU */ - -extern void free_area_init(unsigned long * zones_size); -extern void free_area_init_node(int nid, pg_data_t *pgdat, - unsigned long * zones_size, unsigned long zone_start_pfn, - unsigned long *zholes_size); -extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long); -extern void mem_init(void); -extern void show_mem(void); -extern void si_meminfo(struct sysinfo * val); -extern void si_meminfo_node(struct sysinfo *val, int nid); - -/* prio_tree.c */ -void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); -void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *); -void vma_prio_tree_remove(struct vm_area_struct *, struct prio_tree_root *); -struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma, - struct prio_tree_iter *iter); - -#define vma_prio_tree_foreach(vma, iter, root, begin, end) \ - for (prio_tree_iter_init(iter, root, begin, end), vma = NULL; \ - (vma = vma_prio_tree_next(vma, iter)); ) - -static inline void vma_nonlinear_insert(struct vm_area_struct *vma, - struct list_head *list) -{ - vma->shared.vm_set.parent = NULL; - list_add_tail(&vma->shared.vm_set.list, list); -} - -/* mmap.c */ -extern int __vm_enough_memory(long pages, int cap_sys_admin); -extern void vma_adjust(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert); -extern struct vm_area_struct *vma_merge(struct mm_struct *, - struct vm_area_struct *prev, unsigned long addr, unsigned long end, - unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *); -extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); -extern int split_vma(struct mm_struct *, - struct vm_area_struct *, unsigned long addr, int new_below); -extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); -extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, - struct rb_node **, struct rb_node *); -extern struct vm_area_struct *copy_vma(struct vm_area_struct **, - unsigned long addr, unsigned long len, pgoff_t pgoff); -extern void exit_mmap(struct mm_struct *); - -extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); - -extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, - unsigned long len, unsigned long prot, - unsigned long flag, unsigned long pgoff); - -static inline unsigned long do_mmap(struct file *file, unsigned long addr, - unsigned long len, unsigned long prot, - unsigned long flag, unsigned long offset) -{ - unsigned long ret = -EINVAL; - if ((offset + PAGE_ALIGN(len)) < offset) - goto out; - if (!(offset & ~PAGE_MASK)) - ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); -out: - return ret; -} - -extern int do_munmap(struct mm_struct *, unsigned long, size_t); - -extern unsigned long do_brk(unsigned long, unsigned long); - -/* filemap.c */ -extern unsigned long page_unuse(struct page *); -extern void truncate_inode_pages(struct address_space *, loff_t); - 
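
The do_mmap() wrapper above exists mainly to reject an unaligned or overflowing offset before handing off to do_mmap_pgoff(). A brief caller sketch (hypothetical helper; note that do_mmap() expects mmap_sem to be held for writing):

#include <linux/mm.h>
#include <linux/sched.h>
#include <asm/mman.h>

static unsigned long map_readonly(struct file *file, unsigned long len)
{
	unsigned long addr;

	down_write(&current->mm->mmap_sem);
	addr = do_mmap(file, 0 /* let the kernel pick the address */, len,
		       PROT_READ, MAP_PRIVATE, 0 /* page-aligned offset */);
	up_write(&current->mm->mmap_sem);
	return addr;	/* mapped address, or -errno on failure */
}
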
-/* generic vm_area_ops exported for stackable file systems */ -extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *); -extern int filemap_populate(struct vm_area_struct *, unsigned long, - unsigned long, pgprot_t, unsigned long, int); - -/* mm/page-writeback.c */ -int write_one_page(struct page *page, int wait); - -/* readahead.c */ -#define VM_MAX_READAHEAD 128 /* kbytes */ -#define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */ -#define VM_MAX_CACHE_HIT 256 /* max pages in a row in cache before - * turning readahead off */ - -int do_page_cache_readahead(struct address_space *mapping, struct file *filp, - unsigned long offset, unsigned long nr_to_read); -int force_page_cache_readahead(struct address_space *mapping, struct file *filp, - unsigned long offset, unsigned long nr_to_read); -unsigned long page_cache_readahead(struct address_space *mapping, - struct file_ra_state *ra, - struct file *filp, - unsigned long offset, - unsigned long size); -void handle_ra_miss(struct address_space *mapping, - struct file_ra_state *ra, pgoff_t offset); -unsigned long max_sane_readahead(unsigned long nr); - -/* Do stack extension */ -extern int expand_stack(struct vm_area_struct * vma, unsigned long address); - -/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ -extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); -extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, - struct vm_area_struct **pprev); - -/* Look up the first VMA which intersects the interval start_addr..end_addr-1, - NULL if none. Assume start_addr < end_addr. */ -static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) -{ - struct vm_area_struct * vma = find_vma(mm,start_addr); - - if (vma && end_addr <= vma->vm_start) - vma = NULL; - return vma; -} - -static inline unsigned long vma_pages(struct vm_area_struct *vma) -{ - return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; -} - -extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr); - -extern struct page * vmalloc_to_page(void *addr); -extern unsigned long vmalloc_to_pfn(void *addr); -extern struct page * follow_page(struct mm_struct *mm, unsigned long address, - int write); -extern int check_user_page_readable(struct mm_struct *mm, unsigned long address); -int remap_pfn_range(struct vm_area_struct *, unsigned long, - unsigned long, unsigned long, pgprot_t); -/* Allow arch override for mapping of device and I/O (non-RAM) pages. 
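
find_vma() and find_vma_intersection() above encode the usual lookup idiom: the returned vma satisfies addr < vm_end, but the address may still fall in the gap below vm_start. A small sketch (hypothetical function name; callers must hold mmap_sem):

#include <linux/mm.h>
#include <linux/sched.h>

static int addr_is_mapped(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma;
	int mapped;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, addr);		/* first vma with addr < vm_end */
	mapped = vma && addr >= vma->vm_start;	/* rule out the gap below it */
	up_read(&mm->mmap_sem);
	return mapped;
}
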
*/
-#ifndef io_remap_pfn_range
-#define io_remap_pfn_range remap_pfn_range
-#endif
-
-#ifdef CONFIG_PROC_FS
-void __vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
-#else
-static inline void __vm_stat_account(struct mm_struct *mm,
-			unsigned long flags, struct file *file, long pages)
-{
-}
-#endif /* CONFIG_PROC_FS */
-
-static inline void vm_stat_account(struct vm_area_struct *vma)
-{
-	__vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
-							vma_pages(vma));
-}
-
-static inline void vm_stat_unaccount(struct vm_area_struct *vma)
-{
-	__vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
-							-vma_pages(vma));
-}
-
-/* update per process rss and vm hiwater data */
-extern void update_mem_hiwater(void);
-
-#ifndef CONFIG_DEBUG_PAGEALLOC
-static inline void
-kernel_map_pages(struct page *page, int numpages, int enable)
-{
-}
-#endif
-
-extern struct vm_area_struct *get_gate_vma(struct task_struct *tsk);
-#ifdef __HAVE_ARCH_GATE_AREA
-int in_gate_area_no_task(unsigned long addr);
-int in_gate_area(struct task_struct *task, unsigned long addr);
-#else
-int in_gate_area_no_task(unsigned long addr);
-#define in_gate_area(task, addr) ({(void)task; in_gate_area_no_task(addr);})
-#endif	/* __HAVE_ARCH_GATE_AREA */
-
-#endif /* __KERNEL__ */
-#endif /* _LINUX_MM_H */
diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/include/linux/skbuff.h
--- a/linux-2.6.11-xen-sparse/include/linux/skbuff.h	Fri Jul 15 19:57:12 2005
+++ /dev/null	Sat Jul 16 14:02:54 2005
@@ -1,1184 +0,0 @@
-/*
- *	Definitions for the 'struct sk_buff' memory handlers.
- *
- *	Authors:
- *		Alan Cox, <gw4pts@xxxxxxxxxxxxxxx>
- *		Florian La Roche, <rzsfl@xxxxxxxxxxxx>
- *
- *	This program is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU General Public License
- *	as published by the Free Software Foundation; either version
- *	2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_SKBUFF_H
-#define _LINUX_SKBUFF_H
-
-#include <linux/config.h>
-#include <linux/kernel.h>
-#include <linux/compiler.h>
-#include <linux/time.h>
-#include <linux/cache.h>
-
-#include <asm/atomic.h>
-#include <asm/types.h>
-#include <linux/spinlock.h>
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/poll.h>
-#include <linux/net.h>
-#include <net/checksum.h>
-
-#define HAVE_ALLOC_SKB		/* For the drivers to know */
-#define HAVE_ALIGNABLE_SKB	/* Ditto 8) */
-#define SLAB_SKB		/* Slabified skbuffs */
-
-#define CHECKSUM_NONE 0
-#define CHECKSUM_HW 1
-#define CHECKSUM_UNNECESSARY 2
-
-#define SKB_DATA_ALIGN(X)	(((X) + (SMP_CACHE_BYTES - 1)) & \
-				 ~(SMP_CACHE_BYTES - 1))
-#define SKB_MAX_ORDER(X, ORDER)	(((PAGE_SIZE << (ORDER)) - (X) - \
-				  sizeof(struct skb_shared_info)) & \
-				  ~(SMP_CACHE_BYTES - 1))
-#define SKB_MAX_HEAD(X)		(SKB_MAX_ORDER((X), 0))
-#define SKB_MAX_ALLOC		(SKB_MAX_ORDER(0, 2))
-
-/* A. Checksumming of received packets by device.
- *
- *	NONE: device failed to checksum this packet.
- *		skb->csum is undefined.
- *
- *	UNNECESSARY: the device parsed the packet and has already verified
- *		the checksum. skb->csum is undefined.
- *		It is a bad option, but unfortunately many vendors do this,
- *		apparently with the secret goal of selling you a new device
- *		when you add a new protocol to your host, e.g. IPv6. 8)
- *
- *	HW: the most generic way. Device supplied checksum of _all_
- *	    the packet as seen by netif_rx in skb->csum.
- *	NOTE: Even if a device supports only some protocols but is
- *	able to produce some skb->csum, it MUST use HW,
- *	not UNNECESSARY.
- *
- * B. Checksumming on output.
- *
- *	NONE: skb is checksummed by protocol or csum is not required.
- *
- *	HW: device is required to csum packet as seen by hard_start_xmit
- *	from skb->h.raw to the end and to record the checksum
- *	at skb->h.raw+skb->csum.
- *
- *	Device must show its capabilities in dev->features, set
- *	at device setup time.
- *	NETIF_F_HW_CSUM	- a clever device that is able to checksum
- *			  everything.
- *	NETIF_F_NO_CSUM	- loopback or reliable single hop media.
- *	NETIF_F_IP_CSUM	- the device is dumb: it is able to csum only
- *			  TCP/UDP over IPv4. Sigh. Vendors like it this
- *			  way for some unknown reason. Though, see the
- *			  comment above about CHECKSUM_UNNECESSARY. 8)
- *
- *	Any questions? No questions, good.		--ANK
- */
-
-#ifdef __i386__
-#define NET_CALLER(arg) (*(((void **)&arg) - 1))
-#else
-#define NET_CALLER(arg) __builtin_return_address(0)
-#endif
-
-struct net_device;
-
-#ifdef CONFIG_NETFILTER
-struct nf_conntrack {
-	atomic_t use;
-	void (*destroy)(struct nf_conntrack *);
-};
-
-#ifdef CONFIG_BRIDGE_NETFILTER
-struct nf_bridge_info {
-	atomic_t use;
-	struct net_device *physindev;
-	struct net_device *physoutdev;
-#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
-	struct net_device *netoutdev;
-#endif
-	unsigned int mask;
-	unsigned long data[32 / sizeof(unsigned long)];
-};
-#endif
-
-#endif
-
-struct sk_buff_head {
-	/* These two members must be first. */
-	struct sk_buff	*next;
-	struct sk_buff	*prev;
-
-	__u32		qlen;
-	spinlock_t	lock;
-};
-
-struct sk_buff;
-
-/* To allow a 64K frame to be packed as a single skb without a frag_list */
-#define MAX_SKB_FRAGS (65536/PAGE_SIZE + 2)
-
-typedef struct skb_frag_struct skb_frag_t;
-
-struct skb_frag_struct {
-	struct page *page;
-	__u16 page_offset;
-	__u16 size;
-};
-
-/* This data is invariant across clones and lives at
- * the end of the header data, ie. at skb->end.
- */
-struct skb_shared_info {
-	atomic_t	dataref;
-	unsigned int	nr_frags;
-	unsigned short	tso_size;
-	unsigned short	tso_segs;
-	struct sk_buff	*frag_list;
-	skb_frag_t	frags[MAX_SKB_FRAGS];
-};
-
-/**
- *	struct sk_buff - socket buffer
- *	@next: Next buffer in list
- *	@prev: Previous buffer in list
- *	@list: List we are on
- *	@sk: Socket we are owned by
- *	@stamp: Time we arrived
- *	@dev: Device we arrived on/are leaving by
- *	@input_dev: Device we arrived on
- *	@real_dev: The real device we are using
- *	@h: Transport layer header
- *	@nh: Network layer header
- *	@mac: Link layer header
- *	@dst: FIXME: Describe this field
- *	@cb: Control buffer. Free for use by every layer.
Put private vars here - * @len: Length of actual data - * @data_len: Data length - * @mac_len: Length of link layer header - * @csum: Checksum - * @__unused: Dead field, may be reused - * @cloned: Head may be cloned (check refcnt to be sure) - * @proto_csum_valid: Protocol csum validated since arriving at localhost - * @proto_csum_blank: Protocol csum must be added before leaving localhost - * @pkt_type: Packet class - * @ip_summed: Driver fed us an IP checksum - * @priority: Packet queueing priority - * @users: User count - see {datagram,tcp}.c - * @protocol: Packet protocol from driver - * @security: Security level of packet - * @truesize: Buffer size - * @head: Head of buffer - * @data: Data head pointer - * @tail: Tail pointer - * @end: End pointer - * @destructor: Destruct function - * @nfmark: Can be used for communication between hooks - * @nfcache: Cache info - * @nfct: Associated connection, if any - * @nfctinfo: Relationship of this skb to the connection - * @nf_debug: Netfilter debugging - * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c - * @private: Data which is private to the HIPPI implementation - * @tc_index: Traffic control index - */ - -struct sk_buff { - /* These two members must be first. */ - struct sk_buff *next; - struct sk_buff *prev; - - struct sk_buff_head *list; - struct sock *sk; - struct timeval stamp; - struct net_device *dev; - struct net_device *input_dev; - struct net_device *real_dev; - - union { - struct tcphdr *th; - struct udphdr *uh; - struct icmphdr *icmph; - struct igmphdr *igmph; - struct iphdr *ipiph; - struct ipv6hdr *ipv6h; - unsigned char *raw; - } h; - - union { - struct iphdr *iph; - struct ipv6hdr *ipv6h; - struct arphdr *arph; - unsigned char *raw; - } nh; - - union { - unsigned char *raw; - } mac; - - struct dst_entry *dst; - struct sec_path *sp; - - /* - * This is the control buffer. It is free to use for every - * layer. Please put your private variables there. If you - * want to keep them across layers you have to do a skb_clone() - * first. This is owned by whoever has the skb queued ATM. - */ - char cb[40]; - - unsigned int len, - data_len, - mac_len, - csum; - unsigned char local_df, - cloned:1, - proto_csum_valid:1, - proto_csum_blank:1, - pkt_type, - ip_summed; - __u32 priority; - unsigned short protocol, - security; - - void (*destructor)(struct sk_buff *skb); -#ifdef CONFIG_NETFILTER - unsigned long nfmark; - __u32 nfcache; - __u32 nfctinfo; - struct nf_conntrack *nfct; -#ifdef CONFIG_NETFILTER_DEBUG - unsigned int nf_debug; -#endif -#ifdef CONFIG_BRIDGE_NETFILTER - struct nf_bridge_info *nf_bridge; -#endif -#endif /* CONFIG_NETFILTER */ -#if defined(CONFIG_HIPPI) - union { - __u32 ifield; - } private; -#endif -#ifdef CONFIG_NET_SCHED - __u32 tc_index; /* traffic control index */ -#ifdef CONFIG_NET_CLS_ACT - __u32 tc_verd; /* traffic control verdict */ - __u32 tc_classid; /* traffic control classid */ -#endif - -#endif - - - /* These elements must be at the end, see alloc_skb() for details. 
*/ - unsigned int truesize; - atomic_t users; - unsigned char *head, - *data, - *tail, - *end; -}; - -#ifdef __KERNEL__ -/* - * Handling routines are only of interest to the kernel - */ -#include <linux/slab.h> - -#include <asm/system.h> - -extern void __kfree_skb(struct sk_buff *skb); -extern struct sk_buff *alloc_skb(unsigned int size, int priority); -extern struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp, - unsigned int size, int priority); -extern void kfree_skbmem(struct sk_buff *skb); -extern struct sk_buff *skb_clone(struct sk_buff *skb, int priority); -extern struct sk_buff *skb_copy(const struct sk_buff *skb, int priority); -extern struct sk_buff *pskb_copy(struct sk_buff *skb, int gfp_mask); -extern int pskb_expand_head(struct sk_buff *skb, - int nhead, int ntail, int gfp_mask); -extern struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, - unsigned int headroom); -extern struct sk_buff *skb_copy_expand(const struct sk_buff *skb, - int newheadroom, int newtailroom, - int priority); -extern struct sk_buff * skb_pad(struct sk_buff *skb, int pad); -#define dev_kfree_skb(a) kfree_skb(a) -extern void skb_over_panic(struct sk_buff *skb, int len, - void *here); -extern void skb_under_panic(struct sk_buff *skb, int len, - void *here); - -/* Internal */ -#define skb_shinfo(SKB) ((struct skb_shared_info *)((SKB)->end)) - -/** - * skb_queue_empty - check if a queue is empty - * @list: queue head - * - * Returns true if the queue is empty, false otherwise. - */ -static inline int skb_queue_empty(const struct sk_buff_head *list) -{ - return list->next == (struct sk_buff *)list; -} - -/** - * skb_get - reference buffer - * @skb: buffer to reference - * - * Makes another reference to a socket buffer and returns a pointer - * to the buffer. - */ -static inline struct sk_buff *skb_get(struct sk_buff *skb) -{ - atomic_inc(&skb->users); - return skb; -} - -/* - * If users == 1, we are the only owner and are can avoid redundant - * atomic change. - */ - -/** - * kfree_skb - free an sk_buff - * @skb: buffer to free - * - * Drop a reference to the buffer and free it if the usage count has - * hit zero. - */ -static inline void kfree_skb(struct sk_buff *skb) -{ - if (likely(atomic_read(&skb->users) == 1)) - smp_rmb(); - else if (likely(!atomic_dec_and_test(&skb->users))) - return; - __kfree_skb(skb); -} - -/** - * skb_cloned - is the buffer a clone - * @skb: buffer to check - * - * Returns true if the buffer was generated with skb_clone() and is - * one of multiple shared copies of the buffer. Cloned buffers are - * shared data so must not be written to under normal circumstances. - */ -static inline int skb_cloned(const struct sk_buff *skb) -{ - return skb->cloned && atomic_read(&skb_shinfo(skb)->dataref) != 1; -} - -/** - * skb_shared - is the buffer shared - * @skb: buffer to check - * - * Returns true if more than one person has a reference to this - * buffer. - */ -static inline int skb_shared(const struct sk_buff *skb) -{ - return atomic_read(&skb->users) != 1; -} - -/** - * skb_share_check - check if buffer is shared and if so clone it - * @skb: buffer to check - * @pri: priority for memory allocation - * - * If the buffer is shared the buffer is cloned and the old copy - * drops a reference. A new clone with a single reference is returned. - * If the buffer is not shared the original buffer is returned. When - * being called from interrupt status or with spinlocks held pri must - * be GFP_ATOMIC. - * - * NULL is returned on a memory allocation failure. 
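
A typical call site for the helper documented above: a path that is about to modify headers first severs any sharing, e.g. (hypothetical function name):

#include <linux/skbuff.h>

static struct sk_buff *own_before_edit(struct sk_buff *skb)
{
	/* Clone away other users so header edits cannot affect them. */
	skb = skb_share_check(skb, GFP_ATOMIC);
	if (!skb)
		return NULL;	/* allocation failed; our reference was dropped */
	/* ... safe to write to skb header fields here ... */
	return skb;
}
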
- */ -static inline struct sk_buff *skb_share_check(struct sk_buff *skb, int pri) -{ - might_sleep_if(pri & __GFP_WAIT); - if (skb_shared(skb)) { - struct sk_buff *nskb = skb_clone(skb, pri); - kfree_skb(skb); - skb = nskb; - } - return skb; -} - -/* - * Copy shared buffers into a new sk_buff. We effectively do COW on - * packets to handle cases where we have a local reader and forward - * and a couple of other messy ones. The normal one is tcpdumping - * a packet thats being forwarded. - */ - -/** - * skb_unshare - make a copy of a shared buffer - * @skb: buffer to check - * @pri: priority for memory allocation - * - * If the socket buffer is a clone then this function creates a new - * copy of the data, drops a reference count on the old copy and returns - * the new copy with the reference count at 1. If the buffer is not a clone - * the original buffer is returned. When called with a spinlock held or - * from interrupt state @pri must be %GFP_ATOMIC - * - * %NULL is returned on a memory allocation failure. - */ -static inline struct sk_buff *skb_unshare(struct sk_buff *skb, int pri) -{ - might_sleep_if(pri & __GFP_WAIT); - if (skb_cloned(skb)) { - struct sk_buff *nskb = skb_copy(skb, pri); - kfree_skb(skb); /* Free our shared copy */ - skb = nskb; - } - return skb; -} - -/** - * skb_peek - * @list_: list to peek at - * - * Peek an &sk_buff. Unlike most other operations you _MUST_ - * be careful with this one. A peek leaves the buffer on the - * list and someone else may run off with it. You must hold - * the appropriate locks or have a private queue to do this. - * - * Returns %NULL for an empty list or a pointer to the head element. - * The reference count is not incremented and the reference is therefore - * volatile. Use with caution. - */ -static inline struct sk_buff *skb_peek(struct sk_buff_head *list_) -{ - struct sk_buff *list = ((struct sk_buff *)list_)->next; - if (list == (struct sk_buff *)list_) - list = NULL; - return list; -} - -/** - * skb_peek_tail - * @list_: list to peek at - * - * Peek an &sk_buff. Unlike most other operations you _MUST_ - * be careful with this one. A peek leaves the buffer on the - * list and someone else may run off with it. You must hold - * the appropriate locks or have a private queue to do this. - * - * Returns %NULL for an empty list or a pointer to the tail element. - * The reference count is not incremented and the reference is therefore - * volatile. Use with caution. - */ -static inline struct sk_buff *skb_peek_tail(struct sk_buff_head *list_) -{ - struct sk_buff *list = ((struct sk_buff *)list_)->prev; - if (list == (struct sk_buff *)list_) - list = NULL; - return list; -} - -/** - * skb_queue_len - get queue length - * @list_: list to measure - * - * Return the length of an &sk_buff queue. - */ -static inline __u32 skb_queue_len(const struct sk_buff_head *list_) -{ - return list_->qlen; -} - -static inline void skb_queue_head_init(struct sk_buff_head *list) -{ - spin_lock_init(&list->lock); - list->prev = list->next = (struct sk_buff *)list; - list->qlen = 0; -} - -/* - * Insert an sk_buff at the start of a list. - * - * The "__skb_xxxx()" functions are the non-atomic ones that - * can only be called with interrupts disabled. - */ - -/** - * __skb_queue_head - queue a buffer at the list head - * @list: list to use - * @newsk: buffer to queue - * - * Queue a buffer at the start of a list. This function takes no locks - * and you must therefore hold required locks before calling it. 
- * - * A buffer cannot be placed on two lists at the same time. - */ -extern void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk); -static inline void __skb_queue_head(struct sk_buff_head *list, - struct sk_buff *newsk) -{ - struct sk_buff *prev, *next; - - newsk->list = list; - list->qlen++; - prev = (struct sk_buff *)list; - next = prev->next; - newsk->next = next; - newsk->prev = prev; - next->prev = prev->next = newsk; -} - -/** - * __skb_queue_tail - queue a buffer at the list tail - * @list: list to use - * @newsk: buffer to queue - * - * Queue a buffer at the end of a list. This function takes no locks - * and you must therefore hold required locks before calling it. - * - * A buffer cannot be placed on two lists at the same time. - */ -extern void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk); -static inline void __skb_queue_tail(struct sk_buff_head *list, - struct sk_buff *newsk) -{ - struct sk_buff *prev, *next; - - newsk->list = list; - list->qlen++; - next = (struct sk_buff *)list; - prev = next->prev; - newsk->next = next; - newsk->prev = prev; - next->prev = prev->next = newsk; -} - - -/** - * __skb_dequeue - remove from the head of the queue - * @list: list to dequeue from - * - * Remove the head of the list. This function does not take any locks - * so must be used with appropriate locks held only. The head item is - * returned or %NULL if the list is empty. - */ -extern struct sk_buff *skb_dequeue(struct sk_buff_head *list); -static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list) -{ - struct sk_buff *next, *prev, *result; - - prev = (struct sk_buff *) list; - next = prev->next; - result = NULL; - if (next != prev) { - result = next; - next = next->next; - list->qlen--; - next->prev = prev; - prev->next = next; - result->next = result->prev = NULL; - result->list = NULL; - } - return result; -} - - -/* - * Insert a packet on a list. - */ -extern void skb_insert(struct sk_buff *old, struct sk_buff *newsk); -static inline void __skb_insert(struct sk_buff *newsk, - struct sk_buff *prev, struct sk_buff *next, - struct sk_buff_head *list) -{ - newsk->next = next; - newsk->prev = prev; - next->prev = prev->next = newsk; - newsk->list = list; - list->qlen++; -} - -/* - * Place a packet after a given packet in a list. - */ -extern void skb_append(struct sk_buff *old, struct sk_buff *newsk); -static inline void __skb_append(struct sk_buff *old, struct sk_buff *newsk) -{ - __skb_insert(newsk, old, old->next, old->list); -} - -/* - * remove sk_buff from list. _Must_ be called atomically, and with - * the list known.. - */ -extern void skb_unlink(struct sk_buff *skb); -static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) -{ - struct sk_buff *next, *prev; - - list->qlen--; - next = skb->next; - prev = skb->prev; - skb->next = skb->prev = NULL; - skb->list = NULL; - next->prev = prev; - prev->next = next; -} - - -/* XXX: more streamlined implementation */ - -/** - * __skb_dequeue_tail - remove from the tail of the queue - * @list: list to dequeue from - * - * Remove the tail of the list. This function does not take any locks - * so must be used with appropriate locks held only. The tail item is - * returned or %NULL if the list is empty. 
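
Because the double-underscore queue operations above take no locks, the caller brings its own; the usual discipline is the queue's embedded spinlock, as in this sketch (hypothetical function name):

#include <linux/skbuff.h>
#include <linux/spinlock.h>

static struct sk_buff *rotate_queue(struct sk_buff_head *q,
				    struct sk_buff *newsk)
{
	struct sk_buff *old;
	unsigned long flags;

	spin_lock_irqsave(&q->lock, flags);	/* guards next/prev/qlen */
	old = __skb_dequeue(q);			/* lock held: __ form is safe */
	__skb_queue_tail(q, newsk);
	spin_unlock_irqrestore(&q->lock, flags);
	return old;				/* NULL if the queue was empty */
}
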
- */ -extern struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list); -static inline struct sk_buff *__skb_dequeue_tail(struct sk_buff_head *list) -{ - struct sk_buff *skb = skb_peek_tail(list); - if (skb) - __skb_unlink(skb, list); - return skb; -} - - -static inline int skb_is_nonlinear(const struct sk_buff *skb) -{ - return skb->data_len; -} - -static inline unsigned int skb_headlen(const struct sk_buff *skb) -{ - return skb->len - skb->data_len; -} - -static inline int skb_pagelen(const struct sk_buff *skb) -{ - int i, len = 0; - - for (i = (int)skb_shinfo(skb)->nr_frags - 1; i >= 0; i--) - len += skb_shinfo(skb)->frags[i].size; - return len + skb_headlen(skb); -} - -static inline void skb_fill_page_desc(struct sk_buff *skb, int i, - struct page *page, int off, int size) -{ - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - frag->page = page; - frag->page_offset = off; - frag->size = size; - skb_shinfo(skb)->nr_frags = i + 1; -} - -#define SKB_PAGE_ASSERT(skb) BUG_ON(skb_shinfo(skb)->nr_frags) -#define SKB_FRAG_ASSERT(skb) BUG_ON(skb_shinfo(skb)->frag_list) -#define SKB_LINEAR_ASSERT(skb) BUG_ON(skb_is_nonlinear(skb)) - -/* - * Add data to an sk_buff - */ -static inline unsigned char *__skb_put(struct sk_buff *skb, unsigned int len) -{ - unsigned char *tmp = skb->tail; - SKB_LINEAR_ASSERT(skb); - skb->tail += len; - skb->len += len; - return tmp; -} - -/** - * skb_put - add data to a buffer - * @skb: buffer to use - * @len: amount of data to add - * - * This function extends the used data area of the buffer. If this would - * exceed the total buffer size the kernel will panic. A pointer to the - * first byte of the extra data is returned. - */ -static inline unsigned char *skb_put(struct sk_buff *skb, unsigned int len) -{ - unsigned char *tmp = skb->tail; - SKB_LINEAR_ASSERT(skb); - skb->tail += len; - skb->len += len; - if (unlikely(skb->tail>skb->end)) - skb_over_panic(skb, len, current_text_addr()); - return tmp; -} - -static inline unsigned char *__skb_push(struct sk_buff *skb, unsigned int len) -{ - skb->data -= len; - skb->len += len; - return skb->data; -} - -/** - * skb_push - add data to the start of a buffer - * @skb: buffer to use - * @len: amount of data to add - * - * This function extends the used data area of the buffer at the buffer - * start. If this would exceed the total buffer headroom the kernel will - * panic. A pointer to the first byte of the extra data is returned. - */ -static inline unsigned char *skb_push(struct sk_buff *skb, unsigned int len) -{ - skb->data -= len; - skb->len += len; - if (unlikely(skb->data<skb->head)) - skb_under_panic(skb, len, current_text_addr()); - return skb->data; -} - -static inline unsigned char *__skb_pull(struct sk_buff *skb, unsigned int len) -{ - skb->len -= len; - BUG_ON(skb->len < skb->data_len); - return skb->data += len; -} - -/** - * skb_pull - remove data from the start of a buffer - * @skb: buffer to use - * @len: amount of data to remove - * - * This function removes data from the start of a buffer, returning - * the memory to the headroom. A pointer to the next data in the buffer - * is returned. Once the data has been pulled future pushes will overwrite - * the old data. - */ -static inline unsigned char *skb_pull(struct sk_buff *skb, unsigned int len) -{ - return unlikely(len > skb->len) ? 
NULL : __skb_pull(skb, len); -} - -extern unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta); - -static inline unsigned char *__pskb_pull(struct sk_buff *skb, unsigned int len) -{ - if (len > skb_headlen(skb) && - !__pskb_pull_tail(skb, len-skb_headlen(skb))) - return NULL; - skb->len -= len; - return skb->data += len; -} - -static inline unsigned char *pskb_pull(struct sk_buff *skb, unsigned int len) -{ - return unlikely(len > skb->len) ? NULL : __pskb_pull(skb, len); -} - -static inline int pskb_may_pull(struct sk_buff *skb, unsigned int len) -{ - if (likely(len <= skb_headlen(skb))) - return 1; - if (unlikely(len > skb->len)) - return 0; - return __pskb_pull_tail(skb, len-skb_headlen(skb)) != NULL; -} - -/** - * skb_headroom - bytes at buffer head - * @skb: buffer to check - * - * Return the number of bytes of free space at the head of an &sk_buff. - */ -static inline int skb_headroom(const struct sk_buff *skb) -{ - return skb->data - skb->head; -} - -/** - * skb_tailroom - bytes at buffer end - * @skb: buffer to check - * - * Return the number of bytes of free space at the tail of an sk_buff - */ -static inline int skb_tailroom(const struct sk_buff *skb) -{ - return skb_is_nonlinear(skb) ? 0 : skb->end - skb->tail; -} - -/** - * skb_reserve - adjust headroom - * @skb: buffer to alter - * @len: bytes to move - * - * Increase the headroom of an empty &sk_buff by reducing the tail - * room. This is only allowed for an empty buffer. - */ -static inline void skb_reserve(struct sk_buff *skb, unsigned int len) -{ - skb->data += len; - skb->tail += len; -} - -/* - * CPUs often take a performance hit when accessing unaligned memory - * locations. The actual performance hit varies, it can be small if the - * hardware handles it or large if we have to take an exception and fix it - * in software. - * - * Since an ethernet header is 14 bytes network drivers often end up with - * the IP header at an unaligned offset. The IP header can be aligned by - * shifting the start of the packet by 2 bytes. Drivers should do this - * with: - * - * skb_reserve(NET_IP_ALIGN); - * - * The downside to this alignment of the IP header is that the DMA is now - * unaligned. On some architectures the cost of an unaligned DMA is high - * and this cost outweighs the gains made by aligning the IP header. - * - * Since this trade off varies between architectures, we allow NET_IP_ALIGN - * to be overridden. - */ -#ifndef NET_IP_ALIGN -#define NET_IP_ALIGN 2 -#endif - -extern int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc); - -static inline void __skb_trim(struct sk_buff *skb, unsigned int len) -{ - if (!skb->data_len) { - skb->len = len; - skb->tail = skb->data + len; - } else - ___pskb_trim(skb, len, 0); -} - -/** - * skb_trim - remove end from a buffer - * @skb: buffer to alter - * @len: new length - * - * Cut the length of a buffer down by removing data from the tail. If - * the buffer is already under the length specified it is not modified. - */ -static inline void skb_trim(struct sk_buff *skb, unsigned int len) -{ - if (skb->len > len) - __skb_trim(skb, len); -} - - -static inline int __pskb_trim(struct sk_buff *skb, unsigned int len) -{ - if (!skb->data_len) { - skb->len = len; - skb->tail = skb->data+len; - return 0; - } - return ___pskb_trim(skb, len, 1); -} - -static inline int pskb_trim(struct sk_buff *skb, unsigned int len) -{ - return (len < skb->len) ? 
__pskb_trim(skb, len) : 0; -} - -/** - * skb_orphan - orphan a buffer - * @skb: buffer to orphan - * - * If a buffer currently has an owner then we call the owner's - * destructor function and make the @skb unowned. The buffer continues - * to exist but is no longer charged to its former owner. - */ -static inline void skb_orphan(struct sk_buff *skb) -{ - if (skb->destructor) - skb->destructor(skb); - skb->destructor = NULL; - skb->sk = NULL; -} - -/** - * __skb_queue_purge - empty a list - * @list: list to empty - * - * Delete all buffers on an &sk_buff list. Each buffer is removed from - * the list and one reference dropped. This function does not take the - * list lock and the caller must hold the relevant locks to use it. - */ -extern void skb_queue_purge(struct sk_buff_head *list); -static inline void __skb_queue_purge(struct sk_buff_head *list) -{ - struct sk_buff *skb; - while ((skb = __skb_dequeue(list)) != NULL) - kfree_skb(skb); -} - -/** - * __dev_alloc_skb - allocate an skbuff for sending - * @length: length to allocate - * @gfp_mask: get_free_pages mask, passed to alloc_skb - * - * Allocate a new &sk_buff and assign it a usage count of one. The - * buffer has unspecified headroom built in. Users should allocate - * the headroom they think they need without accounting for the - * built in space. The built in space is used for optimisations. - * - * %NULL is returned in there is no free memory. - */ -#ifndef CONFIG_HAVE_ARCH_DEV_ALLOC_SKB -static inline struct sk_buff *__dev_alloc_skb(unsigned int length, - int gfp_mask) -{ - struct sk_buff *skb = alloc_skb(length + 16, gfp_mask); - if (likely(skb)) - skb_reserve(skb, 16); - return skb; -} -#else -extern struct sk_buff *__dev_alloc_skb(unsigned int length, int gfp_mask); -#endif - -/** - * dev_alloc_skb - allocate an skbuff for sending - * @length: length to allocate - * - * Allocate a new &sk_buff and assign it a usage count of one. The - * buffer has unspecified headroom built in. Users should allocate - * the headroom they think they need without accounting for the - * built in space. The built in space is used for optimisations. - * - * %NULL is returned in there is no free memory. Although this function - * allocates memory it can be called from an interrupt. - */ -static inline struct sk_buff *dev_alloc_skb(unsigned int length) -{ - return __dev_alloc_skb(length, GFP_ATOMIC); -} - -/** - * skb_cow - copy header of skb when it is required - * @skb: buffer to cow - * @headroom: needed headroom - * - * If the skb passed lacks sufficient headroom or its data part - * is shared, data is reallocated. If reallocation fails, an error - * is returned and original skb is not changed. - * - * The result is skb with writable area skb->head...skb->tail - * and at least @headroom of space at head. - */ -static inline int skb_cow(struct sk_buff *skb, unsigned int headroom) -{ - int delta = (headroom > 16 ? headroom : 16) - skb_headroom(skb); - - if (delta < 0) - delta = 0; - - if (delta || skb_cloned(skb)) - return pskb_expand_head(skb, (delta + 15) & ~15, 0, GFP_ATOMIC); - return 0; -} - -/** - * skb_padto - pad an skbuff up to a minimal size - * @skb: buffer to pad - * @len: minimal length - * - * Pads up a buffer to ensure the trailing bytes exist and are - * blanked. If the buffer already contains sufficient data it - * is untouched. Returns the buffer, which may be a replacement - * for the original, or NULL for out of memory - in which case - * the original buffer is still freed. 
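[Editorial note: skb_reserve(), skb_put(), the headroom/tailroom accessors earlier, and the skb_cow() rounding just above are all pointer arithmetic over one flat allocation delimited by head/data/tail/end. A self-contained sketch of that geometry; struct tbuf is an invented name and the 256-byte store is arbitrary:]

    #include <assert.h>
    #include <stdio.h>

    struct tbuf {
        unsigned char store[256];
        unsigned char *head, *data, *tail, *end;
    };

    static void tbuf_init(struct tbuf *b)
    {
        b->head = b->data = b->tail = b->store;
        b->end = b->store + sizeof(b->store);
    }

    /* skb_reserve(): shift the still-empty data area forward to create headroom. */
    static void tbuf_reserve(struct tbuf *b, unsigned int len)
    {
        b->data += len;
        b->tail += len;
    }

    /* skb_put(): extend the used area at the tail, returning the old tail. */
    static unsigned char *tbuf_put(struct tbuf *b, unsigned int len)
    {
        unsigned char *old = b->tail;

        b->tail += len;
        assert(b->tail <= b->end);   /* where skb_put() calls skb_over_panic() */
        return old;
    }

    int main(void)
    {
        struct tbuf b;

        tbuf_init(&b);
        tbuf_reserve(&b, 2);         /* NET_IP_ALIGN: 14-byte MAC header leaves IP aligned */
        tbuf_put(&b, 14);
        printf("headroom=%td tailroom=%td\n", b.data - b.head, b.end - b.tail);

        int delta = 9;               /* skb_cow() rounds headroom growth to 16 bytes */
        printf("(9 + 15) & ~15 = %d\n", (delta + 15) & ~15);
        return 0;
    }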
- */ - -static inline struct sk_buff *skb_padto(struct sk_buff *skb, unsigned int len) -{ - unsigned int size = skb->len; - if (likely(size >= len)) - return skb; - return skb_pad(skb, len-size); -} - -static inline int skb_add_data(struct sk_buff *skb, - char __user *from, int copy) -{ - const int off = skb->len; - - if (skb->ip_summed == CHECKSUM_NONE) { - int err = 0; - unsigned int csum = csum_and_copy_from_user(from, - skb_put(skb, copy), - copy, 0, &err); - if (!err) { - skb->csum = csum_block_add(skb->csum, csum, off); - return 0; - } - } else if (!copy_from_user(skb_put(skb, copy), from, copy)) - return 0; - - __skb_trim(skb, off); - return -EFAULT; -} - -static inline int skb_can_coalesce(struct sk_buff *skb, int i, - struct page *page, int off) -{ - if (i) { - struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[i - 1]; - - return page == frag->page && - off == frag->page_offset + frag->size; - } - return 0; -} - -/** - * skb_linearize - convert paged skb to linear one - * @skb: buffer to linarize - * @gfp: allocation mode - * - * If there is no free memory -ENOMEM is returned, otherwise zero - * is returned and the old skb data released. - */ -extern int __skb_linearize(struct sk_buff *skb, int gfp); -static inline int skb_linearize(struct sk_buff *skb, int gfp) -{ - return __skb_linearize(skb, gfp); -} - -static inline void *kmap_skb_frag(const skb_frag_t *frag) -{ -#ifdef CONFIG_HIGHMEM - BUG_ON(in_irq()); - - local_bh_disable(); -#endif - return kmap_atomic(frag->page, KM_SKB_DATA_SOFTIRQ); -} - -static inline void kunmap_skb_frag(void *vaddr) -{ - kunmap_atomic(vaddr, KM_SKB_DATA_SOFTIRQ); -#ifdef CONFIG_HIGHMEM - local_bh_enable(); -#endif -} - -#define skb_queue_walk(queue, skb) \ - for (skb = (queue)->next; \ - prefetch(skb->next), (skb != (struct sk_buff *)(queue)); \ - skb = skb->next) - - -extern struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, - int noblock, int *err); -extern unsigned int datagram_poll(struct file *file, struct socket *sock, - struct poll_table_struct *wait); -extern int skb_copy_datagram_iovec(const struct sk_buff *from, - int offset, struct iovec *to, - int size); -extern int skb_copy_and_csum_datagram_iovec(const - struct sk_buff *skb, - int hlen, - struct iovec *iov); -extern void skb_free_datagram(struct sock *sk, struct sk_buff *skb); -extern unsigned int skb_checksum(const struct sk_buff *skb, int offset, - int len, unsigned int csum); -extern int skb_copy_bits(const struct sk_buff *skb, int offset, - void *to, int len); -extern unsigned int skb_copy_and_csum_bits(const struct sk_buff *skb, - int offset, u8 *to, int len, - unsigned int csum); -extern void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); -extern void skb_split(struct sk_buff *skb, - struct sk_buff *skb1, const u32 len); - -static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, - int len, void *buffer) -{ - int hlen = skb_headlen(skb); - - if (offset + len <= hlen) - return skb->data + offset; - - if (skb_copy_bits(skb, offset, buffer, len) < 0) - return NULL; - - return buffer; -} - -extern void skb_init(void); -extern void skb_add_mtu(int mtu); - -struct skb_iter { - /* Iteration functions set these */ - unsigned char *data; - unsigned int len; - - /* Private to iteration */ - unsigned int nextfrag; - struct sk_buff *fraglist; -}; - -/* Keep iterating until skb_iter_next returns false. 
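[Editorial note: skb_header_pointer() above captures a common parsing idiom: return a direct pointer when the requested bytes are contiguous in the linear head, and fall back to copying them into a caller-supplied scratch buffer when they straddle fragments. A user-space rendering over a toy two-part packet; struct pkt and its field names are inventions for this sketch:]

    #include <stdio.h>
    #include <string.h>

    struct pkt {
        const unsigned char *lin;  int linlen;   /* linear head */
        const unsigned char *frag; int fraglen;  /* single "paged" fragment */
    };

    static const void *header_pointer(const struct pkt *p, int off, int len,
                                      void *scratch)
    {
        if (off + len <= p->linlen)
            return p->lin + off;                 /* fast path: no copy */
        if (off + len > p->linlen + p->fraglen)
            return NULL;                         /* would run past the packet */

        unsigned char *b = scratch;
        int n = off < p->linlen ? p->linlen - off : 0;

        if (n)
            memcpy(b, p->lin + off, n);          /* tail of the linear part */
        memcpy(b + n, p->frag + off + n - p->linlen, len - n);
        return scratch;
    }

    int main(void)
    {
        unsigned char buf[8];
        struct pkt p = { (const unsigned char *)"headpart", 8,
                         (const unsigned char *)"fragment", 8 };
        const unsigned char *h = header_pointer(&p, 6, 4, buf);

        printf("%.4s\n", h);                     /* "rtfr": spans the boundary */
        return 0;
    }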
*/ -extern void skb_iter_first(const struct sk_buff *skb, struct skb_iter *i); -extern int skb_iter_next(const struct sk_buff *skb, struct skb_iter *i); -/* Call this if aborting loop before !skb_iter_next */ -extern void skb_iter_abort(const struct sk_buff *skb, struct skb_iter *i); - -#ifdef CONFIG_NETFILTER -static inline void nf_conntrack_put(struct nf_conntrack *nfct) -{ - if (nfct && atomic_dec_and_test(&nfct->use)) - nfct->destroy(nfct); -} -static inline void nf_conntrack_get(struct nf_conntrack *nfct) -{ - if (nfct) - atomic_inc(&nfct->use); -} -static inline void nf_reset(struct sk_buff *skb) -{ - nf_conntrack_put(skb->nfct); - skb->nfct = NULL; -#ifdef CONFIG_NETFILTER_DEBUG - skb->nf_debug = 0; -#endif -} -static inline void nf_reset_debug(struct sk_buff *skb) -{ -#ifdef CONFIG_NETFILTER_DEBUG - skb->nf_debug = 0; -#endif -} - -#ifdef CONFIG_BRIDGE_NETFILTER -static inline void nf_bridge_put(struct nf_bridge_info *nf_bridge) -{ - if (nf_bridge && atomic_dec_and_test(&nf_bridge->use)) - kfree(nf_bridge); -} -static inline void nf_bridge_get(struct nf_bridge_info *nf_bridge) -{ - if (nf_bridge) - atomic_inc(&nf_bridge->use); -} -#endif /* CONFIG_BRIDGE_NETFILTER */ -#else /* CONFIG_NETFILTER */ -static inline void nf_reset(struct sk_buff *skb) {} -#endif /* CONFIG_NETFILTER */ - -#endif /* __KERNEL__ */ -#endif /* _LINUX_SKBUFF_H */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/kernel/irq/manage.c --- a/linux-2.6.11-xen-sparse/kernel/irq/manage.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,392 +0,0 @@ -/* - * linux/kernel/irq/manage.c - * - * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar - * - * This file contains driver APIs to the irq subsystem. - */ - -#include <linux/irq.h> -#include <linux/module.h> -#include <linux/random.h> -#include <linux/interrupt.h> - -#include "internals.h" - -#ifdef CONFIG_SMP - -cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; - -/** - * synchronize_irq - wait for pending IRQ handlers (on other CPUs) - * - * This function waits for any pending IRQ handlers for this interrupt - * to complete before returning. If you use this function while - * holding a resource the IRQ handler may need you will deadlock. - * - * This function may be called - with care - from IRQ context. - */ -void synchronize_irq(unsigned int irq) -{ - struct irq_desc *desc = irq_desc + irq; - - while (desc->status & IRQ_INPROGRESS) - cpu_relax(); -} - -EXPORT_SYMBOL(synchronize_irq); - -#endif - -/** - * disable_irq_nosync - disable an irq without waiting - * @irq: Interrupt to disable - * - * Disable the selected interrupt line. Disables and Enables are - * nested. - * Unlike disable_irq(), this function does not ensure existing - * instances of the IRQ handler have completed before returning. - * - * This function may be called from IRQ context. - */ -void disable_irq_nosync(unsigned int irq) -{ - irq_desc_t *desc = irq_desc + irq; - unsigned long flags; - - spin_lock_irqsave(&desc->lock, flags); - if (!desc->depth++) { - desc->status |= IRQ_DISABLED; - desc->handler->disable(irq); - } - spin_unlock_irqrestore(&desc->lock, flags); -} - -EXPORT_SYMBOL(disable_irq_nosync); - -/** - * disable_irq - disable an irq and wait for completion - * @irq: Interrupt to disable - * - * Disable the selected interrupt line. Enables and Disables are - * nested. - * This function waits for any pending IRQ handlers for this interrupt - * to complete before returning. 
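[Editorial note: the nf_conntrack_get()/nf_conntrack_put() pair earlier in this header is the classic intrusive reference count with a destructor callback: whichever thread drops the count to zero runs destroy(). The shape in portable C11 atomics; struct tracked and the function names are illustrative:]

    #include <stdatomic.h>
    #include <stdio.h>

    struct tracked {
        atomic_int use;                    /* like nfct->use */
        void (*destroy)(struct tracked *);
    };

    static void track_get(struct tracked *t)
    {
        if (t)
            atomic_fetch_add(&t->use, 1);
    }

    /* Whoever drops the last reference runs the destructor; this is the
     * atomic_dec_and_test() pattern from nf_conntrack_put(). */
    static void track_put(struct tracked *t)
    {
        if (t && atomic_fetch_sub(&t->use, 1) == 1)
            t->destroy(t);
    }

    static void say_gone(struct tracked *t)
    {
        (void)t;
        puts("destroyed");
    }

    int main(void)
    {
        struct tracked t = { 1, say_gone };

        track_get(&t);   /* second holder */
        track_put(&t);   /* 2 -> 1: nothing happens */
        track_put(&t);   /* 1 -> 0: destructor runs */
        return 0;
    }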
If you use this function while - * holding a resource the IRQ handler may need you will deadlock. - * - * This function may be called - with care - from IRQ context. - */ -void disable_irq(unsigned int irq) -{ - irq_desc_t *desc = irq_desc + irq; - - disable_irq_nosync(irq); - if (desc->action) - synchronize_irq(irq); -} - -EXPORT_SYMBOL(disable_irq); - -/** - * enable_irq - enable handling of an irq - * @irq: Interrupt to enable - * - * Undoes the effect of one call to disable_irq(). If this - * matches the last disable, processing of interrupts on this - * IRQ line is re-enabled. - * - * This function may be called from IRQ context. - */ -void enable_irq(unsigned int irq) -{ - irq_desc_t *desc = irq_desc + irq; - unsigned long flags; - - spin_lock_irqsave(&desc->lock, flags); - switch (desc->depth) { - case 0: - WARN_ON(1); - break; - case 1: { - unsigned int status = desc->status & ~IRQ_DISABLED; - - desc->status = status; - if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { - desc->status = status | IRQ_REPLAY; - hw_resend_irq(desc->handler,irq); - } - desc->handler->enable(irq); - /* fall-through */ - } - default: - desc->depth--; - } - spin_unlock_irqrestore(&desc->lock, flags); -} - -EXPORT_SYMBOL(enable_irq); - -/* - * Internal function that tells the architecture code whether a - * particular irq has been exclusively allocated or is available - * for driver use. - */ -int can_request_irq(unsigned int irq, unsigned long irqflags) -{ - struct irqaction *action; - - if (irq >= NR_IRQS) - return 0; - - action = irq_desc[irq].action; - if (action) - if (irqflags & action->flags & SA_SHIRQ) - action = NULL; - - return !action; -} - -/** - * setup_irq - register an irqaction structure - * @irq: Interrupt to register - * @irqaction: The irqaction structure to be registered - * - * Normally called by request_irq, this function can be used - * directly to allocate special interrupts that are part of the - * architecture. - */ -int setup_irq(unsigned int irq, struct irqaction * new) -{ - struct irq_desc *desc = irq_desc + irq; - struct irqaction *old, **p; - unsigned long flags; - int shared = 0; - - if (desc->handler == &no_irq_type) - return -ENOSYS; - /* - * Some drivers like serial.c use request_irq() heavily, - * so we have to be careful not to interfere with a - * running system. - */ - if (new->flags & SA_SAMPLE_RANDOM) { - /* - * This function might sleep, we want to call it first, - * outside of the atomic block. - * Yes, this might clear the entropy pool if the wrong - * driver is attempted to be loaded, without actually - * installing a new handler, but is this really a problem, - * only the sysadmin is able to do this. 
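[Editorial note: the depth counter driven by disable_irq()/enable_irq() above makes disables nest: only the first disable masks the line, and only the matching final enable unmasks it. In miniature, with user-space stand-ins and no locking:]

    #include <stdio.h>

    static int depth;         /* outstanding disables, like desc->depth */
    static int line_masked;   /* stands in for IRQ_DISABLED + handler->disable() */

    static void my_disable(void)
    {
        if (!depth++)         /* only the first disable touches the "hardware" */
            line_masked = 1;
    }

    static void my_enable(void)
    {
        if (depth == 0) {     /* unbalanced enable: WARN_ON(1) in the kernel */
            fprintf(stderr, "unbalanced enable\n");
            return;
        }
        if (--depth == 0)     /* only the last matching enable unmasks */
            line_masked = 0;
    }

    int main(void)
    {
        my_disable();
        my_disable();
        my_enable();
        printf("masked after 2 disables, 1 enable: %d\n", line_masked); /* 1 */
        my_enable();
        printf("masked after the matching enable: %d\n", line_masked);  /* 0 */
        return 0;
    }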
- */ - rand_initialize_irq(irq); - } - - /* - * The following block of code has to be executed atomically - */ - spin_lock_irqsave(&desc->lock,flags); - p = &desc->action; - if ((old = *p) != NULL) { - /* Can't share interrupts unless both agree to */ - if (!(old->flags & new->flags & SA_SHIRQ)) { - spin_unlock_irqrestore(&desc->lock,flags); - return -EBUSY; - } - - /* add new interrupt at end of irq queue */ - do { - p = &old->next; - old = *p; - } while (old); - shared = 1; - } - - *p = new; - - if (!shared) { - desc->depth = 0; - desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT | - IRQ_WAITING | IRQ_INPROGRESS); - if (desc->handler->startup) - desc->handler->startup(irq); - else - desc->handler->enable(irq); - } - spin_unlock_irqrestore(&desc->lock,flags); - - new->irq = irq; - register_irq_proc(irq); - new->dir = NULL; - register_handler_proc(irq, new); - - return 0; -} - -/* - * teardown_irq - unregister an irqaction - * @irq: Interrupt line being freed - * @old: Pointer to the irqaction that is to be unregistered - * - * This function is called by free_irq and does the actual - * business of unregistering the handler. It exists as a - * seperate function to enable handlers to be unregistered - * for irqactions that have been allocated statically at - * boot time. - * - * This function must not be called from interrupt context. - */ -int teardown_irq(unsigned int irq, struct irqaction * old) -{ - struct irq_desc *desc; - struct irqaction **p; - unsigned long flags; - - if (irq >= NR_IRQS) - return -ENOENT; - - desc = irq_desc + irq; - spin_lock_irqsave(&desc->lock,flags); - p = &desc->action; - for (;;) { - struct irqaction * action = *p; - - if (action) { - struct irqaction **pp = p; - - p = &action->next; - if (action != old) - continue; - - /* Found it - now remove it from the list of entries */ - *pp = action->next; - if (!desc->action) { - desc->status |= IRQ_DISABLED; - if (desc->handler->shutdown) - desc->handler->shutdown(irq); - else - desc->handler->disable(irq); - } - spin_unlock_irqrestore(&desc->lock,flags); - unregister_handler_proc(irq, action); - - /* Make sure it's not being used on another CPU */ - synchronize_irq(irq); - return 0; - } - printk(KERN_ERR "Trying to teardown free IRQ%d\n",irq); - spin_unlock_irqrestore(&desc->lock,flags); - return -ENOENT; - } -} - -/** - * free_irq - free an interrupt - * @irq: Interrupt line to free - * @dev_id: Device identity to free - * - * Remove an interrupt handler. The handler is removed and if the - * interrupt line is no longer in use by any driver it is disabled. - * On a shared IRQ the caller must ensure the interrupt is disabled - * on the card it drives before calling this function. The function - * does not return until any executing interrupts for this IRQ - * have completed. - * - * This function must not be called from interrupt context. 
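[Editorial note: both setup_irq() above and teardown_irq() below walk desc->action holding a pointer to the link itself (struct irqaction **p). That lets appending and unlinking update whichever pointer names the node, whether it is the list head or a ->next field, with no head special case. Stand-alone, with invented names:]

    #include <stdio.h>
    #include <stddef.h>

    struct action { int id; struct action *next; };

    /* Append at the tail by walking the link *locations*, the same
     * pointer-to-pointer idiom setup_irq() uses on desc->action. */
    static void append(struct action **p, struct action *new)
    {
        while (*p)
            p = &(*p)->next;
        new->next = NULL;
        *p = new;
    }

    /* Unlink a node by overwriting the one link that points at it. */
    static int unlink_action(struct action **p, struct action *old)
    {
        for (; *p; p = &(*p)->next)
            if (*p == old) {
                *p = old->next;
                return 0;
            }
        return -1;   /* the kernel version returns -ENOENT here */
    }

    int main(void)
    {
        struct action a = { 1, NULL }, b = { 2, NULL };
        struct action *head = NULL;

        append(&head, &a);
        append(&head, &b);
        unlink_action(&head, &a);
        printf("head id: %d\n", head->id);   /* 2 */
        return 0;
    }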
- */ -void free_irq(unsigned int irq, void *dev_id) -{ - struct irq_desc *desc; - struct irqaction *action; - unsigned long flags; - - if (irq >= NR_IRQS) - return; - - desc = irq_desc + irq; - spin_lock_irqsave(&desc->lock,flags); - for (action = desc->action; action != NULL; action = action->next) { - if (action->dev_id != dev_id) - continue; - - spin_unlock_irqrestore(&desc->lock,flags); - - if (teardown_irq(irq, action) == 0) - kfree(action); - return; - } - printk(KERN_ERR "Trying to free free IRQ%d\n",irq); - spin_unlock_irqrestore(&desc->lock,flags); - return; -} - -EXPORT_SYMBOL(free_irq); - -/** - * request_irq - allocate an interrupt line - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs - * @irqflags: Interrupt type flags - * @devname: An ascii name for the claiming device - * @dev_id: A cookie passed back to the handler function - * - * This call allocates interrupt resources and enables the - * interrupt line and IRQ handling. From the point this - * call is made your handler function may be invoked. Since - * your handler function must clear any interrupt the board - * raises, you must take care both to initialise your hardware - * and to set up the interrupt handler in the right order. - * - * Dev_id must be globally unique. Normally the address of the - * device data structure is used as the cookie. Since the handler - * receives this value it makes sense to use it. - * - * If your interrupt is shared you must pass a non NULL dev_id - * as this is required when freeing the interrupt. - * - * Flags: - * - * SA_SHIRQ Interrupt is shared - * SA_INTERRUPT Disable local interrupts while processing - * SA_SAMPLE_RANDOM The interrupt can be used for entropy - * - */ -int request_irq(unsigned int irq, - irqreturn_t (*handler)(int, void *, struct pt_regs *), - unsigned long irqflags, const char * devname, void *dev_id) -{ - struct irqaction * action; - int retval; - - /* - * Sanity-check: shared interrupts must pass in a real dev-ID, - * otherwise we'll have trouble later trying to figure out - * which interrupt is which (messes up the interrupt freeing - * logic etc). - */ - if ((irqflags & SA_SHIRQ) && !dev_id) - return -EINVAL; - if (irq >= NR_IRQS) - return -EINVAL; - if (!handler) - return -EINVAL; - - action = kmalloc(sizeof(struct irqaction), GFP_ATOMIC); - if (!action) - return -ENOMEM; - - action->handler = handler; - action->flags = irqflags; - cpus_clear(action->mask); - action->name = devname; - action->next = NULL; - action->dev_id = dev_id; - - retval = setup_irq(irq, action); - if (retval) - kfree(action); - - return retval; -} - -EXPORT_SYMBOL(request_irq); - diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/mkbuildtree --- a/linux-2.6.11-xen-sparse/mkbuildtree Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,111 +0,0 @@ -#!/bin/bash - -# mkbuildtree <build tree> -# -# Creates symbolic links in <build tree> for the sparse tree -# in the current directory. - -# Script to determine the relative path between two directories. -# Copyright (c) D. J. Hawkey Jr. 2002 -# Fixed for Xen project by K. Fraser in 2003. 
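[Editorial note: for context, a driver of this vintage would claim a shared line roughly as sketched below. Only the request_irq()/free_irq() signatures, the SA_SHIRQ flag and the IRQ_HANDLED/IRQ_NONE returns come from the code above; struct my_dev, my_handler and the two device helpers are hypothetical, and this fragment is a sketch, not a buildable module:]

    /* Hypothetical 2.6.11-era driver fragment. */
    static irqreturn_t my_handler(int irq, void *dev_id, struct pt_regs *regs)
    {
        struct my_dev *dev = dev_id;      /* the cookie passed to request_irq() */

        if (!my_dev_irq_pending(dev))     /* shared line: make sure it is ours */
            return IRQ_NONE;
        my_dev_ack(dev);                  /* quiesce the board before returning */
        return IRQ_HANDLED;
    }

    /* probe: a non-NULL dev_id is mandatory with SA_SHIRQ */
    err = request_irq(dev->irq, my_handler, SA_SHIRQ, "mydev", dev);
    if (err)
        return err;

    /* remove: the same dev_id selects our action on the shared list */
    free_irq(dev->irq, dev);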
-abs_to_rel () -{ - local CWD SRCPATH - - if [ "$1" != "/" -a "${1##*[^/]}" = "/" ]; then - SRCPATH=${1%?} - else - SRCPATH=$1 - fi - if [ "$2" != "/" -a "${2##*[^/]}" = "/" ]; then - DESTPATH=${2%?} - else - DESTPATH=$2 - fi - - CWD=$PWD - [ "${1%%[^/]*}" != "/" ] && cd $1 && SRCPATH=$PWD - [ "${2%%[^/]*}" != "/" ] && cd $2 && DESTPATH=$PWD - [ "$CWD" != "$PWD" ] && cd $CWD - - BASEPATH=$SRCPATH - - [ "$SRCPATH" = "$DESTPATH" ] && DESTPATH="." && return - [ "$SRCPATH" = "/" ] && DESTPATH=${DESTPATH#?} && return - - while [ "$BASEPATH/" != "${DESTPATH%${DESTPATH#$BASEPATH/}}" ]; do - BASEPATH=${BASEPATH%/*} - done - - SRCPATH=${SRCPATH#$BASEPATH} - DESTPATH=${DESTPATH#$BASEPATH} - DESTPATH=${DESTPATH#?} - while [ -n "$SRCPATH" ]; do - SRCPATH=${SRCPATH%/*} - DESTPATH="../$DESTPATH" - done - - [ -z "$BASEPATH" ] && BASEPATH="/" - [ "${DESTPATH##*[^/]}" = "/" ] && DESTPATH=${DESTPATH%?} -} - -# relative_lndir <target_dir> -# Creates a tree of symlinks in the current working directory that mirror -# real files in <target_dir>. <target_dir> should be relative to the current -# working directory. Symlinks in <target_dir> are ignored. Source-control files -# are ignored. -relative_lndir () -{ - local SYMLINK_DIR REAL_DIR pref i j - SYMLINK_DIR=$PWD - REAL_DIR=$1 - ( - cd $REAL_DIR - for i in `find . -type d | grep -v SCCS`; do - [ -d $SYMLINK_DIR/$i ] || mkdir -p $SYMLINK_DIR/$i - ( - cd $i - pref=`echo $i | sed -e 's#/[^/]*#../#g' -e 's#^\.##'` - for j in `find . -maxdepth 1 -type f -o -type l`; do - ln -sf ${pref}${REAL_DIR}/$i/$j ${SYMLINK_DIR}/$i/$j - done - ) - done - ) -} - -[ "$1" == "" ] && { echo "Syntax: $0 <linux tree to xenify>"; exit 1; } - -# Get absolute path to the destination directory -pushd . >/dev/null -cd ${1} || { echo "cannot cd to ${1}"; exit 1; } -AD=$PWD -popd >/dev/null - -# Get absolute path to the source directory -AS=`pwd` - -# Get path to source, relative to destination -abs_to_rel ${AD} ${AS} -RS=$DESTPATH - -# Remove old copies of files and directories at the destination -for i in `find . -type f -o -type l` ; do rm -f ${AD}/${i#./} ; done - -# We now work from the destination directory -cd ${AD} || { echo "cannot cd to ${AD}"; exit 1; } - -# Remove old symlinks -for i in `find . -type l`; do rm -f $i; done - -# Create symlinks of files and directories which exist in the sparse source -relative_lndir ${RS} -rm -f mkbuildtree - - -# Create links to the shared definitions of the Xen interfaces. -rm -rf ${AD}/include/asm-xen/xen-public -mkdir ${AD}/include/asm-xen/xen-public -cd ${AD}/include/asm-xen/xen-public -relative_lndir ../../../${RS}/../xen/include/public - diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/mm/highmem.c --- a/linux-2.6.11-xen-sparse/mm/highmem.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,614 +0,0 @@ -/* - * High memory handling common code and variables. - * - * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@xxxxxxx - * Gerhard Wichert, Siemens AG, Gerhard.Wichert@xxxxxxxxxxxxxx - * - * - * Redesigned the x86 32-bit VM architecture to deal with - * 64-bit physical space. With current x86 CPUs this - * means up to 64 Gigabytes physical RAM. - * - * Rewrote high memory support to move the page cache into - * high memory. Implemented permanent (schedulable) kmaps - * based on Linus' idea. 
- * - * Copyright (C) 1999 Ingo Molnar <mingo@xxxxxxxxxx> - */ - -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/swap.h> -#include <linux/bio.h> -#include <linux/pagemap.h> -#include <linux/mempool.h> -#include <linux/blkdev.h> -#include <linux/init.h> -#include <linux/hash.h> -#include <linux/highmem.h> -#include <asm/tlbflush.h> - -static mempool_t *page_pool, *isa_page_pool; - -static void *page_pool_alloc(int gfp_mask, void *data) -{ - int gfp = gfp_mask | (int) (long) data; - - return alloc_page(gfp); -} - -static void page_pool_free(void *page, void *data) -{ - __free_page(page); -} - -/* - * Virtual_count is not a pure "count". - * 0 means that it is not mapped, and has not been mapped - * since a TLB flush - it is usable. - * 1 means that there are no users, but it has been mapped - * since the last TLB flush - so we can't use it. - * n means that there are (n-1) current users of it. - */ -#ifdef CONFIG_HIGHMEM -static int pkmap_count[LAST_PKMAP]; -static unsigned int last_pkmap_nr; -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); - -pte_t * pkmap_page_table; - -static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); - -static void flush_all_zero_pkmaps(void) -{ - int i; - - flush_cache_kmaps(); - - for (i = 0; i < LAST_PKMAP; i++) { - struct page *page; - - /* - * zero means we don't have anything to do, - * >1 means that it is still in use. Only - * a count of 1 means that it is free but - * needs to be unmapped - */ - if (pkmap_count[i] != 1) - continue; - pkmap_count[i] = 0; - - /* sanity check */ - if (pte_none(pkmap_page_table[i])) - BUG(); - - /* - * Don't need an atomic fetch-and-clear op here; - * no-one has the page mapped, and cannot get at - * its virtual address (and hence PTE) without first - * getting the kmap_lock (which is held here). - * So no dangers, even with speculative execution. - */ - page = pte_page(pkmap_page_table[i]); - pte_clear(&pkmap_page_table[i]); - - set_page_address(page, NULL); - } - flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); -} - -static inline unsigned long map_new_virtual(struct page *page) -{ - unsigned long vaddr; - int count; - -start: - count = LAST_PKMAP; - /* Find an empty entry */ - for (;;) { - last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK; - if (!last_pkmap_nr) { - flush_all_zero_pkmaps(); - count = LAST_PKMAP; - } - if (!pkmap_count[last_pkmap_nr]) - break; /* Found a usable entry */ - if (--count) - continue; - - /* - * Sleep for somebody else to unmap their entries - */ - { - DECLARE_WAITQUEUE(wait, current); - - __set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&pkmap_map_wait, &wait); - spin_unlock(&kmap_lock); - schedule(); - remove_wait_queue(&pkmap_map_wait, &wait); - spin_lock(&kmap_lock); - - /* Somebody else might have mapped it while we slept */ - if (page_address(page)) - return (unsigned long)page_address(page); - - /* Re-start */ - goto start; - } - } - vaddr = PKMAP_ADDR(last_pkmap_nr); - set_pte(&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot)); - - pkmap_count[last_pkmap_nr] = 1; - set_page_address(page, (void *)vaddr); - - return vaddr; -} - -void kmap_flush_unused(void) -{ - spin_lock(&kmap_lock); - flush_all_zero_pkmaps(); - spin_unlock(&kmap_lock); -} - -EXPORT_SYMBOL(kmap_flush_unused); - -void fastcall *kmap_high(struct page *page) -{ - unsigned long vaddr; - - /* - * For highmem pages, we can't trust "virtual" until - * after we have the lock. 
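[Editorial note: the pkmap_count[] convention above (0 = unmapped and TLB-clean, 1 = unused but mapped since the last flush, n = n-1 users) is worth seeing in isolation. This toy allocator, with an invented slot count and names, reuses count==0 slots immediately and batch-recycles count==1 slots the way flush_all_zero_pkmaps() does:]

    #include <stdio.h>

    #define SLOTS 8
    static int count[SLOTS];   /* the pkmap_count[] encoding: 0, 1, or users+1 */

    static int find_usable_slot(void)
    {
        int i, freed = -1;

        for (i = 0; i < SLOTS; i++)
            if (count[i] == 0)
                return i;              /* unmapped and flushed: usable right now */

        /* Every slot has been touched since the last "TLB flush": recycle
         * all slots with no users in one batch, as flush_all_zero_pkmaps()
         * does, so one flush pays for many reuses. */
        for (i = 0; i < SLOTS; i++)
            if (count[i] == 1) {
                count[i] = 0;
                if (freed < 0)
                    freed = i;
            }
        return freed;                  /* -1: all slots have real users, must sleep */
    }

    int main(void)
    {
        int i;

        for (i = 0; i < SLOTS; i++)
            count[i] = 2;              /* one active user each */
        count[3] = 1;                  /* mapped, but no users any more */
        printf("recycled slot %d\n", find_usable_slot());   /* 3 */
        return 0;
    }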
- * - * We cannot call this from interrupts, as it may block - */ - spin_lock(&kmap_lock); - vaddr = (unsigned long)page_address(page); - if (!vaddr) - vaddr = map_new_virtual(page); - pkmap_count[PKMAP_NR(vaddr)]++; - if (pkmap_count[PKMAP_NR(vaddr)] < 2) - BUG(); - spin_unlock(&kmap_lock); - return (void*) vaddr; -} - -EXPORT_SYMBOL(kmap_high); - -void fastcall kunmap_high(struct page *page) -{ - unsigned long vaddr; - unsigned long nr; - int need_wakeup; - - spin_lock(&kmap_lock); - vaddr = (unsigned long)page_address(page); - if (!vaddr) - BUG(); - nr = PKMAP_NR(vaddr); - - /* - * A count must never go down to zero - * without a TLB flush! - */ - need_wakeup = 0; - switch (--pkmap_count[nr]) { - case 0: - BUG(); - case 1: - /* - * Avoid an unnecessary wake_up() function call. - * The common case is pkmap_count[] == 1, but - * no waiters. - * The tasks queued in the wait-queue are guarded - * by both the lock in the wait-queue-head and by - * the kmap_lock. As the kmap_lock is held here, - * no need for the wait-queue-head's lock. Simply - * test if the queue is empty. - */ - need_wakeup = waitqueue_active(&pkmap_map_wait); - } - spin_unlock(&kmap_lock); - - /* do wake-up, if needed, race-free outside of the spin lock */ - if (need_wakeup) - wake_up(&pkmap_map_wait); -} - -EXPORT_SYMBOL(kunmap_high); - -#define POOL_SIZE 64 - -static __init int init_emergency_pool(void) -{ - struct sysinfo i; - si_meminfo(&i); - si_swapinfo(&i); - - if (!i.totalhigh) - return 0; - - page_pool = mempool_create(POOL_SIZE, page_pool_alloc, page_pool_free, NULL); - if (!page_pool) - BUG(); - printk("highmem bounce pool size: %d pages\n", POOL_SIZE); - - return 0; -} - -__initcall(init_emergency_pool); - -/* - * highmem version, map in to vec - */ -static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) -{ - unsigned long flags; - unsigned char *vto; - - local_irq_save(flags); - vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ); - memcpy(vto + to->bv_offset, vfrom, to->bv_len); - kunmap_atomic(vto, KM_BOUNCE_READ); - local_irq_restore(flags); -} - -#else /* CONFIG_HIGHMEM */ - -#define bounce_copy_vec(to, vfrom) \ - memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len) - -#endif - -#define ISA_POOL_SIZE 16 - -/* - * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA - * as the max address, so check if the pool has already been created. - */ -int init_emergency_isa_pool(void) -{ - if (isa_page_pool) - return 0; - - isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc, page_pool_free, (void *) __GFP_DMA); - if (!isa_page_pool) - BUG(); - - printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE); - return 0; -} - -/* - * Simple bounce buffer support for highmem pages. Depending on the - * queue gfp mask set, *to may or may not be a highmem page. 
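[Editorial note: kunmap_high() above decides whether a wake-up is needed while still holding kmap_lock, but issues it only after the unlock, so a woken sleeper never immediately blocks on a lock the waker still holds. The same discipline with pthreads; compile with -pthread, names invented:]

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  waiters = PTHREAD_COND_INITIALIZER;
    static int users = 2;

    static void put_mapping(void)
    {
        int need_wakeup;

        pthread_mutex_lock(&lock);
        need_wakeup = (--users == 1);   /* decide under the lock... */
        pthread_mutex_unlock(&lock);
        if (need_wakeup)                /* ...but signal outside it, as kunmap_high does */
            pthread_cond_broadcast(&waiters);
    }

    int main(void)
    {
        put_mapping();
        printf("users now %d\n", users);
        return 0;
    }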
kmap it - * always, it will do the Right Thing - */ -static void copy_to_high_bio_irq(struct bio *to, struct bio *from) -{ - unsigned char *vfrom; - struct bio_vec *tovec, *fromvec; - int i; - - __bio_for_each_segment(tovec, to, i, 0) { - fromvec = from->bi_io_vec + i; - - /* - * not bounced - */ - if (tovec->bv_page == fromvec->bv_page) - continue; - - /* - * fromvec->bv_offset and fromvec->bv_len might have been - * modified by the block layer, so use the original copy, - * bounce_copy_vec already uses tovec->bv_len - */ - vfrom = page_address(fromvec->bv_page) + tovec->bv_offset; - - flush_dcache_page(tovec->bv_page); - bounce_copy_vec(tovec, vfrom); - } -} - -static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) -{ - struct bio *bio_orig = bio->bi_private; - struct bio_vec *bvec, *org_vec; - int i; - - if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) - set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags); - - /* - * free up bounce indirect pages used - */ - __bio_for_each_segment(bvec, bio, i, 0) { - org_vec = bio_orig->bi_io_vec + i; - if (bvec->bv_page == org_vec->bv_page) - continue; - - mempool_free(bvec->bv_page, pool); - } - - bio_endio(bio_orig, bio_orig->bi_size, err); - bio_put(bio); -} - -static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done,int err) -{ - if (bio->bi_size) - return 1; - - bounce_end_io(bio, page_pool, err); - return 0; -} - -static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err) -{ - if (bio->bi_size) - return 1; - - bounce_end_io(bio, isa_page_pool, err); - return 0; -} - -static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) -{ - struct bio *bio_orig = bio->bi_private; - - if (test_bit(BIO_UPTODATE, &bio->bi_flags)) - copy_to_high_bio_irq(bio_orig, bio); - - bounce_end_io(bio, pool, err); -} - -static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err) -{ - if (bio->bi_size) - return 1; - - __bounce_end_io_read(bio, page_pool, err); - return 0; -} - -static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err) -{ - if (bio->bi_size) - return 1; - - __bounce_end_io_read(bio, isa_page_pool, err); - return 0; -} - -static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, - mempool_t *pool) -{ - struct page *page; - struct bio *bio = NULL; - int i, rw = bio_data_dir(*bio_orig); - struct bio_vec *to, *from; - - bio_for_each_segment(from, *bio_orig, i) { - page = from->bv_page; - - /* - * is destination page below bounce pfn? 
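[Editorial note: the bounce machinery above boils down to one rule: if a page sits above the address range the device can reach, stage the data through a low page instead, copying before the I/O on the write path and after it on the read path. A toy write-path version; the limit, the static pool page and all names are inventions:]

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define BOUNCE_LIMIT 0x100000UL      /* pretend pfns above this are unreachable */

    static unsigned char low_page[4096]; /* stands in for mempool_alloc(pool, ...) */

    /* Write path: return a buffer the "device" may read, bouncing if needed. */
    static const void *maybe_bounce(const void *data, size_t len, uintptr_t pfn)
    {
        if (pfn < BOUNCE_LIMIT)
            return data;                 /* directly reachable: no copy */
        memcpy(low_page, data, len);     /* stage through the low page first */
        return low_page;
    }

    int main(void)
    {
        const char msg[] = "payload";
        const void *io = maybe_bounce(msg, sizeof(msg), 0x200000UL);

        printf("bounced: %d\n", io == (const void *)low_page);   /* 1 */
        return 0;
    }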
- */ - if (page_to_pfn(page) < q->bounce_pfn) - continue; - - /* - * irk, bounce it - */ - if (!bio) - bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt); - - to = bio->bi_io_vec + i; - - to->bv_page = mempool_alloc(pool, q->bounce_gfp); - to->bv_len = from->bv_len; - to->bv_offset = from->bv_offset; - - if (rw == WRITE) { - char *vto, *vfrom; - - flush_dcache_page(from->bv_page); - vto = page_address(to->bv_page) + to->bv_offset; - vfrom = kmap(from->bv_page) + from->bv_offset; - memcpy(vto, vfrom, to->bv_len); - kunmap(from->bv_page); - } - } - - /* - * no pages bounced - */ - if (!bio) - return; - - /* - * at least one page was bounced, fill in possible non-highmem - * pages - */ - __bio_for_each_segment(from, *bio_orig, i, 0) { - to = bio_iovec_idx(bio, i); - if (!to->bv_page) { - to->bv_page = from->bv_page; - to->bv_len = from->bv_len; - to->bv_offset = from->bv_offset; - } - } - - bio->bi_bdev = (*bio_orig)->bi_bdev; - bio->bi_flags |= (1 << BIO_BOUNCED); - bio->bi_sector = (*bio_orig)->bi_sector; - bio->bi_rw = (*bio_orig)->bi_rw; - - bio->bi_vcnt = (*bio_orig)->bi_vcnt; - bio->bi_idx = (*bio_orig)->bi_idx; - bio->bi_size = (*bio_orig)->bi_size; - - if (pool == page_pool) { - bio->bi_end_io = bounce_end_io_write; - if (rw == READ) - bio->bi_end_io = bounce_end_io_read; - } else { - bio->bi_end_io = bounce_end_io_write_isa; - if (rw == READ) - bio->bi_end_io = bounce_end_io_read_isa; - } - - bio->bi_private = *bio_orig; - *bio_orig = bio; -} - -void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig) -{ - mempool_t *pool; - - /* - * for non-isa bounce case, just check if the bounce pfn is equal - * to or bigger than the highest pfn in the system -- in that case, - * don't waste time iterating over bio segments - */ - if (!(q->bounce_gfp & GFP_DMA)) { - if (q->bounce_pfn >= blk_max_pfn) - return; - pool = page_pool; - } else { - BUG_ON(!isa_page_pool); - pool = isa_page_pool; - } - - /* - * slow path - */ - __blk_queue_bounce(q, bio_orig, pool); -} - -EXPORT_SYMBOL(blk_queue_bounce); - -#if defined(HASHED_PAGE_VIRTUAL) - -#define PA_HASH_ORDER 7 - -/* - * Describes one page->virtual association - */ -struct page_address_map { - struct page *page; - void *virtual; - struct list_head list; -}; - -/* - * page_address_map freelist, allocated from page_address_maps. 
- */ -static struct list_head page_address_pool; /* freelist */ -static spinlock_t pool_lock; /* protects page_address_pool */ - -/* - * Hash table bucket - */ -static struct page_address_slot { - struct list_head lh; /* List of page_address_maps */ - spinlock_t lock; /* Protect this bucket's list */ -} ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER]; - -static struct page_address_slot *page_slot(struct page *page) -{ - return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)]; -} - -void *page_address(struct page *page) -{ - unsigned long flags; - void *ret; - struct page_address_slot *pas; - - if (!PageHighMem(page)) - return lowmem_page_address(page); - - pas = page_slot(page); - ret = NULL; - spin_lock_irqsave(&pas->lock, flags); - if (!list_empty(&pas->lh)) { - struct page_address_map *pam; - - list_for_each_entry(pam, &pas->lh, list) { - if (pam->page == page) { - ret = pam->virtual; - goto done; - } - } - } -done: - spin_unlock_irqrestore(&pas->lock, flags); - return ret; -} - -EXPORT_SYMBOL(page_address); - -void set_page_address(struct page *page, void *virtual) -{ - unsigned long flags; - struct page_address_slot *pas; - struct page_address_map *pam; - - BUG_ON(!PageHighMem(page)); - - pas = page_slot(page); - if (virtual) { /* Add */ - BUG_ON(list_empty(&page_address_pool)); - - spin_lock_irqsave(&pool_lock, flags); - pam = list_entry(page_address_pool.next, - struct page_address_map, list); - list_del(&pam->list); - spin_unlock_irqrestore(&pool_lock, flags); - - pam->page = page; - pam->virtual = virtual; - - spin_lock_irqsave(&pas->lock, flags); - list_add_tail(&pam->list, &pas->lh); - spin_unlock_irqrestore(&pas->lock, flags); - } else { /* Remove */ - spin_lock_irqsave(&pas->lock, flags); - list_for_each_entry(pam, &pas->lh, list) { - if (pam->page == page) { - list_del(&pam->list); - spin_unlock_irqrestore(&pas->lock, flags); - spin_lock_irqsave(&pool_lock, flags); - list_add_tail(&pam->list, &page_address_pool); - spin_unlock_irqrestore(&pool_lock, flags); - goto done; - } - } - spin_unlock_irqrestore(&pas->lock, flags); - } -done: - return; -} - -static struct page_address_map page_address_maps[LAST_PKMAP]; - -void __init page_address_init(void) -{ - int i; - - INIT_LIST_HEAD(&page_address_pool); - for (i = 0; i < ARRAY_SIZE(page_address_maps); i++) - list_add(&page_address_maps[i].list, &page_address_pool); - for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) { - INIT_LIST_HEAD(&page_address_htable[i].lh); - spin_lock_init(&page_address_htable[i].lock); - } - spin_lock_init(&pool_lock); -} - -#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/mm/memory.c --- a/linux-2.6.11-xen-sparse/mm/memory.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,2321 +0,0 @@ -/* - * linux/mm/memory.c - * - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - */ - -/* - * demand-loading started 01.12.91 - seems it is high on the list of - * things wanted, and it should be easy to implement. - Linus - */ - -/* - * Ok, demand-loading was easy, shared pages a little bit tricker. Shared - * pages started 02.12.91, seems to work. - Linus. - * - * Tested sharing by executing about 30 /bin/sh: under the old kernel it - * would have taken more than the 6M I have free, but it worked well as - * far as I could see. - * - * Also corrected some "invalidate()"s - I wasn't doing enough of them. - */ - -/* - * Real VM (paging to/from disk) started 18.12.91. 
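[Editorial note: page_address() above scales by hashing the struct page pointer into one of 128 buckets, each with its own spinlock, so lookups of unrelated pages never contend. Bucket selection in stand-alone form; the multiplier below is a common 64-bit pointer-hash constant chosen for this sketch, not the kernel's hash_ptr() verbatim:]

    #include <stdint.h>
    #include <stdio.h>

    #define PA_HASH_ORDER 7
    #define BUCKETS (1 << PA_HASH_ORDER)

    /* Multiplicative pointer hash: the high bits of the product are the
     * best mixed, so keep those as the bucket index. */
    static unsigned int slot_of(const void *p)
    {
        uint64_t x = (uintptr_t)p;

        x *= 0x9e3779b97f4a7c15ULL;      /* golden-ratio multiplier */
        return (unsigned int)(x >> (64 - PA_HASH_ORDER));
    }

    int main(void)
    {
        int a, b;

        printf("bucket of &a: %u, bucket of &b: %u (of %d)\n",
               slot_of(&a), slot_of(&b), BUCKETS);
        return 0;
    }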
Much more work and - * thought has to go into this. Oh, well.. - * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. - * Found it. Everything seems to work now. - * 20.12.91 - Ok, making the swap-device changeable like the root. - */ - -/* - * 05.04.94 - Multi-page memory management added for v1.1. - * Idea by Alex Bligh (alex@xxxxxxxxxxxxxxx) - * - * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG - * (Gerhard.Wichert@xxxxxxxxxxxxxx) - * - * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) - */ - -#include <linux/kernel_stat.h> -#include <linux/mm.h> -#include <linux/hugetlb.h> -#include <linux/mman.h> -#include <linux/swap.h> -#include <linux/highmem.h> -#include <linux/pagemap.h> -#include <linux/rmap.h> -#include <linux/acct.h> -#include <linux/module.h> -#include <linux/init.h> - -#include <asm/pgalloc.h> -#include <asm/uaccess.h> -#include <asm/tlb.h> -#include <asm/tlbflush.h> -#include <asm/pgtable.h> - -#include <linux/swapops.h> -#include <linux/elf.h> - -#ifndef CONFIG_DISCONTIGMEM -/* use the per-pgdat data instead for discontigmem - mbligh */ -unsigned long max_mapnr; -struct page *mem_map; - -EXPORT_SYMBOL(max_mapnr); -EXPORT_SYMBOL(mem_map); -#endif - -unsigned long num_physpages; -/* - * A number of key systems in x86 including ioremap() rely on the assumption - * that high_memory defines the upper bound on direct map memory, then end - * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and - * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL - * and ZONE_HIGHMEM. - */ -void * high_memory; -unsigned long vmalloc_earlyreserve; - -EXPORT_SYMBOL(num_physpages); -EXPORT_SYMBOL(high_memory); -EXPORT_SYMBOL(vmalloc_earlyreserve); - -/* - * Note: this doesn't free the actual pages themselves. That - * has been handled earlier when unmapping all the memory regions. 
- */ -static inline void clear_pmd_range(struct mmu_gather *tlb, pmd_t *pmd, unsigned long start, unsigned long end) -{ - struct page *page; - - if (pmd_none(*pmd)) - return; - if (unlikely(pmd_bad(*pmd))) { - pmd_ERROR(*pmd); - pmd_clear(pmd); - return; - } - if (!((start | end) & ~PMD_MASK)) { - /* Only clear full, aligned ranges */ - page = pmd_page(*pmd); - pmd_clear(pmd); - dec_page_state(nr_page_table_pages); - tlb->mm->nr_ptes--; - pte_free_tlb(tlb, page); - } -} - -static inline void clear_pud_range(struct mmu_gather *tlb, pud_t *pud, unsigned long start, unsigned long end) -{ - unsigned long addr = start, next; - pmd_t *pmd, *__pmd; - - if (pud_none(*pud)) - return; - if (unlikely(pud_bad(*pud))) { - pud_ERROR(*pud); - pud_clear(pud); - return; - } - - pmd = __pmd = pmd_offset(pud, start); - do { - next = (addr + PMD_SIZE) & PMD_MASK; - if (next > end || next <= addr) - next = end; - - clear_pmd_range(tlb, pmd, addr, next); - pmd++; - addr = next; - } while (addr && (addr < end)); - - if (!((start | end) & ~PUD_MASK)) { - /* Only clear full, aligned ranges */ - pud_clear(pud); - pmd_free_tlb(tlb, __pmd); - } -} - - -static inline void clear_pgd_range(struct mmu_gather *tlb, pgd_t *pgd, unsigned long start, unsigned long end) -{ - unsigned long addr = start, next; - pud_t *pud, *__pud; - - if (pgd_none(*pgd)) - return; - if (unlikely(pgd_bad(*pgd))) { - pgd_ERROR(*pgd); - pgd_clear(pgd); - return; - } - - pud = __pud = pud_offset(pgd, start); - do { - next = (addr + PUD_SIZE) & PUD_MASK; - if (next > end || next <= addr) - next = end; - - clear_pud_range(tlb, pud, addr, next); - pud++; - addr = next; - } while (addr && (addr < end)); - - if (!((start | end) & ~PGDIR_MASK)) { - /* Only clear full, aligned ranges */ - pgd_clear(pgd); - pud_free_tlb(tlb, __pud); - } -} - -/* - * This function clears user-level page tables of a process. - * - * Must be called with pagetable lock held. - */ -void clear_page_range(struct mmu_gather *tlb, unsigned long start, unsigned long end) -{ - unsigned long addr = start, next; - pgd_t * pgd = pgd_offset(tlb->mm, start); - unsigned long i; - - for (i = pgd_index(start); i <= pgd_index(end-1); i++) { - next = (addr + PGDIR_SIZE) & PGDIR_MASK; - if (next > end || next <= addr) - next = end; - - clear_pgd_range(tlb, pgd, addr, next); - pgd++; - addr = next; - } -} - -pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address) -{ - if (!pmd_present(*pmd)) { - struct page *new; - - spin_unlock(&mm->page_table_lock); - new = pte_alloc_one(mm, address); - spin_lock(&mm->page_table_lock); - if (!new) - return NULL; - /* - * Because we dropped the lock, we should re-check the - * entry, as somebody else could have populated it.. - */ - if (pmd_present(*pmd)) { - pte_free(new); - goto out; - } - mm->nr_ptes++; - inc_page_state(nr_page_table_pages); - pmd_populate(mm, pmd, new); - } -out: - return pte_offset_map(pmd, address); -} - -pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address) -{ - if (!pmd_present(*pmd)) { - pte_t *new; - - spin_unlock(&mm->page_table_lock); - new = pte_alloc_one_kernel(mm, address); - spin_lock(&mm->page_table_lock); - if (!new) - return NULL; - - /* - * Because we dropped the lock, we should re-check the - * entry, as somebody else could have populated it.. 
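[Editorial note: pte_alloc_map() above shows the standard answer to "I need to allocate, but allocation can sleep and I hold a spinlock": drop the lock, allocate, retake the lock, then recheck, because another thread may have installed the entry in the window. With pthreads; compile with -pthread, names invented:]

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
    static void *entry;                    /* stands in for the pmd slot */

    static void *get_or_alloc(void)
    {
        void *ret;

        pthread_mutex_lock(&table_lock);
        if (!entry) {
            pthread_mutex_unlock(&table_lock);
            void *fresh = malloc(64);      /* may "sleep": must not hold the lock */
            pthread_mutex_lock(&table_lock);
            if (entry)
                free(fresh);               /* lost the race: someone else populated it */
            else
                entry = fresh;
        }
        ret = entry;
        pthread_mutex_unlock(&table_lock);
        return ret;
    }

    int main(void)
    {
        printf("entry at %p\n", get_or_alloc());
        return 0;
    }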
- */ - if (pmd_present(*pmd)) { - pte_free_kernel(new); - goto out; - } - pmd_populate_kernel(mm, pmd, new); - } -out: - return pte_offset_kernel(pmd, address); -} - -/* - * copy one vm_area from one task to the other. Assumes the page tables - * already present in the new task to be cleared in the whole range - * covered by this vma. - * - * dst->page_table_lock is held on entry and exit, - * but may be dropped within p[mg]d_alloc() and pte_alloc_map(). - */ - -static inline void -copy_swap_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t pte) -{ - if (pte_file(pte)) - return; - swap_duplicate(pte_to_swp_entry(pte)); - if (list_empty(&dst_mm->mmlist)) { - spin_lock(&mmlist_lock); - list_add(&dst_mm->mmlist, &src_mm->mmlist); - spin_unlock(&mmlist_lock); - } -} - -static inline void -copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags, - unsigned long addr) -{ - pte_t pte = *src_pte; - struct page *page; - unsigned long pfn; - - /* pte contains position in swap, so copy. */ - if (!pte_present(pte)) { - copy_swap_pte(dst_mm, src_mm, pte); - set_pte(dst_pte, pte); - return; - } - pfn = pte_pfn(pte); - /* the pte points outside of valid memory, the - * mapping is assumed to be good, meaningful - * and not mapped via rmap - duplicate the - * mapping as is. - */ - page = NULL; - if (pfn_valid(pfn)) - page = pfn_to_page(pfn); - - if (!page || PageReserved(page)) { - set_pte(dst_pte, pte); - return; - } - - /* - * If it's a COW mapping, write protect it both - * in the parent and the child - */ - if ((vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE) { - ptep_set_wrprotect(src_pte); - pte = *src_pte; - } - - /* - * If it's a shared mapping, mark it clean in - * the child - */ - if (vm_flags & VM_SHARED) - pte = pte_mkclean(pte); - pte = pte_mkold(pte); - get_page(page); - dst_mm->rss++; - if (PageAnon(page)) - dst_mm->anon_rss++; - set_pte(dst_pte, pte); - page_dup_rmap(page); -} - -static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, - unsigned long addr, unsigned long end) -{ - pte_t *src_pte, *dst_pte; - pte_t *s, *d; - unsigned long vm_flags = vma->vm_flags; - - d = dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr); - if (!dst_pte) - return -ENOMEM; - - spin_lock(&src_mm->page_table_lock); - s = src_pte = pte_offset_map_nested(src_pmd, addr); - for (; addr < end; addr += PAGE_SIZE, s++, d++) { - if (pte_none(*s)) - continue; - copy_one_pte(dst_mm, src_mm, d, s, vm_flags, addr); - } - pte_unmap_nested(src_pte); - pte_unmap(dst_pte); - spin_unlock(&src_mm->page_table_lock); - cond_resched_lock(&dst_mm->page_table_lock); - return 0; -} - -static int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, - unsigned long addr, unsigned long end) -{ - pmd_t *src_pmd, *dst_pmd; - int err = 0; - unsigned long next; - - src_pmd = pmd_offset(src_pud, addr); - dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); - if (!dst_pmd) - return -ENOMEM; - - for (; addr < end; addr = next, src_pmd++, dst_pmd++) { - next = (addr + PMD_SIZE) & PMD_MASK; - if (next > end || next <= addr) - next = end; - if (pmd_none(*src_pmd)) - continue; - if (pmd_bad(*src_pmd)) { - pmd_ERROR(*src_pmd); - pmd_clear(src_pmd); - continue; - } - err = copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, - vma, addr, next); - if (err) - break; - } - return err; -} - -static int copy_pud_range(struct 
mm_struct *dst_mm, struct mm_struct *src_mm, - pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, - unsigned long addr, unsigned long end) -{ - pud_t *src_pud, *dst_pud; - int err = 0; - unsigned long next; - - src_pud = pud_offset(src_pgd, addr); - dst_pud = pud_alloc(dst_mm, dst_pgd, addr); - if (!dst_pud) - return -ENOMEM; - - for (; addr < end; addr = next, src_pud++, dst_pud++) { - next = (addr + PUD_SIZE) & PUD_MASK; - if (next > end || next <= addr) - next = end; - if (pud_none(*src_pud)) - continue; - if (pud_bad(*src_pud)) { - pud_ERROR(*src_pud); - pud_clear(src_pud); - continue; - } - err = copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, - vma, addr, next); - if (err) - break; - } - return err; -} - -int copy_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma) -{ - pgd_t *src_pgd, *dst_pgd; - unsigned long addr, start, end, next; - int err = 0; - - if (is_vm_hugetlb_page(vma)) - return copy_hugetlb_page_range(dst, src, vma); - - start = vma->vm_start; - src_pgd = pgd_offset(src, start); - dst_pgd = pgd_offset(dst, start); - - end = vma->vm_end; - addr = start; - while (addr && (addr < end-1)) { - next = (addr + PGDIR_SIZE) & PGDIR_MASK; - if (next > end || next <= addr) - next = end; - if (pgd_none(*src_pgd)) - goto next_pgd; - if (pgd_bad(*src_pgd)) { - pgd_ERROR(*src_pgd); - pgd_clear(src_pgd); - goto next_pgd; - } - err = copy_pud_range(dst, src, dst_pgd, src_pgd, - vma, addr, next); - if (err) - break; - -next_pgd: - src_pgd++; - dst_pgd++; - addr = next; - } - - return err; -} - -static void zap_pte_range(struct mmu_gather *tlb, - pmd_t *pmd, unsigned long address, - unsigned long size, struct zap_details *details) -{ - unsigned long offset; - pte_t *ptep; - - if (pmd_none(*pmd)) - return; - if (unlikely(pmd_bad(*pmd))) { - pmd_ERROR(*pmd); - pmd_clear(pmd); - return; - } - ptep = pte_offset_map(pmd, address); - offset = address & ~PMD_MASK; - if (offset + size > PMD_SIZE) - size = PMD_SIZE - offset; - size &= PAGE_MASK; - if (details && !details->check_mapping && !details->nonlinear_vma) - details = NULL; - for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) { - pte_t pte = *ptep; - if (pte_none(pte)) - continue; - if (pte_present(pte)) { - struct page *page = NULL; - unsigned long pfn = pte_pfn(pte); - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); - if (PageReserved(page)) - page = NULL; - } - if (unlikely(details) && page) { - /* - * unmap_shared_mapping_pages() wants to - * invalidate cache without truncating: - * unmap shared but keep private pages. - */ - if (details->check_mapping && - details->check_mapping != page->mapping) - continue; - /* - * Each page->index must be checked when - * invalidating or truncating nonlinear. - */ - if (details->nonlinear_vma && - (page->index < details->first_index || - page->index > details->last_index)) - continue; - } - pte = ptep_get_and_clear(ptep); - tlb_remove_tlb_entry(tlb, ptep, address+offset); - if (unlikely(!page)) - continue; - if (unlikely(details) && details->nonlinear_vma - && linear_page_index(details->nonlinear_vma, - address+offset) != page->index) - set_pte(ptep, pgoff_to_pte(page->index)); - if (pte_dirty(pte)) - set_page_dirty(page); - if (PageAnon(page)) - tlb->mm->anon_rss--; - else if (pte_young(pte)) - mark_page_accessed(page); - tlb->freed++; - page_remove_rmap(page); - tlb_remove_page(tlb, page); - continue; - } - /* - * If details->check_mapping, we leave swap entries; - * if details->nonlinear_vma, we leave file entries. 
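[Editorial note: every level of the walkers above advances with the same three lines: round up to the next aligned boundary, clamp to the end, and treat wraparound (next <= addr) as the end as well. Extracted and runnable, with CHUNK standing in for PMD_SIZE:]

    #include <stdio.h>

    #define CHUNK      0x200000UL          /* stands in for PMD_SIZE (2 MB) */
    #define CHUNK_MASK (~(CHUNK - 1))      /* stands in for PMD_MASK */

    int main(void)
    {
        unsigned long addr = 0x1f0000UL, end = 0x650000UL, next;

        while (addr < end) {
            next = (addr + CHUNK) & CHUNK_MASK;  /* next aligned boundary */
            if (next > end || next <= addr)      /* clamp; catch wraparound */
                next = end;
            printf("chunk [%#lx, %#lx)\n", addr, next);
            addr = next;
        }
        return 0;
    }

The first and last chunks come out short, so only interior iterations cover a full, aligned CHUNK, which is exactly what lets clear_pmd_range() free a page table only for "full, aligned ranges".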
- */ - if (unlikely(details)) - continue; - if (!pte_file(pte)) - free_swap_and_cache(pte_to_swp_entry(pte)); - pte_clear(ptep); - } - pte_unmap(ptep-1); -} - -static void zap_pmd_range(struct mmu_gather *tlb, - pud_t *pud, unsigned long address, - unsigned long size, struct zap_details *details) -{ - pmd_t * pmd; - unsigned long end; - - if (pud_none(*pud)) - return; - if (unlikely(pud_bad(*pud))) { - pud_ERROR(*pud); - pud_clear(pud); - return; - } - pmd = pmd_offset(pud, address); - end = address + size; - if (end > ((address + PUD_SIZE) & PUD_MASK)) - end = ((address + PUD_SIZE) & PUD_MASK); - do { - zap_pte_range(tlb, pmd, address, end - address, details); - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); -} - -static void zap_pud_range(struct mmu_gather *tlb, - pgd_t * pgd, unsigned long address, - unsigned long end, struct zap_details *details) -{ - pud_t * pud; - - if (pgd_none(*pgd)) - return; - if (unlikely(pgd_bad(*pgd))) { - pgd_ERROR(*pgd); - pgd_clear(pgd); - return; - } - pud = pud_offset(pgd, address); - do { - zap_pmd_range(tlb, pud, address, end - address, details); - address = (address + PUD_SIZE) & PUD_MASK; - pud++; - } while (address && (address < end)); -} - -static void unmap_page_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, unsigned long address, - unsigned long end, struct zap_details *details) -{ - unsigned long next; - pgd_t *pgd; - int i; - - BUG_ON(address >= end); - pgd = pgd_offset(vma->vm_mm, address); - tlb_start_vma(tlb, vma); - for (i = pgd_index(address); i <= pgd_index(end-1); i++) { - next = (address + PGDIR_SIZE) & PGDIR_MASK; - if (next <= address || next > end) - next = end; - zap_pud_range(tlb, pgd, address, next, details); - address = next; - pgd++; - } - tlb_end_vma(tlb, vma); -} - -#ifdef CONFIG_PREEMPT -# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) -#else -/* No preempt: go for improved straight-line efficiency */ -# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) -#endif - -/** - * unmap_vmas - unmap a range of memory covered by a list of vma's - * @tlbp: address of the caller's struct mmu_gather - * @mm: the controlling mm_struct - * @vma: the starting vma - * @start_addr: virtual address at which to start unmapping - * @end_addr: virtual address at which to end unmapping - * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here - * @details: details of nonlinear truncation or shared cache invalidation - * - * Returns the number of vma's which were covered by the unmapping. - * - * Unmap all pages in the vma list. Called under page_table_lock. - * - * We aim to not hold page_table_lock for too long (for scheduling latency - * reasons). So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to - * return the ending mmu_gather to the caller. - * - * Only addresses between `start' and `end' will be unmapped. - * - * The VMA list must be sorted in ascending virtual address order. - * - * unmap_vmas() assumes that the caller will flush the whole unmapped address - * range after unmap_vmas() returns. So the only responsibility here is to - * ensure that any thus-far unmapped pages are flushed before unmap_vmas() - * drops the lock and schedules. 
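[Editorial note: unmap_vmas(), documented above and defined just below, caps how long page_table_lock is held by working in ZAP_BLOCK_SIZE batches and yielding between batches when someone needs the CPU or the lock. A stripped-down pthreads rendering of that batching; compile with -pthread, the batch size and names are arbitrary:]

    #include <pthread.h>
    #include <sched.h>
    #include <stdio.h>

    static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Process [start, end) in fixed-size batches, releasing the lock
     * between batches so other threads get a turn (the ZAP_BLOCK_SIZE idea). */
    static void process_range(unsigned long start, unsigned long end,
                              unsigned long batch)
    {
        pthread_mutex_lock(&big_lock);
        while (start < end) {
            unsigned long n = end - start < batch ? end - start : batch;

            /* ... up to n units of work done under the lock ... */
            start += n;

            pthread_mutex_unlock(&big_lock);
            sched_yield();               /* analogous to cond_resched() */
            pthread_mutex_lock(&big_lock);
        }
        pthread_mutex_unlock(&big_lock);
    }

    int main(void)
    {
        process_range(0, 10, 3);
        puts("done in batches of 3");
        return 0;
    }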
- */ -int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr, unsigned long *nr_accounted, - struct zap_details *details) -{ - unsigned long zap_bytes = ZAP_BLOCK_SIZE; - unsigned long tlb_start = 0; /* For tlb_finish_mmu */ - int tlb_start_valid = 0; - int ret = 0; - spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; - int fullmm = tlb_is_full_mm(*tlbp); - - for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { - unsigned long start; - unsigned long end; - - start = max(vma->vm_start, start_addr); - if (start >= vma->vm_end) - continue; - end = min(vma->vm_end, end_addr); - if (end <= vma->vm_start) - continue; - - if (vma->vm_flags & VM_ACCOUNT) - *nr_accounted += (end - start) >> PAGE_SHIFT; - - ret++; - while (start != end) { - unsigned long block; - - if (!tlb_start_valid) { - tlb_start = start; - tlb_start_valid = 1; - } - - if (is_vm_hugetlb_page(vma)) { - block = end - start; - unmap_hugepage_range(vma, start, end); - } else { - block = min(zap_bytes, end - start); - unmap_page_range(*tlbp, vma, start, - start + block, details); - } - - start += block; - zap_bytes -= block; - if ((long)zap_bytes > 0) - continue; - - tlb_finish_mmu(*tlbp, tlb_start, start); - - if (need_resched() || - need_lockbreak(&mm->page_table_lock) || - (i_mmap_lock && need_lockbreak(i_mmap_lock))) { - if (i_mmap_lock) { - /* must reset count of rss freed */ - *tlbp = tlb_gather_mmu(mm, fullmm); - details->break_addr = start; - goto out; - } - spin_unlock(&mm->page_table_lock); - cond_resched(); - spin_lock(&mm->page_table_lock); - } - - *tlbp = tlb_gather_mmu(mm, fullmm); - tlb_start_valid = 0; - zap_bytes = ZAP_BLOCK_SIZE; - } - } -out: - return ret; -} - -/** - * zap_page_range - remove user pages in a given range - * @vma: vm_area_struct holding the applicable pages - * @address: starting address of pages to zap - * @size: number of bytes to zap - * @details: details of nonlinear truncation or shared cache invalidation - */ -void zap_page_range(struct vm_area_struct *vma, unsigned long address, - unsigned long size, struct zap_details *details) -{ - struct mm_struct *mm = vma->vm_mm; - struct mmu_gather *tlb; - unsigned long end = address + size; - unsigned long nr_accounted = 0; - - if (is_vm_hugetlb_page(vma)) { - zap_hugepage_range(vma, address, size); - return; - } - - lru_add_drain(); - spin_lock(&mm->page_table_lock); - tlb = tlb_gather_mmu(mm, 0); - unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); - tlb_finish_mmu(tlb, address, end); - acct_update_integrals(); - spin_unlock(&mm->page_table_lock); -} - -/* - * Do a quick page-table lookup for a single page. - * mm->page_table_lock must be held. - */ -static struct page * -__follow_page(struct mm_struct *mm, unsigned long address, int read, int write) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep, pte; - unsigned long pfn; - struct page *page; - - page = follow_huge_addr(mm, address, write); - if (! 
IS_ERR(page)) - return page; - - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - goto out; - - pud = pud_offset(pgd, address); - if (pud_none(*pud) || unlikely(pud_bad(*pud))) - goto out; - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - goto out; - if (pmd_huge(*pmd)) - return follow_huge_pmd(mm, address, pmd, write); - - ptep = pte_offset_map(pmd, address); - if (!ptep) - goto out; - - pte = *ptep; - pte_unmap(ptep); - if (pte_present(pte)) { - if (write && !pte_write(pte)) - goto out; - if (read && !pte_read(pte)) - goto out; - pfn = pte_pfn(pte); - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); - if (write && !pte_dirty(pte) && !PageDirty(page)) - set_page_dirty(page); - mark_page_accessed(page); - return page; - } - } - -out: - return NULL; -} - -struct page * -follow_page(struct mm_struct *mm, unsigned long address, int write) -{ - return __follow_page(mm, address, /*read*/0, write); -} - -int -check_user_page_readable(struct mm_struct *mm, unsigned long address) -{ - return __follow_page(mm, address, /*read*/1, /*write*/0) != NULL; -} - -EXPORT_SYMBOL(check_user_page_readable); - -/* - * Given a physical address, is there a useful struct page pointing to - * it? This may become more complex in the future if we start dealing - * with IO-aperture pages for direct-IO. - */ - -static inline struct page *get_page_map(struct page *page) -{ - if (!pfn_valid(page_to_pfn(page))) - return NULL; - return page; -} - - -static inline int -untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma, - unsigned long address) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - - /* Check if the vma is for an anonymous mapping. */ - if (vma->vm_ops && vma->vm_ops->nopage) - return 0; - - /* Check if page directory entry exists. */ - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - return 1; - - pud = pud_offset(pgd, address); - if (pud_none(*pud) || unlikely(pud_bad(*pud))) - return 1; - - /* Check if page middle directory entry exists. */ - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - return 1; - - /* There is a pte slot for 'address' in 'mm'. */ - return 0; -} - - -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int write, int force, - struct page **pages, struct vm_area_struct **vmas) -{ - int i; - unsigned int flags; - - /* - * Require read or write permissions. - * If 'force' is set, we only require the "MAY" flags. - */ - flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); - flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); - i = 0; - - do { - struct vm_area_struct * vma; - - vma = find_extend_vma(mm, start); - if (!vma && in_gate_area(tsk, start)) { - unsigned long pg = start & PAGE_MASK; - struct vm_area_struct *gate_vma = get_gate_vma(tsk); - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - if (write) /* user gate pages are read-only */ - return i ? 
: -EFAULT; - if (pg > TASK_SIZE) - pgd = pgd_offset_k(pg); - else - pgd = pgd_offset_gate(mm, pg); - BUG_ON(pgd_none(*pgd)); - pud = pud_offset(pgd, pg); - BUG_ON(pud_none(*pud)); - pmd = pmd_offset(pud, pg); - BUG_ON(pmd_none(*pmd)); - pte = pte_offset_map(pmd, pg); - BUG_ON(pte_none(*pte)); - if (pages) { - pages[i] = pte_page(*pte); - get_page(pages[i]); - } - pte_unmap(pte); - if (vmas) - vmas[i] = gate_vma; - i++; - start += PAGE_SIZE; - len--; - continue; - } - - if (vma && (vma->vm_flags & VM_FOREIGN)) - { - struct page **map = vma->vm_private_data; - int offset = (start - vma->vm_start) >> PAGE_SHIFT; - - if (map[offset] != NULL) { - if (pages) { - pages[i] = map[offset]; - } - if (vmas) - vmas[i] = vma; - i++; - start += PAGE_SIZE; - len--; - continue; - } - } - - if (!vma || (vma->vm_flags & VM_IO) - || !(flags & vma->vm_flags)) - return i ? : -EFAULT; - - if (is_vm_hugetlb_page(vma)) { - i = follow_hugetlb_page(mm, vma, pages, vmas, - &start, &len, i); - continue; - } - spin_lock(&mm->page_table_lock); - do { - struct page *map; - int lookup_write = write; - - cond_resched_lock(&mm->page_table_lock); - while (!(map = follow_page(mm, start, lookup_write))) { - /* - * Shortcut for anonymous pages. We don't want - * to force the creation of pages tables for - * insanly big anonymously mapped areas that - * nobody touched so far. This is important - * for doing a core dump for these mappings. - */ - if (!lookup_write && - untouched_anonymous_page(mm,vma,start)) { - map = ZERO_PAGE(start); - break; - } - spin_unlock(&mm->page_table_lock); - switch (handle_mm_fault(mm,vma,start,write)) { - case VM_FAULT_MINOR: - tsk->min_flt++; - break; - case VM_FAULT_MAJOR: - tsk->maj_flt++; - break; - case VM_FAULT_SIGBUS: - return i ? i : -EFAULT; - case VM_FAULT_OOM: - return i ? i : -ENOMEM; - default: - BUG(); - } - /* - * Now that we have performed a write fault - * and surely no longer have a shared page we - * shouldn't write, we shouldn't ignore an - * unwritable page in the page table if - * we are forcing write access. 
- */ - lookup_write = write && !force; - spin_lock(&mm->page_table_lock); - } - if (pages) { - pages[i] = get_page_map(map); - if (!pages[i]) { - spin_unlock(&mm->page_table_lock); - while (i--) - page_cache_release(pages[i]); - i = -EFAULT; - goto out; - } - flush_dcache_page(pages[i]); - if (!PageReserved(pages[i])) - page_cache_get(pages[i]); - } - if (vmas) - vmas[i] = vma; - i++; - start += PAGE_SIZE; - len--; - } while(len && start < vma->vm_end); - spin_unlock(&mm->page_table_lock); - } while(len); -out: - return i; -} - -EXPORT_SYMBOL(get_user_pages); - -static void zeromap_pte_range(pte_t * pte, unsigned long address, - unsigned long size, pgprot_t prot) -{ - unsigned long end; - - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; - do { - pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot)); - BUG_ON(!pte_none(*pte)); - set_pte(pte, zero_pte); - address += PAGE_SIZE; - pte++; - } while (address && (address < end)); -} - -static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, - unsigned long address, unsigned long size, pgprot_t prot) -{ - unsigned long base, end; - - base = address & PUD_MASK; - address &= ~PUD_MASK; - end = address + size; - if (end > PUD_SIZE) - end = PUD_SIZE; - do { - pte_t * pte = pte_alloc_map(mm, pmd, base + address); - if (!pte) - return -ENOMEM; - zeromap_pte_range(pte, base + address, end - address, prot); - pte_unmap(pte); - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); - return 0; -} - -static inline int zeromap_pud_range(struct mm_struct *mm, pud_t * pud, - unsigned long address, - unsigned long size, pgprot_t prot) -{ - unsigned long base, end; - int error = 0; - - base = address & PGDIR_MASK; - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - do { - pmd_t * pmd = pmd_alloc(mm, pud, base + address); - error = -ENOMEM; - if (!pmd) - break; - error = zeromap_pmd_range(mm, pmd, base + address, - end - address, prot); - if (error) - break; - address = (address + PUD_SIZE) & PUD_MASK; - pud++; - } while (address && (address < end)); - return 0; -} - -int zeromap_page_range(struct vm_area_struct *vma, unsigned long address, - unsigned long size, pgprot_t prot) -{ - int i; - int error = 0; - pgd_t * pgd; - unsigned long beg = address; - unsigned long end = address + size; - unsigned long next; - struct mm_struct *mm = vma->vm_mm; - - pgd = pgd_offset(mm, address); - flush_cache_range(vma, beg, end); - BUG_ON(address >= end); - BUG_ON(end > vma->vm_end); - - spin_lock(&mm->page_table_lock); - for (i = pgd_index(address); i <= pgd_index(end-1); i++) { - pud_t *pud = pud_alloc(mm, pgd, address); - error = -ENOMEM; - if (!pud) - break; - next = (address + PGDIR_SIZE) & PGDIR_MASK; - if (next <= beg || next > end) - next = end; - error = zeromap_pud_range(mm, pud, address, - next - address, prot); - if (error) - break; - address = next; - pgd++; - } - /* - * Why flush? zeromap_pte_range has a BUG_ON for !pte_none() - */ - flush_tlb_range(vma, beg, end); - spin_unlock(&mm->page_table_lock); - return error; -} - -/* - * maps a range of physical memory into the requested pages. the old - * mappings are removed. 
any references to nonexistent pages results - * in null mappings (currently treated as "copy-on-access") - */ -static inline void -remap_pte_range(pte_t * pte, unsigned long address, unsigned long size, - unsigned long pfn, pgprot_t prot) -{ - unsigned long end; - - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; - do { - BUG_ON(!pte_none(*pte)); - if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn))) - set_pte(pte, pfn_pte(pfn, prot)); - address += PAGE_SIZE; - pfn++; - pte++; - } while (address && (address < end)); -} - -static inline int -remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, - unsigned long size, unsigned long pfn, pgprot_t prot) -{ - unsigned long base, end; - - base = address & PUD_MASK; - address &= ~PUD_MASK; - end = address + size; - if (end > PUD_SIZE) - end = PUD_SIZE; - pfn -= (address >> PAGE_SHIFT); - do { - pte_t * pte = pte_alloc_map(mm, pmd, base + address); - if (!pte) - return -ENOMEM; - remap_pte_range(pte, base + address, end - address, - (address >> PAGE_SHIFT) + pfn, prot); - pte_unmap(pte); - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); - return 0; -} - -static inline int remap_pud_range(struct mm_struct *mm, pud_t * pud, - unsigned long address, unsigned long size, - unsigned long pfn, pgprot_t prot) -{ - unsigned long base, end; - int error; - - base = address & PGDIR_MASK; - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - pfn -= address >> PAGE_SHIFT; - do { - pmd_t *pmd = pmd_alloc(mm, pud, base+address); - error = -ENOMEM; - if (!pmd) - break; - error = remap_pmd_range(mm, pmd, base + address, end - address, - (address >> PAGE_SHIFT) + pfn, prot); - if (error) - break; - address = (address + PUD_SIZE) & PUD_MASK; - pud++; - } while (address && (address < end)); - return error; -} - -/* Note: this is only safe if the mm semaphore is held when called. */ -int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, - unsigned long pfn, unsigned long size, pgprot_t prot) -{ - int error = 0; - pgd_t *pgd; - unsigned long beg = from; - unsigned long end = from + size; - unsigned long next; - struct mm_struct *mm = vma->vm_mm; - int i; - - pfn -= from >> PAGE_SHIFT; - pgd = pgd_offset(mm, from); - flush_cache_range(vma, beg, end); - BUG_ON(from >= end); - - /* - * Physically remapped pages are special. Tell the - * rest of the world about it: - * VM_IO tells people not to look at these pages - * (accesses can have side effects). - * VM_RESERVED tells swapout not to try to touch - * this region. - */ - vma->vm_flags |= VM_IO | VM_RESERVED; - - spin_lock(&mm->page_table_lock); - for (i = pgd_index(beg); i <= pgd_index(end-1); i++) { - pud_t *pud = pud_alloc(mm, pgd, from); - error = -ENOMEM; - if (!pud) - break; - next = (from + PGDIR_SIZE) & PGDIR_MASK; - if (next > end || next <= from) - next = end; - error = remap_pud_range(mm, pud, from, end - from, - pfn + (from >> PAGE_SHIFT), prot); - if (error) - break; - from = next; - pgd++; - } - /* - * Why flush? remap_pte_range has a BUG_ON for !pte_none() - */ - flush_tlb_range(vma, beg, end); - spin_unlock(&mm->page_table_lock); - - return error; -} - -EXPORT_SYMBOL(remap_pfn_range); - -/* - * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when - * servicing faults for write access. In the normal case, do always want - * pte_mkwrite. 
But get_user_pages can cause write faults for mappings - * that do not have writing enabled, when used by access_process_vm. - */ -static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) -{ - if (likely(vma->vm_flags & VM_WRITE)) - pte = pte_mkwrite(pte); - return pte; -} - -/* - * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock - */ -static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, - pte_t *page_table) -{ - pte_t entry; - - flush_cache_page(vma, address); - entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)), - vma); - ptep_establish(vma, address, page_table, entry); - update_mmu_cache(vma, address, entry); -} - -/* - * This routine handles present pages, when users try to write - * to a shared page. It is done by copying the page to a new address - * and decrementing the shared-page counter for the old page. - * - * Goto-purists beware: the only reason for goto's here is that it results - * in better assembly code.. The "default" path will see no jumps at all. - * - * Note that this routine assumes that the protection checks have been - * done by the caller (the low-level page fault routine in most cases). - * Thus we can safely just mark it writable once we've done any necessary - * COW. - * - * We also mark the page dirty at this point even though the page will - * change only once the write actually happens. This avoids a few races, - * and potentially makes it more efficient. - * - * We hold the mm semaphore and the page_table_lock on entry and exit - * with the page_table_lock released. - */ -static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte) -{ - struct page *old_page, *new_page; - unsigned long pfn = pte_pfn(pte); - pte_t entry; - - if (unlikely(!pfn_valid(pfn))) { - /* - * This should really halt the system so it can be debugged or - * at least the kernel stops what it's doing before it corrupts - * data, but for the moment just pretend this is OOM. - */ - pte_unmap(page_table); - printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n", - address); - spin_unlock(&mm->page_table_lock); - return VM_FAULT_OOM; - } - old_page = pfn_to_page(pfn); - - if (!TestSetPageLocked(old_page)) { - int reuse = can_share_swap_page(old_page); - unlock_page(old_page); - if (reuse) { - flush_cache_page(vma, address); - entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)), - vma); - ptep_set_access_flags(vma, address, page_table, entry, 1); - update_mmu_cache(vma, address, entry); - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); - return VM_FAULT_MINOR; - } - } - pte_unmap(page_table); - - /* - * Ok, we need to copy. Oh, well.. 
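Aside (illustrative, not from the deleted file): the reuse-versus-copy decision do_wp_page() makes can be reduced to a refcount test. 'struct upage' below is a made-up userspace type; the ZERO_PAGE and PageReserved special cases are omitted.

#include <stdlib.h>
#include <string.h>

#define UPAGE_SIZE 4096

struct upage {
	int refcount;
	unsigned char data[UPAGE_SIZE];
};

/* Returns the page the faulting writer may now modify, or NULL on OOM. */
struct upage *cow_break(struct upage *old)
{
	struct upage *new;

	if (old->refcount == 1)
		return old;	/* sole user: reuse in place, like the
				 * can_share_swap_page() fast path */

	new = malloc(sizeof(*new));
	if (!new)
		return NULL;	/* the VM_FAULT_OOM case */
	memcpy(new->data, old->data, UPAGE_SIZE);
	new->refcount = 1;
	old->refcount--;	/* drop our reference to the shared copy */
	return new;
}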
- */ - if (!PageReserved(old_page)) - page_cache_get(old_page); - spin_unlock(&mm->page_table_lock); - - if (unlikely(anon_vma_prepare(vma))) - goto no_new_page; - if (old_page == ZERO_PAGE(address)) { - new_page = alloc_zeroed_user_highpage(vma, address); - if (!new_page) - goto no_new_page; - } else { - new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); - if (!new_page) - goto no_new_page; - copy_user_highpage(new_page, old_page, address); - } - /* - * Re-check the pte - we dropped the lock - */ - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, address); - if (likely(pte_same(*page_table, pte))) { - if (PageAnon(old_page)) - mm->anon_rss--; - if (PageReserved(old_page)) { - ++mm->rss; - acct_update_integrals(); - update_mem_hiwater(); - } else - page_remove_rmap(old_page); - break_cow(vma, new_page, address, page_table); - lru_cache_add_active(new_page); - page_add_anon_rmap(new_page, vma, address); - - /* Free the old page.. */ - new_page = old_page; - } - pte_unmap(page_table); - page_cache_release(new_page); - page_cache_release(old_page); - spin_unlock(&mm->page_table_lock); - return VM_FAULT_MINOR; - -no_new_page: - page_cache_release(old_page); - return VM_FAULT_OOM; -} - -/* - * Helper functions for unmap_mapping_range(). - * - * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __ - * - * We have to restart searching the prio_tree whenever we drop the lock, - * since the iterator is only valid while the lock is held, and anyway - * a later vma might be split and reinserted earlier while lock dropped. - * - * The list of nonlinear vmas could be handled more efficiently, using - * a placeholder, but handle it in the same way until a need is shown. - * It is important to search the prio_tree before nonlinear list: a vma - * may become nonlinear and be shifted from prio_tree to nonlinear list - * while the lock is dropped; but never shifted from list to prio_tree. - * - * In order to make forward progress despite restarting the search, - * vm_truncate_count is used to mark a vma as now dealt with, so we can - * quickly skip it next time around. Since the prio_tree search only - * shows us those vmas affected by unmapping the range in question, we - * can't efficiently keep all vmas in step with mapping->truncate_count: - * so instead reset them all whenever it wraps back to 0 (then go to 1). - * mapping->truncate_count and vma->vm_truncate_count are protected by - * i_mmap_lock. - * - * In order to make forward progress despite repeatedly restarting some - * large vma, note the break_addr set by unmap_vmas when it breaks out: - * and restart from that address when we reach that vma again. It might - * have been split or merged, shrunk or extended, but never shifted: so - * restart_addr remains valid so long as it remains in the vma's range. - * unmap_mapping_range forces truncate_count to leap over page-aligned - * values so we can save vma's restart_addr in its truncate_count field. 
- */ -#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK)) - -static void reset_vma_truncate_counts(struct address_space *mapping) -{ - struct vm_area_struct *vma; - struct prio_tree_iter iter; - - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) - vma->vm_truncate_count = 0; - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) - vma->vm_truncate_count = 0; -} - -static int unmap_mapping_range_vma(struct vm_area_struct *vma, - unsigned long start_addr, unsigned long end_addr, - struct zap_details *details) -{ - unsigned long restart_addr; - int need_break; - -again: - restart_addr = vma->vm_truncate_count; - if (is_restart_addr(restart_addr) && start_addr < restart_addr) { - start_addr = restart_addr; - if (start_addr >= end_addr) { - /* Top of vma has been split off since last time */ - vma->vm_truncate_count = details->truncate_count; - return 0; - } - } - - details->break_addr = end_addr; - zap_page_range(vma, start_addr, end_addr - start_addr, details); - - /* - * We cannot rely on the break test in unmap_vmas: - * on the one hand, we don't want to restart our loop - * just because that broke out for the page_table_lock; - * on the other hand, it does no test when vma is small. - */ - need_break = need_resched() || - need_lockbreak(details->i_mmap_lock); - - if (details->break_addr >= end_addr) { - /* We have now completed this vma: mark it so */ - vma->vm_truncate_count = details->truncate_count; - if (!need_break) - return 0; - } else { - /* Note restart_addr in vma's truncate_count field */ - vma->vm_truncate_count = details->break_addr; - if (!need_break) - goto again; - } - - spin_unlock(details->i_mmap_lock); - cond_resched(); - spin_lock(details->i_mmap_lock); - return -EINTR; -} - -static inline void unmap_mapping_range_tree(struct prio_tree_root *root, - struct zap_details *details) -{ - struct vm_area_struct *vma; - struct prio_tree_iter iter; - pgoff_t vba, vea, zba, zea; - -restart: - vma_prio_tree_foreach(vma, &iter, root, - details->first_index, details->last_index) { - /* Skip quickly over those we have already dealt with */ - if (vma->vm_truncate_count == details->truncate_count) - continue; - - vba = vma->vm_pgoff; - vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; - /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ - zba = details->first_index; - if (zba < vba) - zba = vba; - zea = details->last_index; - if (zea > vea) - zea = vea; - - if (unmap_mapping_range_vma(vma, - ((zba - vba) << PAGE_SHIFT) + vma->vm_start, - ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, - details) < 0) - goto restart; - } -} - -static inline void unmap_mapping_range_list(struct list_head *head, - struct zap_details *details) -{ - struct vm_area_struct *vma; - - /* - * In nonlinear VMAs there is no correspondence between virtual address - * offset and file offset. So we must perform an exhaustive search - * across *all* the pages in each nonlinear VMA, not just the pages - * whose virtual address lies outside the file truncation point. 
- */ -restart: - list_for_each_entry(vma, head, shared.vm_set.list) { - /* Skip quickly over those we have already dealt with */ - if (vma->vm_truncate_count == details->truncate_count) - continue; - details->nonlinear_vma = vma; - if (unmap_mapping_range_vma(vma, vma->vm_start, - vma->vm_end, details) < 0) - goto restart; - } -} - -/** - * unmap_mapping_range - unmap the portion of all mmaps - * in the specified address_space corresponding to the specified - * page range in the underlying file. - * @address_space: the address space containing mmaps to be unmapped. - * @holebegin: byte in first page to unmap, relative to the start of - * the underlying file. This will be rounded down to a PAGE_SIZE - * boundary. Note that this is different from vmtruncate(), which - * must keep the partial page. In contrast, we must get rid of - * partial pages. - * @holelen: size of prospective hole in bytes. This will be rounded - * up to a PAGE_SIZE boundary. A holelen of zero truncates to the - * end of the file. - * @even_cows: 1 when truncating a file, unmap even private COWed pages; - * but 0 when invalidating pagecache, don't throw away private data. - */ -void unmap_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen, int even_cows) -{ - struct zap_details details; - pgoff_t hba = holebegin >> PAGE_SHIFT; - pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; - - /* Check for overflow. */ - if (sizeof(holelen) > sizeof(hlen)) { - long long holeend = - (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (holeend & ~(long long)ULONG_MAX) - hlen = ULONG_MAX - hba + 1; - } - - details.check_mapping = even_cows? NULL: mapping; - details.nonlinear_vma = NULL; - details.first_index = hba; - details.last_index = hba + hlen - 1; - if (details.last_index < details.first_index) - details.last_index = ULONG_MAX; - details.i_mmap_lock = &mapping->i_mmap_lock; - - spin_lock(&mapping->i_mmap_lock); - - /* serialize i_size write against truncate_count write */ - smp_wmb(); - /* Protect against page faults, and endless unmapping loops */ - mapping->truncate_count++; - /* - * For archs where spin_lock has inclusive semantics like ia64 - * this smp_mb() will prevent to read pagetable contents - * before the truncate_count increment is visible to - * other cpus. - */ - smp_mb(); - if (unlikely(is_restart_addr(mapping->truncate_count))) { - if (mapping->truncate_count == 0) - reset_vma_truncate_counts(mapping); - mapping->truncate_count++; - } - details.truncate_count = mapping->truncate_count; - - if (unlikely(!prio_tree_empty(&mapping->i_mmap))) - unmap_mapping_range_tree(&mapping->i_mmap, &details); - if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) - unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); - spin_unlock(&mapping->i_mmap_lock); -} -EXPORT_SYMBOL(unmap_mapping_range); - -/* - * Handle all mappings that got truncated by a "truncate()" - * system call. - * - * NOTE! We have to be ready to update the memory sharing - * between the file and the memory map for a potential last - * incomplete page. Ugly, but necessary. - */ -int vmtruncate(struct inode * inode, loff_t offset) -{ - struct address_space *mapping = inode->i_mapping; - unsigned long limit; - - if (inode->i_size < offset) - goto do_expand; - /* - * truncation of in-use swapfiles is disallowed - it would cause - * subsequent swapout to scribble on the now-freed blocks. 
- */ - if (IS_SWAPFILE(inode)) - goto out_busy; - i_size_write(inode, offset); - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(mapping, offset); - goto out_truncate; - -do_expand: - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && offset > limit) - goto out_sig; - if (offset > inode->i_sb->s_maxbytes) - goto out_big; - i_size_write(inode, offset); - -out_truncate: - if (inode->i_op && inode->i_op->truncate) - inode->i_op->truncate(inode); - return 0; -out_sig: - send_sig(SIGXFSZ, current, 0); -out_big: - return -EFBIG; -out_busy: - return -ETXTBSY; -} - -EXPORT_SYMBOL(vmtruncate); - -/* - * Primitive swap readahead code. We simply read an aligned block of - * (1 << page_cluster) entries in the swap area. This method is chosen - * because it doesn't cost us any seek time. We also make sure to queue - * the 'original' request together with the readahead ones... - * - * This has been extended to use the NUMA policies from the mm triggering - * the readahead. - * - * Caller must hold down_read on the vma->vm_mm if vma is not NULL. - */ -void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma) -{ -#ifdef CONFIG_NUMA - struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL; -#endif - int i, num; - struct page *new_page; - unsigned long offset; - - /* - * Get the number of handles we should do readahead io to. - */ - num = valid_swaphandles(entry, &offset); - for (i = 0; i < num; offset++, i++) { - /* Ok, do the async read-ahead now */ - new_page = read_swap_cache_async(swp_entry(swp_type(entry), - offset), vma, addr); - if (!new_page) - break; - page_cache_release(new_page); -#ifdef CONFIG_NUMA - /* - * Find the next applicable VMA for the NUMA policy. - */ - addr += PAGE_SIZE; - if (addr == 0) - vma = NULL; - if (vma) { - if (addr >= vma->vm_end) { - vma = next_vma; - next_vma = vma ? vma->vm_next : NULL; - } - if (vma && addr < vma->vm_start) - vma = NULL; - } else { - if (next_vma && addr >= next_vma->vm_start) { - vma = next_vma; - next_vma = vma->vm_next; - } - } -#endif - } - lru_add_drain(); /* Push any new pages onto the LRU now */ -} - -/* - * We hold the mm semaphore and the page_table_lock on entry and - * should release the pagetable lock on exit.. - */ -static int do_swap_page(struct mm_struct * mm, - struct vm_area_struct * vma, unsigned long address, - pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access) -{ - struct page *page; - swp_entry_t entry = pte_to_swp_entry(orig_pte); - pte_t pte; - int ret = VM_FAULT_MINOR; - - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); - page = lookup_swap_cache(entry); - if (!page) { - swapin_readahead(entry, address, vma); - page = read_swap_cache_async(entry, vma, address); - if (!page) { - /* - * Back out if somebody else faulted in this pte while - * we released the page table lock. - */ - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, address); - if (likely(pte_same(*page_table, orig_pte))) - ret = VM_FAULT_OOM; - else - ret = VM_FAULT_MINOR; - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); - goto out; - } - - /* Had to read the page from swap area: Major fault */ - ret = VM_FAULT_MAJOR; - inc_page_state(pgmajfault); - grab_swap_token(); - } - - mark_page_accessed(page); - lock_page(page); - - /* - * Back out if somebody else faulted in this pte while we - * released the page table lock. 
- */ - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, address); - if (unlikely(!pte_same(*page_table, orig_pte))) { - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); - unlock_page(page); - page_cache_release(page); - ret = VM_FAULT_MINOR; - goto out; - } - - /* The page isn't present yet, go ahead with the fault. */ - - swap_free(entry); - if (vm_swap_full()) - remove_exclusive_swap_page(page); - - mm->rss++; - acct_update_integrals(); - update_mem_hiwater(); - - pte = mk_pte(page, vma->vm_page_prot); - if (write_access && can_share_swap_page(page)) { - pte = maybe_mkwrite(pte_mkdirty(pte), vma); - write_access = 0; - } - unlock_page(page); - - flush_icache_page(vma, page); - set_pte(page_table, pte); - page_add_anon_rmap(page, vma, address); - - if (write_access) { - if (do_wp_page(mm, vma, address, - page_table, pmd, pte) == VM_FAULT_OOM) - ret = VM_FAULT_OOM; - goto out; - } - - /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, address, pte); - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); -out: - return ret; -} - -/* - * We are called with the MM semaphore and page_table_lock - * spinlock held to protect against concurrent faults in - * multithreaded programs. - */ -static int -do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, - pte_t *page_table, pmd_t *pmd, int write_access, - unsigned long addr) -{ - pte_t entry; - struct page * page = ZERO_PAGE(addr); - - /* Read-only mapping of ZERO_PAGE. */ - entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); - - /* ..except if it's a write access */ - if (write_access) { - /* Allocate our own private page. */ - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); - - if (unlikely(anon_vma_prepare(vma))) - goto no_mem; - page = alloc_zeroed_user_highpage(vma, addr); - if (!page) - goto no_mem; - - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, addr); - - if (!pte_none(*page_table)) { - pte_unmap(page_table); - page_cache_release(page); - spin_unlock(&mm->page_table_lock); - goto out; - } - mm->rss++; - acct_update_integrals(); - update_mem_hiwater(); - entry = maybe_mkwrite(pte_mkdirty(mk_pte(page, - vma->vm_page_prot)), - vma); - lru_cache_add_active(page); - SetPageReferenced(page); - page_add_anon_rmap(page, vma, addr); - } - - ptep_establish_new(vma, addr, page_table, entry); - pte_unmap(page_table); - - /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, addr, entry); - spin_unlock(&mm->page_table_lock); -out: - return VM_FAULT_MINOR; -no_mem: - return VM_FAULT_OOM; -} - -/* - * do_no_page() tries to create a new page mapping. It aggressively - * tries to share with existing pages, but makes a separate copy if - * the "write_access" parameter is true in order to avoid the next - * page fault. - * - * As this is called only for pages that do not currently exist, we - * do not need to flush old virtual caches or the TLB. - * - * This is called with the MM semaphore held and the page table - * spinlock held. Exit with the spinlock released. 
- */ -static int -do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd) -{ - struct page * new_page; - struct address_space *mapping = NULL; - pte_t entry; - unsigned int sequence = 0; - int ret = VM_FAULT_MINOR; - int anon = 0; - - if (!vma->vm_ops || !vma->vm_ops->nopage) - return do_anonymous_page(mm, vma, page_table, - pmd, write_access, address); - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); - - if (vma->vm_file) { - mapping = vma->vm_file->f_mapping; - sequence = mapping->truncate_count; - smp_rmb(); /* serializes i_size against truncate_count */ - } -retry: - cond_resched(); - new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); - /* - * No smp_rmb is needed here as long as there's a full - * spin_lock/unlock sequence inside the ->nopage callback - * (for the pagecache lookup) that acts as an implicit - * smp_mb() and prevents the i_size read to happen - * after the next truncate_count read. - */ - - /* no page was available -- either SIGBUS or OOM */ - if (new_page == NOPAGE_SIGBUS) - return VM_FAULT_SIGBUS; - if (new_page == NOPAGE_OOM) - return VM_FAULT_OOM; - - /* - * Should we do an early C-O-W break? - */ - if (write_access && !(vma->vm_flags & VM_SHARED)) { - struct page *page; - - if (unlikely(anon_vma_prepare(vma))) - goto oom; - page = alloc_page_vma(GFP_HIGHUSER, vma, address); - if (!page) - goto oom; - copy_user_highpage(page, new_page, address); - page_cache_release(new_page); - new_page = page; - anon = 1; - } - - spin_lock(&mm->page_table_lock); - /* - * For a file-backed vma, someone could have truncated or otherwise - * invalidated this page. If unmap_mapping_range got called, - * retry getting the page. - */ - if (mapping && unlikely(sequence != mapping->truncate_count)) { - sequence = mapping->truncate_count; - spin_unlock(&mm->page_table_lock); - page_cache_release(new_page); - goto retry; - } - page_table = pte_offset_map(pmd, address); - - /* - * This silly early PAGE_DIRTY setting removes a race - * due to the bad i386 page protection. But it's valid - * for other architectures too. - * - * Note that if write_access is true, we either now have - * an exclusive copy of the page, or this is a shared mapping, - * so we can make it writable and dirty to avoid having to - * handle that later. - */ - /* Only go through if we didn't race with anybody else... */ - if (pte_none(*page_table)) { - if (!PageReserved(new_page)) - ++mm->rss; - acct_update_integrals(); - update_mem_hiwater(); - - flush_icache_page(vma, new_page); - entry = mk_pte(new_page, vma->vm_page_prot); - if (write_access) - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - ptep_establish_new(vma, address, page_table, entry); - if (anon) { - lru_cache_add_active(new_page); - page_add_anon_rmap(new_page, vma, address); - } else - page_add_file_rmap(new_page); - pte_unmap(page_table); - } else { - /* One of our sibling threads was faster, back out. */ - pte_unmap(page_table); - page_cache_release(new_page); - spin_unlock(&mm->page_table_lock); - goto out; - } - - /* no need to invalidate: a not-present page shouldn't be cached */ - update_mmu_cache(vma, address, entry); - spin_unlock(&mm->page_table_lock); -out: - return ret; -oom: - page_cache_release(new_page); - ret = VM_FAULT_OOM; - goto out; -} - -/* - * Fault of a previously existing named mapping. Repopulate the pte - * from the encoded file_pte if possible. This enables swappable - * nonlinear vmas. 
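Aside (illustrative, not from the deleted file): the optimistic retry do_no_page() above performs against mapping->truncate_count, as a generic pattern with stub helpers.

#include <stdbool.h>

static unsigned int truncate_generation; /* mapping->truncate_count stand-in */

static bool slow_pagein(void)		/* ->nopage() stand-in */
{
	return true;
}

void fault_with_retry(void)
{
	unsigned int seq;

	do {
		seq = truncate_generation;	/* sample before the lookup */
		if (!slow_pagein())
			return;			/* the SIGBUS/OOM path */
	} while (seq != truncate_generation);	/* truncated meanwhile: retry */
}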
- */ -static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma, - unsigned long address, int write_access, pte_t *pte, pmd_t *pmd) -{ - unsigned long pgoff; - int err; - - BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage); - /* - * Fall back to the linear mapping if the fs does not support - * ->populate: - */ - if (!vma->vm_ops || !vma->vm_ops->populate || - (write_access && !(vma->vm_flags & VM_SHARED))) { - pte_clear(pte); - return do_no_page(mm, vma, address, write_access, pte, pmd); - } - - pgoff = pte_to_pgoff(*pte); - - pte_unmap(pte); - spin_unlock(&mm->page_table_lock); - - err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0); - if (err == -ENOMEM) - return VM_FAULT_OOM; - if (err) - return VM_FAULT_SIGBUS; - return VM_FAULT_MAJOR; -} - -/* - * These routines also need to handle stuff like marking pages dirty - * and/or accessed for architectures that don't do it in hardware (most - * RISC architectures). The early dirtying is also good on the i386. - * - * There is also a hook called "update_mmu_cache()" that architectures - * with external mmu caches can use to update those (ie the Sparc or - * PowerPC hashed page tables that act as extended TLBs). - * - * Note the "page_table_lock". It is to protect against kswapd removing - * pages from under us. Note that kswapd only ever _removes_ pages, never - * adds them. As such, once we have noticed that the page is not present, - * we can drop the lock early. - * - * The adding of pages is protected by the MM semaphore (which we hold), - * so we don't need to worry about a page being suddenly been added into - * our VM. - * - * We enter with the pagetable spinlock held, we are supposed to - * release it when done. - */ -static inline int handle_pte_fault(struct mm_struct *mm, - struct vm_area_struct * vma, unsigned long address, - int write_access, pte_t *pte, pmd_t *pmd) -{ - pte_t entry; - - entry = *pte; - if (!pte_present(entry)) { - /* - * If it truly wasn't present, we know that kswapd - * and the PTE updates will not touch it later. So - * drop the lock. - */ - if (pte_none(entry)) - return do_no_page(mm, vma, address, write_access, pte, pmd); - if (pte_file(entry)) - return do_file_page(mm, vma, address, write_access, pte, pmd); - return do_swap_page(mm, vma, address, pte, pmd, entry, write_access); - } - - if (write_access) { - if (!pte_write(entry)) - return do_wp_page(mm, vma, address, pte, pmd, entry); - - entry = pte_mkdirty(entry); - } - entry = pte_mkyoung(entry); - ptep_set_access_flags(vma, address, pte, entry, write_access); - update_mmu_cache(vma, address, entry); - pte_unmap(pte); - spin_unlock(&mm->page_table_lock); - return VM_FAULT_MINOR; -} - -/* - * By the time we get here, we already hold the mm semaphore - */ -int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, - unsigned long address, int write_access) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - __set_current_state(TASK_RUNNING); - - inc_page_state(pgfault); - - if (is_vm_hugetlb_page(vma)) - return VM_FAULT_SIGBUS; /* mapping truncation does this. */ - - /* - * We need the page table lock to synchronize with kswapd - * and the SMP-safe atomic PTE updates. 
- */ - pgd = pgd_offset(mm, address); - spin_lock(&mm->page_table_lock); - - pud = pud_alloc(mm, pgd, address); - if (!pud) - goto oom; - - pmd = pmd_alloc(mm, pud, address); - if (!pmd) - goto oom; - - pte = pte_alloc_map(mm, pmd, address); - if (!pte) - goto oom; - - return handle_pte_fault(mm, vma, address, write_access, pte, pmd); - - oom: - spin_unlock(&mm->page_table_lock); - return VM_FAULT_OOM; -} - -#ifndef __ARCH_HAS_4LEVEL_HACK -/* - * Allocate page upper directory. - * - * We've already handled the fast-path in-line, and we own the - * page table lock. - * - * On a two-level or three-level page table, this ends up actually being - * entirely optimized away. - */ -pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) -{ - pud_t *new; - - spin_unlock(&mm->page_table_lock); - new = pud_alloc_one(mm, address); - spin_lock(&mm->page_table_lock); - if (!new) - return NULL; - - /* - * Because we dropped the lock, we should re-check the - * entry, as somebody else could have populated it.. - */ - if (pgd_present(*pgd)) { - pud_free(new); - goto out; - } - pgd_populate(mm, pgd, new); - out: - return pud_offset(pgd, address); -} - -/* - * Allocate page middle directory. - * - * We've already handled the fast-path in-line, and we own the - * page table lock. - * - * On a two-level page table, this ends up actually being entirely - * optimized away. - */ -pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) -{ - pmd_t *new; - - spin_unlock(&mm->page_table_lock); - new = pmd_alloc_one(mm, address); - spin_lock(&mm->page_table_lock); - if (!new) - return NULL; - - /* - * Because we dropped the lock, we should re-check the - * entry, as somebody else could have populated it.. - */ - if (pud_present(*pud)) { - pmd_free(new); - goto out; - } - pud_populate(mm, pud, new); - out: - return pmd_offset(pud, address); -} -#else -pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) -{ - pmd_t *new; - - spin_unlock(&mm->page_table_lock); - new = pmd_alloc_one(mm, address); - spin_lock(&mm->page_table_lock); - if (!new) - return NULL; - - /* - * Because we dropped the lock, we should re-check the - * entry, as somebody else could have populated it.. - */ - if (pgd_present(*pud)) { - pmd_free(new); - goto out; - } - pgd_populate(mm, pud, new); -out: - return pmd_offset(pud, address); -} -#endif - -int make_pages_present(unsigned long addr, unsigned long end) -{ - int ret, len, write; - struct vm_area_struct * vma; - - vma = find_vma(current->mm, addr); - if (!vma) - return -1; - write = (vma->vm_flags & VM_WRITE) != 0; - if (addr >= end) - BUG(); - if (end > vma->vm_end) - BUG(); - len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE; - ret = get_user_pages(current, current->mm, addr, - len, write, 0, NULL, NULL); - if (ret < 0) - return ret; - return ret == len ? 0 : -1; -} - -/* - * Map a vmalloc()-space virtual address to the physical page. 
- */ -struct page * vmalloc_to_page(void * vmalloc_addr) -{ - unsigned long addr = (unsigned long) vmalloc_addr; - struct page *page = NULL; - pgd_t *pgd = pgd_offset_k(addr); - pud_t *pud; - pmd_t *pmd; - pte_t *ptep, pte; - - if (!pgd_none(*pgd)) { - pud = pud_offset(pgd, addr); - if (!pud_none(*pud)) { - pmd = pmd_offset(pud, addr); - if (!pmd_none(*pmd)) { - ptep = pte_offset_map(pmd, addr); - pte = *ptep; - if (pte_present(pte)) - page = pte_page(pte); - pte_unmap(ptep); - } - } - } - return page; -} - -EXPORT_SYMBOL(vmalloc_to_page); - -/* - * Map a vmalloc()-space virtual address to the physical page frame number. - */ -unsigned long vmalloc_to_pfn(void * vmalloc_addr) -{ - return page_to_pfn(vmalloc_to_page(vmalloc_addr)); -} - -EXPORT_SYMBOL(vmalloc_to_pfn); - -/* - * update_mem_hiwater - * - update per process rss and vm high water data - */ -void update_mem_hiwater(void) -{ - struct task_struct *tsk = current; - - if (tsk->mm) { - if (tsk->mm->hiwater_rss < tsk->mm->rss) - tsk->mm->hiwater_rss = tsk->mm->rss; - if (tsk->mm->hiwater_vm < tsk->mm->total_vm) - tsk->mm->hiwater_vm = tsk->mm->total_vm; - } -} - -#if !defined(__HAVE_ARCH_GATE_AREA) - -#if defined(AT_SYSINFO_EHDR) -struct vm_area_struct gate_vma; - -static int __init gate_vma_init(void) -{ - gate_vma.vm_mm = NULL; - gate_vma.vm_start = FIXADDR_USER_START; - gate_vma.vm_end = FIXADDR_USER_END; - gate_vma.vm_page_prot = PAGE_READONLY; - gate_vma.vm_flags = 0; - return 0; -} -__initcall(gate_vma_init); -#endif - -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) -{ -#ifdef AT_SYSINFO_EHDR - return &gate_vma; -#else - return NULL; -#endif -} - -int in_gate_area_no_task(unsigned long addr) -{ -#ifdef AT_SYSINFO_EHDR - if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) - return 1; -#endif - return 0; -} - -#endif /* __HAVE_ARCH_GATE_AREA */ diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/mm/mmap.c --- a/linux-2.6.11-xen-sparse/mm/mmap.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,2108 +0,0 @@ -/* - * mm/mmap.c - * - * Written by obz. - * - * Address space accounting code <alan@xxxxxxxxxx> - */ - -#include <linux/slab.h> -#include <linux/mm.h> -#include <linux/shm.h> -#include <linux/mman.h> -#include <linux/pagemap.h> -#include <linux/swap.h> -#include <linux/syscalls.h> -#include <linux/init.h> -#include <linux/file.h> -#include <linux/fs.h> -#include <linux/personality.h> -#include <linux/security.h> -#include <linux/hugetlb.h> -#include <linux/profile.h> -#include <linux/module.h> -#include <linux/acct.h> -#include <linux/mount.h> -#include <linux/mempolicy.h> -#include <linux/rmap.h> - -#include <asm/uaccess.h> -#include <asm/cacheflush.h> -#include <asm/tlb.h> - -/* - * WARNING: the debugging will use recursive algorithms so never enable this - * unless you know what you are doing. - */ -#undef DEBUG_MM_RB - -/* description of effects of mapping type and prot in current implementation. - * this is due to the limited x86 page protection hardware. 
The expected - * behavior is in parens: - * - * map_type prot - * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC - * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes - * w: (no) no w: (no) no w: (yes) yes w: (no) no - * x: (no) no x: (no) yes x: (no) yes x: (yes) yes - * - * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes - * w: (no) no w: (no) no w: (copy) copy w: (no) no - * x: (no) no x: (no) yes x: (no) yes x: (yes) yes - * - */ -pgprot_t protection_map[16] = { - __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, - __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 -}; - -int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ -int sysctl_overcommit_ratio = 50; /* default is 50% */ -int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; -atomic_t vm_committed_space = ATOMIC_INIT(0); - -/* - * Check that a process has enough memory to allocate a new virtual - * mapping. 0 means there is enough memory for the allocation to - * succeed and -ENOMEM implies there is not. - * - * We currently support three overcommit policies, which are set via the - * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting - * - * Strict overcommit modes added 2002 Feb 26 by Alan Cox. - * Additional code 2002 Jul 20 by Robert Love. - * - * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. - * - * Note this is a helper function intended to be used by LSMs which - * wish to use this logic. - */ -int __vm_enough_memory(long pages, int cap_sys_admin) -{ - unsigned long free, allowed; - - vm_acct_memory(pages); - - /* - * Sometimes we want to use more memory than we have - */ - if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) - return 0; - - if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { - unsigned long n; - - free = get_page_cache_size(); - free += nr_swap_pages; - - /* - * Any slabs which are created with the - * SLAB_RECLAIM_ACCOUNT flag claim to have contents - * which are reclaimable, under pressure. The dentry - * cache and most inode caches should fall into this - */ - free += atomic_read(&slab_reclaim_pages); - - /* - * Leave the last 3% for root - */ - if (!cap_sys_admin) - free -= free / 32; - - if (free > pages) - return 0; - - /* - * nr_free_pages() is very expensive on large systems, - * only call if we're about to fail. 
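Aside (illustrative, not from the deleted file): the strict-ratio arithmetic that follows this branch (the non-GUESS path of __vm_enough_memory()), run on made-up machine numbers; hugetlb pages are omitted and a non-root caller is assumed.

#include <stdio.h>

int main(void)
{
	unsigned long totalram = 262144; /* pages: 1 GiB at 4 KiB      */
	unsigned long swap     = 131072; /* pages: 512 MiB             */
	unsigned long ratio    = 50;	 /* sysctl_overcommit_ratio    */
	unsigned long our_vm   = 8192;	 /* this process's total_vm    */

	unsigned long allowed = totalram * ratio / 100;
	allowed -= allowed / 32;	/* leave ~3% for root          */
	allowed += swap;
	allowed -= our_vm / 32;		/* headroom for other processes */

	printf("commit limit: %lu pages\n", allowed);
	return 0;	/* prints 257792 with these numbers */
}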
- */ - n = nr_free_pages(); - if (!cap_sys_admin) - n -= n / 32; - free += n; - - if (free > pages) - return 0; - vm_unacct_memory(pages); - return -ENOMEM; - } - - allowed = (totalram_pages - hugetlb_total_pages()) - * sysctl_overcommit_ratio / 100; - /* - * Leave the last 3% for root - */ - if (!cap_sys_admin) - allowed -= allowed / 32; - allowed += total_swap_pages; - - /* Don't let a single process grow too big: - leave 3% of the size of this process for other processes */ - allowed -= current->mm->total_vm / 32; - - if (atomic_read(&vm_committed_space) < allowed) - return 0; - - vm_unacct_memory(pages); - - return -ENOMEM; -} - -EXPORT_SYMBOL(sysctl_overcommit_memory); -EXPORT_SYMBOL(sysctl_overcommit_ratio); -EXPORT_SYMBOL(sysctl_max_map_count); -EXPORT_SYMBOL(vm_committed_space); -EXPORT_SYMBOL(__vm_enough_memory); - -/* - * Requires inode->i_mapping->i_mmap_lock - */ -static void __remove_shared_vm_struct(struct vm_area_struct *vma, - struct file *file, struct address_space *mapping) -{ - if (vma->vm_flags & VM_DENYWRITE) - atomic_inc(&file->f_dentry->d_inode->i_writecount); - if (vma->vm_flags & VM_SHARED) - mapping->i_mmap_writable--; - - flush_dcache_mmap_lock(mapping); - if (unlikely(vma->vm_flags & VM_NONLINEAR)) - list_del_init(&vma->shared.vm_set.list); - else - vma_prio_tree_remove(vma, &mapping->i_mmap); - flush_dcache_mmap_unlock(mapping); -} - -/* - * Remove one vm structure and free it. - */ -static void remove_vm_struct(struct vm_area_struct *vma) -{ - struct file *file = vma->vm_file; - - might_sleep(); - if (file) { - struct address_space *mapping = file->f_mapping; - spin_lock(&mapping->i_mmap_lock); - __remove_shared_vm_struct(vma, file, mapping); - spin_unlock(&mapping->i_mmap_lock); - } - if (vma->vm_ops && vma->vm_ops->close) - vma->vm_ops->close(vma); - if (file) - fput(file); - anon_vma_unlink(vma); - mpol_free(vma_policy(vma)); - kmem_cache_free(vm_area_cachep, vma); -} - -/* - * sys_brk() for the most part doesn't need the global kernel - * lock, except when an application is doing something nasty - * like trying to un-brk an area that has already been mapped - * to a regular file. in this case, the unmapping will need - * to invoke file system routines that need the global lock. - */ -asmlinkage unsigned long sys_brk(unsigned long brk) -{ - unsigned long rlim, retval; - unsigned long newbrk, oldbrk; - struct mm_struct *mm = current->mm; - - down_write(&mm->mmap_sem); - - if (brk < mm->end_code) - goto out; - newbrk = PAGE_ALIGN(brk); - oldbrk = PAGE_ALIGN(mm->brk); - if (oldbrk == newbrk) - goto set_brk; - - /* Always allow shrinking brk. */ - if (brk <= mm->brk) { - if (!do_munmap(mm, newbrk, oldbrk-newbrk)) - goto set_brk; - goto out; - } - - /* Check against rlimit.. */ - rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; - if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) - goto out; - - /* Check against existing mmap mappings. */ - if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) - goto out; - - /* Ok, looks good - let it rip. 
*/ - if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) - goto out; -set_brk: - mm->brk = brk; -out: - retval = mm->brk; - up_write(&mm->mmap_sem); - return retval; -} - -#ifdef DEBUG_MM_RB -static int browse_rb(struct rb_root *root) -{ - int i = 0, j; - struct rb_node *nd, *pn = NULL; - unsigned long prev = 0, pend = 0; - - for (nd = rb_first(root); nd; nd = rb_next(nd)) { - struct vm_area_struct *vma; - vma = rb_entry(nd, struct vm_area_struct, vm_rb); - if (vma->vm_start < prev) - printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1; - if (vma->vm_start < pend) - printk("vm_start %lx pend %lx\n", vma->vm_start, pend); - if (vma->vm_start > vma->vm_end) - printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start); - i++; - pn = nd; - } - j = 0; - for (nd = pn; nd; nd = rb_prev(nd)) { - j++; - } - if (i != j) - printk("backwards %d, forwards %d\n", j, i), i = 0; - return i; -} - -void validate_mm(struct mm_struct *mm) -{ - int bug = 0; - int i = 0; - struct vm_area_struct *tmp = mm->mmap; - while (tmp) { - tmp = tmp->vm_next; - i++; - } - if (i != mm->map_count) - printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1; - i = browse_rb(&mm->mm_rb); - if (i != mm->map_count) - printk("map_count %d rb %d\n", mm->map_count, i), bug = 1; - if (bug) - BUG(); -} -#else -#define validate_mm(mm) do { } while (0) -#endif - -static struct vm_area_struct * -find_vma_prepare(struct mm_struct *mm, unsigned long addr, - struct vm_area_struct **pprev, struct rb_node ***rb_link, - struct rb_node ** rb_parent) -{ - struct vm_area_struct * vma; - struct rb_node ** __rb_link, * __rb_parent, * rb_prev; - - __rb_link = &mm->mm_rb.rb_node; - rb_prev = __rb_parent = NULL; - vma = NULL; - - while (*__rb_link) { - struct vm_area_struct *vma_tmp; - - __rb_parent = *__rb_link; - vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); - - if (vma_tmp->vm_end > addr) { - vma = vma_tmp; - if (vma_tmp->vm_start <= addr) - return vma; - __rb_link = &__rb_parent->rb_left; - } else { - rb_prev = __rb_parent; - __rb_link = &__rb_parent->rb_right; - } - } - - *pprev = NULL; - if (rb_prev) - *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); - *rb_link = __rb_link; - *rb_parent = __rb_parent; - return vma; -} - -static inline void -__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, struct rb_node *rb_parent) -{ - if (prev) { - vma->vm_next = prev->vm_next; - prev->vm_next = vma; - } else { - mm->mmap = vma; - if (rb_parent) - vma->vm_next = rb_entry(rb_parent, - struct vm_area_struct, vm_rb); - else - vma->vm_next = NULL; - } -} - -void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, - struct rb_node **rb_link, struct rb_node *rb_parent) -{ - rb_link_node(&vma->vm_rb, rb_parent, rb_link); - rb_insert_color(&vma->vm_rb, &mm->mm_rb); -} - -static inline void __vma_link_file(struct vm_area_struct *vma) -{ - struct file * file; - - file = vma->vm_file; - if (file) { - struct address_space *mapping = file->f_mapping; - - if (vma->vm_flags & VM_DENYWRITE) - atomic_dec(&file->f_dentry->d_inode->i_writecount); - if (vma->vm_flags & VM_SHARED) - mapping->i_mmap_writable++; - - flush_dcache_mmap_lock(mapping); - if (unlikely(vma->vm_flags & VM_NONLINEAR)) - vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); - else - vma_prio_tree_insert(vma, &mapping->i_mmap); - flush_dcache_mmap_unlock(mapping); - } -} - -static void -__vma_link(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, struct rb_node **rb_link, - 
struct rb_node *rb_parent) -{ - __vma_link_list(mm, vma, prev, rb_parent); - __vma_link_rb(mm, vma, rb_link, rb_parent); - __anon_vma_link(vma); -} - -static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, struct rb_node **rb_link, - struct rb_node *rb_parent) -{ - struct address_space *mapping = NULL; - - if (vma->vm_file) - mapping = vma->vm_file->f_mapping; - - if (mapping) { - spin_lock(&mapping->i_mmap_lock); - vma->vm_truncate_count = mapping->truncate_count; - } - anon_vma_lock(vma); - - __vma_link(mm, vma, prev, rb_link, rb_parent); - __vma_link_file(vma); - - anon_vma_unlock(vma); - if (mapping) - spin_unlock(&mapping->i_mmap_lock); - - mm->map_count++; - validate_mm(mm); -} - -/* - * Helper for vma_adjust in the split_vma insert case: - * insert vm structure into list and rbtree and anon_vma, - * but it has already been inserted into prio_tree earlier. - */ -static void -__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) -{ - struct vm_area_struct * __vma, * prev; - struct rb_node ** rb_link, * rb_parent; - - __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); - if (__vma && __vma->vm_start < vma->vm_end) - BUG(); - __vma_link(mm, vma, prev, rb_link, rb_parent); - mm->map_count++; -} - -static inline void -__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev) -{ - prev->vm_next = vma->vm_next; - rb_erase(&vma->vm_rb, &mm->mm_rb); - if (mm->mmap_cache == vma) - mm->mmap_cache = prev; -} - -/* - * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that - * is already present in an i_mmap tree without adjusting the tree. - * The following helper function should be used when such adjustments - * are necessary. The "insert" vma (if any) is to be inserted - * before we drop the necessary locks. - */ -void vma_adjust(struct vm_area_struct *vma, unsigned long start, - unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) -{ - struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *next = vma->vm_next; - struct vm_area_struct *importer = NULL; - struct address_space *mapping = NULL; - struct prio_tree_root *root = NULL; - struct file *file = vma->vm_file; - struct anon_vma *anon_vma = NULL; - long adjust_next = 0; - int remove_next = 0; - - if (next && !insert) { - if (end >= next->vm_end) { - /* - * vma expands, overlapping all the next, and - * perhaps the one after too (mprotect case 6). - */ -again: remove_next = 1 + (end > next->vm_end); - end = next->vm_end; - anon_vma = next->anon_vma; - importer = vma; - } else if (end > next->vm_start) { - /* - * vma expands, overlapping part of the next: - * mprotect case 5 shifting the boundary up. - */ - adjust_next = (end - next->vm_start) >> PAGE_SHIFT; - anon_vma = next->anon_vma; - importer = vma; - } else if (end < vma->vm_end) { - /* - * vma shrinks, and !insert tells it's not - * split_vma inserting another: so it must be - * mprotect case 4 shifting the boundary down. - */ - adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); - anon_vma = next->anon_vma; - importer = next; - } - } - - if (file) { - mapping = file->f_mapping; - if (!(vma->vm_flags & VM_NONLINEAR)) - root = &mapping->i_mmap; - spin_lock(&mapping->i_mmap_lock); - if (importer && - vma->vm_truncate_count != next->vm_truncate_count) { - /* - * unmap_mapping_range might be in progress: - * ensure that the expanding vma is rescanned. 
- */ - importer->vm_truncate_count = 0; - } - if (insert) { - insert->vm_truncate_count = vma->vm_truncate_count; - /* - * Put into prio_tree now, so instantiated pages - * are visible to arm/parisc __flush_dcache_page - * throughout; but we cannot insert into address - * space until vma start or end is updated. - */ - __vma_link_file(insert); - } - } - - /* - * When changing only vma->vm_end, we don't really need - * anon_vma lock: but is that case worth optimizing out? - */ - if (vma->anon_vma) - anon_vma = vma->anon_vma; - if (anon_vma) { - spin_lock(&anon_vma->lock); - /* - * Easily overlooked: when mprotect shifts the boundary, - * make sure the expanding vma has anon_vma set if the - * shrinking vma had, to cover any anon pages imported. - */ - if (importer && !importer->anon_vma) { - importer->anon_vma = anon_vma; - __anon_vma_link(importer); - } - } - - if (root) { - flush_dcache_mmap_lock(mapping); - vma_prio_tree_remove(vma, root); - if (adjust_next) - vma_prio_tree_remove(next, root); - } - - vma->vm_start = start; - vma->vm_end = end; - vma->vm_pgoff = pgoff; - if (adjust_next) { - next->vm_start += adjust_next << PAGE_SHIFT; - next->vm_pgoff += adjust_next; - } - - if (root) { - if (adjust_next) - vma_prio_tree_insert(next, root); - vma_prio_tree_insert(vma, root); - flush_dcache_mmap_unlock(mapping); - } - - if (remove_next) { - /* - * vma_merge has merged next into vma, and needs - * us to remove next before dropping the locks. - */ - __vma_unlink(mm, next, vma); - if (file) - __remove_shared_vm_struct(next, file, mapping); - if (next->anon_vma) - __anon_vma_merge(vma, next); - } else if (insert) { - /* - * split_vma has split insert from vma, and needs - * us to insert it before dropping the locks - * (it may either follow vma or precede it). - */ - __insert_vm_struct(mm, insert); - } - - if (anon_vma) - spin_unlock(&anon_vma->lock); - if (mapping) - spin_unlock(&mapping->i_mmap_lock); - - if (remove_next) { - if (file) - fput(file); - mm->map_count--; - mpol_free(vma_policy(next)); - kmem_cache_free(vm_area_cachep, next); - /* - * In mprotect's case 6 (see comments on vma_merge), - * we must remove another next too. It would clutter - * up the code too much to do both in one go. - */ - if (remove_next == 2) { - next = vma->vm_next; - goto again; - } - } - - validate_mm(mm); -} - -/* - * If the vma has a ->close operation then the driver probably needs to release - * per-vma resources, so we don't attempt to merge those. - */ -#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED) - -static inline int is_mergeable_vma(struct vm_area_struct *vma, - struct file *file, unsigned long vm_flags) -{ - if (vma->vm_flags != vm_flags) - return 0; - if (vma->vm_file != file) - return 0; - if (vma->vm_ops && vma->vm_ops->close) - return 0; - return 1; -} - -static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, - struct anon_vma *anon_vma2) -{ - return !anon_vma1 || !anon_vma2 || (anon_vma1 == anon_vma2); -} - -/* - * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) - * in front of (at a lower virtual address and file offset than) the vma. - * - * We cannot merge two vmas if they have differently assigned (non-NULL) - * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. - * - * We don't check here for the merged mmap wrapping around the end of pagecache - * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which - * wrap, nor mmaps which cover the final page at index -1UL. 
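- *
- * (An illustrative case, with numbers invented for this note: a request
- *  mapping file pages [2,4) whose end meets next->vm_start can merge
- *  before a next vma whose vm_pgoff is 4, since the combined vma then
- *  covers a contiguous run of file offsets; with vm_pgoff 5 the offsets
- *  would not line up and the merge is refused.)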
- */ -static int -can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) -{ - if (is_mergeable_vma(vma, file, vm_flags) && - is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { - if (vma->vm_pgoff == vm_pgoff) - return 1; - } - return 0; -} - -/* - * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) - * beyond (at a higher virtual address and file offset than) the vma. - * - * We cannot merge two vmas if they have differently assigned (non-NULL) - * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. - */ -static int -can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) -{ - if (is_mergeable_vma(vma, file, vm_flags) && - is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { - pgoff_t vm_pglen; - vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; - if (vma->vm_pgoff + vm_pglen == vm_pgoff) - return 1; - } - return 0; -} - -/* - * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out - * whether that can be merged with its predecessor or its successor. - * Or both (it neatly fills a hole). - * - * In most cases - when called for mmap, brk or mremap - [addr,end) is - * certain not to be mapped by the time vma_merge is called; but when - * called for mprotect, it is certain to be already mapped (either at - * an offset within prev, or at the start of next), and the flags of - * this area are about to be changed to vm_flags - and the no-change - * case has already been eliminated. - * - * The following mprotect cases have to be considered, where AAAA is - * the area passed down from mprotect_fixup, never extending beyond one - * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after: - * - * AAAA AAAA AAAA AAAA - * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX - * cannot merge might become might become might become - * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or - * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or - * mremap move: PPPPNNNNNNNN 8 - * AAAA - * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN - * might become case 1 below case 2 below case 3 below - * - * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX: - * mprotect_fixup updates vm_flags & vm_page_prot on successful return. - */ -struct vm_area_struct *vma_merge(struct mm_struct *mm, - struct vm_area_struct *prev, unsigned long addr, - unsigned long end, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, - pgoff_t pgoff, struct mempolicy *policy) -{ - pgoff_t pglen = (end - addr) >> PAGE_SHIFT; - struct vm_area_struct *area, *next; - - /* - * We later require that vma->vm_flags == vm_flags, - * so this tests vma->vm_flags & VM_SPECIAL, too. - */ - if (vm_flags & VM_SPECIAL) - return NULL; - - if (prev) - next = prev->vm_next; - else - next = mm->mmap; - area = next; - if (next && next->vm_end == end) /* cases 6, 7, 8 */ - next = next->vm_next; - - /* - * Can it merge with the predecessor? - */ - if (prev && prev->vm_end == addr && - mpol_equal(vma_policy(prev), policy) && - can_vma_merge_after(prev, vm_flags, - anon_vma, file, pgoff)) { - /* - * OK, it can. Can we now merge in the successor as well? 
- */ - if (next && end == next->vm_start && - mpol_equal(policy, vma_policy(next)) && - can_vma_merge_before(next, vm_flags, - anon_vma, file, pgoff+pglen) && - is_mergeable_anon_vma(prev->anon_vma, - next->anon_vma)) { - /* cases 1, 6 */ - vma_adjust(prev, prev->vm_start, - next->vm_end, prev->vm_pgoff, NULL); - } else /* cases 2, 5, 7 */ - vma_adjust(prev, prev->vm_start, - end, prev->vm_pgoff, NULL); - return prev; - } - - /* - * Can this new request be merged in front of next? - */ - if (next && end == next->vm_start && - mpol_equal(policy, vma_policy(next)) && - can_vma_merge_before(next, vm_flags, - anon_vma, file, pgoff+pglen)) { - if (prev && addr < prev->vm_end) /* case 4 */ - vma_adjust(prev, prev->vm_start, - addr, prev->vm_pgoff, NULL); - else /* cases 3, 8 */ - vma_adjust(area, addr, next->vm_end, - next->vm_pgoff - pglen, NULL); - return area; - } - - return NULL; -} - -/* - * find_mergeable_anon_vma is used by anon_vma_prepare, to check - * neighbouring vmas for a suitable anon_vma, before it goes off - * to allocate a new anon_vma. It checks because a repetitive - * sequence of mprotects and faults may otherwise lead to distinct - * anon_vmas being allocated, preventing vma merge in subsequent - * mprotect. - */ -struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) -{ - struct vm_area_struct *near; - unsigned long vm_flags; - - near = vma->vm_next; - if (!near) - goto try_prev; - - /* - * Since only mprotect tries to remerge vmas, match flags - * which might be mprotected into each other later on. - * Neither mlock nor madvise tries to remerge at present, - * so leave their flags as obstructing a merge. - */ - vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); - vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); - - if (near->anon_vma && vma->vm_end == near->vm_start && - mpol_equal(vma_policy(vma), vma_policy(near)) && - can_vma_merge_before(near, vm_flags, - NULL, vma->vm_file, vma->vm_pgoff + - ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT))) - return near->anon_vma; -try_prev: - /* - * It is potentially slow to have to call find_vma_prev here. - * But it's only on the first write fault on the vma, not - * every time, and we could devise a way to avoid it later - * (e.g. stash info in next's anon_vma_node when assigning - * an anon_vma, or when trying vma_merge). Another time. - */ - if (find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma) - BUG(); - if (!near) - goto none; - - vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); - vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); - - if (near->anon_vma && near->vm_end == vma->vm_start && - mpol_equal(vma_policy(near), vma_policy(vma)) && - can_vma_merge_after(near, vm_flags, - NULL, vma->vm_file, vma->vm_pgoff)) - return near->anon_vma; -none: - /* - * There's no absolute need to look only at touching neighbours: - * we could search further afield for "compatible" anon_vmas. - * But it would probably just be a waste of time searching, - * or lead to too many vmas hanging off the same anon_vma. - * We're trying to allow mprotect remerging later on, - * not trying to minimize memory used for anon_vmas. 
- */
-	return NULL;
-}
-
-#ifdef CONFIG_PROC_FS
-void __vm_stat_account(struct mm_struct *mm, unsigned long flags,
-						struct file *file, long pages)
-{
-	const unsigned long stack_flags
-		= VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
-
-#ifdef CONFIG_HUGETLB
-	if (flags & VM_HUGETLB) {
-		if (!(flags & VM_DONTCOPY))
-			mm->shared_vm += pages;
-		return;
-	}
-#endif /* CONFIG_HUGETLB */
-
-	if (file) {
-		mm->shared_vm += pages;
-		if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
-			mm->exec_vm += pages;
-	} else if (flags & stack_flags)
-		mm->stack_vm += pages;
-	if (flags & (VM_RESERVED|VM_IO))
-		mm->reserved_vm += pages;
-}
-#endif /* CONFIG_PROC_FS */
-
-/*
- * The caller must hold down_write(current->mm->mmap_sem).
- */
-
-unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
-			unsigned long len, unsigned long prot,
-			unsigned long flags, unsigned long pgoff)
-{
-	struct mm_struct * mm = current->mm;
-	struct vm_area_struct * vma, * prev;
-	struct inode *inode;
-	unsigned int vm_flags;
-	int correct_wcount = 0;
-	int error;
-	struct rb_node ** rb_link, * rb_parent;
-	int accountable = 1;
-	unsigned long charged = 0;
-
-	if (file) {
-		if (is_file_hugepages(file))
-			accountable = 0;
-
-		if (!file->f_op || !file->f_op->mmap)
-			return -ENODEV;
-
-		if ((prot & PROT_EXEC) &&
-		    (file->f_vfsmnt->mnt_flags & MNT_NOEXEC))
-			return -EPERM;
-	}
-	/*
-	 * Does the application expect PROT_READ to imply PROT_EXEC?
-	 *
-	 * (the exception is when the underlying filesystem is noexec
-	 *  mounted, in which case we don't add PROT_EXEC.)
-	 */
-	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
-		if (!(file && (file->f_vfsmnt->mnt_flags & MNT_NOEXEC)))
-			prot |= PROT_EXEC;
-
-	if (!len)
-		return addr;
-
-	/* Careful about overflows.. */
-	len = PAGE_ALIGN(len);
-	if (!len || len > TASK_SIZE)
-		return -EINVAL;
-
-	/* offset overflow? */
-	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
-		return -EINVAL;
-
-	/* Too many mappings? */
-	if (mm->map_count > sysctl_max_map_count)
-		return -ENOMEM;
-
-	/* Obtain the address to map to. We verify (or select) it and ensure
-	 * that it represents a valid section of the address space.
-	 */
-	addr = get_unmapped_area(file, addr, len, pgoff, flags);
-	if (addr & ~PAGE_MASK)
-		return addr;
-
-	/* Do simple checking here so the lower-level routines won't have
-	 * to. We assume access permissions have been handled by the open
-	 * of the memory object, so we don't do any here.
-	 */
-	vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
-			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
-
-	if (flags & MAP_LOCKED) {
-		if (!can_do_mlock())
-			return -EPERM;
-		vm_flags |= VM_LOCKED;
-	}
-	/* mlock MCL_FUTURE? */
-	if (vm_flags & VM_LOCKED) {
-		unsigned long locked, lock_limit;
-		locked = mm->locked_vm << PAGE_SHIFT;
-		lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
-		locked += len;
-		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-			return -EAGAIN;
-	}
-
-	inode = file ? file->f_dentry->d_inode : NULL;
-
-	if (file) {
-		switch (flags & MAP_TYPE) {
-		case MAP_SHARED:
-			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
-				return -EACCES;
-
-			/*
-			 * Make sure we don't allow writing to an append-only
-			 * file..
-			 */
-			if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
-				return -EACCES;
-
-			/*
-			 * Make sure there are no mandatory locks on the file.
- */
-			if (locks_verify_locked(inode))
-				return -EAGAIN;
-
-			vm_flags |= VM_SHARED | VM_MAYSHARE;
-			if (!(file->f_mode & FMODE_WRITE))
-				vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
-
-			/* fall through */
-		case MAP_PRIVATE:
-			if (!(file->f_mode & FMODE_READ))
-				return -EACCES;
-			break;
-
-		default:
-			return -EINVAL;
-		}
-	} else {
-		switch (flags & MAP_TYPE) {
-		case MAP_SHARED:
-			vm_flags |= VM_SHARED | VM_MAYSHARE;
-			break;
-		case MAP_PRIVATE:
-			/*
-			 * Set pgoff according to addr for anon_vma.
-			 */
-			pgoff = addr >> PAGE_SHIFT;
-			break;
-		default:
-			return -EINVAL;
-		}
-	}
-
-	error = security_file_mmap(file, prot, flags);
-	if (error)
-		return error;
-
-	/* Clear old maps */
-	error = -ENOMEM;
-munmap_back:
-	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
-	if (vma && vma->vm_start < addr + len) {
-		if (do_munmap(mm, addr, len))
-			return -ENOMEM;
-		goto munmap_back;
-	}
-
-	/* Check against address space limit. */
-	if ((mm->total_vm << PAGE_SHIFT) + len
-	    > current->signal->rlim[RLIMIT_AS].rlim_cur)
-		return -ENOMEM;
-
-	if (accountable && (!(flags & MAP_NORESERVE) ||
-			    sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
-		if (vm_flags & VM_SHARED) {
-			/* Check memory availability in shmem_file_setup? */
-			vm_flags |= VM_ACCOUNT;
-		} else if (vm_flags & VM_WRITE) {
-			/*
-			 * Private writable mapping: check memory availability
-			 */
-			charged = len >> PAGE_SHIFT;
-			if (security_vm_enough_memory(charged))
-				return -ENOMEM;
-			vm_flags |= VM_ACCOUNT;
-		}
-	}
-
-	/*
-	 * Can we just expand an old private anonymous mapping?
-	 * The VM_SHARED test is necessary because shmem_zero_setup
-	 * will create the file object for a shared anonymous map below.
-	 */
-	if (!file && !(vm_flags & VM_SHARED) &&
-	    vma_merge(mm, prev, addr, addr + len, vm_flags,
-					NULL, NULL, pgoff, NULL))
-		goto out;
-
-	/*
-	 * Determine the object being mapped and call the appropriate
-	 * specific mapper.  The address has already been validated, but
-	 * not unmapped, and any overlapping maps have been removed from
-	 * the list above.
-	 */
-	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
-	if (!vma) {
-		error = -ENOMEM;
-		goto unacct_error;
-	}
-	memset(vma, 0, sizeof(*vma));
-
-	vma->vm_mm = mm;
-	vma->vm_start = addr;
-	vma->vm_end = addr + len;
-	vma->vm_flags = vm_flags;
-	vma->vm_page_prot = protection_map[vm_flags & 0x0f];
-	vma->vm_pgoff = pgoff;
-
-	if (file) {
-		error = -EINVAL;
-		if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
-			goto free_vma;
-		if (vm_flags & VM_DENYWRITE) {
-			error = deny_write_access(file);
-			if (error)
-				goto free_vma;
-			correct_wcount = 1;
-		}
-		vma->vm_file = file;
-		get_file(file);
-		error = file->f_op->mmap(file, vma);
-		if (error)
-			goto unmap_and_free_vma;
-	} else if (vm_flags & VM_SHARED) {
-		error = shmem_zero_setup(vma);
-		if (error)
-			goto free_vma;
-	}
-
-	/* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
-	 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
-	 * that memory reservation must be checked; but that reservation
-	 * belongs to the shared memory object, not to the vma: so now
-	 * clear it.
-	 */
-	if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
-		vma->vm_flags &= ~VM_ACCOUNT;
-
-	/* Can addr have changed??
-	 *
-	 * Answer: Yes, several device drivers can do it in their
-	 *         f_op->mmap method.
-DaveM - */ - addr = vma->vm_start; - pgoff = vma->vm_pgoff; - vm_flags = vma->vm_flags; - - if (!file || !vma_merge(mm, prev, addr, vma->vm_end, - vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { - file = vma->vm_file; - vma_link(mm, vma, prev, rb_link, rb_parent); - if (correct_wcount) - atomic_inc(&inode->i_writecount); - } else { - if (file) { - if (correct_wcount) - atomic_inc(&inode->i_writecount); - fput(file); - } - mpol_free(vma_policy(vma)); - kmem_cache_free(vm_area_cachep, vma); - } -out: - mm->total_vm += len >> PAGE_SHIFT; - __vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); - if (vm_flags & VM_LOCKED) { - mm->locked_vm += len >> PAGE_SHIFT; - make_pages_present(addr, addr + len); - } - if (flags & MAP_POPULATE) { - up_write(&mm->mmap_sem); - sys_remap_file_pages(addr, len, 0, - pgoff, flags & MAP_NONBLOCK); - down_write(&mm->mmap_sem); - } - acct_update_integrals(); - update_mem_hiwater(); - return addr; - -unmap_and_free_vma: - if (correct_wcount) - atomic_inc(&inode->i_writecount); - vma->vm_file = NULL; - fput(file); - - /* Undo any partial mapping done by a device driver. */ - zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL); -free_vma: - kmem_cache_free(vm_area_cachep, vma); -unacct_error: - if (charged) - vm_unacct_memory(charged); - return error; -} - -EXPORT_SYMBOL(do_mmap_pgoff); - -/* Get an address range which is currently unmapped. - * For shmat() with addr=0. - * - * Ugly calling convention alert: - * Return value with the low bits set means error value, - * ie - * if (ret & ~PAGE_MASK) - * error = ret; - * - * This function "knows" that -ENOMEM has the bits set. - */ -#ifndef HAVE_ARCH_UNMAPPED_AREA -unsigned long -arch_get_unmapped_area(struct file *filp, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long start_addr; - - if (len > TASK_SIZE) - return -ENOMEM; - - if (addr) { - addr = PAGE_ALIGN(addr); - vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && - (!vma || addr + len <= vma->vm_start)) - return addr; - } - start_addr = addr = mm->free_area_cache; - -full_search: - for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { - /* At this point: (!vma || addr < vma->vm_end). */ - if (TASK_SIZE - len < addr) { - /* - * Start a new search - just in case we missed - * some holes. - */ - if (start_addr != TASK_UNMAPPED_BASE) { - start_addr = addr = TASK_UNMAPPED_BASE; - goto full_search; - } - return -ENOMEM; - } - if (!vma || addr + len <= vma->vm_start) { - /* - * Remember the place where we stopped the search: - */ - mm->free_area_cache = addr + len; - return addr; - } - addr = vma->vm_end; - } -} -#endif - -void arch_unmap_area(struct vm_area_struct *area) -{ - /* - * Is this a new hole at the lowest possible address? 
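- *
- * (Hypothetical numbers: with TASK_UNMAPPED_BASE at 1GB, unmapping a vma
- *  that began at 1.2GB while free_area_cache points at 1.5GB pulls the
- *  cache back down to 1.2GB, so the next bottom-up search starts at the
- *  freshly opened hole.)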
- */
-	if (area->vm_start >= TASK_UNMAPPED_BASE &&
-	    area->vm_start < area->vm_mm->free_area_cache)
-		area->vm_mm->free_area_cache = area->vm_start;
-}
-
-/*
- * This mmap-allocator allocates new areas top-down from below the
- * stack's low limit (the base):
- */
-#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
-unsigned long
-arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
-			  const unsigned long len, const unsigned long pgoff,
-			  const unsigned long flags)
-{
-	struct vm_area_struct *vma, *prev_vma;
-	struct mm_struct *mm = current->mm;
-	unsigned long base = mm->mmap_base, addr = addr0;
-	int first_time = 1;
-
-	/* requested length too big for entire address space */
-	if (len > TASK_SIZE)
-		return -ENOMEM;
-
-	/* don't allow allocations above current base */
-	if (mm->free_area_cache > base)
-		mm->free_area_cache = base;
-
-	/* requesting a specific address */
-	if (addr) {
-		addr = PAGE_ALIGN(addr);
-		vma = find_vma(mm, addr);
-		if (TASK_SIZE - len >= addr &&
-				(!vma || addr + len <= vma->vm_start))
-			return addr;
-	}
-
-try_again:
-	/* make sure it can fit in the remaining address space */
-	if (mm->free_area_cache < len)
-		goto fail;
-
-	/* either no address requested or can't fit in requested address hole */
-	addr = (mm->free_area_cache - len) & PAGE_MASK;
-	do {
-		/*
-		 * Lookup failure means no vma is above this address,
-		 * i.e. return with success:
-		 */
-		if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
-			return addr;
-
-		/*
-		 * new region fits between prev_vma->vm_end and
-		 * vma->vm_start, use it:
-		 */
-		if (addr+len <= vma->vm_start &&
-				(!prev_vma || (addr >= prev_vma->vm_end)))
-			/* remember the address as a hint for next time */
-			return (mm->free_area_cache = addr);
-		else
-			/* pull free_area_cache down to the first hole */
-			if (mm->free_area_cache == vma->vm_end)
-				mm->free_area_cache = vma->vm_start;
-
-		/* try just below the current vma->vm_start */
-		addr = vma->vm_start-len;
-	} while (len <= vma->vm_start);
-
-fail:
-	/*
-	 * if hint left us with no space for the requested
-	 * mapping then try again:
-	 */
-	if (first_time) {
-		mm->free_area_cache = base;
-		first_time = 0;
-		goto try_again;
-	}
-	/*
-	 * A failed mmap() very likely causes application failure,
-	 * so fall back to the bottom-up function here. This scenario
-	 * can happen with large stack limits and large mmap()
-	 * allocations.
-	 */
-	mm->free_area_cache = TASK_UNMAPPED_BASE;
-	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
-	/*
-	 * Restore the topdown base:
-	 */
-	mm->free_area_cache = base;
-
-	return addr;
-}
-#endif
-
-void arch_unmap_area_topdown(struct vm_area_struct *area)
-{
-	/*
-	 * Is this a new hole at the highest possible address?
-	 */
-	if (area->vm_end > area->vm_mm->free_area_cache)
-		area->vm_mm->free_area_cache = area->vm_end;
-}
-
-unsigned long
-get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
-		unsigned long pgoff, unsigned long flags)
-{
-	if (flags & MAP_FIXED) {
-		unsigned long ret;
-
-		if (addr > TASK_SIZE - len)
-			return -ENOMEM;
-		if (addr & ~PAGE_MASK)
-			return -EINVAL;
-		if (file && is_file_hugepages(file)) {
-			/*
-			 * Check if the given range is hugepage aligned, and
-			 * can be made suitable for hugepages.
-			 */
-			ret = prepare_hugepage_range(addr, len);
-		} else {
-			/*
-			 * Ensure that a normal request is not falling in a
-			 * reserved hugepage range.  For some archs like IA-64,
-			 * there is a separate region for hugepages.
- */ - ret = is_hugepage_only_range(addr, len); - } - if (ret) - return -EINVAL; - return addr; - } - - if (file && file->f_op && file->f_op->get_unmapped_area) - return file->f_op->get_unmapped_area(file, addr, len, - pgoff, flags); - - return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); -} - -EXPORT_SYMBOL(get_unmapped_area); - -/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ -struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr) -{ - struct vm_area_struct *vma = NULL; - - if (mm) { - /* Check the cache first. */ - /* (Cache hit rate is typically around 35%.) */ - vma = mm->mmap_cache; - if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { - struct rb_node * rb_node; - - rb_node = mm->mm_rb.rb_node; - vma = NULL; - - while (rb_node) { - struct vm_area_struct * vma_tmp; - - vma_tmp = rb_entry(rb_node, - struct vm_area_struct, vm_rb); - - if (vma_tmp->vm_end > addr) { - vma = vma_tmp; - if (vma_tmp->vm_start <= addr) - break; - rb_node = rb_node->rb_left; - } else - rb_node = rb_node->rb_right; - } - if (vma) - mm->mmap_cache = vma; - } - } - return vma; -} - -EXPORT_SYMBOL(find_vma); - -/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */ -struct vm_area_struct * -find_vma_prev(struct mm_struct *mm, unsigned long addr, - struct vm_area_struct **pprev) -{ - struct vm_area_struct *vma = NULL, *prev = NULL; - struct rb_node * rb_node; - if (!mm) - goto out; - - /* Guard against addr being lower than the first VMA */ - vma = mm->mmap; - - /* Go through the RB tree quickly. */ - rb_node = mm->mm_rb.rb_node; - - while (rb_node) { - struct vm_area_struct *vma_tmp; - vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); - - if (addr < vma_tmp->vm_end) { - rb_node = rb_node->rb_left; - } else { - prev = vma_tmp; - if (!prev->vm_next || (addr < prev->vm_next->vm_end)) - break; - rb_node = rb_node->rb_right; - } - } - -out: - *pprev = prev; - return prev ? prev->vm_next : vma; -} - -/* - * Verify that the stack growth is acceptable and - * update accounting. This is shared with both the - * grow-up and grow-down cases. - */ -static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow) -{ - struct mm_struct *mm = vma->vm_mm; - struct rlimit *rlim = current->signal->rlim; - - /* address space limit tests */ - if (mm->total_vm + grow > rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT) - return -ENOMEM; - - /* Stack limit test */ - if (size > rlim[RLIMIT_STACK].rlim_cur) - return -ENOMEM; - - /* mlock limit tests */ - if (vma->vm_flags & VM_LOCKED) { - unsigned long locked; - unsigned long limit; - locked = mm->locked_vm + grow; - limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; - if (locked > limit && !capable(CAP_IPC_LOCK)) - return -ENOMEM; - } - - /* - * Overcommit.. This must be the final test, as it will - * update security statistics. - */ - if (security_vm_enough_memory(grow)) - return -ENOMEM; - - /* Ok, everything looks good - let it rip */ - mm->total_vm += grow; - if (vma->vm_flags & VM_LOCKED) - mm->locked_vm += grow; - __vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); - acct_update_integrals(); - update_mem_hiwater(); - return 0; -} - -#ifdef CONFIG_STACK_GROWSUP -/* - * vma is the first one with address > vma->vm_end. Have to extend vma. 
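- *
- * (Illustrative numbers: a fault 100 bytes past vm_end rounds the target
- *  up to the next page boundary, so grow is one page; acct_stack_growth()
- *  above then charges that page against RLIMIT_AS, RLIMIT_STACK and, for
- *  VM_LOCKED stacks, RLIMIT_MEMLOCK before vm_end is actually moved.)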
- */ -int expand_stack(struct vm_area_struct * vma, unsigned long address) -{ - int error; - - if (!(vma->vm_flags & VM_GROWSUP)) - return -EFAULT; - - /* - * We must make sure the anon_vma is allocated - * so that the anon_vma locking is not a noop. - */ - if (unlikely(anon_vma_prepare(vma))) - return -ENOMEM; - anon_vma_lock(vma); - - /* - * vma->vm_start/vm_end cannot change under us because the caller - * is required to hold the mmap_sem in read mode. We need the - * anon_vma lock to serialize against concurrent expand_stacks. - */ - address += 4 + PAGE_SIZE - 1; - address &= PAGE_MASK; - error = 0; - - /* Somebody else might have raced and expanded it already */ - if (address > vma->vm_end) { - unsigned long size, grow; - - size = address - vma->vm_start; - grow = (address - vma->vm_end) >> PAGE_SHIFT; - - error = acct_stack_growth(vma, size, grow); - if (!error) - vma->vm_end = address; - } - anon_vma_unlock(vma); - return error; -} - -struct vm_area_struct * -find_extend_vma(struct mm_struct *mm, unsigned long addr) -{ - struct vm_area_struct *vma, *prev; - - addr &= PAGE_MASK; - vma = find_vma_prev(mm, addr, &prev); - if (vma && (vma->vm_start <= addr)) - return vma; - if (!prev || expand_stack(prev, addr)) - return NULL; - if (prev->vm_flags & VM_LOCKED) { - make_pages_present(addr, prev->vm_end); - } - return prev; -} -#else -/* - * vma is the first one with address < vma->vm_start. Have to extend vma. - */ -int expand_stack(struct vm_area_struct *vma, unsigned long address) -{ - int error; - - /* - * We must make sure the anon_vma is allocated - * so that the anon_vma locking is not a noop. - */ - if (unlikely(anon_vma_prepare(vma))) - return -ENOMEM; - anon_vma_lock(vma); - - /* - * vma->vm_start/vm_end cannot change under us because the caller - * is required to hold the mmap_sem in read mode. We need the - * anon_vma lock to serialize against concurrent expand_stacks. - */ - address &= PAGE_MASK; - error = 0; - - /* Somebody else might have raced and expanded it already */ - if (address < vma->vm_start) { - unsigned long size, grow; - - size = vma->vm_end - address; - grow = (vma->vm_start - address) >> PAGE_SHIFT; - - error = acct_stack_growth(vma, size, grow); - if (!error) { - vma->vm_start = address; - vma->vm_pgoff -= grow; - } - } - anon_vma_unlock(vma); - return error; -} - -struct vm_area_struct * -find_extend_vma(struct mm_struct * mm, unsigned long addr) -{ - struct vm_area_struct * vma; - unsigned long start; - - addr &= PAGE_MASK; - vma = find_vma(mm,addr); - if (!vma) - return NULL; - if (vma->vm_start <= addr) - return vma; - if (!(vma->vm_flags & VM_GROWSDOWN)) - return NULL; - start = vma->vm_start; - if (expand_stack(vma, addr)) - return NULL; - if (vma->vm_flags & VM_LOCKED) { - make_pages_present(addr, start); - } - return vma; -} -#endif - -/* - * Try to free as many page directory entries as we can, - * without having to work very hard at actually scanning - * the page tables themselves. - * - * Right now we try to free page tables if we have a nice - * PGDIR-aligned area that got free'd up. We could be more - * granular if we want to, but this is fast and simple, - * and covers the bad cases. - * - * "prev", if it exists, points to a vma before the one - * we just free'd - but there's no telling how much before. 
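- *
- * (For example: freeing [start,end) first widens the range to PGDIR
- *  boundaries, then clips it back against the neighbouring vmas, so only
- *  page-table pages that cover no remaining mapping are torn down.)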
- */ -static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev, - unsigned long start, unsigned long end) -{ - unsigned long first = start & PGDIR_MASK; - unsigned long last = end + PGDIR_SIZE - 1; - struct mm_struct *mm = tlb->mm; - - if (last > MM_VM_SIZE(mm) || last < end) - last = MM_VM_SIZE(mm); - - if (!prev) { - prev = mm->mmap; - if (!prev) - goto no_mmaps; - if (prev->vm_end > start) { - if (last > prev->vm_start) - last = prev->vm_start; - goto no_mmaps; - } - } - for (;;) { - struct vm_area_struct *next = prev->vm_next; - - if (next) { - if (next->vm_start < start) { - prev = next; - continue; - } - if (last > next->vm_start) - last = next->vm_start; - } - if (prev->vm_end > first) - first = prev->vm_end; - break; - } -no_mmaps: - if (last < first) /* for arches with discontiguous pgd indices */ - return; - if (first < FIRST_USER_PGD_NR * PGDIR_SIZE) - first = FIRST_USER_PGD_NR * PGDIR_SIZE; - /* No point trying to free anything if we're in the same pte page */ - if ((first & PMD_MASK) < (last & PMD_MASK)) { - clear_page_range(tlb, first, last); - flush_tlb_pgtables(mm, first, last); - } -} - -/* Normal function to fix up a mapping - * This function is the default for when an area has no specific - * function. This may be used as part of a more specific routine. - * - * By the time this function is called, the area struct has been - * removed from the process mapping list. - */ -static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area) -{ - size_t len = area->vm_end - area->vm_start; - - area->vm_mm->total_vm -= len >> PAGE_SHIFT; - if (area->vm_flags & VM_LOCKED) - area->vm_mm->locked_vm -= len >> PAGE_SHIFT; - vm_stat_unaccount(area); - area->vm_mm->unmap_area(area); - remove_vm_struct(area); -} - -/* - * Update the VMA and inode share lists. - * - * Ok - we have the memory areas we should free on the 'free' list, - * so release them, and do the vma updates. - */ -static void unmap_vma_list(struct mm_struct *mm, - struct vm_area_struct *mpnt) -{ - do { - struct vm_area_struct *next = mpnt->vm_next; - unmap_vma(mm, mpnt); - mpnt = next; - } while (mpnt != NULL); - validate_mm(mm); -} - -/* - * Get rid of page table information in the indicated region. - * - * Called with the page table lock held. - */ -static void unmap_region(struct mm_struct *mm, - struct vm_area_struct *vma, - struct vm_area_struct *prev, - unsigned long start, - unsigned long end) -{ - struct mmu_gather *tlb; - unsigned long nr_accounted = 0; - - lru_add_drain(); - tlb = tlb_gather_mmu(mm, 0); - unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); - vm_unacct_memory(nr_accounted); - - if (is_hugepage_only_range(start, end - start)) - hugetlb_free_pgtables(tlb, prev, start, end); - else - free_pgtables(tlb, prev, start, end); - tlb_finish_mmu(tlb, start, end); -} - -/* - * Create a list of vma's touched by the unmap, removing them from the mm's - * vma list as we go.. - */ -static void -detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, unsigned long end) -{ - struct vm_area_struct **insertion_point; - struct vm_area_struct *tail_vma = NULL; - - insertion_point = (prev ? &prev->vm_next : &mm->mmap); - do { - rb_erase(&vma->vm_rb, &mm->mm_rb); - mm->map_count--; - tail_vma = vma; - vma = vma->vm_next; - } while (vma && vma->vm_start < end); - *insertion_point = vma; - tail_vma->vm_next = NULL; - mm->mmap_cache = NULL; /* Kill the cache. 
 */
-}
-
-/*
- * Split a vma into two pieces at address 'addr'; a new vma is allocated
- * either for the first part or the tail.
- */
-int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
-	      unsigned long addr, int new_below)
-{
-	struct mempolicy *pol;
-	struct vm_area_struct *new;
-
-	if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK))
-		return -EINVAL;
-
-	if (mm->map_count >= sysctl_max_map_count)
-		return -ENOMEM;
-
-	new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
-	if (!new)
-		return -ENOMEM;
-
-	/* most fields are the same, copy all, and then fixup */
-	*new = *vma;
-
-	if (new_below)
-		new->vm_end = addr;
-	else {
-		new->vm_start = addr;
-		new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
-	}
-
-	pol = mpol_copy(vma_policy(vma));
-	if (IS_ERR(pol)) {
-		kmem_cache_free(vm_area_cachep, new);
-		return PTR_ERR(pol);
-	}
-	vma_set_policy(new, pol);
-
-	if (new->vm_file)
-		get_file(new->vm_file);
-
-	if (new->vm_ops && new->vm_ops->open)
-		new->vm_ops->open(new);
-
-	if (new_below)
-		vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
-			((addr - new->vm_start) >> PAGE_SHIFT), new);
-	else
-		vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
-
-	return 0;
-}
-
-/* Munmap is split into 2 main parts -- this part which finds
- * what needs doing, and the areas themselves, which do the
- * work.  This now handles partial unmappings.
- * Jeremy Fitzhardinge <jeremy@xxxxxxxx>
- */
-int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
-{
-	unsigned long end;
-	struct vm_area_struct *mpnt, *prev, *last;
-
-	if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
-		return -EINVAL;
-
-	if ((len = PAGE_ALIGN(len)) == 0)
-		return -EINVAL;
-
-	/* Find the first overlapping VMA */
-	mpnt = find_vma_prev(mm, start, &prev);
-	if (!mpnt)
-		return 0;
-	/* we have  start < mpnt->vm_end  */
-
-	/* if it doesn't overlap, we have nothing.. */
-	end = start + len;
-	if (mpnt->vm_start >= end)
-		return 0;
-
-	/*
-	 * If we need to split any vma, do it now to save pain later.
-	 *
-	 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
-	 * unmapped vm_area_struct will remain in use: so lower split_vma
-	 * places tmp vma above, and higher split_vma places tmp vma below.
-	 */
-	if (start > mpnt->vm_start) {
-		int error = split_vma(mm, mpnt, start, 0);
-		if (error)
-			return error;
-		prev = mpnt;
-	}
-
-	/* Does it split the last one? */
-	last = find_vma(mm, end);
-	if (last && end > last->vm_start) {
-		int error = split_vma(mm, last, end, 1);
-		if (error)
-			return error;
-	}
-	mpnt = prev? prev->vm_next: mm->mmap;
-
-	/*
-	 * Remove the vma's, and unmap the actual pages
-	 */
-	detach_vmas_to_be_unmapped(mm, mpnt, prev, end);
-	spin_lock(&mm->page_table_lock);
-	unmap_region(mm, mpnt, prev, start, end);
-	spin_unlock(&mm->page_table_lock);
-
-	/* Fix up all other VM information */
-	unmap_vma_list(mm, mpnt);
-
-	return 0;
-}
-
-EXPORT_SYMBOL(do_munmap);
-
-asmlinkage long sys_munmap(unsigned long addr, size_t len)
-{
-	int ret;
-	struct mm_struct *mm = current->mm;
-
-	profile_munmap(addr);
-
-	down_write(&mm->mmap_sem);
-	ret = do_munmap(mm, addr, len);
-	up_write(&mm->mmap_sem);
-	return ret;
-}
-
-static inline void verify_mm_writelocked(struct mm_struct *mm)
-{
-#ifdef CONFIG_DEBUG_KERNEL
-	if (unlikely(down_read_trylock(&mm->mmap_sem))) {
-		WARN_ON(1);
-		up_read(&mm->mmap_sem);
-	}
-#endif
-}
-
-/*
- * this is really a simplified "do_mmap".  it only handles
- * anonymous maps.
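- *
- * (An illustrative caller, following the sys_brk() pattern near the top
- *  of this file; a sketch only, with the limit checks trimmed:
- *
- *	down_write(&mm->mmap_sem);
- *	if (do_brk(oldbrk, newbrk - oldbrk) == oldbrk)
- *		mm->brk = brk;
- *	up_write(&mm->mmap_sem);
- *
- *  do_brk() returns the start address on success.)
- *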
eventually we may be able to do some - * brk-specific accounting here. - */ -unsigned long do_brk(unsigned long addr, unsigned long len) -{ - struct mm_struct * mm = current->mm; - struct vm_area_struct * vma, * prev; - unsigned long flags; - struct rb_node ** rb_link, * rb_parent; - pgoff_t pgoff = addr >> PAGE_SHIFT; - - len = PAGE_ALIGN(len); - if (!len) - return addr; - - if ((addr + len) > TASK_SIZE || (addr + len) < addr) - return -EINVAL; - - /* - * mlock MCL_FUTURE? - */ - if (mm->def_flags & VM_LOCKED) { - unsigned long locked, lock_limit; - locked = mm->locked_vm << PAGE_SHIFT; - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; - locked += len; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - return -EAGAIN; - } - - /* - * mm->mmap_sem is required to protect against another thread - * changing the mappings in case we sleep. - */ - verify_mm_writelocked(mm); - - /* - * Clear old maps. this also does some error checking for us - */ - munmap_back: - vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); - if (vma && vma->vm_start < addr + len) { - if (do_munmap(mm, addr, len)) - return -ENOMEM; - goto munmap_back; - } - - /* Check against address space limits *after* clearing old maps... */ - if ((mm->total_vm << PAGE_SHIFT) + len - > current->signal->rlim[RLIMIT_AS].rlim_cur) - return -ENOMEM; - - if (mm->map_count > sysctl_max_map_count) - return -ENOMEM; - - if (security_vm_enough_memory(len >> PAGE_SHIFT)) - return -ENOMEM; - - flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - - /* Can we just expand an old private anonymous mapping? */ - if (vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL)) - goto out; - - /* - * create a vma struct for an anonymous mapping - */ - vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); - if (!vma) { - vm_unacct_memory(len >> PAGE_SHIFT); - return -ENOMEM; - } - memset(vma, 0, sizeof(*vma)); - - vma->vm_mm = mm; - vma->vm_start = addr; - vma->vm_end = addr + len; - vma->vm_pgoff = pgoff; - vma->vm_flags = flags; - vma->vm_page_prot = protection_map[flags & 0x0f]; - vma_link(mm, vma, prev, rb_link, rb_parent); -out: - mm->total_vm += len >> PAGE_SHIFT; - if (flags & VM_LOCKED) { - mm->locked_vm += len >> PAGE_SHIFT; - make_pages_present(addr, addr + len); - } - acct_update_integrals(); - update_mem_hiwater(); - return addr; -} - -EXPORT_SYMBOL(do_brk); - -/* Release all mmaps. */ -void exit_mmap(struct mm_struct *mm) -{ - struct mmu_gather *tlb; - struct vm_area_struct *vma; - unsigned long nr_accounted = 0; - -#ifdef arch_exit_mmap - arch_exit_mmap(mm); -#endif - - lru_add_drain(); - - spin_lock(&mm->page_table_lock); - - tlb = tlb_gather_mmu(mm, 1); - flush_cache_mm(mm); - /* Use ~0UL here to ensure all VMAs in the mm are unmapped */ - mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0, - ~0UL, &nr_accounted, NULL); - vm_unacct_memory(nr_accounted); - BUG_ON(mm->map_count); /* This is just debugging */ - clear_page_range(tlb, FIRST_USER_PGD_NR * PGDIR_SIZE, MM_VM_SIZE(mm)); - - tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm)); - - vma = mm->mmap; - mm->mmap = mm->mmap_cache = NULL; - mm->mm_rb = RB_ROOT; - mm->rss = 0; - mm->total_vm = 0; - mm->locked_vm = 0; - - spin_unlock(&mm->page_table_lock); - - /* - * Walk the list again, actually closing and freeing it - * without holding any MM locks. 
- */ - while (vma) { - struct vm_area_struct *next = vma->vm_next; - remove_vm_struct(vma); - vma = next; - } -} - -/* Insert vm structure into process list sorted by address - * and into the inode's i_mmap tree. If vm_file is non-NULL - * then i_mmap_lock is taken here. - */ -int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) -{ - struct vm_area_struct * __vma, * prev; - struct rb_node ** rb_link, * rb_parent; - - /* - * The vm_pgoff of a purely anonymous vma should be irrelevant - * until its first write fault, when page's anon_vma and index - * are set. But now set the vm_pgoff it will almost certainly - * end up with (unless mremap moves it elsewhere before that - * first wfault), so /proc/pid/maps tells a consistent story. - * - * By setting it to reflect the virtual start address of the - * vma, merges and splits can happen in a seamless way, just - * using the existing file pgoff checks and manipulations. - * Similarly in do_mmap_pgoff and in do_brk. - */ - if (!vma->vm_file) { - BUG_ON(vma->anon_vma); - vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; - } - __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent); - if (__vma && __vma->vm_start < vma->vm_end) - return -ENOMEM; - vma_link(mm, vma, prev, rb_link, rb_parent); - return 0; -} - -/* - * Copy the vma structure to a new location in the same mm, - * prior to moving page table entries, to effect an mremap move. - */ -struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, - unsigned long addr, unsigned long len, pgoff_t pgoff) -{ - struct vm_area_struct *vma = *vmap; - unsigned long vma_start = vma->vm_start; - struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *new_vma, *prev; - struct rb_node **rb_link, *rb_parent; - struct mempolicy *pol; - - /* - * If anonymous vma has not yet been faulted, update new pgoff - * to match new location, to increase its chance of merging. - */ - if (!vma->vm_file && !vma->anon_vma) - pgoff = addr >> PAGE_SHIFT; - - find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); - new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); - if (new_vma) { - /* - * Source vma may have been merged into new_vma - */ - if (vma_start >= new_vma->vm_start && - vma_start < new_vma->vm_end) - *vmap = new_vma; - } else { - new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); - if (new_vma) { - *new_vma = *vma; - pol = mpol_copy(vma_policy(vma)); - if (IS_ERR(pol)) { - kmem_cache_free(vm_area_cachep, new_vma); - return NULL; - } - vma_set_policy(new_vma, pol); - new_vma->vm_start = addr; - new_vma->vm_end = addr + len; - new_vma->vm_pgoff = pgoff; - if (new_vma->vm_file) - get_file(new_vma->vm_file); - if (new_vma->vm_ops && new_vma->vm_ops->open) - new_vma->vm_ops->open(new_vma); - vma_link(mm, new_vma, prev, rb_link, rb_parent); - } - } - return new_vma; -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/mm/page_alloc.c --- a/linux-2.6.11-xen-sparse/mm/page_alloc.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,2157 +0,0 @@ -/* - * linux/mm/page_alloc.c - * - * Manages the free list, the system allocates free pages here. 
- * Note that kmalloc() lives in slab.c - * - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * Swap reorganised 29.12.95, Stephen Tweedie - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 - * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 - * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 - * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 - * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 - * (lots of bits borrowed from Ingo Molnar & Andrew Morton) - */ - -#include <linux/config.h> -#include <linux/stddef.h> -#include <linux/mm.h> -#include <linux/swap.h> -#include <linux/interrupt.h> -#include <linux/pagemap.h> -#include <linux/bootmem.h> -#include <linux/compiler.h> -#include <linux/module.h> -#include <linux/suspend.h> -#include <linux/pagevec.h> -#include <linux/blkdev.h> -#include <linux/slab.h> -#include <linux/notifier.h> -#include <linux/topology.h> -#include <linux/sysctl.h> -#include <linux/cpu.h> -#include <linux/nodemask.h> -#include <linux/vmalloc.h> - -#include <asm/tlbflush.h> -#include "internal.h" - -/* MCD - HACK: Find somewhere to initialize this EARLY, or make this initializer cleaner */ -nodemask_t node_online_map = { { [0] = 1UL } }; -nodemask_t node_possible_map = NODE_MASK_ALL; -struct pglist_data *pgdat_list; -unsigned long totalram_pages; -unsigned long totalhigh_pages; -long nr_swap_pages; -/* - * results with 256, 32 in the lowmem_reserve sysctl: - * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) - * 1G machine -> (16M dma, 784M normal, 224M high) - * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA - * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL - * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA - */ -int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 }; - -EXPORT_SYMBOL(totalram_pages); -EXPORT_SYMBOL(nr_swap_pages); - -/* - * Used by page_zone() to look up the address of the struct zone whose - * id is encoded in the upper bits of page->flags - */ -struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)]; -EXPORT_SYMBOL(zone_table); - -static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; -int min_free_kbytes = 1024; - -unsigned long __initdata nr_kernel_pages; -unsigned long __initdata nr_all_pages; - -/* - * Temporary debugging check for pages not lying within a given zone. 
- */
-static int bad_range(struct zone *zone, struct page *page)
-{
-	if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages)
-		return 1;
-	if (page_to_pfn(page) < zone->zone_start_pfn)
-		return 1;
-#ifdef CONFIG_HOLES_IN_ZONE
-	if (!pfn_valid(page_to_pfn(page)))
-		return 1;
-#endif
-	if (zone != page_zone(page))
-		return 1;
-	return 0;
-}
-
-static void bad_page(const char *function, struct page *page)
-{
-	printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
-		function, current->comm, page);
-	printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
-		(int)(2*sizeof(page_flags_t)), (unsigned long)page->flags,
-		page->mapping, page_mapcount(page), page_count(page));
-	printk(KERN_EMERG "Backtrace:\n");
-	dump_stack();
-	printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
-	page->flags &= ~(1 << PG_private	|
-			1 << PG_locked	|
-			1 << PG_lru	|
-			1 << PG_active	|
-			1 << PG_dirty	|
-			1 << PG_swapcache |
-			1 << PG_writeback);
-	set_page_count(page, 0);
-	reset_page_mapcount(page);
-	page->mapping = NULL;
-	tainted |= TAINT_BAD_PAGE;
-}
-
-#ifndef CONFIG_HUGETLB_PAGE
-#define prep_compound_page(page, order) do { } while (0)
-#define destroy_compound_page(page, order) do { } while (0)
-#else
-/*
- * Higher-order pages are called "compound pages".  They are structured thusly:
- *
- * The first PAGE_SIZE page is called the "head page".
- *
- * The remaining PAGE_SIZE pages are called "tail pages".
- *
- * All pages have PG_compound set.  All pages have their ->private pointing at
- * the head page (even the head page has this).
- *
- * The first tail page's ->mapping, if non-zero, holds the address of the
- * compound page's put_page() function.
- *
- * The order of the allocation is stored in the first tail page's ->index.
- * This is only for debug at present.  This usage means that zero-order pages
- * may not be compound.
- */
-static void prep_compound_page(struct page *page, unsigned long order)
-{
-	int i;
-	int nr_pages = 1 << order;
-
-	page[1].mapping = NULL;
-	page[1].index = order;
-	for (i = 0; i < nr_pages; i++) {
-		struct page *p = page + i;
-
-		SetPageCompound(p);
-		p->private = (unsigned long)page;
-	}
-}
-
-static void destroy_compound_page(struct page *page, unsigned long order)
-{
-	int i;
-	int nr_pages = 1 << order;
-
-	if (!PageCompound(page))
-		return;
-
-	if (page[1].index != order)
-		bad_page(__FUNCTION__, page);
-
-	for (i = 0; i < nr_pages; i++) {
-		struct page *p = page + i;
-
-		if (!PageCompound(p))
-			bad_page(__FUNCTION__, page);
-		if (p->private != (unsigned long)page)
-			bad_page(__FUNCTION__, page);
-		ClearPageCompound(p);
-	}
-}
-#endif		/* CONFIG_HUGETLB_PAGE */
-
-/*
- * Functions for dealing with a page's order in the buddy system.
- * zone->lock is already acquired when we use these.
- * So, we don't need atomic page->flags operations here.
- */
-static inline unsigned long page_order(struct page *page) {
-	return page->private;
-}
-
-static inline void set_page_order(struct page *page, int order) {
-	page->private = order;
-	__SetPagePrivate(page);
-}
-
-static inline void rmv_page_order(struct page *page)
-{
-	__ClearPagePrivate(page);
-	page->private = 0;
-}
-
-/*
- * This function checks whether a page is free && is the buddy;
- * we can coalesce a page and its buddy if
- * (a) the buddy is free &&
- * (b) the buddy is on the buddy system &&
- * (c) a page and its buddy have the same order.
- * for recording page's order, we use page->private and PG_private.
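- *
- * (A worked example, not from the original comment: the buddy of a block
- *  is found by toggling one bit of its index, so at order 2 the buddy of
- *  page_idx 8 is 8 ^ (1 << 2) = 12; when the two merge, page_idx &=
- *  buddy_idx yields 8, the start of the combined order-3 block, exactly
- *  as the freeing loop below computes.)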
- *
- */
-static inline int page_is_buddy(struct page *page, int order)
-{
-	if (PagePrivate(page)           &&
-	    (page_order(page) == order) &&
-	    !PageReserved(page)         &&
-	     page_count(page) == 0)
-		return 1;
-	return 0;
-}
-
-/*
- * Freeing function for a buddy system allocator.
- *
- * The concept of a buddy system is to maintain a direct-mapped table
- * (containing bit values) for memory blocks of various "orders".
- * The bottom level table contains the map for the smallest allocatable
- * units of memory (here, pages), and each level above it describes
- * pairs of units from the levels below, hence, "buddies".
- * At a high level, all that happens here is marking the table entry
- * at the bottom level available, and propagating the changes upward
- * as necessary, plus some accounting needed to play nicely with other
- * parts of the VM system.
- * At each level, we keep a list of pages, which are heads of contiguous
- * free pages of length (1 << order) and marked with PG_private.  A page's
- * order is recorded in the page->private field.
- * So when we are allocating or freeing one, we can derive the state of the
- * other.  That is, if we allocate a small block, and both were
- * free, the remainder of the region must be split into blocks.
- * If a block is freed, and its buddy is also free, then this
- * triggers coalescing into a block of larger size.
- *
- * -- wli
- */
-
-static inline void __free_pages_bulk (struct page *page, struct page *base,
-		struct zone *zone, unsigned int order)
-{
-	unsigned long page_idx;
-	struct page *coalesced;
-	int order_size = 1 << order;
-
-	if (unlikely(order))
-		destroy_compound_page(page, order);
-
-	page_idx = page - base;
-
-	BUG_ON(page_idx & (order_size - 1));
-	BUG_ON(bad_range(zone, page));
-
-	zone->free_pages += order_size;
-	while (order < MAX_ORDER-1) {
-		struct free_area *area;
-		struct page *buddy;
-		int buddy_idx;
-
-		buddy_idx = (page_idx ^ (1 << order));
-		buddy = base + buddy_idx;
-		if (bad_range(zone, buddy))
-			break;
-		if (!page_is_buddy(buddy, order))
-			break;
-		/* Move the buddy up one level. */
-		list_del(&buddy->lru);
-		area = zone->free_area + order;
-		area->nr_free--;
-		rmv_page_order(buddy);
-		page_idx &= buddy_idx;
-		order++;
-	}
-	coalesced = base + page_idx;
-	set_page_order(coalesced, order);
-	list_add(&coalesced->lru, &zone->free_area[order].free_list);
-	zone->free_area[order].nr_free++;
-}
-
-static inline void free_pages_check(const char *function, struct page *page)
-{
-	if (	page_mapped(page) ||
-		page->mapping != NULL ||
-		page_count(page) != 0 ||
-		(page->flags & (
-			1 << PG_lru	|
-			1 << PG_private |
-			1 << PG_locked	|
-			1 << PG_active	|
-			1 << PG_reclaim	|
-			1 << PG_slab	|
-			1 << PG_swapcache |
-			1 << PG_writeback )))
-		bad_page(function, page);
-	if (PageDirty(page))
-		ClearPageDirty(page);
-}
-
-/*
- * Frees a list of pages.
- * Assumes all pages on list are in same zone, and of same order.
- * count is the number of pages to free, or 0 for all on the list.
- *
- * If the zone was previously in an "all pages pinned" state then look to
- * see if this freeing clears that state.
- *
- * And clear the zone's pages_scanned counter, to hold off the "all pages are
- * pinned" detection logic.
- */ -static int -free_pages_bulk(struct zone *zone, int count, - struct list_head *list, unsigned int order) -{ - unsigned long flags; - struct page *base, *page = NULL; - int ret = 0; - - base = zone->zone_mem_map; - spin_lock_irqsave(&zone->lock, flags); - zone->all_unreclaimable = 0; - zone->pages_scanned = 0; - while (!list_empty(list) && count--) { - page = list_entry(list->prev, struct page, lru); - /* have to delete it as __free_pages_bulk list manipulates */ - list_del(&page->lru); - __free_pages_bulk(page, base, zone, order); - ret++; - } - spin_unlock_irqrestore(&zone->lock, flags); - return ret; -} - -void __free_pages_ok(struct page *page, unsigned int order) -{ - LIST_HEAD(list); - int i; - - if (arch_free_page(page, order)) - return; - - mod_page_state(pgfree, 1 << order); - -#ifndef CONFIG_MMU - if (order > 0) - for (i = 1 ; i < (1 << order) ; ++i) - __put_page(page + i); -#endif - - for (i = 0 ; i < (1 << order) ; ++i) - free_pages_check(__FUNCTION__, page + i); - list_add(&page->lru, &list); - kernel_map_pages(page, 1<<order, 0); - free_pages_bulk(page_zone(page), 1, &list, order); -} - - -/* - * The order of subdivision here is critical for the IO subsystem. - * Please do not alter this order without good reasons and regression - * testing. Specifically, as large blocks of memory are subdivided, - * the order in which smaller blocks are delivered depends on the order - * they're subdivided in this function. This is the primary factor - * influencing the order in which pages are delivered to the IO - * subsystem according to empirical testing, and this is also justified - * by considering the behavior of a buddy system containing a single - * large block of memory acted on by a series of small allocations. - * This behavior is a critical factor in sglist merging's success. - * - * -- wli - */ -static inline struct page * -expand(struct zone *zone, struct page *page, - int low, int high, struct free_area *area) -{ - unsigned long size = 1 << high; - - while (high > low) { - area--; - high--; - size >>= 1; - BUG_ON(bad_range(zone, &page[size])); - list_add(&page[size].lru, &area->free_list); - area->nr_free++; - set_page_order(&page[size], high); - } - return page; -} - -void set_page_refs(struct page *page, int order) -{ -#ifdef CONFIG_MMU - set_page_count(page, 1); -#else - int i; - - /* - * We need to reference all the pages for this order, otherwise if - * anyone accesses one of the pages with (get/put) it will be freed. - * - eg: access_process_vm() - */ - for (i = 0; i < (1 << order); i++) - set_page_count(page + i, 1); -#endif /* CONFIG_MMU */ -} - -/* - * This page is about to be returned from the page allocator - */ -static void prep_new_page(struct page *page, int order) -{ - if (page->mapping || page_mapped(page) || - (page->flags & ( - 1 << PG_private | - 1 << PG_locked | - 1 << PG_lru | - 1 << PG_active | - 1 << PG_dirty | - 1 << PG_reclaim | - 1 << PG_swapcache | - 1 << PG_writeback ))) - bad_page(__FUNCTION__, page); - - page->flags &= ~(1 << PG_uptodate | 1 << PG_error | - 1 << PG_referenced | 1 << PG_arch_1 | - 1 << PG_checked | 1 << PG_mappedtodisk); - page->private = 0; - set_page_refs(page, order); - kernel_map_pages(page, 1 << order, 1); -} - -/* - * Do the hard work of removing an element from the buddy allocator. - * Call me with the zone->lock already held. 
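- *
- * (A worked example: if the smallest free block is order 3, an order-0
- *  request takes it, and expand() above hands one page to the caller
- *  while returning the remainder to the free lists as one order-2, one
- *  order-1 and one order-0 block.)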
- */ -static struct page *__rmqueue(struct zone *zone, unsigned int order) -{ - struct free_area * area; - unsigned int current_order; - struct page *page; - - for (current_order = order; current_order < MAX_ORDER; ++current_order) { - area = zone->free_area + current_order; - if (list_empty(&area->free_list)) - continue; - - page = list_entry(area->free_list.next, struct page, lru); - list_del(&page->lru); - rmv_page_order(page); - area->nr_free--; - zone->free_pages -= 1UL << order; - return expand(zone, page, order, current_order, area); - } - - return NULL; -} - -/* - * Obtain a specified number of elements from the buddy allocator, all under - * a single hold of the lock, for efficiency. Add them to the supplied list. - * Returns the number of new pages which were placed at *list. - */ -static int rmqueue_bulk(struct zone *zone, unsigned int order, - unsigned long count, struct list_head *list) -{ - unsigned long flags; - int i; - int allocated = 0; - struct page *page; - - spin_lock_irqsave(&zone->lock, flags); - for (i = 0; i < count; ++i) { - page = __rmqueue(zone, order); - if (page == NULL) - break; - allocated++; - list_add_tail(&page->lru, list); - } - spin_unlock_irqrestore(&zone->lock, flags); - return allocated; -} - -#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) -static void __drain_pages(unsigned int cpu) -{ - struct zone *zone; - int i; - - for_each_zone(zone) { - struct per_cpu_pageset *pset; - - pset = &zone->pageset[cpu]; - for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { - struct per_cpu_pages *pcp; - - pcp = &pset->pcp[i]; - pcp->count -= free_pages_bulk(zone, pcp->count, - &pcp->list, 0); - } - } -} -#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */ - -#ifdef CONFIG_PM - -void mark_free_pages(struct zone *zone) -{ - unsigned long zone_pfn, flags; - int order; - struct list_head *curr; - - if (!zone->spanned_pages) - return; - - spin_lock_irqsave(&zone->lock, flags); - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) - ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); - - for (order = MAX_ORDER - 1; order >= 0; --order) - list_for_each(curr, &zone->free_area[order].free_list) { - unsigned long start_pfn, i; - - start_pfn = page_to_pfn(list_entry(curr, struct page, lru)); - - for (i=0; i < (1<<order); i++) - SetPageNosaveFree(pfn_to_page(start_pfn+i)); - } - spin_unlock_irqrestore(&zone->lock, flags); -} - -/* - * Spill all of this CPU's per-cpu pages back into the buddy allocator. 
- */ -void drain_local_pages(void) -{ - unsigned long flags; - - local_irq_save(flags); - __drain_pages(smp_processor_id()); - local_irq_restore(flags); -} -#endif /* CONFIG_PM */ - -static void zone_statistics(struct zonelist *zonelist, struct zone *z) -{ -#ifdef CONFIG_NUMA - unsigned long flags; - int cpu; - pg_data_t *pg = z->zone_pgdat; - pg_data_t *orig = zonelist->zones[0]->zone_pgdat; - struct per_cpu_pageset *p; - - local_irq_save(flags); - cpu = smp_processor_id(); - p = &z->pageset[cpu]; - if (pg == orig) { - z->pageset[cpu].numa_hit++; - } else { - p->numa_miss++; - zonelist->zones[0]->pageset[cpu].numa_foreign++; - } - if (pg == NODE_DATA(numa_node_id())) - p->local_node++; - else - p->other_node++; - local_irq_restore(flags); -#endif -} - -/* - * Free a 0-order page - */ -static void FASTCALL(free_hot_cold_page(struct page *page, int cold)); -static void fastcall free_hot_cold_page(struct page *page, int cold) -{ - struct zone *zone = page_zone(page); - struct per_cpu_pages *pcp; - unsigned long flags; - - if (arch_free_page(page, 0)) - return; - - kernel_map_pages(page, 1, 0); - inc_page_state(pgfree); - if (PageAnon(page)) - page->mapping = NULL; - free_pages_check(__FUNCTION__, page); - pcp = &zone->pageset[get_cpu()].pcp[cold]; - local_irq_save(flags); - if (pcp->count >= pcp->high) - pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); - list_add(&page->lru, &pcp->list); - pcp->count++; - local_irq_restore(flags); - put_cpu(); -} - -void fastcall free_hot_page(struct page *page) -{ - free_hot_cold_page(page, 0); -} - -void fastcall free_cold_page(struct page *page) -{ - free_hot_cold_page(page, 1); -} - -static inline void prep_zero_page(struct page *page, int order, int gfp_flags) -{ - int i; - - BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); - for(i = 0; i < (1 << order); i++) - clear_highpage(page + i); -} - -/* - * Really, prep_compound_page() should be called from __rmqueue_bulk(). But - * we cheat by calling it from here, in the order > 0 path. Saves a branch - * or two. - */ -static struct page * -buffered_rmqueue(struct zone *zone, int order, int gfp_flags) -{ - unsigned long flags; - struct page *page = NULL; - int cold = !!(gfp_flags & __GFP_COLD); - - if (order == 0) { - struct per_cpu_pages *pcp; - - pcp = &zone->pageset[get_cpu()].pcp[cold]; - local_irq_save(flags); - if (pcp->count <= pcp->low) - pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list); - if (pcp->count) { - page = list_entry(pcp->list.next, struct page, lru); - list_del(&page->lru); - pcp->count--; - } - local_irq_restore(flags); - put_cpu(); - } - - if (page == NULL) { - spin_lock_irqsave(&zone->lock, flags); - page = __rmqueue(zone, order); - spin_unlock_irqrestore(&zone->lock, flags); - } - - if (page != NULL) { - BUG_ON(bad_range(zone, page)); - mod_page_state_zone(zone, pgalloc, 1 << order); - prep_new_page(page, order); - - if (gfp_flags & __GFP_ZERO) - prep_zero_page(page, order, gfp_flags); - - if (order && (gfp_flags & __GFP_COMP)) - prep_compound_page(page, order); - } - return page; -} - -/* - * Return 1 if free pages are above 'mark'. This takes into account the order - * of the allocation. 
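- *
- * A worked example, all numbers hypothetical: an order-2 request,
- * mark = 256 (gfp_high and can_try_harder clear), in a zone with
- * 1000 free pages of which 300 are free order-0 blocks and 100 are
- * free order-1 blocks:
- *
- *   free_pages = 1000 - (1 << 2) + 1 = 997 > 256 + lowmem_reserve
- *   o=0: free_pages = 997 - 300 = 697, min = 128 -> still above
- *   o=1: free_pages = 697 - 200 = 497, min = 64  -> still above
- *
- * so the check returns 1; had free_pages fallen to or below the
- * shrinking min at any step, it would return 0.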
- */ -int zone_watermark_ok(struct zone *z, int order, unsigned long mark, - int classzone_idx, int can_try_harder, int gfp_high) -{ - /* free_pages my go negative - that's OK */ - long min = mark, free_pages = z->free_pages - (1 << order) + 1; - int o; - - if (gfp_high) - min -= min / 2; - if (can_try_harder) - min -= min / 4; - - if (free_pages <= min + z->lowmem_reserve[classzone_idx]) - return 0; - for (o = 0; o < order; o++) { - /* At the next order, this order's pages become unavailable */ - free_pages -= z->free_area[o].nr_free << o; - - /* Require fewer higher order pages to be free */ - min >>= 1; - - if (free_pages <= min) - return 0; - } - return 1; -} - -/* - * This is the 'heart' of the zoned buddy allocator. - */ -struct page * fastcall -__alloc_pages(unsigned int gfp_mask, unsigned int order, - struct zonelist *zonelist) -{ - const int wait = gfp_mask & __GFP_WAIT; - struct zone **zones, *z; - struct page *page; - struct reclaim_state reclaim_state; - struct task_struct *p = current; - int i; - int classzone_idx; - int do_retry; - int can_try_harder; - int did_some_progress; - - might_sleep_if(wait); - - /* - * The caller may dip into page reserves a bit more if the caller - * cannot run direct reclaim, or is the caller has realtime scheduling - * policy - */ - can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait; - - zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ - - if (unlikely(zones[0] == NULL)) { - /* Should this ever happen?? */ - return NULL; - } - - classzone_idx = zone_idx(zones[0]); - - restart: - /* Go through the zonelist once, looking for a zone with enough free */ - for (i = 0; (z = zones[i]) != NULL; i++) { - - if (!zone_watermark_ok(z, order, z->pages_low, - classzone_idx, 0, 0)) - continue; - - page = buffered_rmqueue(z, order, gfp_mask); - if (page) - goto got_pg; - } - - for (i = 0; (z = zones[i]) != NULL; i++) - wakeup_kswapd(z, order); - - /* - * Go through the zonelist again. Let __GFP_HIGH and allocations - * coming from realtime tasks to go deeper into reserves - */ - for (i = 0; (z = zones[i]) != NULL; i++) { - if (!zone_watermark_ok(z, order, z->pages_min, - classzone_idx, can_try_harder, - gfp_mask & __GFP_HIGH)) - continue; - - page = buffered_rmqueue(z, order, gfp_mask); - if (page) - goto got_pg; - } - - /* This allocation should allow future memory freeing. */ - if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) && !in_interrupt()) { - /* go through the zonelist yet again, ignoring mins */ - for (i = 0; (z = zones[i]) != NULL; i++) { - page = buffered_rmqueue(z, order, gfp_mask); - if (page) - goto got_pg; - } - goto nopage; - } - - /* Atomic allocations - we can't balance anything */ - if (!wait) - goto nopage; - -rebalance: - cond_resched(); - - /* We now go into synchronous reclaim */ - p->flags |= PF_MEMALLOC; - reclaim_state.reclaimed_slab = 0; - p->reclaim_state = &reclaim_state; - - did_some_progress = try_to_free_pages(zones, gfp_mask, order); - - p->reclaim_state = NULL; - p->flags &= ~PF_MEMALLOC; - - cond_resched(); - - if (likely(did_some_progress)) { - /* - * Go through the zonelist yet one more time, keep - * very high watermark here, this is only to catch - * a parallel oom killing, we must fail if we're still - * under heavy pressure. 
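- *
- * (Recap of the ladder that leads here: a first pass at pages_low;
- * wake kswapd and retry at pages_min with __GFP_HIGH and
- * can_try_harder applied; ignore watermarks entirely for
- * PF_MEMALLOC/TIF_MEMDIE callers; and only then direct reclaim
- * followed by this recheck.)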
- */ - for (i = 0; (z = zones[i]) != NULL; i++) { - if (!zone_watermark_ok(z, order, z->pages_min, - classzone_idx, can_try_harder, - gfp_mask & __GFP_HIGH)) - continue; - - page = buffered_rmqueue(z, order, gfp_mask); - if (page) - goto got_pg; - } - } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { - /* - * Go through the zonelist yet one more time, keep - * very high watermark here, this is only to catch - * a parallel oom killing, we must fail if we're still - * under heavy pressure. - */ - for (i = 0; (z = zones[i]) != NULL; i++) { - if (!zone_watermark_ok(z, order, z->pages_high, - classzone_idx, 0, 0)) - continue; - - page = buffered_rmqueue(z, order, gfp_mask); - if (page) - goto got_pg; - } - - out_of_memory(gfp_mask); - goto restart; - } - - /* - * Don't let big-order allocations loop unless the caller explicitly - * requests that. Wait for some write requests to complete then retry. - * - * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order - * <= 3, but that may not be true in other implementations. - */ - do_retry = 0; - if (!(gfp_mask & __GFP_NORETRY)) { - if ((order <= 3) || (gfp_mask & __GFP_REPEAT)) - do_retry = 1; - if (gfp_mask & __GFP_NOFAIL) - do_retry = 1; - } - if (do_retry) { - blk_congestion_wait(WRITE, HZ/50); - goto rebalance; - } - -nopage: - if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { - printk(KERN_WARNING "%s: page allocation failure." - " order:%d, mode:0x%x\n", - p->comm, order, gfp_mask); - dump_stack(); - } - return NULL; -got_pg: - zone_statistics(zonelist, z); - return page; -} - -EXPORT_SYMBOL(__alloc_pages); - -/* - * Common helper functions. - */ -fastcall unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order) -{ - struct page * page; - page = alloc_pages(gfp_mask, order); - if (!page) - return 0; - return (unsigned long) page_address(page); -} - -EXPORT_SYMBOL(__get_free_pages); - -fastcall unsigned long get_zeroed_page(unsigned int gfp_mask) -{ - struct page * page; - - /* - * get_zeroed_page() returns a 32-bit address, which cannot represent - * a highmem page - */ - BUG_ON(gfp_mask & __GFP_HIGHMEM); - - page = alloc_pages(gfp_mask | __GFP_ZERO, 0); - if (page) - return (unsigned long) page_address(page); - return 0; -} - -EXPORT_SYMBOL(get_zeroed_page); - -void __pagevec_free(struct pagevec *pvec) -{ - int i = pagevec_count(pvec); - - while (--i >= 0) - free_hot_cold_page(pvec->pages[i], pvec->cold); -} - -fastcall void __free_pages(struct page *page, unsigned int order) -{ - if (!PageReserved(page) && put_page_testzero(page)) { - if (order == 0) - free_hot_page(page); - else - __free_pages_ok(page, order); - } -} - -EXPORT_SYMBOL(__free_pages); - -fastcall void free_pages(unsigned long addr, unsigned int order) -{ - if (addr != 0) { - BUG_ON(!virt_addr_valid((void *)addr)); - __free_pages(virt_to_page((void *)addr), order); - } -} - -EXPORT_SYMBOL(free_pages); - -/* - * Total amount of free (allocatable) RAM: - */ -unsigned int nr_free_pages(void) -{ - unsigned int sum = 0; - struct zone *zone; - - for_each_zone(zone) - sum += zone->free_pages; - - return sum; -} - -EXPORT_SYMBOL(nr_free_pages); - -#ifdef CONFIG_NUMA -unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) -{ - unsigned int i, sum = 0; - - for (i = 0; i < MAX_NR_ZONES; i++) - sum += pgdat->node_zones[i].free_pages; - - return sum; -} -#endif - -static unsigned int nr_free_zone_pages(int offset) -{ - pg_data_t *pgdat; - unsigned int sum = 0; - - for_each_pgdat(pgdat) { - struct zonelist *zonelist = pgdat->node_zonelists + 
offset; - struct zone **zonep = zonelist->zones; - struct zone *zone; - - for (zone = *zonep++; zone; zone = *zonep++) { - unsigned long size = zone->present_pages; - unsigned long high = zone->pages_high; - if (size > high) - sum += size - high; - } - } - - return sum; -} - -/* - * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL - */ -unsigned int nr_free_buffer_pages(void) -{ - return nr_free_zone_pages(GFP_USER & GFP_ZONEMASK); -} - -/* - * Amount of free RAM allocatable within all zones - */ -unsigned int nr_free_pagecache_pages(void) -{ - return nr_free_zone_pages(GFP_HIGHUSER & GFP_ZONEMASK); -} - -#ifdef CONFIG_HIGHMEM -unsigned int nr_free_highpages (void) -{ - pg_data_t *pgdat; - unsigned int pages = 0; - - for_each_pgdat(pgdat) - pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; - - return pages; -} -#endif - -#ifdef CONFIG_NUMA -static void show_node(struct zone *zone) -{ - printk("Node %d ", zone->zone_pgdat->node_id); -} -#else -#define show_node(zone) do { } while (0) -#endif - -/* - * Accumulate the page_state information across all CPUs. - * The result is unavoidably approximate - it can change - * during and after execution of this function. - */ -static DEFINE_PER_CPU(struct page_state, page_states) = {0}; - -atomic_t nr_pagecache = ATOMIC_INIT(0); -EXPORT_SYMBOL(nr_pagecache); -#ifdef CONFIG_SMP -DEFINE_PER_CPU(long, nr_pagecache_local) = 0; -#endif - -void __get_page_state(struct page_state *ret, int nr) -{ - int cpu = 0; - - memset(ret, 0, sizeof(*ret)); - - cpu = first_cpu(cpu_online_map); - while (cpu < NR_CPUS) { - unsigned long *in, *out, off; - - in = (unsigned long *)&per_cpu(page_states, cpu); - - cpu = next_cpu(cpu, cpu_online_map); - - if (cpu < NR_CPUS) - prefetch(&per_cpu(page_states, cpu)); - - out = (unsigned long *)ret; - for (off = 0; off < nr; off++) - *out++ += *in++; - } -} - -void get_page_state(struct page_state *ret) -{ - int nr; - - nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); - nr /= sizeof(unsigned long); - - __get_page_state(ret, nr + 1); -} - -void get_full_page_state(struct page_state *ret) -{ - __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long)); -} - -unsigned long __read_page_state(unsigned offset) -{ - unsigned long ret = 0; - int cpu; - - for_each_online_cpu(cpu) { - unsigned long in; - - in = (unsigned long)&per_cpu(page_states, cpu) + offset; - ret += *((unsigned long *)in); - } - return ret; -} - -void __mod_page_state(unsigned offset, unsigned long delta) -{ - unsigned long flags; - void* ptr; - - local_irq_save(flags); - ptr = &__get_cpu_var(page_states); - *(unsigned long*)(ptr + offset) += delta; - local_irq_restore(flags); -} - -EXPORT_SYMBOL(__mod_page_state); - -void __get_zone_counts(unsigned long *active, unsigned long *inactive, - unsigned long *free, struct pglist_data *pgdat) -{ - struct zone *zones = pgdat->node_zones; - int i; - - *active = 0; - *inactive = 0; - *free = 0; - for (i = 0; i < MAX_NR_ZONES; i++) { - *active += zones[i].nr_active; - *inactive += zones[i].nr_inactive; - *free += zones[i].free_pages; - } -} - -void get_zone_counts(unsigned long *active, - unsigned long *inactive, unsigned long *free) -{ - struct pglist_data *pgdat; - - *active = 0; - *inactive = 0; - *free = 0; - for_each_pgdat(pgdat) { - unsigned long l, m, n; - __get_zone_counts(&l, &m, &n, pgdat); - *active += l; - *inactive += m; - *free += n; - } -} - -void si_meminfo(struct sysinfo *val) -{ - val->totalram = totalram_pages; - val->sharedram = 0; - val->freeram = nr_free_pages(); - val->bufferram = 
nr_blockdev_pages(); -#ifdef CONFIG_HIGHMEM - val->totalhigh = totalhigh_pages; - val->freehigh = nr_free_highpages(); -#else - val->totalhigh = 0; - val->freehigh = 0; -#endif - val->mem_unit = PAGE_SIZE; -} - -EXPORT_SYMBOL(si_meminfo); - -#ifdef CONFIG_NUMA -void si_meminfo_node(struct sysinfo *val, int nid) -{ - pg_data_t *pgdat = NODE_DATA(nid); - - val->totalram = pgdat->node_present_pages; - val->freeram = nr_free_pages_pgdat(pgdat); - val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; - val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; - val->mem_unit = PAGE_SIZE; -} -#endif - -#define K(x) ((x) << (PAGE_SHIFT-10)) - -/* - * Show free area list (used inside shift_scroll-lock stuff) - * We also calculate the percentage fragmentation. We do this by counting the - * memory on each free list with the exception of the first item on the list. - */ -void show_free_areas(void) -{ - struct page_state ps; - int cpu, temperature; - unsigned long active; - unsigned long inactive; - unsigned long free; - struct zone *zone; - - for_each_zone(zone) { - show_node(zone); - printk("%s per-cpu:", zone->name); - - if (!zone->present_pages) { - printk(" empty\n"); - continue; - } else - printk("\n"); - - for (cpu = 0; cpu < NR_CPUS; ++cpu) { - struct per_cpu_pageset *pageset; - - if (!cpu_possible(cpu)) - continue; - - pageset = zone->pageset + cpu; - - for (temperature = 0; temperature < 2; temperature++) - printk("cpu %d %s: low %d, high %d, batch %d\n", - cpu, - temperature ? "cold" : "hot", - pageset->pcp[temperature].low, - pageset->pcp[temperature].high, - pageset->pcp[temperature].batch); - } - } - - get_page_state(&ps); - get_zone_counts(&active, &inactive, &free); - - printk("\nFree pages: %11ukB (%ukB HighMem)\n", - K(nr_free_pages()), - K(nr_free_highpages())); - - printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " - "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", - active, - inactive, - ps.nr_dirty, - ps.nr_writeback, - ps.nr_unstable, - nr_free_pages(), - ps.nr_slab, - ps.nr_mapped, - ps.nr_page_table_pages); - - for_each_zone(zone) { - int i; - - show_node(zone); - printk("%s" - " free:%lukB" - " min:%lukB" - " low:%lukB" - " high:%lukB" - " active:%lukB" - " inactive:%lukB" - " present:%lukB" - " pages_scanned:%lu" - " all_unreclaimable? %s" - "\n", - zone->name, - K(zone->free_pages), - K(zone->pages_min), - K(zone->pages_low), - K(zone->pages_high), - K(zone->nr_active), - K(zone->nr_inactive), - K(zone->present_pages), - zone->pages_scanned, - (zone->all_unreclaimable ? "yes" : "no") - ); - printk("lowmem_reserve[]:"); - for (i = 0; i < MAX_NR_ZONES; i++) - printk(" %lu", zone->lowmem_reserve[i]); - printk("\n"); - } - - for_each_zone(zone) { - unsigned long nr, flags, order, total = 0; - - show_node(zone); - printk("%s: ", zone->name); - if (!zone->present_pages) { - printk("empty\n"); - continue; - } - - spin_lock_irqsave(&zone->lock, flags); - for (order = 0; order < MAX_ORDER; order++) { - nr = zone->free_area[order].nr_free; - total += nr << order; - printk("%lu*%lukB ", nr, K(1UL) << order); - } - spin_unlock_irqrestore(&zone->lock, flags); - printk("= %lukB\n", K(total)); - } - - show_swap_cache_info(); -} - -/* - * Builds allocation fallback zone lists. 
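- *
- * The switch below deliberately falls through from the highest
- * usable zone downwards, so (for example) a GFP_KERNEL zonelist
- * comes out as { NORMAL, DMA } while a GFP_HIGHUSER one comes out
- * as { HIGHMEM, NORMAL, DMA }, each zone included only if it has
- * present pages.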
- */ -static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) -{ - switch (k) { - struct zone *zone; - default: - BUG(); - case ZONE_HIGHMEM: - zone = pgdat->node_zones + ZONE_HIGHMEM; - if (zone->present_pages) { -#ifndef CONFIG_HIGHMEM - BUG(); -#endif - zonelist->zones[j++] = zone; - } - case ZONE_NORMAL: - zone = pgdat->node_zones + ZONE_NORMAL; - if (zone->present_pages) - zonelist->zones[j++] = zone; - case ZONE_DMA: - zone = pgdat->node_zones + ZONE_DMA; - if (zone->present_pages) - zonelist->zones[j++] = zone; - } - - return j; -} - -#ifdef CONFIG_NUMA -#define MAX_NODE_LOAD (num_online_nodes()) -static int __initdata node_load[MAX_NUMNODES]; -/** - * find_next_best_node - find the next node that should appear in a given - * node's fallback list - * @node: node whose fallback list we're appending - * @used_node_mask: nodemask_t of already used nodes - * - * We use a number of factors to determine which is the next node that should - * appear on a given node's fallback list. The node should not have appeared - * already in @node's fallback list, and it should be the next closest node - * according to the distance array (which contains arbitrary distance values - * from each node to each node in the system), and should also prefer nodes - * with no CPUs, since presumably they'll have very little allocation pressure - * on them otherwise. - * It returns -1 if no node is found. - */ -static int __init find_next_best_node(int node, nodemask_t *used_node_mask) -{ - int i, n, val; - int min_val = INT_MAX; - int best_node = -1; - - for_each_online_node(i) { - cpumask_t tmp; - - /* Start from local node */ - n = (node+i) % num_online_nodes(); - - /* Don't want a node to appear more than once */ - if (node_isset(n, *used_node_mask)) - continue; - - /* Use the local node if we haven't already */ - if (!node_isset(node, *used_node_mask)) { - best_node = node; - break; - } - - /* Use the distance array to find the distance */ - val = node_distance(node, n); - - /* Give preference to headless and unused nodes */ - tmp = node_to_cpumask(n); - if (!cpus_empty(tmp)) - val += PENALTY_FOR_NODE_WITH_CPUS; - - /* Slight preference for less loaded node */ - val *= (MAX_NODE_LOAD*MAX_NUMNODES); - val += node_load[n]; - - if (val < min_val) { - min_val = val; - best_node = n; - } - } - - if (best_node >= 0) - node_set(best_node, *used_node_mask); - - return best_node; -} - -static void __init build_zonelists(pg_data_t *pgdat) -{ - int i, j, k, node, local_node; - int prev_node, load; - struct zonelist *zonelist; - nodemask_t used_mask; - - /* initialize zonelists */ - for (i = 0; i < GFP_ZONETYPES; i++) { - zonelist = pgdat->node_zonelists + i; - memset(zonelist, 0, sizeof(*zonelist)); - zonelist->zones[0] = NULL; - } - - /* NUMA-aware ordering of nodes */ - local_node = pgdat->node_id; - load = num_online_nodes(); - prev_node = local_node; - nodes_clear(used_mask); - while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { - /* - * We don't want to pressure a particular node. - * So adding penalty to the first node in same - * distance group to make it round-robin. 
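- *
- * (node_load[] is shared across all the per-node builds and
- * find_next_best_node() folds it into each candidate's score, so a
- * node that has just headed a distance group scores worse in the
- * builds that follow and an equidistant peer is picked first
- * instead.)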
- */ - if (node_distance(local_node, node) != - node_distance(local_node, prev_node)) - node_load[node] += load; - prev_node = node; - load--; - for (i = 0; i < GFP_ZONETYPES; i++) { - zonelist = pgdat->node_zonelists + i; - for (j = 0; zonelist->zones[j] != NULL; j++); - - k = ZONE_NORMAL; - if (i & __GFP_HIGHMEM) - k = ZONE_HIGHMEM; - if (i & __GFP_DMA) - k = ZONE_DMA; - - j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); - zonelist->zones[j] = NULL; - } - } -} - -#else /* CONFIG_NUMA */ - -static void __init build_zonelists(pg_data_t *pgdat) -{ - int i, j, k, node, local_node; - - local_node = pgdat->node_id; - for (i = 0; i < GFP_ZONETYPES; i++) { - struct zonelist *zonelist; - - zonelist = pgdat->node_zonelists + i; - memset(zonelist, 0, sizeof(*zonelist)); - - j = 0; - k = ZONE_NORMAL; - if (i & __GFP_HIGHMEM) - k = ZONE_HIGHMEM; - if (i & __GFP_DMA) - k = ZONE_DMA; - - j = build_zonelists_node(pgdat, zonelist, j, k); - /* - * Now we build the zonelist so that it contains the zones - * of all the other nodes. - * We don't want to pressure a particular node, so when - * building the zones for node N, we make sure that the - * zones coming right after the local ones are those from - * node N+1 (modulo N) - */ - for (node = local_node + 1; node < MAX_NUMNODES; node++) { - if (!node_online(node)) - continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); - } - for (node = 0; node < local_node; node++) { - if (!node_online(node)) - continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); - } - - zonelist->zones[j] = NULL; - } -} - -#endif /* CONFIG_NUMA */ - -void __init build_all_zonelists(void) -{ - int i; - - for_each_online_node(i) - build_zonelists(NODE_DATA(i)); - printk("Built %i zonelists\n", num_online_nodes()); -} - -/* - * Helper functions to size the waitqueue hash table. - * Essentially these want to choose hash table sizes sufficiently - * large so that collisions trying to wait on pages are rare. - * But in fact, the number of active page waitqueues on typical - * systems is ridiculously low, less than 200. So this is even - * conservative, even though it seems large. - * - * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to - * waitqueues, i.e. the size of the waitq table given the number of pages. - */ -#define PAGES_PER_WAITQUEUE 256 - -static inline unsigned long wait_table_size(unsigned long pages) -{ - unsigned long size = 1; - - pages /= PAGES_PER_WAITQUEUE; - - while (size < pages) - size <<= 1; - - /* - * Once we have dozens or even hundreds of threads sleeping - * on IO we've got bigger problems than wait queue collision. - * Limit the size of the wait table to a reasonable size. - */ - size = min(size, 4096UL); - - return max(size, 4UL); -} - -/* - * This is an integer logarithm so that shifts can be used later - * to extract the more random high bits from the multiplicative - * hash function before the remainder is taken. 
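- *
- * wait_table_size() always hands us a power of two between 4 and
- * 4096, so ffz(~size) is exact: e.g. a zone of a million pages
- * gives 1000000 / 256 = 3906, rounded up to 4096 entries, and
- * wait_table_bits(4096) == 12.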
- */ -static inline unsigned long wait_table_bits(unsigned long size) -{ - return ffz(~size); -} - -#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) - -static void __init calculate_zone_totalpages(struct pglist_data *pgdat, - unsigned long *zones_size, unsigned long *zholes_size) -{ - unsigned long realtotalpages, totalpages = 0; - int i; - - for (i = 0; i < MAX_NR_ZONES; i++) - totalpages += zones_size[i]; - pgdat->node_spanned_pages = totalpages; - - realtotalpages = totalpages; - if (zholes_size) - for (i = 0; i < MAX_NR_ZONES; i++) - realtotalpages -= zholes_size[i]; - pgdat->node_present_pages = realtotalpages; - printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); -} - - -/* - * Initially all pages are reserved - free ones are freed - * up by free_all_bootmem() once the early boot process is - * done. Non-atomic initialization, single-pass. - */ -void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, - unsigned long start_pfn) -{ - struct page *start = pfn_to_page(start_pfn); - struct page *page; - - for (page = start; page < (start + size); page++) { - set_page_zone(page, NODEZONE(nid, zone)); - set_page_count(page, 0); - reset_page_mapcount(page); - SetPageReserved(page); - INIT_LIST_HEAD(&page->lru); -#ifdef WANT_PAGE_VIRTUAL - /* The shift won't overflow because ZONE_NORMAL is below 4G. */ - if (!is_highmem_idx(zone)) - set_page_address(page, __va(start_pfn << PAGE_SHIFT)); -#endif - start_pfn++; - } -} - -void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, - unsigned long size) -{ - int order; - for (order = 0; order < MAX_ORDER ; order++) { - INIT_LIST_HEAD(&zone->free_area[order].free_list); - zone->free_area[order].nr_free = 0; - } -} - -#ifndef __HAVE_ARCH_MEMMAP_INIT -#define memmap_init(size, nid, zone, start_pfn) \ - memmap_init_zone((size), (nid), (zone), (start_pfn)) -#endif - -/* - * Set up the zone data structures: - * - mark all pages reserved - * - mark all memory queues empty - * - clear the memory bitmaps - */ -static void __init free_area_init_core(struct pglist_data *pgdat, - unsigned long *zones_size, unsigned long *zholes_size) -{ - unsigned long i, j; - const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); - int cpu, nid = pgdat->node_id; - unsigned long zone_start_pfn = pgdat->node_start_pfn; - - pgdat->nr_zones = 0; - init_waitqueue_head(&pgdat->kswapd_wait); - pgdat->kswapd_max_order = 0; - - for (j = 0; j < MAX_NR_ZONES; j++) { - struct zone *zone = pgdat->node_zones + j; - unsigned long size, realsize; - unsigned long batch; - - zone_table[NODEZONE(nid, j)] = zone; - realsize = size = zones_size[j]; - if (zholes_size) - realsize -= zholes_size[j]; - - if (j == ZONE_DMA || j == ZONE_NORMAL) - nr_kernel_pages += realsize; - nr_all_pages += realsize; - - zone->spanned_pages = size; - zone->present_pages = realsize; - zone->name = zone_names[j]; - spin_lock_init(&zone->lock); - spin_lock_init(&zone->lru_lock); - zone->zone_pgdat = pgdat; - zone->free_pages = 0; - - zone->temp_priority = zone->prev_priority = DEF_PRIORITY; - - /* - * The per-cpu-pages pools are set to around 1000th of the - * size of the zone. But no more than 1/4 of a meg - there's - * no point in going beyond the size of L2 cache. - * - * OK, so we don't know how big the cache is. So guess. 
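- *
- * A concrete (hypothetical) case, 4KiB pages, 256MiB zone:
- * 65536 pages / 1024 = 64; 64 pages is exactly 256KiB, so the cap
- * leaves it alone; /= 4 gives batch = 16, i.e. a hot list that
- * refills at 32 pages, spills at 96 and moves 16 at a time.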
- */ - batch = zone->present_pages / 1024; - if (batch * PAGE_SIZE > 256 * 1024) - batch = (256 * 1024) / PAGE_SIZE; - batch /= 4; /* We effectively *= 4 below */ - if (batch < 1) - batch = 1; - - for (cpu = 0; cpu < NR_CPUS; cpu++) { - struct per_cpu_pages *pcp; - - pcp = &zone->pageset[cpu].pcp[0]; /* hot */ - pcp->count = 0; - pcp->low = 2 * batch; - pcp->high = 6 * batch; - pcp->batch = 1 * batch; - INIT_LIST_HEAD(&pcp->list); - - pcp = &zone->pageset[cpu].pcp[1]; /* cold */ - pcp->count = 0; - pcp->low = 0; - pcp->high = 2 * batch; - pcp->batch = 1 * batch; - INIT_LIST_HEAD(&pcp->list); - } - printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", - zone_names[j], realsize, batch); - INIT_LIST_HEAD(&zone->active_list); - INIT_LIST_HEAD(&zone->inactive_list); - zone->nr_scan_active = 0; - zone->nr_scan_inactive = 0; - zone->nr_active = 0; - zone->nr_inactive = 0; - if (!size) - continue; - - /* - * The per-page waitqueue mechanism uses hashed waitqueues - * per zone. - */ - zone->wait_table_size = wait_table_size(size); - zone->wait_table_bits = - wait_table_bits(zone->wait_table_size); - zone->wait_table = (wait_queue_head_t *) - alloc_bootmem_node(pgdat, zone->wait_table_size - * sizeof(wait_queue_head_t)); - - for(i = 0; i < zone->wait_table_size; ++i) - init_waitqueue_head(zone->wait_table + i); - - pgdat->nr_zones = j+1; - - zone->zone_mem_map = pfn_to_page(zone_start_pfn); - zone->zone_start_pfn = zone_start_pfn; - - if ((zone_start_pfn) & (zone_required_alignment-1)) - printk(KERN_CRIT "BUG: wrong zone alignment, it will crash\n"); - - memmap_init(size, nid, j, zone_start_pfn); - - zone_start_pfn += size; - - zone_init_free_lists(pgdat, zone, zone->spanned_pages); - } -} - -void __init node_alloc_mem_map(struct pglist_data *pgdat) -{ - unsigned long size; - - size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); - pgdat->node_mem_map = alloc_bootmem_node(pgdat, size); -#ifndef CONFIG_DISCONTIGMEM - mem_map = contig_page_data.node_mem_map; -#endif -} - -void __init free_area_init_node(int nid, struct pglist_data *pgdat, - unsigned long *zones_size, unsigned long node_start_pfn, - unsigned long *zholes_size) -{ - pgdat->node_id = nid; - pgdat->node_start_pfn = node_start_pfn; - calculate_zone_totalpages(pgdat, zones_size, zholes_size); - - if (!pfn_to_page(node_start_pfn)) - node_alloc_mem_map(pgdat); - - free_area_init_core(pgdat, zones_size, zholes_size); -} - -#ifndef CONFIG_DISCONTIGMEM -static bootmem_data_t contig_bootmem_data; -struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; - -EXPORT_SYMBOL(contig_page_data); - -void __init free_area_init(unsigned long *zones_size) -{ - free_area_init_node(0, &contig_page_data, zones_size, - __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); -} -#endif - -#ifdef CONFIG_PROC_FS - -#include <linux/seq_file.h> - -static void *frag_start(struct seq_file *m, loff_t *pos) -{ - pg_data_t *pgdat; - loff_t node = *pos; - - for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next) - --node; - - return pgdat; -} - -static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) -{ - pg_data_t *pgdat = (pg_data_t *)arg; - - (*pos)++; - return pgdat->pgdat_next; -} - -static void frag_stop(struct seq_file *m, void *arg) -{ -} - -/* - * This walks the free areas for each zone. 
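- *
- * Each zone becomes one line of the seq file (this is what backs
- * /proc/buddyinfo): a count of free blocks at each order, e.g.
- * with invented counts:
- *
- *   Node 0, zone   Normal     12      7      3      1      0 ...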
- */ -static int frag_show(struct seq_file *m, void *arg) -{ - pg_data_t *pgdat = (pg_data_t *)arg; - struct zone *zone; - struct zone *node_zones = pgdat->node_zones; - unsigned long flags; - int order; - - for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { - if (!zone->present_pages) - continue; - - spin_lock_irqsave(&zone->lock, flags); - seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order < MAX_ORDER; ++order) - seq_printf(m, "%6lu ", zone->free_area[order].nr_free); - spin_unlock_irqrestore(&zone->lock, flags); - seq_putc(m, '\n'); - } - return 0; -} - -struct seq_operations fragmentation_op = { - .start = frag_start, - .next = frag_next, - .stop = frag_stop, - .show = frag_show, -}; - -static char *vmstat_text[] = { - "nr_dirty", - "nr_writeback", - "nr_unstable", - "nr_page_table_pages", - "nr_mapped", - "nr_slab", - - "pgpgin", - "pgpgout", - "pswpin", - "pswpout", - "pgalloc_high", - - "pgalloc_normal", - "pgalloc_dma", - "pgfree", - "pgactivate", - "pgdeactivate", - - "pgfault", - "pgmajfault", - "pgrefill_high", - "pgrefill_normal", - "pgrefill_dma", - - "pgsteal_high", - "pgsteal_normal", - "pgsteal_dma", - "pgscan_kswapd_high", - "pgscan_kswapd_normal", - - "pgscan_kswapd_dma", - "pgscan_direct_high", - "pgscan_direct_normal", - "pgscan_direct_dma", - "pginodesteal", - - "slabs_scanned", - "kswapd_steal", - "kswapd_inodesteal", - "pageoutrun", - "allocstall", - - "pgrotated", -}; - -static void *vmstat_start(struct seq_file *m, loff_t *pos) -{ - struct page_state *ps; - - if (*pos >= ARRAY_SIZE(vmstat_text)) - return NULL; - - ps = kmalloc(sizeof(*ps), GFP_KERNEL); - m->private = ps; - if (!ps) - return ERR_PTR(-ENOMEM); - get_full_page_state(ps); - ps->pgpgin /= 2; /* sectors -> kbytes */ - ps->pgpgout /= 2; - return (unsigned long *)ps + *pos; -} - -static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) -{ - (*pos)++; - if (*pos >= ARRAY_SIZE(vmstat_text)) - return NULL; - return (unsigned long *)m->private + *pos; -} - -static int vmstat_show(struct seq_file *m, void *arg) -{ - unsigned long *l = arg; - unsigned long off = l - (unsigned long *)m->private; - - seq_printf(m, "%s %lu\n", vmstat_text[off], *l); - return 0; -} - -static void vmstat_stop(struct seq_file *m, void *arg) -{ - kfree(m->private); - m->private = NULL; -} - -struct seq_operations vmstat_op = { - .start = vmstat_start, - .next = vmstat_next, - .stop = vmstat_stop, - .show = vmstat_show, -}; - -#endif /* CONFIG_PROC_FS */ - -#ifdef CONFIG_HOTPLUG_CPU -static int page_alloc_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - int cpu = (unsigned long)hcpu; - long *count; - unsigned long *src, *dest; - - if (action == CPU_DEAD) { - int i; - - /* Drain local pagecache count. */ - count = &per_cpu(nr_pagecache_local, cpu); - atomic_add(*count, &nr_pagecache); - *count = 0; - local_irq_disable(); - __drain_pages(cpu); - - /* Add dead cpu's page_states to our own. */ - dest = (unsigned long *)&__get_cpu_var(page_states); - src = (unsigned long *)&per_cpu(page_states, cpu); - - for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long); - i++) { - dest[i] += src[i]; - src[i] = 0; - } - - local_irq_enable(); - } - return NOTIFY_OK; -} -#endif /* CONFIG_HOTPLUG_CPU */ - -void __init page_alloc_init(void) -{ - hotcpu_notifier(page_alloc_cpu_notify, 0); -} - -/* - * setup_per_zone_lowmem_reserve - called whenever - * sysctl_lower_zone_reserve_ratio changes. 
Ensures that each zone - * has a correct pages reserved value, so an adequate number of - * pages are left in the zone after a successful __alloc_pages(). - */ -static void setup_per_zone_lowmem_reserve(void) -{ - struct pglist_data *pgdat; - int j, idx; - - for_each_pgdat(pgdat) { - for (j = 0; j < MAX_NR_ZONES; j++) { - struct zone * zone = pgdat->node_zones + j; - unsigned long present_pages = zone->present_pages; - - zone->lowmem_reserve[j] = 0; - - for (idx = j-1; idx >= 0; idx--) { - struct zone * lower_zone = pgdat->node_zones + idx; - - lower_zone->lowmem_reserve[j] = present_pages / sysctl_lowmem_reserve_ratio[idx]; - present_pages += lower_zone->present_pages; - } - } - } -} - -/* - * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures - * that the pages_{min,low,high} values for each zone are set correctly - * with respect to min_free_kbytes. - */ -static void setup_per_zone_pages_min(void) -{ - unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); - unsigned long lowmem_pages = 0; - struct zone *zone; - unsigned long flags; - - /* Calculate total number of !ZONE_HIGHMEM pages */ - for_each_zone(zone) { - if (!is_highmem(zone)) - lowmem_pages += zone->present_pages; - } - - for_each_zone(zone) { - spin_lock_irqsave(&zone->lru_lock, flags); - if (is_highmem(zone)) { - /* - * Often, highmem doesn't need to reserve any pages. - * But the pages_min/low/high values are also used for - * batching up page reclaim activity so we need a - * decent value here. - */ - int min_pages; - - min_pages = zone->present_pages / 1024; - if (min_pages < SWAP_CLUSTER_MAX) - min_pages = SWAP_CLUSTER_MAX; - if (min_pages > 128) - min_pages = 128; - zone->pages_min = min_pages; - } else { - /* if it's a lowmem zone, reserve a number of pages - * proportionate to the zone's size. - */ - zone->pages_min = (pages_min * zone->present_pages) / - lowmem_pages; - } - - /* - * When interpreting these watermarks, just keep in mind that: - * zone->pages_min == (zone->pages_min * 4) / 4; - */ - zone->pages_low = (zone->pages_min * 5) / 4; - zone->pages_high = (zone->pages_min * 6) / 4; - spin_unlock_irqrestore(&zone->lru_lock, flags); - } -} - -/* - * Initialise min_free_kbytes. - * - * For small machines we want it small (128k min). For large machines - * we want it large (64MB max). But it is not linear, because network - * bandwidth does not increase linearly with machine size. We use - * - * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: - * min_free_kbytes = sqrt(lowmem_kbytes * 16) - * - * which yields - * - * 16MB: 512k - * 32MB: 724k - * 64MB: 1024k - * 128MB: 1448k - * 256MB: 2048k - * 512MB: 2896k - * 1024MB: 4096k - * 2048MB: 5792k - * 4096MB: 8192k - * 8192MB: 11584k - * 16384MB: 16384k - */ -static int __init init_per_zone_pages_min(void) -{ - unsigned long lowmem_kbytes; - - lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); - - min_free_kbytes = int_sqrt(lowmem_kbytes * 16); - if (min_free_kbytes < 128) - min_free_kbytes = 128; - if (min_free_kbytes > 65536) - min_free_kbytes = 65536; - setup_per_zone_pages_min(); - setup_per_zone_lowmem_reserve(); - return 0; -} -module_init(init_per_zone_pages_min) - -/* - * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so - * that we can call two helper functions whenever min_free_kbytes - * changes. 
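- *
- * Following the table above: a machine with 1GiB of lowmem and
- * 4KiB pages boots with min_free_kbytes = 4096, i.e.
- * 4096 >> (PAGE_SHIFT - 10) = 1024 pages of pages_min spread
- * across the lowmem zones in proportion to their size, after which
- * each zone gets pages_low = pages_min * 5/4 and
- * pages_high = pages_min * 6/4.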
- */ -int min_free_kbytes_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) -{ - proc_dointvec(table, write, file, buffer, length, ppos); - setup_per_zone_pages_min(); - return 0; -} - -/* - * lowmem_reserve_ratio_sysctl_handler - just a wrapper around - * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() - * whenever sysctl_lowmem_reserve_ratio changes. - * - * The reserve ratio obviously has absolutely no relation with the - * pages_min watermarks. The lowmem reserve ratio can only make sense - * if in function of the boot time zone sizes. - */ -int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) -{ - proc_dointvec_minmax(table, write, file, buffer, length, ppos); - setup_per_zone_lowmem_reserve(); - return 0; -} - -__initdata int hashdist = HASHDIST_DEFAULT; - -#ifdef CONFIG_NUMA -static int __init set_hashdist(char *str) -{ - if (!str) - return 0; - hashdist = simple_strtoul(str, &str, 0); - return 1; -} -__setup("hashdist=", set_hashdist); -#endif - -/* - * allocate a large system hash table from bootmem - * - it is assumed that the hash table must contain an exact power-of-2 - * quantity of entries - * - limit is the number of hash buckets, not the total allocation size - */ -void *__init alloc_large_system_hash(const char *tablename, - unsigned long bucketsize, - unsigned long numentries, - int scale, - int flags, - unsigned int *_hash_shift, - unsigned int *_hash_mask, - unsigned long limit) -{ - unsigned long long max = limit; - unsigned long log2qty, size; - void *table = NULL; - - /* allow the kernel cmdline to have a say */ - if (!numentries) { - /* round applicable memory size up to nearest megabyte */ - numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages; - numentries += (1UL << (20 - PAGE_SHIFT)) - 1; - numentries >>= 20 - PAGE_SHIFT; - numentries <<= 20 - PAGE_SHIFT; - - /* limit to 1 bucket per 2^scale bytes of low memory */ - if (scale > PAGE_SHIFT) - numentries >>= (scale - PAGE_SHIFT); - else - numentries <<= (PAGE_SHIFT - scale); - } - /* rounded up to nearest power of 2 in size */ - numentries = 1UL << (long_log2(numentries) + 1); - - /* limit allocation size to 1/16 total memory by default */ - if (max == 0) { - max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; - do_div(max, bucketsize); - } - - if (numentries > max) - numentries = max; - - log2qty = long_log2(numentries); - - do { - size = bucketsize << log2qty; - if (flags & HASH_EARLY) - table = alloc_bootmem(size); - else if (hashdist) - table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); - else { - unsigned long order; - for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++) - ; - table = (void*) __get_free_pages(GFP_ATOMIC, order); - } - } while (!table && size > PAGE_SIZE && --log2qty); - - if (!table) - panic("Failed to allocate %s hash table\n", tablename); - - printk("%s hash table entries: %d (order: %d, %lu bytes)\n", - tablename, - (1U << log2qty), - long_log2(size) - PAGE_SHIFT, - size); - - if (_hash_shift) - *_hash_shift = log2qty; - if (_hash_mask) - *_hash_mask = (1 << log2qty) - 1; - - return table; -} diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/net/core/dev.c --- a/linux-2.6.11-xen-sparse/net/core/dev.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,3389 +0,0 @@ -/* - * NET3 Protocol independent device support routines. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Derived from the non IP parts of dev.c 1.0.19 - * Authors: Ross Biro, <bir7@xxxxxxxxxxxxxxxxxxx> - * Fred N. van Kempen, <waltje@xxxxxxxxxxxxxxxxxxx> - * Mark Evans, <evansmp@xxxxxxxxxxxxxxxxx> - * - * Additional Authors: - * Florian la Roche <rzsfl@xxxxxxxxxxxx> - * Alan Cox <gw4pts@xxxxxxxxxxxxxxx> - * David Hinds <dahinds@xxxxxxxxxxxxxxxxxxxxx> - * Alexey Kuznetsov <kuznet@xxxxxxxxxxxxx> - * Adam Sulmicki <adam@xxxxxxxxxxxx> - * Pekka Riikonen <priikone@xxxxxxxxxxxxxxxx> - * - * Changes: - * D.J. Barrow : Fixed bug where dev->refcnt gets set - * to 2 if register_netdev gets called - * before net_dev_init & also removed a - * few lines of code in the process. - * Alan Cox : device private ioctl copies fields back. - * Alan Cox : Transmit queue code does relevant - * stunts to keep the queue safe. - * Alan Cox : Fixed double lock. - * Alan Cox : Fixed promisc NULL pointer trap - * ???????? : Support the full private ioctl range - * Alan Cox : Moved ioctl permission check into - * drivers - * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI - * Alan Cox : 100 backlog just doesn't cut it when - * you start doing multicast video 8) - * Alan Cox : Rewrote net_bh and list manager. - * Alan Cox : Fix ETH_P_ALL echoback lengths. - * Alan Cox : Took out transmit every packet pass - * Saved a few bytes in the ioctl handler - * Alan Cox : Network driver sets packet type before - * calling netif_rx. Saves a function - * call a packet. - * Alan Cox : Hashed net_bh() - * Richard Kooijman: Timestamp fixes. - * Alan Cox : Wrong field in SIOCGIFDSTADDR - * Alan Cox : Device lock protection. - * Alan Cox : Fixed nasty side effect of device close - * changes. - * Rudi Cilibrasi : Pass the right thing to - * set_mac_address() - * Dave Miller : 32bit quantity for the device lock to - * make it work out on a Sparc. - * Bjorn Ekwall : Added KERNELD hack. - * Alan Cox : Cleaned up the backlog initialise. - * Craig Metz : SIOCGIFCONF fix if space for under - * 1 device. - * Thomas Bogendoerfer : Return ENODEV for dev_open, if there - * is no device open function. - * Andi Kleen : Fix error reporting for SIOCGIFCONF - * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF - * Cyrus Durgin : Cleaned for KMOD - * Adam Sulmicki : Bug Fix : Network Device Unload - * A network device unload needs to purge - * the backlog queue. 
- * Paul Rusty Russell : SIOCSIFNAME - * Pekka Riikonen : Netdev boot-time settings code - * Andrew Morton : Make unregister_netdevice wait - * indefinitely on dev->refcnt - * J Hadi Salim : - Backlog queue sampling - * - netif_rx() feedback - */ - -#include <asm/uaccess.h> -#include <asm/system.h> -#include <linux/bitops.h> -#include <linux/config.h> -#include <linux/cpu.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/errno.h> -#include <linux/interrupt.h> -#include <linux/if_ether.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/notifier.h> -#include <linux/skbuff.h> -#include <net/sock.h> -#include <linux/rtnetlink.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/stat.h> -#include <linux/if_bridge.h> -#include <linux/divert.h> -#include <net/dst.h> -#include <net/pkt_sched.h> -#include <net/checksum.h> -#include <linux/highmem.h> -#include <linux/init.h> -#include <linux/kmod.h> -#include <linux/module.h> -#include <linux/kallsyms.h> -#include <linux/netpoll.h> -#include <linux/rcupdate.h> -#include <linux/delay.h> -#ifdef CONFIG_NET_RADIO -#include <linux/wireless.h> /* Note : will define WIRELESS_EXT */ -#include <net/iw_handler.h> -#endif /* CONFIG_NET_RADIO */ -#include <asm/current.h> - -#include <net/ip.h> -#include <linux/tcp.h> -#include <linux/udp.h> - - -/* This define, if set, will randomly drop a packet when congestion - * is more than moderate. It helps fairness in the multi-interface - * case when one of them is a hog, but it kills performance for the - * single interface case so it is off now by default. - */ -#undef RAND_LIE - -/* Setting this will sample the queue lengths and thus congestion - * via a timer instead of as each packet is received. - */ -#undef OFFLINE_SAMPLE - -/* - * The list of packet types we will receive (as opposed to discard) - * and the routines to invoke. - * - * Why 16. Because with 16 the only overlap we get on a hash of the - * low nibble of the protocol value is RARP/SNAP/X.25. - * - * NOTE: That is no longer true with the addition of VLAN tags. Not - * sure which should go first, but I bet it won't make much - * difference if we are running VLANs. The good news is that - * this protocol won't be in the list unless compiled in, so - * the average user (w/out VLANs) will not be adversly affected. - * --BLG - * - * 0800 IP - * 8100 802.1Q VLAN - * 0001 802.3 - * 0002 AX.25 - * 0004 802.2 - * 8035 RARP - * 0005 SNAP - * 0805 X.25 - * 0806 ARP - * 8137 IPX - * 0009 Localtalk - * 86DD IPv6 - */ - -static DEFINE_SPINLOCK(ptype_lock); -static struct list_head ptype_base[16]; /* 16 way hashed list */ -static struct list_head ptype_all; /* Taps */ - -#ifdef OFFLINE_SAMPLE -static void sample_queue(unsigned long dummy); -static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0); -#endif - -/* - * The @dev_base list is protected by @dev_base_lock and the rtln - * semaphore. - * - * Pure readers hold dev_base_lock for reading. - * - * Writers must hold the rtnl semaphore while they loop through the - * dev_base list, and hold dev_base_lock for writing when they do the - * actual updates. This allows pure readers to access the list even - * while a writer is preparing to update it. 
- * - * To put it another way, dev_base_lock is held for writing only to - * protect against pure readers; the rtnl semaphore provides the - * protection against other writers. - * - * See, for example usages, register_netdevice() and - * unregister_netdevice(), which must be called with the rtnl - * semaphore held. - */ -struct net_device *dev_base; -static struct net_device **dev_tail = &dev_base; -DEFINE_RWLOCK(dev_base_lock); - -EXPORT_SYMBOL(dev_base); -EXPORT_SYMBOL(dev_base_lock); - -#define NETDEV_HASHBITS 8 -static struct hlist_head dev_name_head[1<<NETDEV_HASHBITS]; -static struct hlist_head dev_index_head[1<<NETDEV_HASHBITS]; - -static inline struct hlist_head *dev_name_hash(const char *name) -{ - unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); - return &dev_name_head[hash & ((1<<NETDEV_HASHBITS)-1)]; -} - -static inline struct hlist_head *dev_index_hash(int ifindex) -{ - return &dev_index_head[ifindex & ((1<<NETDEV_HASHBITS)-1)]; -} - -/* - * Our notifier list - */ - -static struct notifier_block *netdev_chain; - -/* - * Device drivers call our routines to queue packets here. We empty the - * queue in the local softnet handler. - */ -DEFINE_PER_CPU(struct softnet_data, softnet_data) = { 0, }; - -#ifdef CONFIG_SYSFS -extern int netdev_sysfs_init(void); -extern int netdev_register_sysfs(struct net_device *); -extern void netdev_unregister_sysfs(struct net_device *); -#else -#define netdev_sysfs_init() (0) -#define netdev_register_sysfs(dev) (0) -#define netdev_unregister_sysfs(dev) do { } while(0) -#endif - - -/******************************************************************************* - - Protocol management and registration routines - -*******************************************************************************/ - -/* - * For efficiency - */ - -int netdev_nit; - -/* - * Add a protocol ID to the list. Now that the input handler is - * smarter we can dispense with all the messy stuff that used to be - * here. - * - * BEWARE!!! Protocol handlers, mangling input packets, - * MUST BE last in hash buckets and checking protocol handlers - * MUST start from promiscuous ptype_all chain in net_bh. - * It is true now, do not change it. - * Explanation follows: if protocol handler, mangling packet, will - * be the first on list, it is not able to sense, that packet - * is cloned and should be copied-on-write, so that it will - * change it and subsequent readers will get broken packet. - * --ANK (980803) - */ - -/** - * dev_add_pack - add packet handler - * @pt: packet type declaration - * - * Add a protocol handler to the networking stack. The passed &packet_type - * is linked into kernel lists and may not be freed until it has been - * removed from the kernel lists. - * - * This call does not sleep therefore it can not - * guarantee all CPU's that are in middle of receiving packets - * will see the new packet type (until the next received packet). - */ - -void dev_add_pack(struct packet_type *pt) -{ - int hash; - - spin_lock_bh(&ptype_lock); - if (pt->type == htons(ETH_P_ALL)) { - netdev_nit++; - list_add_rcu(&pt->list, &ptype_all); - } else { - hash = ntohs(pt->type) & 15; - list_add_rcu(&pt->list, &ptype_base[hash]); - } - spin_unlock_bh(&ptype_lock); -} - -extern void linkwatch_run_queue(void); - - - -/** - * __dev_remove_pack - remove packet handler - * @pt: packet type declaration - * - * Remove a protocol handler that was previously added to the kernel - * protocol handlers by dev_add_pack(). 
The passed &packet_type is removed - * from the kernel lists and can be freed or reused once this function - * returns. - * - * The packet type might still be in use by receivers - * and must not be freed until after all the CPU's have gone - * through a quiescent state. - */ -void __dev_remove_pack(struct packet_type *pt) -{ - struct list_head *head; - struct packet_type *pt1; - - spin_lock_bh(&ptype_lock); - - if (pt->type == htons(ETH_P_ALL)) { - netdev_nit--; - head = &ptype_all; - } else - head = &ptype_base[ntohs(pt->type) & 15]; - - list_for_each_entry(pt1, head, list) { - if (pt == pt1) { - list_del_rcu(&pt->list); - goto out; - } - } - - printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt); -out: - spin_unlock_bh(&ptype_lock); -} -/** - * dev_remove_pack - remove packet handler - * @pt: packet type declaration - * - * Remove a protocol handler that was previously added to the kernel - * protocol handlers by dev_add_pack(). The passed &packet_type is removed - * from the kernel lists and can be freed or reused once this function - * returns. - * - * This call sleeps to guarantee that no CPU is looking at the packet - * type after return. - */ -void dev_remove_pack(struct packet_type *pt) -{ - __dev_remove_pack(pt); - - synchronize_net(); -} - -/****************************************************************************** - - Device Boot-time Settings Routines - -*******************************************************************************/ - -/* Boot time configuration table */ -static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; - -/** - * netdev_boot_setup_add - add new setup entry - * @name: name of the device - * @map: configured settings for the device - * - * Adds new setup entry to the dev_boot_setup list. The function - * returns 0 on error and 1 on success. This is a generic routine to - * all netdevices. - */ -static int netdev_boot_setup_add(char *name, struct ifmap *map) -{ - struct netdev_boot_setup *s; - int i; - - s = dev_boot_setup; - for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { - if (s[i].name[0] == '\0' || s[i].name[0] == ' ') { - memset(s[i].name, 0, sizeof(s[i].name)); - strcpy(s[i].name, name); - memcpy(&s[i].map, map, sizeof(s[i].map)); - break; - } - } - - return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1; -} - -/** - * netdev_boot_setup_check - check boot time settings - * @dev: the netdevice - * - * Check boot time settings for the device. - * The found settings are set for the device to be used - * later in the device probing. - * Returns 0 if no settings found, 1 if they are. - */ -int netdev_boot_setup_check(struct net_device *dev) -{ - struct netdev_boot_setup *s = dev_boot_setup; - int i; - - for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { - if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && - !strncmp(dev->name, s[i].name, strlen(s[i].name))) { - dev->irq = s[i].map.irq; - dev->base_addr = s[i].map.base_addr; - dev->mem_start = s[i].map.mem_start; - dev->mem_end = s[i].map.mem_end; - return 1; - } - } - return 0; -} - - -/** - * netdev_boot_base - get address from boot time settings - * @prefix: prefix for network device - * @unit: id for network device - * - * Check boot time settings for the base address of device. - * The found settings are set for the device to be used - * later in the device probing. - * Returns 0 if no settings found. 
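- *
- * Hypothetical example: booting with netdev=5,0x300,0,0,eth1
- * (parsed by netdev_boot_setup() below) records irq 5 and base
- * 0x300 under the name eth1; netdev_boot_base("eth", 1) then
- * returns 0x300, and a later probe of eth1 picks the values up via
- * netdev_boot_setup_check().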
- */ -unsigned long netdev_boot_base(const char *prefix, int unit) -{ - const struct netdev_boot_setup *s = dev_boot_setup; - char name[IFNAMSIZ]; - int i; - - sprintf(name, "%s%d", prefix, unit); - - /* - * If device already registered then return base of 1 - * to indicate not to probe for this interface - */ - if (__dev_get_by_name(name)) - return 1; - - for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) - if (!strcmp(name, s[i].name)) - return s[i].map.base_addr; - return 0; -} - -/* - * Saves at boot time configured settings for any netdevice. - */ -int __init netdev_boot_setup(char *str) -{ - int ints[5]; - struct ifmap map; - - str = get_options(str, ARRAY_SIZE(ints), ints); - if (!str || !*str) - return 0; - - /* Save settings */ - memset(&map, 0, sizeof(map)); - if (ints[0] > 0) - map.irq = ints[1]; - if (ints[0] > 1) - map.base_addr = ints[2]; - if (ints[0] > 2) - map.mem_start = ints[3]; - if (ints[0] > 3) - map.mem_end = ints[4]; - - /* Add new entry to the list */ - return netdev_boot_setup_add(str, &map); -} - -__setup("netdev=", netdev_boot_setup); - -/******************************************************************************* - - Device Interface Subroutines - -*******************************************************************************/ - -/** - * __dev_get_by_name - find a device by its name - * @name: name to find - * - * Find an interface by name. Must be called under RTNL semaphore - * or @dev_base_lock. If the name is found a pointer to the device - * is returned. If the name is not found then %NULL is returned. The - * reference counters are not incremented so the caller must be - * careful with locks. - */ - -struct net_device *__dev_get_by_name(const char *name) -{ - struct hlist_node *p; - - hlist_for_each(p, dev_name_hash(name)) { - struct net_device *dev - = hlist_entry(p, struct net_device, name_hlist); - if (!strncmp(dev->name, name, IFNAMSIZ)) - return dev; - } - return NULL; -} - -/** - * dev_get_by_name - find a device by its name - * @name: name to find - * - * Find an interface by name. This can be called from any - * context and does its own locking. The returned handle has - * the usage count incremented and the caller must use dev_put() to - * release it when it is no longer needed. %NULL is returned if no - * matching device is found. - */ - -struct net_device *dev_get_by_name(const char *name) -{ - struct net_device *dev; - - read_lock(&dev_base_lock); - dev = __dev_get_by_name(name); - if (dev) - dev_hold(dev); - read_unlock(&dev_base_lock); - return dev; -} - -/** - * __dev_get_by_index - find a device by its ifindex - * @ifindex: index of device - * - * Search for an interface by index. Returns %NULL if the device - * is not found or a pointer to the device. The device has not - * had its reference counter increased so the caller must be careful - * about locking. The caller must hold either the RTNL semaphore - * or @dev_base_lock. - */ - -struct net_device *__dev_get_by_index(int ifindex) -{ - struct hlist_node *p; - - hlist_for_each(p, dev_index_hash(ifindex)) { - struct net_device *dev - = hlist_entry(p, struct net_device, index_hlist); - if (dev->ifindex == ifindex) - return dev; - } - return NULL; -} - - -/** - * dev_get_by_index - find a device by its ifindex - * @ifindex: index of device - * - * Search for an interface by index. Returns NULL if the device - * is not found or a pointer to the device. 
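- * For example:
- *
- *   dev = dev_get_by_index(ifindex);
- *   if (dev) {
- *           ... use dev ...
- *           dev_put(dev);
- *   }
- *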
The device returned has - * had a reference added and the pointer is safe until the user calls - * dev_put to indicate they have finished with it. - */ - -struct net_device *dev_get_by_index(int ifindex) -{ - struct net_device *dev; - - read_lock(&dev_base_lock); - dev = __dev_get_by_index(ifindex); - if (dev) - dev_hold(dev); - read_unlock(&dev_base_lock); - return dev; -} - -/** - * dev_getbyhwaddr - find a device by its hardware address - * @type: media type of device - * @ha: hardware address - * - * Search for an interface by MAC address. Returns NULL if the device - * is not found or a pointer to the device. The caller must hold the - * rtnl semaphore. The returned device has not had its ref count increased - * and the caller must therefore be careful about locking - * - * BUGS: - * If the API was consistent this would be __dev_get_by_hwaddr - */ - -struct net_device *dev_getbyhwaddr(unsigned short type, char *ha) -{ - struct net_device *dev; - - ASSERT_RTNL(); - - for (dev = dev_base; dev; dev = dev->next) - if (dev->type == type && - !memcmp(dev->dev_addr, ha, dev->addr_len)) - break; - return dev; -} - -struct net_device *dev_getfirstbyhwtype(unsigned short type) -{ - struct net_device *dev; - - rtnl_lock(); - for (dev = dev_base; dev; dev = dev->next) { - if (dev->type == type) { - dev_hold(dev); - break; - } - } - rtnl_unlock(); - return dev; -} - -EXPORT_SYMBOL(dev_getfirstbyhwtype); - -/** - * dev_get_by_flags - find any device with given flags - * @if_flags: IFF_* values - * @mask: bitmask of bits in if_flags to check - * - * Search for any interface with the given flags. Returns NULL if a device - * is not found or a pointer to the device. The device returned has - * had a reference added and the pointer is safe until the user calls - * dev_put to indicate they have finished with it. - */ - -struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mask) -{ - struct net_device *dev; - - read_lock(&dev_base_lock); - for (dev = dev_base; dev != NULL; dev = dev->next) { - if (((dev->flags ^ if_flags) & mask) == 0) { - dev_hold(dev); - break; - } - } - read_unlock(&dev_base_lock); - return dev; -} - -/** - * dev_valid_name - check if name is okay for network device - * @name: name string - * - * Network device names need to be valid file names to - * to allow sysfs to work - */ -static int dev_valid_name(const char *name) -{ - return !(*name == '\0' - || !strcmp(name, ".") - || !strcmp(name, "..") - || strchr(name, '/')); -} - -/** - * dev_alloc_name - allocate a name for a device - * @dev: device - * @name: name format string - * - * Passed a format string - eg "lt%d" it will try and find a suitable - * id. Not efficient for many devices, not called a lot. The caller - * must hold the dev_base or rtnl lock while allocating the name and - * adding the device in order to avoid duplicates. Returns the number - * of the unit assigned or a negative errno code. - */ - -int dev_alloc_name(struct net_device *dev, const char *name) -{ - int i = 0; - char buf[IFNAMSIZ]; - const char *p; - const int max_netdevices = 8*PAGE_SIZE; - long *inuse; - struct net_device *d; - - p = strnchr(name, IFNAMSIZ-1, '%'); - if (p) { - /* - * Verify the string as this thing may have come from - * the user. There must be either one "%d" and no other "%" - * characters. 
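- * So "eth%d" is accepted, while "eth%s" (no %d) and "eth%d%d"
- * (a second '%') are both rejected with -EINVAL below.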
- */ - if (p[1] != 'd' || strchr(p + 2, '%')) - return -EINVAL; - - /* Use one page as a bit array of possible slots */ - inuse = (long *) get_zeroed_page(GFP_ATOMIC); - if (!inuse) - return -ENOMEM; - - for (d = dev_base; d; d = d->next) { - if (!sscanf(d->name, name, &i)) - continue; - if (i < 0 || i >= max_netdevices) - continue; - - /* avoid cases where sscanf is not exact inverse of printf */ - snprintf(buf, sizeof(buf), name, i); - if (!strncmp(buf, d->name, IFNAMSIZ)) - set_bit(i, inuse); - } - - i = find_first_zero_bit(inuse, max_netdevices); - free_page((unsigned long) inuse); - } - - snprintf(buf, sizeof(buf), name, i); - if (!__dev_get_by_name(buf)) { - strlcpy(dev->name, buf, IFNAMSIZ); - return i; - } - - /* It is possible to run out of possible slots - * when the name is long and there isn't enough space left - * for the digits, or if all bits are used. - */ - return -ENFILE; -} - - -/** - * dev_change_name - change name of a device - * @dev: device - * @newname: name (or format string) must be at least IFNAMSIZ - * - * Change name of a device, can pass format strings "eth%d". - * for wildcarding. - */ -int dev_change_name(struct net_device *dev, char *newname) -{ - int err = 0; - - ASSERT_RTNL(); - - if (dev->flags & IFF_UP) - return -EBUSY; - - if (!dev_valid_name(newname)) - return -EINVAL; - - if (strchr(newname, '%')) { - err = dev_alloc_name(dev, newname); - if (err < 0) - return err; - strcpy(newname, dev->name); - } - else if (__dev_get_by_name(newname)) - return -EEXIST; - else - strlcpy(dev->name, newname, IFNAMSIZ); - - err = class_device_rename(&dev->class_dev, dev->name); - if (!err) { - hlist_del(&dev->name_hlist); - hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name)); - notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev); - } - - return err; -} - -/** - * netdev_state_change - device changes state - * @dev: device to cause notification - * - * Called to indicate a device has changed state. This function calls - * the notifier chains for netdev_chain and sends a NEWLINK message - * to the routing socket. - */ -void netdev_state_change(struct net_device *dev) -{ - if (dev->flags & IFF_UP) { - notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); - rtmsg_ifinfo(RTM_NEWLINK, dev, 0); - } -} - -/** - * dev_load - load a network module - * @name: name of interface - * - * If a network interface is not present and the process has suitable - * privileges this function loads the module. If module loading is not - * available in this kernel then it becomes a nop. - */ - -void dev_load(const char *name) -{ - struct net_device *dev; - - read_lock(&dev_base_lock); - dev = __dev_get_by_name(name); - read_unlock(&dev_base_lock); - - if (!dev && capable(CAP_SYS_MODULE)) - request_module("%s", name); -} - -static int default_rebuild_header(struct sk_buff *skb) -{ - printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n", - skb->dev ? skb->dev->name : "NULL!!!"); - kfree_skb(skb); - return 1; -} - - -/** - * dev_open - prepare an interface for use. - * @dev: device to open - * - * Takes a device from down to up state. The device's private open - * function is invoked and then the multicast lists are loaded. Finally - * the device is moved into the up state and a %NETDEV_UP message is - * sent to the netdev notifier chain. - * - * Calling this function on an active interface is a nop. On a failure - * a negative errno code is returned. - */ -int dev_open(struct net_device *dev) -{ - int ret = 0; - - /* - * Is it already up? 
- */ - - if (dev->flags & IFF_UP) - return 0; - - /* - * Is it even present? - */ - if (!netif_device_present(dev)) - return -ENODEV; - - /* - * Call device private open method - */ - set_bit(__LINK_STATE_START, &dev->state); - if (dev->open) { - ret = dev->open(dev); - if (ret) - clear_bit(__LINK_STATE_START, &dev->state); - } - - /* - * If it went open OK then: - */ - - if (!ret) { - /* - * Set the flags. - */ - dev->flags |= IFF_UP; - - /* - * Initialize multicasting status - */ - dev_mc_upload(dev); - - /* - * Wakeup transmit queue engine - */ - dev_activate(dev); - - /* - * ... and announce new interface. - */ - notifier_call_chain(&netdev_chain, NETDEV_UP, dev); - } - return ret; -} - -/** - * dev_close - shutdown an interface. - * @dev: device to shutdown - * - * This function moves an active device into down state. A - * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device - * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier - * chain. - */ -int dev_close(struct net_device *dev) -{ - if (!(dev->flags & IFF_UP)) - return 0; - - /* - * Tell people we are going down, so that they can - * prepare to death, when device is still operating. - */ - notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev); - - dev_deactivate(dev); - - clear_bit(__LINK_STATE_START, &dev->state); - - /* Synchronize to scheduled poll. We cannot touch poll list, - * it can be even on different cpu. So just clear netif_running(), - * and wait when poll really will happen. Actually, the best place - * for this is inside dev->stop() after device stopped its irq - * engine, but this requires more changes in devices. */ - - smp_mb__after_clear_bit(); /* Commit netif_running(). */ - while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) { - /* No hurry. */ - current->state = TASK_INTERRUPTIBLE; - schedule_timeout(1); - } - - /* - * Call the device specific close. This cannot fail. - * Only if device is UP - * - * We allow it to be called even after a DETACH hot-plug - * event. - */ - if (dev->stop) - dev->stop(dev); - - /* - * Device is now down. - */ - - dev->flags &= ~IFF_UP; - - /* - * Tell people we are down - */ - notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev); - - return 0; -} - - -/* - * Device change register/unregister. These are not inline or static - * as we export them to the world. - */ - -/** - * register_netdevice_notifier - register a network notifier block - * @nb: notifier - * - * Register a notifier to be called when network device events occur. - * The notifier passed is linked into the kernel structures and must - * not be reused until it has been unregistered. A negative errno code - * is returned on a failure. - * - * When registered all registration and up events are replayed - * to the new notifier to allow device to have a race free - * view of the network device list. - */ - -int register_netdevice_notifier(struct notifier_block *nb) -{ - struct net_device *dev; - int err; - - rtnl_lock(); - err = notifier_chain_register(&netdev_chain, nb); - if (!err) { - for (dev = dev_base; dev; dev = dev->next) { - nb->notifier_call(nb, NETDEV_REGISTER, dev); - - if (dev->flags & IFF_UP) - nb->notifier_call(nb, NETDEV_UP, dev); - } - } - rtnl_unlock(); - return err; -} - -/** - * unregister_netdevice_notifier - unregister a network notifier block - * @nb: notifier - * - * Unregister a notifier previously registered by - * register_netdevice_notifier(). The notifier is unlinked into the - * kernel structures and may then be reused. 
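
The replay behaviour documented above means a notifier registered after boot still sees %NETDEV_REGISTER and %NETDEV_UP for every existing device, giving it a race-free view of the list. A sketch of a client, with hypothetical names:

    #include <linux/netdevice.h>
    #include <linux/notifier.h>

    /* Hypothetical callback: log interface up/down transitions. */
    static int example_event(struct notifier_block *nb,
                             unsigned long event, void *ptr)
    {
        struct net_device *dev = ptr;

        if (event == NETDEV_UP)
            printk(KERN_INFO "example: %s is up\n", dev->name);
        else if (event == NETDEV_DOWN)
            printk(KERN_INFO "example: %s is down\n", dev->name);
        return NOTIFY_DONE;
    }

    static struct notifier_block example_nb = {
        .notifier_call = example_event,
    };

    /* Call register_netdevice_notifier(&example_nb) from module init
     * and unregister_netdevice_notifier(&example_nb) from module exit. */
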
A negative errno code - * is returned on a failure. - */ - -int unregister_netdevice_notifier(struct notifier_block *nb) -{ - return notifier_chain_unregister(&netdev_chain, nb); -} - -/** - * call_netdevice_notifiers - call all network notifier blocks - * @val: value passed unmodified to notifier function - * @v: pointer passed unmodified to notifier function - * - * Call all network notifier blocks. Parameters and return value - * are as for notifier_call_chain(). - */ - -int call_netdevice_notifiers(unsigned long val, void *v) -{ - return notifier_call_chain(&netdev_chain, val, v); -} - -/* When > 0 there are consumers of rx skb time stamps */ -static atomic_t netstamp_needed = ATOMIC_INIT(0); - -void net_enable_timestamp(void) -{ - atomic_inc(&netstamp_needed); -} - -void net_disable_timestamp(void) -{ - atomic_dec(&netstamp_needed); -} - -static inline void net_timestamp(struct timeval *stamp) -{ - if (atomic_read(&netstamp_needed)) - do_gettimeofday(stamp); - else { - stamp->tv_sec = 0; - stamp->tv_usec = 0; - } -} - -/* - * Support routine. Sends outgoing frames to any network - * taps currently in use. - */ - -void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) -{ - struct packet_type *ptype; - net_timestamp(&skb->stamp); - - rcu_read_lock(); - list_for_each_entry_rcu(ptype, &ptype_all, list) { - /* Never send packets back to the socket - * they originated from - MvS (miquels@xxxxxxxxxxxxxx) - */ - if ((ptype->dev == dev || !ptype->dev) && - (ptype->af_packet_priv == NULL || - (struct sock *)ptype->af_packet_priv != skb->sk)) { - struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC); - if (!skb2) - break; - - /* skb->nh should be correctly - set by sender, so that the second statement is - just protection against buggy protocols. - */ - skb2->mac.raw = skb2->data; - - if (skb2->nh.raw < skb2->data || - skb2->nh.raw > skb2->tail) { - if (net_ratelimit()) - printk(KERN_CRIT "protocol %04x is " - "buggy, dev %s\n", - skb2->protocol, dev->name); - skb2->nh.raw = skb2->data; - } - - skb2->h.raw = skb2->nh.raw; - skb2->pkt_type = PACKET_OUTGOING; - ptype->func(skb2, skb->dev, ptype); - } - } - rcu_read_unlock(); -} - -/* - * Invalidate hardware checksum when packet is to be mangled, and - * complete checksum manually on outgoing path. - */ -int skb_checksum_help(struct sk_buff *skb, int inward) -{ - unsigned int csum; - int ret = 0, offset = skb->h.raw - skb->data; - - if (inward) { - skb->ip_summed = CHECKSUM_NONE; - goto out; - } - - if (skb_cloned(skb)) { - ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); - if (ret) - goto out; - } - - if (offset > (int)skb->len) - BUG(); - csum = skb_checksum(skb, offset, skb->len-offset, 0); - - offset = skb->tail - skb->h.raw; - if (offset <= 0) - BUG(); - if (skb->csum + 2 > offset) - BUG(); - - *(u16*)(skb->h.raw + skb->csum) = csum_fold(csum); - skb->ip_summed = CHECKSUM_NONE; -out: - return ret; -} - -#ifdef CONFIG_HIGHMEM -/* Actually, we should eliminate this check as soon as we know, that: - * 1. IOMMU is present and allows to map all the memory. - * 2. No high memory really exists on this machine. 
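
skb_checksum_help() above ends by folding a 32-bit accumulator into the 16-bit one's-complement Internet checksum (csum_fold()). The arithmetic is easy to verify in a standalone program; this sketch reimplements the fold in plain C rather than using the kernel helpers:

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    /* One's-complement sum folded to 16 bits, as csum_fold() does. */
    static uint16_t csum16(const uint8_t *buf, size_t len)
    {
        uint32_t sum = 0;
        size_t i;

        for (i = 0; i + 1 < len; i += 2)
            sum += ((uint32_t)buf[i] << 8) | buf[i + 1];
        if (len & 1)                      /* pad an odd trailing byte */
            sum += (uint32_t)buf[len - 1] << 8;
        while (sum >> 16)                 /* fold the carries back in */
            sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
    }

    int main(void)
    {
        uint8_t hdr[] = { 0x45, 0x00, 0x00, 0x3c, 0x1c, 0x46 };
        printf("checksum = 0x%04x\n", csum16(hdr, sizeof(hdr)));
        return 0;
    }
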
- */ - -static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) -{ - int i; - - if (dev->features & NETIF_F_HIGHDMA) - return 0; - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - if (PageHighMem(skb_shinfo(skb)->frags[i].page)) - return 1; - - return 0; -} -#else -#define illegal_highdma(dev, skb) (0) -#endif - -extern void skb_release_data(struct sk_buff *); - -/* Keep head the same: replace data */ -int __skb_linearize(struct sk_buff *skb, int gfp_mask) -{ - unsigned int size; - u8 *data; - long offset; - struct skb_shared_info *ninfo; - int headerlen = skb->data - skb->head; - int expand = (skb->tail + skb->data_len) - skb->end; - - if (skb_shared(skb)) - BUG(); - - if (expand <= 0) - expand = 0; - - size = skb->end - skb->head + expand; - size = SKB_DATA_ALIGN(size); - data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); - if (!data) - return -ENOMEM; - - /* Copy entire thing */ - if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len)) - BUG(); - - /* Set up shinfo */ - ninfo = (struct skb_shared_info*)(data + size); - atomic_set(&ninfo->dataref, 1); - ninfo->tso_size = skb_shinfo(skb)->tso_size; - ninfo->tso_segs = skb_shinfo(skb)->tso_segs; - ninfo->nr_frags = 0; - ninfo->frag_list = NULL; - - /* Offset between the two in bytes */ - offset = data - skb->head; - - /* Free old data. */ - skb_release_data(skb); - - skb->head = data; - skb->end = data + size; - - /* Set up new pointers */ - skb->h.raw += offset; - skb->nh.raw += offset; - skb->mac.raw += offset; - skb->tail += offset; - skb->data += offset; - - /* We are no longer a clone, even if we were. */ - skb->cloned = 0; - - skb->tail += skb->data_len; - skb->data_len = 0; - return 0; -} - -#define HARD_TX_LOCK(dev, cpu) { \ - if ((dev->features & NETIF_F_LLTX) == 0) { \ - spin_lock(&dev->xmit_lock); \ - dev->xmit_lock_owner = cpu; \ - } \ -} - -#define HARD_TX_UNLOCK(dev) { \ - if ((dev->features & NETIF_F_LLTX) == 0) { \ - dev->xmit_lock_owner = -1; \ - spin_unlock(&dev->xmit_lock); \ - } \ -} - -/** - * dev_queue_xmit - transmit a buffer - * @skb: buffer to transmit - * - * Queue a buffer for transmission to a network device. The caller must - * have set the device and priority and built the buffer before calling - * this function. The function can be called from an interrupt. - * - * A negative errno code is returned on a failure. A success does not - * guarantee the frame will be transmitted as it may be dropped due - * to congestion or traffic shaping. - */ - -int dev_queue_xmit(struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - struct Qdisc *q; - int rc = -ENOMEM; - - if (skb_shinfo(skb)->frag_list && - !(dev->features & NETIF_F_FRAGLIST) && - __skb_linearize(skb, GFP_ATOMIC)) - goto out_kfree_skb; - - /* Fragmented skb is linearized if device does not support SG, - * or if at least one of fragments is in highmem and device - * does not support DMA from it. - */ - if (skb_shinfo(skb)->nr_frags && - (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) && - __skb_linearize(skb, GFP_ATOMIC)) - goto out_kfree_skb; - - /* If a checksum-deferred packet is forwarded to a device that needs a - * checksum, correct the pointers and force checksumming. 
- */
-    if (skb->proto_csum_blank) {
-        if (skb->protocol != htons(ETH_P_IP))
-            goto out_kfree_skb;
-        skb->h.raw = (unsigned char *)skb->nh.iph + 4*skb->nh.iph->ihl;
-        if (skb->h.raw >= skb->tail)
-            goto out_kfree_skb;
-        switch (skb->nh.iph->protocol) {
-        case IPPROTO_TCP:
-            skb->csum = offsetof(struct tcphdr, check);
-            break;
-        case IPPROTO_UDP:
-            skb->csum = offsetof(struct udphdr, check);
-            break;
-        default:
-            goto out_kfree_skb;
-        }
-        if ((skb->h.raw + skb->csum + 2) > skb->tail)
-            goto out_kfree_skb;
-        skb->ip_summed = CHECKSUM_HW;
-    }
-
-    /* If packet is not checksummed and device does not support
-     * checksumming for this protocol, complete checksumming here.
-     */
-    if (skb->ip_summed == CHECKSUM_HW &&
-        (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) &&
-         (!(dev->features & NETIF_F_IP_CSUM) ||
-          skb->protocol != htons(ETH_P_IP))))
-        if (skb_checksum_help(skb, 0))
-            goto out_kfree_skb;
-
-    /* Disable soft irqs for various locks below. Also
-     * stops preemption for RCU.
-     */
-    local_bh_disable();
-
-    /* Updates of qdisc are serialized by queue_lock.
-     * The struct Qdisc which is pointed to by qdisc is now a
-     * rcu structure - it may be accessed without acquiring
-     * a lock (but the structure may be stale.) The freeing of the
-     * qdisc will be deferred until it's known that there are no
-     * more references to it.
-     *
-     * If the qdisc has an enqueue function, we still need to
-     * hold the queue_lock before calling it, since queue_lock
-     * also serializes access to the device queue.
-     */
-
-    q = rcu_dereference(dev->qdisc);
-#ifdef CONFIG_NET_CLS_ACT
-    skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
-#endif
-    if (q->enqueue) {
-        /* Grab device queue */
-        spin_lock(&dev->queue_lock);
-
-        rc = q->enqueue(skb, q);
-
-        qdisc_run(dev);
-
-        spin_unlock(&dev->queue_lock);
-        rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
-        goto out;
-    }
-
-    /* The device has no queue. Common case for software devices:
-     * loopback, all sorts of tunnels...
-     *
-     * Really, it is unlikely that xmit_lock protection is necessary
-     * here (e.g. loopback and IP tunnels are clean, ignoring statistics
-     * counters). However, it is possible that they rely on the
-     * protection we take here.
-     *
-     * Check this and take the lock anyway; it is not prone to
-     * deadlocks. For the noqueue qdisc case it is even simpler 8)
-     */
-    if (dev->flags & IFF_UP) {
-        int cpu = smp_processor_id(); /* ok because BHs are off */
-
-        if (dev->xmit_lock_owner != cpu) {
-
-            HARD_TX_LOCK(dev, cpu);
-
-            if (!netif_queue_stopped(dev)) {
-                if (netdev_nit)
-                    dev_queue_xmit_nit(skb, dev);
-
-                rc = 0;
-                if (!dev->hard_start_xmit(skb, dev)) {
-                    HARD_TX_UNLOCK(dev);
-                    goto out;
-                }
-            }
-            HARD_TX_UNLOCK(dev);
-            if (net_ratelimit())
-                printk(KERN_CRIT "Virtual device %s asks to "
-                       "queue packet!\n", dev->name);
-        } else {
-            /* Recursion detected! It is possible, unfortunately. */
-            if (net_ratelimit())
-                printk(KERN_CRIT "Dead loop on virtual device "
-                       "%s, fix it urgently!\n", dev->name);
-        }
-    }
-
-    rc = -ENETDOWN;
-    local_bh_enable();
-
-out_kfree_skb:
-    kfree_skb(skb);
-    return rc;
-out:
-    local_bh_enable();
-    return rc;
-}
-
-
-/*=======================================================================
-                        Receiver routines
-  =======================================================================*/
-
-int netdev_max_backlog = 300;
-int weight_p = 64;   /* old backlog weight */
-/* These numbers are selected based on intuition and some experimentation;
- * if you have a more scientific way of doing this, please go ahead and
- * fix things.
- */
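
The xmit_lock_owner check above is what turns a device transmitting to itself into the rate-limited "dead loop" message instead of a deadlock. The owner-tracking idea, reduced to a user-space sketch with POSIX threads (all names hypothetical; the kernel uses the CPU id rather than a thread id):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t xmit_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_t xmit_owner;
    static int owner_valid;               /* stands in for owner == -1 */

    /* Mirrors the cpu == dev->xmit_lock_owner test in dev_queue_xmit(). */
    static int try_transmit(void)
    {
        if (owner_valid && pthread_equal(xmit_owner, pthread_self())) {
            fprintf(stderr, "dead loop detected\n");
            return -1;                    /* recursing into ourselves */
        }
        pthread_mutex_lock(&xmit_lock);
        xmit_owner = pthread_self();
        owner_valid = 1;
        /* ... the real code would call hard_start_xmit() here ... */
        owner_valid = 0;
        pthread_mutex_unlock(&xmit_lock);
        return 0;
    }

    int main(void)
    {
        return try_transmit() ? 1 : 0;
    }
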
-int no_cong_thresh = 10;
-int no_cong = 20;
-int lo_cong = 100;
-int mod_cong = 290;
-
-DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
-
-
-static void get_sample_stats(int cpu)
-{
-#ifdef RAND_LIE
-    unsigned long rd;
-    int rq;
-#endif
-    struct softnet_data *sd = &per_cpu(softnet_data, cpu);
-    int blog = sd->input_pkt_queue.qlen;
-    int avg_blog = sd->avg_blog;
-
-    avg_blog = (avg_blog >> 1) + (blog >> 1);
-
-    if (avg_blog > mod_cong) {
-        /* Above moderate congestion levels. */
-        sd->cng_level = NET_RX_CN_HIGH;
-#ifdef RAND_LIE
-        rd = net_random();
-        rq = rd % netdev_max_backlog;
-        if (rq < avg_blog) /* unlucky bastard */
-            sd->cng_level = NET_RX_DROP;
-#endif
-    } else if (avg_blog > lo_cong) {
-        sd->cng_level = NET_RX_CN_MOD;
-#ifdef RAND_LIE
-        rd = net_random();
-        rq = rd % netdev_max_backlog;
-        if (rq < avg_blog) /* unlucky bastard */
-            sd->cng_level = NET_RX_CN_HIGH;
-#endif
-    } else if (avg_blog > no_cong)
-        sd->cng_level = NET_RX_CN_LOW;
-    else /* no congestion */
-        sd->cng_level = NET_RX_SUCCESS;
-
-    sd->avg_blog = avg_blog;
-}
-
-#ifdef OFFLINE_SAMPLE
-static void sample_queue(unsigned long dummy)
-{
-    /* 10 ms or 1 ms -- I don't care -- JHS */
-    int next_tick = 1;
-    int cpu = smp_processor_id();
-
-    get_sample_stats(cpu);
-    next_tick += jiffies;
-    mod_timer(&samp_timer, next_tick);
-}
-#endif
-
-
-/**
- * netif_rx - post buffer to the network code
- * @skb: buffer to post
- *
- * This function receives a packet from a device driver and queues it for
- * the upper (protocol) levels to process. It always succeeds. The buffer
- * may be dropped during processing for congestion control or by the
- * protocol layers.
- *
- * return values:
- * NET_RX_SUCCESS (no congestion)
- * NET_RX_CN_LOW (low congestion)
- * NET_RX_CN_MOD (moderate congestion)
- * NET_RX_CN_HIGH (high congestion)
- * NET_RX_DROP (packet was dropped)
- */
-
-int netif_rx(struct sk_buff *skb)
-{
-    int this_cpu;
-    struct softnet_data *queue;
-    unsigned long flags;
-
-#ifdef CONFIG_NETPOLL
-    if (skb->dev->netpoll_rx && netpoll_rx(skb)) {
-        kfree_skb(skb);
-        return NET_RX_DROP;
-    }
-#endif
-
-    if (!skb->stamp.tv_sec)
-        net_timestamp(&skb->stamp);
-
-    /*
-     * The code is arranged so that the path is shortest when the CPU
-     * is congested but still operating.
-     */
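
get_sample_stats() above keeps avg_blog as a crude exponentially weighted average, avg = avg/2 + backlog/2, and buckets the result against the thresholds defined with it. A standalone check of that arithmetic, reusing the threshold values from above:

    #include <stdio.h>

    static const int no_cong = 20, lo_cong = 100, mod_cong = 290;

    /* Same bucketing as get_sample_stats(), minus the RAND_LIE games. */
    static const char *cng_level(int avg_blog)
    {
        if (avg_blog > mod_cong) return "NET_RX_CN_HIGH";
        if (avg_blog > lo_cong)  return "NET_RX_CN_MOD";
        if (avg_blog > no_cong)  return "NET_RX_CN_LOW";
        return "NET_RX_SUCCESS";
    }

    int main(void)
    {
        int backlogs[] = { 0, 50, 200, 400, 400 };
        int avg = 0;
        unsigned int i;

        for (i = 0; i < sizeof(backlogs) / sizeof(backlogs[0]); i++) {
            avg = (avg >> 1) + (backlogs[i] >> 1);
            printf("backlog=%3d avg=%3d -> %s\n",
                   backlogs[i], avg, cng_level(avg));
        }
        return 0;
    }
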
-    local_irq_save(flags);
-    this_cpu = smp_processor_id();
-    queue = &__get_cpu_var(softnet_data);
-
-    __get_cpu_var(netdev_rx_stat).total++;
-    if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
-        if (queue->input_pkt_queue.qlen) {
-            if (queue->throttle)
-                goto drop;
-
-enqueue:
-            dev_hold(skb->dev);
-            __skb_queue_tail(&queue->input_pkt_queue, skb);
-#ifndef OFFLINE_SAMPLE
-            get_sample_stats(this_cpu);
-#endif
-            local_irq_restore(flags);
-            return queue->cng_level;
-        }
-
-        if (queue->throttle)
-            queue->throttle = 0;
-
-        netif_rx_schedule(&queue->backlog_dev);
-        goto enqueue;
-    }
-
-    if (!queue->throttle) {
-        queue->throttle = 1;
-        __get_cpu_var(netdev_rx_stat).throttled++;
-    }
-
-drop:
-    __get_cpu_var(netdev_rx_stat).dropped++;
-    local_irq_restore(flags);
-
-    kfree_skb(skb);
-    return NET_RX_DROP;
-}
-
-int netif_rx_ni(struct sk_buff *skb)
-{
-    int err;
-
-    preempt_disable();
-    err = netif_rx(skb);
-    if (local_softirq_pending())
-        do_softirq();
-    preempt_enable();
-
-    return err;
-}
-
-EXPORT_SYMBOL(netif_rx_ni);
-
-static __inline__ void skb_bond(struct sk_buff *skb)
-{
-    struct net_device *dev = skb->dev;
-
-    if (dev->master) {
-        skb->real_dev = skb->dev;
-        skb->dev = dev->master;
-    }
-}
-
-static void net_tx_action(struct softirq_action *h)
-{
-    struct softnet_data *sd = &__get_cpu_var(softnet_data);
-
-    if (sd->completion_queue) {
-        struct sk_buff *clist;
-
-        local_irq_disable();
-        clist = sd->completion_queue;
-        sd->completion_queue = NULL;
-        local_irq_enable();
-
-        while (clist) {
-            struct sk_buff *skb = clist;
-            clist = clist->next;
-
-            BUG_TRAP(!atomic_read(&skb->users));
-            __kfree_skb(skb);
-        }
-    }
-
-    if (sd->output_queue) {
-        struct net_device *head;
-
-        local_irq_disable();
-        head = sd->output_queue;
-        sd->output_queue = NULL;
-        local_irq_enable();
-
-        while (head) {
-            struct net_device *dev = head;
-            head = head->next_sched;
-
-            smp_mb__before_clear_bit();
-            clear_bit(__LINK_STATE_SCHED, &dev->state);
-
-            if (spin_trylock(&dev->queue_lock)) {
-                qdisc_run(dev);
-                spin_unlock(&dev->queue_lock);
-            } else {
-                netif_schedule(dev);
-            }
-        }
-    }
-}
-
-static __inline__ int deliver_skb(struct sk_buff *skb,
-                                  struct packet_type *pt_prev)
-{
-    atomic_inc(&skb->users);
-    return pt_prev->func(skb, skb->dev, pt_prev);
-}
-
-#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
-int (*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff **pskb);
-
-static __inline__ int handle_bridge(struct sk_buff **pskb,
-                                    struct packet_type **pt_prev, int *ret)
-{
-    struct net_bridge_port *port;
-
-    if ((*pskb)->pkt_type == PACKET_LOOPBACK ||
-        (port = rcu_dereference((*pskb)->dev->br_port)) == NULL)
-        return 0;
-
-    if (*pt_prev) {
-        *ret = deliver_skb(*pskb, *pt_prev);
-        *pt_prev = NULL;
-    }
-
-    return br_handle_frame_hook(port, pskb);
-}
-#else
-#define handle_bridge(skb, pt_prev, ret) (0)
-#endif
-
-#ifdef CONFIG_NET_CLS_ACT
-/* TODO: Maybe we should just force sch_ingress to be compiled in
- * whenever CONFIG_NET_CLS_ACT is? Otherwise we pay a few useless
- * instructions (a compare and two stores) when we don't have it on
- * but do have CONFIG_NET_CLS_ACT.
- * NOTE: this doesn't stop any functionality; if you don't have
- * the ingress scheduler, you just can't add policies on ingress.
- */
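
For context, netif_rx() above is the entry point a non-NAPI driver calls from its receive interrupt. A minimal sketch of such a caller (the device, data source and length are hypothetical):

    #include <linux/netdevice.h>
    #include <linux/skbuff.h>
    #include <linux/etherdevice.h>
    #include <linux/string.h>

    /* Hypothetical rx path: hand one received Ethernet frame to the stack. */
    static void example_rx(struct net_device *dev, const void *data, int len)
    {
        struct sk_buff *skb = dev_alloc_skb(len + 2);

        if (!skb)
            return;                          /* drop on memory pressure */
        skb_reserve(skb, 2);                 /* align the IP header */
        memcpy(skb_put(skb, len), data, len);
        skb->dev = dev;
        skb->protocol = eth_type_trans(skb, dev);
        netif_rx(skb);                       /* queue for softirq processing */
    }
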
-static int ing_filter(struct sk_buff *skb)
-{
-    struct Qdisc *q;
-    struct net_device *dev = skb->dev;
-    int result = TC_ACT_OK;
-
-    if (dev->qdisc_ingress) {
-        __u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd);
-        if (MAX_RED_LOOP < ttl++) {
-            printk("Redir loop detected, dropping packet (%s->%s)\n",
-                   skb->input_dev ? skb->input_dev->name : "??",
-                   skb->dev->name);
-            return TC_ACT_SHOT;
-        }
-
-        skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
-
-        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
-        if (NULL == skb->input_dev) {
-            skb->input_dev = skb->dev;
-            printk("ing_filter: fixed %s out %s\n",
-                   skb->input_dev->name, skb->dev->name);
-        }
-        spin_lock(&dev->ingress_lock);
-        if ((q = dev->qdisc_ingress) != NULL)
-            result = q->enqueue(skb, q);
-        spin_unlock(&dev->ingress_lock);
-    }
-
-    return result;
-}
-#endif
-
-int netif_receive_skb(struct sk_buff *skb)
-{
-    struct packet_type *ptype, *pt_prev;
-    int ret = NET_RX_DROP;
-    unsigned short type;
-
-#ifdef CONFIG_NETPOLL
-    if (skb->dev->netpoll_rx && skb->dev->poll && netpoll_rx(skb)) {
-        kfree_skb(skb);
-        return NET_RX_DROP;
-    }
-#endif
-
-    if (!skb->stamp.tv_sec)
-        net_timestamp(&skb->stamp);
-
-    skb_bond(skb);
-
-    __get_cpu_var(netdev_rx_stat).total++;
-
-    skb->h.raw = skb->nh.raw = skb->data;
-    skb->mac_len = skb->nh.raw - skb->mac.raw;
-
-    pt_prev = NULL;
-
-    rcu_read_lock();
-
-#ifdef CONFIG_NET_CLS_ACT
-    if (skb->tc_verd & TC_NCLS) {
-        skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
-        goto ncls;
-    }
-#endif
-
-    switch (skb->ip_summed) {
-    case CHECKSUM_UNNECESSARY:
-        skb->proto_csum_valid = 1;
-        break;
-    case CHECKSUM_HW:
-        /* XXX Implement me. */
-    default:
-        skb->proto_csum_valid = 0;
-        break;
-    }
-
-    list_for_each_entry_rcu(ptype, &ptype_all, list) {
-        if (!ptype->dev || ptype->dev == skb->dev) {
-            if (pt_prev)
-                ret = deliver_skb(skb, pt_prev);
-            pt_prev = ptype;
-        }
-    }
-
-#ifdef CONFIG_NET_CLS_ACT
-    if (pt_prev) {
-        ret = deliver_skb(skb, pt_prev);
-        pt_prev = NULL; /* no one else should process this after */
-    } else {
-        skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
-    }
-
-    ret = ing_filter(skb);
-
-    if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) {
-        kfree_skb(skb);
-        goto out;
-    }
-
-    skb->tc_verd = 0;
-ncls:
-#endif
-
-    handle_diverter(skb);
-
-    if (handle_bridge(&skb, &pt_prev, &ret))
-        goto out;
-
-    type = skb->protocol;
-    list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
-        if (ptype->type == type &&
-            (!ptype->dev || ptype->dev == skb->dev)) {
-            if (pt_prev)
-                ret = deliver_skb(skb, pt_prev);
-            pt_prev = ptype;
-        }
-    }
-
-    if (pt_prev) {
-        ret = pt_prev->func(skb, skb->dev, pt_prev);
-    } else {
-        kfree_skb(skb);
-        /* Jamal, now you will not be able to escape explaining
-         * to me how you were going to use this.
:-) - */ - ret = NET_RX_DROP; - } - -out: - rcu_read_unlock(); - return ret; -} - -static int process_backlog(struct net_device *backlog_dev, int *budget) -{ - int work = 0; - int quota = min(backlog_dev->quota, *budget); - struct softnet_data *queue = &__get_cpu_var(softnet_data); - unsigned long start_time = jiffies; - - for (;;) { - struct sk_buff *skb; - struct net_device *dev; - - local_irq_disable(); - skb = __skb_dequeue(&queue->input_pkt_queue); - if (!skb) - goto job_done; - local_irq_enable(); - - dev = skb->dev; - - netif_receive_skb(skb); - - dev_put(dev); - - work++; - - if (work >= quota || jiffies - start_time > 1) - break; - - } - - backlog_dev->quota -= work; - *budget -= work; - return -1; - -job_done: - backlog_dev->quota -= work; - *budget -= work; - - list_del(&backlog_dev->poll_list); - smp_mb__before_clear_bit(); - netif_poll_enable(backlog_dev); - - if (queue->throttle) - queue->throttle = 0; - local_irq_enable(); - return 0; -} - -static void net_rx_action(struct softirq_action *h) -{ - struct softnet_data *queue = &__get_cpu_var(softnet_data); - unsigned long start_time = jiffies; - int budget = netdev_max_backlog; - - - local_irq_disable(); - - while (!list_empty(&queue->poll_list)) { - struct net_device *dev; - - if (budget <= 0 || jiffies - start_time > 1) - goto softnet_break; - - local_irq_enable(); - - dev = list_entry(queue->poll_list.next, - struct net_device, poll_list); - - if (dev->quota <= 0 || dev->poll(dev, &budget)) { - local_irq_disable(); - list_del(&dev->poll_list); - list_add_tail(&dev->poll_list, &queue->poll_list); - if (dev->quota < 0) - dev->quota += dev->weight; - else - dev->quota = dev->weight; - } else { - dev_put(dev); - local_irq_disable(); - } - } -out: - local_irq_enable(); - return; - -softnet_break: - __get_cpu_var(netdev_rx_stat).time_squeeze++; - __raise_softirq_irqoff(NET_RX_SOFTIRQ); - goto out; -} - -static gifconf_func_t * gifconf_list [NPROTO]; - -/** - * register_gifconf - register a SIOCGIF handler - * @family: Address family - * @gifconf: Function handler - * - * Register protocol dependent address dumping routines. The handler - * that is passed must not be freed or reused until it has been replaced - * by another handler. - */ -int register_gifconf(unsigned int family, gifconf_func_t * gifconf) -{ - if (family >= NPROTO) - return -EINVAL; - gifconf_list[family] = gifconf; - return 0; -} - - -/* - * Map an interface index to its name (SIOCGIFNAME) - */ - -/* - * We need this ioctl for efficient implementation of the - * if_indextoname() function required by the IPv6 API. Without - * it, we would have to search all the interfaces to find a - * match. --pb - */ - -static int dev_ifname(struct ifreq __user *arg) -{ - struct net_device *dev; - struct ifreq ifr; - - /* - * Fetch the caller's info block. - */ - - if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) - return -EFAULT; - - read_lock(&dev_base_lock); - dev = __dev_get_by_index(ifr.ifr_ifindex); - if (!dev) { - read_unlock(&dev_base_lock); - return -ENODEV; - } - - strcpy(ifr.ifr_name, dev->name); - read_unlock(&dev_base_lock); - - if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) - return -EFAULT; - return 0; -} - -/* - * Perform a SIOCGIFCONF call. This structure will change - * size eventually, and there is nothing I can do about it. - * Thus we will need a 'compatibility mode'. 
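
dev_ifname() above is the kernel half of SIOCGIFNAME; the user-space half of the same lookup looks like this (a sketch; index 1 is just an example value):

    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <sys/socket.h>
    #include <net/if.h>
    #include <unistd.h>

    int main(void)
    {
        struct ifreq ifr;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0)
            return 1;
        memset(&ifr, 0, sizeof(ifr));
        ifr.ifr_ifindex = 1;                    /* example index */
        if (ioctl(fd, SIOCGIFNAME, &ifr) == 0)  /* kernel fills ifr_name */
            printf("ifindex 1 is %s\n", ifr.ifr_name);
        close(fd);
        return 0;
    }
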
- */ - -static int dev_ifconf(char __user *arg) -{ - struct ifconf ifc; - struct net_device *dev; - char __user *pos; - int len; - int total; - int i; - - /* - * Fetch the caller's info block. - */ - - if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) - return -EFAULT; - - pos = ifc.ifc_buf; - len = ifc.ifc_len; - - /* - * Loop over the interfaces, and write an info block for each. - */ - - total = 0; - for (dev = dev_base; dev; dev = dev->next) { - for (i = 0; i < NPROTO; i++) { - if (gifconf_list[i]) { - int done; - if (!pos) - done = gifconf_list[i](dev, NULL, 0); - else - done = gifconf_list[i](dev, pos + total, - len - total); - if (done < 0) - return -EFAULT; - total += done; - } - } - } - - /* - * All done. Write the updated control block back to the caller. - */ - ifc.ifc_len = total; - - /* - * Both BSD and Solaris return 0 here, so we do too. - */ - return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0; -} - -#ifdef CONFIG_PROC_FS -/* - * This is invoked by the /proc filesystem handler to display a device - * in detail. - */ -static __inline__ struct net_device *dev_get_idx(loff_t pos) -{ - struct net_device *dev; - loff_t i; - - for (i = 0, dev = dev_base; dev && i < pos; ++i, dev = dev->next); - - return i == pos ? dev : NULL; -} - -void *dev_seq_start(struct seq_file *seq, loff_t *pos) -{ - read_lock(&dev_base_lock); - return *pos ? dev_get_idx(*pos - 1) : SEQ_START_TOKEN; -} - -void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - ++*pos; - return v == SEQ_START_TOKEN ? dev_base : ((struct net_device *)v)->next; -} - -void dev_seq_stop(struct seq_file *seq, void *v) -{ - read_unlock(&dev_base_lock); -} - -static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) -{ - if (dev->get_stats) { - struct net_device_stats *stats = dev->get_stats(dev); - - seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu " - "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", - dev->name, stats->rx_bytes, stats->rx_packets, - stats->rx_errors, - stats->rx_dropped + stats->rx_missed_errors, - stats->rx_fifo_errors, - stats->rx_length_errors + stats->rx_over_errors + - stats->rx_crc_errors + stats->rx_frame_errors, - stats->rx_compressed, stats->multicast, - stats->tx_bytes, stats->tx_packets, - stats->tx_errors, stats->tx_dropped, - stats->tx_fifo_errors, stats->collisions, - stats->tx_carrier_errors + - stats->tx_aborted_errors + - stats->tx_window_errors + - stats->tx_heartbeat_errors, - stats->tx_compressed); - } else - seq_printf(seq, "%6s: No statistics available.\n", dev->name); -} - -/* - * Called from the PROCfs module. 
This now uses the new arbitrary sized - * /proc/net interface to create /proc/net/dev - */ -static int dev_seq_show(struct seq_file *seq, void *v) -{ - if (v == SEQ_START_TOKEN) - seq_puts(seq, "Inter-| Receive " - " | Transmit\n" - " face |bytes packets errs drop fifo frame " - "compressed multicast|bytes packets errs " - "drop fifo colls carrier compressed\n"); - else - dev_seq_printf_stats(seq, v); - return 0; -} - -static struct netif_rx_stats *softnet_get_online(loff_t *pos) -{ - struct netif_rx_stats *rc = NULL; - - while (*pos < NR_CPUS) - if (cpu_online(*pos)) { - rc = &per_cpu(netdev_rx_stat, *pos); - break; - } else - ++*pos; - return rc; -} - -static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) -{ - return softnet_get_online(pos); -} - -static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - ++*pos; - return softnet_get_online(pos); -} - -static void softnet_seq_stop(struct seq_file *seq, void *v) -{ -} - -static int softnet_seq_show(struct seq_file *seq, void *v) -{ - struct netif_rx_stats *s = v; - - seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", - s->total, s->dropped, s->time_squeeze, s->throttled, - s->fastroute_hit, s->fastroute_success, s->fastroute_defer, - s->fastroute_deferred_out, -#if 0 - s->fastroute_latency_reduction -#else - s->cpu_collision -#endif - ); - return 0; -} - -static struct seq_operations dev_seq_ops = { - .start = dev_seq_start, - .next = dev_seq_next, - .stop = dev_seq_stop, - .show = dev_seq_show, -}; - -static int dev_seq_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &dev_seq_ops); -} - -static struct file_operations dev_seq_fops = { - .owner = THIS_MODULE, - .open = dev_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static struct seq_operations softnet_seq_ops = { - .start = softnet_seq_start, - .next = softnet_seq_next, - .stop = softnet_seq_stop, - .show = softnet_seq_show, -}; - -static int softnet_seq_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &softnet_seq_ops); -} - -static struct file_operations softnet_seq_fops = { - .owner = THIS_MODULE, - .open = softnet_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -#ifdef WIRELESS_EXT -extern int wireless_proc_init(void); -#else -#define wireless_proc_init() 0 -#endif - -static int __init dev_proc_init(void) -{ - int rc = -ENOMEM; - - if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops)) - goto out; - if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops)) - goto out_dev; - if (wireless_proc_init()) - goto out_softnet; - rc = 0; -out: - return rc; -out_softnet: - proc_net_remove("softnet_stat"); -out_dev: - proc_net_remove("dev"); - goto out; -} -#else -#define dev_proc_init() 0 -#endif /* CONFIG_PROC_FS */ - - -/** - * netdev_set_master - set up master/slave pair - * @slave: slave device - * @master: new master device - * - * Changes the master device of the slave. Pass %NULL to break the - * bonding. The caller must hold the RTNL semaphore. On a failure - * a negative errno code is returned. On success the reference counts - * are adjusted, %RTM_NEWLINK is sent to the routing socket and the - * function returns zero. 
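
The header and row format printed by dev_seq_show() above is what ifconfig and friends parse out of /proc/net/dev. A minimal reader, as a sketch that only pulls the interface name and the first two receive counters:

    #include <stdio.h>

    int main(void)
    {
        char line[512], name[32];
        unsigned long rx_bytes, rx_packets;
        FILE *f = fopen("/proc/net/dev", "r");

        if (!f)
            return 1;
        fgets(line, sizeof(line), f);    /* skip the two header lines */
        fgets(line, sizeof(line), f);
        while (fgets(line, sizeof(line), f)) {
            if (sscanf(line, " %31[^:]: %lu %lu",
                       name, &rx_bytes, &rx_packets) == 3)
                printf("%s: %lu bytes, %lu packets\n",
                       name, rx_bytes, rx_packets);
        }
        fclose(f);
        return 0;
    }
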
- */
-int netdev_set_master(struct net_device *slave, struct net_device *master)
-{
-    struct net_device *old = slave->master;
-
-    ASSERT_RTNL();
-
-    if (master) {
-        if (old)
-            return -EBUSY;
-        dev_hold(master);
-    }
-
-    slave->master = master;
-
-    synchronize_net();
-
-    if (old)
-        dev_put(old);
-
-    if (master)
-        slave->flags |= IFF_SLAVE;
-    else
-        slave->flags &= ~IFF_SLAVE;
-
-    rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
-    return 0;
-}
-
-/**
- * dev_set_promiscuity - update promiscuity count on a device
- * @dev: device
- * @inc: modifier
- *
- * Add or remove promiscuity from a device. While the count in the device
- * remains above zero the interface remains promiscuous. Once it hits zero
- * the device reverts to normal filtering operation. A negative @inc
- * value is used to drop promiscuity on the device.
- */
-void dev_set_promiscuity(struct net_device *dev, int inc)
-{
-    unsigned short old_flags = dev->flags;
-
-    dev->flags |= IFF_PROMISC;
-    if ((dev->promiscuity += inc) == 0)
-        dev->flags &= ~IFF_PROMISC;
-    if (dev->flags ^ old_flags) {
-        dev_mc_upload(dev);
-        printk(KERN_INFO "device %s %s promiscuous mode\n",
-               dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
-               "left");
-    }
-}
-
-/**
- * dev_set_allmulti - update allmulti count on a device
- * @dev: device
- * @inc: modifier
- *
- * Add or remove reception of all multicast frames to a device. While the
- * count in the device remains above zero the interface keeps listening
- * to all multicast frames. Once it hits zero the device reverts to normal
- * filtering operation. A negative @inc value is used to drop the counter
- * when releasing a resource needing all multicasts.
- */
-
-void dev_set_allmulti(struct net_device *dev, int inc)
-{
-    unsigned short old_flags = dev->flags;
-
-    dev->flags |= IFF_ALLMULTI;
-    if ((dev->allmulti += inc) == 0)
-        dev->flags &= ~IFF_ALLMULTI;
-    if (dev->flags ^ old_flags)
-        dev_mc_upload(dev);
-}
-
-unsigned dev_get_flags(const struct net_device *dev)
-{
-    unsigned flags;
-
-    flags = (dev->flags & ~(IFF_PROMISC |
-                            IFF_ALLMULTI |
-                            IFF_RUNNING)) |
-            (dev->gflags & (IFF_PROMISC |
-                            IFF_ALLMULTI));
-
-    if (netif_running(dev) && netif_carrier_ok(dev))
-        flags |= IFF_RUNNING;
-
-    return flags;
-}
-
-int dev_change_flags(struct net_device *dev, unsigned flags)
-{
-    int ret;
-    int old_flags = dev->flags;
-
-    /*
-     * Set the flags on our device.
-     */
-
-    dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
-                           IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
-                           IFF_AUTOMEDIA)) |
-                 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
-                                IFF_ALLMULTI));
-
-    /*
-     * Load in the correct multicast list now the flags have changed.
-     */
-
-    dev_mc_upload(dev);
-
-    /*
-     * Have we downed the interface? We handle IFF_UP ourselves
-     * according to user attempts to set it, rather than blindly
-     * setting it.
-     */
-
-    ret = 0;
-    if ((old_flags ^ flags) & IFF_UP) { /* Bit is different? */
-        ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
-
-        if (!ret)
-            dev_mc_upload(dev);
-    }
-
-    if (dev->flags & IFF_UP &&
-        ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
-                                      IFF_VOLATILE)))
-        notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev);
-
-    if ((flags ^ dev->gflags) & IFF_PROMISC) {
-        int inc = (flags & IFF_PROMISC) ? +1 : -1;
-        dev->gflags ^= IFF_PROMISC;
-        dev_set_promiscuity(dev, inc);
-    }
-
-    /* NOTE: the order of synchronization of IFF_PROMISC and IFF_ALLMULTI
-     * is important. Some (broken) drivers set IFF_PROMISC when
-     * IFF_ALLMULTI is requested, without asking us and without reporting it.
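
User space reaches dev_set_promiscuity() above indirectly, by toggling IFF_PROMISC through the SIOCGIFFLAGS/SIOCSIFFLAGS pair, which lands in dev_change_flags(). A sketch ("eth0" is just an example name):

    #include <string.h>
    #include <sys/ioctl.h>
    #include <sys/socket.h>
    #include <net/if.h>
    #include <unistd.h>

    int main(void)
    {
        struct ifreq ifr;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0)
            return 1;
        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
        if (ioctl(fd, SIOCGIFFLAGS, &ifr) == 0) {
            ifr.ifr_flags |= IFF_PROMISC;      /* bumps dev->promiscuity */
            ioctl(fd, SIOCSIFFLAGS, &ifr);     /* needs CAP_NET_ADMIN */
        }
        close(fd);
        return 0;
    }
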
- */ - if ((flags ^ dev->gflags) & IFF_ALLMULTI) { - int inc = (flags & IFF_ALLMULTI) ? +1 : -1; - dev->gflags ^= IFF_ALLMULTI; - dev_set_allmulti(dev, inc); - } - - if (old_flags ^ dev->flags) - rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags ^ dev->flags); - - return ret; -} - -int dev_set_mtu(struct net_device *dev, int new_mtu) -{ - int err; - - if (new_mtu == dev->mtu) - return 0; - - /* MTU must be positive. */ - if (new_mtu < 0) - return -EINVAL; - - if (!netif_device_present(dev)) - return -ENODEV; - - err = 0; - if (dev->change_mtu) - err = dev->change_mtu(dev, new_mtu); - else - dev->mtu = new_mtu; - if (!err && dev->flags & IFF_UP) - notifier_call_chain(&netdev_chain, - NETDEV_CHANGEMTU, dev); - return err; -} - - -/* - * Perform the SIOCxIFxxx calls. - */ -static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd) -{ - int err; - struct net_device *dev = __dev_get_by_name(ifr->ifr_name); - - if (!dev) - return -ENODEV; - - switch (cmd) { - case SIOCGIFFLAGS: /* Get interface flags */ - ifr->ifr_flags = dev_get_flags(dev); - return 0; - - case SIOCSIFFLAGS: /* Set interface flags */ - return dev_change_flags(dev, ifr->ifr_flags); - - case SIOCGIFMETRIC: /* Get the metric on the interface - (currently unused) */ - ifr->ifr_metric = 0; - return 0; - - case SIOCSIFMETRIC: /* Set the metric on the interface - (currently unused) */ - return -EOPNOTSUPP; - - case SIOCGIFMTU: /* Get the MTU of a device */ - ifr->ifr_mtu = dev->mtu; - return 0; - - case SIOCSIFMTU: /* Set the MTU of a device */ - return dev_set_mtu(dev, ifr->ifr_mtu); - - case SIOCGIFHWADDR: - if (!dev->addr_len) - memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); - else - memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, - min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); - ifr->ifr_hwaddr.sa_family = dev->type; - return 0; - - case SIOCSIFHWADDR: - if (!dev->set_mac_address) - return -EOPNOTSUPP; - if (ifr->ifr_hwaddr.sa_family != dev->type) - return -EINVAL; - if (!netif_device_present(dev)) - return -ENODEV; - err = dev->set_mac_address(dev, &ifr->ifr_hwaddr); - if (!err) - notifier_call_chain(&netdev_chain, - NETDEV_CHANGEADDR, dev); - return err; - - case SIOCSIFHWBROADCAST: - if (ifr->ifr_hwaddr.sa_family != dev->type) - return -EINVAL; - memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, - min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); - notifier_call_chain(&netdev_chain, - NETDEV_CHANGEADDR, dev); - return 0; - - case SIOCGIFMAP: - ifr->ifr_map.mem_start = dev->mem_start; - ifr->ifr_map.mem_end = dev->mem_end; - ifr->ifr_map.base_addr = dev->base_addr; - ifr->ifr_map.irq = dev->irq; - ifr->ifr_map.dma = dev->dma; - ifr->ifr_map.port = dev->if_port; - return 0; - - case SIOCSIFMAP: - if (dev->set_config) { - if (!netif_device_present(dev)) - return -ENODEV; - return dev->set_config(dev, &ifr->ifr_map); - } - return -EOPNOTSUPP; - - case SIOCADDMULTI: - if (!dev->set_multicast_list || - ifr->ifr_hwaddr.sa_family != AF_UNSPEC) - return -EINVAL; - if (!netif_device_present(dev)) - return -ENODEV; - return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data, - dev->addr_len, 1); - - case SIOCDELMULTI: - if (!dev->set_multicast_list || - ifr->ifr_hwaddr.sa_family != AF_UNSPEC) - return -EINVAL; - if (!netif_device_present(dev)) - return -ENODEV; - return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data, - dev->addr_len, 1); - - case SIOCGIFINDEX: - ifr->ifr_ifindex = dev->ifindex; - return 0; - - case SIOCGIFTXQLEN: - ifr->ifr_qlen = dev->tx_queue_len; - return 0; - - case SIOCSIFTXQLEN: - if 
(ifr->ifr_qlen < 0) - return -EINVAL; - dev->tx_queue_len = ifr->ifr_qlen; - return 0; - - case SIOCSIFNAME: - ifr->ifr_newname[IFNAMSIZ-1] = '\0'; - return dev_change_name(dev, ifr->ifr_newname); - - /* - * Unknown or private ioctl - */ - - default: - if ((cmd >= SIOCDEVPRIVATE && - cmd <= SIOCDEVPRIVATE + 15) || - cmd == SIOCBONDENSLAVE || - cmd == SIOCBONDRELEASE || - cmd == SIOCBONDSETHWADDR || - cmd == SIOCBONDSLAVEINFOQUERY || - cmd == SIOCBONDINFOQUERY || - cmd == SIOCBONDCHANGEACTIVE || - cmd == SIOCGMIIPHY || - cmd == SIOCGMIIREG || - cmd == SIOCSMIIREG || - cmd == SIOCBRADDIF || - cmd == SIOCBRDELIF || - cmd == SIOCWANDEV) { - err = -EOPNOTSUPP; - if (dev->do_ioctl) { - if (netif_device_present(dev)) - err = dev->do_ioctl(dev, ifr, - cmd); - else - err = -ENODEV; - } - } else - err = -EINVAL; - - } - return err; -} - -/* - * This function handles all "interface"-type I/O control requests. The actual - * 'doing' part of this is dev_ifsioc above. - */ - -/** - * dev_ioctl - network device ioctl - * @cmd: command to issue - * @arg: pointer to a struct ifreq in user space - * - * Issue ioctl functions to devices. This is normally called by the - * user space syscall interfaces but can sometimes be useful for - * other purposes. The return value is the return from the syscall if - * positive or a negative errno code on error. - */ - -int dev_ioctl(unsigned int cmd, void __user *arg) -{ - struct ifreq ifr; - int ret; - char *colon; - - /* One special case: SIOCGIFCONF takes ifconf argument - and requires shared lock, because it sleeps writing - to user space. - */ - - if (cmd == SIOCGIFCONF) { - rtnl_shlock(); - ret = dev_ifconf((char __user *) arg); - rtnl_shunlock(); - return ret; - } - if (cmd == SIOCGIFNAME) - return dev_ifname((struct ifreq __user *)arg); - - if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) - return -EFAULT; - - ifr.ifr_name[IFNAMSIZ-1] = 0; - - colon = strchr(ifr.ifr_name, ':'); - if (colon) - *colon = 0; - - /* - * See which interface the caller is talking about. - */ - - switch (cmd) { - /* - * These ioctl calls: - * - can be done by all. - * - atomic and do not require locking. - * - return a value - */ - case SIOCGIFFLAGS: - case SIOCGIFMETRIC: - case SIOCGIFMTU: - case SIOCGIFHWADDR: - case SIOCGIFSLAVE: - case SIOCGIFMAP: - case SIOCGIFINDEX: - case SIOCGIFTXQLEN: - dev_load(ifr.ifr_name); - read_lock(&dev_base_lock); - ret = dev_ifsioc(&ifr, cmd); - read_unlock(&dev_base_lock); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } - return ret; - - case SIOCETHTOOL: - dev_load(ifr.ifr_name); - rtnl_lock(); - ret = dev_ethtool(&ifr); - rtnl_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } - return ret; - - /* - * These ioctl calls: - * - require superuser power. - * - require strict serialization. - * - return a value - */ - case SIOCGMIIPHY: - case SIOCGMIIREG: - case SIOCSIFNAME: - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - dev_load(ifr.ifr_name); - rtnl_lock(); - ret = dev_ifsioc(&ifr, cmd); - rtnl_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } - return ret; - - /* - * These ioctl calls: - * - require superuser power. - * - require strict serialization. 
- * - do not return a value - */ - case SIOCSIFFLAGS: - case SIOCSIFMETRIC: - case SIOCSIFMTU: - case SIOCSIFMAP: - case SIOCSIFHWADDR: - case SIOCSIFSLAVE: - case SIOCADDMULTI: - case SIOCDELMULTI: - case SIOCSIFHWBROADCAST: - case SIOCSIFTXQLEN: - case SIOCSMIIREG: - case SIOCBONDENSLAVE: - case SIOCBONDRELEASE: - case SIOCBONDSETHWADDR: - case SIOCBONDSLAVEINFOQUERY: - case SIOCBONDINFOQUERY: - case SIOCBONDCHANGEACTIVE: - case SIOCBRADDIF: - case SIOCBRDELIF: - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - dev_load(ifr.ifr_name); - rtnl_lock(); - ret = dev_ifsioc(&ifr, cmd); - rtnl_unlock(); - return ret; - - case SIOCGIFMEM: - /* Get the per device memory space. We can add this but - * currently do not support it */ - case SIOCSIFMEM: - /* Set the per device memory buffer space. - * Not applicable in our case */ - case SIOCSIFLINK: - return -EINVAL; - - /* - * Unknown or private ioctl. - */ - default: - if (cmd == SIOCWANDEV || - (cmd >= SIOCDEVPRIVATE && - cmd <= SIOCDEVPRIVATE + 15)) { - dev_load(ifr.ifr_name); - rtnl_lock(); - ret = dev_ifsioc(&ifr, cmd); - rtnl_unlock(); - if (!ret && copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - return ret; - } -#ifdef WIRELESS_EXT - /* Take care of Wireless Extensions */ - if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { - /* If command is `set a parameter', or - * `get the encoding parameters', check if - * the user has the right to do it */ - if (IW_IS_SET(cmd) || cmd == SIOCGIWENCODE) { - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - } - dev_load(ifr.ifr_name); - rtnl_lock(); - /* Follow me in net/core/wireless.c */ - ret = wireless_process_ioctl(&ifr, cmd); - rtnl_unlock(); - if (IW_IS_GET(cmd) && - copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - return ret; - } -#endif /* WIRELESS_EXT */ - return -EINVAL; - } -} - - -/** - * dev_new_index - allocate an ifindex - * - * Returns a suitable unique value for a new device interface - * number. The caller must hold the rtnl semaphore or the - * dev_base_lock to be sure it remains unique. - */ -static int dev_new_index(void) -{ - static int ifindex; - for (;;) { - if (++ifindex <= 0) - ifindex = 1; - if (!__dev_get_by_index(ifindex)) - return ifindex; - } -} - -static int dev_boot_phase = 1; - -/* Delayed registration/unregisteration */ -static DEFINE_SPINLOCK(net_todo_list_lock); -static struct list_head net_todo_list = LIST_HEAD_INIT(net_todo_list); - -static inline void net_set_todo(struct net_device *dev) -{ - spin_lock(&net_todo_list_lock); - list_add_tail(&dev->todo_list, &net_todo_list); - spin_unlock(&net_todo_list_lock); -} - -/** - * register_netdevice - register a network device - * @dev: device to register - * - * Take a completed network device structure and add it to the kernel - * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier - * chain. 0 is returned on success. A negative errno code is returned - * on a failure to set up the device, or if the name is a duplicate. - * - * Callers must hold the rtnl semaphore. You may want - * register_netdev() instead of this. - * - * BUGS: - * The locking appears insufficient to guarantee two parallel registers - * will not get the same name. - */ - -int register_netdevice(struct net_device *dev) -{ - struct hlist_head *head; - struct hlist_node *p; - int ret; - - BUG_ON(dev_boot_phase); - ASSERT_RTNL(); - - /* When net_device's are persistent, this will be fatal. 
*/
-    BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
-
-    spin_lock_init(&dev->queue_lock);
-    spin_lock_init(&dev->xmit_lock);
-    dev->xmit_lock_owner = -1;
-#ifdef CONFIG_NET_CLS_ACT
-    spin_lock_init(&dev->ingress_lock);
-#endif
-
-    ret = alloc_divert_blk(dev);
-    if (ret)
-        goto out;
-
-    dev->iflink = -1;
-
-    /* Init, if this function is available */
-    if (dev->init) {
-        ret = dev->init(dev);
-        if (ret) {
-            if (ret > 0)
-                ret = -EIO;
-            goto out_err;
-        }
-    }
-
-    if (!dev_valid_name(dev->name)) {
-        ret = -EINVAL;
-        goto out_err;
-    }
-
-    dev->ifindex = dev_new_index();
-    if (dev->iflink == -1)
-        dev->iflink = dev->ifindex;
-
-    /* Check for existence of name */
-    head = dev_name_hash(dev->name);
-    hlist_for_each(p, head) {
-        struct net_device *d
-            = hlist_entry(p, struct net_device, name_hlist);
-        if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
-            ret = -EEXIST;
-            goto out_err;
-        }
-    }
-
-    /* Fix illegal SG+CSUM combinations. */
-    if ((dev->features & NETIF_F_SG) &&
-        !(dev->features & (NETIF_F_IP_CSUM |
-                           NETIF_F_NO_CSUM |
-                           NETIF_F_HW_CSUM))) {
-        printk("%s: Dropping NETIF_F_SG since no checksum feature.\n",
-               dev->name);
-        dev->features &= ~NETIF_F_SG;
-    }
-
-    /* TSO requires that SG is present as well. */
-    if ((dev->features & NETIF_F_TSO) &&
-        !(dev->features & NETIF_F_SG)) {
-        printk("%s: Dropping NETIF_F_TSO since no SG feature.\n",
-               dev->name);
-        dev->features &= ~NETIF_F_TSO;
-    }
-
-    /*
-     * Nil rebuild_header routine; it should never be called and
-     * serves only as a bug trap.
-     */
-
-    if (!dev->rebuild_header)
-        dev->rebuild_header = default_rebuild_header;
-
-    /*
-     * Default initial state at registry is that the
-     * device is present.
-     */
-
-    set_bit(__LINK_STATE_PRESENT, &dev->state);
-
-    dev->next = NULL;
-    dev_init_scheduler(dev);
-    write_lock_bh(&dev_base_lock);
-    *dev_tail = dev;
-    dev_tail = &dev->next;
-    hlist_add_head(&dev->name_hlist, head);
-    hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex));
-    dev_hold(dev);
-    dev->reg_state = NETREG_REGISTERING;
-    write_unlock_bh(&dev_base_lock);
-
-    /* Notify protocols that a new device has appeared. */
-    notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
-
-    /* Finish registration after unlock */
-    net_set_todo(dev);
-    ret = 0;
-
-out:
-    return ret;
-out_err:
-    free_divert_blk(dev);
-    goto out;
-}
-
-/**
- * register_netdev - register a network device
- * @dev: device to register
- *
- * Take a completed network device structure and add it to the kernel
- * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
- * chain. 0 is returned on success. A negative errno code is returned
- * on a failure to set up the device, or if the name is a duplicate.
- *
- * This is a wrapper around register_netdevice() that takes the rtnl
- * semaphore and expands the device name if you passed a format string
- * to alloc_netdev.
- */
-int register_netdev(struct net_device *dev)
-{
-    int err;
-
-    rtnl_lock();
-
-    /*
-     * If the name is a format string the caller wants us to do a
-     * name allocation.
-     */
-    if (strchr(dev->name, '%')) {
-        err = dev_alloc_name(dev, dev->name);
-        if (err < 0)
-            goto out;
-    }
-
-    /*
-     * Back compatibility hook. Kill this one in 2.5
-     */
-    if (dev->name[0] == 0 || dev->name[0] == ' ') {
-        err = dev_alloc_name(dev, "eth%d");
-        if (err < 0)
-            goto out;
-    }
-
-    err = register_netdevice(dev);
-out:
-    rtnl_unlock();
-    return err;
-}
-EXPORT_SYMBOL(register_netdev);
-
-/*
- * netdev_wait_allrefs - wait until all references are gone.
- *
- * This is called when unregistering network devices.
- * - * Any protocol or device that holds a reference should register - * for netdevice notification, and cleanup and put back the - * reference if they receive an UNREGISTER event. - * We can get stuck here if buggy protocols don't correctly - * call dev_put. - */ -static void netdev_wait_allrefs(struct net_device *dev) -{ - unsigned long rebroadcast_time, warning_time; - - rebroadcast_time = warning_time = jiffies; - while (atomic_read(&dev->refcnt) != 0) { - if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { - rtnl_shlock(); - - /* Rebroadcast unregister notification */ - notifier_call_chain(&netdev_chain, - NETDEV_UNREGISTER, dev); - - if (test_bit(__LINK_STATE_LINKWATCH_PENDING, - &dev->state)) { - /* We must not have linkwatch events - * pending on unregister. If this - * happens, we simply run the queue - * unscheduled, resulting in a noop - * for this device. - */ - linkwatch_run_queue(); - } - - rtnl_shunlock(); - - rebroadcast_time = jiffies; - } - - msleep(250); - - if (time_after(jiffies, warning_time + 10 * HZ)) { - printk(KERN_EMERG "unregister_netdevice: " - "waiting for %s to become free. Usage " - "count = %d\n", - dev->name, atomic_read(&dev->refcnt)); - warning_time = jiffies; - } - } -} - -/* The sequence is: - * - * rtnl_lock(); - * ... - * register_netdevice(x1); - * register_netdevice(x2); - * ... - * unregister_netdevice(y1); - * unregister_netdevice(y2); - * ... - * rtnl_unlock(); - * free_netdev(y1); - * free_netdev(y2); - * - * We are invoked by rtnl_unlock() after it drops the semaphore. - * This allows us to deal with problems: - * 1) We can create/delete sysfs objects which invoke hotplug - * without deadlocking with linkwatch via keventd. - * 2) Since we run with the RTNL semaphore not held, we can sleep - * safely in order to wait for the netdev refcnt to drop to zero. - */ -static DECLARE_MUTEX(net_todo_run_mutex); -void netdev_run_todo(void) -{ - struct list_head list = LIST_HEAD_INIT(list); - int err; - - - /* Need to guard against multiple cpu's getting out of order. */ - down(&net_todo_run_mutex); - - /* Not safe to do outside the semaphore. We must not return - * until all unregister events invoked by the local processor - * have been completed (either by this todo run, or one on - * another cpu). - */ - if (list_empty(&net_todo_list)) - goto out; - - /* Snapshot list, allow later requests */ - spin_lock(&net_todo_list_lock); - list_splice_init(&net_todo_list, &list); - spin_unlock(&net_todo_list_lock); - - while (!list_empty(&list)) { - struct net_device *dev - = list_entry(list.next, struct net_device, todo_list); - list_del(&dev->todo_list); - - switch(dev->reg_state) { - case NETREG_REGISTERING: - err = netdev_register_sysfs(dev); - if (err) - printk(KERN_ERR "%s: failed sysfs registration (%d)\n", - dev->name, err); - dev->reg_state = NETREG_REGISTERED; - break; - - case NETREG_UNREGISTERING: - netdev_unregister_sysfs(dev); - dev->reg_state = NETREG_UNREGISTERED; - - netdev_wait_allrefs(dev); - - /* paranoia */ - BUG_ON(atomic_read(&dev->refcnt)); - BUG_TRAP(!dev->ip_ptr); - BUG_TRAP(!dev->ip6_ptr); - BUG_TRAP(!dev->dn_ptr); - - - /* It must be the very last action, - * after this 'dev' may point to freed up memory. 
- */
-            if (dev->destructor)
-                dev->destructor(dev);
-            break;
-
-        default:
-            printk(KERN_ERR "network todo '%s' but state %d\n",
-                   dev->name, dev->reg_state);
-            break;
-        }
-    }
-
-out:
-    up(&net_todo_run_mutex);
-}
-
-/**
- * alloc_netdev - allocate network device
- * @sizeof_priv: size of private data to allocate space for
- * @name: device name format string
- * @setup: callback to initialize device
- *
- * Allocates a struct net_device with private data area for driver use
- * and performs basic initialization.
- */
-struct net_device *alloc_netdev(int sizeof_priv, const char *name,
-                                void (*setup)(struct net_device *))
-{
-    void *p;
-    struct net_device *dev;
-    int alloc_size;
-
-    /* ensure 32-byte alignment of both the device and private area */
-    alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
-    alloc_size += sizeof_priv + NETDEV_ALIGN_CONST;
-
-    p = kmalloc(alloc_size, GFP_KERNEL);
-    if (!p) {
-        printk(KERN_ERR "alloc_dev: Unable to allocate device.\n");
-        return NULL;
-    }
-    memset(p, 0, alloc_size);
-
-    dev = (struct net_device *)
-          (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
-    dev->padded = (char *)dev - (char *)p;
-
-    if (sizeof_priv)
-        dev->priv = netdev_priv(dev);
-
-    setup(dev);
-    strcpy(dev->name, name);
-    return dev;
-}
-EXPORT_SYMBOL(alloc_netdev);
-
-/**
- * free_netdev - free network device
- * @dev: device
- *
- * This function does the last stage of destroying an allocated device
- * interface. The reference to the device object is released.
- * If this is the last reference then it will be freed.
- */
-void free_netdev(struct net_device *dev)
-{
-#ifdef CONFIG_SYSFS
-    /* Compatibility with error handling in drivers */
-    if (dev->reg_state == NETREG_UNINITIALIZED) {
-        kfree((char *)dev - dev->padded);
-        return;
-    }
-
-    BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
-    dev->reg_state = NETREG_RELEASED;
-
-    /* will free via class release */
-    class_device_put(&dev->class_dev);
-#else
-    kfree((char *)dev - dev->padded);
-#endif
-}
-
-/* Synchronize with packet receive processing. */
-void synchronize_net(void)
-{
-    might_sleep();
-    synchronize_kernel();
-}
-
-/**
- * unregister_netdevice - remove device from the kernel
- * @dev: device
- *
- * This function shuts down a device interface and removes it
- * from the kernel tables. On success 0 is returned, on a failure
- * a negative errno code is returned.
- *
- * Callers must hold the rtnl semaphore. You may want
- * unregister_netdev() instead of this.
- */
-
-int unregister_netdevice(struct net_device *dev)
-{
-    struct net_device *d, **dp;
-
-    BUG_ON(dev_boot_phase);
-    ASSERT_RTNL();
-
-    /* Some devices call without registering for initialization unwind. */
-    if (dev->reg_state == NETREG_UNINITIALIZED) {
-        printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
-               "was registered\n", dev->name, dev);
-        return -ENODEV;
-    }
-
-    BUG_ON(dev->reg_state != NETREG_REGISTERED);
-
-    /* If device is running, close it first. */
-    if (dev->flags & IFF_UP)
-        dev_close(dev);
-
-    /* And unlink it from the device chain.
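
Taken together, alloc_netdev(), register_netdev() and the unregister/free pair above form the canonical driver skeleton. A sketch under obvious assumptions (the private struct, setup routine and "dummy%d" name are hypothetical, and a real driver would also fill in dev->hard_start_xmit and friends before registering):

    #include <linux/module.h>
    #include <linux/netdevice.h>
    #include <linux/etherdevice.h>

    struct example_priv { int placeholder; };  /* hypothetical private data */
    static struct net_device *example_dev;

    static void example_setup(struct net_device *dev)
    {
        ether_setup(dev);            /* sensible Ethernet defaults */
    }

    static int __init example_init(void)
    {
        int err;

        example_dev = alloc_netdev(sizeof(struct example_priv),
                                   "dummy%d", example_setup);
        if (!example_dev)
            return -ENOMEM;
        err = register_netdev(example_dev);  /* takes rtnl, expands "%d" */
        if (err)
            free_netdev(example_dev);
        return err;
    }

    static void __exit example_exit(void)
    {
        unregister_netdev(example_dev);
        free_netdev(example_dev);    /* the last reference drops here */
    }

    module_init(example_init);
    module_exit(example_exit);
    MODULE_LICENSE("GPL");
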
*/ - for (dp = &dev_base; (d = *dp) != NULL; dp = &d->next) { - if (d == dev) { - write_lock_bh(&dev_base_lock); - hlist_del(&dev->name_hlist); - hlist_del(&dev->index_hlist); - if (dev_tail == &dev->next) - dev_tail = dp; - *dp = d->next; - write_unlock_bh(&dev_base_lock); - break; - } - } - if (!d) { - printk(KERN_ERR "unregister net_device: '%s' not found\n", - dev->name); - return -ENODEV; - } - - dev->reg_state = NETREG_UNREGISTERING; - - synchronize_net(); - - /* Shutdown queueing discipline. */ - dev_shutdown(dev); - - - /* Notify protocols, that we are about to destroy - this device. They should clean all the things. - */ - notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev); - - /* - * Flush the multicast chain - */ - dev_mc_discard(dev); - - if (dev->uninit) - dev->uninit(dev); - - /* Notifier chain MUST detach us from master device. */ - BUG_TRAP(!dev->master); - - free_divert_blk(dev); - - /* Finish processing unregister after unlock */ - net_set_todo(dev); - - synchronize_net(); - - dev_put(dev); - return 0; -} - -/** - * unregister_netdev - remove device from the kernel - * @dev: device - * - * This function shuts down a device interface and removes it - * from the kernel tables. On success 0 is returned, on a failure - * a negative errno code is returned. - * - * This is just a wrapper for unregister_netdevice that takes - * the rtnl semaphore. In general you want to use this and not - * unregister_netdevice. - */ -void unregister_netdev(struct net_device *dev) -{ - rtnl_lock(); - unregister_netdevice(dev); - rtnl_unlock(); -} - -EXPORT_SYMBOL(unregister_netdev); - -#ifdef CONFIG_HOTPLUG_CPU -static int dev_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *ocpu) -{ - struct sk_buff **list_skb; - struct net_device **list_net; - struct sk_buff *skb; - unsigned int cpu, oldcpu = (unsigned long)ocpu; - struct softnet_data *sd, *oldsd; - - if (action != CPU_DEAD) - return NOTIFY_OK; - - local_irq_disable(); - cpu = smp_processor_id(); - sd = &per_cpu(softnet_data, cpu); - oldsd = &per_cpu(softnet_data, oldcpu); - - /* Find end of our completion_queue. */ - list_skb = &sd->completion_queue; - while (*list_skb) - list_skb = &(*list_skb)->next; - /* Append completion queue from offline CPU. */ - *list_skb = oldsd->completion_queue; - oldsd->completion_queue = NULL; - - /* Find end of our output_queue. */ - list_net = &sd->output_queue; - while (*list_net) - list_net = &(*list_net)->next_sched; - /* Append output queue from offline CPU. */ - *list_net = oldsd->output_queue; - oldsd->output_queue = NULL; - - raise_softirq_irqoff(NET_TX_SOFTIRQ); - local_irq_enable(); - - /* Process offline CPU's input_pkt_queue */ - while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) - netif_rx(skb); - - return NOTIFY_OK; -} -#endif /* CONFIG_HOTPLUG_CPU */ - - -/* - * Initialize the DEV module. At boot time this walks the device list and - * unhooks any devices that fail to initialise (normally hardware not - * present) and leaves us with a valid list of present and active devices. - * - */ - -/* - * This is called single threaded during boot, so no need - * to take the rtnl semaphore. 
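The queue migration in dev_cpu_callback() above is pointer-to-pointer list splicing: walk next pointers to the tail of our queue, then graft the dead CPU's whole queue on in one assignment. A stand-alone sketch of the pattern, with illustrative types:

#include <stdio.h>

struct node { int val; struct node *next; };

/* Append the whole of *src onto the tail of *dst and empty *src, as the
 * callback does with the dead CPU's completion_queue and output_queue. */
static void splice_tail(struct node **dst, struct node **src)
{
	while (*dst)
		dst = &(*dst)->next;	/* walk to our queue's tail */
	*dst = *src;			/* graft the offline CPU's queue */
	*src = NULL;
}

int main(void)
{
	struct node b = { 2, NULL }, a = { 1, NULL };
	struct node *mine = &a, *theirs = &b;

	splice_tail(&mine, &theirs);
	for (struct node *n = mine; n; n = n->next)
		printf("%d\n", n->val);		/* prints 1 then 2 */
	return 0;
}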
- */ -static int __init net_dev_init(void) -{ - int i, rc = -ENOMEM; - - BUG_ON(!dev_boot_phase); - - net_random_init(); - - if (dev_proc_init()) - goto out; - - if (netdev_sysfs_init()) - goto out; - - INIT_LIST_HEAD(&ptype_all); - for (i = 0; i < 16; i++) - INIT_LIST_HEAD(&ptype_base[i]); - - for (i = 0; i < ARRAY_SIZE(dev_name_head); i++) - INIT_HLIST_HEAD(&dev_name_head[i]); - - for (i = 0; i < ARRAY_SIZE(dev_index_head); i++) - INIT_HLIST_HEAD(&dev_index_head[i]); - - /* - * Initialise the packet receive queues. - */ - - for (i = 0; i < NR_CPUS; i++) { - struct softnet_data *queue; - - queue = &per_cpu(softnet_data, i); - skb_queue_head_init(&queue->input_pkt_queue); - queue->throttle = 0; - queue->cng_level = 0; - queue->avg_blog = 10; /* arbitrary non-zero */ - queue->completion_queue = NULL; - INIT_LIST_HEAD(&queue->poll_list); - set_bit(__LINK_STATE_START, &queue->backlog_dev.state); - queue->backlog_dev.weight = weight_p; - queue->backlog_dev.poll = process_backlog; - atomic_set(&queue->backlog_dev.refcnt, 1); - } - -#ifdef OFFLINE_SAMPLE - samp_timer.expires = jiffies + (10 * HZ); - add_timer(&samp_timer); -#endif - - dev_boot_phase = 0; - - open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL); - open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL); - - hotcpu_notifier(dev_cpu_callback, 0); - dst_init(); - dev_mcast_init(); - rc = 0; -out: - return rc; -} - -subsys_initcall(net_dev_init); - -EXPORT_SYMBOL(__dev_get_by_index); -EXPORT_SYMBOL(__dev_get_by_name); -EXPORT_SYMBOL(__dev_remove_pack); -EXPORT_SYMBOL(__skb_linearize); -EXPORT_SYMBOL(dev_add_pack); -EXPORT_SYMBOL(dev_alloc_name); -EXPORT_SYMBOL(dev_close); -EXPORT_SYMBOL(dev_get_by_flags); -EXPORT_SYMBOL(dev_get_by_index); -EXPORT_SYMBOL(dev_get_by_name); -EXPORT_SYMBOL(dev_ioctl); -EXPORT_SYMBOL(dev_open); -EXPORT_SYMBOL(dev_queue_xmit); -EXPORT_SYMBOL(dev_remove_pack); -EXPORT_SYMBOL(dev_set_allmulti); -EXPORT_SYMBOL(dev_set_promiscuity); -EXPORT_SYMBOL(dev_change_flags); -EXPORT_SYMBOL(dev_set_mtu); -EXPORT_SYMBOL(free_netdev); -EXPORT_SYMBOL(netdev_boot_setup_check); -EXPORT_SYMBOL(netdev_set_master); -EXPORT_SYMBOL(netdev_state_change); -EXPORT_SYMBOL(netif_receive_skb); -EXPORT_SYMBOL(netif_rx); -EXPORT_SYMBOL(register_gifconf); -EXPORT_SYMBOL(register_netdevice); -EXPORT_SYMBOL(register_netdevice_notifier); -EXPORT_SYMBOL(skb_checksum_help); -EXPORT_SYMBOL(synchronize_net); -EXPORT_SYMBOL(unregister_netdevice); -EXPORT_SYMBOL(unregister_netdevice_notifier); -EXPORT_SYMBOL(net_enable_timestamp); -EXPORT_SYMBOL(net_disable_timestamp); - -#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) -EXPORT_SYMBOL(br_handle_frame_hook); -#endif - -#ifdef CONFIG_KMOD -EXPORT_SYMBOL(dev_load); -#endif - -EXPORT_PER_CPU_SYMBOL(softnet_data); diff -r d75a502b45eb -r 43e28a2f6037 linux-2.6.11-xen-sparse/net/core/skbuff.c --- a/linux-2.6.11-xen-sparse/net/core/skbuff.c Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,1523 +0,0 @@ -/* - * Routines having to do with the 'struct sk_buff' memory handlers. - * - * Authors: Alan Cox <iiitac@xxxxxxxxxxxxxx> - * Florian La Roche <rzsfl@xxxxxxxxxxxx> - * - * Version: $Id: skbuff.c,v 1.90 2001/11/07 05:56:19 davem Exp $ - * - * Fixes: - * Alan Cox : Fixed the worst of the load - * balancer bugs. - * Dave Platt : Interrupt stacking fix. - * Richard Kooijman : Timestamp fixes. - * Alan Cox : Changed buffer format. - * Alan Cox : destructor hook for AF_UNIX etc. - * Linus Torvalds : Better skb_clone. - * Alan Cox : Added skb_copy. 
- * Alan Cox : Added all the changed routines Linus - * only put in the headers - * Ray VanTassle : Fixed --skb->lock in free - * Alan Cox : skb_copy copy arp field - * Andi Kleen : slabified it. - * Robert Olsson : Removed skb_head_pool - * - * NOTE: - * The __skb_ routines should be called with interrupts - * disabled, or you better be *real* sure that the operation is atomic - * with respect to whatever list is being frobbed (e.g. via lock_sock() - * or via disabling bottom half handlers, etc). - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -/* - * The functions in this file will not compile correctly with gcc 2.4.x - */ - -#include <linux/config.h> -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/interrupt.h> -#include <linux/in.h> -#include <linux/inet.h> -#include <linux/slab.h> -#include <linux/netdevice.h> -#ifdef CONFIG_NET_CLS_ACT -#include <net/pkt_sched.h> -#endif -#include <linux/string.h> -#include <linux/skbuff.h> -#include <linux/cache.h> -#include <linux/rtnetlink.h> -#include <linux/init.h> -#include <linux/highmem.h> - -#include <net/protocol.h> -#include <net/dst.h> -#include <net/sock.h> -#include <net/checksum.h> -#include <net/xfrm.h> - -#include <asm/uaccess.h> -#include <asm/system.h> - -static kmem_cache_t *skbuff_head_cache; - -/* - * Keep out-of-line to prevent kernel bloat. - * __builtin_return_address is not used because it is not always - * reliable. - */ - -/** - * skb_over_panic - private function - * @skb: buffer - * @sz: size - * @here: address - * - * Out of line support code for skb_put(). Not user callable. - */ -void skb_over_panic(struct sk_buff *skb, int sz, void *here) -{ - printk(KERN_INFO "skput:over: %p:%d put:%d dev:%s", - here, skb->len, sz, skb->dev ? skb->dev->name : "<NULL>"); - BUG(); -} - -/** - * skb_under_panic - private function - * @skb: buffer - * @sz: size - * @here: address - * - * Out of line support code for skb_push(). Not user callable. - */ - -void skb_under_panic(struct sk_buff *skb, int sz, void *here) -{ - printk(KERN_INFO "skput:under: %p:%d put:%d dev:%s", - here, skb->len, sz, skb->dev ? skb->dev->name : "<NULL>"); - BUG(); -} - -/* Allocate a new skbuff. We do this ourselves so we can fill in a few - * 'private' fields and also do memory statistics to find all the - * [BEEP] leaks. - * - */ - -/** - * alloc_skb - allocate a network buffer - * @size: size to allocate - * @gfp_mask: allocation mask - * - * Allocate a new &sk_buff. The returned buffer has no headroom and a - * tail room of size bytes. The object has a reference count of one. - * The return is the buffer. On a failure the return is %NULL. - * - * Buffers may only be allocated from interrupts using a @gfp_mask of - * %GFP_ATOMIC. - */ -struct sk_buff *alloc_skb(unsigned int size, int gfp_mask) -{ - struct sk_buff *skb; - u8 *data; - - /* Get the HEAD */ - skb = kmem_cache_alloc(skbuff_head_cache, - gfp_mask & ~__GFP_DMA); - if (!skb) - goto out; - - /* Get the DATA. Size must match skb_add_mtu(). 
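alloc_skb() above carves one allocation into the data area plus a trailing struct skb_shared_info: head, data and tail all start together (no headroom), end marks where the shared info begins, and tailroom is end - tail. A sketch of that layout under simplified types; SKB_DATA_ALIGN is approximated here by 16-byte rounding:

#include <stdio.h>
#include <stdlib.h>

#define DATA_ALIGN(x) (((x) + 15UL) & ~15UL)	/* stand-in for SKB_DATA_ALIGN */

struct shared_info { int dataref; int nr_frags; };

struct buf { unsigned char *head, *data, *tail, *end; };

static int buf_init(struct buf *b, size_t size)
{
	size = DATA_ALIGN(size);
	unsigned char *mem = malloc(size + sizeof(struct shared_info));
	if (!mem)
		return -1;

	b->head = b->data = b->tail = mem;	/* no headroom, no data yet */
	b->end  = mem + size;			/* tailroom == size */

	struct shared_info *sh = (struct shared_info *)b->end;
	sh->dataref  = 1;
	sh->nr_frags = 0;
	return 0;
}

int main(void)
{
	struct buf b;
	if (buf_init(&b, 100) == 0) {
		printf("tailroom = %ld\n", (long)(b.end - b.tail));
		free(b.head);
	}
	return 0;
}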
*/ - size = SKB_DATA_ALIGN(size); - data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); - if (!data) - goto nodata; - - memset(skb, 0, offsetof(struct sk_buff, truesize)); - skb->truesize = size + sizeof(struct sk_buff); - atomic_set(&skb->users, 1); - skb->head = data; - skb->data = data; - skb->tail = data; - skb->end = data + size; - - atomic_set(&(skb_shinfo(skb)->dataref), 1); - skb_shinfo(skb)->nr_frags = 0; - skb_shinfo(skb)->tso_size = 0; - skb_shinfo(skb)->tso_segs = 0; - skb_shinfo(skb)->frag_list = NULL; -out: - return skb; -nodata: - kmem_cache_free(skbuff_head_cache, skb); - skb = NULL; - goto out; -} - -/** - * alloc_skb_from_cache - allocate a network buffer - * @cp: kmem_cache from which to allocate the data area - * (object size must be big enough for @size bytes + skb overheads) - * @size: size to allocate - * @gfp_mask: allocation mask - * - * Allocate a new &sk_buff. The returned buffer has no headroom and - * tail room of size bytes. The object has a reference count of one. - * The return is the buffer. On a failure the return is %NULL. - * - * Buffers may only be allocated from interrupts using a @gfp_mask of - * %GFP_ATOMIC. - */ -struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp, - unsigned int size, int gfp_mask) -{ - struct sk_buff *skb; - u8 *data; - - /* Get the HEAD */ - skb = kmem_cache_alloc(skbuff_head_cache, - gfp_mask & ~__GFP_DMA); - if (!skb) - goto out; - - /* Get the DATA. */ - size = SKB_DATA_ALIGN(size); - data = kmem_cache_alloc(cp, gfp_mask); - if (!data) - goto nodata; - - memset(skb, 0, offsetof(struct sk_buff, truesize)); - skb->truesize = size + sizeof(struct sk_buff); - atomic_set(&skb->users, 1); - skb->head = data; - skb->data = data; - skb->tail = data; - skb->end = data + size; - - atomic_set(&(skb_shinfo(skb)->dataref), 1); - skb_shinfo(skb)->nr_frags = 0; - skb_shinfo(skb)->tso_size = 0; - skb_shinfo(skb)->tso_segs = 0; - skb_shinfo(skb)->frag_list = NULL; -out: - return skb; -nodata: - kmem_cache_free(skbuff_head_cache, skb); - skb = NULL; - goto out; -} - - -static void skb_drop_fraglist(struct sk_buff *skb) -{ - struct sk_buff *list = skb_shinfo(skb)->frag_list; - - skb_shinfo(skb)->frag_list = NULL; - - do { - struct sk_buff *this = list; - list = list->next; - kfree_skb(this); - } while (list); -} - -static void skb_clone_fraglist(struct sk_buff *skb) -{ - struct sk_buff *list; - - for (list = skb_shinfo(skb)->frag_list; list; list = list->next) - skb_get(list); -} - -void skb_release_data(struct sk_buff *skb) -{ - if (!skb->cloned || - atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) { - if (skb_shinfo(skb)->nr_frags) { - int i; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - put_page(skb_shinfo(skb)->frags[i].page); - } - - if (skb_shinfo(skb)->frag_list) - skb_drop_fraglist(skb); - - kfree(skb->head); - } -} - -/* - * Free an skbuff by memory without cleaning the state. - */ -void kfree_skbmem(struct sk_buff *skb) -{ - skb_release_data(skb); - kmem_cache_free(skbuff_head_cache, skb); -} - -/** - * __kfree_skb - private function - * @skb: buffer - * - * Free an sk_buff. Release anything attached to the buffer. - * Clean the state. This is an internal helper function. 
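The release rule in skb_release_data() above is worth spelling out: the data area may be freed immediately when the buffer was never cloned, otherwise only by whoever drops the last dataref. A user-space analogue with C11 atomics; field and function names are invented, and the sketch mirrors the kernel's trick of keeping the shared refcount inside the same allocation as the data:

#include <stdlib.h>
#include <stdatomic.h>

struct shared { atomic_int dataref; };

struct buf {
	int cloned;
	unsigned char *head;	/* data area; struct shared lives at its end */
	struct shared *sh;
};

static int buf_alloc(struct buf *b, size_t size)
{
	b->head = malloc(size + sizeof(struct shared));
	if (!b->head)
		return -1;
	b->sh = (struct shared *)(b->head + size);
	atomic_init(&b->sh->dataref, 1);
	b->cloned = 0;
	return 0;
}

static void buf_clone(struct buf *n, struct buf *old)
{
	*n = *old;			/* share the same data area */
	atomic_fetch_add(&old->sh->dataref, 1);
	n->cloned = old->cloned = 1;
}

static void release_data(struct buf *b)
{
	/* free only if never cloned, or if we drop the last reference */
	if (!b->cloned || atomic_fetch_sub(&b->sh->dataref, 1) == 1)
		free(b->head);
}

int main(void)
{
	struct buf a, c;
	if (buf_alloc(&a, 64))
		return 1;
	buf_clone(&c, &a);
	release_data(&a);	/* dataref 2 -> 1: data stays */
	release_data(&c);	/* dataref 1 -> 0: data freed */
	return 0;
}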
Users should - * always call kfree_skb - */ - -void __kfree_skb(struct sk_buff *skb) -{ - if (skb->list) { - printk(KERN_WARNING "Warning: kfree_skb passed an skb still " - "on a list (from %p).\n", NET_CALLER(skb)); - BUG(); - } - - dst_release(skb->dst); -#ifdef CONFIG_XFRM - secpath_put(skb->sp); -#endif - if(skb->destructor) { - if (in_irq()) - printk(KERN_WARNING "Warning: kfree_skb on " - "hard IRQ %p\n", NET_CALLER(skb)); - skb->destructor(skb); - } -#ifdef CONFIG_NETFILTER - nf_conntrack_put(skb->nfct); -#ifdef CONFIG_BRIDGE_NETFILTER - nf_bridge_put(skb->nf_bridge); -#endif -#endif -/* XXX: IS this still necessary? - JHS */ -#ifdef CONFIG_NET_SCHED - skb->tc_index = 0; -#ifdef CONFIG_NET_CLS_ACT - skb->tc_verd = 0; - skb->tc_classid = 0; -#endif -#endif - - kfree_skbmem(skb); -} - -/** - * skb_clone - duplicate an sk_buff - * @skb: buffer to clone - * @gfp_mask: allocation priority - * - * Duplicate an &sk_buff. The new one is not owned by a socket. Both - * copies share the same packet data but not structure. The new - * buffer has a reference count of 1. If the allocation fails the - * function returns %NULL otherwise the new buffer is returned. - * - * If this function is called from an interrupt gfp_mask() must be - * %GFP_ATOMIC. - */ - -struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask) -{ - struct sk_buff *n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); - - if (!n) - return NULL; - -#define C(x) n->x = skb->x - - n->next = n->prev = NULL; - n->list = NULL; - n->sk = NULL; - C(stamp); - C(dev); - C(real_dev); - C(h); - C(nh); - C(mac); - C(dst); - dst_clone(skb->dst); - C(sp); -#ifdef CONFIG_INET - secpath_get(skb->sp); -#endif - memcpy(n->cb, skb->cb, sizeof(skb->cb)); - C(len); - C(data_len); - C(csum); - C(local_df); - n->cloned = 1; - C(proto_csum_valid); - C(proto_csum_blank); - C(pkt_type); - C(ip_summed); - C(priority); - C(protocol); - C(security); - n->destructor = NULL; -#ifdef CONFIG_NETFILTER - C(nfmark); - C(nfcache); - C(nfct); - nf_conntrack_get(skb->nfct); - C(nfctinfo); -#ifdef CONFIG_NETFILTER_DEBUG - C(nf_debug); -#endif -#ifdef CONFIG_BRIDGE_NETFILTER - C(nf_bridge); - nf_bridge_get(skb->nf_bridge); -#endif -#endif /*CONFIG_NETFILTER*/ -#if defined(CONFIG_HIPPI) - C(private); -#endif -#ifdef CONFIG_NET_SCHED - C(tc_index); -#ifdef CONFIG_NET_CLS_ACT - n->tc_verd = SET_TC_VERD(skb->tc_verd,0); - n->tc_verd = CLR_TC_OK2MUNGE(skb->tc_verd); - n->tc_verd = CLR_TC_MUNGED(skb->tc_verd); - C(input_dev); - C(tc_classid); -#endif - -#endif - C(truesize); - atomic_set(&n->users, 1); - C(head); - C(data); - C(tail); - C(end); - - atomic_inc(&(skb_shinfo(skb)->dataref)); - skb->cloned = 1; - - return n; -} - -static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) -{ - /* - * Shift between the two data areas in bytes - */ - unsigned long offset = new->data - old->data; - - new->list = NULL; - new->sk = NULL; - new->dev = old->dev; - new->real_dev = old->real_dev; - new->priority = old->priority; - new->protocol = old->protocol; - new->dst = dst_clone(old->dst); -#ifdef CONFIG_INET - new->sp = secpath_get(old->sp); -#endif - new->h.raw = old->h.raw + offset; - new->nh.raw = old->nh.raw + offset; - new->mac.raw = old->mac.raw + offset; - memcpy(new->cb, old->cb, sizeof(old->cb)); - new->local_df = old->local_df; - new->pkt_type = old->pkt_type; - new->stamp = old->stamp; - new->destructor = NULL; - new->security = old->security; -#ifdef CONFIG_NETFILTER - new->nfmark = old->nfmark; - new->nfcache = old->nfcache; - new->nfct = old->nfct; 
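The C(x) macro in skb_clone() above is nothing more exotic than field-by-field assignment with less typing, which keeps the long clone function easy to audit against the struct definition. In miniature:

#include <stdio.h>

struct pkt { int len; int proto; int priority; };

static void clone_fields(struct pkt *n, const struct pkt *skb)
{
#define C(x) n->x = skb->x
	C(len);
	C(proto);
	C(priority);
#undef C
}

int main(void)
{
	struct pkt a = { 100, 0x0800, 6 }, b;

	clone_fields(&b, &a);
	printf("%d %#x %d\n", b.len, b.proto, b.priority);
	return 0;
}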
- nf_conntrack_get(old->nfct); - new->nfctinfo = old->nfctinfo; -#ifdef CONFIG_NETFILTER_DEBUG - new->nf_debug = old->nf_debug; -#endif -#ifdef CONFIG_BRIDGE_NETFILTER - new->nf_bridge = old->nf_bridge; - nf_bridge_get(old->nf_bridge); -#endif -#endif -#ifdef CONFIG_NET_SCHED -#ifdef CONFIG_NET_CLS_ACT - new->tc_verd = old->tc_verd; -#endif - new->tc_index = old->tc_index; -#endif - atomic_set(&new->users, 1); - skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size; - skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs; -} - -/** - * skb_copy - create private copy of an sk_buff - * @skb: buffer to copy - * @gfp_mask: allocation priority - * - * Make a copy of both an &sk_buff and its data. This is used when the - * caller wishes to modify the data and needs a private copy of the - * data to alter. Returns %NULL on failure or the pointer to the buffer - * on success. The returned buffer has a reference count of 1. - * - * As by-product this function converts non-linear &sk_buff to linear - * one, so that &sk_buff becomes completely private and caller is allowed - * to modify all the data of returned buffer. This means that this - * function is not recommended for use in circumstances when only - * header is going to be modified. Use pskb_copy() instead. - */ - -struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask) -{ - int headerlen = skb->data - skb->head; - /* - * Allocate the copy buffer - */ - struct sk_buff *n = alloc_skb(skb->end - skb->head + skb->data_len, - gfp_mask); - if (!n) - return NULL; - - /* Set the data pointer */ - skb_reserve(n, headerlen); - /* Set the tail pointer and length */ - skb_put(n, skb->len); - n->csum = skb->csum; - n->ip_summed = skb->ip_summed; - - if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)) - BUG(); - - copy_skb_header(n, skb); - return n; -} - - -/** - * pskb_copy - create copy of an sk_buff with private head. - * @skb: buffer to copy - * @gfp_mask: allocation priority - * - * Make a copy of both an &sk_buff and part of its data, located - * in header. Fragmented data remain shared. This is used when - * the caller wishes to modify only header of &sk_buff and needs - * private copy of the header to alter. Returns %NULL on failure - * or the pointer to the buffer on success. - * The returned buffer has a reference count of 1. - */ - -struct sk_buff *pskb_copy(struct sk_buff *skb, int gfp_mask) -{ - /* - * Allocate the copy buffer - */ - struct sk_buff *n = alloc_skb(skb->end - skb->head, gfp_mask); - - if (!n) - goto out; - - /* Set the data pointer */ - skb_reserve(n, skb->data - skb->head); - /* Set the tail pointer and length */ - skb_put(n, skb_headlen(skb)); - /* Copy the bytes */ - memcpy(n->data, skb->data, n->len); - n->csum = skb->csum; - n->ip_summed = skb->ip_summed; - - n->data_len = skb->data_len; - n->len = skb->len; - - if (skb_shinfo(skb)->nr_frags) { - int i; - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; - get_page(skb_shinfo(n)->frags[i].page); - } - skb_shinfo(n)->nr_frags = i; - } - - if (skb_shinfo(skb)->frag_list) { - skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; - skb_clone_fraglist(n); - } - - copy_skb_header(n, skb); -out: - return n; -} - -/** - * pskb_expand_head - reallocate header of &sk_buff - * @skb: buffer to reallocate - * @nhead: room to add at head - * @ntail: room to add at tail - * @gfp_mask: allocation priority - * - * Expands (or creates identical copy, if &nhead and &ntail are zero) - * header of skb. 
&sk_buff itself is not changed. &sk_buff MUST have - * reference count of 1. Returns zero in the case of success or error, - * if expansion failed. In the last case, &sk_buff is not changed. - * - * All the pointers pointing into skb header may change and must be - * reloaded after call to this function. - */ - -int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, int gfp_mask) -{ - int i; - u8 *data; - int size = nhead + (skb->end - skb->head) + ntail; - long off; - - if (skb_shared(skb)) - BUG(); - - size = SKB_DATA_ALIGN(size); - - data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); - if (!data) - goto nodata; - - /* Copy only real data... and, alas, header. This should be - * optimized for the cases when header is void. */ - memcpy(data + nhead, skb->head, skb->tail - skb->head); - memcpy(data + size, skb->end, sizeof(struct skb_shared_info)); - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - get_page(skb_shinfo(skb)->frags[i].page); - - if (skb_shinfo(skb)->frag_list) - skb_clone_fraglist(skb); - - skb_release_data(skb); - - off = (data + nhead) - skb->head; - - skb->head = data; - skb->end = data + size; - skb->data += off; - skb->tail += off; - skb->mac.raw += off; - skb->h.raw += off; - skb->nh.raw += off; - skb->cloned = 0; - atomic_set(&skb_shinfo(skb)->dataref, 1); - return 0; - -nodata: - return -ENOMEM; -} - -/* Make private copy of skb with writable head and some headroom */ - -struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) -{ - struct sk_buff *skb2; - int delta = headroom - skb_headroom(skb); - - if (delta <= 0) - skb2 = pskb_copy(skb, GFP_ATOMIC); - else { - skb2 = skb_clone(skb, GFP_ATOMIC); - if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, - GFP_ATOMIC)) { - kfree_skb(skb2); - skb2 = NULL; - } - } - return skb2; -} - - -/** - * skb_copy_expand - copy and expand sk_buff - * @skb: buffer to copy - * @newheadroom: new free bytes at head - * @newtailroom: new free bytes at tail - * @gfp_mask: allocation priority - * - * Make a copy of both an &sk_buff and its data and while doing so - * allocate additional space. - * - * This is used when the caller wishes to modify the data and needs a - * private copy of the data to alter as well as more space for new fields. - * Returns %NULL on failure or the pointer to the buffer - * on success. The returned buffer has a reference count of 1. - * - * You must pass %GFP_ATOMIC as the allocation priority if this function - * is called from an interrupt. - * - * BUG ALERT: ip_summed is not copied. Why does this work? Is it used - * only by netfilter in the cases when checksum is recalculated? --ANK - */ -struct sk_buff *skb_copy_expand(const struct sk_buff *skb, - int newheadroom, int newtailroom, int gfp_mask) -{ - /* - * Allocate the copy buffer - */ - struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom, - gfp_mask); - int head_copy_len, head_copy_off; - - if (!n) - return NULL; - - skb_reserve(n, newheadroom); - - /* Set the tail pointer and length */ - skb_put(n, skb->len); - - head_copy_len = skb_headroom(skb); - head_copy_off = 0; - if (newheadroom <= head_copy_len) - head_copy_len = newheadroom; - else - head_copy_off = newheadroom - head_copy_len; - - /* Copy the linear header and data. 
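The core move in pskb_expand_head() above: copy the payload into a larger block behind the new headroom, compute one offset, and rebase every pointer into the old block by it -- which is exactly why callers must reload cached header pointers afterwards. A reduced sketch under the same simplified buf type as earlier; only data and tail are rebased here, where the kernel also shifts mac/h/nh:

#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct buf { unsigned char *head, *data, *tail, *end; };

static int expand_head(struct buf *b, size_t nhead)
{
	size_t size = nhead + (size_t)(b->end - b->head);
	unsigned char *mem = malloc(size);
	if (!mem)
		return -1;

	/* copy existing contents in behind the new headroom */
	memcpy(mem + nhead, b->head, (size_t)(b->tail - b->head));

	/* one offset rebases every pointer into the old block */
	ptrdiff_t off = (ptrdiff_t)((uintptr_t)(mem + nhead) -
				    (uintptr_t)b->head);
	unsigned char *old = b->head;

	b->data += off;
	b->tail += off;
	b->head  = mem;
	b->end   = mem + size;
	free(old);
	return 0;
}

int main(void)
{
	struct buf b;

	b.head = b.data = b.tail = malloc(32);
	if (!b.head)
		return 1;
	b.end = b.head + 32;
	memcpy(b.data, "hdr", 3);
	b.tail += 3;

	if (expand_head(&b, 16))	/* now 16 bytes of fresh headroom */
		return 1;
	free(b.head);
	return 0;
}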
*/ - if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, - skb->len + head_copy_len)) - BUG(); - - copy_skb_header(n, skb); - - return n; -} - -/** - * skb_pad - zero pad the tail of an skb - * @skb: buffer to pad - * @pad: space to pad - * - * Ensure that a buffer is followed by a padding area that is zero - * filled. Used by network drivers which may DMA or transfer data - * beyond the buffer end onto the wire. - * - * May return NULL in out of memory cases. - */ - -struct sk_buff *skb_pad(struct sk_buff *skb, int pad) -{ - struct sk_buff *nskb; - - /* If the skbuff is non linear tailroom is always zero.. */ - if (skb_tailroom(skb) >= pad) { - memset(skb->data+skb->len, 0, pad); - return skb; - } - - nskb = skb_copy_expand(skb, skb_headroom(skb), skb_tailroom(skb) + pad, GFP_ATOMIC); - kfree_skb(skb); - if (nskb) - memset(nskb->data+nskb->len, 0, pad); - return nskb; -} - -/* Trims skb to length len. It can change skb pointers, if "realloc" is 1. - * If realloc==0 and trimming is impossible without change of data, - * it is BUG(). - */ - -int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc) -{ - int offset = skb_headlen(skb); - int nfrags = skb_shinfo(skb)->nr_frags; - int i; - - for (i = 0; i < nfrags; i++) { - int end = offset + skb_shinfo(skb)->frags[i].size; - if (end > len) { - if (skb_cloned(skb)) { - if (!realloc) - BUG(); - if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) - return -ENOMEM; - } - if (len <= offset) { - put_page(skb_shinfo(skb)->frags[i].page); - skb_shinfo(skb)->nr_frags--; - } else { - skb_shinfo(skb)->frags[i].size = len - offset; - } - } - offset = end; - } - - if (offset < len) { - skb->data_len -= skb->len - len; - skb->len = len; - } else { - if (len <= skb_headlen(skb)) { - skb->len = len; - skb->data_len = 0; - skb->tail = skb->data + len; - if (skb_shinfo(skb)->frag_list && !skb_cloned(skb)) - skb_drop_fraglist(skb); - } else { - skb->data_len -= skb->len - len; - skb->len = len; - } - } - - return 0; -} - -/** - * __pskb_pull_tail - advance tail of skb header - * @skb: buffer to reallocate - * @delta: number of bytes to advance tail - * - * The function makes a sense only on a fragmented &sk_buff, - * it expands header moving its tail forward and copying necessary - * data from fragmented part. - * - * &sk_buff MUST have reference count of 1. - * - * Returns %NULL (and &sk_buff does not change) if pull failed - * or value of new tail of skb in the case of success. - * - * All the pointers pointing into skb header may change and must be - * reloaded after call to this function. - */ - -/* Moves tail of skb head forward, copying data from fragmented part, - * when it is necessary. - * 1. It may fail due to malloc failure. - * 2. It may change skb pointers. - * - * It is pretty complicated. Luckily, it is called only in exceptional cases. - */ -unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) -{ - /* If skb has not enough free space at tail, get new one - * plus 128 bytes for future expansions. If we have enough - * room at tail, reallocate without expansion only if skb is cloned. - */ - int i, k, eat = (skb->tail + delta) - skb->end; - - if (eat > 0 || skb_cloned(skb)) { - if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, - GFP_ATOMIC)) - return NULL; - } - - if (skb_copy_bits(skb, skb_headlen(skb), skb->tail, delta)) - BUG(); - - /* Optimization: no fragments, no reasons to preestimate - * size of pulled pages. Superb. 
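___pskb_trim()'s fragment walk above reduces to: fragments lying wholly past the new length are released, and the one straddling it is shrunk in place. A sketch over a plain array; the kernel additionally put_page()s dropped fragments and handles the linear-head and frag_list cases shown above:

#include <stdio.h>

struct frag { int size; };

static int trim_frags(struct frag *frags, int nfrags, int offset, int len)
{
	int kept = nfrags;

	/* offset = bytes already accounted for in the linear head */
	for (int i = 0; i < nfrags; i++) {
		int end = offset + frags[i].size;

		if (end > len) {
			if (len <= offset) {
				frags[i].size = 0;	/* dropped entirely */
				kept--;
			} else {
				frags[i].size = len - offset;	/* straddler */
			}
		}
		offset = end;
	}
	return kept;
}

int main(void)
{
	struct frag f[] = { { 100 }, { 100 }, { 100 } };

	/* 50 bytes in the linear head; trim the whole buffer to 180 */
	int kept = trim_frags(f, 3, 50, 180);
	printf("kept=%d sizes=%d,%d,%d\n", kept,
	       f[0].size, f[1].size, f[2].size);	/* kept=2 100,30,0 */
	return 0;
}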
- */ - if (!skb_shinfo(skb)->frag_list) - goto pull_pages; - - /* Estimate size of pulled pages. */ - eat = delta; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - if (skb_shinfo(skb)->frags[i].size >= eat) - goto pull_pages; - eat -= skb_shinfo(skb)->frags[i].size; - } - - /* If we need update frag list, we are in troubles. - * Certainly, it possible to add an offset to skb data, - * but taking into account that pulling is expected to - * be very rare operation, it is worth to fight against - * further bloating skb head and crucify ourselves here instead. - * Pure masohism, indeed. 8)8) - */ - if (eat) { - struct sk_buff *list = skb_shinfo(skb)->frag_list; - struct sk_buff *clone = NULL; - struct sk_buff *insp = NULL; - - do { - if (!list) - BUG(); - - if (list->len <= eat) { - /* Eaten as whole. */ - eat -= list->len; - list = list->next; - insp = list; - } else { - /* Eaten partially. */ - - if (skb_shared(list)) { - /* Sucks! We need to fork list. :-( */ - clone = skb_clone(list, GFP_ATOMIC); - if (!clone) - return NULL; - insp = list->next; - list = clone; - } else { - /* This may be pulled without - * problems. */ - insp = list; - } - if (!pskb_pull(list, eat)) { - if (clone) - kfree_skb(clone); - return NULL; - } - break; - } - } while (eat); - - /* Free pulled out fragments. */ - while ((list = skb_shinfo(skb)->frag_list) != insp) { - skb_shinfo(skb)->frag_list = list->next; - kfree_skb(list); - } - /* And insert new clone at head. */ - if (clone) { - clone->next = list; - skb_shinfo(skb)->frag_list = clone; - } - } - /* Success! Now we may commit changes to skb data. */ - -pull_pages: - eat = delta; - k = 0; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - if (skb_shinfo(skb)->frags[i].size <= eat) { - put_page(skb_shinfo(skb)->frags[i].page); - eat -= skb_shinfo(skb)->frags[i].size; - } else { - skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; - if (eat) { - skb_shinfo(skb)->frags[k].page_offset += eat; - skb_shinfo(skb)->frags[k].size -= eat; - eat = 0; - } - k++; - } - } - skb_shinfo(skb)->nr_frags = k; - - skb->tail += delta; - skb->data_len -= delta; - - return skb->tail; -} - -/* Copy some data bits from skb to kernel buffer. */ - -int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) -{ - int i, copy; - int start = skb_headlen(skb); - - if (offset > (int)skb->len - len) - goto fault; - - /* Copy header. 
*/ - if ((copy = start - offset) > 0) { - if (copy > len) - copy = len; - memcpy(to, skb->data + offset, copy); - if ((len -= copy) == 0) - return 0; - offset += copy; - to += copy; - } - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - - BUG_TRAP(start <= offset + len); - - end = start + skb_shinfo(skb)->frags[i].size; - if ((copy = end - offset) > 0) { - u8 *vaddr; - - if (copy > len) - copy = len; - - vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]); - memcpy(to, - vaddr + skb_shinfo(skb)->frags[i].page_offset+ - offset - start, copy); - kunmap_skb_frag(vaddr); - - if ((len -= copy) == 0) - return 0; - offset += copy; - to += copy; - } - start = end; - } - - if (skb_shinfo(skb)->frag_list) { - struct sk_buff *list = skb_shinfo(skb)->frag_list; - - for (; list; list = list->next) { - int end; - - BUG_TRAP(start <= offset + len); - - end = start + list->len; - if ((copy = end - offset) > 0) { - if (copy > len) - copy = len; - if (skb_copy_bits(list, offset - start, - to, copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - to += copy; - } - start = end; - } - } - if (!len) - return 0; - -fault: - return -EFAULT; -} - -/* Keep iterating until skb_iter_next returns false. */ -void skb_iter_first(const struct sk_buff *skb, struct skb_iter *i) -{ - i->len = skb_headlen(skb); - i->data = (unsigned char *)skb->data; - i->nextfrag = 0; - i->fraglist = NULL; -} - -int skb_iter_next(const struct sk_buff *skb, struct skb_iter *i) -{ - /* Unmap previous, if not head fragment. */ - if (i->nextfrag) - kunmap_skb_frag(i->data); - - if (i->fraglist) { - fraglist: - /* We're iterating through fraglist. */ - if (i->nextfrag < skb_shinfo(i->fraglist)->nr_frags) { - i->data = kmap_skb_frag(&skb_shinfo(i->fraglist) - ->frags[i->nextfrag]); - i->len = skb_shinfo(i->fraglist)->frags[i->nextfrag] - .size; - i->nextfrag++; - return 1; - } - /* Fragments with fragments? Too hard! */ - BUG_ON(skb_shinfo(i->fraglist)->frag_list); - i->fraglist = i->fraglist->next; - if (!i->fraglist) - goto end; - - i->len = skb_headlen(i->fraglist); - i->data = i->fraglist->data; - i->nextfrag = 0; - return 1; - } - - if (i->nextfrag < skb_shinfo(skb)->nr_frags) { - i->data = kmap_skb_frag(&skb_shinfo(skb)->frags[i->nextfrag]); - i->len = skb_shinfo(skb)->frags[i->nextfrag].size; - i->nextfrag++; - return 1; - } - - i->fraglist = skb_shinfo(skb)->frag_list; - if (i->fraglist) - goto fraglist; - -end: - /* Bug trap for callers */ - i->data = NULL; - return 0; -} - -void skb_iter_abort(const struct sk_buff *skb, struct skb_iter *i) -{ - /* Unmap previous, if not head fragment. */ - if (i->data && i->nextfrag) - kunmap_skb_frag(i->data); - /* Bug trap for callers */ - i->data = NULL; -} - -/* Checksum skb data. */ - -unsigned int skb_checksum(const struct sk_buff *skb, int offset, - int len, unsigned int csum) -{ - int start = skb_headlen(skb); - int i, copy = start - offset; - int pos = 0; - - /* Checksum header. 
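The skb_iter_first()/skb_iter_next() pair above gives callers one flat stream of (data, len) chunks -- the linear head first, then each fragment -- with data NULLed at the end as a bug trap. A user-space reduction that keeps that contract; the kmap/kunmap handling and fraglist recursion are omitted:

#include <stdio.h>
#include <stddef.h>

struct chunk { const unsigned char *data; size_t len; };

struct iter { const struct chunk *chunks; int n; int next; };

static void iter_first(struct iter *it, const struct chunk *chunks, int n,
		       struct chunk *out)
{
	it->chunks = chunks;
	it->n = n;
	it->next = 1;
	*out = chunks[0];	/* the linear head comes first */
}

static int iter_next(struct iter *it, struct chunk *out)
{
	if (it->next >= it->n) {
		out->data = NULL;	/* bug trap, as in skb_iter_next() */
		return 0;
	}
	*out = it->chunks[it->next++];
	return 1;
}

int main(void)
{
	struct chunk parts[] = {
		{ (const unsigned char *)"head", 4 },
		{ (const unsigned char *)"frag", 4 },
	};
	struct chunk c;
	struct iter it;

	iter_first(&it, parts, 2, &c);
	do
		printf("%.*s\n", (int)c.len, c.data);
	while (iter_next(&it, &c));
	return 0;
}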
*/ - if (copy > 0) { - if (copy > len) - copy = len; - csum = csum_partial(skb->data + offset, copy, csum); - if ((len -= copy) == 0) - return csum; - offset += copy; - pos = copy; - } - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - - BUG_TRAP(start <= offset + len); - - end = start + skb_shinfo(skb)->frags[i].size; - if ((copy = end - offset) > 0) { - unsigned int csum2; - u8 *vaddr; - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - if (copy > len) - copy = len; - vaddr = kmap_skb_frag(frag); - csum2 = csum_partial(vaddr + frag->page_offset + - offset - start, copy, 0); - kunmap_skb_frag(vaddr); - csum = csum_block_add(csum, csum2, pos); - if (!(len -= copy)) - return csum; - offset += copy; - pos += copy; - } - start = end; - } - - if (skb_shinfo(skb)->frag_list) { - struct sk_buff *list = skb_shinfo(skb)->frag_list; - - for (; list; list = list->next) { - int end; - - BUG_TRAP(start <= offset + len); - - end = start + list->len; - if ((copy = end - offset) > 0) { - unsigned int csum2; - if (copy > len) - copy = len; - csum2 = skb_checksum(list, offset - start, - copy, 0); - csum = csum_block_add(csum, csum2, pos); - if ((len -= copy) == 0) - return csum; - offset += copy; - pos += copy; - } - start = end; - } - } - if (len) - BUG(); - - return csum; -} - -/* Both of above in one bottle. */ - -unsigned int skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, - u8 *to, int len, unsigned int csum) -{ - int start = skb_headlen(skb); - int i, copy = start - offset; - int pos = 0; - - /* Copy header. */ - if (copy > 0) { - if (copy > len) - copy = len; - csum = csum_partial_copy_nocheck(skb->data + offset, to, - copy, csum); - if ((len -= copy) == 0) - return csum; - offset += copy; - to += copy; - pos = copy; - } - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - - BUG_TRAP(start <= offset + len); - - end = start + skb_shinfo(skb)->frags[i].size; - if ((copy = end - offset) > 0) { - unsigned int csum2; - u8 *vaddr; - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - if (copy > len) - copy = len; - vaddr = kmap_skb_frag(frag); - csum2 = csum_partial_copy_nocheck(vaddr + - frag->page_offset + - offset - start, to, - copy, 0); - kunmap_skb_frag(vaddr); - csum = csum_block_add(csum, csum2, pos); - if (!(len -= copy)) - return csum; - offset += copy; - to += copy; - pos += copy; - } - start = end; - } - - if (skb_shinfo(skb)->frag_list) { - struct sk_buff *list = skb_shinfo(skb)->frag_list; - - for (; list; list = list->next) { - unsigned int csum2; - int end; - - BUG_TRAP(start <= offset + len); - - end = start + list->len; - if ((copy = end - offset) > 0) { - if (copy > len) - copy = len; - csum2 = skb_copy_and_csum_bits(list, - offset - start, - to, copy, 0); - csum = csum_block_add(csum, csum2, pos); - if ((len -= copy) == 0) - return csum; - offset += copy; - to += copy; - pos += copy; - } - start = end; - } - } - if (len) - BUG(); - return csum; -} - -void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) -{ - unsigned int csum; - long csstart; - - if (skb->ip_summed == CHECKSUM_HW) - csstart = skb->h.raw - skb->data; - else - csstart = skb_headlen(skb); - - if (csstart > skb_headlen(skb)) - BUG(); - - memcpy(to, skb->data, csstart); - - csum = 0; - if (csstart != skb->len) - csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, - skb->len - csstart, 0); - - if (skb->ip_summed == CHECKSUM_HW) { - long csstuff = csstart + skb->csum; - - *((unsigned short *)(to + csstuff)) = csum_fold(csum); - } -} - -/** - * skb_dequeue - remove from 
the head of the queue - * @list: list to dequeue from - * - * Remove the head of the list. The list lock is taken so the function - * may be used safely with other locking list functions. The head item is - * returned or %NULL if the list is empty. - */ - -struct sk_buff *skb_dequeue(struct sk_buff_head *list) -{ - unsigned long flags; - struct sk_buff *result; - - spin_lock_irqsave(&list->lock, flags); - result = __skb_dequeue(list); - spin_unlock_irqrestore(&list->lock, flags); - return result; -} - -/** - * skb_dequeue_tail - remove from the tail of the queue - * @list: list to dequeue from - * - * Remove the tail of the list. The list lock is taken so the function - * may be used safely with other locking list functions. The tail item is - * returned or %NULL if the list is empty. - */ -struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) -{ - unsigned long flags; - struct sk_buff *result; - - spin_lock_irqsave(&list->lock, flags); - result = __skb_dequeue_tail(list); - spin_unlock_irqrestore(&list->lock, flags); - return result; -} - -/** - * skb_queue_purge - empty a list - * @list: list to empty - * - * Delete all buffers on an &sk_buff list. Each buffer is removed from - * the list and one reference dropped. This function takes the list - * lock and is atomic with respect to other list locking functions. - */ -void skb_queue_purge(struct sk_buff_head *list) -{ - struct sk_buff *skb; - while ((skb = skb_dequeue(list)) != NULL) - kfree_skb(skb); -} - -/** - * skb_queue_head - queue a buffer at the list head - * @list: list to use - * @newsk: buffer to queue - * - * Queue a buffer at the start of the list. This function takes the - * list lock and can be used safely with other locking &sk_buff functions - * safely. - * - * A buffer cannot be placed on two lists at the same time. - */ -void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) -{ - unsigned long flags; - - spin_lock_irqsave(&list->lock, flags); - __skb_queue_head(list, newsk); - spin_unlock_irqrestore(&list->lock, flags); -} - -/** - * skb_queue_tail - queue a buffer at the list tail - * @list: list to use - * @newsk: buffer to queue - * - * Queue a buffer at the tail of the list. This function takes the - * list lock and can be used safely with other locking &sk_buff functions - * safely. - * - * A buffer cannot be placed on two lists at the same time. - */ -void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) -{ - unsigned long flags; - - spin_lock_irqsave(&list->lock, flags); - __skb_queue_tail(list, newsk); - spin_unlock_irqrestore(&list->lock, flags); -} -/** - * skb_unlink - remove a buffer from a list - * @skb: buffer to remove - * - * Place a packet after a given packet in a list. The list locks are taken - * and this function is atomic with respect to other list locked calls - * - * Works even without knowing the list it is sitting on, which can be - * handy at times. It also means that THE LIST MUST EXIST when you - * unlink. Thus a list must have its contents unlinked before it is - * destroyed. - */ -void skb_unlink(struct sk_buff *skb) -{ - struct sk_buff_head *list = skb->list; - - if (list) { - unsigned long flags; - - spin_lock_irqsave(&list->lock, flags); - if (skb->list == list) - __skb_unlink(skb, skb->list); - spin_unlock_irqrestore(&list->lock, flags); - } -} - - -/** - * skb_append - append a buffer - * @old: buffer to insert after - * @newsk: buffer to insert - * - * Place a packet after a given packet in a list. 
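All the queue helpers above share one construction: a lockless double-underscore primitive plus a public wrapper that brackets it with the list lock, so the same body can also serve callers that already hold the lock. Sketched below with a pthread mutex standing in for spin_lock_irqsave(); types are illustrative:

#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

struct q_node { struct q_node *next; int val; };
struct q_head { struct q_node *first; pthread_mutex_t lock; };

/* Lockless variant: caller must already hold q->lock. */
static struct q_node *__dequeue(struct q_head *q)
{
	struct q_node *n = q->first;

	if (n)
		q->first = n->next;
	return n;
}

/* Public variant: the lockless helper bracketed by the list lock. */
static struct q_node *dequeue(struct q_head *q)
{
	pthread_mutex_lock(&q->lock);
	struct q_node *n = __dequeue(q);
	pthread_mutex_unlock(&q->lock);
	return n;
}

int main(void)
{
	struct q_node n2 = { NULL, 2 }, n1 = { &n2, 1 };
	struct q_head q = { &n1, PTHREAD_MUTEX_INITIALIZER };
	struct q_node *n;

	while ((n = dequeue(&q)))
		printf("%d\n", n->val);
	return 0;
}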
The list locks are taken - * and this function is atomic with respect to other list locked calls. - * A buffer cannot be placed on two lists at the same time. - */ - -void skb_append(struct sk_buff *old, struct sk_buff *newsk) -{ - unsigned long flags; - - spin_lock_irqsave(&old->list->lock, flags); - __skb_append(old, newsk); - spin_unlock_irqrestore(&old->list->lock, flags); -} - - -/** - * skb_insert - insert a buffer - * @old: buffer to insert before - * @newsk: buffer to insert - * - * Place a packet before a given packet in a list. The list locks are taken - * and this function is atomic with respect to other list locked calls - * A buffer cannot be placed on two lists at the same time. - */ - -void skb_insert(struct sk_buff *old, struct sk_buff *newsk) -{ - unsigned long flags; - - spin_lock_irqsave(&old->list->lock, flags); - __skb_insert(newsk, old->prev, old, old->list); - spin_unlock_irqrestore(&old->list->lock, flags); -} - -#if 0 -/* - * Tune the memory allocator for a new MTU size. - */ -void skb_add_mtu(int mtu) -{ - /* Must match allocation in alloc_skb */ - mtu = SKB_DATA_ALIGN(mtu) + sizeof(struct skb_shared_info); - - kmem_add_cache_size(mtu); -} -#endif - -static inline void skb_split_inside_header(struct sk_buff *skb, - struct sk_buff* skb1, - const u32 len, const int pos) -{ - int i; - - memcpy(skb_put(skb1, pos - len), skb->data + len, pos - len); - - /* And move data appendix as is. */ - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; - - skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; - skb_shinfo(skb)->nr_frags = 0; - skb1->data_len = skb->data_len; - skb1->len += skb1->data_len; - skb->data_len = 0; - skb->len = len; - skb->tail = skb->data + len; -} - -static inline void skb_split_no_header(struct sk_buff *skb, - struct sk_buff* skb1, - const u32 len, int pos) -{ - int i, k = 0; - const int nfrags = skb_shinfo(skb)->nr_frags; - - skb_shinfo(skb)->nr_frags = 0; - skb1->len = skb1->data_len = skb->len - len; - skb->len = len; - skb->data_len = len - pos; - - for (i = 0; i < nfrags; i++) { - int size = skb_shinfo(skb)->frags[i].size; - - if (pos + size > len) { - skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; - - if (pos < len) { - /* Split frag. - * We have to variants in this case: - * 1. Move all the frag to the second - * part, if it is possible. F.e. - * this approach is mandatory for TUX, - * where splitting is expensive. - * 2. Split is accurately. We make this. - */ - get_page(skb_shinfo(skb)->frags[i].page); - skb_shinfo(skb1)->frags[0].page_offset += len - pos; - skb_shinfo(skb1)->frags[0].size -= len - pos; - skb_shinfo(skb)->frags[i].size = len - pos; - skb_shinfo(skb)->nr_frags++; - } - k++; - } else - skb_shinfo(skb)->nr_frags++; - pos += size; - } - skb_shinfo(skb1)->nr_frags = k; -} - -/** - * skb_split - Split fragmented skb to two parts at length len. - */ -void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) -{ - int pos = skb_headlen(skb); - - if (len < pos) /* Split line is inside header. */ - skb_split_inside_header(skb, skb1, len, pos); - else /* Second chunk has no header, nothing to copy. 
*/ - skb_split_no_header(skb, skb1, len, pos); -} - -void __init skb_init(void) -{ - skbuff_head_cache = kmem_cache_create("skbuff_head_cache", - sizeof(struct sk_buff), - 0, - SLAB_HWCACHE_ALIGN, - NULL, NULL); - if (!skbuff_head_cache) - panic("cannot create skbuff cache"); -} - -EXPORT_SYMBOL(___pskb_trim); -EXPORT_SYMBOL(__kfree_skb); -EXPORT_SYMBOL(__pskb_pull_tail); -EXPORT_SYMBOL(alloc_skb); -EXPORT_SYMBOL(pskb_copy); -EXPORT_SYMBOL(pskb_expand_head); -EXPORT_SYMBOL(skb_checksum); -EXPORT_SYMBOL(skb_clone); -EXPORT_SYMBOL(skb_clone_fraglist); -EXPORT_SYMBOL(skb_copy); -EXPORT_SYMBOL(skb_copy_and_csum_bits); -EXPORT_SYMBOL(skb_copy_and_csum_dev); -EXPORT_SYMBOL(skb_copy_bits); -EXPORT_SYMBOL(skb_copy_expand); -EXPORT_SYMBOL(skb_over_panic); -EXPORT_SYMBOL(skb_pad); -EXPORT_SYMBOL(skb_realloc_headroom); -EXPORT_SYMBOL(skb_under_panic); -EXPORT_SYMBOL(skb_dequeue); -EXPORT_SYMBOL(skb_dequeue_tail); -EXPORT_SYMBOL(skb_insert); -EXPORT_SYMBOL(skb_queue_purge); -EXPORT_SYMBOL(skb_queue_head); -EXPORT_SYMBOL(skb_queue_tail); -EXPORT_SYMBOL(skb_unlink); -EXPORT_SYMBOL(skb_append); -EXPORT_SYMBOL(skb_split); -EXPORT_SYMBOL(skb_iter_first); -EXPORT_SYMBOL(skb_iter_next); -EXPORT_SYMBOL(skb_iter_abort); diff -r d75a502b45eb -r 43e28a2f6037 patches/linux-2.6.11/agpgart.patch --- a/patches/linux-2.6.11/agpgart.patch Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,437 +0,0 @@ ---- linux-2.6.11/drivers/char/agp/agp.h 2005-03-02 07:38:07 +00:00 -+++ linux-2.6.11-agp/drivers/char/agp/agp.h 2005-03-22 11:14:02 +00:00 -@@ -272,6 +272,8 @@ - #define AGP_GENERIC_SIZES_ENTRIES 11 - extern struct aper_size_info_16 agp3_generic_sizes[]; - -+#define virt_to_gart(x) (phys_to_gart(virt_to_phys(x))) -+#define gart_to_virt(x) (phys_to_virt(gart_to_phys(x))) - - extern int agp_off; - extern int agp_try_unsupported_boot; ---- linux-2.6.11/drivers/char/agp/ali-agp.c 2005-03-02 07:38:13 +00:00 -+++ linux-2.6.11-agp/drivers/char/agp/ali-agp.c 2005-03-22 11:14:56 +00:00 -@@ -150,7 +150,7 @@ - pci_read_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, &temp); - pci_write_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, - (((temp & ALI_CACHE_FLUSH_ADDR_MASK) | -- virt_to_phys(addr)) | ALI_CACHE_FLUSH_EN )); -+ virt_to_gart(addr)) | ALI_CACHE_FLUSH_EN )); - return addr; - } - -@@ -174,7 +174,7 @@ - pci_read_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, &temp); - pci_write_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, - (((temp & ALI_CACHE_FLUSH_ADDR_MASK) | -- virt_to_phys(addr)) | ALI_CACHE_FLUSH_EN)); -+ virt_to_gart(addr)) | ALI_CACHE_FLUSH_EN)); - agp_generic_destroy_page(addr); - } - ---- linux-2.6.11/drivers/char/agp/amd-k7-agp.c 2005-03-02 07:38:33 +00:00 -+++ linux-2.6.11-agp/drivers/char/agp/amd-k7-agp.c 2005-03-22 11:14:56 +00:00 -@@ -43,7 +43,7 @@ - - SetPageReserved(virt_to_page(page_map->real)); - global_cache_flush(); -- page_map->remapped = ioremap_nocache(virt_to_phys(page_map->real), -+ page_map->remapped = ioremap_nocache(virt_to_gart(page_map->real), - PAGE_SIZE); - if (page_map->remapped == NULL) { - ClearPageReserved(virt_to_page(page_map->real)); -@@ -154,7 +154,7 @@ - - agp_bridge->gatt_table_real = (u32 *)page_dir.real; - agp_bridge->gatt_table = (u32 __iomem *)page_dir.remapped; -- agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real); -+ agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real); - - /* Get the address for the gart region. 
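The theme of the agpgart patch that begins above: drivers stop calling virt_to_phys()/phys_to_virt() directly and go through virt_to_gart()/gart_to_virt(), a per-architecture seam that is the identity on native hardware (see the asm-*/agp.h hunks later in this patch) but lets a port such as Xen substitute machine addresses. A compile-and-run illustration of the seam with a stubbed translation; demo_virt_to_phys is invented for the sketch:

#include <stdio.h>
#include <stdint.h>

/* Stand-in for the kernel's virt_to_phys(); identity for the demo. */
static uintptr_t demo_virt_to_phys(void *v) { return (uintptr_t)v; }

/* Native architectures define the translation as the identity, exactly
 * as the new per-arch headers do; a hypervisor port can redefine these
 * two macros to hand back machine addresses instead. */
#define phys_to_gart(x)	(x)
#define gart_to_phys(x)	(x)
#define virt_to_gart(v)	(phys_to_gart(demo_virt_to_phys(v)))

int main(void)
{
	int x;

	printf("gart address: %#lx\n", (unsigned long)virt_to_gart(&x));
	return 0;
}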
- * This is a bus address even on the alpha, b/c its -@@ -167,7 +167,7 @@ - - /* Calculate the agp offset */ - for (i = 0; i < value->num_entries / 1024; i++, addr += 0x00400000) { -- writel(virt_to_phys(amd_irongate_private.gatt_pages[i]->real) | 1, -+ writel(virt_to_gart(amd_irongate_private.gatt_pages[i]->real) | 1, - page_dir.remapped+GET_PAGE_DIR_OFF(addr)); - readl(page_dir.remapped+GET_PAGE_DIR_OFF(addr)); /* PCI Posting. */ - } ---- linux-2.6.11/drivers/char/agp/amd64-agp.c 2005-03-02 07:38:13 +00:00 -+++ linux-2.6.11-agp/drivers/char/agp/amd64-agp.c 2005-03-22 11:14:56 +00:00 -@@ -218,7 +218,7 @@ - - static int amd_8151_configure(void) - { -- unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real); -+ unsigned long gatt_bus = virt_to_gart(agp_bridge->gatt_table_real); - - /* Configure AGP regs in each x86-64 host bridge. */ - for_each_nb() { -@@ -590,7 +590,7 @@ - { - struct agp_bridge_data *bridge = pci_get_drvdata(pdev); - -- release_mem_region(virt_to_phys(bridge->gatt_table_real), -+ release_mem_region(virt_to_gart(bridge->gatt_table_real), - amd64_aperture_sizes[bridge->aperture_size_idx].size); - agp_remove_bridge(bridge); - agp_put_bridge(bridge); ---- linux-2.6.11/drivers/char/agp/ati-agp.c 2005-03-02 07:38:13 +00:00 -+++ linux-2.6.11-agp/drivers/char/agp/ati-agp.c 2005-03-22 11:14:56 +00:00 -@@ -61,7 +61,7 @@ - - SetPageReserved(virt_to_page(page_map->real)); - err = map_page_into_agp(virt_to_page(page_map->real)); -- page_map->remapped = ioremap_nocache(virt_to_phys(page_map->real), -+ page_map->remapped = ioremap_nocache(virt_to_gart(page_map->real), - PAGE_SIZE); - if (page_map->remapped == NULL || err) { - ClearPageReserved(virt_to_page(page_map->real)); ---- linux-2.6.11/drivers/char/agp/backend.c 2005-03-02 07:38:13 +00:00 -+++ linux-2.6.11-agp/drivers/char/agp/backend.c 2005-03-22 11:14:56 +00:00 -@@ -142,7 +142,7 @@ - return -ENOMEM; - } - -- bridge->scratch_page_real = virt_to_phys(addr); -+ bridge->scratch_page_real = virt_to_gart(addr); - bridge->scratch_page = - bridge->driver->mask_memory(bridge->scratch_page_real, 0); - } -@@ -186,7 +186,7 @@ - err_out: - if (bridge->driver->needs_scratch_page) - bridge->driver->agp_destroy_page( -- phys_to_virt(bridge->scratch_page_real)); -+ gart_to_virt(bridge->scratch_page_real)); - if (got_gatt) - bridge->driver->free_gatt_table(); - if (got_keylist) { -@@ -211,7 +211,7 @@ - if (bridge->driver->agp_destroy_page && - bridge->driver->needs_scratch_page) - bridge->driver->agp_destroy_page( -- phys_to_virt(bridge->scratch_page_real)); -+ gart_to_virt(bridge->scratch_page_real)); - } - - /* XXX Kludge alert: agpgart isn't ready for multiple bridges yet */ ---- linux-2.6.11/drivers/char/agp/efficeon-agp.c 2005-03-02 07:37:30 +00:00 -+++ linux-2.6.11-agp/drivers/char/agp/efficeon-agp.c 2005-03-22 11:15:17 +00:00 -@@ -219,7 +219,7 @@ - - efficeon_private.l1_table[index] = page; - -- value = __pa(page) | pati | present | index; -+ value = virt_to_gart(page) | pati | present | index; - - pci_write_config_dword(agp_bridge->dev, - EFFICEON_ATTPAGE, value); ---- linux-2.6.11/drivers/char/agp/generic.c 2005-03-02 07:37:55 +00:00 -+++ linux-2.6.11-agp/drivers/char/agp/generic.c 2005-03-22 11:17:37 +00:00 -@@ -151,7 +151,7 @@ - } - if (curr->page_count != 0) { - for (i = 0; i < curr->page_count; i++) { -- agp_bridge->driver->agp_destroy_page(phys_to_virt(curr->memory[i])); -+ agp_bridge->driver->agp_destroy_page(gart_to_virt(curr->memory[i])); - } - } - agp_free_key(curr->key); -@@ -204,7 +204,7 @@ - agp_free_memory(new); - 
return NULL; - } -- new->memory[i] = virt_to_phys(addr); -+ new->memory[i] = virt_to_gart(addr); - new->page_count++; - } - -@@ -697,8 +697,7 @@ - break; - } - -- table = (char *) __get_free_pages(GFP_KERNEL, -- page_order); -+ table = alloc_gatt_pages(page_order); - - if (table == NULL) { - i++; -@@ -729,7 +728,7 @@ - size = ((struct aper_size_info_fixed *) temp)->size; - page_order = ((struct aper_size_info_fixed *) temp)->page_order; - num_entries = ((struct aper_size_info_fixed *) temp)->num_entries; -- table = (char *) __get_free_pages(GFP_KERNEL, page_order); -+ table = alloc_gatt_pages(page_order); - } - - if (table == NULL) -@@ -744,7 +743,7 @@ - agp_gatt_table = (void *)table; - - agp_bridge->driver->cache_flush(); -- agp_bridge->gatt_table = ioremap_nocache(virt_to_phys(table), -+ agp_bridge->gatt_table = ioremap_nocache(virt_to_gart(table), - (PAGE_SIZE * (1 << page_order))); - agp_bridge->driver->cache_flush(); - -@@ -752,11 +751,11 @@ - for (page = virt_to_page(table); page <= virt_to_page(table_end); page++) - ClearPageReserved(page); - -- free_pages((unsigned long) table, page_order); -+ free_gatt_pages(table, page_order); - - return -ENOMEM; - } -- agp_bridge->gatt_bus_addr = virt_to_phys(agp_bridge->gatt_table_real); -+ agp_bridge->gatt_bus_addr = virt_to_gart(agp_bridge->gatt_table_real); - - /* AK: bogus, should encode addresses > 4GB */ - for (i = 0; i < num_entries; i++) { -@@ -810,7 +809,7 @@ - for (page = virt_to_page(table); page <= virt_to_page(table_end); page++) - ClearPageReserved(page); - -- free_pages((unsigned long) agp_bridge->gatt_table_real, page_order); -+ free_gatt_pages(agp_bridge->gatt_table_real, page_order); - - agp_gatt_table = NULL; - agp_bridge->gatt_table = NULL; ---- linux-2.6.11/drivers/char/agp/hp-agp.c 2005-03-02 07:38:19 +00:00 -+++ linux-2.6.11-agp/drivers/char/agp/hp-agp.c 2005-03-22 11:14:56 +00:00 -@@ -110,7 +110,7 @@ - hp->gart_size = HP_ZX1_GART_SIZE; - hp->gatt_entries = hp->gart_size / hp->io_page_size; - -- hp->io_pdir = phys_to_virt(readq(hp->ioc_regs+HP_ZX1_PDIR_BASE)); -+ hp->io_pdir = gart_to_virt(readq(hp->ioc_regs+HP_ZX1_PDIR_BASE)); - hp->gatt = &hp->io_pdir[HP_ZX1_IOVA_TO_PDIR(hp->gart_base)]; - - if (hp->gatt[0] != HP_ZX1_SBA_IOMMU_COOKIE) { -@@ -248,7 +248,7 @@ - agp_bridge->mode = readl(hp->lba_regs+hp->lba_cap_offset+PCI_AGP_STATUS); - - if (hp->io_pdir_owner) { -- writel(virt_to_phys(hp->io_pdir), hp->ioc_regs+HP_ZX1_PDIR_BASE); -+ writel(virt_to_gart(hp->io_pdir), hp->ioc_regs+HP_ZX1_PDIR_BASE); - readl(hp->ioc_regs+HP_ZX1_PDIR_BASE); - writel(hp->io_tlb_ps, hp->ioc_regs+HP_ZX1_TCNFG); - readl(hp->ioc_regs+HP_ZX1_TCNFG); ---- linux-2.6.11/drivers/char/agp/i460-agp.c 2005-03-02 07:38:10 +00:00 -+++ linux-2.6.11-agp/drivers/char/agp/i460-agp.c 2005-03-22 11:14:56 +00:00 -@@ -371,7 +371,7 @@ - } - memset(lp->alloced_map, 0, map_size); - -- lp->paddr = virt_to_phys(lpage); -+ lp->paddr = virt_to_gart(lpage); - lp->refcount = 0; - atomic_add(I460_KPAGES_PER_IOPAGE, &agp_bridge->current_memory_agp); - return 0; -@@ -382,7 +382,7 @@ - kfree(lp->alloced_map); - lp->alloced_map = NULL; - -- free_pages((unsigned long) phys_to_virt(lp->paddr), I460_IO_PAGE_SHIFT - PAGE_SHIFT); -+ free_pages((unsigned long) gart_to_virt(lp->paddr), I460_IO_PAGE_SHIFT - PAGE_SHIFT); - atomic_sub(I460_KPAGES_PER_IOPAGE, &agp_bridge->current_memory_agp); - } - ---- linux-2.6.11/drivers/char/agp/intel-agp.c 2005-03-02 07:38:09 +00:00 -+++ linux-2.6.11-agp/drivers/char/agp/intel-agp.c 2005-03-22 11:14:56 +00:00 -@@ -285,7 +285,7 @@ - if (new == NULL) - 
return NULL; - -- new->memory[0] = virt_to_phys(addr); -+ new->memory[0] = virt_to_gart(addr); - if (pg_count == 4) { - /* kludge to get 4 physical pages for ARGB cursor */ - new->memory[1] = new->memory[0] + PAGE_SIZE; -@@ -328,10 +328,10 @@ - agp_free_key(curr->key); - if(curr->type == AGP_PHYS_MEMORY) { - if (curr->page_count == 4) -- i8xx_destroy_pages(phys_to_virt(curr->memory[0])); -+ i8xx_destroy_pages(gart_to_virt(curr->memory[0])); - else - agp_bridge->driver->agp_destroy_page( -- phys_to_virt(curr->memory[0])); -+ gart_to_virt(curr->memory[0])); - vfree(curr->memory); - } - kfree(curr); ---- linux-2.6.11/drivers/char/agp/intel-mch-agp.c 2005-03-02 07:37:48 +00:00 -+++ linux-2.6.11-agp/drivers/char/agp/intel-mch-agp.c 2005-03-22 11:14:56 +00:00 -@@ -51,7 +51,7 @@ - if (new == NULL) - return NULL; - -- new->memory[0] = virt_to_phys(addr); -+ new->memory[0] = virt_to_gart(addr); - new->page_count = 1; - new->num_scratch_pages = 1; - new->type = AGP_PHYS_MEMORY; -@@ -63,7 +63,7 @@ - { - agp_free_key(curr->key); - if(curr->type == AGP_PHYS_MEMORY) { -- agp_bridge->driver->agp_destroy_page(phys_to_virt(curr->memory[0])); -+ agp_bridge->driver->agp_destroy_page(gart_to_virt(curr->memory[0])); - vfree(curr->memory); - } - kfree(curr); ---- linux-2.6.11/drivers/char/agp/sworks-agp.c 2005-03-02 07:38:37 +00:00 -+++ linux-2.6.11-agp/drivers/char/agp/sworks-agp.c 2005-03-22 11:14:56 +00:00 -@@ -51,7 +51,7 @@ - } - SetPageReserved(virt_to_page(page_map->real)); - global_cache_flush(); -- page_map->remapped = ioremap_nocache(virt_to_phys(page_map->real), -+ page_map->remapped = ioremap_nocache(virt_to_gart(page_map->real), - PAGE_SIZE); - if (page_map->remapped == NULL) { - ClearPageReserved(virt_to_page(page_map->real)); -@@ -162,7 +162,7 @@ - /* Create a fake scratch directory */ - for(i = 0; i < 1024; i++) { - writel(agp_bridge->scratch_page, serverworks_private.scratch_dir.remapped+i); -- writel(virt_to_phys(serverworks_private.scratch_dir.real) | 1, page_dir.remapped+i); -+ writel(virt_to_gart(serverworks_private.scratch_dir.real) | 1, page_dir.remapped+i); - } - - retval = serverworks_create_gatt_pages(value->num_entries / 1024); -@@ -174,7 +174,7 @@ - - agp_bridge->gatt_table_real = (u32 *)page_dir.real; - agp_bridge->gatt_table = (u32 __iomem *)page_dir.remapped; -- agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real); -+ agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real); - - /* Get the address for the gart region. 
- * This is a bus address even on the alpha, b/c its -@@ -187,7 +187,7 @@ - /* Calculate the agp offset */ - - for(i = 0; i < value->num_entries / 1024; i++) -- writel(virt_to_phys(serverworks_private.gatt_pages[i]->real)|1, page_dir.remapped+i); -+ writel(virt_to_gart(serverworks_private.gatt_pages[i]->real)|1, page_dir.remapped+i); - - return 0; - } ---- linux-2.6.11/drivers/char/agp/uninorth-agp.c 2005-03-02 07:38:09 +00:00 -+++ linux-2.6.11-agp/drivers/char/agp/uninorth-agp.c 2005-03-22 11:14:56 +00:00 -@@ -200,7 +200,7 @@ - - agp_bridge->gatt_table_real = (u32 *) table; - agp_bridge->gatt_table = (u32 *)table; -- agp_bridge->gatt_bus_addr = virt_to_phys(table); -+ agp_bridge->gatt_bus_addr = virt_to_gart(table); - - for (i = 0; i < num_entries; i++) { - agp_bridge->gatt_table[i] = ---- linux-2.6.11/include/asm-alpha/agp.h 2005-03-02 07:37:39 +00:00 -+++ linux-2.6.11-agp/include/asm-alpha/agp.h 2005-03-22 11:18:34 +00:00 -@@ -10,4 +10,14 @@ - #define flush_agp_mappings() - #define flush_agp_cache() mb() - -+/* Convert a physical address to an address suitable for the GART. */ -+#define phys_to_gart(x) (x) -+#define gart_to_phys(x) (x) -+ -+/* GATT allocation. Returns/accepts GATT kernel virtual address. */ -+#define alloc_gatt_pages(order) \ -+ ((char *)__get_free_pages(GFP_KERNEL, (order))) -+#define free_gatt_pages(table, order) \ -+ free_pages((unsigned long)(table), (order)) -+ - #endif ---- linux-2.6.11/include/asm-i386/agp.h 2005-03-02 07:37:31 +00:00 -+++ linux-2.6.11-agp/include/asm-i386/agp.h 2005-03-22 11:18:39 +00:00 -@@ -21,4 +21,14 @@ - worth it. Would need a page for it. */ - #define flush_agp_cache() asm volatile("wbinvd":::"memory") - -+/* Convert a physical address to an address suitable for the GART. */ -+#define phys_to_gart(x) (x) -+#define gart_to_phys(x) (x) -+ -+/* GATT allocation. Returns/accepts GATT kernel virtual address. */ -+#define alloc_gatt_pages(order) \ -+ ((char *)__get_free_pages(GFP_KERNEL, (order))) -+#define free_gatt_pages(table, order) \ -+ free_pages((unsigned long)(table), (order)) -+ - #endif ---- linux-2.6.11/include/asm-ia64/agp.h 2005-03-02 07:38:09 +00:00 -+++ linux-2.6.11-agp/include/asm-ia64/agp.h 2005-03-22 11:18:45 +00:00 -@@ -18,4 +18,14 @@ - #define flush_agp_mappings() /* nothing */ - #define flush_agp_cache() mb() - -+/* Convert a physical address to an address suitable for the GART. */ -+#define phys_to_gart(x) (x) -+#define gart_to_phys(x) (x) -+ -+/* GATT allocation. Returns/accepts GATT kernel virtual address. */ -+#define alloc_gatt_pages(order) \ -+ ((char *)__get_free_pages(GFP_KERNEL, (order))) -+#define free_gatt_pages(table, order) \ -+ free_pages((unsigned long)(table), (order)) -+ - #endif /* _ASM_IA64_AGP_H */ ---- linux-2.6.11/include/asm-ppc/agp.h 2005-03-02 07:38:08 +00:00 -+++ linux-2.6.11-agp/include/asm-ppc/agp.h 2005-03-22 11:18:52 +00:00 -@@ -10,4 +10,14 @@ - #define flush_agp_mappings() - #define flush_agp_cache() mb() - -+/* Convert a physical address to an address suitable for the GART. */ -+#define phys_to_gart(x) (x) -+#define gart_to_phys(x) (x) -+ -+/* GATT allocation. Returns/accepts GATT kernel virtual address. 
*/ -+#define alloc_gatt_pages(order) \ -+ ((char *)__get_free_pages(GFP_KERNEL, (order))) -+#define free_gatt_pages(table, order) \ -+ free_pages((unsigned long)(table), (order)) -+ - #endif ---- linux-2.6.11/include/asm-sparc64/agp.h 2005-03-02 07:37:48 +00:00 -+++ linux-2.6.11-agp/include/asm-sparc64/agp.h 2005-03-22 11:18:59 +00:00 -@@ -8,4 +8,14 @@ - #define flush_agp_mappings() - #define flush_agp_cache() mb() - -+/* Convert a physical address to an address suitable for the GART. */ -+#define phys_to_gart(x) (x) -+#define gart_to_phys(x) (x) -+ -+/* GATT allocation. Returns/accepts GATT kernel virtual address. */ -+#define alloc_gatt_pages(order) \ -+ ((char *)__get_free_pages(GFP_KERNEL, (order))) -+#define free_gatt_pages(table, order) \ -+ free_pages((unsigned long)(table), (order)) -+ - #endif ---- linux-2.6.11/include/asm-x86_64/agp.h 2005-03-02 07:37:30 +00:00 -+++ linux-2.6.11-agp/include/asm-x86_64/agp.h 2005-03-22 11:18:22 +00:00 -@@ -19,4 +19,14 @@ - worth it. Would need a page for it. */ - #define flush_agp_cache() asm volatile("wbinvd":::"memory") - -+/* Convert a physical address to an address suitable for the GART. */ -+#define phys_to_gart(x) (x) -+#define gart_to_phys(x) (x) -+ -+/* GATT allocation. Returns/accepts GATT kernel virtual address. */ -+#define alloc_gatt_pages(order) \ -+ ((char *)__get_free_pages(GFP_KERNEL, (order))) -+#define free_gatt_pages(table, order) \ -+ free_pages((unsigned long)(table), (order)) -+ - #endif diff -r d75a502b45eb -r 43e28a2f6037 patches/linux-2.6.11/i386-cpu-hotplug-updated-for-mm.patch --- a/patches/linux-2.6.11/i386-cpu-hotplug-updated-for-mm.patch Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,656 +0,0 @@ - -From: Zwane Mwaikambo <zwane@xxxxxxxxxxxxx> - -Find attached the i386 cpu hotplug patch updated for Ingo's latest round of -goodies. In order to avoid dumping cpu hotplug code into kernel/irq/* i -dropped the cpu_online check in do_IRQ() by modifying fixup_irqs(). The -difference being that on cpu offline, fixup_irqs() is called before we -clear the cpu from cpu_online_map and a long delay in order to ensure that -we never have any queued external interrupts on the APICs. Due to my usual -test victims being in boxes a continent away this hasn't been tested, but -i'll cover bug reports (nudge, Nathan! ;) - -1) Add CONFIG_HOTPLUG_CPU -2) disable local APIC timer on dead cpus. -3) Disable preempt around irq balancing to prevent CPUs going down. -4) Print irq stats for all possible cpus. -5) Debugging check for interrupts on offline cpus. -6) Hacky fixup_irqs() to redirect irqs when cpus go off/online. -7) play_dead() for offline cpus to spin inside. -8) Handle offline cpus set in flush_tlb_others(). -9) Grab lock earlier in smp_call_function() to prevent CPUs going down. -10) Implement __cpu_disable() and __cpu_die(). -11) Enable local interrupts in cpu_enable() after fixup_irqs() -12) Don't fiddle with NMI on dead cpu, but leave intact on other cpus. -13) Program IRQ affinity whilst cpu is still in cpu_online_map on offline. - -Signed-off-by: Zwane Mwaikambo <zwane@xxxxxxxxxxxxx> -DESC -ppc64: fix hotplug cpu -EDESC -From: Zwane Mwaikambo <zwane@xxxxxxxxxxx> - -I seem to have broken this when I moved the clearing of the dying cpu to -arch specific code. 
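
The asm-*/agp.h hunks above add identical identity definitions of
phys_to_gart()/gart_to_phys() to every native architecture. The indirection
buys nothing on bare metal; its point is that a port where the address a
GART must be programmed with differs from what Linux calls a physical
address -- the Xen case, where pseudo-physical frames have to be translated
to machine frames -- can supply real conversions, while drivers uniformly
call virt_to_gart()/gart_to_virt(). Those driver-facing helpers are
presumably thin compositions with the usual virt/phys pair; a minimal
sketch of the layering, with pfn_to_mfn() as a placeholder name for a
hypothetical frame translation (not an identifier taken from this patch):

	/* Native build: GART addresses are ordinary physical addresses. */
	#define phys_to_gart(x)	(x)
	#define gart_to_phys(x)	(x)

	/* Driver-facing helpers compose with the virt/phys conversions. */
	#define virt_to_gart(x)	(phys_to_gart(virt_to_phys(x)))
	#define gart_to_virt(x)	(phys_to_virt(gart_to_phys(x)))

	/*
	 * A translated port would instead rewrite the frame number and
	 * preserve the page offset, roughly:
	 *
	 *   #define phys_to_gart(x) \
	 *	((pfn_to_mfn((x) >> PAGE_SHIFT) << PAGE_SHIFT) | \
	 *	 ((x) & ~PAGE_MASK))
	 */

Because every store into a GATT entry or bridge register now funnels
through these macros, the identity versions compile away and the native
build is unchanged.
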
- -Signed-off-by: Zwane Mwaikambo <zwane@xxxxxxxxxxx> -Signed-off-by: Andrew Morton <akpm@xxxxxxxx> ---- - - 25-akpm/arch/i386/Kconfig | 9 ++ - 25-akpm/arch/i386/kernel/apic.c | 3 - 25-akpm/arch/i386/kernel/io_apic.c | 2 - 25-akpm/arch/i386/kernel/irq.c | 66 +++++++++++++++++---- - 25-akpm/arch/i386/kernel/msr.c | 2 - 25-akpm/arch/i386/kernel/process.c | 35 +++++++++++ - 25-akpm/arch/i386/kernel/smp.c | 25 +++++--- - 25-akpm/arch/i386/kernel/smpboot.c | 98 ++++++++++++++++++++++++++++++-- - 25-akpm/arch/i386/kernel/traps.c | 8 ++ - 25-akpm/arch/ia64/kernel/smpboot.c | 3 - 25-akpm/arch/ppc64/kernel/pSeries_smp.c | 5 + - 25-akpm/arch/s390/kernel/smp.c | 4 - - 25-akpm/include/asm-i386/cpu.h | 2 - 25-akpm/include/asm-i386/irq.h | 4 + - 25-akpm/include/asm-i386/smp.h | 3 - 25-akpm/kernel/cpu.c | 14 +--- - arch/ppc64/kernel/smp.c | 0 - 17 files changed, 242 insertions(+), 41 deletions(-) - -diff -puN arch/i386/Kconfig~i386-cpu-hotplug-updated-for-mm arch/i386/Kconfig ---- 25/arch/i386/Kconfig~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/arch/i386/Kconfig 2005-02-23 02:20:06.000000000 -0800 -@@ -1205,6 +1205,15 @@ config SCx200 - This support is also available as a module. If compiled as a - module, it will be called scx200. - -+config HOTPLUG_CPU -+ bool "Support for hot-pluggable CPUs (EXPERIMENTAL)" -+ depends on SMP && HOTPLUG && EXPERIMENTAL -+ ---help--- -+ Say Y here to experiment with turning CPUs off and on. CPUs -+ can be controlled through /sys/devices/system/cpu. -+ -+ Say N. -+ - source "drivers/pcmcia/Kconfig" - - source "drivers/pci/hotplug/Kconfig" -diff -puN arch/i386/kernel/apic.c~i386-cpu-hotplug-updated-for-mm arch/i386/kernel/apic.c ---- 25/arch/i386/kernel/apic.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/arch/i386/kernel/apic.c 2005-02-23 02:20:06.000000000 -0800 -@@ -26,6 +26,7 @@ - #include <linux/mc146818rtc.h> - #include <linux/kernel_stat.h> - #include <linux/sysdev.h> -+#include <linux/cpu.h> - - #include <asm/atomic.h> - #include <asm/smp.h> -@@ -1048,7 +1049,7 @@ void __init setup_secondary_APIC_clock(v - setup_APIC_timer(calibration_result); - } - --void __init disable_APIC_timer(void) -+void __devinit disable_APIC_timer(void) - { - if (using_apic_timer) { - unsigned long v; -diff -puN arch/i386/kernel/io_apic.c~i386-cpu-hotplug-updated-for-mm arch/i386/kernel/io_apic.c ---- 25/arch/i386/kernel/io_apic.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/arch/i386/kernel/io_apic.c 2005-02-23 02:20:06.000000000 -0800 -@@ -576,9 +576,11 @@ static int balanced_irq(void *unused) - try_to_freeze(PF_FREEZE); - if (time_after(jiffies, - prev_balance_time+balanced_irq_interval)) { -+ preempt_disable(); - do_irq_balance(); - prev_balance_time = jiffies; - time_remaining = balanced_irq_interval; -+ preempt_enable(); - } - } - return 0; -diff -puN arch/i386/kernel/irq.c~i386-cpu-hotplug-updated-for-mm arch/i386/kernel/irq.c ---- 25/arch/i386/kernel/irq.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/arch/i386/kernel/irq.c 2005-02-23 02:20:06.000000000 -0800 -@@ -15,6 +15,9 @@ - #include <linux/seq_file.h> - #include <linux/interrupt.h> - #include <linux/kernel_stat.h> -+#include <linux/notifier.h> -+#include <linux/cpu.h> -+#include <linux/delay.h> - - #ifndef CONFIG_X86_LOCAL_APIC - /* -@@ -209,9 +212,8 @@ int show_interrupts(struct seq_file *p, - - if (i == 0) { - seq_printf(p, " "); -- for (j=0; j<NR_CPUS; j++) -- if (cpu_online(j)) -- 
seq_printf(p, "CPU%d ",j); -+ for_each_cpu(j) -+ seq_printf(p, "CPU%d ",j); - seq_putc(p, '\n'); - } - -@@ -224,9 +226,8 @@ int show_interrupts(struct seq_file *p, - #ifndef CONFIG_SMP - seq_printf(p, "%10u ", kstat_irqs(i)); - #else -- for (j = 0; j < NR_CPUS; j++) -- if (cpu_online(j)) -- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); -+ for_each_cpu(j) -+ seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); - #endif - seq_printf(p, " %14s", irq_desc[i].handler->typename); - seq_printf(p, " %s", action->name); -@@ -239,16 +240,13 @@ skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); - } else if (i == NR_IRQS) { - seq_printf(p, "NMI: "); -- for (j = 0; j < NR_CPUS; j++) -- if (cpu_online(j)) -- seq_printf(p, "%10u ", nmi_count(j)); -+ for_each_cpu(j) -+ seq_printf(p, "%10u ", nmi_count(j)); - seq_putc(p, '\n'); - #ifdef CONFIG_X86_LOCAL_APIC - seq_printf(p, "LOC: "); -- for (j = 0; j < NR_CPUS; j++) -- if (cpu_online(j)) -- seq_printf(p, "%10u ", -- irq_stat[j].apic_timer_irqs); -+ for_each_cpu(j) -+ seq_printf(p, "%10u ", irq_stat[j].apic_timer_irqs); - seq_putc(p, '\n'); - #endif - seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); -@@ -258,3 +256,45 @@ skip: - } - return 0; - } -+ -+#ifdef CONFIG_HOTPLUG_CPU -+#include <mach_apic.h> -+ -+void fixup_irqs(cpumask_t map) -+{ -+ unsigned int irq; -+ static int warned; -+ -+ for (irq = 0; irq < NR_IRQS; irq++) { -+ cpumask_t mask; -+ if (irq == 2) -+ continue; -+ -+ cpus_and(mask, irq_affinity[irq], map); -+ if (any_online_cpu(mask) == NR_CPUS) { -+ printk("Breaking affinity for irq %i\n", irq); -+ mask = map; -+ } -+ if (irq_desc[irq].handler->set_affinity) -+ irq_desc[irq].handler->set_affinity(irq, mask); -+ else if (irq_desc[irq].action && !(warned++)) -+ printk("Cannot set affinity for irq %i\n", irq); -+ } -+ -+#if 0 -+ barrier(); -+ /* Ingo Molnar says: "after the IO-APIC masks have been redirected -+ [note the nop - the interrupt-enable boundary on x86 is two -+ instructions from sti] - to flush out pending hardirqs and -+ IPIs. After this point nothing is supposed to reach this CPU." */ -+ __asm__ __volatile__("sti; nop; cli"); -+ barrier(); -+#else -+ /* That doesn't seem sufficient. Give it 1ms. 
*/ -+ local_irq_enable(); -+ mdelay(1); -+ local_irq_disable(); -+#endif -+} -+#endif -+ -diff -puN arch/i386/kernel/msr.c~i386-cpu-hotplug-updated-for-mm arch/i386/kernel/msr.c ---- 25/arch/i386/kernel/msr.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/arch/i386/kernel/msr.c 2005-02-23 02:20:06.000000000 -0800 -@@ -260,7 +260,7 @@ static struct file_operations msr_fops = - .open = msr_open, - }; - --static int msr_class_simple_device_add(int i) -+static int __devinit msr_class_simple_device_add(int i) - { - int err = 0; - struct class_device *class_err; -diff -puN arch/i386/kernel/process.c~i386-cpu-hotplug-updated-for-mm arch/i386/kernel/process.c ---- 25/arch/i386/kernel/process.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/arch/i386/kernel/process.c 2005-02-23 02:20:06.000000000 -0800 -@@ -13,6 +13,7 @@ - - #include <stdarg.h> - -+#include <linux/cpu.h> - #include <linux/errno.h> - #include <linux/sched.h> - #include <linux/fs.h> -@@ -55,6 +56,9 @@ - #include <linux/irq.h> - #include <linux/err.h> - -+#include <asm/tlbflush.h> -+#include <asm/cpu.h> -+ - asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); - - int hlt_counter; -@@ -139,6 +143,34 @@ static void poll_idle (void) - } - } - -+#ifdef CONFIG_HOTPLUG_CPU -+#include <asm/nmi.h> -+/* We don't actually take CPU down, just spin without interrupts. */ -+static inline void play_dead(void) -+{ -+ /* Ack it */ -+ __get_cpu_var(cpu_state) = CPU_DEAD; -+ -+ /* We shouldn't have to disable interrupts while dead, but -+ * some interrupts just don't seem to go away, and this makes -+ * it "work" for testing purposes. */ -+ /* Death loop */ -+ while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE) -+ cpu_relax(); -+ -+ local_irq_disable(); -+ __flush_tlb_all(); -+ cpu_set(smp_processor_id(), cpu_online_map); -+ enable_APIC_timer(); -+ local_irq_enable(); -+} -+#else -+static inline void play_dead(void) -+{ -+ BUG(); -+} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ - /* - * The idle thread. There's no useful work to be - * done, so just try to conserve power and have a -@@ -162,6 +194,9 @@ void cpu_idle (void) - if (!idle) - idle = default_idle; - -+ if (cpu_is_offline(cpu)) -+ play_dead(); -+ - irq_stat[cpu].idle_timestamp = jiffies; - idle(); - } -diff -puN arch/i386/kernel/smpboot.c~i386-cpu-hotplug-updated-for-mm arch/i386/kernel/smpboot.c ---- 25/arch/i386/kernel/smpboot.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/arch/i386/kernel/smpboot.c 2005-02-23 02:20:06.000000000 -0800 -@@ -44,6 +44,9 @@ - #include <linux/smp_lock.h> - #include <linux/irq.h> - #include <linux/bootmem.h> -+#include <linux/notifier.h> -+#include <linux/cpu.h> -+#include <linux/percpu.h> - - #include <linux/delay.h> - #include <linux/mc146818rtc.h> -@@ -89,6 +92,9 @@ extern unsigned char trampoline_end []; - static unsigned char *trampoline_base; - static int trampoline_exec; - -+/* State of each CPU. */ -+DEFINE_PER_CPU(int, cpu_state) = { 0 }; -+ - /* - * Currently trivial. Write the real->protected mode - * bootstrap into the page concerned. The caller -@@ -1095,6 +1101,9 @@ static void __init smp_boot_cpus(unsigne - who understands all this stuff should rewrite it properly. 
--RR 15/Jul/02 */ - void __init smp_prepare_cpus(unsigned int max_cpus) - { -+ smp_commenced_mask = cpumask_of_cpu(0); -+ cpu_callin_map = cpumask_of_cpu(0); -+ mb(); - smp_boot_cpus(max_cpus); - } - -@@ -1104,20 +1113,99 @@ void __devinit smp_prepare_boot_cpu(void - cpu_set(smp_processor_id(), cpu_callout_map); - } - --int __devinit __cpu_up(unsigned int cpu) -+#ifdef CONFIG_HOTPLUG_CPU -+ -+/* must be called with the cpucontrol mutex held */ -+static int __devinit cpu_enable(unsigned int cpu) - { -- /* This only works at boot for x86. See "rewrite" above. */ -- if (cpu_isset(cpu, smp_commenced_mask)) { -- local_irq_enable(); -- return -ENOSYS; -+ /* get the target out of its holding state */ -+ per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; -+ wmb(); -+ -+ /* wait for the processor to ack it. timeout? */ -+ while (!cpu_online(cpu)) -+ cpu_relax(); -+ -+ fixup_irqs(cpu_online_map); -+ /* counter the disable in fixup_irqs() */ -+ local_irq_enable(); -+ return 0; -+} -+ -+int __cpu_disable(void) -+{ -+ cpumask_t map = cpu_online_map; -+ int cpu = smp_processor_id(); -+ -+ /* -+ * Perhaps use cpufreq to drop frequency, but that could go -+ * into generic code. -+ * -+ * We won't take down the boot processor on i386 due to some -+ * interrupts only being able to be serviced by the BSP. -+ * Especially so if we're not using an IOAPIC -zwane -+ */ -+ if (cpu == 0) -+ return -EBUSY; -+ -+ /* We enable the timer again on the exit path of the death loop */ -+ disable_APIC_timer(); -+ /* Allow any queued timer interrupts to get serviced */ -+ local_irq_enable(); -+ mdelay(1); -+ local_irq_disable(); -+ -+ cpu_clear(cpu, map); -+ fixup_irqs(map); -+ /* It's now safe to remove this processor from the online map */ -+ cpu_clear(cpu, cpu_online_map); -+ return 0; -+} -+ -+void __cpu_die(unsigned int cpu) -+{ -+ /* We don't do anything here: idle task is faking death itself. */ -+ unsigned int i; -+ -+ for (i = 0; i < 10; i++) { -+ /* They ack this in play_dead by setting CPU_DEAD */ -+ if (per_cpu(cpu_state, cpu) == CPU_DEAD) -+ return; -+ current->state = TASK_UNINTERRUPTIBLE; -+ schedule_timeout(HZ/10); - } -+ printk(KERN_ERR "CPU %u didn't die...\n", cpu); -+} -+#else /* ... !CONFIG_HOTPLUG_CPU */ -+int __cpu_disable(void) -+{ -+ return -ENOSYS; -+} - -+void __cpu_die(unsigned int cpu) -+{ -+ /* We said "no" in __cpu_disable */ -+ BUG(); -+} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+int __devinit __cpu_up(unsigned int cpu) -+{ - /* In case one didn't come up */ - if (!cpu_isset(cpu, cpu_callin_map)) { -+ printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu); - local_irq_enable(); - return -EIO; - } - -+#ifdef CONFIG_HOTPLUG_CPU -+ /* Already up, and in cpu_quiescent now? */ -+ if (cpu_isset(cpu, smp_commenced_mask)) { -+ cpu_enable(cpu); -+ return 0; -+ } -+#endif -+ - local_irq_enable(); - /* Unleash the CPU! */ - cpu_set(cpu, smp_commenced_mask); -diff -puN arch/i386/kernel/smp.c~i386-cpu-hotplug-updated-for-mm arch/i386/kernel/smp.c ---- 25/arch/i386/kernel/smp.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/arch/i386/kernel/smp.c 2005-02-23 02:20:06.000000000 -0800 -@@ -19,6 +19,7 @@ - #include <linux/mc146818rtc.h> - #include <linux/cache.h> - #include <linux/interrupt.h> -+#include <linux/cpu.h> - - #include <asm/mtrr.h> - #include <asm/tlbflush.h> -@@ -163,7 +164,7 @@ void send_IPI_mask_bitmask(cpumask_t cpu - unsigned long flags; - - local_irq_save(flags); -- -+ WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]); - /* - * Wait for idle. 
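
Points 7 and 10 of the list above are the heart of the offline protocol
just quoted: __cpu_disable() unplugs the CPU from cpu_online_map, the dying
CPU parks itself in play_dead() with its per-cpu cpu_state set to CPU_DEAD,
and __cpu_die() on a surviving CPU polls for that acknowledgement. A
condensed sketch of the two halves (the _sketch names are illustrative; the
real code above additionally migrates IRQs, flushes the TLB and re-arms the
local APIC timer on resurrection):

	/* Dying CPU: runs from the idle loop once it sees itself offline. */
	static void play_dead_sketch(void)
	{
		__get_cpu_var(cpu_state) = CPU_DEAD;	/* ack the request */
		while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE)
			cpu_relax();		/* spin until re-enabled */
	}

	/* Surviving CPU: called from cpu_down() after __cpu_disable(). */
	static void cpu_die_sketch(unsigned int cpu)
	{
		int i;

		for (i = 0; i < 10; i++) {	/* poll for ~1 second */
			if (per_cpu(cpu_state, cpu) == CPU_DEAD)
				return;
			current->state = TASK_UNINTERRUPTIBLE;
			schedule_timeout(HZ / 10);
		}
		printk(KERN_ERR "CPU %u didn't die...\n", cpu);
	}

The same CPU_UP_PREPARE store is what cpu_enable() above uses to pull a
parked processor back out of its death loop.
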
- */ -@@ -345,21 +346,21 @@ out: - static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, - unsigned long va) - { -- cpumask_t tmp; - /* - * A couple of (to be removed) sanity checks: - * -- * - we do not send IPIs to not-yet booted CPUs. - * - current CPU must not be in mask - * - mask must exist :) - */ - BUG_ON(cpus_empty(cpumask)); -- -- cpus_and(tmp, cpumask, cpu_online_map); -- BUG_ON(!cpus_equal(cpumask, tmp)); - BUG_ON(cpu_isset(smp_processor_id(), cpumask)); - BUG_ON(!mm); - -+ /* If a CPU which we ran on has gone down, OK. */ -+ cpus_and(cpumask, cpumask, cpu_online_map); -+ if (cpus_empty(cpumask)) -+ return; -+ - /* - * i'm not happy about this global shared spinlock in the - * MM hot path, but we'll see how contended it is. -@@ -484,6 +485,7 @@ void smp_send_nmi_allbutself(void) - */ - void smp_send_reschedule(int cpu) - { -+ WARN_ON(cpu_is_offline(cpu)); - send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); - } - -@@ -524,10 +526,16 @@ int smp_call_function (void (*func) (voi - */ - { - struct call_data_struct data; -- int cpus = num_online_cpus()-1; -+ int cpus; - -- if (!cpus) -+ /* Holding any lock stops cpus from going down. */ -+ spin_lock(&call_lock); -+ cpus = num_online_cpus()-1; -+ -+ if (!cpus) { -+ spin_unlock(&call_lock); - return 0; -+ } - - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); -@@ -539,7 +547,6 @@ int smp_call_function (void (*func) (voi - if (wait) - atomic_set(&data.finished, 0); - -- spin_lock(&call_lock); - call_data = &data; - mb(); - -diff -puN arch/i386/kernel/traps.c~i386-cpu-hotplug-updated-for-mm arch/i386/kernel/traps.c ---- 25/arch/i386/kernel/traps.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/arch/i386/kernel/traps.c 2005-02-23 02:20:06.000000000 -0800 -@@ -669,6 +669,14 @@ fastcall void do_nmi(struct pt_regs * re - nmi_enter(); - - cpu = smp_processor_id(); -+ -+#ifdef CONFIG_HOTPLUG_CPU -+ if (!cpu_online(cpu)) { -+ nmi_exit(); -+ return; -+ } -+#endif -+ - ++nmi_count(cpu); - - if (!nmi_callback(regs, cpu)) -diff -puN arch/ia64/kernel/smpboot.c~i386-cpu-hotplug-updated-for-mm arch/ia64/kernel/smpboot.c ---- 25/arch/ia64/kernel/smpboot.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/arch/ia64/kernel/smpboot.c 2005-02-23 02:20:06.000000000 -0800 -@@ -590,9 +590,10 @@ int __cpu_disable(void) - if (cpu == 0) - return -EBUSY; - -+ cpu_clear(cpu, cpu_online_map); - fixup_irqs(); - local_flush_tlb_all(); -- printk ("Disabled cpu %u\n", smp_processor_id()); -+ printk("Disabled cpu %u\n", cpu); - return 0; - } - -diff -puN arch/ppc64/kernel/smp.c~i386-cpu-hotplug-updated-for-mm arch/ppc64/kernel/smp.c -diff -puN arch/s390/kernel/smp.c~i386-cpu-hotplug-updated-for-mm arch/s390/kernel/smp.c ---- 25/arch/s390/kernel/smp.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/arch/s390/kernel/smp.c 2005-02-23 02:20:06.000000000 -0800 -@@ -679,12 +679,14 @@ __cpu_disable(void) - { - unsigned long flags; - ec_creg_mask_parms cr_parms; -+ int cpu = smp_processor_id(); - - spin_lock_irqsave(&smp_reserve_lock, flags); -- if (smp_cpu_reserved[smp_processor_id()] != 0) { -+ if (smp_cpu_reserved[cpu] != 0) { - spin_unlock_irqrestore(&smp_reserve_lock, flags); - return -EBUSY; - } -+ cpu_clear(cpu, cpu_online_map); - - #ifdef CONFIG_PFAULT - /* Disable pfault pseudo page faults on this cpu. 
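
The smp_call_function() hunk above is a classic hotplug race fix: the
online-CPU count must be sampled under call_lock, because a CPU could
otherwise go down between computing the count and broadcasting the IPI. As
the added comment notes, holding any spinlock keeps preemption off, which
prevents the stop-machine-style takedown from running in between. A
condensed sketch of the corrected shape (setup of the call_data descriptor
and the IPI send are elided):

	int smp_call_function_sketch(void (*func)(void *), void *info,
				     int wait)
	{
		int cpus;

		/* Any held spinlock pins the set of online CPUs. */
		spin_lock(&call_lock);
		cpus = num_online_cpus() - 1;
		if (!cpus) {
			spin_unlock(&call_lock);
			return 0;	/* uniprocessor: nobody to call */
		}

		/* ... fill in call_data, send the IPIs, and wait for
		 *     acks exactly as in the hunk above ... */

		spin_unlock(&call_lock);
		return 0;
	}
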
*/ -diff -puN include/asm-i386/cpu.h~i386-cpu-hotplug-updated-for-mm include/asm-i386/cpu.h ---- 25/include/asm-i386/cpu.h~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/include/asm-i386/cpu.h 2005-02-23 02:20:06.000000000 -0800 -@@ -5,6 +5,7 @@ - #include <linux/cpu.h> - #include <linux/topology.h> - #include <linux/nodemask.h> -+#include <linux/percpu.h> - - #include <asm/node.h> - -@@ -17,4 +18,5 @@ extern int arch_register_cpu(int num); - extern void arch_unregister_cpu(int); - #endif - -+DECLARE_PER_CPU(int, cpu_state); - #endif /* _ASM_I386_CPU_H_ */ -diff -puN include/asm-i386/irq.h~i386-cpu-hotplug-updated-for-mm include/asm-i386/irq.h ---- 25/include/asm-i386/irq.h~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/include/asm-i386/irq.h 2005-02-23 02:20:06.000000000 -0800 -@@ -38,4 +38,8 @@ extern void release_vm86_irqs(struct tas - extern int irqbalance_disable(char *str); - #endif - -+#ifdef CONFIG_HOTPLUG_CPU -+extern void fixup_irqs(cpumask_t map); -+#endif -+ - #endif /* _ASM_IRQ_H */ -diff -puN include/asm-i386/smp.h~i386-cpu-hotplug-updated-for-mm include/asm-i386/smp.h ---- 25/include/asm-i386/smp.h~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/include/asm-i386/smp.h 2005-02-23 02:20:06.000000000 -0800 -@@ -85,6 +85,9 @@ static __inline int logical_smp_processo - } - - #endif -+ -+extern int __cpu_disable(void); -+extern void __cpu_die(unsigned int cpu); - #endif /* !__ASSEMBLY__ */ - - #define NO_PROC_ID 0xFF /* No processor magic marker */ -diff -puN kernel/cpu.c~i386-cpu-hotplug-updated-for-mm kernel/cpu.c ---- 25/kernel/cpu.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:06.000000000 -0800 -+++ 25-akpm/kernel/cpu.c 2005-02-23 02:20:06.000000000 -0800 -@@ -63,19 +63,15 @@ static int take_cpu_down(void *unused) - { - int err; - -- /* Take offline: makes arch_cpu_down somewhat easier. */ -- cpu_clear(smp_processor_id(), cpu_online_map); -- - /* Ensure this CPU doesn't handle any more interrupts. */ - err = __cpu_disable(); - if (err < 0) -- cpu_set(smp_processor_id(), cpu_online_map); -- else -- /* Force idle task to run as soon as we yield: it should -- immediately notice cpu is offline and die quickly. */ -- sched_idle_next(); -+ return err; - -- return err; -+ /* Force idle task to run as soon as we yield: it should -+ immediately notice cpu is offline and die quickly. 
*/ -+ sched_idle_next(); -+ return 0; - } - - int cpu_down(unsigned int cpu) -diff -puN arch/ppc64/kernel/pSeries_smp.c~i386-cpu-hotplug-updated-for-mm arch/ppc64/kernel/pSeries_smp.c ---- 25/arch/ppc64/kernel/pSeries_smp.c~i386-cpu-hotplug-updated-for-mm 2005-02-23 02:20:08.000000000 -0800 -+++ 25-akpm/arch/ppc64/kernel/pSeries_smp.c 2005-02-23 02:20:08.000000000 -0800 -@@ -86,10 +86,13 @@ static int query_cpu_stopped(unsigned in - - int pSeries_cpu_disable(void) - { -+ int cpu = smp_processor_id(); -+ -+ cpu_clear(cpu, cpu_online_map); - systemcfg->processorCount--; - - /*fix boot_cpuid here*/ -- if (smp_processor_id() == boot_cpuid) -+ if (cpu == boot_cpuid) - boot_cpuid = any_online_cpu(cpu_online_map); - - /* FIXME: abstract this to not be platform specific later on */ -_ diff -r d75a502b45eb -r 43e28a2f6037 patches/linux-2.6.11/iomap.patch --- a/patches/linux-2.6.11/iomap.patch Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,120 +0,0 @@ -diff -ur linux-2.6.11/drivers/char/agp/frontend.c linux-2.6.11-io/drivers/char/agp/frontend.c ---- linux-2.6.11/drivers/char/agp/frontend.c 2005-03-02 07:37:49.000000000 +0000 -+++ linux-2.6.11-io/drivers/char/agp/frontend.c 2005-03-15 17:38:30.000000000 +0000 -@@ -627,7 +627,7 @@ - DBG("client vm_ops=%p", kerninfo.vm_ops); - if (kerninfo.vm_ops) { - vma->vm_ops = kerninfo.vm_ops; -- } else if (remap_pfn_range(vma, vma->vm_start, -+ } else if (io_remap_pfn_range(vma, vma->vm_start, - (kerninfo.aper_base + offset) >> PAGE_SHIFT, - size, vma->vm_page_prot)) { - goto out_again; -@@ -643,7 +643,7 @@ - DBG("controller vm_ops=%p", kerninfo.vm_ops); - if (kerninfo.vm_ops) { - vma->vm_ops = kerninfo.vm_ops; -- } else if (remap_pfn_range(vma, vma->vm_start, -+ } else if (io_remap_pfn_range(vma, vma->vm_start, - kerninfo.aper_base >> PAGE_SHIFT, - size, vma->vm_page_prot)) { - goto out_again; -diff -ur linux-2.6.11/drivers/char/drm/drm_vm.c linux-2.6.11-io/drivers/char/drm/drm_vm.c ---- linux-2.6.11/drivers/char/drm/drm_vm.c 2005-03-02 07:38:33.000000000 +0000 -+++ linux-2.6.11-io/drivers/char/drm/drm_vm.c 2005-03-15 17:43:26.000000000 +0000 -@@ -630,7 +630,7 @@ - vma->vm_end - vma->vm_start, - vma->vm_page_prot, 0)) - #else -- if (remap_pfn_range(DRM_RPR_ARG(vma) vma->vm_start, -+ if (io_remap_pfn_range(vma, vma->vm_start, - (VM_OFFSET(vma) + offset) >> PAGE_SHIFT, - vma->vm_end - vma->vm_start, - vma->vm_page_prot)) -diff -ur linux-2.6.11/drivers/char/drm/i810_dma.c linux-2.6.11-io/drivers/char/drm/i810_dma.c ---- linux-2.6.11/drivers/char/drm/i810_dma.c 2005-03-02 07:37:55.000000000 +0000 -+++ linux-2.6.11-io/drivers/char/drm/i810_dma.c 2005-03-15 17:53:36.000000000 +0000 -@@ -139,7 +139,7 @@ - buf_priv->currently_mapped = I810_BUF_MAPPED; - unlock_kernel(); - -- if (remap_pfn_range(DRM_RPR_ARG(vma) vma->vm_start, -+ if (io_remap_pfn_range(vma, vma->vm_start, - VM_OFFSET(vma) >> PAGE_SHIFT, - vma->vm_end - vma->vm_start, - vma->vm_page_prot)) return -EAGAIN; -diff -ur linux-2.6.11/drivers/char/drm/i830_dma.c linux-2.6.11-io/drivers/char/drm/i830_dma.c ---- linux-2.6.11/drivers/char/drm/i830_dma.c 2005-03-02 07:37:48.000000000 +0000 -+++ linux-2.6.11-io/drivers/char/drm/i830_dma.c 2005-03-15 17:53:46.000000000 +0000 -@@ -157,7 +157,7 @@ - buf_priv->currently_mapped = I830_BUF_MAPPED; - unlock_kernel(); - -- if (remap_pfn_range(DRM_RPR_ARG(vma) vma->vm_start, -+ if (io_remap_pfn_range(vma, vma->vm_start, - VM_OFFSET(vma) >> PAGE_SHIFT, - vma->vm_end - vma->vm_start, - vma->vm_page_prot)) return -EAGAIN; -diff -ur 
linux-2.6.11/drivers/char/hpet.c linux-2.6.11-io/drivers/char/hpet.c ---- linux-2.6.11/drivers/char/hpet.c 2005-03-02 07:38:10.000000000 +0000 -+++ linux-2.6.11-io/drivers/char/hpet.c 2005-03-15 17:37:22.000000000 +0000 -@@ -76,6 +76,7 @@ - struct hpets { - struct hpets *hp_next; - struct hpet __iomem *hp_hpet; -+ unsigned long hp_hpet_phys; - struct time_interpolator *hp_interpolator; - unsigned long hp_period; - unsigned long hp_delta; -@@ -265,7 +266,7 @@ - return -EINVAL; - - devp = file->private_data; -- addr = (unsigned long)devp->hd_hpet; -+ addr = devp->hd_hpets->hp_hpet_phys; - - if (addr & (PAGE_SIZE - 1)) - return -ENOSYS; -@@ -274,7 +275,7 @@ - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - addr = __pa(addr); - -- if (remap_pfn_range(vma, vma->vm_start, addr >> PAGE_SHIFT, -+ if (io_remap_pfn_range(vma, vma->vm_start, addr >> PAGE_SHIFT, - PAGE_SIZE, vma->vm_page_prot)) { - printk(KERN_ERR "remap_pfn_range failed in hpet.c\n"); - return -EAGAIN; -@@ -795,6 +796,7 @@ - - hpetp->hp_which = hpet_nhpet++; - hpetp->hp_hpet = hdp->hd_address; -+ hpetp->hp_hpet_phys = hdp->hd_phys_address; - - hpetp->hp_ntimer = hdp->hd_nirqs; - -diff -ur linux-2.6.11/drivers/sbus/char/flash.c linux-2.6.11-io/drivers/sbus/char/flash.c ---- linux-2.6.11/drivers/sbus/char/flash.c 2005-03-02 07:38:10.000000000 +0000 -+++ linux-2.6.11-io/drivers/sbus/char/flash.c 2005-03-15 17:20:22.000000000 +0000 -@@ -75,7 +75,7 @@ - pgprot_val(vma->vm_page_prot) |= _PAGE_E; - vma->vm_flags |= (VM_SHM | VM_LOCKED); - -- if (remap_pfn_range(vma, vma->vm_start, addr, size, vma->vm_page_prot)) -+ if (io_remap_pfn_range(vma, vma->vm_start, addr, size, vma->vm_page_prot)) - return -EAGAIN; - - return 0; -diff -ur linux-2.6.11/include/linux/mm.h linux-2.6.11-io/include/linux/mm.h ---- linux-2.6.11/include/linux/mm.h 2005-03-02 07:37:47.000000000 +0000 -+++ linux-2.6.11-io/include/linux/mm.h 2005-03-15 17:03:46.000000000 +0000 -@@ -815,6 +815,10 @@ - extern int check_user_page_readable(struct mm_struct *mm, unsigned long address); - int remap_pfn_range(struct vm_area_struct *, unsigned long, - unsigned long, unsigned long, pgprot_t); -+/* Allow arch override for mapping of device and I/O (non-RAM) pages. */ -+#ifndef io_remap_pfn_range -+#define io_remap_pfn_range remap_pfn_range -+#endif - - #ifdef CONFIG_PROC_FS - void __vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); diff -r d75a502b45eb -r 43e28a2f6037 patches/linux-2.6.11/linux-2.6.11.12.patch --- a/patches/linux-2.6.11/linux-2.6.11.12.patch Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,2579 +0,0 @@ -diff --git a/Documentation/SecurityBugs b/Documentation/SecurityBugs -new file mode 100644 ---- /dev/null -+++ b/Documentation/SecurityBugs -@@ -0,0 +1,38 @@ -+Linux kernel developers take security very seriously. As such, we'd -+like to know when a security bug is found so that it can be fixed and -+disclosed as quickly as possible. Please report security bugs to the -+Linux kernel security team. -+ -+1) Contact -+ -+The Linux kernel security team can be contacted by email at -+<security@xxxxxxxxxx>. This is a private list of security officers -+who will help verify the bug report and develop and release a fix. -+It is possible that the security team will bring in extra help from -+area maintainers to understand and fix the security vulnerability. -+ -+As it is with any bug, the more information provided the easier it -+will be to diagnose and fix. 
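
Back in the include/linux/mm.h hunk above sits the whole mechanism behind
iomap.patch's driver churn: io_remap_pfn_range() is supplied as a generic
fallback macro that an architecture (or a modified tree such as this one)
may pre-define in order to interpose on mappings of device and I/O frames.
The pattern, with a placeholder override name for illustration:

	/* Generic fallback, as added to include/linux/mm.h above: */
	#ifndef io_remap_pfn_range
	#define io_remap_pfn_range remap_pfn_range
	#endif

	/*
	 * An architecture that must translate I/O frame numbers defines
	 * its own version in its headers before mm.h is seen, e.g.
	 * (placeholder name, not from this patch):
	 *
	 *   #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
	 *	arch_remap_io_frames(vma, vaddr, pfn, size, prot)
	 */

Drivers that map device apertures therefore call io_remap_pfn_range()
unconditionally; on a stock build it macro-expands to the old
remap_pfn_range() call, so behaviour there is unchanged.
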
Please review the procedure outlined in -+REPORTING-BUGS if you are unclear about what information is helpful. -+Any exploit code is very helpful and will not be released without -+consent from the reporter unless it has already been made public. -+ -+2) Disclosure -+ -+The goal of the Linux kernel security team is to work with the -+bug submitter to bug resolution as well as disclosure. We prefer -+to fully disclose the bug as soon as possible. It is reasonable to -+delay disclosure when the bug or the fix is not yet fully understood, -+the solution is not well-tested or for vendor coordination. However, we -+expect these delays to be short, measurable in days, not weeks or months. -+A disclosure date is negotiated by the security team working with the -+bug submitter as well as vendors. However, the kernel security team -+holds the final say when setting a disclosure date. The timeframe for -+disclosure is from immediate (esp. if it's already publically known) -+to a few weeks. As a basic default policy, we expect report date to -+disclosure date to be on the order of 7 days. -+ -+3) Non-disclosure agreements -+ -+The Linux kernel security team is not a formal body and therefore unable -+to enter any non-disclosure agreements. -diff --git a/MAINTAINERS b/MAINTAINERS ---- a/MAINTAINERS -+++ b/MAINTAINERS -@@ -1966,6 +1966,11 @@ M: christer@xxxxxxxxxxx - W: http://www.weinigel.se - S: Supported - -+SECURITY CONTACT -+P: Security Officers -+M: security@xxxxxxxxxx -+S: Supported -+ - SELINUX SECURITY MODULE - P: Stephen Smalley - M: sds@xxxxxxxxxxxxxx -diff --git a/Makefile b/Makefile ---- a/Makefile -+++ b/Makefile -@@ -1,8 +1,8 @@ - VERSION = 2 - PATCHLEVEL = 6 - SUBLEVEL = 11 --EXTRAVERSION = --NAME=Woozy Numbat -+EXTRAVERSION = .12 -+NAME=Woozy Beaver - - # *DOCUMENTATION* - # To see a list of typical targets execute "make help" -diff --git a/REPORTING-BUGS b/REPORTING-BUGS ---- a/REPORTING-BUGS -+++ b/REPORTING-BUGS -@@ -16,6 +16,10 @@ code relevant to what you were doing. If - describe how to recreate it. That is worth even more than the oops itself. - The list of maintainers is in the MAINTAINERS file in this directory. - -+ If it is a security bug, please copy the Security Contact listed -+in the MAINTAINERS file. They can help coordinate bugfix and disclosure. -+See Documentation/SecurityBugs for more infomation. -+ - If you are totally stumped as to whom to send the report, send it to - linux-kernel@xxxxxxxxxxxxxxxx (For more information on the linux-kernel - mailing list see http://www.tux.org/lkml/). -diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S ---- a/arch/ia64/kernel/fsys.S -+++ b/arch/ia64/kernel/fsys.S -@@ -611,8 +611,10 @@ GLOBAL_ENTRY(fsys_bubble_down) - movl r2=ia64_ret_from_syscall - ;; - mov rp=r2 // set the real return addr -- tbit.z p8,p0=r3,TIF_SYSCALL_TRACE -+ and r3=_TIF_SYSCALL_TRACEAUDIT,r3 - ;; -+ cmp.eq p8,p0=r3,r0 -+ - (p10) br.cond.spnt.many ia64_ret_from_syscall // p10==true means out registers are more than 8 - (p8) br.call.sptk.many b6=b6 // ignore this return addr - br.cond.sptk ia64_trace_syscall -diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c ---- a/arch/ia64/kernel/signal.c -+++ b/arch/ia64/kernel/signal.c -@@ -224,7 +224,8 @@ ia64_rt_sigreturn (struct sigscratch *sc - * could be corrupted. 
- */ - retval = (long) &ia64_leave_kernel; -- if (test_thread_flag(TIF_SYSCALL_TRACE)) -+ if (test_thread_flag(TIF_SYSCALL_TRACE) -+ || test_thread_flag(TIF_SYSCALL_AUDIT)) - /* - * strace expects to be notified after sigreturn returns even though the - * context to which we return may not be in the middle of a syscall. -diff --git a/arch/ppc/oprofile/op_model_fsl_booke.c b/arch/ppc/oprofile/op_model_fsl_booke.c ---- a/arch/ppc/oprofile/op_model_fsl_booke.c -+++ b/arch/ppc/oprofile/op_model_fsl_booke.c -@@ -150,7 +150,6 @@ static void fsl_booke_handle_interrupt(s - int is_kernel; - int val; - int i; -- unsigned int cpu = smp_processor_id(); - - /* set the PMM bit (see comment below) */ - mtmsr(mfmsr() | MSR_PMM); -@@ -162,7 +161,7 @@ static void fsl_booke_handle_interrupt(s - val = ctr_read(i); - if (val < 0) { - if (oprofile_running && ctr[i].enabled) { -- oprofile_add_sample(pc, is_kernel, i, cpu); -+ oprofile_add_pc(pc, is_kernel, i); - ctr_write(i, reset_value[i]); - } else { - ctr_write(i, 0); -diff --git a/arch/ppc/platforms/4xx/ebony.h b/arch/ppc/platforms/4xx/ebony.h ---- a/arch/ppc/platforms/4xx/ebony.h -+++ b/arch/ppc/platforms/4xx/ebony.h -@@ -61,8 +61,8 @@ - */ - - /* OpenBIOS defined UART mappings, used before early_serial_setup */ --#define UART0_IO_BASE (u8 *) 0xE0000200 --#define UART1_IO_BASE (u8 *) 0xE0000300 -+#define UART0_IO_BASE 0xE0000200 -+#define UART1_IO_BASE 0xE0000300 - - /* external Epson SG-615P */ - #define BASE_BAUD 691200 -diff --git a/arch/ppc/platforms/4xx/luan.h b/arch/ppc/platforms/4xx/luan.h ---- a/arch/ppc/platforms/4xx/luan.h -+++ b/arch/ppc/platforms/4xx/luan.h -@@ -47,9 +47,9 @@ - #define RS_TABLE_SIZE 3 - - /* PIBS defined UART mappings, used before early_serial_setup */ --#define UART0_IO_BASE (u8 *) 0xa0000200 --#define UART1_IO_BASE (u8 *) 0xa0000300 --#define UART2_IO_BASE (u8 *) 0xa0000600 -+#define UART0_IO_BASE 0xa0000200 -+#define UART1_IO_BASE 0xa0000300 -+#define UART2_IO_BASE 0xa0000600 - - #define BASE_BAUD 11059200 - #define STD_UART_OP(num) \ -diff --git a/arch/ppc/platforms/4xx/ocotea.h b/arch/ppc/platforms/4xx/ocotea.h ---- a/arch/ppc/platforms/4xx/ocotea.h -+++ b/arch/ppc/platforms/4xx/ocotea.h -@@ -56,8 +56,8 @@ - #define RS_TABLE_SIZE 2 - - /* OpenBIOS defined UART mappings, used before early_serial_setup */ --#define UART0_IO_BASE (u8 *) 0xE0000200 --#define UART1_IO_BASE (u8 *) 0xE0000300 -+#define UART0_IO_BASE 0xE0000200 -+#define UART1_IO_BASE 0xE0000300 - - #define BASE_BAUD 11059200/16 - #define STD_UART_OP(num) \ -diff --git a/arch/ppc64/kernel/pSeries_iommu.c b/arch/ppc64/kernel/pSeries_iommu.c ---- a/arch/ppc64/kernel/pSeries_iommu.c -+++ b/arch/ppc64/kernel/pSeries_iommu.c -@@ -401,6 +401,8 @@ static void iommu_bus_setup_pSeriesLP(st - struct device_node *dn, *pdn; - unsigned int *dma_window = NULL; - -+ DBG("iommu_bus_setup_pSeriesLP, bus %p, bus->self %p\n", bus, bus->self); -+ - dn = pci_bus_to_OF_node(bus); - - /* Find nearest ibm,dma-window, walking up the device tree */ -@@ -455,6 +457,56 @@ static void iommu_dev_setup_pSeries(stru - } - } - -+static void iommu_dev_setup_pSeriesLP(struct pci_dev *dev) -+{ -+ struct device_node *pdn, *dn; -+ struct iommu_table *tbl; -+ int *dma_window = NULL; -+ -+ DBG("iommu_dev_setup_pSeriesLP, dev %p (%s)\n", dev, dev->pretty_name); -+ -+ /* dev setup for LPAR is a little tricky, since the device tree might -+ * contain the dma-window properties per-device and not neccesarily -+ * for the bus. 
So we need to search upwards in the tree until we -+ * either hit a dma-window property, OR find a parent with a table -+ * already allocated. -+ */ -+ dn = pci_device_to_OF_node(dev); -+ -+ for (pdn = dn; pdn && !pdn->iommu_table; pdn = pdn->parent) { -+ dma_window = (unsigned int *)get_property(pdn, "ibm,dma-window", NULL); -+ if (dma_window) -+ break; -+ } -+ -+ /* Check for parent == NULL so we don't try to setup the empty EADS -+ * slots on POWER4 machines. -+ */ -+ if (dma_window == NULL || pdn->parent == NULL) { -+ /* Fall back to regular (non-LPAR) dev setup */ -+ DBG("No dma window for device, falling back to regular setup\n"); -+ iommu_dev_setup_pSeries(dev); -+ return; -+ } else { -+ DBG("Found DMA window, allocating table\n"); -+ } -+ -+ if (!pdn->iommu_table) { -+ /* iommu_table_setparms_lpar needs bussubno. */ -+ pdn->bussubno = pdn->phb->bus->number; -+ -+ tbl = (struct iommu_table *)kmalloc(sizeof(struct iommu_table), -+ GFP_KERNEL); -+ -+ iommu_table_setparms_lpar(pdn->phb, pdn, tbl, dma_window); -+ -+ pdn->iommu_table = iommu_init_table(tbl); -+ } -+ -+ if (pdn != dn) -+ dn->iommu_table = pdn->iommu_table; -+} -+ - static void iommu_bus_setup_null(struct pci_bus *b) { } - static void iommu_dev_setup_null(struct pci_dev *d) { } - -@@ -479,13 +531,14 @@ void iommu_init_early_pSeries(void) - ppc_md.tce_free = tce_free_pSeriesLP; - } - ppc_md.iommu_bus_setup = iommu_bus_setup_pSeriesLP; -+ ppc_md.iommu_dev_setup = iommu_dev_setup_pSeriesLP; - } else { - ppc_md.tce_build = tce_build_pSeries; - ppc_md.tce_free = tce_free_pSeries; - ppc_md.iommu_bus_setup = iommu_bus_setup_pSeries; -+ ppc_md.iommu_dev_setup = iommu_dev_setup_pSeries; - } - -- ppc_md.iommu_dev_setup = iommu_dev_setup_pSeries; - - pci_iommu_init(); - } -diff --git a/arch/sparc/kernel/ptrace.c b/arch/sparc/kernel/ptrace.c ---- a/arch/sparc/kernel/ptrace.c -+++ b/arch/sparc/kernel/ptrace.c -@@ -531,18 +531,6 @@ asmlinkage void do_ptrace(struct pt_regs - pt_error_return(regs, EIO); - goto out_tsk; - } -- if (addr != 1) { -- if (addr & 3) { -- pt_error_return(regs, EINVAL); -- goto out_tsk; -- } --#ifdef DEBUG_PTRACE -- printk ("Original: %08lx %08lx\n", child->thread.kregs->pc, child->thread.kregs->npc); -- printk ("Continuing with %08lx %08lx\n", addr, addr+4); --#endif -- child->thread.kregs->pc = addr; -- child->thread.kregs->npc = addr + 4; -- } - - if (request == PTRACE_SYSCALL) - set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); -diff --git a/arch/sparc64/kernel/ptrace.c b/arch/sparc64/kernel/ptrace.c ---- a/arch/sparc64/kernel/ptrace.c -+++ b/arch/sparc64/kernel/ptrace.c -@@ -514,25 +514,6 @@ asmlinkage void do_ptrace(struct pt_regs - pt_error_return(regs, EIO); - goto out_tsk; - } -- if (addr != 1) { -- unsigned long pc_mask = ~0UL; -- -- if ((child->thread_info->flags & _TIF_32BIT) != 0) -- pc_mask = 0xffffffff; -- -- if (addr & 3) { -- pt_error_return(regs, EINVAL); -- goto out_tsk; -- } --#ifdef DEBUG_PTRACE -- printk ("Original: %016lx %016lx\n", -- child->thread_info->kregs->tpc, -- child->thread_info->kregs->tnpc); -- printk ("Continuing with %016lx %016lx\n", addr, addr+4); --#endif -- child->thread_info->kregs->tpc = (addr & pc_mask); -- child->thread_info->kregs->tnpc = ((addr + 4) & pc_mask); -- } - - if (request == PTRACE_SYSCALL) { - set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); -diff --git a/arch/sparc64/kernel/signal32.c b/arch/sparc64/kernel/signal32.c ---- a/arch/sparc64/kernel/signal32.c -+++ b/arch/sparc64/kernel/signal32.c -@@ -192,10 +192,13 @@ int copy_siginfo_to_user32(compat_siginf - err 
|= __put_user(from->si_uid, &to->si_uid); - break; - case __SI_FAULT >> 16: -- case __SI_POLL >> 16: - err |= __put_user(from->si_trapno, &to->si_trapno); - err |= __put_user((unsigned long)from->si_addr, &to->si_addr); - break; -+ case __SI_POLL >> 16: -+ err |= __put_user(from->si_band, &to->si_band); -+ err |= __put_user(from->si_fd, &to->si_fd); -+ break; - case __SI_RT >> 16: /* This is not generated by the kernel as of now. */ - case __SI_MESGQ >> 16: - err |= __put_user(from->si_pid, &to->si_pid); -diff --git a/arch/sparc64/kernel/systbls.S b/arch/sparc64/kernel/systbls.S ---- a/arch/sparc64/kernel/systbls.S -+++ b/arch/sparc64/kernel/systbls.S -@@ -75,7 +75,7 @@ sys_call_table32: - /*260*/ .word compat_sys_sched_getaffinity, compat_sys_sched_setaffinity, sys32_timer_settime, compat_sys_timer_gettime, sys_timer_getoverrun - .word sys_timer_delete, sys32_timer_create, sys_ni_syscall, compat_sys_io_setup, sys_io_destroy - /*270*/ .word sys32_io_submit, sys_io_cancel, compat_sys_io_getevents, sys32_mq_open, sys_mq_unlink -- .word sys_mq_timedsend, sys_mq_timedreceive, compat_sys_mq_notify, compat_sys_mq_getsetattr, compat_sys_waitid -+ .word compat_sys_mq_timedsend, compat_sys_mq_timedreceive, compat_sys_mq_notify, compat_sys_mq_getsetattr, compat_sys_waitid - /*280*/ .word sys_ni_syscall, sys_add_key, sys_request_key, sys_keyctl - - #endif /* CONFIG_COMPAT */ -diff --git a/arch/um/include/sysdep-i386/syscalls.h b/arch/um/include/sysdep-i386/syscalls.h ---- a/arch/um/include/sysdep-i386/syscalls.h -+++ b/arch/um/include/sysdep-i386/syscalls.h -@@ -23,6 +23,9 @@ extern long sys_mmap2(unsigned long addr - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff); - -+/* On i386 they choose a meaningless naming.*/ -+#define __NR_kexec_load __NR_sys_kexec_load -+ - #define ARCH_SYSCALLS \ - [ __NR_waitpid ] = (syscall_handler_t *) sys_waitpid, \ - [ __NR_break ] = (syscall_handler_t *) sys_ni_syscall, \ -@@ -101,15 +104,12 @@ extern long sys_mmap2(unsigned long addr - [ 223 ] = (syscall_handler_t *) sys_ni_syscall, \ - [ __NR_set_thread_area ] = (syscall_handler_t *) sys_ni_syscall, \ - [ __NR_get_thread_area ] = (syscall_handler_t *) sys_ni_syscall, \ -- [ __NR_fadvise64 ] = (syscall_handler_t *) sys_fadvise64, \ - [ 251 ] = (syscall_handler_t *) sys_ni_syscall, \ -- [ __NR_remap_file_pages ] = (syscall_handler_t *) sys_remap_file_pages, \ -- [ __NR_utimes ] = (syscall_handler_t *) sys_utimes, \ -- [ __NR_vserver ] = (syscall_handler_t *) sys_ni_syscall, -- -+ [ 285 ] = (syscall_handler_t *) sys_ni_syscall, -+ - /* 222 doesn't yet have a name in include/asm-i386/unistd.h */ - --#define LAST_ARCH_SYSCALL __NR_vserver -+#define LAST_ARCH_SYSCALL 285 - - /* - * Overrides for Emacs so that we follow Linus's tabbing style. 
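
The sparc64 signal32.c hunk above fixes a compat-copy bug worth spelling
out: siginfo_t's payload is a union keyed by the class encoded in the upper
bits of si_code, and a SIGPOLL/SIGIO record carries si_band and si_fd, not
a fault address. Before the fix, __SI_POLL shared the __SI_FAULT case, so
32-bit userspace read garbage. The corrected dispatch, condensed from the
hunk:

	switch (from->si_code >> 16) {
	case __SI_FAULT >> 16:	/* fault: trap number plus faulting address */
		err |= __put_user(from->si_trapno, &to->si_trapno);
		err |= __put_user((unsigned long)from->si_addr, &to->si_addr);
		break;
	case __SI_POLL >> 16:	/* SIGPOLL/SIGIO: band event plus fd */
		err |= __put_user(from->si_band, &to->si_band);
		err |= __put_user(from->si_fd, &to->si_fd);
		break;
	}
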
-diff --git a/arch/um/include/sysdep-x86_64/syscalls.h b/arch/um/include/sysdep-x86_64/syscalls.h ---- a/arch/um/include/sysdep-x86_64/syscalls.h -+++ b/arch/um/include/sysdep-x86_64/syscalls.h -@@ -71,12 +71,7 @@ extern syscall_handler_t sys_arch_prctl; - [ __NR_iopl ] = (syscall_handler_t *) sys_ni_syscall, \ - [ __NR_set_thread_area ] = (syscall_handler_t *) sys_ni_syscall, \ - [ __NR_get_thread_area ] = (syscall_handler_t *) sys_ni_syscall, \ -- [ __NR_remap_file_pages ] = (syscall_handler_t *) sys_remap_file_pages, \ - [ __NR_semtimedop ] = (syscall_handler_t *) sys_semtimedop, \ -- [ __NR_fadvise64 ] = (syscall_handler_t *) sys_fadvise64, \ -- [ 223 ] = (syscall_handler_t *) sys_ni_syscall, \ -- [ __NR_utimes ] = (syscall_handler_t *) sys_utimes, \ -- [ __NR_vserver ] = (syscall_handler_t *) sys_ni_syscall, \ - [ 251 ] = (syscall_handler_t *) sys_ni_syscall, - - #define LAST_ARCH_SYSCALL 251 -diff --git a/arch/um/kernel/skas/uaccess.c b/arch/um/kernel/skas/uaccess.c ---- a/arch/um/kernel/skas/uaccess.c -+++ b/arch/um/kernel/skas/uaccess.c -@@ -61,7 +61,8 @@ static void do_buffer_op(void *jmpbuf, v - void *arg; - int *res; - -- va_copy(args, *(va_list *)arg_ptr); -+ /* Some old gccs recognize __va_copy, but not va_copy */ -+ __va_copy(args, *(va_list *)arg_ptr); - addr = va_arg(args, unsigned long); - len = va_arg(args, int); - is_write = va_arg(args, int); -diff --git a/arch/um/kernel/sys_call_table.c b/arch/um/kernel/sys_call_table.c ---- a/arch/um/kernel/sys_call_table.c -+++ b/arch/um/kernel/sys_call_table.c -@@ -48,7 +48,6 @@ extern syscall_handler_t sys_vfork; - extern syscall_handler_t old_select; - extern syscall_handler_t sys_modify_ldt; - extern syscall_handler_t sys_rt_sigsuspend; --extern syscall_handler_t sys_vserver; - extern syscall_handler_t sys_mbind; - extern syscall_handler_t sys_get_mempolicy; - extern syscall_handler_t sys_set_mempolicy; -@@ -242,6 +241,7 @@ syscall_handler_t *sys_call_table[] = { - [ __NR_epoll_create ] = (syscall_handler_t *) sys_epoll_create, - [ __NR_epoll_ctl ] = (syscall_handler_t *) sys_epoll_ctl, - [ __NR_epoll_wait ] = (syscall_handler_t *) sys_epoll_wait, -+ [ __NR_remap_file_pages ] = (syscall_handler_t *) sys_remap_file_pages, - [ __NR_set_tid_address ] = (syscall_handler_t *) sys_set_tid_address, - [ __NR_timer_create ] = (syscall_handler_t *) sys_timer_create, - [ __NR_timer_settime ] = (syscall_handler_t *) sys_timer_settime, -@@ -252,12 +252,10 @@ syscall_handler_t *sys_call_table[] = { - [ __NR_clock_gettime ] = (syscall_handler_t *) sys_clock_gettime, - [ __NR_clock_getres ] = (syscall_handler_t *) sys_clock_getres, - [ __NR_clock_nanosleep ] = (syscall_handler_t *) sys_clock_nanosleep, -- [ __NR_statfs64 ] = (syscall_handler_t *) sys_statfs64, -- [ __NR_fstatfs64 ] = (syscall_handler_t *) sys_fstatfs64, - [ __NR_tgkill ] = (syscall_handler_t *) sys_tgkill, - [ __NR_utimes ] = (syscall_handler_t *) sys_utimes, -- [ __NR_fadvise64_64 ] = (syscall_handler_t *) sys_fadvise64_64, -- [ __NR_vserver ] = (syscall_handler_t *) sys_vserver, -+ [ __NR_fadvise64 ] = (syscall_handler_t *) sys_fadvise64, -+ [ __NR_vserver ] = (syscall_handler_t *) sys_ni_syscall, - [ __NR_mbind ] = (syscall_handler_t *) sys_mbind, - [ __NR_get_mempolicy ] = (syscall_handler_t *) sys_get_mempolicy, - [ __NR_set_mempolicy ] = (syscall_handler_t *) sys_set_mempolicy, -@@ -267,9 +265,8 @@ syscall_handler_t *sys_call_table[] = { - [ __NR_mq_timedreceive ] = (syscall_handler_t *) sys_mq_timedreceive, - [ __NR_mq_notify ] = (syscall_handler_t *) sys_mq_notify, - [ 
__NR_mq_getsetattr ] = (syscall_handler_t *) sys_mq_getsetattr, -- [ __NR_sys_kexec_load ] = (syscall_handler_t *) sys_ni_syscall, -+ [ __NR_kexec_load ] = (syscall_handler_t *) sys_ni_syscall, - [ __NR_waitid ] = (syscall_handler_t *) sys_waitid, -- [ 285 ] = (syscall_handler_t *) sys_ni_syscall, - [ __NR_add_key ] = (syscall_handler_t *) sys_add_key, - [ __NR_request_key ] = (syscall_handler_t *) sys_request_key, - [ __NR_keyctl ] = (syscall_handler_t *) sys_keyctl, -diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c ---- a/arch/x86_64/kernel/apic.c -+++ b/arch/x86_64/kernel/apic.c -@@ -775,9 +775,7 @@ void __init setup_boot_APIC_clock (void) - - void __init setup_secondary_APIC_clock(void) - { -- local_irq_disable(); /* FIXME: Do we need this? --RR */ - setup_APIC_timer(calibration_result); -- local_irq_enable(); - } - - void __init disable_APIC_timer(void) -diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c ---- a/arch/x86_64/kernel/ptrace.c -+++ b/arch/x86_64/kernel/ptrace.c -@@ -129,13 +129,13 @@ static int putreg(struct task_struct *ch - value &= 0xffff; - return 0; - case offsetof(struct user_regs_struct,fs_base): -- if (!((value >> 48) == 0 || (value >> 48) == 0xffff)) -- return -EIO; -+ if (value >= TASK_SIZE) -+ return -EIO; - child->thread.fs = value; - return 0; - case offsetof(struct user_regs_struct,gs_base): -- if (!((value >> 48) == 0 || (value >> 48) == 0xffff)) -- return -EIO; -+ if (value >= TASK_SIZE) -+ return -EIO; - child->thread.gs = value; - return 0; - case offsetof(struct user_regs_struct, eflags): -@@ -149,6 +149,11 @@ static int putreg(struct task_struct *ch - return -EIO; - value &= 0xffff; - break; -+ case offsetof(struct user_regs_struct, rip): -+ /* Check if the new RIP address is canonical */ -+ if (value >= TASK_SIZE) -+ return -EIO; -+ break; - } - put_stack_long(child, regno - sizeof(struct pt_regs), value); - return 0; -@@ -247,7 +252,7 @@ asmlinkage long sys_ptrace(long request, - break; - - switch (addr) { -- case 0 ... sizeof(struct user_regs_struct): -+ case 0 ... sizeof(struct user_regs_struct) - sizeof(long): - tmp = getreg(child, addr); - break; - case offsetof(struct user, u_debugreg[0]): -@@ -292,7 +297,7 @@ asmlinkage long sys_ptrace(long request, - break; - - switch (addr) { -- case 0 ... sizeof(struct user_regs_struct): -+ case 0 ... sizeof(struct user_regs_struct) - sizeof(long): - ret = putreg(child, addr, data); - break; - /* Disallows to set a breakpoint into the vsyscall */ -diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c ---- a/arch/x86_64/kernel/smpboot.c -+++ b/arch/x86_64/kernel/smpboot.c -@@ -309,8 +309,6 @@ void __init smp_callin(void) - Dprintk("CALLIN, before setup_local_APIC().\n"); - setup_local_APIC(); - -- local_irq_enable(); -- - /* - * Get our bogomips. - */ -@@ -324,8 +322,6 @@ void __init smp_callin(void) - */ - smp_store_cpu_info(cpuid); - -- local_irq_disable(); -- - /* - * Allow the master to continue. - */ -diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c ---- a/arch/x86_64/mm/fault.c -+++ b/arch/x86_64/mm/fault.c -@@ -236,6 +236,8 @@ static noinline void pgtable_bad(unsigne - - /* - * Handle a fault on the vmalloc or module mapping area -+ * -+ * This assumes no large pages in there. 
- */ - static int vmalloc_fault(unsigned long address) - { -@@ -274,7 +276,10 @@ static int vmalloc_fault(unsigned long a - if (!pte_present(*pte_ref)) - return -1; - pte = pte_offset_kernel(pmd, address); -- if (!pte_present(*pte) || pte_page(*pte) != pte_page(*pte_ref)) -+ /* Don't use pte_page here, because the mappings can point -+ outside mem_map, and the NUMA hash lookup cannot handle -+ that. */ -+ if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) - BUG(); - __flush_tlb_all(); - return 0; -@@ -348,7 +353,9 @@ asmlinkage void do_page_fault(struct pt_ - * protection error (error_code & 1) == 0. - */ - if (unlikely(address >= TASK_SIZE)) { -- if (!(error_code & 5)) { -+ if (!(error_code & 5) && -+ ((address >= VMALLOC_START && address < VMALLOC_END) || -+ (address >= MODULES_VADDR && address < MODULES_END))) { - if (vmalloc_fault(address) < 0) - goto bad_area_nosemaphore; - return; -diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c ---- a/arch/x86_64/mm/ioremap.c -+++ b/arch/x86_64/mm/ioremap.c -@@ -266,7 +266,7 @@ void iounmap(volatile void __iomem *addr - if ((p->flags >> 20) && - p->phys_addr + p->size - 1 < virt_to_phys(high_memory)) { - /* p->size includes the guard page, but cpa doesn't like that */ -- change_page_attr(virt_to_page(__va(p->phys_addr)), -+ change_page_attr_addr((unsigned long)(__va(p->phys_addr)), - (p->size - PAGE_SIZE) >> PAGE_SHIFT, - PAGE_KERNEL); - global_flush_tlb(); -diff --git a/drivers/block/ioctl.c b/drivers/block/ioctl.c ---- a/drivers/block/ioctl.c -+++ b/drivers/block/ioctl.c -@@ -237,3 +237,5 @@ long compat_blkdev_ioctl(struct file *fi - } - return ret; - } -+ -+EXPORT_SYMBOL_GPL(blkdev_ioctl); -diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c ---- a/drivers/block/pktcdvd.c -+++ b/drivers/block/pktcdvd.c -@@ -2400,7 +2400,7 @@ static int pkt_ioctl(struct inode *inode - case CDROM_LAST_WRITTEN: - case CDROM_SEND_PACKET: - case SCSI_IOCTL_SEND_COMMAND: -- return ioctl_by_bdev(pd->bdev, cmd, arg); -+ return blkdev_ioctl(pd->bdev->bd_inode, file, cmd, arg); - - case CDROMEJECT: - /* -@@ -2408,7 +2408,7 @@ static int pkt_ioctl(struct inode *inode - * have to unlock it or else the eject command fails. 
- */ - pkt_lock_door(pd, 0); -- return ioctl_by_bdev(pd->bdev, cmd, arg); -+ return blkdev_ioctl(pd->bdev->bd_inode, file, cmd, arg); - - default: - printk("pktcdvd: Unknown ioctl for %s (%x)\n", pd->name, cmd); -diff --git a/drivers/char/drm/drm_ioctl.c b/drivers/char/drm/drm_ioctl.c ---- a/drivers/char/drm/drm_ioctl.c -+++ b/drivers/char/drm/drm_ioctl.c -@@ -326,6 +326,8 @@ int drm_setversion(DRM_IOCTL_ARGS) - - DRM_COPY_FROM_USER_IOCTL(sv, argp, sizeof(sv)); - -+ memset(&version, 0, sizeof(version)); -+ - dev->driver->version(&version); - retv.drm_di_major = DRM_IF_MAJOR; - retv.drm_di_minor = DRM_IF_MINOR; -diff --git a/drivers/char/raw.c b/drivers/char/raw.c ---- a/drivers/char/raw.c -+++ b/drivers/char/raw.c -@@ -122,7 +122,7 @@ raw_ioctl(struct inode *inode, struct fi - { - struct block_device *bdev = filp->private_data; - -- return ioctl_by_bdev(bdev, command, arg); -+ return blkdev_ioctl(bdev->bd_inode, filp, command, arg); - } - - static void bind_device(struct raw_config_request *rq) -diff --git a/drivers/i2c/chips/eeprom.c b/drivers/i2c/chips/eeprom.c ---- a/drivers/i2c/chips/eeprom.c -+++ b/drivers/i2c/chips/eeprom.c -@@ -130,7 +130,8 @@ static ssize_t eeprom_read(struct kobjec - - /* Hide Vaio security settings to regular users (16 first bytes) */ - if (data->nature == VAIO && off < 16 && !capable(CAP_SYS_ADMIN)) { -- int in_row1 = 16 - off; -+ size_t in_row1 = 16 - off; -+ in_row1 = min(in_row1, count); - memset(buf, 0, in_row1); - if (count - in_row1 > 0) - memcpy(buf + in_row1, &data->data[16], count - in_row1); -diff --git a/drivers/i2c/chips/it87.c b/drivers/i2c/chips/it87.c ---- a/drivers/i2c/chips/it87.c -+++ b/drivers/i2c/chips/it87.c -@@ -631,7 +631,7 @@ static ssize_t show_alarms(struct device - struct it87_data *data = it87_update_device(dev); - return sprintf(buf,"%d\n", ALARMS_FROM_REG(data->alarms)); - } --static DEVICE_ATTR(alarms, S_IRUGO | S_IWUSR, show_alarms, NULL); -+static DEVICE_ATTR(alarms, S_IRUGO, show_alarms, NULL); - - static ssize_t - show_vrm_reg(struct device *dev, char *buf) -diff --git a/drivers/i2c/chips/via686a.c b/drivers/i2c/chips/via686a.c ---- a/drivers/i2c/chips/via686a.c -+++ b/drivers/i2c/chips/via686a.c -@@ -554,7 +554,7 @@ static ssize_t show_alarms(struct device - struct via686a_data *data = via686a_update_device(dev); - return sprintf(buf,"%d\n", ALARMS_FROM_REG(data->alarms)); - } --static DEVICE_ATTR(alarms, S_IRUGO | S_IWUSR, show_alarms, NULL); -+static DEVICE_ATTR(alarms, S_IRUGO, show_alarms, NULL); - - /* The driver. I choose to use type i2c_driver, as at is identical to both - smbus_driver and isa_driver, and clients could be of either kind */ -diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c ---- a/drivers/ide/ide-disk.c -+++ b/drivers/ide/ide-disk.c -@@ -133,6 +133,8 @@ static ide_startstop_t __ide_do_rw_disk( - if (hwif->no_lba48_dma && lba48 && dma) { - if (block + rq->nr_sectors > 1ULL << 28) - dma = 0; -+ else -+ lba48 = 0; - } - - if (!dma) { -@@ -146,7 +148,7 @@ static ide_startstop_t __ide_do_rw_disk( - /* FIXME: SELECT_MASK(drive, 0) ? 
*/ - - if (drive->select.b.lba) { -- if (drive->addressing == 1) { -+ if (lba48) { - task_ioreg_t tasklets[10]; - - pr_debug("%s: LBA=0x%012llx\n", drive->name, block); -diff --git a/drivers/input/serio/i8042-x86ia64io.h b/drivers/input/serio/i8042-x86ia64io.h ---- a/drivers/input/serio/i8042-x86ia64io.h -+++ b/drivers/input/serio/i8042-x86ia64io.h -@@ -88,7 +88,7 @@ static struct dmi_system_id __initdata i - }; - #endif - --#ifdef CONFIG_ACPI -+#if defined(__ia64__) && defined(CONFIG_ACPI) - #include <linux/acpi.h> - #include <acpi/acpi_bus.h> - -@@ -281,7 +281,7 @@ static inline int i8042_platform_init(vo - i8042_kbd_irq = I8042_MAP_IRQ(1); - i8042_aux_irq = I8042_MAP_IRQ(12); - --#ifdef CONFIG_ACPI -+#if defined(__ia64__) && defined(CONFIG_ACPI) - if (i8042_acpi_init()) - return -1; - #endif -@@ -300,7 +300,7 @@ static inline int i8042_platform_init(vo - - static inline void i8042_platform_exit(void) - { --#ifdef CONFIG_ACPI -+#if defined(__ia64__) && defined(CONFIG_ACPI) - i8042_acpi_exit(); - #endif - } -diff --git a/drivers/md/raid6altivec.uc b/drivers/md/raid6altivec.uc ---- a/drivers/md/raid6altivec.uc -+++ b/drivers/md/raid6altivec.uc -@@ -108,7 +108,11 @@ int raid6_have_altivec(void); - int raid6_have_altivec(void) - { - /* This assumes either all CPUs have Altivec or none does */ -+#ifdef CONFIG_PPC64 - return cur_cpu_spec->cpu_features & CPU_FTR_ALTIVEC; -+#else -+ return cur_cpu_spec[0]->cpu_features & CPU_FTR_ALTIVEC; -+#endif - } - #endif - -diff --git a/drivers/media/video/adv7170.c b/drivers/media/video/adv7170.c ---- a/drivers/media/video/adv7170.c -+++ b/drivers/media/video/adv7170.c -@@ -130,7 +130,7 @@ adv7170_write_block (struct i2c_client * - u8 block_data[32]; - - msg.addr = client->addr; -- msg.flags = client->flags; -+ msg.flags = 0; - while (len >= 2) { - msg.buf = (char *) block_data; - msg.len = 0; -diff --git a/drivers/media/video/adv7175.c b/drivers/media/video/adv7175.c ---- a/drivers/media/video/adv7175.c -+++ b/drivers/media/video/adv7175.c -@@ -126,7 +126,7 @@ adv7175_write_block (struct i2c_client * - u8 block_data[32]; - - msg.addr = client->addr; -- msg.flags = client->flags; -+ msg.flags = 0; - while (len >= 2) { - msg.buf = (char *) block_data; - msg.len = 0; -diff --git a/drivers/media/video/bt819.c b/drivers/media/video/bt819.c ---- a/drivers/media/video/bt819.c -+++ b/drivers/media/video/bt819.c -@@ -146,7 +146,7 @@ bt819_write_block (struct i2c_client *cl - u8 block_data[32]; - - msg.addr = client->addr; -- msg.flags = client->flags; -+ msg.flags = 0; - while (len >= 2) { - msg.buf = (char *) block_data; - msg.len = 0; -diff --git a/drivers/media/video/bttv-cards.c b/drivers/media/video/bttv-cards.c ---- a/drivers/media/video/bttv-cards.c -+++ b/drivers/media/video/bttv-cards.c -@@ -1939,7 +1939,6 @@ struct tvcard bttv_tvcards[] = { - .no_tda9875 = 1, - .no_tda7432 = 1, - .tuner_type = TUNER_ABSENT, -- .no_video = 1, - .pll = PLL_28, - },{ - .name = "Teppro TEV-560/InterVision IV-560", -@@ -2718,8 +2717,6 @@ void __devinit bttv_init_card2(struct bt - } - btv->pll.pll_current = -1; - -- bttv_reset_audio(btv); -- - /* tuner configuration (from card list / autodetect / insmod option) */ - if (UNSET != bttv_tvcards[btv->c.type].tuner_type) - if(UNSET == btv->tuner_type) -diff --git a/drivers/media/video/saa7110.c b/drivers/media/video/saa7110.c ---- a/drivers/media/video/saa7110.c -+++ b/drivers/media/video/saa7110.c -@@ -60,8 +60,10 @@ MODULE_PARM_DESC(debug, "Debug level (0- - - #define I2C_SAA7110 0x9C /* or 0x9E */ - -+#define SAA7110_NR_REG 0x35 
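
The adv7170/adv7175/bt819 hunks above (and the matching saa7114/saa7185
ones further down) all make the same one-line fix: an i2c_msg's flags field
selects per-message options such as I2C_M_RD, so initializing it from
client->flags -- which holds client-wide addressing bits -- can silently
turn a register write into a different transaction. A minimal write helper
following the corrected pattern (the function name is illustrative):

	static int i2c_write_block_sketch(struct i2c_client *client,
					  u8 *buf, unsigned int len)
	{
		struct i2c_msg msg;

		msg.addr  = client->addr;
		msg.flags = 0;		/* plain write: never client->flags */
		msg.len   = len;
		msg.buf   = (char *)buf;

		/* Returns the number of messages transferred (here 1) on
		 * success, or a negative errno. */
		return i2c_transfer(client->adapter, &msg, 1);
	}
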
-+ - struct saa7110 { -- unsigned char reg[54]; -+ u8 reg[SAA7110_NR_REG]; - - int norm; - int input; -@@ -95,31 +97,28 @@ saa7110_write_block (struct i2c_client * - unsigned int len) - { - int ret = -1; -- u8 reg = *data++; -+ u8 reg = *data; /* first register to write to */ - -- len--; -+ /* Sanity check */ -+ if (reg + (len - 1) > SAA7110_NR_REG) -+ return ret; - - /* the saa7110 has an autoincrement function, use it if - * the adapter understands raw I2C */ - if (i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) { - struct saa7110 *decoder = i2c_get_clientdata(client); - struct i2c_msg msg; -- u8 block_data[54]; - -- msg.len = 0; -- msg.buf = (char *) block_data; -+ msg.len = len; -+ msg.buf = (char *) data; - msg.addr = client->addr; -- msg.flags = client->flags; -- while (len >= 1) { -- msg.len = 0; -- block_data[msg.len++] = reg; -- while (len-- >= 1 && msg.len < 54) -- block_data[msg.len++] = -- decoder->reg[reg++] = *data++; -- ret = i2c_transfer(client->adapter, &msg, 1); -- } -+ msg.flags = 0; -+ ret = i2c_transfer(client->adapter, &msg, 1); -+ -+ /* Cache the written data */ -+ memcpy(decoder->reg + reg, data + 1, len - 1); - } else { -- while (len-- >= 1) { -+ for (++data, --len; len; len--) { - if ((ret = saa7110_write(client, reg++, - *data++)) < 0) - break; -@@ -192,7 +191,7 @@ saa7110_selmux (struct i2c_client *clien - return 0; - } - --static const unsigned char initseq[] = { -+static const unsigned char initseq[1 + SAA7110_NR_REG] = { - 0, 0x4C, 0x3C, 0x0D, 0xEF, 0xBD, 0xF2, 0x03, 0x00, - /* 0x08 */ 0xF8, 0xF8, 0x60, 0x60, 0x00, 0x86, 0x18, 0x90, - /* 0x10 */ 0x00, 0x59, 0x40, 0x46, 0x42, 0x1A, 0xFF, 0xDA, -diff --git a/drivers/media/video/saa7114.c b/drivers/media/video/saa7114.c ---- a/drivers/media/video/saa7114.c -+++ b/drivers/media/video/saa7114.c -@@ -163,7 +163,7 @@ saa7114_write_block (struct i2c_client * - u8 block_data[32]; - - msg.addr = client->addr; -- msg.flags = client->flags; -+ msg.flags = 0; - while (len >= 2) { - msg.buf = (char *) block_data; - msg.len = 0; -diff --git a/drivers/media/video/saa7185.c b/drivers/media/video/saa7185.c ---- a/drivers/media/video/saa7185.c -+++ b/drivers/media/video/saa7185.c -@@ -118,7 +118,7 @@ saa7185_write_block (struct i2c_client * - u8 block_data[32]; - - msg.addr = client->addr; -- msg.flags = client->flags; -+ msg.flags = 0; - while (len >= 2) { - msg.buf = (char *) block_data; - msg.len = 0; -diff --git a/drivers/net/3c59x.c b/drivers/net/3c59x.c ---- a/drivers/net/3c59x.c -+++ b/drivers/net/3c59x.c -@@ -1581,7 +1581,8 @@ vortex_up(struct net_device *dev) - - if (VORTEX_PCI(vp)) { - pci_set_power_state(VORTEX_PCI(vp), PCI_D0); /* Go active */ -- pci_restore_state(VORTEX_PCI(vp)); -+ if (vp->pm_state_valid) -+ pci_restore_state(VORTEX_PCI(vp)); - pci_enable_device(VORTEX_PCI(vp)); - } - -@@ -2741,6 +2742,7 @@ vortex_down(struct net_device *dev, int - outl(0, ioaddr + DownListPtr); - - if (final_down && VORTEX_PCI(vp)) { -+ vp->pm_state_valid = 1; - pci_save_state(VORTEX_PCI(vp)); - acpi_set_WOL(dev); - } -@@ -3243,9 +3245,10 @@ static void acpi_set_WOL(struct net_devi - outw(RxEnable, ioaddr + EL3_CMD); - - pci_enable_wake(VORTEX_PCI(vp), 0, 1); -+ -+ /* Change the power state to D3; RxEnable doesn't take effect. */ -+ pci_set_power_state(VORTEX_PCI(vp), PCI_D3hot); - } -- /* Change the power state to D3; RxEnable doesn't take effect. 
*/ -- pci_set_power_state(VORTEX_PCI(vp), PCI_D3hot); - } - - -diff --git a/drivers/net/amd8111e.c b/drivers/net/amd8111e.c ---- a/drivers/net/amd8111e.c -+++ b/drivers/net/amd8111e.c -@@ -1381,6 +1381,8 @@ static int amd8111e_open(struct net_devi - - if(amd8111e_restart(dev)){ - spin_unlock_irq(&lp->lock); -+ if (dev->irq) -+ free_irq(dev->irq, dev); - return -ENOMEM; - } - /* Start ipg timer */ -diff --git a/drivers/net/ppp_async.c b/drivers/net/ppp_async.c ---- a/drivers/net/ppp_async.c -+++ b/drivers/net/ppp_async.c -@@ -1000,7 +1000,7 @@ static void async_lcp_peek(struct asyncp - data += 4; - dlen -= 4; - /* data[0] is code, data[1] is length */ -- while (dlen >= 2 && dlen >= data[1]) { -+ while (dlen >= 2 && dlen >= data[1] && data[1] >= 2) { - switch (data[0]) { - case LCP_MRU: - val = (data[2] << 8) + data[3]; -diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c ---- a/drivers/net/r8169.c -+++ b/drivers/net/r8169.c -@@ -1683,16 +1683,19 @@ static void rtl8169_free_rx_skb(struct r - rtl8169_make_unusable_by_asic(desc); - } - --static inline void rtl8169_return_to_asic(struct RxDesc *desc, int rx_buf_sz) -+static inline void rtl8169_mark_to_asic(struct RxDesc *desc, u32 rx_buf_sz) - { -- desc->opts1 |= cpu_to_le32(DescOwn + rx_buf_sz); -+ u32 eor = le32_to_cpu(desc->opts1) & RingEnd; -+ -+ desc->opts1 = cpu_to_le32(DescOwn | eor | rx_buf_sz); - } - --static inline void rtl8169_give_to_asic(struct RxDesc *desc, dma_addr_t mapping, -- int rx_buf_sz) -+static inline void rtl8169_map_to_asic(struct RxDesc *desc, dma_addr_t mapping, -+ u32 rx_buf_sz) - { - desc->addr = cpu_to_le64(mapping); -- desc->opts1 |= cpu_to_le32(DescOwn + rx_buf_sz); -+ wmb(); -+ rtl8169_mark_to_asic(desc, rx_buf_sz); - } - - static int rtl8169_alloc_rx_skb(struct pci_dev *pdev, struct sk_buff **sk_buff, -@@ -1712,7 +1715,7 @@ static int rtl8169_alloc_rx_skb(struct p - mapping = pci_map_single(pdev, skb->tail, rx_buf_sz, - PCI_DMA_FROMDEVICE); - -- rtl8169_give_to_asic(desc, mapping, rx_buf_sz); -+ rtl8169_map_to_asic(desc, mapping, rx_buf_sz); - - out: - return ret; -@@ -2150,7 +2153,7 @@ static inline int rtl8169_try_rx_copy(st - skb_reserve(skb, NET_IP_ALIGN); - eth_copy_and_sum(skb, sk_buff[0]->tail, pkt_size, 0); - *sk_buff = skb; -- rtl8169_return_to_asic(desc, rx_buf_sz); -+ rtl8169_mark_to_asic(desc, rx_buf_sz); - ret = 0; - } - } -diff --git a/drivers/net/sis900.c b/drivers/net/sis900.c ---- a/drivers/net/sis900.c -+++ b/drivers/net/sis900.c -@@ -236,7 +236,7 @@ static int __devinit sis900_get_mac_addr - signature = (u16) read_eeprom(ioaddr, EEPROMSignature); - if (signature == 0xffff || signature == 0x0000) { - printk (KERN_INFO "%s: Error EERPOM read %x\n", -- net_dev->name, signature); -+ pci_name(pci_dev), signature); - return 0; - } - -@@ -268,7 +268,7 @@ static int __devinit sis630e_get_mac_add - if (!isa_bridge) - isa_bridge = pci_get_device(PCI_VENDOR_ID_SI, 0x0018, isa_bridge); - if (!isa_bridge) { -- printk("%s: Can not find ISA bridge\n", net_dev->name); -+ printk("%s: Can not find ISA bridge\n", pci_name(pci_dev)); - return 0; - } - pci_read_config_byte(isa_bridge, 0x48, ®); -@@ -456,10 +456,6 @@ static int __devinit sis900_probe(struct - net_dev->tx_timeout = sis900_tx_timeout; - net_dev->watchdog_timeo = TX_TIMEOUT; - net_dev->ethtool_ops = &sis900_ethtool_ops; -- -- ret = register_netdev(net_dev); -- if (ret) -- goto err_unmap_rx; - - /* Get Mac address according to the chip revision */ - pci_read_config_byte(pci_dev, PCI_CLASS_REVISION, &revision); -@@ -476,7 +472,7 @@ static int 
__devinit sis900_probe(struct - - if (ret == 0) { - ret = -ENODEV; -- goto err_out_unregister; -+ goto err_unmap_rx; - } - - /* 630ET : set the mii access mode as software-mode */ -@@ -486,7 +482,7 @@ static int __devinit sis900_probe(struct - /* probe for mii transceiver */ - if (sis900_mii_probe(net_dev) == 0) { - ret = -ENODEV; -- goto err_out_unregister; -+ goto err_unmap_rx; - } - - /* save our host bridge revision */ -@@ -496,6 +492,10 @@ static int __devinit sis900_probe(struct - pci_dev_put(dev); - } - -+ ret = register_netdev(net_dev); -+ if (ret) -+ goto err_unmap_rx; -+ - /* print some information about our NIC */ - printk(KERN_INFO "%s: %s at %#lx, IRQ %d, ", net_dev->name, - card_name, ioaddr, net_dev->irq); -@@ -505,8 +505,6 @@ static int __devinit sis900_probe(struct - - return 0; - -- err_out_unregister: -- unregister_netdev(net_dev); - err_unmap_rx: - pci_free_consistent(pci_dev, RX_TOTAL_SIZE, sis_priv->rx_ring, - sis_priv->rx_ring_dma); -@@ -533,6 +531,7 @@ static int __devinit sis900_probe(struct - static int __init sis900_mii_probe(struct net_device * net_dev) - { - struct sis900_private * sis_priv = net_dev->priv; -+ const char *dev_name = pci_name(sis_priv->pci_dev); - u16 poll_bit = MII_STAT_LINK, status = 0; - unsigned long timeout = jiffies + 5 * HZ; - int phy_addr; -@@ -582,21 +581,20 @@ static int __init sis900_mii_probe(struc - mii_phy->phy_types = - (mii_status & (MII_STAT_CAN_TX_FDX | MII_STAT_CAN_TX)) ? LAN : HOME; - printk(KERN_INFO "%s: %s transceiver found at address %d.\n", -- net_dev->name, mii_chip_table[i].name, -+ dev_name, mii_chip_table[i].name, - phy_addr); - break; - } - - if( !mii_chip_table[i].phy_id1 ) { - printk(KERN_INFO "%s: Unknown PHY transceiver found at address %d.\n", -- net_dev->name, phy_addr); -+ dev_name, phy_addr); - mii_phy->phy_types = UNKNOWN; - } - } - - if (sis_priv->mii == NULL) { -- printk(KERN_INFO "%s: No MII transceivers found!\n", -- net_dev->name); -+ printk(KERN_INFO "%s: No MII transceivers found!\n", dev_name); - return 0; - } - -@@ -621,7 +619,7 @@ static int __init sis900_mii_probe(struc - poll_bit ^= (mdio_read(net_dev, sis_priv->cur_phy, MII_STATUS) & poll_bit); - if (time_after_eq(jiffies, timeout)) { - printk(KERN_WARNING "%s: reset phy and link down now\n", -- net_dev->name); -+ dev_name); - return -ETIME; - } - } -@@ -691,7 +689,7 @@ static u16 sis900_default_phy(struct net - sis_priv->mii = default_phy; - sis_priv->cur_phy = default_phy->phy_addr; - printk(KERN_INFO "%s: Using transceiver found at address %d as default\n", -- net_dev->name,sis_priv->cur_phy); -+ pci_name(sis_priv->pci_dev), sis_priv->cur_phy); - } - - status = mdio_read(net_dev, sis_priv->cur_phy, MII_CONTROL); -diff --git a/drivers/net/tun.c b/drivers/net/tun.c ---- a/drivers/net/tun.c -+++ b/drivers/net/tun.c -@@ -229,7 +229,7 @@ static __inline__ ssize_t tun_get_user(s - size_t len = count; - - if (!(tun->flags & TUN_NO_PI)) { -- if ((len -= sizeof(pi)) > len) -+ if ((len -= sizeof(pi)) > count) - return -EINVAL; - - if(memcpy_fromiovec((void *)&pi, iv, sizeof(pi))) -diff --git a/drivers/net/via-rhine.c b/drivers/net/via-rhine.c ---- a/drivers/net/via-rhine.c -+++ b/drivers/net/via-rhine.c -@@ -1197,8 +1197,10 @@ static int rhine_open(struct net_device - dev->name, rp->pdev->irq); - - rc = alloc_ring(dev); -- if (rc) -+ if (rc) { -+ free_irq(rp->pdev->irq, dev); - return rc; -+ } - alloc_rbufs(dev); - alloc_tbufs(dev); - rhine_chip_reset(dev); -@@ -1899,6 +1901,9 @@ static void rhine_shutdown (struct devic - struct rhine_private *rp = 
netdev_priv(dev); - void __iomem *ioaddr = rp->base; - -+ if (!(rp->quirks & rqWOL)) -+ return; /* Nothing to do for non-WOL adapters */ -+ - rhine_power_init(dev); - - /* Make sure we use pattern 0, 1 and not 4, 5 */ -diff --git a/drivers/net/wan/hd6457x.c b/drivers/net/wan/hd6457x.c ---- a/drivers/net/wan/hd6457x.c -+++ b/drivers/net/wan/hd6457x.c -@@ -315,7 +315,7 @@ static inline void sca_rx(card_t *card, - #endif - stats->rx_packets++; - stats->rx_bytes += skb->len; -- skb->dev->last_rx = jiffies; -+ dev->last_rx = jiffies; - skb->protocol = hdlc_type_trans(skb, dev); - netif_rx(skb); - } -diff --git a/drivers/pci/hotplug/pciehp_ctrl.c b/drivers/pci/hotplug/pciehp_ctrl.c ---- a/drivers/pci/hotplug/pciehp_ctrl.c -+++ b/drivers/pci/hotplug/pciehp_ctrl.c -@@ -1354,10 +1354,11 @@ static u32 remove_board(struct pci_func - dbg("PCI Bridge Hot-Remove s:b:d:f(%02x:%02x:%02x:%02x)\n", - ctrl->seg, func->bus, func->device, func->function); - bridge_slot_remove(func); -- } else -+ } else { - dbg("PCI Function Hot-Remove s:b:d:f(%02x:%02x:%02x:%02x)\n", - ctrl->seg, func->bus, func->device, func->function); - slot_remove(func); -+ } - - func = pciehp_slot_find(ctrl->slot_bus, device, 0); - } -diff --git a/drivers/usb/serial/visor.c b/drivers/usb/serial/visor.c ---- a/drivers/usb/serial/visor.c -+++ b/drivers/usb/serial/visor.c -@@ -386,6 +386,7 @@ struct visor_private { - int bytes_in; - int bytes_out; - int outstanding_urbs; -+ int throttled; - }; - - /* number of outstanding urbs to prevent userspace DoS from happening */ -@@ -415,6 +416,7 @@ static int visor_open (struct usb_serial - priv->bytes_in = 0; - priv->bytes_out = 0; - priv->outstanding_urbs = 0; -+ priv->throttled = 0; - spin_unlock_irqrestore(&priv->lock, flags); - - /* -@@ -602,6 +604,7 @@ static void visor_read_bulk_callback (st - struct tty_struct *tty; - unsigned long flags; - int i; -+ int throttled; - int result; - - dbg("%s - port %d", __FUNCTION__, port->number); -@@ -627,18 +630,21 @@ static void visor_read_bulk_callback (st - } - spin_lock_irqsave(&priv->lock, flags); - priv->bytes_in += urb->actual_length; -+ throttled = priv->throttled; - spin_unlock_irqrestore(&priv->lock, flags); - -- /* Continue trying to always read */ -- usb_fill_bulk_urb (port->read_urb, port->serial->dev, -- usb_rcvbulkpipe(port->serial->dev, -- port->bulk_in_endpointAddress), -- port->read_urb->transfer_buffer, -- port->read_urb->transfer_buffer_length, -- visor_read_bulk_callback, port); -- result = usb_submit_urb(port->read_urb, GFP_ATOMIC); -- if (result) -- dev_err(&port->dev, "%s - failed resubmitting read urb, error %d\n", __FUNCTION__, result); -+ /* Continue trying to always read if we should */ -+ if (!throttled) { -+ usb_fill_bulk_urb (port->read_urb, port->serial->dev, -+ usb_rcvbulkpipe(port->serial->dev, -+ port->bulk_in_endpointAddress), -+ port->read_urb->transfer_buffer, -+ port->read_urb->transfer_buffer_length, -+ visor_read_bulk_callback, port); -+ result = usb_submit_urb(port->read_urb, GFP_ATOMIC); -+ if (result) -+ dev_err(&port->dev, "%s - failed resubmitting read urb, error %d\n", __FUNCTION__, result); -+ } - return; - } - -@@ -683,16 +689,26 @@ exit: - - static void visor_throttle (struct usb_serial_port *port) - { -+ struct visor_private *priv = usb_get_serial_port_data(port); -+ unsigned long flags; -+ - dbg("%s - port %d", __FUNCTION__, port->number); -- usb_kill_urb(port->read_urb); -+ spin_lock_irqsave(&priv->lock, flags); -+ priv->throttled = 1; -+ spin_unlock_irqrestore(&priv->lock, flags); - } - - - static void 
visor_unthrottle (struct usb_serial_port *port) - { -+ struct visor_private *priv = usb_get_serial_port_data(port); -+ unsigned long flags; - int result; - - dbg("%s - port %d", __FUNCTION__, port->number); -+ spin_lock_irqsave(&priv->lock, flags); -+ priv->throttled = 0; -+ spin_unlock_irqrestore(&priv->lock, flags); - - port->read_urb->dev = port->serial->dev; - result = usb_submit_urb(port->read_urb, GFP_ATOMIC); -diff --git a/drivers/video/matrox/matroxfb_accel.c b/drivers/video/matrox/matroxfb_accel.c ---- a/drivers/video/matrox/matroxfb_accel.c -+++ b/drivers/video/matrox/matroxfb_accel.c -@@ -438,13 +438,21 @@ static void matroxfb_1bpp_imageblit(WPMI - } else if (step == 1) { - /* Special case for 1..8bit widths */ - while (height--) { -- mga_writel(mmio, 0, *chardata); -+#if defined(__BIG_ENDIAN) -+ fb_writel((*chardata) << 24, mmio.vaddr); -+#else -+ fb_writel(*chardata, mmio.vaddr); -+#endif - chardata++; - } - } else if (step == 2) { - /* Special case for 9..15bit widths */ - while (height--) { -- mga_writel(mmio, 0, *(u_int16_t*)chardata); -+#if defined(__BIG_ENDIAN) -+ fb_writel((*(u_int16_t*)chardata) << 16, mmio.vaddr); -+#else -+ fb_writel(*(u_int16_t*)chardata, mmio.vaddr); -+#endif - chardata += 2; - } - } else { -@@ -454,7 +462,7 @@ static void matroxfb_1bpp_imageblit(WPMI - - for (i = 0; i < step; i += 4) { - /* Hope that there are at least three readable bytes beyond the end of bitmap */ -- mga_writel(mmio, 0, get_unaligned((u_int32_t*)(chardata + i))); -+ fb_writel(get_unaligned((u_int32_t*)(chardata + i)),mmio.vaddr); - } - chardata += step; - } -diff --git a/drivers/video/matrox/matroxfb_base.h b/drivers/video/matrox/matroxfb_base.h ---- a/drivers/video/matrox/matroxfb_base.h -+++ b/drivers/video/matrox/matroxfb_base.h -@@ -170,14 +170,14 @@ static inline void mga_memcpy_toio(vaddr - - if ((unsigned long)src & 3) { - while (len >= 4) { -- writel(get_unaligned((u32 *)src), addr); -+ fb_writel(get_unaligned((u32 *)src), addr); - addr++; - len -= 4; - src += 4; - } - } else { - while (len >= 4) { -- writel(*(u32 *)src, addr); -+ fb_writel(*(u32 *)src, addr); - addr++; - len -= 4; - src += 4; -diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c ---- a/fs/binfmt_elf.c -+++ b/fs/binfmt_elf.c -@@ -257,7 +257,7 @@ create_elf_tables(struct linux_binprm *b - } - - /* Populate argv and envp */ -- p = current->mm->arg_start; -+ p = current->mm->arg_end = current->mm->arg_start; - while (argc-- > 0) { - size_t len; - __put_user((elf_addr_t)p, argv++); -@@ -1008,6 +1008,7 @@ out_free_ph: - static int load_elf_library(struct file *file) - { - struct elf_phdr *elf_phdata; -+ struct elf_phdr *eppnt; - unsigned long elf_bss, bss, len; - int retval, error, i, j; - struct elfhdr elf_ex; -@@ -1031,44 +1032,47 @@ static int load_elf_library(struct file - /* j < ELF_MIN_ALIGN because elf_ex.e_phnum <= 2 */ - - error = -ENOMEM; -- elf_phdata = (struct elf_phdr *) kmalloc(j, GFP_KERNEL); -+ elf_phdata = kmalloc(j, GFP_KERNEL); - if (!elf_phdata) - goto out; - -+ eppnt = elf_phdata; - error = -ENOEXEC; -- retval = kernel_read(file, elf_ex.e_phoff, (char *) elf_phdata, j); -+ retval = kernel_read(file, elf_ex.e_phoff, (char *)eppnt, j); - if (retval != j) - goto out_free_ph; - - for (j = 0, i = 0; i<elf_ex.e_phnum; i++) -- if ((elf_phdata + i)->p_type == PT_LOAD) j++; -+ if ((eppnt + i)->p_type == PT_LOAD) -+ j++; - if (j != 1) - goto out_free_ph; - -- while (elf_phdata->p_type != PT_LOAD) elf_phdata++; -+ while (eppnt->p_type != PT_LOAD) -+ eppnt++; - - /* Now use mmap to map the library into 
memory. */ - down_write(¤t->mm->mmap_sem); - error = do_mmap(file, -- ELF_PAGESTART(elf_phdata->p_vaddr), -- (elf_phdata->p_filesz + -- ELF_PAGEOFFSET(elf_phdata->p_vaddr)), -+ ELF_PAGESTART(eppnt->p_vaddr), -+ (eppnt->p_filesz + -+ ELF_PAGEOFFSET(eppnt->p_vaddr)), - PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, -- (elf_phdata->p_offset - -- ELF_PAGEOFFSET(elf_phdata->p_vaddr))); -+ (eppnt->p_offset - -+ ELF_PAGEOFFSET(eppnt->p_vaddr))); - up_write(¤t->mm->mmap_sem); -- if (error != ELF_PAGESTART(elf_phdata->p_vaddr)) -+ if (error != ELF_PAGESTART(eppnt->p_vaddr)) - goto out_free_ph; - -- elf_bss = elf_phdata->p_vaddr + elf_phdata->p_filesz; -+ elf_bss = eppnt->p_vaddr + eppnt->p_filesz; - if (padzero(elf_bss)) { - error = -EFAULT; - goto out_free_ph; - } - -- len = ELF_PAGESTART(elf_phdata->p_filesz + elf_phdata->p_vaddr + ELF_MIN_ALIGN - 1); -- bss = elf_phdata->p_memsz + elf_phdata->p_vaddr; -+ len = ELF_PAGESTART(eppnt->p_filesz + eppnt->p_vaddr + ELF_MIN_ALIGN - 1); -+ bss = eppnt->p_memsz + eppnt->p_vaddr; - if (bss > len) { - down_write(¤t->mm->mmap_sem); - do_brk(len, bss - len); -@@ -1275,7 +1279,7 @@ static void fill_prstatus(struct elf_prs - static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, - struct mm_struct *mm) - { -- int i, len; -+ unsigned int i, len; - - /* first copy the parameters from user space */ - memset(psinfo, 0, sizeof(struct elf_prpsinfo)); -diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c ---- a/fs/cramfs/inode.c -+++ b/fs/cramfs/inode.c -@@ -70,6 +70,7 @@ static struct inode *get_cramfs_inode(st - inode->i_data.a_ops = &cramfs_aops; - } else { - inode->i_size = 0; -+ inode->i_blocks = 0; - init_special_inode(inode, inode->i_mode, - old_decode_dev(cramfs_inode->size)); - } -diff --git a/fs/eventpoll.c b/fs/eventpoll.c ---- a/fs/eventpoll.c -+++ b/fs/eventpoll.c -@@ -619,6 +619,7 @@ eexit_1: - return error; - } - -+#define MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) - - /* - * Implement the event wait interface for the eventpoll file. 
It is the kernel -@@ -635,7 +636,7 @@ asmlinkage long sys_epoll_wait(int epfd, - current, epfd, events, maxevents, timeout)); - - /* The maximum number of event must be greater than zero */ -- if (maxevents <= 0) -+ if (maxevents <= 0 || maxevents > MAX_EVENTS) - return -EINVAL; - - /* Verify that the area passed by the user is writeable */ -diff --git a/fs/exec.c b/fs/exec.c ---- a/fs/exec.c -+++ b/fs/exec.c -@@ -814,7 +814,7 @@ void get_task_comm(char *buf, struct tas - { - /* buf must be at least sizeof(tsk->comm) in size */ - task_lock(tsk); -- memcpy(buf, tsk->comm, sizeof(tsk->comm)); -+ strncpy(buf, tsk->comm, sizeof(tsk->comm)); - task_unlock(tsk); - } - -diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c ---- a/fs/ext2/dir.c -+++ b/fs/ext2/dir.c -@@ -592,6 +592,7 @@ int ext2_make_empty(struct inode *inode, - goto fail; - } - kaddr = kmap_atomic(page, KM_USER0); -+ memset(kaddr, 0, chunk_size); - de = (struct ext2_dir_entry_2 *)kaddr; - de->name_len = 1; - de->rec_len = cpu_to_le16(EXT2_DIR_REC_LEN(1)); -diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c ---- a/fs/ext3/balloc.c -+++ b/fs/ext3/balloc.c -@@ -268,7 +268,8 @@ void ext3_discard_reservation(struct ino - - if (!rsv_is_empty(&rsv->rsv_window)) { - spin_lock(rsv_lock); -- rsv_window_remove(inode->i_sb, rsv); -+ if (!rsv_is_empty(&rsv->rsv_window)) -+ rsv_window_remove(inode->i_sb, rsv); - spin_unlock(rsv_lock); - } - } -diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c ---- a/fs/hfs/mdb.c -+++ b/fs/hfs/mdb.c -@@ -333,6 +333,8 @@ void hfs_mdb_close(struct super_block *s - * Release the resources associated with the in-core MDB. */ - void hfs_mdb_put(struct super_block *sb) - { -+ if (!HFS_SB(sb)) -+ return; - /* free the B-trees */ - hfs_btree_close(HFS_SB(sb)->ext_tree); - hfs_btree_close(HFS_SB(sb)->cat_tree); -@@ -340,4 +342,7 @@ void hfs_mdb_put(struct super_block *sb) - /* free the buffers holding the primary and alternate MDBs */ - brelse(HFS_SB(sb)->mdb_bh); - brelse(HFS_SB(sb)->alt_mdb_bh); -+ -+ kfree(HFS_SB(sb)); -+ sb->s_fs_info = NULL; - } -diff --git a/fs/hfs/super.c b/fs/hfs/super.c ---- a/fs/hfs/super.c -+++ b/fs/hfs/super.c -@@ -263,7 +263,7 @@ static int hfs_fill_super(struct super_b - res = -EINVAL; - if (!parse_options((char *)data, sbi)) { - hfs_warn("hfs_fs: unable to parse mount options.\n"); -- goto bail3; -+ goto bail; - } - - sb->s_op = &hfs_super_operations; -@@ -276,7 +276,7 @@ static int hfs_fill_super(struct super_b - hfs_warn("VFS: Can't find a HFS filesystem on dev %s.\n", - hfs_mdb_name(sb)); - res = -EINVAL; -- goto bail2; -+ goto bail; - } - - /* try to get the root inode */ -@@ -306,10 +306,8 @@ bail_iput: - iput(root_inode); - bail_no_root: - hfs_warn("hfs_fs: get root inode failed.\n"); -+bail: - hfs_mdb_put(sb); --bail2: --bail3: -- kfree(sbi); - return res; - } - -diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c ---- a/fs/hfsplus/super.c -+++ b/fs/hfsplus/super.c -@@ -207,7 +207,9 @@ static void hfsplus_write_super(struct s - static void hfsplus_put_super(struct super_block *sb) - { - dprint(DBG_SUPER, "hfsplus_put_super\n"); -- if (!(sb->s_flags & MS_RDONLY)) { -+ if (!sb->s_fs_info) -+ return; -+ if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) { - struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; - - vhdr->modify_date = hfsp_now2mt(); -@@ -223,6 +225,8 @@ static void hfsplus_put_super(struct sup - iput(HFSPLUS_SB(sb).alloc_file); - iput(HFSPLUS_SB(sb).hidden_dir); - brelse(HFSPLUS_SB(sb).s_vhbh); -+ kfree(sb->s_fs_info); -+ sb->s_fs_info = NULL; - } - - static int hfsplus_statfs(struct 
super_block *sb, struct kstatfs *buf) -diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c ---- a/fs/isofs/inode.c -+++ b/fs/isofs/inode.c -@@ -685,6 +685,8 @@ root_found: - sbi->s_log_zone_size = isonum_723 (h_pri->logical_block_size); - sbi->s_max_size = isonum_733(h_pri->volume_space_size); - } else { -+ if (!pri) -+ goto out_freebh; - rootp = (struct iso_directory_record *) pri->root_directory_record; - sbi->s_nzones = isonum_733 (pri->volume_space_size); - sbi->s_log_zone_size = isonum_723 (pri->logical_block_size); -@@ -1395,6 +1397,9 @@ struct inode *isofs_iget(struct super_bl - struct inode *inode; - struct isofs_iget5_callback_data data; - -+ if (offset >= 1ul << sb->s_blocksize_bits) -+ return NULL; -+ - data.block = block; - data.offset = offset; - -diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c ---- a/fs/isofs/rock.c -+++ b/fs/isofs/rock.c -@@ -53,6 +53,7 @@ - if(LEN & 1) LEN++; \ - CHR = ((unsigned char *) DE) + LEN; \ - LEN = *((unsigned char *) DE) - LEN; \ -+ if (LEN<0) LEN=0; \ - if (ISOFS_SB(inode->i_sb)->s_rock_offset!=-1) \ - { \ - LEN-=ISOFS_SB(inode->i_sb)->s_rock_offset; \ -@@ -73,6 +74,10 @@ - offset1 = 0; \ - pbh = sb_bread(DEV->i_sb, block); \ - if(pbh){ \ -+ if (offset > pbh->b_size || offset + cont_size > pbh->b_size){ \ -+ brelse(pbh); \ -+ goto out; \ -+ } \ - memcpy(buffer + offset1, pbh->b_data + offset, cont_size - offset1); \ - brelse(pbh); \ - chr = (unsigned char *) buffer; \ -@@ -103,12 +108,13 @@ int get_rock_ridge_filename(struct iso_d - struct rock_ridge * rr; - int sig; - -- while (len > 1){ /* There may be one byte for padding somewhere */ -+ while (len > 2){ /* There may be one byte for padding somewhere */ - rr = (struct rock_ridge *) chr; -- if (rr->len == 0) goto out; /* Something got screwed up here */ -+ if (rr->len < 3) goto out; /* Something got screwed up here */ - sig = isonum_721(chr); - chr += rr->len; - len -= rr->len; -+ if (len < 0) goto out; /* corrupted isofs */ - - switch(sig){ - case SIG('R','R'): -@@ -122,6 +128,7 @@ int get_rock_ridge_filename(struct iso_d - break; - case SIG('N','M'): - if (truncate) break; -+ if (rr->len < 5) break; - /* - * If the flags are 2 or 4, this indicates '.' or '..'. 
- * We don't want to do anything with this, because it -@@ -186,12 +193,13 @@ parse_rock_ridge_inode_internal(struct i - struct rock_ridge * rr; - int rootflag; - -- while (len > 1){ /* There may be one byte for padding somewhere */ -+ while (len > 2){ /* There may be one byte for padding somewhere */ - rr = (struct rock_ridge *) chr; -- if (rr->len == 0) goto out; /* Something got screwed up here */ -+ if (rr->len < 3) goto out; /* Something got screwed up here */ - sig = isonum_721(chr); - chr += rr->len; - len -= rr->len; -+ if (len < 0) goto out; /* corrupted isofs */ - - switch(sig){ - #ifndef CONFIG_ZISOFS /* No flag for SF or ZF */ -@@ -462,7 +470,7 @@ static int rock_ridge_symlink_readpage(s - struct rock_ridge *rr; - - if (!ISOFS_SB(inode->i_sb)->s_rock) -- panic ("Cannot have symlink with high sierra variant of iso filesystem\n"); -+ goto error; - - block = ei->i_iget5_block; - lock_kernel(); -@@ -487,13 +495,15 @@ static int rock_ridge_symlink_readpage(s - SETUP_ROCK_RIDGE(raw_inode, chr, len); - - repeat: -- while (len > 1) { /* There may be one byte for padding somewhere */ -+ while (len > 2) { /* There may be one byte for padding somewhere */ - rr = (struct rock_ridge *) chr; -- if (rr->len == 0) -+ if (rr->len < 3) - goto out; /* Something got screwed up here */ - sig = isonum_721(chr); - chr += rr->len; - len -= rr->len; -+ if (len < 0) -+ goto out; /* corrupted isofs */ - - switch (sig) { - case SIG('R', 'R'): -@@ -543,6 +553,7 @@ static int rock_ridge_symlink_readpage(s - fail: - brelse(bh); - unlock_kernel(); -+ error: - SetPageError(page); - kunmap(page); - unlock_page(page); -diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c ---- a/fs/jbd/checkpoint.c -+++ b/fs/jbd/checkpoint.c -@@ -339,8 +339,10 @@ int log_do_checkpoint(journal_t *journal - } - } while (jh != last_jh && !retry); - -- if (batch_count) -+ if (batch_count) { - __flush_batch(journal, bhs, &batch_count); -+ retry = 1; -+ } - - /* - * If someone cleaned up this transaction while we slept, we're -diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c ---- a/fs/jbd/transaction.c -+++ b/fs/jbd/transaction.c -@@ -1775,10 +1775,10 @@ static int journal_unmap_buffer(journal_ - JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); - ret = __dispose_buffer(jh, - journal->j_running_transaction); -+ journal_put_journal_head(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); -- journal_put_journal_head(jh); - return ret; - } else { - /* There is no currently-running transaction. So the -@@ -1789,10 +1789,10 @@ static int journal_unmap_buffer(journal_ - JBUFFER_TRACE(jh, "give to committing trans"); - ret = __dispose_buffer(jh, - journal->j_committing_transaction); -+ journal_put_journal_head(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); -- journal_put_journal_head(jh); - return ret; - } else { - /* The orphan record's transaction has -@@ -1813,10 +1813,10 @@ static int journal_unmap_buffer(journal_ - journal->j_running_transaction); - jh->b_next_transaction = NULL; - } -+ journal_put_journal_head(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); -- journal_put_journal_head(jh); - return 0; - } else { - /* Good, the buffer belongs to the running transaction. 
-diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h ---- a/include/asm-x86_64/processor.h -+++ b/include/asm-x86_64/processor.h -@@ -160,9 +160,9 @@ static inline void clear_in_cr4 (unsigne - - - /* -- * User space process size. 47bits. -+ * User space process size. 47bits minus one guard page. - */ --#define TASK_SIZE (0x800000000000UL) -+#define TASK_SIZE (0x800000000000UL - 4096) - - /* This decides where the kernel will search for a free chunk of vm - * space during mmap's. -diff --git a/include/linux/err.h b/include/linux/err.h ---- a/include/linux/err.h -+++ b/include/linux/err.h -@@ -13,6 +13,8 @@ - * This should be a per-architecture thing, to allow different - * error and pointer decisions. - */ -+#define IS_ERR_VALUE(x) unlikely((x) > (unsigned long)-1000L) -+ - static inline void *ERR_PTR(long error) - { - return (void *) error; -@@ -25,7 +27,7 @@ static inline long PTR_ERR(const void *p - - static inline long IS_ERR(const void *ptr) - { -- return unlikely((unsigned long)ptr > (unsigned long)-1000L); -+ return IS_ERR_VALUE((unsigned long)ptr); - } - - #endif /* _LINUX_ERR_H */ -diff --git a/kernel/exit.c b/kernel/exit.c ---- a/kernel/exit.c -+++ b/kernel/exit.c -@@ -516,8 +516,6 @@ static inline void choose_new_parent(tas - */ - BUG_ON(p == reaper || reaper->exit_state >= EXIT_ZOMBIE); - p->real_parent = reaper; -- if (p->parent == p->real_parent) -- BUG(); - } - - static inline void reparent_thread(task_t *p, task_t *father, int traced) -diff --git a/kernel/signal.c b/kernel/signal.c ---- a/kernel/signal.c -+++ b/kernel/signal.c -@@ -1728,6 +1728,7 @@ do_signal_stop(int signr) - * with another processor delivering a stop signal, - * then the SIGCONT that wakes us up should clear it. - */ -+ read_unlock(&tasklist_lock); - return 0; - } - -diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c ---- a/lib/rwsem-spinlock.c -+++ b/lib/rwsem-spinlock.c -@@ -140,12 +140,12 @@ void fastcall __sched __down_read(struct - - rwsemtrace(sem, "Entering __down_read"); - -- spin_lock(&sem->wait_lock); -+ spin_lock_irq(&sem->wait_lock); - - if (sem->activity >= 0 && list_empty(&sem->wait_list)) { - /* granted */ - sem->activity++; -- spin_unlock(&sem->wait_lock); -+ spin_unlock_irq(&sem->wait_lock); - goto out; - } - -@@ -160,7 +160,7 @@ void fastcall __sched __down_read(struct - list_add_tail(&waiter.list, &sem->wait_list); - - /* we don't need to touch the semaphore struct anymore */ -- spin_unlock(&sem->wait_lock); -+ spin_unlock_irq(&sem->wait_lock); - - /* wait to be given the lock */ - for (;;) { -@@ -181,10 +181,12 @@ void fastcall __sched __down_read(struct - */ - int fastcall __down_read_trylock(struct rw_semaphore *sem) - { -+ unsigned long flags; - int ret = 0; -+ - rwsemtrace(sem, "Entering __down_read_trylock"); - -- spin_lock(&sem->wait_lock); -+ spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->activity >= 0 && list_empty(&sem->wait_list)) { - /* granted */ -@@ -192,7 +194,7 @@ int fastcall __down_read_trylock(struct - ret = 1; - } - -- spin_unlock(&sem->wait_lock); -+ spin_unlock_irqrestore(&sem->wait_lock, flags); - - rwsemtrace(sem, "Leaving __down_read_trylock"); - return ret; -@@ -209,12 +211,12 @@ void fastcall __sched __down_write(struc - - rwsemtrace(sem, "Entering __down_write"); - -- spin_lock(&sem->wait_lock); -+ spin_lock_irq(&sem->wait_lock); - - if (sem->activity == 0 && list_empty(&sem->wait_list)) { - /* granted */ - sem->activity = -1; -- spin_unlock(&sem->wait_lock); -+ spin_unlock_irq(&sem->wait_lock); - goto out; - } - -@@ 
-229,7 +231,7 @@ void fastcall __sched __down_write(struc - list_add_tail(&waiter.list, &sem->wait_list); - - /* we don't need to touch the semaphore struct anymore */ -- spin_unlock(&sem->wait_lock); -+ spin_unlock_irq(&sem->wait_lock); - - /* wait to be given the lock */ - for (;;) { -@@ -250,10 +252,12 @@ void fastcall __sched __down_write(struc - */ - int fastcall __down_write_trylock(struct rw_semaphore *sem) - { -+ unsigned long flags; - int ret = 0; -+ - rwsemtrace(sem, "Entering __down_write_trylock"); - -- spin_lock(&sem->wait_lock); -+ spin_lock_irqsave(&sem->wait_lock, flags); - - if (sem->activity == 0 && list_empty(&sem->wait_list)) { - /* granted */ -@@ -261,7 +265,7 @@ int fastcall __down_write_trylock(struct - ret = 1; - } - -- spin_unlock(&sem->wait_lock); -+ spin_unlock_irqrestore(&sem->wait_lock, flags); - - rwsemtrace(sem, "Leaving __down_write_trylock"); - return ret; -@@ -272,14 +276,16 @@ int fastcall __down_write_trylock(struct - */ - void fastcall __up_read(struct rw_semaphore *sem) - { -+ unsigned long flags; -+ - rwsemtrace(sem, "Entering __up_read"); - -- spin_lock(&sem->wait_lock); -+ spin_lock_irqsave(&sem->wait_lock, flags); - - if (--sem->activity == 0 && !list_empty(&sem->wait_list)) - sem = __rwsem_wake_one_writer(sem); - -- spin_unlock(&sem->wait_lock); -+ spin_unlock_irqrestore(&sem->wait_lock, flags); - - rwsemtrace(sem, "Leaving __up_read"); - } -@@ -289,15 +295,17 @@ void fastcall __up_read(struct rw_semaph - */ - void fastcall __up_write(struct rw_semaphore *sem) - { -+ unsigned long flags; -+ - rwsemtrace(sem, "Entering __up_write"); - -- spin_lock(&sem->wait_lock); -+ spin_lock_irqsave(&sem->wait_lock, flags); - - sem->activity = 0; - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 1); - -- spin_unlock(&sem->wait_lock); -+ spin_unlock_irqrestore(&sem->wait_lock, flags); - - rwsemtrace(sem, "Leaving __up_write"); - } -@@ -308,15 +316,17 @@ void fastcall __up_write(struct rw_semap - */ - void fastcall __downgrade_write(struct rw_semaphore *sem) - { -+ unsigned long flags; -+ - rwsemtrace(sem, "Entering __downgrade_write"); - -- spin_lock(&sem->wait_lock); -+ spin_lock_irqsave(&sem->wait_lock, flags); - - sem->activity = 1; - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 0); - -- spin_unlock(&sem->wait_lock); -+ spin_unlock_irqrestore(&sem->wait_lock, flags); - - rwsemtrace(sem, "Leaving __downgrade_write"); - } -diff --git a/lib/rwsem.c b/lib/rwsem.c ---- a/lib/rwsem.c -+++ b/lib/rwsem.c -@@ -150,7 +150,7 @@ rwsem_down_failed_common(struct rw_semap - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - - /* set up my own style of waitqueue */ -- spin_lock(&sem->wait_lock); -+ spin_lock_irq(&sem->wait_lock); - waiter->task = tsk; - get_task_struct(tsk); - -@@ -163,7 +163,7 @@ rwsem_down_failed_common(struct rw_semap - if (!(count & RWSEM_ACTIVE_MASK)) - sem = __rwsem_do_wake(sem, 0); - -- spin_unlock(&sem->wait_lock); -+ spin_unlock_irq(&sem->wait_lock); - - /* wait to be given the lock */ - for (;;) { -@@ -219,15 +219,17 @@ rwsem_down_write_failed(struct rw_semaph - */ - struct rw_semaphore fastcall *rwsem_wake(struct rw_semaphore *sem) - { -+ unsigned long flags; -+ - rwsemtrace(sem, "Entering rwsem_wake"); - -- spin_lock(&sem->wait_lock); -+ spin_lock_irqsave(&sem->wait_lock, flags); - - /* do nothing if list empty */ - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 0); - -- spin_unlock(&sem->wait_lock); -+ spin_unlock_irqrestore(&sem->wait_lock, flags); - - rwsemtrace(sem, "Leaving rwsem_wake"); - -@@ 
-241,15 +243,17 @@ struct rw_semaphore fastcall *rwsem_wake - */ - struct rw_semaphore fastcall *rwsem_downgrade_wake(struct rw_semaphore *sem) - { -+ unsigned long flags; -+ - rwsemtrace(sem, "Entering rwsem_downgrade_wake"); - -- spin_lock(&sem->wait_lock); -+ spin_lock_irqsave(&sem->wait_lock, flags); - - /* do nothing if list empty */ - if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, 1); - -- spin_unlock(&sem->wait_lock); -+ spin_unlock_irqrestore(&sem->wait_lock, flags); - - rwsemtrace(sem, "Leaving rwsem_downgrade_wake"); - return sem; -diff --git a/mm/mmap.c b/mm/mmap.c ---- a/mm/mmap.c -+++ b/mm/mmap.c -@@ -1315,37 +1315,40 @@ unsigned long - get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) - { -- if (flags & MAP_FIXED) { -- unsigned long ret; -+ unsigned long ret; - -- if (addr > TASK_SIZE - len) -- return -ENOMEM; -- if (addr & ~PAGE_MASK) -- return -EINVAL; -- if (file && is_file_hugepages(file)) { -- /* -- * Check if the given range is hugepage aligned, and -- * can be made suitable for hugepages. -- */ -- ret = prepare_hugepage_range(addr, len); -- } else { -- /* -- * Ensure that a normal request is not falling in a -- * reserved hugepage range. For some archs like IA-64, -- * there is a separate region for hugepages. -- */ -- ret = is_hugepage_only_range(addr, len); -- } -- if (ret) -- return -EINVAL; -- return addr; -- } -+ if (!(flags & MAP_FIXED)) { -+ unsigned long (*get_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); - -- if (file && file->f_op && file->f_op->get_unmapped_area) -- return file->f_op->get_unmapped_area(file, addr, len, -- pgoff, flags); -+ get_area = current->mm->get_unmapped_area; -+ if (file && file->f_op && file->f_op->get_unmapped_area) -+ get_area = file->f_op->get_unmapped_area; -+ addr = get_area(file, addr, len, pgoff, flags); -+ if (IS_ERR_VALUE(addr)) -+ return addr; -+ } - -- return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); -+ if (addr > TASK_SIZE - len) -+ return -ENOMEM; -+ if (addr & ~PAGE_MASK) -+ return -EINVAL; -+ if (file && is_file_hugepages(file)) { -+ /* -+ * Check if the given range is hugepage aligned, and -+ * can be made suitable for hugepages. -+ */ -+ ret = prepare_hugepage_range(addr, len); -+ } else { -+ /* -+ * Ensure that a normal request is not falling in a -+ * reserved hugepage range. For some archs like IA-64, -+ * there is a separate region for hugepages. 
-+ */ -+ ret = is_hugepage_only_range(addr, len); -+ } -+ if (ret) -+ return -EINVAL; -+ return addr; - } - - EXPORT_SYMBOL(get_unmapped_area); -diff --git a/mm/rmap.c b/mm/rmap.c ---- a/mm/rmap.c -+++ b/mm/rmap.c -@@ -641,7 +641,7 @@ static void try_to_unmap_cluster(unsigne - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; -- pte_t *pte; -+ pte_t *pte, *original_pte; - pte_t pteval; - struct page *page; - unsigned long address; -@@ -673,7 +673,7 @@ static void try_to_unmap_cluster(unsigne - if (!pmd_present(*pmd)) - goto out_unlock; - -- for (pte = pte_offset_map(pmd, address); -+ for (original_pte = pte = pte_offset_map(pmd, address); - address < end; pte++, address += PAGE_SIZE) { - - if (!pte_present(*pte)) -@@ -710,7 +710,7 @@ static void try_to_unmap_cluster(unsigne - (*mapcount)--; - } - -- pte_unmap(pte); -+ pte_unmap(original_pte); - - out_unlock: - spin_unlock(&mm->page_table_lock); -diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c ---- a/net/bluetooth/af_bluetooth.c -+++ b/net/bluetooth/af_bluetooth.c -@@ -64,7 +64,7 @@ static kmem_cache_t *bt_sock_cache; - - int bt_sock_register(int proto, struct net_proto_family *ops) - { -- if (proto >= BT_MAX_PROTO) -+ if (proto < 0 || proto >= BT_MAX_PROTO) - return -EINVAL; - - if (bt_proto[proto]) -@@ -77,7 +77,7 @@ EXPORT_SYMBOL(bt_sock_register); - - int bt_sock_unregister(int proto) - { -- if (proto >= BT_MAX_PROTO) -+ if (proto < 0 || proto >= BT_MAX_PROTO) - return -EINVAL; - - if (!bt_proto[proto]) -@@ -92,7 +92,7 @@ static int bt_sock_create(struct socket - { - int err = 0; - -- if (proto >= BT_MAX_PROTO) -+ if (proto < 0 || proto >= BT_MAX_PROTO) - return -EINVAL; - - #if defined(CONFIG_KMOD) -diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c ---- a/net/bridge/br_input.c -+++ b/net/bridge/br_input.c -@@ -54,6 +54,9 @@ int br_handle_frame_finish(struct sk_buf - struct net_bridge_fdb_entry *dst; - int passedup = 0; - -+ /* insert into forwarding database after filtering to avoid spoofing */ -+ br_fdb_insert(p->br, p, eth_hdr(skb)->h_source, 0); -+ - if (br->dev->flags & IFF_PROMISC) { - struct sk_buff *skb2; - -@@ -108,8 +111,7 @@ int br_handle_frame(struct net_bridge_po - if (eth_hdr(skb)->h_source[0] & 1) - goto err; - -- if (p->state == BR_STATE_LEARNING || -- p->state == BR_STATE_FORWARDING) -+ if (p->state == BR_STATE_LEARNING) - br_fdb_insert(p->br, p, eth_hdr(skb)->h_source, 0); - - if (p->br->stp_enabled && -diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c ---- a/net/bridge/br_stp_bpdu.c -+++ b/net/bridge/br_stp_bpdu.c -@@ -140,6 +140,9 @@ int br_stp_handle_bpdu(struct sk_buff *s - struct net_bridge *br = p->br; - unsigned char *buf; - -+ /* insert into forwarding database after filtering to avoid spoofing */ -+ br_fdb_insert(p->br, p, eth_hdr(skb)->h_source, 0); -+ - /* need at least the 802 and STP headers */ - if (!pskb_may_pull(skb, sizeof(header)+1) || - memcmp(skb->data, header, sizeof(header))) -diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c ---- a/net/bridge/netfilter/ebtables.c -+++ b/net/bridge/netfilter/ebtables.c -@@ -179,9 +179,10 @@ unsigned int ebt_do_table (unsigned int - struct ebt_chainstack *cs; - struct ebt_entries *chaininfo; - char *base; -- struct ebt_table_info *private = table->private; -+ struct ebt_table_info *private; - - read_lock_bh(&table->lock); -+ private = table->private; - cb_base = COUNTER_BASE(private->counters, private->nentries, - smp_processor_id()); - if (private->chainstack) -diff --git a/net/ipv4/fib_hash.c 
b/net/ipv4/fib_hash.c ---- a/net/ipv4/fib_hash.c -+++ b/net/ipv4/fib_hash.c -@@ -919,13 +919,23 @@ out: - return fa; - } - -+static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos) -+{ -+ struct fib_alias *fa = fib_get_first(seq); -+ -+ if (fa) -+ while (pos && (fa = fib_get_next(seq))) -+ --pos; -+ return pos ? NULL : fa; -+} -+ - static void *fib_seq_start(struct seq_file *seq, loff_t *pos) - { - void *v = NULL; - - read_lock(&fib_hash_lock); - if (ip_fib_main_table) -- v = *pos ? fib_get_next(seq) : SEQ_START_TOKEN; -+ v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; - return v; - } - -diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c ---- a/net/ipv4/netfilter/ip_queue.c -+++ b/net/ipv4/netfilter/ip_queue.c -@@ -3,6 +3,7 @@ - * communicating with userspace via netlink. - * - * (C) 2000-2002 James Morris <jmorris@xxxxxxxxxxxxxxxx> -+ * (C) 2003-2005 Netfilter Core Team <coreteam@xxxxxxxxxxxxx> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as -@@ -14,6 +15,7 @@ - * Zander). - * 2000-08-01: Added Nick Williams' MAC support. - * 2002-06-25: Code cleanup. -+ * 2005-05-26: local_bh_{disable,enable} around nf_reinject (Harald Welte) - * - */ - #include <linux/module.h> -@@ -66,7 +68,15 @@ static DECLARE_MUTEX(ipqnl_sem); - static void - ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict) - { -+ /* TCP input path (and probably other bits) assume to be called -+ * from softirq context, not from syscall, like ipq_issue_verdict is -+ * called. TCP input path deadlocks with locks taken from timer -+ * softirq, e.g. We therefore emulate this by local_bh_disable() */ -+ -+ local_bh_disable(); - nf_reinject(entry->skb, entry->info, verdict); -+ local_bh_enable(); -+ - kfree(entry); - } - -diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c ---- a/net/ipv4/tcp_input.c -+++ b/net/ipv4/tcp_input.c -@@ -1653,7 +1653,10 @@ static void DBGUNDO(struct sock *sk, str - static void tcp_undo_cwr(struct tcp_sock *tp, int undo) - { - if (tp->prior_ssthresh) { -- tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); -+ if (tcp_is_bic(tp)) -+ tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd); -+ else -+ tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); - - if (undo && tp->prior_ssthresh > tp->snd_ssthresh) { - tp->snd_ssthresh = tp->prior_ssthresh; -diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c ---- a/net/ipv4/tcp_timer.c -+++ b/net/ipv4/tcp_timer.c -@@ -38,6 +38,7 @@ static void tcp_keepalive_timer (unsigne - - #ifdef TCP_DEBUG - const char tcp_timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n"; -+EXPORT_SYMBOL(tcp_timer_bug_msg); - #endif - - /* -diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c ---- a/net/ipv4/xfrm4_output.c -+++ b/net/ipv4/xfrm4_output.c -@@ -103,17 +103,17 @@ int xfrm4_output(struct sk_buff *skb) - goto error_nolock; - } - -- spin_lock_bh(&x->lock); -- err = xfrm_state_check(x, skb); -- if (err) -- goto error; -- - if (x->props.mode) { - err = xfrm4_tunnel_check_size(skb); - if (err) -- goto error; -+ goto error_nolock; - } - -+ spin_lock_bh(&x->lock); -+ err = xfrm_state_check(x, skb); -+ if (err) -+ goto error; -+ - xfrm4_encap(skb); - - err = x->type->output(skb); -diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c ---- a/net/ipv6/xfrm6_output.c -+++ b/net/ipv6/xfrm6_output.c -@@ -103,17 +103,17 @@ int xfrm6_output(struct sk_buff *skb) - goto error_nolock; - } - -- 
spin_lock_bh(&x->lock); -- err = xfrm_state_check(x, skb); -- if (err) -- goto error; -- - if (x->props.mode) { - err = xfrm6_tunnel_check_size(skb); - if (err) -- goto error; -+ goto error_nolock; - } - -+ spin_lock_bh(&x->lock); -+ err = xfrm_state_check(x, skb); -+ if (err) -+ goto error; -+ - xfrm6_encap(skb); - - err = x->type->output(skb); -diff --git a/net/netrom/nr_in.c b/net/netrom/nr_in.c ---- a/net/netrom/nr_in.c -+++ b/net/netrom/nr_in.c -@@ -74,7 +74,6 @@ static int nr_queue_rx_frame(struct sock - static int nr_state1_machine(struct sock *sk, struct sk_buff *skb, - int frametype) - { -- bh_lock_sock(sk); - switch (frametype) { - case NR_CONNACK: { - nr_cb *nr = nr_sk(sk); -@@ -103,8 +102,6 @@ static int nr_state1_machine(struct sock - default: - break; - } -- bh_unlock_sock(sk); -- - return 0; - } - -@@ -116,7 +113,6 @@ static int nr_state1_machine(struct sock - static int nr_state2_machine(struct sock *sk, struct sk_buff *skb, - int frametype) - { -- bh_lock_sock(sk); - switch (frametype) { - case NR_CONNACK | NR_CHOKE_FLAG: - nr_disconnect(sk, ECONNRESET); -@@ -132,8 +128,6 @@ static int nr_state2_machine(struct sock - default: - break; - } -- bh_unlock_sock(sk); -- - return 0; - } - -@@ -154,7 +148,6 @@ static int nr_state3_machine(struct sock - nr = skb->data[18]; - ns = skb->data[17]; - -- bh_lock_sock(sk); - switch (frametype) { - case NR_CONNREQ: - nr_write_internal(sk, NR_CONNACK); -@@ -265,8 +258,6 @@ static int nr_state3_machine(struct sock - default: - break; - } -- bh_unlock_sock(sk); -- - return queued; - } - -diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c ---- a/net/rose/rose_route.c -+++ b/net/rose/rose_route.c -@@ -727,7 +727,8 @@ int rose_rt_ioctl(unsigned int cmd, void - } - if (rose_route.mask > 10) /* Mask can't be more than 10 digits */ - return -EINVAL; -- -+ if (rose_route.ndigis > 8) /* No more than 8 digipeats */ -+ return -EINVAL; - err = rose_add_node(&rose_route, dev); - dev_put(dev); - return err; -diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c ---- a/net/sched/sch_netem.c -+++ b/net/sched/sch_netem.c -@@ -184,10 +184,15 @@ static int netem_enqueue(struct sk_buff - /* Random duplication */ - if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) { - struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); -- -- pr_debug("netem_enqueue: dup %p\n", skb2); -- if (skb2) -- delay_skb(sch, skb2); -+ if (skb2) { -+ struct Qdisc *rootq = sch->dev->qdisc; -+ u32 dupsave = q->duplicate; -+ -+ /* prevent duplicating a dup... 
*/ -+ q->duplicate = 0; -+ rootq->enqueue(skb2, rootq); -+ q->duplicate = dupsave; -+ } - } - - /* If doing simple delay then gap == 0 so all packets -diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c ---- a/net/xfrm/xfrm_state.c -+++ b/net/xfrm/xfrm_state.c -@@ -609,7 +609,7 @@ static struct xfrm_state *__xfrm_find_ac - - for (i = 0; i < XFRM_DST_HSIZE; i++) { - list_for_each_entry(x, xfrm_state_bydst+i, bydst) { -- if (x->km.seq == seq) { -+ if (x->km.seq == seq && x->km.state == XFRM_STATE_ACQ) { - xfrm_state_hold(x); - return x; - } -diff --git a/security/keys/key.c b/security/keys/key.c ---- a/security/keys/key.c -+++ b/security/keys/key.c -@@ -57,9 +57,10 @@ struct key_user *key_user_lookup(uid_t u - { - struct key_user *candidate = NULL, *user; - struct rb_node *parent = NULL; -- struct rb_node **p = &key_user_tree.rb_node; -+ struct rb_node **p; - - try_again: -+ p = &key_user_tree.rb_node; - spin_lock(&key_user_lock); - - /* search the tree for a user record with a matching UID */ -diff --git a/sound/core/timer.c b/sound/core/timer.c ---- a/sound/core/timer.c -+++ b/sound/core/timer.c -@@ -1117,7 +1117,8 @@ static void snd_timer_user_append_to_tqu - if (tu->qused >= tu->queue_size) { - tu->overrun++; - } else { -- memcpy(&tu->queue[tu->qtail++], tread, sizeof(*tread)); -+ memcpy(&tu->tqueue[tu->qtail++], tread, sizeof(*tread)); -+ tu->qtail %= tu->queue_size; - tu->qused++; - } - } -@@ -1140,6 +1141,8 @@ static void snd_timer_user_ccallback(snd - spin_lock(&tu->qlock); - snd_timer_user_append_to_tqueue(tu, &r1); - spin_unlock(&tu->qlock); -+ kill_fasync(&tu->fasync, SIGIO, POLL_IN); -+ wake_up(&tu->qchange_sleep); - } - - static void snd_timer_user_tinterrupt(snd_timer_instance_t *timeri, -diff --git a/sound/pci/ac97/ac97_codec.c b/sound/pci/ac97/ac97_codec.c ---- a/sound/pci/ac97/ac97_codec.c -+++ b/sound/pci/ac97/ac97_codec.c -@@ -1185,7 +1185,7 @@ snd_kcontrol_t *snd_ac97_cnew(const snd_ - /* - * create mute switch(es) for normal stereo controls - */ --static int snd_ac97_cmute_new(snd_card_t *card, char *name, int reg, ac97_t *ac97) -+static int snd_ac97_cmute_new_stereo(snd_card_t *card, char *name, int reg, int check_stereo, ac97_t *ac97) - { - snd_kcontrol_t *kctl; - int err; -@@ -1196,7 +1196,7 @@ static int snd_ac97_cmute_new(snd_card_t - - mute_mask = 0x8000; - val = snd_ac97_read(ac97, reg); -- if (ac97->flags & AC97_STEREO_MUTES) { -+ if (check_stereo || (ac97->flags & AC97_STEREO_MUTES)) { - /* check whether both mute bits work */ - val1 = val | 0x8080; - snd_ac97_write(ac97, reg, val1); -@@ -1254,7 +1254,7 @@ static int snd_ac97_cvol_new(snd_card_t - /* - * create a mute-switch and a volume for normal stereo/mono controls - */ --static int snd_ac97_cmix_new(snd_card_t *card, const char *pfx, int reg, ac97_t *ac97) -+static int snd_ac97_cmix_new_stereo(snd_card_t *card, const char *pfx, int reg, int check_stereo, ac97_t *ac97) - { - int err; - char name[44]; -@@ -1265,7 +1265,7 @@ static int snd_ac97_cmix_new(snd_card_t - - if (snd_ac97_try_bit(ac97, reg, 15)) { - sprintf(name, "%s Switch", pfx); -- if ((err = snd_ac97_cmute_new(card, name, reg, ac97)) < 0) -+ if ((err = snd_ac97_cmute_new_stereo(card, name, reg, check_stereo, ac97)) < 0) - return err; - } - check_volume_resolution(ac97, reg, &lo_max, &hi_max); -@@ -1277,6 +1277,8 @@ static int snd_ac97_cmix_new(snd_card_t - return 0; - } - -+#define snd_ac97_cmix_new(card, pfx, reg, ac97) snd_ac97_cmix_new_stereo(card, pfx, reg, 0, ac97) -+#define snd_ac97_cmute_new(card, name, reg, ac97) 
snd_ac97_cmute_new_stereo(card, name, reg, 0, ac97) - - static unsigned int snd_ac97_determine_spdif_rates(ac97_t *ac97); - -@@ -1327,7 +1329,8 @@ static int snd_ac97_mixer_build(ac97_t * - - /* build surround controls */ - if (snd_ac97_try_volume_mix(ac97, AC97_SURROUND_MASTER)) { -- if ((err = snd_ac97_cmix_new(card, "Surround Playback", AC97_SURROUND_MASTER, ac97)) < 0) -+ /* Surround Master (0x38) is with stereo mutes */ -+ if ((err = snd_ac97_cmix_new_stereo(card, "Surround Playback", AC97_SURROUND_MASTER, 1, ac97)) < 0) - return err; - } - -diff --git a/sound/usb/usbaudio.c b/sound/usb/usbaudio.c ---- a/sound/usb/usbaudio.c -+++ b/sound/usb/usbaudio.c -@@ -3276,7 +3276,7 @@ static void snd_usb_audio_disconnect(str - } - usb_chip[chip->index] = NULL; - up(®ister_mutex); -- snd_card_free_in_thread(card); -+ snd_card_free(card); - } else { - up(®ister_mutex); - } -diff --git a/sound/usb/usx2y/usbusx2y.c b/sound/usb/usx2y/usbusx2y.c ---- a/sound/usb/usx2y/usbusx2y.c -+++ b/sound/usb/usx2y/usbusx2y.c -@@ -1,6 +1,11 @@ - /* - * usbusy2y.c - ALSA USB US-428 Driver - * -+2005-04-14 Karsten Wiese -+ Version 0.8.7.2: -+ Call snd_card_free() instead of snd_card_free_in_thread() to prevent oops with dead keyboard symptom. -+ Tested ok with kernel 2.6.12-rc2. -+ - 2004-12-14 Karsten Wiese - Version 0.8.7.1: - snd_pcm_open for rawusb pcm-devices now returns -EBUSY if called without rawusb's hwdep device being open. -@@ -143,7 +148,7 @@ - - - MODULE_AUTHOR("Karsten Wiese <annabellesgarden@xxxxxxxx>"); --MODULE_DESCRIPTION("TASCAM "NAME_ALLCAPS" Version 0.8.7.1"); -+MODULE_DESCRIPTION("TASCAM "NAME_ALLCAPS" Version 0.8.7.2"); - MODULE_LICENSE("GPL"); - MODULE_SUPPORTED_DEVICE("{{TASCAM(0x1604), "NAME_ALLCAPS"(0x8001)(0x8005)(0x8007) }}"); - -@@ -430,8 +435,6 @@ static void usX2Y_usb_disconnect(struct - if (ptr) { - usX2Ydev_t* usX2Y = usX2Y((snd_card_t*)ptr); - struct list_head* p; -- if (usX2Y->chip_status == USX2Y_STAT_CHIP_HUP) // on 2.6.1 kernel snd_usbmidi_disconnect() -- return; // calls us back. better leave :-) . - usX2Y->chip.shutdown = 1; - usX2Y->chip_status = USX2Y_STAT_CHIP_HUP; - usX2Y_unlinkSeq(&usX2Y->AS04); -@@ -443,7 +446,7 @@ static void usX2Y_usb_disconnect(struct - } - if (usX2Y->us428ctls_sharedmem) - wake_up(&usX2Y->us428ctls_wait_queue_head); -- snd_card_free_in_thread((snd_card_t*)ptr); -+ snd_card_free((snd_card_t*)ptr); - } - } - diff -r d75a502b45eb -r 43e28a2f6037 patches/linux-2.6.11/net-csum.patch --- a/patches/linux-2.6.11/net-csum.patch Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,22 +0,0 @@ -diff -ur linux-2.6.11/net/ipv4/netfilter/ip_conntrack_proto_tcp.c linux-2.6.11-csum/net/ipv4/netfilter/ip_conntrack_proto_tcp.c ---- linux-2.6.11/net/ipv4/netfilter/ip_conntrack_proto_tcp.c 2005-05-27 11:47:48 +01:00 -+++ linux-2.6.11-csum/net/ipv4/netfilter/ip_conntrack_proto_tcp.c 2005-05-27 11:48:07 +01:00 -@@ -803,6 +803,7 @@ - */ - /* FIXME: Source route IP option packets --RR */ - if (hooknum == NF_IP_PRE_ROUTING -+ && skb->ip_summed != CHECKSUM_UNNECESSARY - && csum_tcpudp_magic(iph->saddr, iph->daddr, tcplen, IPPROTO_TCP, - skb->ip_summed == CHECKSUM_HW ? 
skb->csum - : skb_checksum(skb, iph->ihl*4, tcplen, 0))) { -diff -ur linux-2.6.11/net/ipv4/netfilter/ip_conntrack_proto_udp.c linux-2.6.11-csum/net/ipv4/netfilter/ip_conntrack_proto_udp.c ---- linux-2.6.11/net/ipv4/netfilter/ip_conntrack_proto_udp.c 2005-05-27 11:47:48 +01:00 -+++ linux-2.6.11-csum/net/ipv4/netfilter/ip_conntrack_proto_udp.c 2005-05-27 11:48:07 +01:00 -@@ -120,6 +120,7 @@ - * and moreover root might send raw packets. - * FIXME: Source route IP option packets --RR */ - if (hooknum == NF_IP_PRE_ROUTING -+ && skb->ip_summed != CHECKSUM_UNNECESSARY - && csum_tcpudp_magic(iph->saddr, iph->daddr, udplen, IPPROTO_UDP, - skb->ip_summed == CHECKSUM_HW ? skb->csum - : skb_checksum(skb, iph->ihl*4, udplen, 0))) { diff -r d75a502b45eb -r 43e28a2f6037 patches/linux-2.6.11/rcu-nohz.patch --- a/patches/linux-2.6.11/rcu-nohz.patch Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,16 +0,0 @@ -diff -ur linux-2.6.11/kernel/rcupdate.c linux-2.6.11-rcu/kernel/rcupdate.c ---- linux-2.6.11/kernel/rcupdate.c 2005-05-30 10:51:41 +01:00 -+++ linux-2.6.11-rcu/kernel/rcupdate.c 2005-05-30 10:53:53 +01:00 -@@ -202,8 +202,11 @@ - */ - static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp, struct rcu_state *rsp) - { -+ cpumask_t mask; -+ - cpu_clear(cpu, rsp->cpumask); -- if (cpus_empty(rsp->cpumask)) { -+ cpus_andnot(mask, rsp->cpumask, nohz_cpu_mask); -+ if (cpus_empty(mask)) { - /* batch completed ! */ - rcp->completed = rcp->cur; - rcu_start_batch(rcp, rsp, 0); diff -r d75a502b45eb -r 43e28a2f6037 patches/linux-2.6.11/udp-frag.patch --- a/patches/linux-2.6.11/udp-frag.patch Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,55 +0,0 @@ -diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c ---- a/net/ipv4/udp.c -+++ b/net/ipv4/udp.c -@@ -738,7 +738,7 @@ int udp_ioctl(struct sock *sk, int cmd, - unsigned long amount; - - amount = 0; -- spin_lock_irq(&sk->sk_receive_queue.lock); -+ spin_lock_bh(&sk->sk_receive_queue.lock); - skb = skb_peek(&sk->sk_receive_queue); - if (skb != NULL) { - /* -@@ -748,7 +748,7 @@ int udp_ioctl(struct sock *sk, int cmd, - */ - amount = skb->len - sizeof(struct udphdr); - } -- spin_unlock_irq(&sk->sk_receive_queue.lock); -+ spin_unlock_bh(&sk->sk_receive_queue.lock); - return put_user(amount, (int __user *)arg); - } - -@@ -848,12 +848,12 @@ csum_copy_err: - /* Clear queue. 
*/ - if (flags&MSG_PEEK) { - int clear = 0; -- spin_lock_irq(&sk->sk_receive_queue.lock); -+ spin_lock_bh(&sk->sk_receive_queue.lock); - if (skb == skb_peek(&sk->sk_receive_queue)) { - __skb_unlink(skb, &sk->sk_receive_queue); - clear = 1; - } -- spin_unlock_irq(&sk->sk_receive_queue.lock); -+ spin_unlock_bh(&sk->sk_receive_queue.lock); - if (clear) - kfree_skb(skb); - } -@@ -1334,7 +1334,7 @@ unsigned int udp_poll(struct file *file, - struct sk_buff_head *rcvq = &sk->sk_receive_queue; - struct sk_buff *skb; - -- spin_lock_irq(&rcvq->lock); -+ spin_lock_bh(&rcvq->lock); - while ((skb = skb_peek(rcvq)) != NULL) { - if (udp_checksum_complete(skb)) { - UDP_INC_STATS_BH(UDP_MIB_INERRORS); -@@ -1345,7 +1345,7 @@ unsigned int udp_poll(struct file *file, - break; - } - } -- spin_unlock_irq(&rcvq->lock); -+ spin_unlock_bh(&rcvq->lock); - - /* nothing to see, move along */ - if (skb == NULL) - diff -r d75a502b45eb -r 43e28a2f6037 patches/linux-2.6.11/x86_64-linux.patch --- a/patches/linux-2.6.11/x86_64-linux.patch Fri Jul 15 19:57:12 2005 +++ /dev/null Sat Jul 16 14:02:54 2005 @@ -1,68 +0,0 @@ -diff -urN linux-2.6.10-orig/include/asm-x86_64/hw_irq.h linux-2.6.10/include/asm-x86_64/hw_irq.h ---- linux-2.6.10-orig/include/asm-x86_64/hw_irq.h 2005-01-06 00:34:38.000000000 -0500 -+++ linux-2.6.10/include/asm-x86_64/hw_irq.h 2005-02-25 17:45:37.181518088 -0500 -@@ -48,6 +48,7 @@ - * - * Vectors 0xf0-0xf9 are free (reserved for future Linux use). - */ -+#ifndef CONFIG_XEN - #define SPURIOUS_APIC_VECTOR 0xff - #define ERROR_APIC_VECTOR 0xfe - #define INVALIDATE_TLB_VECTOR 0xfd -@@ -57,7 +58,7 @@ - #define KDB_VECTOR 0xf9 - - #define THERMAL_APIC_VECTOR 0xf0 -- -+#endif - - /* - * Local APIC timer IRQ vector is on a different priority level, -diff -urN linux-2.6.10-orig/include/asm-x86_64/irq.h linux-2.6.10/include/asm-x86_64/irq.h ---- linux-2.6.10-orig/include/asm-x86_64/irq.h 2005-01-06 00:34:38.000000000 -0500 -+++ linux-2.6.10/include/asm-x86_64/irq.h 2005-02-25 17:45:37.181518088 -0500 -@@ -10,6 +10,9 @@ - * <tomsoft@xxxxxxxxxxxxxxxxxxxxxxxxx> - */ - -+#ifdef CONFIG_XEN -+#include "irq_vectors.h" -+#endif - #define TIMER_IRQ 0 - - /* -@@ -22,6 +25,7 @@ - * the usable vector space is 0x20-0xff (224 vectors) - */ - -+#ifndef CONFIG_XEN - /* - * The maximum number of vectors supported by x86_64 processors - * is limited to 256. For processors other than x86_64, NR_VECTORS -@@ -38,6 +42,7 @@ - #define NR_IRQS 224 - #define NR_IRQ_VECTORS 1024 - #endif -+#endif - - static __inline__ int irq_canonicalize(int irq) - { -diff -urN linux-2.6.10-orig/include/asm-x86_64/posix_types.h linux-2.6.10/include/asm-x86_64/posix_types.h ---- linux-2.6.10-orig/include/asm-x86_64/posix_types.h 2004-10-18 17:55:29.000000000 -0400 -+++ linux-2.6.10/include/asm-x86_64/posix_types.h 2005-02-25 17:45:37.183517784 -0500 -@@ -6,7 +6,7 @@ - * be a little careful about namespace pollution etc. Also, we cannot - * assume GCC is being used. - */ -- -+#ifndef __ASSEMBLY__ - typedef unsigned long __kernel_ino_t; - typedef unsigned int __kernel_mode_t; - typedef unsigned long __kernel_nlink_t; -@@ -115,5 +115,5 @@ - } - - #endif /* defined(__KERNEL__) */ -- -+#endif - #endif _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-changelog