[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-changelog] Merge with -unstable
# HG changeset patch # User djm@xxxxxxxxxxxxxxx # Node ID 8799d14bef771ae236166e3c5c00a65dd6f2d44c # Parent 5f1ed597f1072b86d5c59a588c3ac2aefd0b7450 # Parent 9fb0bad776dd3d1c1cd5eec4990a296fbe0e38dc Merge with -unstable diff -r 5f1ed597f107 -r 8799d14bef77 .hgignore --- a/.hgignore Wed Aug 24 02:43:18 2005 +++ b/.hgignore Thu Aug 25 22:53:20 2005 @@ -69,25 +69,25 @@ ^tools/blktap/blkdump$ ^tools/blktap/blkgnbd$ ^tools/blktap/blkimg$ -^tools/blktap/blockstore\.dat$ -^tools/blktap/blockstored$ ^tools/blktap/bstest$ -^tools/blktap/parallax$ ^tools/blktap/vdi\.dot$ ^tools/blktap/vdi\.ps$ -^tools/blktap/vdi_create$ -^tools/blktap/vdi_fill$ -^tools/blktap/vdi_list$ -^tools/blktap/vdi_snap$ -^tools/blktap/vdi_snap_list$ -^tools/blktap/vdi_tree$ -^tools/blktap/vdi_validate$ +^tools/blktap/parallax/vdi_create$ +^tools/blktap/parallax/vdi_fill$ +^tools/blktap/parallax/vdi_list$ +^tools/blktap/parallax/vdi_snap$ +^tools/blktap/parallax/vdi_snap_list$ +^tools/blktap/parallax/vdi_snap_delete$ +^tools/blktap/parallax/vdi_tree$ +^tools/blktap/parallax/vdi_validate$ +^tools/blktap/parallax/parallax$ +^tools/blktap/parallax/blockstored$ ^tools/blktap/xen/.*$ ^tools/check/\..*$ ^tools/cmdline/.*$ ^tools/cmdline/xen/.*$ -^tools/consoled/consoled$ -^tools/consoled/xc_console$ +^tools/console/xenconsoled$ +^tools/console/xenconsole$ ^tools/debugger/pdb/pdb$ ^tools/debugger/pdb/linux-[0-9.]*-module/.*\.ko$ ^tools/debugger/pdb/linux-[0-9.]*-module/.*\.mod.c$ @@ -116,7 +116,6 @@ ^tools/ioemu/target-.*/Makefile$ ^tools/ioemu/target-.*/config\..*$ ^tools/ioemu/target-.*/qemu-dm$ -^tools/ioemu/target-.*/qemu-vgaram-bin$ ^tools/libxc/xen/.*$ ^tools/misc/cpuperf/cpuperf-perfcntr$ ^tools/misc/cpuperf/cpuperf-xen$ @@ -148,6 +147,7 @@ ^tools/xcs/xcsdump$ ^tools/xcutils/xc_restore$ ^tools/xcutils/xc_save$ +^tools/xenstat/xentop/xentop$ ^tools/xenstore/testsuite/tmp/.*$ ^tools/xenstore/xen$ ^tools/xenstore/xenstored$ diff -r 5f1ed597f107 -r 8799d14bef77 Config.mk --- a/Config.mk Wed Aug 24 02:43:18 
2005 +++ b/Config.mk Thu Aug 25 22:53:20 2005 @@ -7,13 +7,14 @@ # Tools to run on system hosting the build HOSTCC = gcc -HOSTCFLAGS = -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer +HOSTCFLAGS = -Wall -Werror -Wstrict-prototypes -O2 -fomit-frame-pointer AS = $(CROSS_COMPILE)as LD = $(CROSS_COMPILE)ld CC = $(CROSS_COMPILE)gcc CPP = $(CROSS_COMPILE)gcc -E AR = $(CROSS_COMPILE)ar +RANLIB = $(CROSS_COMPILE)ranlib NM = $(CROSS_COMPILE)nm STRIP = $(CROSS_COMPILE)strip OBJCOPY = $(CROSS_COMPILE)objcopy @@ -35,3 +36,15 @@ # Choose the best mirror to download linux kernel KERNEL_REPO = http://www.kernel.org + +# ACM_USE_SECURITY_POLICY is set to security policy of Xen +# Supported models are: +# ACM_NULL_POLICY (ACM will not be built with this policy) +# ACM_CHINESE_WALL_POLICY +# ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY +# ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY +ACM_USE_SECURITY_POLICY ?= ACM_NULL_POLICY + +# Optional components +XENSTAT_XENTOP ?= y + diff -r 5f1ed597f107 -r 8799d14bef77 Makefile --- a/Makefile Wed Aug 24 02:43:18 2005 +++ b/Makefile Thu Aug 25 22:53:20 2005 @@ -101,11 +101,6 @@ for i in $(ALLKERNELS) ; do $(MAKE) $$i-delete ; done for i in $(ALLSPARSETREES) ; do $(MAKE) $$i-mrproper ; done -install-twisted: - wget http://www.twistedmatrix.com/products/get-current.epy - tar -zxf Twisted-*.tar.gz - cd Twisted-* && python setup.py install - install-logging: LOGGING=logging-0.4.9.2 install-logging: [ -f $(LOGGING).tar.gz ] || wget http://www.red-dove.com/$(LOGGING).tar.gz @@ -149,7 +144,6 @@ @echo ' kclean - clean guest kernel build trees' @echo '' @echo 'Dependency installation targets:' - @echo ' install-twisted - install the Twisted Matrix Framework' @echo ' install-logging - install the Python Logging package' @echo ' install-iptables - install iptables tools' @echo '' @@ -178,6 +172,12 @@ rm -rf $(D)/usr/bin/xen* $(D)/usr/bin/miniterm rm -rf $(D)/boot/*xen* rm -rf $(D)/lib/modules/*xen* + rm -rf $(D)/usr/bin/cpuperf-perfcntr 
$(D)/usr/bin/cpuperf-xen + rm -rf $(D)/usr/bin/xc_shadow + rm -rf $(D)/usr/share/xen $(D)/usr/libexec/xen + rm -rf $(D)/usr/share/man/man1/xen* + rm -rf $(D)/usr/share/man/man8/xen* + rm -rf $(D)/usr/lib/xen # Legacy targets for compatibility linux24: diff -r 5f1ed597f107 -r 8799d14bef77 buildconfigs/Rules.mk --- a/buildconfigs/Rules.mk Wed Aug 24 02:43:18 2005 +++ b/buildconfigs/Rules.mk Thu Aug 25 22:53:20 2005 @@ -66,6 +66,7 @@ PATCHDIRS := $(wildcard patches/*-*) +ifneq ($(PATCHDIRS),) -include $(patsubst %,%/.makedep,$(PATCHDIRS)) $(patsubst patches/%,patches/%/.makedep,$(PATCHDIRS)): patches/%/.makedep: @@ -80,6 +81,7 @@ ([ -d patches/$* ] && \ for i in patches/$*/*.patch ; do ( cd $(@D) ; patch -p1 <../$$i || exit 1 ) ; done) || true touch $@ # update timestamp to avoid rebuild +endif %-build: $(MAKE) -f buildconfigs/mk.$* build @@ -115,7 +117,7 @@ ifeq ($(XEN_TARGET_X86_PAE),y) sed -e 's!^CONFIG_HIGHMEM4G=y$$!\# CONFIG_HIGHMEM4G is not set!;s!^\# CONFIG_HIGHMEM64G is not set$$!CONFIG_HIGHMEM64G=y!' $(CONFIG_FILE) > $(CONFIG_FILE)- && mv $(CONFIG_FILE)- $(CONFIG_FILE) else - @: # do nothing yet + grep '^CONFIG_HIGHMEM64G=y' $(CONFIG_FILE) >/dev/null && ( sed -e 's!^CONFIG_HIGHMEM64G=y$$!\# CONFIG_HIGHMEM64G is not set!;s!^\# CONFIG_HIGHMEM4G is not set$$!CONFIG_HIGHMEM4G=y!' $(CONFIG_FILE) > $(CONFIG_FILE)- && mv $(CONFIG_FILE)- $(CONFIG_FILE) ) || true endif # never delete any intermediate files. diff -r 5f1ed597f107 -r 8799d14bef77 docs/src/user.tex --- a/docs/src/user.tex Wed Aug 24 02:43:18 2005 +++ b/docs/src/user.tex Thu Aug 25 22:53:20 2005 @@ -1709,8 +1709,11 @@ For example: `com1=9600, 8n1, 0x408, 5' maps COM1 to a 9600-baud port, 8 data bits, no parity, 1 stop bit, I/O port base 0x408, IRQ 5. - If the I/O base and IRQ are standard (com1:0x3f8,4; - com2:0x2f8,3) then they need not be specified. + If some configuration options are standard (e.g., I/O base and IRQ), + then only a prefix of the full configuration string need be + specified. 
If the baud rate is pre-configured (e.g., by the + bootloader) then you can specify `auto' in place of a numeric baud + rate. \item [console=$<$specifier list$>$ ] Specify the destination for Xen console I/O. @@ -1760,7 +1763,7 @@ physical address in the memory map will be ignored. This parameter may be specified with a B, K, M or G suffix, representing bytes, kilobytes, megabytes and gigabytes respectively. The - default unit, if no suffix is specified, is bytes. + default unit, if no suffix is specified, is kilobytes. \item [dom0\_mem=xxx ] Set the amount of memory to be allocated to domain0. In Xen 3.x the parameter diff -r 5f1ed597f107 -r 8799d14bef77 extras/mini-os/include/hypervisor.h --- a/extras/mini-os/include/hypervisor.h Wed Aug 24 02:43:18 2005 +++ b/extras/mini-os/include/hypervisor.h Thu Aug 25 22:53:20 2005 @@ -80,16 +80,42 @@ static __inline__ int HYPERVISOR_mmu_update(mmu_update_t *req, int count, - int *success_count) -{ - int ret; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) : "0" (__HYPERVISOR_mmu_update), - _a1 (req), _a2 (count), _a3 (success_count) : "memory" ); - - return ret; -} + int *success_count, + domid_t domid) +{ + int ret; + unsigned long ign1, ign2, ign3, ign4; + + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4) + : "0" (__HYPERVISOR_mmu_update), "1" (req), "2" (count), + "3" (success_count), "4" (domid) + : "memory" ); + + return ret; +} + + +static __inline__ int HYPERVISOR_mmuext_op(struct mmuext_op *op, + int count, + int *success_count, + domid_t domid) +{ + int ret; + unsigned long ign1, ign2, ign3, ign4; + + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4) + : "0" (__HYPERVISOR_mmuext_op), "1" (op), "2" (count), + "3" (success_count), "4" (domid) + : "memory" ); + + return ret; +} + + static __inline__ int HYPERVISOR_set_gdt(unsigned long *frame_list, int entries) { diff -r 5f1ed597f107 -r 8799d14bef77 
extras/mini-os/include/mm.h --- a/extras/mini-os/include/mm.h Wed Aug 24 02:43:18 2005 +++ b/extras/mini-os/include/mm.h Thu Aug 25 22:53:20 2005 @@ -43,13 +43,27 @@ #define PADDR_MASK ((1UL << PADDR_BITS)-1) #define VADDR_MASK ((1UL << VADDR_BITS)-1) -#define pte_to_mfn(_pte) (((_pte) & (PADDR_MASK&PAGE_MASK)) >> PAGE_SHIFT) +#define pte_to_mfn(_pte) (((_pte) & (PADDR_MASK&PAGE_MASK)) >> L1_PAGETABLE_SHIFT) + +#endif + + + +#ifdef __i386__ + +#define L1_PAGETABLE_SHIFT 12 +#define L2_PAGETABLE_SHIFT 22 + +#define L1_PAGETABLE_ENTRIES 1024 +#define L2_PAGETABLE_ENTRIES 1024 +#endif /* Given a virtual address, get an entry offset into a page table. */ #define l1_table_offset(_a) \ (((_a) >> L1_PAGETABLE_SHIFT) & (L1_PAGETABLE_ENTRIES - 1)) #define l2_table_offset(_a) \ (((_a) >> L2_PAGETABLE_SHIFT) & (L2_PAGETABLE_ENTRIES - 1)) +#ifdef __x86_64__ #define l3_table_offset(_a) \ (((_a) >> L3_PAGETABLE_SHIFT) & (L3_PAGETABLE_ENTRIES - 1)) #define l4_table_offset(_a) \ @@ -67,13 +81,16 @@ #define _PAGE_PSE 0x080UL #define _PAGE_GLOBAL 0x100UL -#define PAGE_SHIFT 12 -#define PAGE_SIZE (1UL << PAGE_SHIFT) +#define L1_PROT (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED) +#define L2_PROT (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_USER) + +#define PAGE_SIZE (1UL << L1_PAGETABLE_SHIFT) +#define PAGE_SHIFT L1_PAGETABLE_SHIFT #define PAGE_MASK (~(PAGE_SIZE-1)) -#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) -#define PFN_DOWN(x) ((x) >> PAGE_SHIFT) -#define PFN_PHYS(x) ((x) << PAGE_SHIFT) +#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> L1_PAGETABLE_SHIFT) +#define PFN_DOWN(x) ((x) >> L1_PAGETABLE_SHIFT) +#define PFN_PHYS(x) ((x) << L1_PAGETABLE_SHIFT) /* to align the pointer to the (next) page boundary */ #define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) @@ -83,14 +100,14 @@ #define mfn_to_pfn(_mfn) (machine_to_phys_mapping[(_mfn)]) static __inline__ unsigned long phys_to_machine(unsigned long phys) { - unsigned long machine = pfn_to_mfn(phys >> 
PAGE_SHIFT); - machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK); + unsigned long machine = pfn_to_mfn(phys >> L1_PAGETABLE_SHIFT); + machine = (machine << L1_PAGETABLE_SHIFT) | (phys & ~PAGE_MASK); return machine; } static __inline__ unsigned long machine_to_phys(unsigned long machine) { - unsigned long phys = mfn_to_pfn(machine >> PAGE_SHIFT); - phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK); + unsigned long phys = mfn_to_pfn(machine >> L1_PAGETABLE_SHIFT); + phys = (phys << L1_PAGETABLE_SHIFT) | (machine & ~PAGE_MASK); return phys; } @@ -105,7 +122,10 @@ #define __va to_virt #define __pa to_phys +#define virt_to_pfn(_virt) (PFN_DOWN(to_phys(_virt))) + void init_mm(void); unsigned long alloc_pages(int order); +int is_mfn_mapped(unsigned long mfn); #endif /* _MM_H_ */ diff -r 5f1ed597f107 -r 8799d14bef77 extras/mini-os/include/time.h --- a/extras/mini-os/include/time.h Wed Aug 24 02:43:18 2005 +++ b/extras/mini-os/include/time.h Thu Aug 25 22:53:20 2005 @@ -28,7 +28,7 @@ * of real time into system time */ typedef s64 s_time_t; -#define NOW() ((s_time_t)get_s_time()) +#define NOW() ((s_time_t)monotonic_clock()) #define SECONDS(_s) (((s_time_t)(_s)) * 1000000000UL ) #define TENTHS(_ts) (((s_time_t)(_ts)) * 100000000UL ) #define HUNDREDTHS(_hs) (((s_time_t)(_hs)) * 10000000UL ) @@ -36,7 +36,8 @@ #define MICROSECS(_us) (((s_time_t)(_us)) * 1000UL ) #define Time_Max ((s_time_t) 0x7fffffffffffffffLL) #define FOREVER Time_Max - +#define NSEC_TO_USEC(_nsec) (_nsec / 1000UL) +#define NSEC_TO_SEC(_nsec) (_nsec / 1000000000ULL) /* wall clock time */ typedef long time_t; @@ -44,6 +45,11 @@ struct timeval { time_t tv_sec; /* seconds */ suseconds_t tv_usec; /* microseconds */ +}; + +struct timespec { + time_t ts_sec; + long ts_nsec; }; diff -r 5f1ed597f107 -r 8799d14bef77 extras/mini-os/kernel.c --- a/extras/mini-os/kernel.c Wed Aug 24 02:43:18 2005 +++ b/extras/mini-os/kernel.c Thu Aug 25 22:53:20 2005 @@ -132,22 +132,8 @@ i = 0; for ( ; ; ) { - if(i >= 1000) - { - 
{ - unsigned long saved; - __asm__ ("movl %%esp, %0" - :"=r"(saved) /* y is output operand */ - /* x is input operand */); -// :"a"); /* %eax is clobbered register */ - printk("ESP=0x%lx\n", saved); - } - - printk("1000 bloks\n"); - i=0; - } // HYPERVISOR_yield(); - block(1); + block(100); i++; } } diff -r 5f1ed597f107 -r 8799d14bef77 extras/mini-os/mm.c --- a/extras/mini-os/mm.c Wed Aug 24 02:43:18 2005 +++ b/extras/mini-os/mm.c Thu Aug 25 22:53:20 2005 @@ -5,9 +5,9 @@ * * File: mm.c * Author: Rolf Neugebauer (neugebar@xxxxxxxxxxxxx) - * Changes: + * Changes: Grzegorz Milos * - * Date: Aug 2003 + * Date: Aug 2003, chages Aug 2005 * * Environment: Xen Minimal OS * Description: memory management related functions @@ -41,86 +41,18 @@ #include <types.h> #include <lib.h> + +#ifdef MM_DEBUG +#define DEBUG(_f, _a...) \ + printk("MINI_OS(file=mm.c, line=%d) " _f "\n", __LINE__, ## _a) +#else +#define DEBUG(_f, _a...) ((void)0) +#endif + unsigned long *phys_to_machine_mapping; extern char *stack; extern char _text, _etext, _edata, _end; -static void init_page_allocator(unsigned long min, unsigned long max); - -void init_mm(void) -{ - - unsigned long start_pfn, max_pfn, max_free_pfn; - - unsigned long *pgd = (unsigned long *)start_info.pt_base; - - printk("MM: Init\n"); - - printk(" _text: %p\n", &_text); - printk(" _etext: %p\n", &_etext); - printk(" _edata: %p\n", &_edata); - printk(" stack start: %p\n", &stack); - printk(" _end: %p\n", &_end); - - /* set up minimal memory infos */ - start_pfn = PFN_UP(to_phys(&_end)); - max_pfn = start_info.nr_pages; - - printk(" start_pfn: %lx\n", start_pfn); - printk(" max_pfn: %lx\n", max_pfn); - - /* - * we know where free tables start (start_pfn) and how many we - * have (max_pfn). - * - * Currently the hypervisor stores page tables it providesin the - * high region of the this memory range. 
- * - * next we work out how far down this goes (max_free_pfn) - * - * XXX this assumes the hypervisor provided page tables to be in - * the upper region of our initial memory. I don't know if this - * is always true. - */ - - max_free_pfn = PFN_DOWN(to_phys(pgd)); -#ifdef __i386__ - { - unsigned long *pgd = (unsigned long *)start_info.pt_base; - unsigned long pte; - int i; - printk(" pgd(pa(pgd)): %lx(%lx)", (u_long)pgd, to_phys(pgd)); - - for ( i = 0; i < (HYPERVISOR_VIRT_START>>22); i++ ) - { - unsigned long pgde = *pgd++; - if ( !(pgde & 1) ) continue; - pte = machine_to_phys(pgde & PAGE_MASK); - printk(" PT(%x): %lx(%lx)", i, (u_long)to_virt(pte), pte); - if (PFN_DOWN(pte) <= max_free_pfn) - max_free_pfn = PFN_DOWN(pte); - } - } - max_free_pfn--; - printk(" max_free_pfn: %lx\n", max_free_pfn); - - /* - * now we can initialise the page allocator - */ - printk("MM: Initialise page allocator for %lx(%lx)-%lx(%lx)\n", - (u_long)to_virt(PFN_PHYS(start_pfn)), PFN_PHYS(start_pfn), - (u_long)to_virt(PFN_PHYS(max_free_pfn)), PFN_PHYS(max_free_pfn)); - init_page_allocator(PFN_PHYS(start_pfn), PFN_PHYS(max_free_pfn)); -#endif - - - /* Now initialise the physical->machine mapping table. */ - - - printk("MM: done\n"); - - -} /********************* * ALLOCATION BITMAP @@ -213,6 +145,59 @@ #define round_pgdown(_p) ((_p)&PAGE_MASK) #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK) + +#ifdef MM_DEBUG +/* + * Prints allocation[0/1] for @nr_pages, starting at @start + * address (virtual). + */ +static void print_allocation(void *start, int nr_pages) +{ + unsigned long pfn_start = virt_to_pfn(start); + int count; + for(count = 0; count < nr_pages; count++) + if(allocated_in_map(pfn_start + count)) printk("1"); + else printk("0"); + + printk("\n"); +} + +/* + * Prints chunks (making them with letters) for @nr_pages starting + * at @start (virtual). 
+ */ +static void print_chunks(void *start, int nr_pages) +{ + char chunks[1001], current='A'; + int order, count; + chunk_head_t *head; + unsigned long pfn_start = virt_to_pfn(start); + + memset(chunks, (int)'_', 1000); + if(nr_pages > 1000) + { + DEBUG("Can only pring 1000 pages. Increase buffer size."); + } + + for(order=0; order < FREELIST_SIZE; order++) + { + head = free_head[order]; + while(!FREELIST_EMPTY(head)) + { + for(count = 0; count < 1<< head->level; count++) + { + if(count + virt_to_pfn(head) - pfn_start < 1000) + chunks[count + virt_to_pfn(head) - pfn_start] = current; + } + head = head->next; + current++; + } + } + chunks[nr_pages] = '\0'; + printk("%s\n", chunks); +} +#endif + /* @@ -328,3 +313,198 @@ return 0; } +void free_pages(void *pointer, int order) +{ + chunk_head_t *freed_ch, *to_merge_ch; + chunk_tail_t *freed_ct; + unsigned long mask; + + /* First free the chunk */ + map_free(virt_to_pfn(pointer), 1 << order); + + /* Create free chunk */ + freed_ch = (chunk_head_t *)pointer; + freed_ct = (chunk_tail_t *)((char *)pointer + (1<<(order + PAGE_SHIFT)))-1; + + /* Now, possibly we can conseal chunks together */ + while(order < FREELIST_SIZE) + { + mask = 1 << (order + PAGE_SHIFT); + if((unsigned long)freed_ch & mask) + { + to_merge_ch = (chunk_head_t *)((char *)freed_ch - mask); + if(allocated_in_map(virt_to_pfn(to_merge_ch)) || + to_merge_ch->level != order) + break; + + /* Merge with predecessor */ + freed_ch = to_merge_ch; + } + else + { + to_merge_ch = (chunk_head_t *)((char *)freed_ch + mask); + if(allocated_in_map(virt_to_pfn(to_merge_ch)) || + to_merge_ch->level != order) + break; + + /* Merge with successor */ + freed_ct = (chunk_tail_t *)((char *)to_merge_ch + mask); + } + + /* We are commited to merging, unlink the chunk */ + *(to_merge_ch->pprev) = to_merge_ch->next; + to_merge_ch->next->pprev = to_merge_ch->pprev; + + order++; + } + + /* Link the new chunk */ + freed_ch->level = order; + freed_ch->next = free_head[order]; + 
freed_ch->pprev = &free_head[order]; + freed_ct->level = order; + + freed_ch->next->pprev = &freed_ch->next; + free_head[order] = freed_ch; + +} +void build_pagetable(unsigned long *start_pfn, unsigned long *max_pfn) +{ + unsigned long pfn_to_map, pt_frame; + unsigned long mach_ptd, max_mach_ptd; + int count; + unsigned long mach_pte, virt_pte; + unsigned long *ptd = (unsigned long *)start_info.pt_base; + mmu_update_t mmu_updates[L1_PAGETABLE_ENTRIES + 1]; + struct mmuext_op pin_request; + + /* Firstly work out what is the first pfn that is not yet in page tables + NB. Assuming that builder fills whole pt_frames (which it does at the + moment) + */ + pfn_to_map = (start_info.nr_pt_frames - 1) * L1_PAGETABLE_ENTRIES; + DEBUG("start_pfn=%ld, first pfn_to_map %ld, max_pfn=%ld", + *start_pfn, pfn_to_map, *max_pfn); + + /* Machine address of page table directory */ + mach_ptd = phys_to_machine(to_phys(start_info.pt_base)); + mach_ptd += sizeof(void *) * + l2_table_offset((unsigned long)to_virt(PFN_PHYS(pfn_to_map))); + + max_mach_ptd = sizeof(void *) * + l2_table_offset((unsigned long)to_virt(PFN_PHYS(*max_pfn))); + + /* Check that we are not trying to access Xen region */ + if(max_mach_ptd > sizeof(void *) * l2_table_offset(HYPERVISOR_VIRT_START)) + { + printk("WARNING: mini-os will not use all the memory supplied\n"); + max_mach_ptd = sizeof(void *) * l2_table_offset(HYPERVISOR_VIRT_START); + *max_pfn = virt_to_pfn(HYPERVISOR_VIRT_START - PAGE_SIZE); + } + max_mach_ptd += phys_to_machine(to_phys(start_info.pt_base)); + DEBUG("Max_mach_ptd 0x%lx", max_mach_ptd); + + pt_frame = *start_pfn; + /* Should not happen - no empty, mapped pages */ + if(pt_frame >= pfn_to_map) + { + printk("ERROR: Not even a single empty, mapped page\n"); + *(int*)0=0; + } + + while(mach_ptd < max_mach_ptd) + { + /* Correct protection needs to be set for the new page table frame */ + virt_pte = (unsigned long)to_virt(PFN_PHYS(pt_frame)); + mach_pte = ptd[l2_table_offset(virt_pte)] & 
~(PAGE_SIZE-1); + mach_pte += sizeof(void *) * l1_table_offset(virt_pte); + DEBUG("New page table page: pfn=0x%lx, mfn=0x%lx, virt_pte=0x%lx, " + "mach_pte=0x%lx", pt_frame, pfn_to_mfn(pt_frame), + virt_pte, mach_pte); + + /* Update the entry */ + mmu_updates[0].ptr = mach_pte; + mmu_updates[0].val = pfn_to_mfn(pt_frame) << PAGE_SHIFT | + (L1_PROT & ~_PAGE_RW); + if(HYPERVISOR_mmu_update(mmu_updates, 1, NULL, DOMID_SELF) < 0) + { + printk("PTE for new page table page could not be updated\n"); + *(int*)0=0; + } + + /* Pin the page to provide correct protection */ + pin_request.cmd = MMUEXT_PIN_L1_TABLE; + pin_request.mfn = pfn_to_mfn(pt_frame); + if(HYPERVISOR_mmuext_op(&pin_request, 1, NULL, DOMID_SELF) < 0) + { + printk("ERROR: pinning failed\n"); + *(int*)0=0; + } + + /* Now fill the new page table page with entries. + Update the page directory as well. */ + count = 0; + mmu_updates[count].ptr = mach_ptd; + mmu_updates[count].val = pfn_to_mfn(pt_frame) << PAGE_SHIFT | + L2_PROT; + count++; + mach_ptd += sizeof(void *); + mach_pte = phys_to_machine(PFN_PHYS(pt_frame++)); + + for(;count <= L1_PAGETABLE_ENTRIES && pfn_to_map <= *max_pfn; count++) + { + mmu_updates[count].ptr = mach_pte; + mmu_updates[count].val = + pfn_to_mfn(pfn_to_map++) << PAGE_SHIFT | L1_PROT; + if(count == 1) DEBUG("mach_pte 0x%lx", mach_pte); + mach_pte += sizeof(void *); + } + if(HYPERVISOR_mmu_update(mmu_updates, count, NULL, DOMID_SELF) < 0) + { + printk("ERROR: mmu_update failed\n"); + *(int*)0=0; + } + (*start_pfn)++; + } + + *start_pfn = pt_frame; +} + +void init_mm(void) +{ + + unsigned long start_pfn, max_pfn; + + printk("MM: Init\n"); + + printk(" _text: %p\n", &_text); + printk(" _etext: %p\n", &_etext); + printk(" _edata: %p\n", &_edata); + printk(" stack start: %p\n", &stack); + printk(" _end: %p\n", &_end); + + /* set up minimal memory infos */ + phys_to_machine_mapping = (unsigned long *)start_info.mfn_list; + + /* First page follows page table pages and 3 more pages (store page 
etc) */ + start_pfn = PFN_UP(__pa(start_info.pt_base)) + start_info.nr_pt_frames + 3; + max_pfn = start_info.nr_pages; + + printk(" start_pfn: %lx\n", start_pfn); + printk(" max_pfn: %lx\n", max_pfn); + + + build_pagetable(&start_pfn, &max_pfn); + +#ifdef __i386__ + /* + * now we can initialise the page allocator + */ + printk("MM: Initialise page allocator for %lx(%lx)-%lx(%lx)\n", + (u_long)to_virt(PFN_PHYS(start_pfn)), PFN_PHYS(start_pfn), + (u_long)to_virt(PFN_PHYS(max_pfn)), PFN_PHYS(max_pfn)); + init_page_allocator(PFN_PHYS(start_pfn), PFN_PHYS(max_pfn)); +#endif + + printk("MM: done\n"); +} diff -r 5f1ed597f107 -r 8799d14bef77 extras/mini-os/time.c --- a/extras/mini-os/time.c Wed Aug 24 02:43:18 2005 +++ b/extras/mini-os/time.c Thu Aug 25 22:53:20 2005 @@ -43,19 +43,20 @@ * Time functions *************************************************************************/ -/* Cached *multiplier* to convert TSC counts to microseconds. - * (see the equation below). - * Equal to 2^32 * (1 / (clocks per usec) ). - * Initialized in time_init. - */ -static unsigned long fast_gettimeoffset_quotient; - - /* These are peridically updated in shared_info, and then copied here. */ -static u32 shadow_tsc_stamp; -static s64 shadow_system_time; -static u32 shadow_time_version; -static struct timeval shadow_tv; +struct shadow_time_info { + u64 tsc_timestamp; /* TSC at last update of time vals. */ + u64 system_timestamp; /* Time, in nanosecs, since boot. 
*/ + u32 tsc_to_nsec_mul; + u32 tsc_to_usec_mul; + int tsc_shift; + u32 version; +}; +static struct timespec shadow_ts; +static u32 shadow_ts_version; + +static struct shadow_time_info shadow; + #ifndef rmb #define rmb() __asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory") @@ -63,116 +64,150 @@ #define HANDLE_USEC_OVERFLOW(_tv) \ do { \ - while ( (_tv).tv_usec >= 1000000 ) \ + while ( (_tv)->tv_usec >= 1000000 ) \ { \ - (_tv).tv_usec -= 1000000; \ - (_tv).tv_sec++; \ + (_tv)->tv_usec -= 1000000; \ + (_tv)->tv_sec++; \ } \ } while ( 0 ) +static inline int time_values_up_to_date(void) +{ + struct vcpu_time_info *src = &HYPERVISOR_shared_info->vcpu_time[0]; + + return (shadow.version == src->version); +} + + +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. + */ +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) +{ + u64 product; +#ifdef __i386__ + u32 tmp1, tmp2; +#endif + + if ( shift < 0 ) + delta >>= -shift; + else + delta <<= shift; + +#ifdef __i386__ + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "add %4,%%eax ; " + "xor %5,%5 ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); +#else + __asm__ ( + "mul %%rdx ; shrd $32,%%rdx,%%rax" + : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); +#endif + + return product; +} + + +static unsigned long get_nsec_offset(void) +{ + u64 now, delta; + rdtscll(now); + delta = now - shadow.tsc_timestamp; + return scale_delta(delta, shadow.tsc_to_nsec_mul, shadow.tsc_shift); +} + + static void get_time_values_from_xen(void) { - do { - shadow_time_version = HYPERVISOR_shared_info->time_version2; - rmb(); - shadow_tv.tv_sec = HYPERVISOR_shared_info->wc_sec; - shadow_tv.tv_usec = HYPERVISOR_shared_info->wc_usec; - shadow_tsc_stamp = (u32)HYPERVISOR_shared_info->tsc_timestamp; - shadow_system_time = HYPERVISOR_shared_info->system_time; 
- rmb(); - } - while ( shadow_time_version != HYPERVISOR_shared_info->time_version1 ); -} - - -#define TIME_VALUES_UP_TO_DATE \ - (shadow_time_version == HYPERVISOR_shared_info->time_version2) - -static u32 get_time_delta_usecs(void) -{ - register unsigned long eax, edx; - - /* Read the Time Stamp Counter */ - - rdtsc(eax,edx); - - /* .. relative to previous jiffy (32 bits is enough) */ - eax -= shadow_tsc_stamp; - - /* - * Time offset = (tsc_low delta) * fast_gettimeoffset_quotient - * = (tsc_low delta) * (usecs_per_clock) - * = (tsc_low delta) * (usecs_per_jiffy / clocks_per_jiffy) - * - * Using a mull instead of a divl saves up to 31 clock cycles - * in the critical path. - */ - - __asm__("mull %2" - :"=a" (eax), "=d" (edx) - :"rm" (fast_gettimeoffset_quotient), - "0" (eax)); - - /* our adjusted time offset in microseconds */ - return edx; -} - -s64 get_s_time (void) -{ - u64 u_delta; - s64 ret; - - again: - - u_delta = get_time_delta_usecs(); - ret = shadow_system_time + (1000 * u_delta); - - if ( unlikely(!TIME_VALUES_UP_TO_DATE) ) - { - /* - * We may have blocked for a long time, rendering our calculations - * invalid (e.g. the time delta may have overflowed). Detect that - * and recalculate with fresh values. - */ - get_time_values_from_xen(); - goto again; - } - - return ret; -} + struct vcpu_time_info *src = &HYPERVISOR_shared_info->vcpu_time[0]; + + do { + shadow.version = src->version; + rmb(); + shadow.tsc_timestamp = src->tsc_timestamp; + shadow.system_timestamp = src->system_time; + shadow.tsc_to_nsec_mul = src->tsc_to_system_mul; + shadow.tsc_shift = src->tsc_shift; + rmb(); + } + while ((src->version & 1) | (shadow.version ^ src->version)); + + shadow.tsc_to_usec_mul = shadow.tsc_to_nsec_mul / 1000; +} + + + + +/* monotonic_clock(): returns # of nanoseconds passed since time_init() + * Note: This function is required to return accurate + * time even in the absence of multiple timer ticks. 
+ */ +u64 monotonic_clock(void) +{ + u64 time; + u32 local_time_version; + + do { + local_time_version = shadow.version; + rmb(); + time = shadow.system_timestamp + get_nsec_offset(); + if (!time_values_up_to_date()) + get_time_values_from_xen(); + rmb(); + } while (local_time_version != shadow.version); + + return time; +} + +static void update_wallclock(void) +{ + shared_info_t *s = HYPERVISOR_shared_info; + + do { + shadow_ts_version = s->wc_version; + rmb(); + shadow_ts.ts_sec = s->wc_sec; + shadow_ts.ts_nsec = s->wc_nsec; + rmb(); + } + while ((s->wc_version & 1) | (shadow_ts_version ^ s->wc_version)); +} + void gettimeofday(struct timeval *tv) { - struct timeval _tv; - - do { - get_time_values_from_xen(); - _tv.tv_usec = get_time_delta_usecs(); - _tv.tv_sec = shadow_tv.tv_sec; - _tv.tv_usec += shadow_tv.tv_usec; - } - while ( unlikely(!TIME_VALUES_UP_TO_DATE) ); - - HANDLE_USEC_OVERFLOW(_tv); - *tv = _tv; -} + u64 nsec = monotonic_clock(); + nsec += shadow_ts.ts_nsec; + + + tv->tv_sec = shadow_ts.ts_sec; + tv->tv_sec += NSEC_TO_SEC(nsec); + tv->tv_usec = NSEC_TO_USEC(nsec % 1000000000UL); +} + static void print_current_time(void) { - struct timeval tv; - - get_time_values_from_xen(); + struct timeval tv; gettimeofday(&tv); printk("T(s=%ld us=%ld)\n", tv.tv_sec, tv.tv_usec); } + void block(u32 millisecs) { struct timeval tv; gettimeofday(&tv); - //printk("tv.tv_sec=%ld, tv.tv_usec=%ld, shadow_system_time=%lld\n", tv.tv_sec, tv.tv_usec, shadow_system_time ); - HYPERVISOR_set_timer_op(get_s_time() + 1000000LL * (s64) millisecs); + HYPERVISOR_set_timer_op(monotonic_clock() + 1000000LL * (s64) millisecs); HYPERVISOR_block(); } @@ -185,7 +220,7 @@ static int i; get_time_values_from_xen(); - + update_wallclock(); i++; if (i >= 1000) { print_current_time(); @@ -197,24 +232,5 @@ void init_time(void) { - u64 __cpu_khz; - unsigned long cpu_khz; - - __cpu_khz = HYPERVISOR_shared_info->cpu_freq; - - cpu_khz = (u32) (__cpu_khz/1000); - - printk("Xen reported: %lu.%03lu MHz 
processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - /* (10^6 * 2^32) / cpu_hz = (10^3 * 2^32) / cpu_khz = - (2^32 * 1 / (clocks/us)) */ - { - unsigned long eax=0, edx=1000; - __asm__("divl %2" - :"=a" (fast_gettimeoffset_quotient), "=d" (edx) - :"r" (cpu_khz), - "0" (eax), "1" (edx)); - } - bind_virq(VIRQ_TIMER, &timer_handler); } diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.4-xen-sparse/arch/xen/Makefile --- a/linux-2.4-xen-sparse/arch/xen/Makefile Wed Aug 24 02:43:18 2005 +++ b/linux-2.4-xen-sparse/arch/xen/Makefile Thu Aug 25 22:53:20 2005 @@ -61,7 +61,6 @@ SUBDIRS += arch/xen/drivers/evtchn SUBDIRS += arch/xen/drivers/blkif SUBDIRS += arch/xen/drivers/netif -#SUBDIRS += arch/xen/drivers/usbif SUBDIRS += arch/xen/drivers/balloon ifdef CONFIG_XEN_PRIVILEGED_GUEST SUBDIRS += arch/xen/drivers/dom0 @@ -72,7 +71,6 @@ CORE_FILES += arch/xen/drivers/console/drv.o DRIVERS += arch/xen/drivers/blkif/drv.o DRIVERS += arch/xen/drivers/netif/drv.o -DRIVERS += arch/xen/drivers/usbif/drv.o ifdef CONFIG_XEN_PRIVILEGED_GUEST CORE_FILES += arch/xen/drivers/dom0/drv.o endif diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.4-xen-sparse/arch/xen/config.in --- a/linux-2.4-xen-sparse/arch/xen/config.in Wed Aug 24 02:43:18 2005 +++ b/linux-2.4-xen-sparse/arch/xen/config.in Thu Aug 25 22:53:20 2005 @@ -16,14 +16,10 @@ comment 'Xen' bool 'Support for privileged operations (domain 0)' CONFIG_XEN_PRIVILEGED_GUEST bool 'Device-driver domain (physical device access)' CONFIG_XEN_PHYSDEV_ACCESS -if [ "$CONFIG_XEN_PHYSDEV_ACCESS" = "y" ]; then - bool 'USB-device backend driver' CONFIG_XEN_USB_BACKEND -fi bool 'Scrub memory before freeing it to Xen' CONFIG_XEN_SCRUB_PAGES bool 'Network-device frontend driver' CONFIG_XEN_NETDEV_FRONTEND bool 'Block-device frontend driver' CONFIG_XEN_BLKDEV_FRONTEND bool 'Block-device uses grant tables' CONFIG_XEN_BLKDEV_GRANT -bool 'USB-device frontend driver' CONFIG_XEN_USB_FRONTEND endmenu # The IBM S/390 patch needs this. 
define_bool CONFIG_NO_IDLE_HZ y @@ -267,7 +263,7 @@ source drivers/char/Config.in -if [ "$CONFIG_XEN_PHYSDEV_ACCESS" = "y" -o "$CONFIG_XEN_USB_FRONTEND" = "y" ]; then +if [ "$CONFIG_XEN_PHYSDEV_ACCESS" = "y" ]; then source drivers/media/Config.in fi @@ -302,14 +298,8 @@ endmenu fi -if [ "$CONFIG_XEN_PHYSDEV_ACCESS" = "y" -o "$CONFIG_XEN_USB_FRONTEND" = "y" ]; then - if [ "$CONFIG_XEN_USB_FRONTEND" = "y" -o "$CONFIG_XEN_USB_BACKEND" = "y" ]; then - define_bool CONFIG_USB y - fi +if [ "$CONFIG_XEN_PHYSDEV_ACCESS" = "y" ]; then source drivers/usb/Config.in -fi - -if [ "$CONFIG_XEN_PHYSDEV_ACCESS" = "y" ]; then source net/bluetooth/Config.in fi diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.4-xen-sparse/mkbuildtree --- a/linux-2.4-xen-sparse/mkbuildtree Wed Aug 24 02:43:18 2005 +++ b/linux-2.4-xen-sparse/mkbuildtree Thu Aug 25 22:53:20 2005 @@ -103,9 +103,8 @@ rm -f mkbuildtree set ${RS}/../linux-2.6-xen-sparse -[ "$1" == "${RS}/../linux-2.6-xen-parse" ] && { echo "no Linux 2.6 sparse tree at ${RS}/../linux-2.6-xen-sparse"; exit 1; } +[ "$1" == "${RS}/../linux-2.6-xen-sparse" ] && { echo "no Linux 2.6 sparse tree at ${RS}/../linux-2.6-xen-sparse"; exit 1; } LINUX_26="$1" - # Create links to the shared definitions of the Xen interfaces. rm -rf ${AD}/include/asm-xen/xen-public diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/Kconfig --- a/linux-2.6-xen-sparse/arch/xen/Kconfig Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/Kconfig Thu Aug 25 22:53:20 2005 @@ -61,15 +61,6 @@ with the blktap. This option will be removed as the block drivers are modified to use grant tables. -config XEN_BLKDEV_GRANT - bool "Grant table substrate for block drivers" - depends on !XEN_BLKDEV_TAP_BE - default y - help - This introduces the use of grant tables as a data exhange mechanism - between the frontend and backend block drivers. This currently - conflicts with the block tap. 
- config XEN_NETDEV_BACKEND bool "Network-device backend driver" depends on XEN_PHYSDEV_ACCESS diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig_x86_32 --- a/linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig_x86_32 Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig_x86_32 Thu Aug 25 22:53:20 2005 @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit # Linux kernel version: 2.6.12-xen0 -# Mon Jul 25 09:48:34 2005 +# Wed Aug 3 09:54:56 2005 # CONFIG_XEN=y CONFIG_ARCH_XEN=y @@ -14,12 +14,11 @@ CONFIG_XEN_PHYSDEV_ACCESS=y CONFIG_XEN_BLKDEV_BACKEND=y # CONFIG_XEN_BLKDEV_TAP_BE is not set -CONFIG_XEN_BLKDEV_GRANT=y CONFIG_XEN_NETDEV_BACKEND=y CONFIG_XEN_BLKDEV_FRONTEND=y CONFIG_XEN_NETDEV_FRONTEND=y -#CONFIG_XEN_NETDEV_GRANT_TX=y -#CONFIG_XEN_NETDEV_GRANT_RX=y +CONFIG_XEN_NETDEV_GRANT_TX=y +CONFIG_XEN_NETDEV_GRANT_RX=y # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set # CONFIG_XEN_BLKDEV_TAP is not set # CONFIG_XEN_SHADOW_MODE is not set @@ -93,11 +92,11 @@ # CONFIG_M586 is not set # CONFIG_M586TSC is not set # CONFIG_M586MMX is not set -# CONFIG_M686 is not set +CONFIG_M686=y # CONFIG_MPENTIUMII is not set # CONFIG_MPENTIUMIII is not set # CONFIG_MPENTIUMM is not set -CONFIG_MPENTIUM4=y +# CONFIG_MPENTIUM4 is not set # CONFIG_MK6 is not set # CONFIG_MK7 is not set # CONFIG_MK8 is not set @@ -112,15 +111,15 @@ # CONFIG_X86_GENERIC is not set CONFIG_X86_CMPXCHG=y CONFIG_X86_XADD=y -CONFIG_X86_L1_CACHE_SHIFT=7 +CONFIG_X86_L1_CACHE_SHIFT=5 CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_CALIBRATE_DELAY=y +CONFIG_X86_PPRO_FENCE=y CONFIG_X86_WP_WORKS_OK=y CONFIG_X86_INVLPG=y CONFIG_X86_BSWAP=y CONFIG_X86_POPAD_OK=y CONFIG_X86_GOOD_APIC=y -CONFIG_X86_INTEL_USERCOPY=y CONFIG_X86_USE_PPRO_CHECKSUM=y # CONFIG_HPET_TIMER is not set # CONFIG_HPET_EMULATE_RTC is not set @@ -130,6 +129,7 @@ # CONFIG_X86_REBOOTFIXUPS is not set CONFIG_MICROCODE=y CONFIG_X86_CPUID=y +CONFIG_SWIOTLB=y # 
# Firmware Drivers @@ -540,7 +540,7 @@ # CONFIG_IP_NF_MATCH_STATE is not set # CONFIG_IP_NF_MATCH_CONNTRACK is not set # CONFIG_IP_NF_MATCH_OWNER is not set -# CONFIG_IP_NF_MATCH_PHYSDEV is not set +CONFIG_IP_NF_MATCH_PHYSDEV=y # CONFIG_IP_NF_MATCH_ADDRTYPE is not set # CONFIG_IP_NF_MATCH_REALM is not set # CONFIG_IP_NF_MATCH_SCTP is not set @@ -688,7 +688,7 @@ # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set # CONFIG_R8169 is not set -# CONFIG_SK98LIN is not set +CONFIG_SK98LIN=y # CONFIG_VIA_VELOCITY is not set CONFIG_TIGON3=y # CONFIG_BNX2 is not set diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig_x86_64 --- a/linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig_x86_64 Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/configs/xen0_defconfig_x86_64 Thu Aug 25 22:53:20 2005 @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.12-xen0 -# Wed Jun 29 10:01:20 2005 +# Linux kernel version: 2.6.12.4-xen0 +# Mon Aug 15 18:57:19 2005 # CONFIG_XEN=y CONFIG_ARCH_XEN=y @@ -14,10 +14,11 @@ CONFIG_XEN_PHYSDEV_ACCESS=y CONFIG_XEN_BLKDEV_BACKEND=y # CONFIG_XEN_BLKDEV_TAP_BE is not set -CONFIG_XEN_BLKDEV_GRANT=y CONFIG_XEN_NETDEV_BACKEND=y CONFIG_XEN_BLKDEV_FRONTEND=y CONFIG_XEN_NETDEV_FRONTEND=y +CONFIG_XEN_NETDEV_GRANT_TX=y +CONFIG_XEN_NETDEV_GRANT_RX=y # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set # CONFIG_XEN_BLKDEV_TAP is not set # CONFIG_XEN_SHADOW_MODE is not set @@ -50,6 +51,7 @@ # CONFIG_IKCONFIG is not set # CONFIG_EMBEDDED is not set CONFIG_KALLSYMS=y +# CONFIG_KALLSYMS_ALL is not set # CONFIG_KALLSYMS_EXTRA_PASS is not set CONFIG_PRINTK=y CONFIG_BUG=y @@ -116,9 +118,11 @@ CONFIG_GENERIC_CPU=y CONFIG_X86_L1_CACHE_BYTES=128 # CONFIG_X86_TSC is not set +CONFIG_X86_XEN_GENAPIC=y # CONFIG_X86_MSR is not set # CONFIG_GART_IOMMU is not set CONFIG_DUMMY_IOMMU=y +CONFIG_SWIOTLB=y # CONFIG_X86_MCE is not set # @@ -160,6 +164,7 @@ CONFIG_STANDALONE=y # 
CONFIG_PREVENT_FIRMWARE_BUILD is not set # CONFIG_FW_LOADER is not set +# CONFIG_DEBUG_DRIVER is not set # # Memory Technology Devices (MTD) @@ -369,7 +374,23 @@ # # Multi-device support (RAID and LVM) # -# CONFIG_MD is not set +CONFIG_MD=y +CONFIG_BLK_DEV_MD=y +CONFIG_MD_LINEAR=y +CONFIG_MD_RAID0=y +CONFIG_MD_RAID1=y +# CONFIG_MD_RAID10 is not set +# CONFIG_MD_RAID5 is not set +# CONFIG_MD_RAID6 is not set +CONFIG_MD_MULTIPATH=y +# CONFIG_MD_FAULTY is not set +CONFIG_BLK_DEV_DM=y +CONFIG_DM_CRYPT=y +CONFIG_DM_SNAPSHOT=y +CONFIG_DM_MIRROR=y +# CONFIG_DM_ZERO is not set +CONFIG_DM_MULTIPATH=y +CONFIG_DM_MULTIPATH_EMC=y # # Fusion MPT device support @@ -458,7 +479,7 @@ # CONFIG_IP_NF_MATCH_STATE is not set # CONFIG_IP_NF_MATCH_CONNTRACK is not set # CONFIG_IP_NF_MATCH_OWNER is not set -# CONFIG_IP_NF_MATCH_PHYSDEV is not set +CONFIG_IP_NF_MATCH_PHYSDEV=y # CONFIG_IP_NF_MATCH_ADDRTYPE is not set # CONFIG_IP_NF_MATCH_REALM is not set # CONFIG_IP_NF_MATCH_SCTP is not set @@ -589,7 +610,7 @@ # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set # CONFIG_R8169 is not set -# CONFIG_SK98LIN is not set +CONFIG_SK98LIN=y # CONFIG_VIA_VELOCITY is not set CONFIG_TIGON3=y # CONFIG_BNX2 is not set @@ -786,7 +807,107 @@ # CONFIG_USB_ARCH_HAS_HCD=y CONFIG_USB_ARCH_HAS_OHCI=y -# CONFIG_USB is not set +CONFIG_USB=y +# CONFIG_USB_DEBUG is not set + +# +# Miscellaneous USB options +# +# CONFIG_USB_DEVICEFS is not set +# CONFIG_USB_BANDWIDTH is not set +# CONFIG_USB_DYNAMIC_MINORS is not set +# CONFIG_USB_OTG is not set + +# +# USB Host Controller Drivers +# +# CONFIG_USB_EHCI_HCD is not set +CONFIG_USB_OHCI_HCD=y +# CONFIG_USB_OHCI_BIG_ENDIAN is not set +CONFIG_USB_OHCI_LITTLE_ENDIAN=y +CONFIG_USB_UHCI_HCD=y +# CONFIG_USB_SL811_HCD is not set + +# +# USB Device Class drivers +# +# CONFIG_USB_BLUETOOTH_TTY is not set +# CONFIG_USB_ACM is not set +# CONFIG_USB_PRINTER is not set + +# +# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' may also be needed; see USB_STORAGE Help 
for more information +# +# CONFIG_USB_STORAGE is not set + +# +# USB Input Devices +# +CONFIG_USB_HID=y +CONFIG_USB_HIDINPUT=y +# CONFIG_HID_FF is not set +# CONFIG_USB_HIDDEV is not set +# CONFIG_USB_AIPTEK is not set +# CONFIG_USB_WACOM is not set +# CONFIG_USB_KBTAB is not set +# CONFIG_USB_POWERMATE is not set +# CONFIG_USB_MTOUCH is not set +# CONFIG_USB_EGALAX is not set +# CONFIG_USB_XPAD is not set +# CONFIG_USB_ATI_REMOTE is not set + +# +# USB Imaging devices +# +# CONFIG_USB_MDC800 is not set +# CONFIG_USB_MICROTEK is not set + +# +# USB Multimedia devices +# +# CONFIG_USB_DABUSB is not set + +# +# Video4Linux support is needed for USB Multimedia device support +# + +# +# USB Network Adapters +# +# CONFIG_USB_CATC is not set +# CONFIG_USB_KAWETH is not set +# CONFIG_USB_PEGASUS is not set +# CONFIG_USB_RTL8150 is not set +# CONFIG_USB_USBNET is not set +CONFIG_USB_MON=y + +# +# USB port drivers +# + +# +# USB Serial Converter support +# +# CONFIG_USB_SERIAL is not set + +# +# USB Miscellaneous drivers +# +# CONFIG_USB_EMI62 is not set +# CONFIG_USB_EMI26 is not set +# CONFIG_USB_AUERSWALD is not set +# CONFIG_USB_RIO500 is not set +# CONFIG_USB_LEGOTOWER is not set +# CONFIG_USB_LCD is not set +# CONFIG_USB_LED is not set +# CONFIG_USB_CYTHERM is not set +# CONFIG_USB_PHIDGETKIT is not set +# CONFIG_USB_PHIDGETSERVO is not set +# CONFIG_USB_IDMOUSE is not set + +# +# USB ATM/DSL drivers +# # # USB Gadget Support @@ -801,7 +922,12 @@ # # InfiniBand support # -# CONFIG_INFINIBAND is not set +CONFIG_INFINIBAND=y +CONFIG_INFINIBAND_MTHCA=y +CONFIG_INFINIBAND_MTHCA_DEBUG=y +CONFIG_INFINIBAND_IPOIB=y +CONFIG_INFINIBAND_IPOIB_DEBUG=y +CONFIG_INFINIBAND_IPOIB_DEBUG_DATA=y # # Power management options @@ -1036,7 +1162,22 @@ # Kernel hacking # # CONFIG_PRINTK_TIME is not set -# CONFIG_DEBUG_KERNEL is not set -CONFIG_LOG_BUF_SHIFT=14 +CONFIG_DEBUG_KERNEL=y +CONFIG_MAGIC_SYSRQ=y +CONFIG_LOG_BUF_SHIFT=15 +# CONFIG_SCHEDSTATS is not set +# CONFIG_DEBUG_SLAB is not set 
+# CONFIG_DEBUG_SPINLOCK is not set +# CONFIG_DEBUG_SPINLOCK_SLEEP is not set +# CONFIG_DEBUG_KOBJECT is not set +# CONFIG_DEBUG_INFO is not set +# CONFIG_DEBUG_FS is not set +# CONFIG_DEBUG_STACKOVERFLOW is not set +# CONFIG_KPROBES is not set +# CONFIG_DEBUG_STACK_USAGE is not set +# CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_4KSTACKS is not set CONFIG_X86_FIND_SMP_CONFIG=y CONFIG_X86_MPPARSE=y +# CONFIG_CHECKING is not set +# CONFIG_INIT_DEBUG is not set diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/configs/xenU_defconfig_x86_32 --- a/linux-2.6-xen-sparse/arch/xen/configs/xenU_defconfig_x86_32 Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/configs/xenU_defconfig_x86_32 Thu Aug 25 22:53:20 2005 @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit # Linux kernel version: 2.6.12-xenU -# Mon Jul 25 10:06:06 2005 +# Wed Aug 3 09:57:44 2005 # CONFIG_XEN=y CONFIG_ARCH_XEN=y @@ -12,11 +12,10 @@ # # CONFIG_XEN_PRIVILEGED_GUEST is not set # CONFIG_XEN_PHYSDEV_ACCESS is not set -CONFIG_XEN_BLKDEV_GRANT=y CONFIG_XEN_BLKDEV_FRONTEND=y CONFIG_XEN_NETDEV_FRONTEND=y -#CONFIG_XEN_NETDEV_GRANT_TX=y -#CONFIG_XEN_NETDEV_GRANT_RX=y +CONFIG_XEN_NETDEV_GRANT_TX=y +CONFIG_XEN_NETDEV_GRANT_RX=y # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set # CONFIG_XEN_BLKDEV_TAP is not set # CONFIG_XEN_SHADOW_MODE is not set @@ -90,11 +89,11 @@ # CONFIG_M586 is not set # CONFIG_M586TSC is not set # CONFIG_M586MMX is not set -# CONFIG_M686 is not set +CONFIG_M686=y # CONFIG_MPENTIUMII is not set # CONFIG_MPENTIUMIII is not set # CONFIG_MPENTIUMM is not set -CONFIG_MPENTIUM4=y +# CONFIG_MPENTIUM4 is not set # CONFIG_MK6 is not set # CONFIG_MK7 is not set # CONFIG_MK8 is not set @@ -109,15 +108,15 @@ # CONFIG_X86_GENERIC is not set CONFIG_X86_CMPXCHG=y CONFIG_X86_XADD=y -CONFIG_X86_L1_CACHE_SHIFT=7 +CONFIG_X86_L1_CACHE_SHIFT=5 CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_CALIBRATE_DELAY=y +CONFIG_X86_PPRO_FENCE=y 
CONFIG_X86_WP_WORKS_OK=y CONFIG_X86_INVLPG=y CONFIG_X86_BSWAP=y CONFIG_X86_POPAD_OK=y CONFIG_X86_GOOD_APIC=y -CONFIG_X86_INTEL_USERCOPY=y CONFIG_X86_USE_PPRO_CHECKSUM=y # CONFIG_HPET_TIMER is not set # CONFIG_HPET_EMULATE_RTC is not set @@ -415,7 +414,7 @@ # CONFIG_BEFS_FS is not set # CONFIG_BFS_FS is not set # CONFIG_EFS_FS is not set -# CONFIG_CRAMFS is not set +CONFIG_CRAMFS=y # CONFIG_VXFS_FS is not set # CONFIG_HPFS_FS is not set # CONFIG_QNX4FS_FS is not set diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/configs/xenU_defconfig_x86_64 --- a/linux-2.6-xen-sparse/arch/xen/configs/xenU_defconfig_x86_64 Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/configs/xenU_defconfig_x86_64 Thu Aug 25 22:53:20 2005 @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit # Linux kernel version: 2.6.12-xenU -# Thu Jul 7 11:43:14 2005 +# Thu Aug 18 11:15:14 2005 # CONFIG_XEN=y CONFIG_ARCH_XEN=y @@ -12,9 +12,10 @@ # # CONFIG_XEN_PRIVILEGED_GUEST is not set # CONFIG_XEN_PHYSDEV_ACCESS is not set -CONFIG_XEN_BLKDEV_GRANT=y CONFIG_XEN_BLKDEV_FRONTEND=y CONFIG_XEN_NETDEV_FRONTEND=y +CONFIG_XEN_NETDEV_GRANT_TX=y +CONFIG_XEN_NETDEV_GRANT_RX=y # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set # CONFIG_XEN_BLKDEV_TAP is not set # CONFIG_XEN_SHADOW_MODE is not set @@ -28,7 +29,7 @@ # CONFIG_EXPERIMENTAL=y CONFIG_CLEAN_COMPILE=y -CONFIG_BROKEN_ON_SMP=y +CONFIG_LOCK_KERNEL=y CONFIG_INIT_ENV_ARG_LIMIT=32 # @@ -46,8 +47,10 @@ CONFIG_HOTPLUG=y CONFIG_KOBJECT_UEVENT=y # CONFIG_IKCONFIG is not set +# CONFIG_CPUSETS is not set # CONFIG_EMBEDDED is not set CONFIG_KALLSYMS=y +# CONFIG_KALLSYMS_ALL is not set CONFIG_KALLSYMS_EXTRA_PASS=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -72,6 +75,7 @@ CONFIG_MODVERSIONS=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_KMOD=y +CONFIG_STOP_MACHINE=y CONFIG_XENARCH="x86_64" CONFIG_X86=y CONFIG_MMU=y @@ -84,12 +88,15 @@ CONFIG_GENERIC_CALIBRATE_DELAY=y CONFIG_X86_GOOD_APIC=y # CONFIG_HPET_TIMER is not set -# 
CONFIG_SMP is not set +CONFIG_SMP=y +CONFIG_NR_CPUS=8 +# CONFIG_SCHED_SMT is not set # CONFIG_PREEMPT is not set # CONFIG_MICROCODE is not set CONFIG_X86_CPUID=y # CONFIG_NUMA is not set # CONFIG_MTRR is not set +CONFIG_HAVE_DEC_LOCK=y # CONFIG_X86_LOCAL_APIC is not set # CONFIG_X86_IO_APIC is not set # CONFIG_PCI is not set @@ -112,7 +119,11 @@ # CONFIG_GENERIC_CPU is not set CONFIG_X86_L1_CACHE_BYTES=128 # CONFIG_X86_TSC is not set +CONFIG_X86_XEN_GENAPIC=y # CONFIG_X86_MSR is not set +CONFIG_X86_HT=y +# CONFIG_K8_NUMA is not set +# CONFIG_NUMA_EMU is not set CONFIG_DUMMY_IOMMU=y # CONFIG_X86_MCE is not set @@ -155,6 +166,7 @@ CONFIG_STANDALONE=y CONFIG_PREVENT_FIRMWARE_BUILD=y CONFIG_FW_LOADER=y +# CONFIG_DEBUG_DRIVER is not set # # Block devices @@ -257,7 +269,10 @@ CONFIG_IP_ROUTE_MULTIPATH=y # CONFIG_IP_ROUTE_MULTIPATH_CACHED is not set CONFIG_IP_ROUTE_VERBOSE=y -# CONFIG_IP_PNP is not set +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_IP_PNP_RARP=y CONFIG_NET_IPIP=m CONFIG_NET_IPGRE=m CONFIG_NET_IPGRE_BROADCAST=y @@ -557,7 +572,6 @@ # # Old SIR device drivers # -# CONFIG_IRPORT_SIR is not set # # Old Serial dongle support @@ -660,14 +674,14 @@ CONFIG_EXT2_FS_XATTR=y CONFIG_EXT2_FS_POSIX_ACL=y CONFIG_EXT2_FS_SECURITY=y -CONFIG_EXT3_FS=m +CONFIG_EXT3_FS=y CONFIG_EXT3_FS_XATTR=y -CONFIG_EXT3_FS_POSIX_ACL=y -CONFIG_EXT3_FS_SECURITY=y +# CONFIG_EXT3_FS_POSIX_ACL is not set +# CONFIG_EXT3_FS_SECURITY is not set CONFIG_JBD=m # CONFIG_JBD_DEBUG is not set CONFIG_FS_MBCACHE=y -CONFIG_REISERFS_FS=m +CONFIG_REISERFS_FS=y # CONFIG_REISERFS_CHECK is not set CONFIG_REISERFS_PROC_INFO=y CONFIG_REISERFS_FS_XATTR=y @@ -746,7 +760,7 @@ # CONFIG_BEFS_DEBUG is not set CONFIG_BFS_FS=m CONFIG_EFS_FS=m -CONFIG_CRAMFS=m +CONFIG_CRAMFS=y CONFIG_VXFS_FS=m # CONFIG_HPFS_FS is not set CONFIG_QNX4FS_FS=m @@ -859,17 +873,7 @@ # Security options # # CONFIG_KEYS is not set -CONFIG_SECURITY=y -CONFIG_SECURITY_NETWORK=y -CONFIG_SECURITY_CAPABILITIES=y -# 
CONFIG_SECURITY_SECLVL is not set -CONFIG_SECURITY_SELINUX=y -CONFIG_SECURITY_SELINUX_BOOTPARAM=y -CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE=1 -CONFIG_SECURITY_SELINUX_DISABLE=y -CONFIG_SECURITY_SELINUX_DEVELOP=y -CONFIG_SECURITY_SELINUX_AVC_STATS=y -CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 +# CONFIG_SECURITY is not set # # Cryptographic options @@ -917,5 +921,19 @@ # Kernel hacking # # CONFIG_PRINTK_TIME is not set -# CONFIG_DEBUG_KERNEL is not set -CONFIG_LOG_BUF_SHIFT=14 +CONFIG_DEBUG_KERNEL=y +CONFIG_MAGIC_SYSRQ=y +CONFIG_LOG_BUF_SHIFT=15 +# CONFIG_SCHEDSTATS is not set +# CONFIG_DEBUG_SLAB is not set +# CONFIG_DEBUG_SPINLOCK is not set +# CONFIG_DEBUG_SPINLOCK_SLEEP is not set +# CONFIG_DEBUG_KOBJECT is not set +# CONFIG_DEBUG_INFO is not set +# CONFIG_DEBUG_FS is not set +# CONFIG_DEBUG_STACKOVERFLOW is not set +# CONFIG_KPROBES is not set +# CONFIG_DEBUG_STACK_USAGE is not set +# CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_4KSTACKS is not set +# CONFIG_INIT_DEBUG is not set diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/configs/xen_defconfig_x86_32 --- a/linux-2.6-xen-sparse/arch/xen/configs/xen_defconfig_x86_32 Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/configs/xen_defconfig_x86_32 Thu Aug 25 22:53:20 2005 @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit # Linux kernel version: 2.6.12-xen -# Thu Jul 14 21:55:53 2005 +# Wed Aug 3 10:04:25 2005 # CONFIG_XEN=y CONFIG_ARCH_XEN=y @@ -14,10 +14,11 @@ CONFIG_XEN_PHYSDEV_ACCESS=y CONFIG_XEN_BLKDEV_BACKEND=y # CONFIG_XEN_BLKDEV_TAP_BE is not set -CONFIG_XEN_BLKDEV_GRANT=y CONFIG_XEN_NETDEV_BACKEND=y CONFIG_XEN_BLKDEV_FRONTEND=y CONFIG_XEN_NETDEV_FRONTEND=y +CONFIG_XEN_NETDEV_GRANT_TX=y +CONFIG_XEN_NETDEV_GRANT_RX=y # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set # CONFIG_XEN_BLKDEV_TAP is not set # CONFIG_XEN_SHADOW_MODE is not set @@ -135,6 +136,7 @@ # CONFIG_X86_REBOOTFIXUPS is not set CONFIG_MICROCODE=m CONFIG_X86_CPUID=m +CONFIG_SWIOTLB=y # 
# Firmware Drivers diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/configs/xen_defconfig_x86_64 --- a/linux-2.6-xen-sparse/arch/xen/configs/xen_defconfig_x86_64 Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/configs/xen_defconfig_x86_64 Thu Aug 25 22:53:20 2005 @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.12-xen -# Fri Jul 15 00:34:21 2005 +# Linux kernel version: 2.6.12.4-xen +# Mon Aug 15 19:54:11 2005 # CONFIG_XEN=y CONFIG_ARCH_XEN=y @@ -14,10 +14,11 @@ CONFIG_XEN_PHYSDEV_ACCESS=y CONFIG_XEN_BLKDEV_BACKEND=y # CONFIG_XEN_BLKDEV_TAP_BE is not set -CONFIG_XEN_BLKDEV_GRANT=y CONFIG_XEN_NETDEV_BACKEND=y CONFIG_XEN_BLKDEV_FRONTEND=y CONFIG_XEN_NETDEV_FRONTEND=y +CONFIG_XEN_NETDEV_GRANT_TX=y +CONFIG_XEN_NETDEV_GRANT_RX=y # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set # CONFIG_XEN_BLKDEV_TAP is not set # CONFIG_XEN_SHADOW_MODE is not set @@ -33,6 +34,7 @@ # CONFIG_CLEAN_COMPILE is not set CONFIG_BROKEN=y CONFIG_BROKEN_ON_SMP=y +CONFIG_LOCK_KERNEL=y CONFIG_INIT_ENV_ARG_LIMIT=32 # @@ -48,10 +50,11 @@ CONFIG_HOTPLUG=y CONFIG_KOBJECT_UEVENT=y # CONFIG_IKCONFIG is not set +# CONFIG_CPUSETS is not set # CONFIG_EMBEDDED is not set CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_ALL is not set -# CONFIG_KALLSYMS_EXTRA_PASS is not set +CONFIG_KALLSYMS_EXTRA_PASS=y CONFIG_PRINTK=y CONFIG_BUG=y CONFIG_BASE_FULL=y @@ -73,8 +76,9 @@ # CONFIG_MODULE_FORCE_UNLOAD is not set CONFIG_OBSOLETE_MODPARM=y # CONFIG_MODVERSIONS is not set -# CONFIG_MODULE_SRCVERSION_ALL is not set +CONFIG_MODULE_SRCVERSION_ALL=y CONFIG_KMOD=y +CONFIG_STOP_MACHINE=y CONFIG_XENARCH="x86_64" CONFIG_X86=y CONFIG_MMU=y @@ -87,12 +91,15 @@ CONFIG_GENERIC_CALIBRATE_DELAY=y CONFIG_X86_GOOD_APIC=y # CONFIG_HPET_TIMER is not set -# CONFIG_SMP is not set +CONFIG_SMP=y +CONFIG_NR_CPUS=8 +# CONFIG_SCHED_SMT is not set # CONFIG_PREEMPT is not set CONFIG_MICROCODE=y # CONFIG_X86_CPUID is not set # CONFIG_NUMA is not set # CONFIG_MTRR is 
not set +CONFIG_HAVE_DEC_LOCK=y CONFIG_X86_LOCAL_APIC=y CONFIG_X86_IO_APIC=y CONFIG_PCI=y @@ -108,7 +115,7 @@ # CONFIG_X86_64=y CONFIG_64BIT=y -# CONFIG_EARLY_PRINTK is not set +CONFIG_EARLY_PRINTK=y # # Processor type and features @@ -117,9 +124,14 @@ CONFIG_GENERIC_CPU=y CONFIG_X86_L1_CACHE_BYTES=128 # CONFIG_X86_TSC is not set +CONFIG_X86_XEN_GENAPIC=y # CONFIG_X86_MSR is not set +CONFIG_X86_HT=y +# CONFIG_K8_NUMA is not set +# CONFIG_NUMA_EMU is not set # CONFIG_GART_IOMMU is not set CONFIG_DUMMY_IOMMU=y +CONFIG_SWIOTLB=y # CONFIG_X86_MCE is not set # @@ -149,7 +161,7 @@ # Executable file formats # CONFIG_BINFMT_ELF=y -CONFIG_BINFMT_MISC=m +CONFIG_BINFMT_MISC=y # # Device Drivers @@ -160,7 +172,7 @@ # CONFIG_STANDALONE=y CONFIG_PREVENT_FIRMWARE_BUILD=y -CONFIG_FW_LOADER=m +CONFIG_FW_LOADER=y # CONFIG_DEBUG_DRIVER is not set # @@ -174,7 +186,7 @@ CONFIG_MTD_REDBOOT_DIRECTORY_BLOCK=-1 # CONFIG_MTD_REDBOOT_PARTS_UNALLOCATED is not set # CONFIG_MTD_REDBOOT_PARTS_READONLY is not set -# CONFIG_MTD_CMDLINE_PARTS is not set +CONFIG_MTD_CMDLINE_PARTS=y # # User Modules And Translation Layers @@ -206,24 +218,20 @@ # CONFIG_MTD_CFI_I8 is not set CONFIG_MTD_CFI_INTELEXT=m CONFIG_MTD_CFI_AMDSTD=m -CONFIG_MTD_CFI_AMDSTD_RETRY=0 +CONFIG_MTD_CFI_AMDSTD_RETRY=3 CONFIG_MTD_CFI_STAA=m CONFIG_MTD_CFI_UTIL=m CONFIG_MTD_RAM=m CONFIG_MTD_ROM=m CONFIG_MTD_ABSENT=m # CONFIG_MTD_OBSOLETE_CHIPS is not set -# CONFIG_MTD_XIP is not set # # Mapping drivers for chip access # CONFIG_MTD_COMPLEX_MAPPINGS=y -CONFIG_MTD_PHYSMAP=m -CONFIG_MTD_PHYSMAP_START=0x8000000 -CONFIG_MTD_PHYSMAP_LEN=0x4000000 -CONFIG_MTD_PHYSMAP_BANKWIDTH=2 -CONFIG_MTD_PNC2000=m +# CONFIG_MTD_PHYSMAP is not set +# CONFIG_MTD_PNC2000 is not set CONFIG_MTD_SC520CDP=m CONFIG_MTD_NETSC520=m CONFIG_MTD_TS5500=m @@ -231,10 +239,9 @@ CONFIG_MTD_ELAN_104NC=m # CONFIG_MTD_AMD76XROM is not set # CONFIG_MTD_ICHXROM is not set -# CONFIG_MTD_SCB2_FLASH is not set -CONFIG_MTD_NETtel=m -CONFIG_MTD_DILNETPC=m 
-CONFIG_MTD_DILNETPC_BOOTSIZE=0x80000 +CONFIG_MTD_SCB2_FLASH=m +# CONFIG_MTD_NETtel is not set +# CONFIG_MTD_DILNETPC is not set # CONFIG_MTD_L440GX is not set CONFIG_MTD_PCI=m @@ -244,19 +251,19 @@ CONFIG_MTD_PMC551=m # CONFIG_MTD_PMC551_BUGFIX is not set # CONFIG_MTD_PMC551_DEBUG is not set -CONFIG_MTD_SLRAM=m -CONFIG_MTD_PHRAM=m +# CONFIG_MTD_SLRAM is not set +# CONFIG_MTD_PHRAM is not set CONFIG_MTD_MTDRAM=m CONFIG_MTDRAM_TOTAL_SIZE=4096 CONFIG_MTDRAM_ERASE_SIZE=128 -CONFIG_MTD_BLKMTD=m -# CONFIG_MTD_BLOCK2MTD is not set +# CONFIG_MTD_BLKMTD is not set +CONFIG_MTD_BLOCK2MTD=m # # Disk-On-Chip Device Drivers # CONFIG_MTD_DOC2000=m -CONFIG_MTD_DOC2001=m +# CONFIG_MTD_DOC2001 is not set CONFIG_MTD_DOC2001PLUS=m CONFIG_MTD_DOCPROBE=m CONFIG_MTD_DOCECC=m @@ -269,10 +276,7 @@ CONFIG_MTD_NAND=m # CONFIG_MTD_NAND_VERIFY_WRITE is not set CONFIG_MTD_NAND_IDS=m -CONFIG_MTD_NAND_DISKONCHIP=m -# CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADVANCED is not set -CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADDRESS=0 -# CONFIG_MTD_NAND_DISKONCHIP_BBTWRITE is not set +# CONFIG_MTD_NAND_DISKONCHIP is not set # CONFIG_MTD_NAND_NANDSIM is not set # @@ -280,8 +284,7 @@ # CONFIG_PARPORT=m CONFIG_PARPORT_PC=m -CONFIG_PARPORT_SERIAL=m -CONFIG_PARPORT_PC_FIFO=y +# CONFIG_PARPORT_PC_FIFO is not set # CONFIG_PARPORT_PC_SUPERIO is not set CONFIG_PARPORT_NOT_PC=y # CONFIG_PARPORT_GSC is not set @@ -290,13 +293,7 @@ # # Plug and Play support # -CONFIG_PNP=y -# CONFIG_PNP_DEBUG is not set - -# -# Protocols -# -CONFIG_PNPACPI=y +# CONFIG_PNP is not set # # Block devices @@ -324,7 +321,7 @@ CONFIG_PARIDE_FIT2=m CONFIG_PARIDE_FIT3=m CONFIG_PARIDE_EPAT=m -# CONFIG_PARIDE_EPATC8 is not set +CONFIG_PARIDE_EPATC8=y CONFIG_PARIDE_EPIA=m CONFIG_PARIDE_FRIQ=m CONFIG_PARIDE_FRPW=m @@ -345,7 +342,7 @@ # CONFIG_BLK_DEV_UB is not set CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_COUNT=16 -CONFIG_BLK_DEV_RAM_SIZE=8192 +CONFIG_BLK_DEV_RAM_SIZE=16384 CONFIG_BLK_DEV_INITRD=y CONFIG_INITRAMFS_SOURCE="" CONFIG_LBD=y @@ -360,70 +357,69 @@ 
CONFIG_IOSCHED_AS=y CONFIG_IOSCHED_DEADLINE=y CONFIG_IOSCHED_CFQ=y -# CONFIG_ATA_OVER_ETH is not set +CONFIG_ATA_OVER_ETH=m # # ATA/ATAPI/MFM/RLL support # -CONFIG_IDE=m -CONFIG_BLK_DEV_IDE=m +CONFIG_IDE=y +CONFIG_BLK_DEV_IDE=y # # Please see Documentation/ide.txt for help/info on IDE drives # # CONFIG_BLK_DEV_IDE_SATA is not set # CONFIG_BLK_DEV_HD_IDE is not set -CONFIG_BLK_DEV_IDEDISK=m -# CONFIG_IDEDISK_MULTI_MODE is not set -CONFIG_BLK_DEV_IDECD=m -CONFIG_BLK_DEV_IDETAPE=m -CONFIG_BLK_DEV_IDEFLOPPY=m +CONFIG_BLK_DEV_IDEDISK=y +CONFIG_IDEDISK_MULTI_MODE=y +CONFIG_BLK_DEV_IDECD=y +# CONFIG_BLK_DEV_IDETAPE is not set +CONFIG_BLK_DEV_IDEFLOPPY=y CONFIG_BLK_DEV_IDESCSI=m # CONFIG_IDE_TASK_IOCTL is not set # # IDE chipset support/bugfixes # -CONFIG_IDE_GENERIC=m +CONFIG_IDE_GENERIC=y CONFIG_BLK_DEV_CMD640=y -# CONFIG_BLK_DEV_CMD640_ENHANCED is not set -# CONFIG_BLK_DEV_IDEPNP is not set +CONFIG_BLK_DEV_CMD640_ENHANCED=y CONFIG_BLK_DEV_IDEPCI=y CONFIG_IDEPCI_SHARE_IRQ=y # CONFIG_BLK_DEV_OFFBOARD is not set -CONFIG_BLK_DEV_GENERIC=m -CONFIG_BLK_DEV_OPTI621=m -CONFIG_BLK_DEV_RZ1000=m +CONFIG_BLK_DEV_GENERIC=y +# CONFIG_BLK_DEV_OPTI621 is not set +CONFIG_BLK_DEV_RZ1000=y CONFIG_BLK_DEV_IDEDMA_PCI=y # CONFIG_BLK_DEV_IDEDMA_FORCED is not set CONFIG_IDEDMA_PCI_AUTO=y # CONFIG_IDEDMA_ONLYDISK is not set -CONFIG_BLK_DEV_AEC62XX=m -CONFIG_BLK_DEV_ALI15X3=m +CONFIG_BLK_DEV_AEC62XX=y +CONFIG_BLK_DEV_ALI15X3=y # CONFIG_WDC_ALI15X3 is not set -CONFIG_BLK_DEV_AMD74XX=m -CONFIG_BLK_DEV_ATIIXP=m -CONFIG_BLK_DEV_CMD64X=m -CONFIG_BLK_DEV_TRIFLEX=m -CONFIG_BLK_DEV_CY82C693=m -CONFIG_BLK_DEV_CS5520=m -CONFIG_BLK_DEV_CS5530=m -CONFIG_BLK_DEV_HPT34X=m +CONFIG_BLK_DEV_AMD74XX=y +CONFIG_BLK_DEV_ATIIXP=y +CONFIG_BLK_DEV_CMD64X=y +CONFIG_BLK_DEV_TRIFLEX=y +CONFIG_BLK_DEV_CY82C693=y +CONFIG_BLK_DEV_CS5520=y +CONFIG_BLK_DEV_CS5530=y +CONFIG_BLK_DEV_HPT34X=y # CONFIG_HPT34X_AUTODMA is not set -CONFIG_BLK_DEV_HPT366=m -CONFIG_BLK_DEV_SC1200=m -CONFIG_BLK_DEV_PIIX=m -CONFIG_BLK_DEV_NS87415=m 
-CONFIG_BLK_DEV_PDC202XX_OLD=m -CONFIG_PDC202XX_BURST=y -CONFIG_BLK_DEV_PDC202XX_NEW=m +CONFIG_BLK_DEV_HPT366=y +# CONFIG_BLK_DEV_SC1200 is not set +CONFIG_BLK_DEV_PIIX=y +# CONFIG_BLK_DEV_NS87415 is not set +CONFIG_BLK_DEV_PDC202XX_OLD=y +# CONFIG_PDC202XX_BURST is not set +CONFIG_BLK_DEV_PDC202XX_NEW=y CONFIG_PDC202XX_FORCE=y -CONFIG_BLK_DEV_SVWKS=m -CONFIG_BLK_DEV_SIIMAGE=m -CONFIG_BLK_DEV_SIS5513=m -CONFIG_BLK_DEV_SLC90E66=m -CONFIG_BLK_DEV_TRM290=m -CONFIG_BLK_DEV_VIA82CXXX=m +CONFIG_BLK_DEV_SVWKS=y +CONFIG_BLK_DEV_SIIMAGE=y +CONFIG_BLK_DEV_SIS5513=y +CONFIG_BLK_DEV_SLC90E66=y +# CONFIG_BLK_DEV_TRM290 is not set +CONFIG_BLK_DEV_VIA82CXXX=y # CONFIG_IDE_ARM is not set CONFIG_BLK_DEV_IDEDMA=y # CONFIG_IDEDMA_IVB is not set @@ -433,17 +429,17 @@ # # SCSI device support # -CONFIG_SCSI=m +CONFIG_SCSI=y CONFIG_SCSI_PROC_FS=y # # SCSI support type (disk, tape, CD-ROM) # -CONFIG_BLK_DEV_SD=m +CONFIG_BLK_DEV_SD=y CONFIG_CHR_DEV_ST=m CONFIG_CHR_DEV_OSST=m CONFIG_BLK_DEV_SR=m -# CONFIG_BLK_DEV_SR_VENDOR is not set +CONFIG_BLK_DEV_SR_VENDOR=y CONFIG_CHR_DEV_SG=m # @@ -458,7 +454,7 @@ # CONFIG_SCSI_SPI_ATTRS=m CONFIG_SCSI_FC_ATTRS=m -# CONFIG_SCSI_ISCSI_ATTRS is not set +CONFIG_SCSI_ISCSI_ATTRS=m # # SCSI low-level drivers @@ -468,29 +464,30 @@ CONFIG_SCSI_ACARD=m CONFIG_SCSI_AACRAID=m CONFIG_SCSI_AIC7XXX=m -CONFIG_AIC7XXX_CMDS_PER_DEVICE=8 +CONFIG_AIC7XXX_CMDS_PER_DEVICE=4 CONFIG_AIC7XXX_RESET_DELAY_MS=15000 -CONFIG_AIC7XXX_DEBUG_ENABLE=y +# CONFIG_AIC7XXX_DEBUG_ENABLE is not set CONFIG_AIC7XXX_DEBUG_MASK=0 -CONFIG_AIC7XXX_REG_PRETTY_PRINT=y +# CONFIG_AIC7XXX_REG_PRETTY_PRINT is not set CONFIG_SCSI_AIC7XXX_OLD=m CONFIG_SCSI_AIC79XX=m -CONFIG_AIC79XX_CMDS_PER_DEVICE=32 +CONFIG_AIC79XX_CMDS_PER_DEVICE=4 CONFIG_AIC79XX_RESET_DELAY_MS=15000 -CONFIG_AIC79XX_ENABLE_RD_STRM=y -CONFIG_AIC79XX_DEBUG_ENABLE=y +# CONFIG_AIC79XX_ENABLE_RD_STRM is not set +# CONFIG_AIC79XX_DEBUG_ENABLE is not set CONFIG_AIC79XX_DEBUG_MASK=0 -CONFIG_AIC79XX_REG_PRETTY_PRINT=y -CONFIG_SCSI_ADVANSYS=m -# 
CONFIG_MEGARAID_NEWGEN is not set -CONFIG_MEGARAID_LEGACY=m +# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set +# CONFIG_SCSI_ADVANSYS is not set +CONFIG_MEGARAID_NEWGEN=y +CONFIG_MEGARAID_MM=m +CONFIG_MEGARAID_MAILBOX=m CONFIG_SCSI_SATA=y CONFIG_SCSI_SATA_AHCI=m CONFIG_SCSI_SATA_SVW=m -CONFIG_SCSI_ATA_PIIX=m +CONFIG_SCSI_ATA_PIIX=y CONFIG_SCSI_SATA_NV=m CONFIG_SCSI_SATA_PROMISE=m -# CONFIG_SCSI_SATA_QSTOR is not set +CONFIG_SCSI_SATA_QSTOR=m CONFIG_SCSI_SATA_SX4=m CONFIG_SCSI_SATA_SIL=m CONFIG_SCSI_SATA_SIS=m @@ -500,17 +497,14 @@ CONFIG_SCSI_BUSLOGIC=m # CONFIG_SCSI_OMIT_FLASHPOINT is not set # CONFIG_SCSI_CPQFCTS is not set -CONFIG_SCSI_DMX3191D=m -CONFIG_SCSI_EATA=m -CONFIG_SCSI_EATA_TAGGED_QUEUE=y -CONFIG_SCSI_EATA_LINKED_COMMANDS=y -CONFIG_SCSI_EATA_MAX_TAGS=16 -CONFIG_SCSI_EATA_PIO=m -CONFIG_SCSI_FUTURE_DOMAIN=m +# CONFIG_SCSI_DMX3191D is not set +# CONFIG_SCSI_EATA is not set +# CONFIG_SCSI_EATA_PIO is not set +# CONFIG_SCSI_FUTURE_DOMAIN is not set CONFIG_SCSI_GDTH=m CONFIG_SCSI_IPS=m -# CONFIG_SCSI_INITIO is not set -# CONFIG_SCSI_INIA100 is not set +CONFIG_SCSI_INITIO=m +CONFIG_SCSI_INIA100=m CONFIG_SCSI_PPA=m CONFIG_SCSI_IMM=m # CONFIG_SCSI_IZIP_EPP16 is not set @@ -520,32 +514,29 @@ CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16 CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64 # CONFIG_SCSI_SYM53C8XX_IOMAPPED is not set -CONFIG_SCSI_IPR=m -# CONFIG_SCSI_IPR_TRACE is not set -# CONFIG_SCSI_IPR_DUMP is not set +# CONFIG_SCSI_IPR is not set # CONFIG_SCSI_PCI2000 is not set # CONFIG_SCSI_PCI2220I is not set -CONFIG_SCSI_QLOGIC_ISP=m -CONFIG_SCSI_QLOGIC_FC=m -CONFIG_SCSI_QLOGIC_FC_FIRMWARE=y +# CONFIG_SCSI_QLOGIC_ISP is not set +# CONFIG_SCSI_QLOGIC_FC is not set CONFIG_SCSI_QLOGIC_1280=m CONFIG_SCSI_QLOGIC_1280_1040=y -CONFIG_SCSI_QLA2XXX=m -# CONFIG_SCSI_QLA21XX is not set -# CONFIG_SCSI_QLA22XX is not set -# CONFIG_SCSI_QLA2300 is not set -# CONFIG_SCSI_QLA2322 is not set -# CONFIG_SCSI_QLA6312 is not set +CONFIG_SCSI_QLA2XXX=y +CONFIG_SCSI_QLA21XX=m +CONFIG_SCSI_QLA22XX=m 
+CONFIG_SCSI_QLA2300=m +CONFIG_SCSI_QLA2322=m +CONFIG_SCSI_QLA6312=m CONFIG_SCSI_LPFC=m CONFIG_SCSI_DC395x=m CONFIG_SCSI_DC390T=m -CONFIG_SCSI_DEBUG=m +# CONFIG_SCSI_DEBUG is not set # # Multi-device support (RAID and LVM) # CONFIG_MD=y -CONFIG_BLK_DEV_MD=m +CONFIG_BLK_DEV_MD=y CONFIG_MD_LINEAR=m CONFIG_MD_RAID0=m CONFIG_MD_RAID1=m @@ -579,7 +570,7 @@ # Subsystem Options # # CONFIG_IEEE1394_VERBOSEDEBUG is not set -# CONFIG_IEEE1394_OUI_DB is not set +CONFIG_IEEE1394_OUI_DB=y CONFIG_IEEE1394_EXTRA_CONFIG_ROMS=y CONFIG_IEEE1394_CONFIG_ROM_IP1394=y @@ -618,9 +609,9 @@ # # Networking options # -CONFIG_PACKET=m +CONFIG_PACKET=y CONFIG_PACKET_MMAP=y -CONFIG_UNIX=m +CONFIG_UNIX=y CONFIG_NET_KEY=m CONFIG_INET=y CONFIG_IP_MULTICAST=y @@ -784,11 +775,6 @@ CONFIG_IP6_NF_RAW=m # -# DECnet: Netfilter Configuration -# -CONFIG_DECNET_NF_GRABULATOR=m - -# # Bridge: Netfilter Configuration # CONFIG_BRIDGE_NF_EBTABLES=m @@ -810,9 +796,9 @@ CONFIG_BRIDGE_EBT_REDIRECT=m CONFIG_BRIDGE_EBT_SNAT=m CONFIG_BRIDGE_EBT_LOG=m -# CONFIG_BRIDGE_EBT_ULOG is not set +CONFIG_BRIDGE_EBT_ULOG=m CONFIG_XFRM=y -CONFIG_XFRM_USER=m +CONFIG_XFRM_USER=y # # SCTP Configuration (EXPERIMENTAL) @@ -823,19 +809,18 @@ # CONFIG_SCTP_HMAC_NONE is not set # CONFIG_SCTP_HMAC_SHA1 is not set CONFIG_SCTP_HMAC_MD5=y -CONFIG_ATM=y -CONFIG_ATM_CLIP=y +CONFIG_ATM=m +CONFIG_ATM_CLIP=m # CONFIG_ATM_CLIP_NO_ICMP is not set CONFIG_ATM_LANE=m -CONFIG_ATM_MPOA=m +# CONFIG_ATM_MPOA is not set CONFIG_ATM_BR2684=m # CONFIG_ATM_BR2684_IPFILTER is not set CONFIG_BRIDGE=m CONFIG_VLAN_8021Q=m -CONFIG_DECNET=m -# CONFIG_DECNET_ROUTER is not set +# CONFIG_DECNET is not set CONFIG_LLC=y -CONFIG_LLC2=m +# CONFIG_LLC2 is not set CONFIG_IPX=m # CONFIG_IPX_INTERN is not set CONFIG_ATALK=m @@ -843,12 +828,10 @@ CONFIG_IPDDP=m CONFIG_IPDDP_ENCAP=y CONFIG_IPDDP_DECAP=y -CONFIG_X25=m -CONFIG_LAPB=m -# CONFIG_NET_DIVERT is not set -CONFIG_ECONET=m -CONFIG_ECONET_AUNUDP=y -CONFIG_ECONET_NATIVE=y +# CONFIG_X25 is not set +# CONFIG_LAPB is not set 
+CONFIG_NET_DIVERT=y +# CONFIG_ECONET is not set CONFIG_WAN_ROUTER=m # @@ -880,9 +863,9 @@ CONFIG_NET_CLS_ROUTE=y CONFIG_NET_CLS_FW=m CONFIG_NET_CLS_U32=m -# CONFIG_CLS_U32_PERF is not set -# CONFIG_NET_CLS_IND is not set -# CONFIG_CLS_U32_MARK is not set +CONFIG_CLS_U32_PERF=y +CONFIG_NET_CLS_IND=y +CONFIG_CLS_U32_MARK=y CONFIG_NET_CLS_RSVP=m CONFIG_NET_CLS_RSVP6=m CONFIG_NET_EMATCH=y @@ -897,31 +880,12 @@ # # Network testing # -CONFIG_NET_PKTGEN=m +# CONFIG_NET_PKTGEN is not set CONFIG_NETPOLL=y # CONFIG_NETPOLL_RX is not set -# CONFIG_NETPOLL_TRAP is not set +CONFIG_NETPOLL_TRAP=y CONFIG_NET_POLL_CONTROLLER=y -CONFIG_HAMRADIO=y - -# -# Packet Radio protocols -# -CONFIG_AX25=m -# CONFIG_AX25_DAMA_SLAVE is not set -CONFIG_NETROM=m -CONFIG_ROSE=m - -# -# AX.25 network device drivers -# -CONFIG_MKISS=m -CONFIG_6PACK=m -CONFIG_BPQETHER=m -CONFIG_BAYCOM_SER_FDX=m -CONFIG_BAYCOM_SER_HDX=m -CONFIG_BAYCOM_PAR=m -CONFIG_YAM=m +# CONFIG_HAMRADIO is not set CONFIG_IRDA=m # @@ -937,7 +901,7 @@ # CONFIG_IRDA_CACHE_LAST_LSAP=y CONFIG_IRDA_FAST_RR=y -CONFIG_IRDA_DEBUG=y +# CONFIG_IRDA_DEBUG is not set # # Infrared-port device drivers @@ -1002,9 +966,9 @@ CONFIG_BT_HCIUART=m CONFIG_BT_HCIUART_H4=y CONFIG_BT_HCIUART_BCSP=y -# CONFIG_BT_HCIUART_BCSP_TXCRC is not set +CONFIG_BT_HCIUART_BCSP_TXCRC=y CONFIG_BT_HCIBCM203X=m -# CONFIG_BT_HCIBPA10X is not set +CONFIG_BT_HCIBPA10X=m CONFIG_BT_HCIBFUSB=m CONFIG_BT_HCIVHCI=m CONFIG_NETDEVICES=y @@ -1012,21 +976,11 @@ CONFIG_BONDING=m CONFIG_EQUALIZER=m CONFIG_TUN=m -CONFIG_NET_SB1000=m # # ARCnet devices # -CONFIG_ARCNET=m -CONFIG_ARCNET_1201=m -CONFIG_ARCNET_1051=m -CONFIG_ARCNET_RAW=m -# CONFIG_ARCNET_CAP is not set -CONFIG_ARCNET_COM90xx=m -CONFIG_ARCNET_COM90xxIO=m -CONFIG_ARCNET_RIM_I=m -CONFIG_ARCNET_COM20020=m -CONFIG_ARCNET_COM20020_PCI=m +# CONFIG_ARCNET is not set # # Ethernet (10 or 100Mbit) @@ -1046,21 +1000,21 @@ CONFIG_DE2104X=m CONFIG_TULIP=m # CONFIG_TULIP_MWI is not set -# CONFIG_TULIP_MMIO is not set +CONFIG_TULIP_MMIO=y 
# CONFIG_TULIP_NAPI is not set CONFIG_DE4X5=m CONFIG_WINBOND_840=m CONFIG_DM9102=m -CONFIG_HP100=m +# CONFIG_HP100 is not set CONFIG_NET_PCI=y CONFIG_PCNET32=m CONFIG_AMD8111_ETH=m -# CONFIG_AMD8111E_NAPI is not set +CONFIG_AMD8111E_NAPI=y CONFIG_ADAPTEC_STARFIRE=m -# CONFIG_ADAPTEC_STARFIRE_NAPI is not set +CONFIG_ADAPTEC_STARFIRE_NAPI=y CONFIG_B44=m CONFIG_FORCEDETH=m -# CONFIG_DGRS is not set +CONFIG_DGRS=m CONFIG_EEPRO100=m CONFIG_E100=m CONFIG_FEALNX=m @@ -1069,7 +1023,7 @@ CONFIG_8139CP=m CONFIG_8139TOO=m CONFIG_8139TOO_PIO=y -CONFIG_8139TOO_TUNE_TWISTER=y +# CONFIG_8139TOO_TUNE_TWISTER is not set CONFIG_8139TOO_8129=y # CONFIG_8139_OLD_RX_RESET is not set CONFIG_SIS900=m @@ -1077,21 +1031,22 @@ CONFIG_SUNDANCE=m # CONFIG_SUNDANCE_MMIO is not set CONFIG_VIA_RHINE=m -# CONFIG_VIA_RHINE_MMIO is not set +CONFIG_VIA_RHINE_MMIO=y # # Ethernet (1000 Mbit) # -# CONFIG_ACENIC is not set +CONFIG_ACENIC=m +# CONFIG_ACENIC_OMIT_TIGON_I is not set CONFIG_DL2K=m CONFIG_E1000=m -# CONFIG_E1000_NAPI is not set +CONFIG_E1000_NAPI=y CONFIG_NS83820=m CONFIG_HAMACHI=m CONFIG_YELLOWFIN=m CONFIG_R8169=m -# CONFIG_R8169_NAPI is not set -# CONFIG_R8169_VLAN is not set +CONFIG_R8169_NAPI=y +CONFIG_R8169_VLAN=y CONFIG_SK98LIN=m CONFIG_VIA_VELOCITY=m CONFIG_TIGON3=m @@ -1101,9 +1056,9 @@ # Ethernet (10000 Mbit) # CONFIG_IXGB=m -# CONFIG_IXGB_NAPI is not set +CONFIG_IXGB_NAPI=y CONFIG_S2IO=m -# CONFIG_S2IO_NAPI is not set +CONFIG_S2IO_NAPI=y # CONFIG_2BUFF_MODE is not set # @@ -1124,7 +1079,7 @@ # # Obsolete Wireless cards support (pre-802.11) # -CONFIG_STRIP=m +# CONFIG_STRIP is not set # # Wireless 802.11b ISA/PCI cards support @@ -1145,35 +1100,7 @@ # # Wan interfaces # -CONFIG_WAN=y -CONFIG_DSCC4=m -CONFIG_DSCC4_PCISYNC=y -CONFIG_DSCC4_PCI_RST=y -CONFIG_LANMEDIA=m -CONFIG_SYNCLINK_SYNCPPP=m -CONFIG_HDLC=m -CONFIG_HDLC_RAW=y -CONFIG_HDLC_RAW_ETH=y -CONFIG_HDLC_CISCO=y -CONFIG_HDLC_FR=y -CONFIG_HDLC_PPP=y -CONFIG_HDLC_X25=y -CONFIG_PCI200SYN=m -CONFIG_WANXL=m -CONFIG_PC300=m 
-CONFIG_PC300_MLPPP=y -CONFIG_FARSYNC=m -CONFIG_DLCI=m -CONFIG_DLCI_COUNT=24 -CONFIG_DLCI_MAX=8 -CONFIG_WAN_ROUTER_DRIVERS=y -# CONFIG_VENDOR_SANGOMA is not set -CONFIG_CYCLADES_SYNC=m -CONFIG_CYCLOMX_X25=y -CONFIG_LAPBETHER=m -CONFIG_X25_ASY=m -CONFIG_SBNI=m -# CONFIG_SBNI_MULTILINE is not set +# CONFIG_WAN is not set # # ATM drivers @@ -1184,8 +1111,7 @@ # CONFIG_ATM_ENI_DEBUG is not set # CONFIG_ATM_ENI_TUNE_BURST is not set CONFIG_ATM_FIRESTREAM=m -CONFIG_ATM_ZATM=m -# CONFIG_ATM_ZATM_DEBUG is not set +# CONFIG_ATM_ZATM is not set CONFIG_ATM_IDT77252=m # CONFIG_ATM_IDT77252_DEBUG is not set # CONFIG_ATM_IDT77252_RCV_ALL is not set @@ -1195,20 +1121,13 @@ CONFIG_ATM_HORIZON=m # CONFIG_ATM_HORIZON_DEBUG is not set CONFIG_ATM_FORE200E_MAYBE=m -CONFIG_ATM_FORE200E_PCA=y -CONFIG_ATM_FORE200E_PCA_DEFAULT_FW=y -# CONFIG_ATM_FORE200E_USE_TASKLET is not set -CONFIG_ATM_FORE200E_TX_RETRY=16 -CONFIG_ATM_FORE200E_DEBUG=0 -CONFIG_ATM_FORE200E=m +# CONFIG_ATM_FORE200E_PCA is not set CONFIG_ATM_HE=m -CONFIG_ATM_HE_USE_SUNI=y +# CONFIG_ATM_HE_USE_SUNI is not set CONFIG_FDDI=y -CONFIG_DEFXX=m +# CONFIG_DEFXX is not set CONFIG_SKFP=m -CONFIG_HIPPI=y -CONFIG_ROADRUNNER=m -# CONFIG_ROADRUNNER_LARGE_RINGS is not set +# CONFIG_HIPPI is not set CONFIG_PLIP=m CONFIG_PPP=m CONFIG_PPP_MULTILINK=y @@ -1216,15 +1135,15 @@ CONFIG_PPP_ASYNC=m CONFIG_PPP_SYNC_TTY=m CONFIG_PPP_DEFLATE=m -CONFIG_PPP_BSDCOMP=m +# CONFIG_PPP_BSDCOMP is not set CONFIG_PPPOE=m CONFIG_PPPOATM=m CONFIG_SLIP=m CONFIG_SLIP_COMPRESSED=y CONFIG_SLIP_SMART=y -CONFIG_SLIP_MODE_SLIP6=y +# CONFIG_SLIP_MODE_SLIP6 is not set CONFIG_NET_FC=y -CONFIG_SHAPER=m +# CONFIG_SHAPER is not set CONFIG_NETCONSOLE=m # @@ -1240,16 +1159,15 @@ CONFIG_ISDN_PPP_VJ=y CONFIG_ISDN_MPP=y CONFIG_IPPP_FILTER=y -CONFIG_ISDN_PPP_BSDCOMP=m +# CONFIG_ISDN_PPP_BSDCOMP is not set CONFIG_ISDN_AUDIO=y CONFIG_ISDN_TTY_FAX=y -CONFIG_ISDN_X25=y # # ISDN feature submodules # CONFIG_ISDN_DRV_LOOP=m -# CONFIG_ISDN_DIVERSION is not set +CONFIG_ISDN_DIVERSION=m # 
# ISDN4Linux hardware drivers @@ -1265,9 +1183,9 @@ # CONFIG_HISAX_EURO=y CONFIG_DE_AOC=y -# CONFIG_HISAX_NO_SENDCOMPLETE is not set -# CONFIG_HISAX_NO_LLC is not set -# CONFIG_HISAX_NO_KEYPAD is not set +CONFIG_HISAX_NO_SENDCOMPLETE=y +CONFIG_HISAX_NO_LLC=y +CONFIG_HISAX_NO_KEYPAD=y CONFIG_HISAX_1TR6=y CONFIG_HISAX_NI1=y CONFIG_HISAX_MAX_CARDS=8 @@ -1342,19 +1260,12 @@ # # Active Eicon DIVA Server cards # -CONFIG_CAPI_EICON=y -CONFIG_ISDN_DIVAS=m -CONFIG_ISDN_DIVAS_BRIPCI=y -CONFIG_ISDN_DIVAS_PRIPCI=y -CONFIG_ISDN_DIVAS_DIVACAPI=m -CONFIG_ISDN_DIVAS_USERIDI=m -CONFIG_ISDN_DIVAS_MAINT=m +# CONFIG_CAPI_EICON is not set # # Telephony Support # -CONFIG_PHONE=m -CONFIG_PHONE_IXJ=m +# CONFIG_PHONE is not set # # Input device support @@ -1365,27 +1276,25 @@ # Userland interfaces # CONFIG_INPUT_MOUSEDEV=y -CONFIG_INPUT_MOUSEDEV_PSAUX=y +# CONFIG_INPUT_MOUSEDEV_PSAUX is not set CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 CONFIG_INPUT_JOYDEV=m -CONFIG_INPUT_TSDEV=m -CONFIG_INPUT_TSDEV_SCREEN_X=240 -CONFIG_INPUT_TSDEV_SCREEN_Y=320 -CONFIG_INPUT_EVDEV=m -CONFIG_INPUT_EVBUG=m +# CONFIG_INPUT_TSDEV is not set +CONFIG_INPUT_EVDEV=y +# CONFIG_INPUT_EVBUG is not set # # Input Device Drivers # CONFIG_INPUT_KEYBOARD=y CONFIG_KEYBOARD_ATKBD=y -CONFIG_KEYBOARD_SUNKBD=m -CONFIG_KEYBOARD_LKKBD=m -CONFIG_KEYBOARD_XTKBD=m -CONFIG_KEYBOARD_NEWTON=m +# CONFIG_KEYBOARD_SUNKBD is not set +# CONFIG_KEYBOARD_LKKBD is not set +# CONFIG_KEYBOARD_XTKBD is not set +# CONFIG_KEYBOARD_NEWTON is not set CONFIG_INPUT_MOUSE=y -CONFIG_MOUSE_PS2=m +CONFIG_MOUSE_PS2=y CONFIG_MOUSE_SERIAL=m CONFIG_MOUSE_VSXXXAA=m CONFIG_INPUT_JOYSTICK=y @@ -1427,19 +1336,19 @@ # CONFIG_SERIO=y CONFIG_SERIO_I8042=y -CONFIG_SERIO_SERPORT=m -CONFIG_SERIO_CT82C710=m -CONFIG_SERIO_PARKBD=m -CONFIG_SERIO_PCIPS2=m +CONFIG_SERIO_SERPORT=y +# CONFIG_SERIO_CT82C710 is not set +# CONFIG_SERIO_PARKBD is not set +# CONFIG_SERIO_PCIPS2 is not set CONFIG_SERIO_LIBPS2=y -CONFIG_SERIO_RAW=m +# CONFIG_SERIO_RAW is 
not set CONFIG_GAMEPORT=m CONFIG_GAMEPORT_NS558=m CONFIG_GAMEPORT_L4=m CONFIG_GAMEPORT_EMU10K1=m CONFIG_GAMEPORT_VORTEX=m CONFIG_GAMEPORT_FM801=m -# CONFIG_GAMEPORT_CS461X is not set +CONFIG_GAMEPORT_CS461X=m # # Character devices @@ -1452,21 +1361,16 @@ # # Serial drivers # -CONFIG_SERIAL_8250=m -# CONFIG_SERIAL_8250_ACPI is not set -CONFIG_SERIAL_8250_NR_UARTS=4 -# CONFIG_SERIAL_8250_EXTENDED is not set +# CONFIG_SERIAL_8250 is not set # # Non-8250 serial port support # -CONFIG_SERIAL_CORE=m -CONFIG_SERIAL_JSM=m +# CONFIG_SERIAL_JSM is not set CONFIG_UNIX98_PTYS=y -CONFIG_LEGACY_PTYS=y -CONFIG_LEGACY_PTY_COUNT=256 +# CONFIG_LEGACY_PTYS is not set CONFIG_PRINTER=m -# CONFIG_LP_CONSOLE is not set +CONFIG_LP_CONSOLE=y CONFIG_PPDEV=m CONFIG_TIPAR=m @@ -1500,7 +1404,7 @@ CONFIG_WAFER_WDT=m CONFIG_I8XX_TCO=m CONFIG_SC1200_WDT=m -CONFIG_60XX_WDT=m +# CONFIG_60XX_WDT is not set CONFIG_CPU5_WDT=m CONFIG_W83627HF_WDT=m CONFIG_W83877F_WDT=m @@ -1518,67 +1422,31 @@ # CONFIG_USBPCWATCHDOG=m CONFIG_HW_RANDOM=m -CONFIG_NVRAM=m -CONFIG_RTC=m -CONFIG_GEN_RTC=m -CONFIG_GEN_RTC_X=y +# CONFIG_NVRAM is not set +CONFIG_RTC=y CONFIG_DTLK=m CONFIG_R3964=m -CONFIG_APPLICOM=m +# CONFIG_APPLICOM is not set # # Ftape, the floppy tape device driver # -CONFIG_FTAPE=m -CONFIG_ZFTAPE=m -CONFIG_ZFT_DFLT_BLK_SZ=10240 - -# -# The compressor will be built as a module only! 
-# -CONFIG_ZFT_COMPRESSOR=m -CONFIG_FT_NR_BUFFERS=3 -CONFIG_FT_PROC_FS=y -CONFIG_FT_NORMAL_DEBUG=y -# CONFIG_FT_FULL_DEBUG is not set -# CONFIG_FT_NO_TRACE is not set -# CONFIG_FT_NO_TRACE_AT_ALL is not set - -# -# Hardware configuration -# -CONFIG_FT_STD_FDC=y -# CONFIG_FT_MACH2 is not set -# CONFIG_FT_PROBE_FC10 is not set -# CONFIG_FT_ALT_FDC is not set -CONFIG_FT_FDC_THR=8 -CONFIG_FT_FDC_MAX_RATE=2000 -CONFIG_FT_ALPHA_CLOCK=0 -CONFIG_AGP=m -CONFIG_AGP_AMD64=m -CONFIG_AGP_INTEL=m +# CONFIG_FTAPE is not set +# CONFIG_AGP is not set CONFIG_DRM=m CONFIG_DRM_TDFX=m # CONFIG_DRM_GAMMA is not set CONFIG_DRM_R128=m CONFIG_DRM_RADEON=m -CONFIG_DRM_I810=m -CONFIG_DRM_I830=m -CONFIG_DRM_I915=m -CONFIG_DRM_MGA=m -CONFIG_DRM_SIS=m -CONFIG_MWAVE=m -CONFIG_RAW_DRIVER=m +# CONFIG_MWAVE is not set +# CONFIG_RAW_DRIVER is not set # CONFIG_HPET is not set -CONFIG_MAX_RAW_DEVS=256 CONFIG_HANGCHECK_TIMER=m # # TPM devices # -CONFIG_TCG_TPM=m -CONFIG_TCG_NSC=m -CONFIG_TCG_ATMEL=m +# CONFIG_TCG_TPM is not set # # I2C support @@ -1596,24 +1464,24 @@ # # I2C Hardware Bus support # -CONFIG_I2C_ALI1535=m -CONFIG_I2C_ALI1563=m -CONFIG_I2C_ALI15X3=m +# CONFIG_I2C_ALI1535 is not set +# CONFIG_I2C_ALI1563 is not set +# CONFIG_I2C_ALI15X3 is not set CONFIG_I2C_AMD756=m CONFIG_I2C_AMD756_S4882=m CONFIG_I2C_AMD8111=m -CONFIG_I2C_I801=m -CONFIG_I2C_I810=m -CONFIG_I2C_PIIX4=m +# CONFIG_I2C_I801 is not set +# CONFIG_I2C_I810 is not set +# CONFIG_I2C_PIIX4 is not set CONFIG_I2C_ISA=m CONFIG_I2C_NFORCE2=m -CONFIG_I2C_PARPORT=m -CONFIG_I2C_PARPORT_LIGHT=m +# CONFIG_I2C_PARPORT is not set +# CONFIG_I2C_PARPORT_LIGHT is not set CONFIG_I2C_PROSAVAGE=m CONFIG_I2C_SAVAGE4=m -CONFIG_SCx200_ACB=m -CONFIG_I2C_SIS5595=m -CONFIG_I2C_SIS630=m +# CONFIG_SCx200_ACB is not set +# CONFIG_I2C_SIS5595 is not set +# CONFIG_I2C_SIS630 is not set CONFIG_I2C_SIS96X=m CONFIG_I2C_STUB=m CONFIG_I2C_VIA=m @@ -1648,7 +1516,7 @@ CONFIG_SENSORS_LM92=m CONFIG_SENSORS_MAX1619=m CONFIG_SENSORS_PC87360=m -# 
CONFIG_SENSORS_SMSC47B397 is not set +CONFIG_SENSORS_SMSC47B397=m CONFIG_SENSORS_SIS5595=m CONFIG_SENSORS_SMSC47M1=m CONFIG_SENSORS_VIA686A=m @@ -1682,7 +1550,7 @@ # # Misc devices # -CONFIG_IBM_ASM=m +# CONFIG_IBM_ASM is not set # # Multimedia devices @@ -1714,13 +1582,14 @@ CONFIG_VIDEO_ZORAN_LML33=m CONFIG_VIDEO_ZORAN_LML33R10=m # CONFIG_VIDEO_ZR36120 is not set -# CONFIG_VIDEO_SAA7134 is not set +CONFIG_VIDEO_SAA7134=m +CONFIG_VIDEO_SAA7134_DVB=m CONFIG_VIDEO_MXB=m CONFIG_VIDEO_DPC=m CONFIG_VIDEO_HEXIUM_ORION=m CONFIG_VIDEO_HEXIUM_GEMINI=m CONFIG_VIDEO_CX88=m -# CONFIG_VIDEO_CX88_DVB is not set +CONFIG_VIDEO_CX88_DVB=m CONFIG_VIDEO_OVCAMCHIP=m # @@ -1740,7 +1609,7 @@ # Supported SAA7146 based PCI Adapters # CONFIG_DVB_AV7110=m -# CONFIG_DVB_AV7110_OSD is not set +CONFIG_DVB_AV7110_OSD=y CONFIG_DVB_BUDGET=m CONFIG_DVB_BUDGET_CI=m CONFIG_DVB_BUDGET_AV=m @@ -1755,7 +1624,12 @@ CONFIG_DVB_DIBUSB_MISDESIGNED_DEVICES=y # CONFIG_DVB_DIBCOM_DEBUG is not set CONFIG_DVB_CINERGYT2=m -# CONFIG_DVB_CINERGYT2_TUNING is not set +CONFIG_DVB_CINERGYT2_TUNING=y +CONFIG_DVB_CINERGYT2_STREAM_URB_COUNT=32 +CONFIG_DVB_CINERGYT2_STREAM_BUF_SIZE=512 +CONFIG_DVB_CINERGYT2_QUERY_INTERVAL=250 +CONFIG_DVB_CINERGYT2_ENABLE_RC_INPUT_DEVICE=y +CONFIG_DVB_CINERGYT2_RC_QUERY_INTERVAL=100 # # Supported FlexCopII (B2C2) Adapters @@ -1822,6 +1696,7 @@ CONFIG_VIDEO_VIDEOBUF=m CONFIG_VIDEO_TUNER=m CONFIG_VIDEO_BUF=m +CONFIG_VIDEO_BUF_DVB=m CONFIG_VIDEO_BTCX=m CONFIG_VIDEO_IR=m CONFIG_VIDEO_TVEEPROM=m @@ -1830,36 +1705,34 @@ # Graphics support # CONFIG_FB=y -CONFIG_FB_CFB_FILLRECT=m -CONFIG_FB_CFB_COPYAREA=m -CONFIG_FB_CFB_IMAGEBLIT=m -CONFIG_FB_SOFT_CURSOR=m +CONFIG_FB_CFB_FILLRECT=y +CONFIG_FB_CFB_COPYAREA=y +CONFIG_FB_CFB_IMAGEBLIT=y +CONFIG_FB_SOFT_CURSOR=y # CONFIG_FB_MACMODES is not set CONFIG_FB_MODE_HELPERS=y CONFIG_FB_TILEBLITTING=y CONFIG_FB_CIRRUS=m -CONFIG_FB_PM2=m -CONFIG_FB_PM2_FIFO_DISCONNECT=y -CONFIG_FB_CYBER2000=m +# CONFIG_FB_PM2 is not set +# CONFIG_FB_CYBER2000 is not set # 
CONFIG_FB_ASILIANT is not set # CONFIG_FB_IMSTT is not set CONFIG_FB_VGA16=m -# CONFIG_FB_VESA is not set +CONFIG_FB_VESA=y CONFIG_VIDEO_SELECT=y -CONFIG_FB_HGA=m -# CONFIG_FB_HGA_ACCEL is not set -CONFIG_FB_NVIDIA=m -CONFIG_FB_NVIDIA_I2C=y +# CONFIG_FB_HGA is not set +# CONFIG_FB_NVIDIA is not set CONFIG_FB_RIVA=m -CONFIG_FB_RIVA_I2C=y -CONFIG_FB_RIVA_DEBUG=y +# CONFIG_FB_RIVA_I2C is not set +# CONFIG_FB_RIVA_DEBUG is not set CONFIG_FB_MATROX=m CONFIG_FB_MATROX_MILLENIUM=y CONFIG_FB_MATROX_MYSTIQUE=y -# CONFIG_FB_MATROX_G is not set +CONFIG_FB_MATROX_G=y CONFIG_FB_MATROX_I2C=m +CONFIG_FB_MATROX_MAVEN=m CONFIG_FB_MATROX_MULTIHEAD=y -CONFIG_FB_RADEON_OLD=m +# CONFIG_FB_RADEON_OLD is not set CONFIG_FB_RADEON=m CONFIG_FB_RADEON_I2C=y # CONFIG_FB_RADEON_DEBUG is not set @@ -1867,33 +1740,30 @@ CONFIG_FB_ATY=m CONFIG_FB_ATY_CT=y CONFIG_FB_ATY_GENERIC_LCD=y -CONFIG_FB_ATY_XL_INIT=y +# CONFIG_FB_ATY_XL_INIT is not set CONFIG_FB_ATY_GX=y CONFIG_FB_SAVAGE=m CONFIG_FB_SAVAGE_I2C=y CONFIG_FB_SAVAGE_ACCEL=y -CONFIG_FB_SIS=m -CONFIG_FB_SIS_300=y -CONFIG_FB_SIS_315=y +# CONFIG_FB_SIS is not set CONFIG_FB_NEOMAGIC=m CONFIG_FB_KYRO=m CONFIG_FB_3DFX=m -# CONFIG_FB_3DFX_ACCEL is not set +CONFIG_FB_3DFX_ACCEL=y CONFIG_FB_VOODOO1=m CONFIG_FB_TRIDENT=m -# CONFIG_FB_TRIDENT_ACCEL is not set +CONFIG_FB_TRIDENT_ACCEL=y # CONFIG_FB_PM3 is not set -CONFIG_FB_GEODE=y -CONFIG_FB_GEODE_GX1=m -CONFIG_FB_S1D13XXX=m -CONFIG_FB_VIRTUAL=m +# CONFIG_FB_GEODE is not set +# CONFIG_FB_S1D13XXX is not set +# CONFIG_FB_VIRTUAL is not set # # Console display driver support # CONFIG_VGA_CONSOLE=y CONFIG_DUMMY_CONSOLE=y -CONFIG_FRAMEBUFFER_CONSOLE=m +CONFIG_FRAMEBUFFER_CONSOLE=y # CONFIG_FONTS is not set CONFIG_FONT_8x8=y CONFIG_FONT_8x16=y @@ -1901,8 +1771,15 @@ # # Logo configuration # -# CONFIG_LOGO is not set -# CONFIG_BACKLIGHT_LCD_SUPPORT is not set +CONFIG_LOGO=y +# CONFIG_LOGO_LINUX_MONO is not set +# CONFIG_LOGO_LINUX_VGA16 is not set +CONFIG_LOGO_LINUX_CLUT224=y +CONFIG_BACKLIGHT_LCD_SUPPORT=y 
+CONFIG_BACKLIGHT_CLASS_DEVICE=m +CONFIG_BACKLIGHT_DEVICE=y +CONFIG_LCD_CLASS_DEVICE=m +CONFIG_LCD_DEVICE=y # # Sound @@ -1936,7 +1813,7 @@ CONFIG_SND_DUMMY=m CONFIG_SND_VIRMIDI=m CONFIG_SND_MTPAV=m -CONFIG_SND_SERIAL_U16550=m +# CONFIG_SND_SERIAL_U16550 is not set CONFIG_SND_MPU401=m # @@ -1956,8 +1833,8 @@ CONFIG_SND_CS46XX_NEW_DSP=y CONFIG_SND_CS4281=m CONFIG_SND_EMU10K1=m -# CONFIG_SND_EMU10K1X is not set -# CONFIG_SND_CA0106 is not set +CONFIG_SND_EMU10K1X=m +CONFIG_SND_CA0106=m CONFIG_SND_KORG1212=m CONFIG_SND_MIXART=m CONFIG_SND_NM256=m @@ -1982,7 +1859,7 @@ CONFIG_SND_INTEL8X0M=m CONFIG_SND_SONICVIBES=m CONFIG_SND_VIA82XX=m -# CONFIG_SND_VIA82XX_MODEM is not set +CONFIG_SND_VIA82XX_MODEM=m CONFIG_SND_VX222=m CONFIG_SND_HDA_INTEL=m @@ -1995,75 +1872,7 @@ # # Open Sound System # -CONFIG_SOUND_PRIME=m -CONFIG_SOUND_BT878=m -CONFIG_SOUND_CMPCI=m -# CONFIG_SOUND_CMPCI_FM is not set -# CONFIG_SOUND_CMPCI_MIDI is not set -CONFIG_SOUND_CMPCI_JOYSTICK=y -CONFIG_SOUND_EMU10K1=m -CONFIG_MIDI_EMU10K1=y -CONFIG_SOUND_FUSION=m -CONFIG_SOUND_CS4281=m -CONFIG_SOUND_ES1370=m -CONFIG_SOUND_ES1371=m -CONFIG_SOUND_ESSSOLO1=m -CONFIG_SOUND_MAESTRO=m -CONFIG_SOUND_MAESTRO3=m -CONFIG_SOUND_ICH=m -CONFIG_SOUND_SONICVIBES=m -CONFIG_SOUND_TRIDENT=m -# CONFIG_SOUND_MSNDCLAS is not set -# CONFIG_SOUND_MSNDPIN is not set -CONFIG_SOUND_VIA82CXXX=m -CONFIG_MIDI_VIA82CXXX=y -CONFIG_SOUND_OSS=m -# CONFIG_SOUND_TRACEINIT is not set -# CONFIG_SOUND_DMAP is not set -# CONFIG_SOUND_AD1816 is not set -CONFIG_SOUND_AD1889=m -CONFIG_SOUND_SGALAXY=m -CONFIG_SOUND_ADLIB=m -CONFIG_SOUND_ACI_MIXER=m -CONFIG_SOUND_CS4232=m -CONFIG_SOUND_SSCAPE=m -CONFIG_SOUND_GUS=m -CONFIG_SOUND_GUS16=y -CONFIG_SOUND_GUSMAX=y -CONFIG_SOUND_VMIDI=m -CONFIG_SOUND_TRIX=m -CONFIG_SOUND_MSS=m -CONFIG_SOUND_MPU401=m -CONFIG_SOUND_NM256=m -CONFIG_SOUND_MAD16=m -CONFIG_MAD16_OLDCARD=y -CONFIG_SOUND_PAS=m -CONFIG_SOUND_PSS=m -CONFIG_PSS_MIXER=y -CONFIG_SOUND_SB=m -# CONFIG_SOUND_AWE32_SYNTH is not set -CONFIG_SOUND_WAVEFRONT=m 
-CONFIG_SOUND_MAUI=m -CONFIG_SOUND_YM3812=m -CONFIG_SOUND_OPL3SA1=m -CONFIG_SOUND_OPL3SA2=m -CONFIG_SOUND_YMFPCI=m -# CONFIG_SOUND_YMFPCI_LEGACY is not set -CONFIG_SOUND_UART6850=m -CONFIG_SOUND_AEDSP16=m -CONFIG_SC6600=y -CONFIG_SC6600_JOY=y -CONFIG_SC6600_CDROM=4 -CONFIG_SC6600_CDROMBASE=0x0 -# CONFIG_AEDSP16_MSS is not set -# CONFIG_AEDSP16_SBPRO is not set -# CONFIG_AEDSP16_MPU401 is not set -CONFIG_SOUND_TVMIXER=m -CONFIG_SOUND_KAHLUA=m -CONFIG_SOUND_ALI5455=m -CONFIG_SOUND_FORTE=m -CONFIG_SOUND_RME96XX=m -CONFIG_SOUND_AD1980=m +# CONFIG_SOUND_PRIME is not set # # USB support @@ -2077,14 +1886,14 @@ # Miscellaneous USB options # CONFIG_USB_DEVICEFS=y -CONFIG_USB_BANDWIDTH=y +# CONFIG_USB_BANDWIDTH is not set # CONFIG_USB_DYNAMIC_MINORS is not set # CONFIG_USB_OTG is not set # # USB Host Controller Drivers # -CONFIG_USB_EHCI_HCD=y +CONFIG_USB_EHCI_HCD=m CONFIG_USB_EHCI_SPLIT_ISO=y CONFIG_USB_EHCI_ROOT_HUB_TT=y CONFIG_USB_OHCI_HCD=m @@ -2096,7 +1905,7 @@ # # USB Device Class drivers # -CONFIG_USB_AUDIO=m +# CONFIG_USB_AUDIO is not set # # USB Bluetooth TTY can only be used with disabled Bluetooth subsystem @@ -2122,16 +1931,13 @@ # # USB Input Devices # -CONFIG_USB_HID=m +CONFIG_USB_HID=y CONFIG_USB_HIDINPUT=y -# CONFIG_HID_FF is not set +CONFIG_HID_FF=y +CONFIG_HID_PID=y +CONFIG_LOGITECH_FF=y +CONFIG_THRUSTMASTER_FF=y CONFIG_USB_HIDDEV=y - -# -# USB HID Boot Protocol drivers -# -CONFIG_USB_KBD=y -CONFIG_USB_MOUSE=y CONFIG_USB_AIPTEK=m CONFIG_USB_WACOM=m CONFIG_USB_KBTAB=m @@ -2150,7 +1956,7 @@ # # USB Multimedia devices # -# CONFIG_USB_DABUSB is not set +CONFIG_USB_DABUSB=m CONFIG_USB_VICAM=m CONFIG_USB_DSBR=m CONFIG_USB_IBMCAM=m @@ -2220,30 +2026,30 @@ CONFIG_USB_SERIAL_IR=m CONFIG_USB_SERIAL_EDGEPORT=m CONFIG_USB_SERIAL_EDGEPORT_TI=m -# CONFIG_USB_SERIAL_GARMIN is not set +CONFIG_USB_SERIAL_GARMIN=m CONFIG_USB_SERIAL_IPW=m CONFIG_USB_SERIAL_KEYSPAN_PDA=m CONFIG_USB_SERIAL_KEYSPAN=m -# CONFIG_USB_SERIAL_KEYSPAN_MPR is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA28 
is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA28X is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA28XA is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA28XB is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA19 is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA18X is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA19W is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA19QW is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA19QI is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA49W is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA49WLC is not set +CONFIG_USB_SERIAL_KEYSPAN_MPR=y +CONFIG_USB_SERIAL_KEYSPAN_USA28=y +CONFIG_USB_SERIAL_KEYSPAN_USA28X=y +CONFIG_USB_SERIAL_KEYSPAN_USA28XA=y +CONFIG_USB_SERIAL_KEYSPAN_USA28XB=y +CONFIG_USB_SERIAL_KEYSPAN_USA19=y +CONFIG_USB_SERIAL_KEYSPAN_USA18X=y +CONFIG_USB_SERIAL_KEYSPAN_USA19W=y +CONFIG_USB_SERIAL_KEYSPAN_USA19QW=y +CONFIG_USB_SERIAL_KEYSPAN_USA19QI=y +CONFIG_USB_SERIAL_KEYSPAN_USA49W=y +CONFIG_USB_SERIAL_KEYSPAN_USA49WLC=y CONFIG_USB_SERIAL_KLSI=m CONFIG_USB_SERIAL_KOBIL_SCT=m CONFIG_USB_SERIAL_MCT_U232=m CONFIG_USB_SERIAL_PL2303=m CONFIG_USB_SERIAL_HP4X=m CONFIG_USB_SERIAL_SAFE=m -# CONFIG_USB_SERIAL_SAFE_PADDED is not set -# CONFIG_USB_SERIAL_TI is not set +CONFIG_USB_SERIAL_SAFE_PADDED=y +CONFIG_USB_SERIAL_TI=m CONFIG_USB_SERIAL_CYBERJACK=m CONFIG_USB_SERIAL_XIRCOM=m CONFIG_USB_SERIAL_OMNINET=m @@ -2252,17 +2058,17 @@ # # USB Miscellaneous drivers # -# CONFIG_USB_EMI62 is not set +CONFIG_USB_EMI62=m # CONFIG_USB_EMI26 is not set CONFIG_USB_AUERSWALD=m CONFIG_USB_RIO500=m CONFIG_USB_LEGOTOWER=m CONFIG_USB_LCD=m CONFIG_USB_LED=m -CONFIG_USB_CYTHERM=m +# CONFIG_USB_CYTHERM is not set CONFIG_USB_PHIDGETKIT=m CONFIG_USB_PHIDGETSERVO=m -# CONFIG_USB_IDMOUSE is not set +CONFIG_USB_IDMOUSE=m CONFIG_USB_SISUSBVGA=m CONFIG_USB_TEST=m @@ -2275,33 +2081,24 @@ # # USB Gadget Support # -CONFIG_USB_GADGET=m -# CONFIG_USB_GADGET_DEBUG_FILES is not set -CONFIG_USB_GADGET_NET2280=y -CONFIG_USB_NET2280=m -# CONFIG_USB_GADGET_PXA2XX is not set -# CONFIG_USB_GADGET_GOKU is not set -# 
CONFIG_USB_GADGET_LH7A40X is not set -# CONFIG_USB_GADGET_OMAP is not set -# CONFIG_USB_GADGET_DUMMY_HCD is not set -CONFIG_USB_GADGET_DUALSPEED=y -CONFIG_USB_ZERO=m -CONFIG_USB_ETH=m -CONFIG_USB_ETH_RNDIS=y -CONFIG_USB_GADGETFS=m -CONFIG_USB_FILE_STORAGE=m -# CONFIG_USB_FILE_STORAGE_TEST is not set -CONFIG_USB_G_SERIAL=m +# CONFIG_USB_GADGET is not set # # MMC/SD Card support # -# CONFIG_MMC is not set +CONFIG_MMC=m +# CONFIG_MMC_DEBUG is not set +CONFIG_MMC_BLOCK=m +CONFIG_MMC_WBSD=m # # InfiniBand support # -# CONFIG_INFINIBAND is not set +CONFIG_INFINIBAND=m +CONFIG_INFINIBAND_MTHCA=m +# CONFIG_INFINIBAND_MTHCA_DEBUG is not set +CONFIG_INFINIBAND_IPOIB=m +# CONFIG_INFINIBAND_IPOIB_DEBUG is not set # # Power management options @@ -2335,7 +2132,7 @@ # # File systems # -CONFIG_EXT2_FS=m +CONFIG_EXT2_FS=y CONFIG_EXT2_FS_XATTR=y CONFIG_EXT2_FS_POSIX_ACL=y CONFIG_EXT2_FS_SECURITY=y @@ -2345,16 +2142,18 @@ CONFIG_EXT3_FS_SECURITY=y CONFIG_JBD=m # CONFIG_JBD_DEBUG is not set -CONFIG_FS_MBCACHE=m +CONFIG_FS_MBCACHE=y CONFIG_REISERFS_FS=m # CONFIG_REISERFS_CHECK is not set -# CONFIG_REISERFS_PROC_INFO is not set -# CONFIG_REISERFS_FS_XATTR is not set +CONFIG_REISERFS_PROC_INFO=y +CONFIG_REISERFS_FS_XATTR=y +CONFIG_REISERFS_FS_POSIX_ACL=y +CONFIG_REISERFS_FS_SECURITY=y CONFIG_JFS_FS=m CONFIG_JFS_POSIX_ACL=y -# CONFIG_JFS_SECURITY is not set +CONFIG_JFS_SECURITY=y # CONFIG_JFS_DEBUG is not set -CONFIG_JFS_STATISTICS=y +# CONFIG_JFS_STATISTICS is not set CONFIG_FS_POSIX_ACL=y # @@ -2362,15 +2161,15 @@ # CONFIG_XFS_FS=m CONFIG_XFS_EXPORT=y -CONFIG_XFS_RT=y +# CONFIG_XFS_RT is not set CONFIG_XFS_QUOTA=y CONFIG_XFS_SECURITY=y CONFIG_XFS_POSIX_ACL=y CONFIG_MINIX_FS=m CONFIG_ROMFS_FS=m CONFIG_QUOTA=y -CONFIG_QFMT_V1=m -CONFIG_QFMT_V2=m +# CONFIG_QFMT_V1 is not set +CONFIG_QFMT_V2=y CONFIG_QUOTACTL=y CONFIG_DNOTIFY=y CONFIG_AUTOFS_FS=m @@ -2379,10 +2178,10 @@ # # CD-ROM/DVD Filesystems # -CONFIG_ISO9660_FS=m +CONFIG_ISO9660_FS=y CONFIG_JOLIET=y CONFIG_ZISOFS=y -CONFIG_ZISOFS_FS=m 
+CONFIG_ZISOFS_FS=y CONFIG_UDF_FS=m CONFIG_UDF_NLS=y @@ -2393,10 +2192,8 @@ CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_FAT_DEFAULT_CODEPAGE=437 -CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1" -CONFIG_NTFS_FS=m -# CONFIG_NTFS_DEBUG is not set -# CONFIG_NTFS_RW is not set +CONFIG_FAT_DEFAULT_IOCHARSET="ascii" +# CONFIG_NTFS_FS is not set # # Pseudo filesystems @@ -2404,9 +2201,7 @@ CONFIG_PROC_FS=y CONFIG_PROC_KCORE=y CONFIG_SYSFS=y -CONFIG_DEVFS_FS=y -# CONFIG_DEVFS_MOUNT is not set -# CONFIG_DEVFS_DEBUG is not set +# CONFIG_DEVFS_FS is not set CONFIG_DEVPTS_FS_XATTR=y CONFIG_DEVPTS_FS_SECURITY=y CONFIG_TMPFS=y @@ -2419,8 +2214,7 @@ # # Miscellaneous filesystems # -CONFIG_ADFS_FS=m -# CONFIG_ADFS_FS_RW is not set +# CONFIG_ADFS_FS is not set CONFIG_AFFS_FS=m CONFIG_HFS_FS=m CONFIG_HFSPLUS_FS=m @@ -2428,20 +2222,18 @@ # CONFIG_BEFS_DEBUG is not set CONFIG_BFS_FS=m CONFIG_EFS_FS=m -CONFIG_JFFS_FS=m -CONFIG_JFFS_FS_VERBOSE=0 -CONFIG_JFFS_PROC_FS=y +# CONFIG_JFFS_FS is not set CONFIG_JFFS2_FS=m CONFIG_JFFS2_FS_DEBUG=0 -# CONFIG_JFFS2_FS_NAND is not set +CONFIG_JFFS2_FS_NAND=y # CONFIG_JFFS2_FS_NOR_ECC is not set # CONFIG_JFFS2_COMPRESSION_OPTIONS is not set CONFIG_JFFS2_ZLIB=y CONFIG_JFFS2_RTIME=y # CONFIG_JFFS2_RUBIN is not set -CONFIG_CRAMFS=y +CONFIG_CRAMFS=m CONFIG_VXFS_FS=m -CONFIG_HPFS_FS=m +# CONFIG_HPFS_FS is not set CONFIG_QNX4FS_FS=m # CONFIG_QNX4FS_RW is not set CONFIG_SYSV_FS=m @@ -2470,7 +2262,8 @@ # CONFIG_SMB_NLS_DEFAULT is not set CONFIG_CIFS=m # CONFIG_CIFS_STATS is not set -# CONFIG_CIFS_XATTR is not set +CONFIG_CIFS_XATTR=y +CONFIG_CIFS_POSIX=y # CONFIG_CIFS_EXPERIMENTAL is not set CONFIG_NCP_FS=m CONFIG_NCPFS_PACKET_SIGNING=y @@ -2478,38 +2271,29 @@ CONFIG_NCPFS_STRONG=y CONFIG_NCPFS_NFS_NS=y CONFIG_NCPFS_OS2_NS=y -# CONFIG_NCPFS_SMALLDOS is not set +CONFIG_NCPFS_SMALLDOS=y CONFIG_NCPFS_NLS=y CONFIG_NCPFS_EXTRAS=y -CONFIG_CODA_FS=m -# CONFIG_CODA_FS_OLD_API is not set -CONFIG_AFS_FS=m -CONFIG_RXRPC=m +# CONFIG_CODA_FS is not set +# CONFIG_AFS_FS is not set 
# # Partition Types # CONFIG_PARTITION_ADVANCED=y -CONFIG_ACORN_PARTITION=y -CONFIG_ACORN_PARTITION_CUMANA=y -# CONFIG_ACORN_PARTITION_EESOX is not set -CONFIG_ACORN_PARTITION_ICS=y -# CONFIG_ACORN_PARTITION_ADFS is not set -# CONFIG_ACORN_PARTITION_POWERTEC is not set -CONFIG_ACORN_PARTITION_RISCIX=y +# CONFIG_ACORN_PARTITION is not set CONFIG_OSF_PARTITION=y CONFIG_AMIGA_PARTITION=y -CONFIG_ATARI_PARTITION=y +# CONFIG_ATARI_PARTITION is not set CONFIG_MAC_PARTITION=y CONFIG_MSDOS_PARTITION=y CONFIG_BSD_DISKLABEL=y CONFIG_MINIX_SUBPARTITION=y CONFIG_SOLARIS_X86_PARTITION=y CONFIG_UNIXWARE_DISKLABEL=y -CONFIG_LDM_PARTITION=y -# CONFIG_LDM_DEBUG is not set +# CONFIG_LDM_PARTITION is not set CONFIG_SGI_PARTITION=y -CONFIG_ULTRIX_PARTITION=y +# CONFIG_ULTRIX_PARTITION is not set CONFIG_SUN_PARTITION=y CONFIG_EFI_PARTITION=y @@ -2517,8 +2301,8 @@ # Native Language Support # CONFIG_NLS=y -CONFIG_NLS_DEFAULT="cp437" -CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_CODEPAGE_737=m CONFIG_NLS_CODEPAGE_775=m CONFIG_NLS_CODEPAGE_850=m @@ -2541,7 +2325,7 @@ CONFIG_NLS_ISO8859_8=m CONFIG_NLS_CODEPAGE_1250=m CONFIG_NLS_CODEPAGE_1251=m -CONFIG_NLS_ASCII=m +CONFIG_NLS_ASCII=y CONFIG_NLS_ISO8859_1=m CONFIG_NLS_ISO8859_2=m CONFIG_NLS_ISO8859_3=m @@ -2561,15 +2345,15 @@ # Security options # CONFIG_KEYS=y -# CONFIG_KEYS_DEBUG_PROC_KEYS is not set +CONFIG_KEYS_DEBUG_PROC_KEYS=y CONFIG_SECURITY=y -# CONFIG_SECURITY_NETWORK is not set -CONFIG_SECURITY_CAPABILITIES=m -CONFIG_SECURITY_ROOTPLUG=m -CONFIG_SECURITY_SECLVL=m +CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_CAPABILITIES=y +# CONFIG_SECURITY_ROOTPLUG is not set +# CONFIG_SECURITY_SECLVL is not set CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y -CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE=0 +CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE=1 CONFIG_SECURITY_SELINUX_DISABLE=y CONFIG_SECURITY_SELINUX_DEVELOP=y CONFIG_SECURITY_SELINUX_AVC_STATS=y @@ -2582,8 +2366,8 @@ CONFIG_CRYPTO_HMAC=y 
CONFIG_CRYPTO_NULL=m CONFIG_CRYPTO_MD4=m -CONFIG_CRYPTO_MD5=y -CONFIG_CRYPTO_SHA1=m +CONFIG_CRYPTO_MD5=m +CONFIG_CRYPTO_SHA1=y CONFIG_CRYPTO_SHA256=m CONFIG_CRYPTO_SHA512=m CONFIG_CRYPTO_WP512=m @@ -2602,7 +2386,7 @@ CONFIG_CRYPTO_DEFLATE=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_CRC32C=m -CONFIG_CRYPTO_TEST=m +# CONFIG_CRYPTO_TEST is not set # # Hardware crypto devices @@ -2616,8 +2400,6 @@ CONFIG_LIBCRC32C=m CONFIG_ZLIB_INFLATE=y CONFIG_ZLIB_DEFLATE=m -CONFIG_REED_SOLOMON=m -CONFIG_REED_SOLOMON_DEC16=y # # Kernel hacking @@ -2625,7 +2407,7 @@ # CONFIG_PRINTK_TIME is not set CONFIG_DEBUG_KERNEL=y CONFIG_MAGIC_SYSRQ=y -CONFIG_LOG_BUF_SHIFT=14 +CONFIG_LOG_BUF_SHIFT=15 # CONFIG_SCHEDSTATS is not set # CONFIG_DEBUG_SLAB is not set # CONFIG_DEBUG_SPINLOCK is not set @@ -2640,5 +2422,4 @@ # CONFIG_4KSTACKS is not set CONFIG_X86_FIND_SMP_CONFIG=y CONFIG_X86_MPPARSE=y -# CONFIG_CHECKING is not set # CONFIG_INIT_DEBUG is not set diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/i386/Kconfig --- a/linux-2.6-xen-sparse/arch/xen/i386/Kconfig Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/i386/Kconfig Thu Aug 25 22:53:20 2005 @@ -533,6 +533,11 @@ with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to /dev/cpu/31/cpuid. 
+config SWIOTLB + bool + depends on PCI + default y + source "drivers/firmware/Kconfig" choice diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/i386/kernel/Makefile --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/Makefile Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/Makefile Thu Aug 25 22:53:20 2005 @@ -14,8 +14,7 @@ c-obj-y := semaphore.o vm86.o \ ptrace.o sys_i386.o \ - i387.o dmi_scan.o bootflag.o \ - doublefault.o + i387.o dmi_scan.o bootflag.o s-obj-y := obj-y += cpu/ @@ -44,6 +43,7 @@ c-obj-$(CONFIG_EFI) += efi.o efi_stub.o c-obj-$(CONFIG_EARLY_PRINTK) += early_printk.o c-obj-$(CONFIG_SMP_ALTERNATIVES)+= smpalts.o +obj-$(CONFIG_SWIOTLB) += swiotlb.o EXTRA_AFLAGS := -traditional @@ -84,7 +84,7 @@ $(obj)/vsyscall-sysenter.o FORCE $(call if_changed,syscall) -c-link := init_task.o +c-link := s-link := vsyscall-int80.o vsyscall-sysenter.o vsyscall-sigreturn.o vsyscall.lds.o syscall_table.o $(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-obj-m) $(c-link)) $(patsubst %.o,$(obj)/%.S,$(s-obj-y) $(s-link)): @@ -92,6 +92,7 @@ $(obj)/vsyscall-int80.S: $(obj)/vsyscall-sigreturn.S +EXTRA_AFLAGS += -I$(obj) $(obj)/entry.o: $(src)/entry.S $(src)/syscall_table.S obj-y += $(c-obj-y) $(s-obj-y) diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/i386/kernel/cpu/common.c --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/cpu/common.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/cpu/common.c Thu Aug 25 22:53:20 2005 @@ -19,11 +19,13 @@ #include "cpu.h" +#ifndef CONFIG_XEN DEFINE_PER_CPU(struct desc_struct, cpu_gdt_table[GDT_ENTRIES]); EXPORT_PER_CPU_SYMBOL(cpu_gdt_table); DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]); EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack); +#endif static int cachesize_override __initdata = -1; static int disable_x86_fxsr __initdata = 0; @@ -569,7 +571,7 @@ for (va = gdt_descr->address, f = 0; va < gdt_descr->address + gdt_descr->size; va += PAGE_SIZE, f++) { - 
frames[f] = virt_to_machine(va) >> PAGE_SHIFT; + frames[f] = virt_to_mfn(va); make_page_readonly((void *)va); } if (HYPERVISOR_set_gdt(frames, gdt_descr->size / 8)) diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/i386/kernel/head.S --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/head.S Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/head.S Thu Aug 25 22:53:20 2005 @@ -136,9 +136,6 @@ ENTRY(empty_zero_page) .org 0x2000 -ENTRY(swapper_pg_dir) - -.org 0x3000 ENTRY(cpu_gdt_table) .quad 0x0000000000000000 /* NULL descriptor */ .quad 0x0000000000000000 /* 0x0b reserved */ @@ -190,10 +187,10 @@ .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */ .fill GDT_ENTRIES-32,8,0 -.org 0x4000 +.org 0x3000 ENTRY(default_ldt) -.org 0x5000 +.org 0x4000 /* * Real beginning of normal "text" segment */ diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/i386/kernel/i386_ksyms.c --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/i386_ksyms.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/i386_ksyms.c Thu Aug 25 22:53:20 2005 @@ -115,9 +115,6 @@ EXPORT_SYMBOL(__copy_to_user_ll); EXPORT_SYMBOL(strnlen_user); -EXPORT_SYMBOL(dma_alloc_coherent); -EXPORT_SYMBOL(dma_free_coherent); - #ifdef CONFIG_PCI EXPORT_SYMBOL(pci_mem_start); #endif diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/i386/kernel/ioport.c --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/ioport.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/ioport.c Thu Aug 25 22:53:20 2005 @@ -80,7 +80,7 @@ t->io_bitmap_ptr = bitmap; op.cmd = PHYSDEVOP_SET_IOBITMAP; - op.u.set_iobitmap.bitmap = (unsigned long)bitmap; + op.u.set_iobitmap.bitmap = (char *)bitmap; op.u.set_iobitmap.nr_ports = IO_BITMAP_BITS; HYPERVISOR_physdev_op(&op); } @@ -113,16 +113,12 @@ if ((new_io_pl > old_io_pl) && !capable(CAP_SYS_RAWIO)) return -EPERM; - /* Maintain OS privileges even if user attempts to relinquish them. 
*/ - if (new_io_pl == 0) - new_io_pl = 1; - /* Change our version of the privilege levels. */ current->thread.io_pl = new_io_pl; /* Force the change at ring 0. */ op.cmd = PHYSDEVOP_SET_IOPL; - op.u.set_iopl.iopl = new_io_pl; + op.u.set_iopl.iopl = (new_io_pl == 0) ? 1 : new_io_pl; HYPERVISOR_physdev_op(&op); return 0; diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/i386/kernel/ldt.c --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/ldt.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/ldt.c Thu Aug 25 22:53:20 2005 @@ -198,7 +198,7 @@ { struct mm_struct * mm = current->mm; __u32 entry_1, entry_2, *lp; - unsigned long mach_lp; + maddr_t mach_lp; int error; struct user_desc ldt_info; @@ -245,7 +245,8 @@ /* Install the new entry ... */ install: - error = HYPERVISOR_update_descriptor(mach_lp, entry_1, entry_2); + error = HYPERVISOR_update_descriptor( + mach_lp, (u64)entry_1 | ((u64)entry_2<<32)); out_unlock: up(&mm->context.sem); diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/i386/kernel/mpparse.c --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/mpparse.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/mpparse.c Thu Aug 25 22:53:20 2005 @@ -784,7 +784,9 @@ void __init find_smp_config (void) { +#ifndef CONFIG_XEN unsigned int address; +#endif /* * FIXME: Linux assumes you have 640K of base ram.. diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/i386/kernel/pci-dma.c --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/pci-dma.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/pci-dma.c Thu Aug 25 22:53:20 2005 @@ -23,6 +23,104 @@ int flags; unsigned long *bitmap; }; + +#define IOMMU_BUG_ON(test) \ +do { \ + if (unlikely(test)) { \ + printk(KERN_ALERT "Fatal DMA error! 
" \ + "Please use 'swiotlb=force'\n"); \ + BUG(); \ + } \ +} while (0) + +int +dma_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + int i, rc; + + BUG_ON(direction == DMA_NONE); + + if (swiotlb) { + rc = swiotlb_map_sg(hwdev, sg, nents, direction); + } else { + for (i = 0; i < nents; i++ ) { + sg[i].dma_address = + page_to_phys(sg[i].page) + sg[i].offset; + sg[i].dma_length = sg[i].length; + BUG_ON(!sg[i].page); + IOMMU_BUG_ON(address_needs_mapping( + hwdev, sg[i].dma_address)); + } + rc = nents; + } + + flush_write_buffers(); + return rc; +} +EXPORT_SYMBOL(dma_map_sg); + +void +dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + BUG_ON(direction == DMA_NONE); + if (swiotlb) + swiotlb_unmap_sg(hwdev, sg, nents, direction); +} +EXPORT_SYMBOL(dma_unmap_sg); + +dma_addr_t +dma_map_page(struct device *dev, struct page *page, unsigned long offset, + size_t size, enum dma_data_direction direction) +{ + dma_addr_t dma_addr; + + BUG_ON(direction == DMA_NONE); + + if (swiotlb) { + dma_addr = swiotlb_map_page( + dev, page, offset, size, direction); + } else { + dma_addr = page_to_phys(page) + offset; + IOMMU_BUG_ON(address_needs_mapping(dev, dma_addr)); + } + + return dma_addr; +} +EXPORT_SYMBOL(dma_map_page); + +void +dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, + enum dma_data_direction direction) +{ + BUG_ON(direction == DMA_NONE); + if (swiotlb) + swiotlb_unmap_page(dev, dma_address, size, direction); +} +EXPORT_SYMBOL(dma_unmap_page); + +int +dma_mapping_error(dma_addr_t dma_addr) +{ + if (swiotlb) + return swiotlb_dma_mapping_error(dma_addr); + return 0; +} +EXPORT_SYMBOL(dma_mapping_error); + +int +dma_supported(struct device *dev, u64 mask) +{ + if (swiotlb) + return swiotlb_dma_supported(dev, mask); + /* + * By default we'll BUG when an infeasible DMA is requested, and + * request swiotlb=force (see IOMMU_BUG_ON). 
+ */ + return 1; +} +EXPORT_SYMBOL(dma_supported); void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, unsigned int __nocast gfp) @@ -54,13 +152,14 @@ ret = (void *)vstart; if (ret != NULL) { - xen_contig_memory(vstart, order); + xen_create_contiguous_region(vstart, order); memset(ret, 0, size); *dma_handle = virt_to_bus(ret); } return ret; } +EXPORT_SYMBOL(dma_alloc_coherent); void dma_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle) @@ -72,9 +171,12 @@ int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; bitmap_release_region(mem->bitmap, page, order); - } else + } else { + xen_destroy_contiguous_region((unsigned long)vaddr, order); free_pages((unsigned long)vaddr, order); -} + } +} +EXPORT_SYMBOL(dma_free_coherent); int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, dma_addr_t device_addr, size_t size, int flags) @@ -153,46 +255,20 @@ } EXPORT_SYMBOL(dma_mark_declared_memory_occupied); -static LIST_HEAD(dma_map_head); -static DEFINE_SPINLOCK(dma_map_lock); -struct dma_map_entry { - struct list_head list; - dma_addr_t dma; - char *bounce, *host; - size_t size; -}; -#define DMA_MAP_MATCHES(e,d) (((e)->dma<=(d)) && (((e)->dma+(e)->size)>(d))) - dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size, enum dma_data_direction direction) { - struct dma_map_entry *ent; - void *bnc; dma_addr_t dma; - unsigned long flags; - - BUG_ON(direction == DMA_NONE); - - /* - * Even if size is sub-page, the buffer may still straddle a page - * boundary. Take into account buffer start offset. All other calls are - * conservative and always search the dma_map list if it's non-empty. 
- */ - if ((((unsigned int)ptr & ~PAGE_MASK) + size) <= PAGE_SIZE) { + + BUG_ON(direction == DMA_NONE); + + if (swiotlb) { + dma = swiotlb_map_single(dev, ptr, size, direction); + } else { dma = virt_to_bus(ptr); - } else { - BUG_ON((bnc = dma_alloc_coherent(dev, size, &dma, 0)) == NULL); - BUG_ON((ent = kmalloc(sizeof(*ent), GFP_KERNEL)) == NULL); - if (direction != DMA_FROM_DEVICE) - memcpy(bnc, ptr, size); - ent->dma = dma; - ent->bounce = bnc; - ent->host = ptr; - ent->size = size; - spin_lock_irqsave(&dma_map_lock, flags); - list_add(&ent->list, &dma_map_head); - spin_unlock_irqrestore(&dma_map_lock, flags); + IOMMU_BUG_ON(range_straddles_page_boundary(ptr, size)); + IOMMU_BUG_ON(address_needs_mapping(dev, dma)); } flush_write_buffers(); @@ -204,30 +280,9 @@ dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, enum dma_data_direction direction) { - struct dma_map_entry *ent; - unsigned long flags; - - BUG_ON(direction == DMA_NONE); - - /* Fast-path check: are there any multi-page DMA mappings? */ - if (!list_empty(&dma_map_head)) { - spin_lock_irqsave(&dma_map_lock, flags); - list_for_each_entry ( ent, &dma_map_head, list ) { - if (DMA_MAP_MATCHES(ent, dma_addr)) { - list_del(&ent->list); - break; - } - } - spin_unlock_irqrestore(&dma_map_lock, flags); - if (&ent->list != &dma_map_head) { - BUG_ON(dma_addr != ent->dma); - BUG_ON(size != ent->size); - if (direction != DMA_TO_DEVICE) - memcpy(ent->host, ent->bounce, size); - dma_free_coherent(dev, size, ent->bounce, ent->dma); - kfree(ent); - } - } + BUG_ON(direction == DMA_NONE); + if (swiotlb) + swiotlb_unmap_single(dev, dma_addr, size, direction); } EXPORT_SYMBOL(dma_unmap_single); @@ -235,23 +290,8 @@ dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction direction) { - struct dma_map_entry *ent; - unsigned long flags, off; - - /* Fast-path check: are there any multi-page DMA mappings? 
*/ - if (!list_empty(&dma_map_head)) { - spin_lock_irqsave(&dma_map_lock, flags); - list_for_each_entry ( ent, &dma_map_head, list ) - if (DMA_MAP_MATCHES(ent, dma_handle)) - break; - spin_unlock_irqrestore(&dma_map_lock, flags); - if (&ent->list != &dma_map_head) { - off = dma_handle - ent->dma; - BUG_ON((off + size) > ent->size); - /*if (direction != DMA_TO_DEVICE)*/ - memcpy(ent->host+off, ent->bounce+off, size); - } - } + if (swiotlb) + swiotlb_sync_single_for_cpu(dev, dma_handle, size, direction); } EXPORT_SYMBOL(dma_sync_single_for_cpu); @@ -259,24 +299,17 @@ dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction direction) { - struct dma_map_entry *ent; - unsigned long flags, off; - - /* Fast-path check: are there any multi-page DMA mappings? */ - if (!list_empty(&dma_map_head)) { - spin_lock_irqsave(&dma_map_lock, flags); - list_for_each_entry ( ent, &dma_map_head, list ) - if (DMA_MAP_MATCHES(ent, dma_handle)) - break; - spin_unlock_irqrestore(&dma_map_lock, flags); - if (&ent->list != &dma_map_head) { - off = dma_handle - ent->dma; - BUG_ON((off + size) > ent->size); - /*if (direction != DMA_FROM_DEVICE)*/ - memcpy(ent->bounce+off, ent->host+off, size); - } - } - - flush_write_buffers(); + if (swiotlb) + swiotlb_sync_single_for_device(dev, dma_handle, size, direction); } EXPORT_SYMBOL(dma_sync_single_for_device); + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/i386/kernel/process.c --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/process.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/process.c Thu Aug 25 22:53:20 2005 @@ -115,20 +115,12 @@ /* We don't actually take CPU down, just spin without interrupts. 
*/ static inline void play_dead(void) { - /* Ack it */ - __get_cpu_var(cpu_state) = CPU_DEAD; - - /* We shouldn't have to disable interrupts while dead, but - * some interrupts just don't seem to go away, and this makes - * it "work" for testing purposes. */ /* Death loop */ while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE) HYPERVISOR_yield(); - local_irq_disable(); __flush_tlb_all(); cpu_set(smp_processor_id(), cpu_online_map); - local_irq_enable(); } #else static inline void play_dead(void) @@ -156,12 +148,19 @@ rmb(); if (cpu_is_offline(cpu)) { + local_irq_disable(); #if defined(CONFIG_XEN) && defined(CONFIG_HOTPLUG_CPU) + /* Ack it. From this point on until + we get woken up, we're not allowed + to take any locks. In particular, + don't printk. */ + __get_cpu_var(cpu_state) = CPU_DEAD; /* Tell hypervisor to take vcpu down. */ HYPERVISOR_vcpu_down(cpu); #endif play_dead(); - } + local_irq_enable(); + } __get_cpu_var(irq_stat).idle_timestamp = jiffies; xen_idle(); @@ -523,23 +522,22 @@ * Load the per-thread Thread-Local Storage descriptor. * This is load_TLS(next, cpu) with multicalls. 
*/ -#define C(i) do { \ - if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \ - next->tls_array[i].b != prev->tls_array[i].b)) { \ - mcl->op = __HYPERVISOR_update_descriptor; \ - mcl->args[0] = virt_to_machine(&get_cpu_gdt_table(cpu) \ - [GDT_ENTRY_TLS_MIN + i]); \ - mcl->args[1] = ((u32 *)&next->tls_array[i])[0]; \ - mcl->args[2] = ((u32 *)&next->tls_array[i])[1]; \ - mcl++; \ - } \ +#define C(i) do { \ + if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \ + next->tls_array[i].b != prev->tls_array[i].b)) { \ + mcl->op = __HYPERVISOR_update_descriptor; \ + *(u64 *)&mcl->args[0] = virt_to_machine( \ + &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\ + *(u64 *)&mcl->args[2] = *(u64 *)&next->tls_array[i]; \ + mcl++; \ + } \ } while (0) C(0); C(1); C(2); #undef C if (unlikely(prev->io_pl != next->io_pl)) { iopl_op.cmd = PHYSDEVOP_SET_IOPL; - iopl_op.u.set_iopl.iopl = next->io_pl; + iopl_op.u.set_iopl.iopl = (next->io_pl == 0) ? 1 : next->io_pl; mcl->op = __HYPERVISOR_physdev_op; mcl->args[0] = (unsigned long)&iopl_op; mcl++; @@ -549,7 +547,7 @@ iobmp_op.cmd = PHYSDEVOP_SET_IOBITMAP; iobmp_op.u.set_iobitmap.bitmap = - (unsigned long)next->io_bitmap_ptr; + (char *)next->io_bitmap_ptr; iobmp_op.u.set_iobitmap.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0; mcl->op = __HYPERVISOR_physdev_op; @@ -791,3 +789,10 @@ sp -= get_random_int() % 8192; return sp & ~0xf; } + + +#ifndef CONFIG_X86_SMP +void _restore_vcpu(void) +{ +} +#endif diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/i386/kernel/setup.c --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/setup.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/setup.c Thu Aug 25 22:53:20 2005 @@ -1575,19 +1575,20 @@ /* Make sure we have a correctly sized P->M table. 
*/ if (max_pfn != xen_start_info.nr_pages) { phys_to_machine_mapping = alloc_bootmem_low_pages( - max_pfn * sizeof(unsigned long)); + max_pfn * sizeof(unsigned int)); if (max_pfn > xen_start_info.nr_pages) { /* set to INVALID_P2M_ENTRY */ memset(phys_to_machine_mapping, ~0, - max_pfn * sizeof(unsigned long)); + max_pfn * sizeof(unsigned int)); memcpy(phys_to_machine_mapping, - (unsigned long *)xen_start_info.mfn_list, - xen_start_info.nr_pages * sizeof(unsigned long)); + (unsigned int *)xen_start_info.mfn_list, + xen_start_info.nr_pages * sizeof(unsigned int)); } else { memcpy(phys_to_machine_mapping, - (unsigned long *)xen_start_info.mfn_list, - max_pfn * sizeof(unsigned long)); + (unsigned int *)xen_start_info.mfn_list, + max_pfn * sizeof(unsigned int)); + /* N.B. below relies on sizeof(int) == sizeof(long). */ if (HYPERVISOR_dom_mem_op( MEMOP_decrease_reservation, (unsigned long *)xen_start_info.mfn_list + max_pfn, @@ -1597,18 +1598,17 @@ free_bootmem( __pa(xen_start_info.mfn_list), PFN_PHYS(PFN_UP(xen_start_info.nr_pages * - sizeof(unsigned long)))); + sizeof(unsigned int)))); } pfn_to_mfn_frame_list = alloc_bootmem_low_pages(PAGE_SIZE); - for ( i=0, j=0; i < max_pfn; i+=(PAGE_SIZE/sizeof(unsigned long)), j++ ) + for ( i=0, j=0; i < max_pfn; i+=(PAGE_SIZE/sizeof(unsigned int)), j++ ) { pfn_to_mfn_frame_list[j] = - virt_to_machine(&phys_to_machine_mapping[i]) >> PAGE_SHIFT; + virt_to_mfn(&phys_to_machine_mapping[i]); } HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list = - virt_to_machine(pfn_to_mfn_frame_list) >> PAGE_SHIFT; - + virt_to_mfn(pfn_to_mfn_frame_list); /* * NOTE: at this point the bootmem allocator is fully available. 
@@ -1636,7 +1636,7 @@ efi_map_memmap(); op.cmd = PHYSDEVOP_SET_IOPL; - op.u.set_iopl.iopl = current->thread.io_pl = 1; + op.u.set_iopl.iopl = 1; HYPERVISOR_physdev_op(&op); #ifdef CONFIG_ACPI_BOOT diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/i386/kernel/smpboot.c --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/smpboot.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/smpboot.c Thu Aug 25 22:53:20 2005 @@ -62,6 +62,8 @@ #include <mach_wakecpu.h> #include <smpboot_hooks.h> +#include <asm-xen/evtchn.h> + /* Set if we find a B stepping CPU */ static int __initdata smp_b_stepping; @@ -129,15 +131,7 @@ */ void __init smp_alloc_memory(void) { -#if 1 - int cpu; - - for (cpu = 1; cpu < NR_CPUS; cpu++) { - cpu_gdt_descr[cpu].address = (unsigned long) - alloc_bootmem_low_pages(PAGE_SIZE); - /* XXX free unused pages later */ - } -#else +#if 0 trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE); /* * Has to be in very low memory so we can execute @@ -859,8 +853,8 @@ atomic_set(&init_deasserted, 0); #if 1 - if (cpu_gdt_descr[0].size > PAGE_SIZE) - BUG(); + cpu_gdt_descr[cpu].address = __get_free_page(GFP_KERNEL); + BUG_ON(cpu_gdt_descr[0].size > PAGE_SIZE); cpu_gdt_descr[cpu].size = cpu_gdt_descr[0].size; printk("GDT: copying %d bytes from %lx to %lx\n", cpu_gdt_descr[0].size, cpu_gdt_descr[0].address, @@ -878,7 +872,8 @@ ctxt.user_regs.cs = __KERNEL_CS; ctxt.user_regs.eip = start_eip; ctxt.user_regs.esp = idle->thread.esp; - ctxt.user_regs.eflags = (1<<9) | (1<<2) | (idle->thread.io_pl<<12); +#define X86_EFLAGS_IOPL_RING1 0x1000 + ctxt.user_regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_IOPL_RING1; /* FPU is set up to default initial state. 
*/ memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt)); @@ -901,7 +896,7 @@ for (va = cpu_gdt_descr[cpu].address, f = 0; va < cpu_gdt_descr[cpu].address + cpu_gdt_descr[cpu].size; va += PAGE_SIZE, f++) { - ctxt.gdt_frames[f] = virt_to_machine(va) >> PAGE_SHIFT; + ctxt.gdt_frames[f] = virt_to_mfn(va); make_page_readonly((void *)va); } ctxt.gdt_ents = cpu_gdt_descr[cpu].size / 8; @@ -917,10 +912,11 @@ ctxt.failsafe_callback_cs = __KERNEL_CS; ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; - ctxt.ctrlreg[3] = (unsigned long)virt_to_machine(swapper_pg_dir); + ctxt.ctrlreg[3] = virt_to_mfn(swapper_pg_dir) << PAGE_SHIFT; boot_error = HYPERVISOR_boot_vcpu(cpu, &ctxt); - printk("boot error: %ld\n", boot_error); + if (boot_error) + printk("boot error: %ld\n", boot_error); if (!boot_error) { /* @@ -1321,14 +1317,127 @@ } #ifdef CONFIG_HOTPLUG_CPU -#include <asm-xen/ctrl_if.h> - +#include <asm-xen/xenbus.h> /* hotplug down/up funtion pointer and target vcpu */ struct vcpu_hotplug_handler_t { - void (*fn)(int vcpu); + void (*fn) (int vcpu); u32 vcpu; }; static struct vcpu_hotplug_handler_t vcpu_hotplug_handler; + +static int vcpu_hotplug_cpu_process(void *unused) +{ + struct vcpu_hotplug_handler_t *handler = &vcpu_hotplug_handler; + + if (handler->fn) { + (*(handler->fn)) (handler->vcpu); + handler->fn = NULL; + } + return 0; +} + +static void __vcpu_hotplug_handler(void *unused) +{ + int err; + + err = kernel_thread(vcpu_hotplug_cpu_process, + NULL, CLONE_FS | CLONE_FILES); + if (err < 0) + printk(KERN_ALERT "Error creating hotplug_cpu process!\n"); +} + +static void handle_vcpu_hotplug_event(struct xenbus_watch *, const char *); +static struct notifier_block xsn_cpu; + +/* xenbus watch struct */ +static struct xenbus_watch cpu_watch = { + .node = "cpu", + .callback = handle_vcpu_hotplug_event +}; + +/* NB: Assumes xenbus_lock is held! 
*/ +static int setup_cpu_watcher(struct notifier_block *notifier, + unsigned long event, void *data) +{ + int err = 0; + + BUG_ON(down_trylock(&xenbus_lock) == 0); + err = register_xenbus_watch(&cpu_watch); + + if (err) { + printk("Failed to register watch on /cpu\n"); + } + + return NOTIFY_DONE; +} + +static void handle_vcpu_hotplug_event(struct xenbus_watch *watch, const char *node) +{ + static DECLARE_WORK(vcpu_hotplug_work, __vcpu_hotplug_handler, NULL); + struct vcpu_hotplug_handler_t *handler = &vcpu_hotplug_handler; + ssize_t ret; + int err, cpu; + char state[8]; + char dir[32]; + char *cpustr; + + /* get a pointer to start of cpu string */ + if ((cpustr = strstr(node, "cpu/")) != NULL) { + + /* find which cpu state changed, note vcpu for handler */ + sscanf(cpustr, "cpu/%d", &cpu); + handler->vcpu = cpu; + + /* calc the dir for xenbus read */ + sprintf(dir, "cpu/%d", cpu); + + /* make sure watch that was triggered is changes to the correct key */ + if ((strcmp(node + strlen(dir), "/availability")) != 0) + return; + + /* get the state value */ + xenbus_transaction_start("cpu"); + err = xenbus_scanf(dir, "availability", "%s", state); + xenbus_transaction_end(0); + + if (err != 1) { + printk(KERN_ERR + "XENBUS: Unable to read cpu state\n"); + return; + } + + /* if we detect a state change, take action */ + if (strcmp(state, "online") == 0) { + /* offline -> online */ + if (!cpu_isset(cpu, cpu_online_map)) { + handler->fn = (void *)&cpu_up; + ret = schedule_work(&vcpu_hotplug_work); + } + } else if (strcmp(state, "offline") == 0) { + /* online -> offline */ + if (cpu_isset(cpu, cpu_online_map)) { + handler->fn = (void *)&cpu_down; + ret = schedule_work(&vcpu_hotplug_work); + } + } else { + printk(KERN_ERR + "XENBUS: unknown state(%s) on node(%s)\n", state, + node); + } + } + return; +} + +static int __init setup_vcpu_hotplug_event(void) +{ + xsn_cpu.notifier_call = setup_cpu_watcher; + + register_xenstore_notifier(&xsn_cpu); + + return 0; +} + 
+subsys_initcall(setup_vcpu_hotplug_event); /* must be called with the cpucontrol mutex held */ static int __devinit cpu_enable(unsigned int cpu) @@ -1398,77 +1507,6 @@ printk(KERN_ERR "CPU %u didn't die...\n", cpu); } -static int vcpu_hotplug_cpu_process(void *unused) -{ - struct vcpu_hotplug_handler_t *handler = &vcpu_hotplug_handler; - - if (handler->fn) { - (*(handler->fn))(handler->vcpu); - handler->fn = NULL; - } - return 0; -} - -static void __vcpu_hotplug_handler(void *unused) -{ - int err; - - err = kernel_thread(vcpu_hotplug_cpu_process, - NULL, CLONE_FS | CLONE_FILES); - if (err < 0) - printk(KERN_ALERT "Error creating hotplug_cpu process!\n"); - -} - -static void vcpu_hotplug_event_handler(ctrl_msg_t *msg, unsigned long id) -{ - static DECLARE_WORK(vcpu_hotplug_work, __vcpu_hotplug_handler, NULL); - vcpu_hotplug_t *req = (vcpu_hotplug_t *)&msg->msg[0]; - struct vcpu_hotplug_handler_t *handler = &vcpu_hotplug_handler; - ssize_t ret; - - if (msg->length != sizeof(vcpu_hotplug_t)) - goto parse_error; - - /* grab target vcpu from msg */ - handler->vcpu = req->vcpu; - - /* determine which function to call based on msg subtype */ - switch (msg->subtype) { - case CMSG_VCPU_HOTPLUG_OFF: - handler->fn = (void *)&cpu_down; - ret = schedule_work(&vcpu_hotplug_work); - req->status = (u32) ret; - break; - case CMSG_VCPU_HOTPLUG_ON: - handler->fn = (void *)&cpu_up; - ret = schedule_work(&vcpu_hotplug_work); - req->status = (u32) ret; - break; - default: - goto parse_error; - } - - ctrl_if_send_response(msg); - return; - parse_error: - msg->length = 0; - ctrl_if_send_response(msg); -} - -static int __init setup_vcpu_hotplug_event(void) -{ - struct vcpu_hotplug_handler_t *handler = &vcpu_hotplug_handler; - - handler->fn = NULL; - ctrl_if_register_receiver(CMSG_VCPU_HOTPLUG, - vcpu_hotplug_event_handler, 0); - - return 0; -} - -__initcall(setup_vcpu_hotplug_event); - #else /* ... 
!CONFIG_HOTPLUG_CPU */ int __cpu_disable(void) { @@ -1529,20 +1567,66 @@ extern irqreturn_t smp_reschedule_interrupt(int, void *, struct pt_regs *); extern irqreturn_t smp_call_function_interrupt(int, void *, struct pt_regs *); -void __init smp_intr_init(void) +void smp_intr_init(void) { int cpu = smp_processor_id(); per_cpu(resched_irq, cpu) = - bind_ipi_on_cpu_to_irq(RESCHEDULE_VECTOR); + bind_ipi_to_irq(RESCHEDULE_VECTOR); sprintf(resched_name[cpu], "resched%d", cpu); BUG_ON(request_irq(per_cpu(resched_irq, cpu), smp_reschedule_interrupt, SA_INTERRUPT, resched_name[cpu], NULL)); per_cpu(callfunc_irq, cpu) = - bind_ipi_on_cpu_to_irq(CALL_FUNCTION_VECTOR); + bind_ipi_to_irq(CALL_FUNCTION_VECTOR); sprintf(callfunc_name[cpu], "callfunc%d", cpu); BUG_ON(request_irq(per_cpu(callfunc_irq, cpu), smp_call_function_interrupt, SA_INTERRUPT, callfunc_name[cpu], NULL)); } + +static void smp_intr_exit(void) +{ + int cpu = smp_processor_id(); + + free_irq(per_cpu(resched_irq, cpu), NULL); + unbind_ipi_from_irq(RESCHEDULE_VECTOR); + + free_irq(per_cpu(callfunc_irq, cpu), NULL); + unbind_ipi_from_irq(CALL_FUNCTION_VECTOR); +} + +extern void local_setup_timer_irq(void); +extern void local_teardown_timer_irq(void); + +void smp_suspend(void) +{ + /* XXX todo: take down time and ipi's on all cpus */ + local_teardown_timer_irq(); + smp_intr_exit(); +} + +void smp_resume(void) +{ + /* XXX todo: restore time and ipi's on all cpus */ + smp_intr_init(); + local_setup_timer_irq(); +} + +DECLARE_PER_CPU(int, timer_irq); + +void _restore_vcpu(void) +{ + int cpu = smp_processor_id(); + extern atomic_t vcpus_rebooting; + + /* We are the first thing the vcpu runs when it comes back, + and we are supposed to restore the IPIs and timer + interrupts etc. When we return, the vcpu's idle loop will + start up again. 
*/ + _bind_virq_to_irq(VIRQ_TIMER, cpu, per_cpu(timer_irq, cpu)); + _bind_virq_to_irq(VIRQ_DEBUG, cpu, per_cpu(ldebug_irq, cpu)); + _bind_ipi_to_irq(RESCHEDULE_VECTOR, cpu, per_cpu(resched_irq, cpu) ); + _bind_ipi_to_irq(CALL_FUNCTION_VECTOR, cpu, per_cpu(callfunc_irq, cpu) ); + atomic_dec(&vcpus_rebooting); +} diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/i386/kernel/time.c --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/time.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/time.c Thu Aug 25 22:53:20 2005 @@ -70,6 +70,8 @@ #include "io_ports.h" +#include <asm-xen/evtchn.h> + extern spinlock_t i8259A_lock; int pit_latch_buggy; /* extern */ @@ -113,26 +115,15 @@ u32 version; }; static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); -static struct timeval shadow_tv; +static struct timespec shadow_tv; +static u32 shadow_tv_version; /* Keep track of last time we did processing/updating of jiffies and xtime. */ static u64 processed_system_time; /* System time (ns) at last processing. 
*/ static DEFINE_PER_CPU(u64, processed_system_time); -#define NS_PER_TICK (1000000000ULL/HZ) - -#define HANDLE_USEC_UNDERFLOW(_tv) do { \ - while ((_tv).tv_usec < 0) { \ - (_tv).tv_usec += USEC_PER_SEC; \ - (_tv).tv_sec--; \ - } \ -} while (0) -#define HANDLE_USEC_OVERFLOW(_tv) do { \ - while ((_tv).tv_usec >= USEC_PER_SEC) { \ - (_tv).tv_usec -= USEC_PER_SEC; \ - (_tv).tv_sec++; \ - } \ -} while (0) +#define NS_PER_TICK (1000000000L/HZ) + static inline void __normalize_time(time_t *sec, s64 *nsec) { while (*nsec >= NSEC_PER_SEC) { @@ -153,8 +144,6 @@ return 1; } __setup("independent_wallclock", __independent_wallclock); -#define INDEPENDENT_WALLCLOCK() \ - (independent_wallclock || (xen_start_info.flags & SIF_INITDOMAIN)) int tsc_disable __initdata = 0; @@ -175,25 +164,40 @@ .delay = delay_tsc, }; -static inline u32 down_shift(u64 time, int shift) -{ +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. + */ +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) +{ + u64 product; +#ifdef __i386__ + u32 tmp1, tmp2; +#endif + if ( shift < 0 ) - return (u32)(time >> -shift); - return (u32)((u32)time << shift); -} - -/* - * 32-bit multiplication of integer multiplicand and fractional multiplier - * yielding 32-bit integer product. 
- */ -static inline u32 mul_frac(u32 multiplicand, u32 multiplier) -{ - u32 product_int, product_frac; + delta >>= -shift; + else + delta <<= shift; + +#ifdef __i386__ __asm__ ( - "mul %3" - : "=a" (product_frac), "=d" (product_int) - : "0" (multiplicand), "r" (multiplier) ); - return product_int; + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "add %4,%%eax ; " + "xor %5,%5 ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); +#else + __asm__ ( + "mul %%rdx ; shrd $32,%%rdx,%%rax" + : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); +#endif + + return product; } void init_cpu_khz(void) @@ -201,55 +205,43 @@ u64 __cpu_khz = 1000000ULL << 32; struct vcpu_time_info *info = &HYPERVISOR_shared_info->vcpu_time[0]; do_div(__cpu_khz, info->tsc_to_system_mul); - cpu_khz = down_shift(__cpu_khz, -info->tsc_shift); - printk(KERN_INFO "Xen reported: %lu.%03lu MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); + if ( info->tsc_shift < 0 ) + cpu_khz = __cpu_khz << -info->tsc_shift; + else + cpu_khz = __cpu_khz >> info->tsc_shift; } static u64 get_nsec_offset(struct shadow_time_info *shadow) { - u64 now; - u32 delta; + u64 now, delta; rdtscll(now); - delta = down_shift(now - shadow->tsc_timestamp, shadow->tsc_shift); - return mul_frac(delta, shadow->tsc_to_nsec_mul); + delta = now - shadow->tsc_timestamp; + return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); } static unsigned long get_usec_offset(struct shadow_time_info *shadow) { - u64 now; - u32 delta; + u64 now, delta; rdtscll(now); - delta = down_shift(now - shadow->tsc_timestamp, shadow->tsc_shift); - return mul_frac(delta, shadow->tsc_to_usec_mul); -} - -static void update_wallclock(void) -{ - shared_info_t *s = HYPERVISOR_shared_info; + delta = now - shadow->tsc_timestamp; + return scale_delta(delta, shadow->tsc_to_usec_mul, shadow->tsc_shift); +} + +static void __update_wallclock(time_t sec, 
long nsec) +{ long wtm_nsec, xtime_nsec; time_t wtm_sec, xtime_sec; - u64 tmp, usec; - - shadow_tv.tv_sec = s->wc_sec; - shadow_tv.tv_usec = s->wc_usec; - - if (INDEPENDENT_WALLCLOCK()) - return; - - if ((time_status & STA_UNSYNC) != 0) - return; + u64 tmp, wc_nsec; /* Adjust wall-clock time base based on wall_jiffies ticks. */ - usec = processed_system_time; - do_div(usec, 1000); - usec += (u64)shadow_tv.tv_sec * 1000000ULL; - usec += (u64)shadow_tv.tv_usec; - usec -= (jiffies - wall_jiffies) * (USEC_PER_SEC / HZ); + wc_nsec = processed_system_time; + wc_nsec += (u64)sec * 1000000000ULL; + wc_nsec += (u64)nsec; + wc_nsec -= (jiffies - wall_jiffies) * (u64)(NSEC_PER_SEC / HZ); /* Split wallclock base into seconds and nanoseconds. */ - tmp = usec; - xtime_nsec = do_div(tmp, 1000000) * 1000ULL; + tmp = wc_nsec; + xtime_nsec = do_div(tmp, 1000000000); xtime_sec = (time_t)tmp; wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec); @@ -257,13 +249,35 @@ set_normalized_timespec(&xtime, xtime_sec, xtime_nsec); set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); + + time_adjust = 0; /* stop active adjtime() */ + time_status |= STA_UNSYNC; + time_maxerror = NTP_PHASE_LIMIT; + time_esterror = NTP_PHASE_LIMIT; +} + +static void update_wallclock(void) +{ + shared_info_t *s = HYPERVISOR_shared_info; + + do { + shadow_tv_version = s->wc_version; + rmb(); + shadow_tv.tv_sec = s->wc_sec; + shadow_tv.tv_nsec = s->wc_nsec; + rmb(); + } + while ((s->wc_version & 1) | (shadow_tv_version ^ s->wc_version)); + + if (!independent_wallclock) + __update_wallclock(shadow_tv.tv_sec, shadow_tv.tv_nsec); } /* * Reads a consistent set of time-base values from Xen, into a shadow data - * area. Must be called with the xtime_lock held for writing. + * area. 
*/ -static void __get_time_values_from_xen(void) +static void get_time_values_from_xen(void) { shared_info_t *s = HYPERVISOR_shared_info; struct vcpu_time_info *src; @@ -273,7 +287,7 @@ dst = &per_cpu(shadow_time, smp_processor_id()); do { - dst->version = src->time_version2; + dst->version = src->version; rmb(); dst->tsc_timestamp = src->tsc_timestamp; dst->system_timestamp = src->system_time; @@ -281,13 +295,9 @@ dst->tsc_shift = src->tsc_shift; rmb(); } - while (dst->version != src->time_version1); + while ((src->version & 1) | (dst->version ^ src->version)); dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000; - - if ((shadow_tv.tv_sec != s->wc_sec) || - (shadow_tv.tv_usec != s->wc_usec)) - update_wallclock(); } static inline int time_values_up_to_date(int cpu) @@ -298,7 +308,7 @@ src = &HYPERVISOR_shared_info->vcpu_time[cpu]; dst = &per_cpu(shadow_time, cpu); - return (dst->version == src->time_version2); + return (dst->version == src->version); } /* @@ -339,10 +349,10 @@ unsigned long seq; unsigned long usec, sec; unsigned long max_ntp_tick; - unsigned long flags; s64 nsec; unsigned int cpu; struct shadow_time_info *shadow; + u32 local_time_version; cpu = get_cpu(); shadow = &per_cpu(shadow_time, cpu); @@ -350,6 +360,7 @@ do { unsigned long lost; + local_time_version = shadow->version; seq = read_seqbegin(&xtime_lock); usec = get_usec_offset(shadow); @@ -385,12 +396,11 @@ * overflowed). Detect that and recalculate * with fresh values. 
*/ - write_seqlock_irqsave(&xtime_lock, flags); - __get_time_values_from_xen(); - write_sequnlock_irqrestore(&xtime_lock, flags); + get_time_values_from_xen(); continue; } - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&xtime_lock, seq) || + (local_time_version != shadow->version)); put_cpu(); @@ -407,18 +417,14 @@ int do_settimeofday(struct timespec *tv) { - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec; + time_t sec; s64 nsec; - struct timespec xentime; unsigned int cpu; struct shadow_time_info *shadow; + dom0_op_t op; if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - - if (!INDEPENDENT_WALLCLOCK()) - return 0; /* Silent failure? */ cpu = get_cpu(); shadow = &per_cpu(shadow_time, cpu); @@ -430,50 +436,30 @@ * overflows. If that were to happen then our shadow time values would * be stale, so we can retry with fresh ones. */ - again: - nsec = (s64)tv->tv_nsec - (s64)get_nsec_offset(shadow); - if (unlikely(!time_values_up_to_date(cpu))) { - __get_time_values_from_xen(); - goto again; - } - + for ( ; ; ) { + nsec = (s64)tv->tv_nsec - (s64)get_nsec_offset(shadow); + if (time_values_up_to_date(cpu)) + break; + get_time_values_from_xen(); + } + sec = tv->tv_sec; __normalize_time(&sec, &nsec); - set_normalized_timespec(&xentime, sec, nsec); - - /* - * This is revolting. We need to set "xtime" correctly. However, the - * value in this location is the value at the most recent update of - * wall time. Discover what correction gettimeofday() would have - * made, and then undo it! 
- */ - nsec -= (jiffies - wall_jiffies) * TICK_NSEC; - - nsec -= (shadow->system_timestamp - processed_system_time); - - __normalize_time(&sec, &nsec); - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - time_adjust = 0; /* stop active adjtime() */ - time_status |= STA_UNSYNC; - time_maxerror = NTP_PHASE_LIMIT; - time_esterror = NTP_PHASE_LIMIT; - -#ifdef CONFIG_XEN_PRIVILEGED_GUEST - if (xen_start_info.flags & SIF_INITDOMAIN) { - dom0_op_t op; + + if ((xen_start_info.flags & SIF_INITDOMAIN) && + !independent_wallclock) { op.cmd = DOM0_SETTIME; - op.u.settime.secs = xentime.tv_sec; - op.u.settime.usecs = xentime.tv_nsec / NSEC_PER_USEC; + op.u.settime.secs = sec; + op.u.settime.nsecs = nsec; op.u.settime.system_time = shadow->system_timestamp; - write_sequnlock_irq(&xtime_lock); HYPERVISOR_dom0_op(&op); - } else -#endif - write_sequnlock_irq(&xtime_lock); + update_wallclock(); + } else if (independent_wallclock) { + nsec -= shadow->system_timestamp; + __normalize_time(&sec, &nsec); + __update_wallclock(sec, nsec); + } + + write_sequnlock_irq(&xtime_lock); put_cpu(); @@ -489,6 +475,9 @@ int retval; WARN_ON(irqs_disabled()); + + if (!(xen_start_info.flags & SIF_INITDOMAIN)) + return 0; /* gets recalled with irq locally disabled */ spin_lock_irq(&rtc_lock); @@ -515,21 +504,21 @@ { int cpu = get_cpu(); struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu); - s64 off; - unsigned long flags; - - for ( ; ; ) { - off = get_nsec_offset(shadow); - if (time_values_up_to_date(cpu)) - break; - write_seqlock_irqsave(&xtime_lock, flags); - __get_time_values_from_xen(); - write_sequnlock_irqrestore(&xtime_lock, flags); - } + u64 time; + u32 local_time_version; + + do { + local_time_version = shadow->version; + smp_rmb(); + time = shadow->system_timestamp + get_nsec_offset(shadow); + if 
(!time_values_up_to_date(cpu)) + get_time_values_from_xen(); + smp_rmb(); + } while (local_time_version != shadow->version); put_cpu(); - return shadow->system_timestamp + off; + return time; } EXPORT_SYMBOL(monotonic_clock); @@ -551,19 +540,16 @@ EXPORT_SYMBOL(profile_pc); #endif -/* - * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick - */ -static inline void do_timer_interrupt(int irq, void *dev_id, - struct pt_regs *regs) +irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { s64 delta, delta_cpu; int cpu = smp_processor_id(); struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu); + write_seqlock(&xtime_lock); + do { - __get_time_values_from_xen(); + get_time_values_from_xen(); delta = delta_cpu = shadow->system_timestamp + get_nsec_offset(shadow); @@ -572,7 +558,7 @@ } while (!time_values_up_to_date(cpu)); - if (unlikely(delta < 0) || unlikely(delta_cpu < 0)) { + if (unlikely(delta < (s64)-1000000) || unlikely(delta_cpu < 0)) { printk("Timer ISR/%d: Time went backwards: " "delta=%lld cpu_delta=%lld shadow=%lld " "off=%lld processed=%lld cpu_processed=%lld\n", @@ -583,7 +569,6 @@ for (cpu = 0; cpu < num_online_cpus(); cpu++) printk(" %d: %lld\n", cpu, per_cpu(processed_system_time, cpu)); - return; } /* System-wide jiffy work. */ @@ -593,32 +578,25 @@ do_timer(regs); } - /* Local CPU jiffy work. */ + if (shadow_tv_version != HYPERVISOR_shared_info->wc_version) { + update_wallclock(); + clock_was_set(); + } + + write_sequnlock(&xtime_lock); + + /* + * Local CPU jiffy work. No need to hold xtime_lock, and I'm not sure + * if there is risk of deadlock if we do (since update_process_times + * may do scheduler rebalancing work and thus acquire runqueue locks). 
+ */ while (delta_cpu >= NS_PER_TICK) { delta_cpu -= NS_PER_TICK; per_cpu(processed_system_time, cpu) += NS_PER_TICK; update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING, regs); } -} - -/* - * This is the same as the above, except we _also_ save the current - * Time Stamp Counter value at the time of the timer interrupt, so that - * we later on can estimate the time of day more exactly. - */ -irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) -{ - /* - * Here we are in the timer irq handler. We just have irqs locally - * disabled but we don't know if the timer_bh is running on the other - * CPU. We need to avoid to SMP race with it. NOTE: we don' t need - * the irq version of write_lock because as just said we have irq - * locally disabled. -arca - */ - write_seqlock(&xtime_lock); - do_timer_interrupt(irq, NULL, regs); - write_sequnlock(&xtime_lock); + return IRQ_HANDLED; } @@ -767,7 +745,7 @@ #endif /* Dynamically-mapped IRQ. */ -static DEFINE_PER_CPU(int, timer_irq); +DEFINE_PER_CPU(int, timer_irq); static struct irqaction irq_timer = { timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer0", @@ -786,15 +764,16 @@ return; } #endif - __get_time_values_from_xen(); - xtime.tv_sec = shadow_tv.tv_sec; - xtime.tv_nsec = shadow_tv.tv_usec * NSEC_PER_USEC; - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); + get_time_values_from_xen(); + processed_system_time = per_cpu(shadow_time, 0).system_timestamp; per_cpu(processed_system_time, 0) = processed_system_time; + update_wallclock(); + init_cpu_khz(); + printk(KERN_INFO "Xen reported: %lu.%03lu MHz processor.\n", + cpu_khz / 1000, cpu_khz % 1000); #if defined(__x86_64__) vxtime.mode = VXTIME_TSC; @@ -860,6 +839,8 @@ void time_suspend(void) { /* nothing */ + teardown_irq(per_cpu(timer_irq, 0), &irq_timer); + unbind_virq_from_irq(VIRQ_TIMER); } /* No locking required. We are only CPU running, and interrupts are off. 
*/ @@ -867,17 +848,31 @@ { init_cpu_khz(); - /* Get timebases for new environment. */ - __get_time_values_from_xen(); - - /* Reset our own concept of passage of system time. */ - processed_system_time = - per_cpu(shadow_time, smp_processor_id()).system_timestamp; + get_time_values_from_xen(); + + processed_system_time = per_cpu(shadow_time, 0).system_timestamp; per_cpu(processed_system_time, 0) = processed_system_time; + + update_wallclock(); + + per_cpu(timer_irq, 0) = bind_virq_to_irq(VIRQ_TIMER); + (void)setup_irq(per_cpu(timer_irq, 0), &irq_timer); } #ifdef CONFIG_SMP static char timer_name[NR_CPUS][15]; +void local_setup_timer_irq(void) +{ + int cpu = smp_processor_id(); + + if (cpu == 0) + return; + per_cpu(timer_irq, cpu) = bind_virq_to_irq(VIRQ_TIMER); + sprintf(timer_name[cpu], "timer%d", cpu); + BUG_ON(request_irq(per_cpu(timer_irq, cpu), timer_interrupt, + SA_INTERRUPT, timer_name[cpu], NULL)); +} + void local_setup_timer(void) { int seq, cpu = smp_processor_id(); @@ -888,10 +883,17 @@ per_cpu(shadow_time, cpu).system_timestamp; } while (read_seqretry(&xtime_lock, seq)); - per_cpu(timer_irq, cpu) = bind_virq_to_irq(VIRQ_TIMER); - sprintf(timer_name[cpu], "timer%d", cpu); - BUG_ON(request_irq(per_cpu(timer_irq, cpu), timer_interrupt, - SA_INTERRUPT, timer_name[cpu], NULL)); + local_setup_timer_irq(); +} + +void local_teardown_timer_irq(void) +{ + int cpu = smp_processor_id(); + + if (cpu == 0) + return; + free_irq(per_cpu(timer_irq, cpu), NULL); + unbind_virq_from_irq(VIRQ_TIMER); } #endif diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/i386/kernel/traps.c --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/traps.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/traps.c Thu Aug 25 22:53:20 2005 @@ -871,6 +871,7 @@ } } +#ifndef CONFIG_XEN fastcall void setup_x86_bogus_stack(unsigned char * stk) { unsigned long *switch16_ptr, *switch32_ptr; @@ -915,6 +916,7 @@ memcpy(stack32, stack16, len); return stack32; } +#endif 
/* * 'math_state_restore()' saves the current math information in the diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/i386/mm/fault.c --- a/linux-2.6-xen-sparse/arch/xen/i386/mm/fault.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/i386/mm/fault.c Thu Aug 25 22:53:20 2005 @@ -281,7 +281,7 @@ siginfo_t info; /* Set the "privileged fault" bit to something sane. */ - error_code &= 3; + error_code &= ~4; error_code |= (regs->xcs & 2) << 1; if (regs->eflags & X86_EFLAGS_VM) error_code |= 4; diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/i386/mm/highmem.c --- a/linux-2.6-xen-sparse/arch/xen/i386/mm/highmem.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/i386/mm/highmem.c Thu Aug 25 22:53:20 2005 @@ -41,8 +41,7 @@ if (!pte_none(*(kmap_pte-idx))) BUG(); #endif - set_pte(kmap_pte-idx, mk_pte(page, prot)); - __flush_tlb_one(vaddr); + set_pte_at_sync(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot)); return (void*) vaddr; } diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/i386/mm/hypervisor.c --- a/linux-2.6-xen-sparse/arch/xen/i386/mm/hypervisor.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/i386/mm/hypervisor.c Thu Aug 25 22:53:20 2005 @@ -35,6 +35,7 @@ #include <asm/pgtable.h> #include <asm-xen/hypervisor.h> #include <asm-xen/balloon.h> +#include <linux/module.h> #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) #include <linux/percpu.h> #include <asm/tlbflush.h> @@ -58,124 +59,124 @@ #ifndef CONFIG_XEN_SHADOW_MODE void xen_l1_entry_update(pte_t *ptr, pte_t val) { - mmu_update_t u; - u.ptr = virt_to_machine(ptr); - u.val = pte_val_ma(val); - BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); + mmu_update_t u; + u.ptr = virt_to_machine(ptr); + u.val = pte_val_ma(val); + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); } void xen_l2_entry_update(pmd_t *ptr, pmd_t val) { - mmu_update_t u; - u.ptr = virt_to_machine(ptr); - u.val = pmd_val_ma(val); - 
BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); + mmu_update_t u; + u.ptr = virt_to_machine(ptr); + u.val = pmd_val_ma(val); + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); } #ifdef CONFIG_X86_PAE void xen_l3_entry_update(pud_t *ptr, pud_t val) { - mmu_update_t u; - u.ptr = virt_to_machine(ptr); - u.val = pud_val_ma(val); - BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); + mmu_update_t u; + u.ptr = virt_to_machine(ptr); + u.val = pud_val_ma(val); + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); } #endif #ifdef CONFIG_X86_64 void xen_l3_entry_update(pud_t *ptr, pud_t val) { - mmu_update_t u; - u.ptr = virt_to_machine(ptr); - u.val = val.pud; - BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); + mmu_update_t u; + u.ptr = virt_to_machine(ptr); + u.val = val.pud; + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); } void xen_l4_entry_update(pgd_t *ptr, pgd_t val) { - mmu_update_t u; - u.ptr = virt_to_machine(ptr); - u.val = val.pgd; - BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); + mmu_update_t u; + u.ptr = virt_to_machine(ptr); + u.val = val.pgd; + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); } #endif /* CONFIG_X86_64 */ #endif /* CONFIG_XEN_SHADOW_MODE */ void xen_machphys_update(unsigned long mfn, unsigned long pfn) { - mmu_update_t u; - u.ptr = (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; - u.val = pfn; - BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); + mmu_update_t u; + u.ptr = (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; + u.val = pfn; + BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0); } void xen_pt_switch(unsigned long ptr) { - struct mmuext_op op; - op.cmd = MMUEXT_NEW_BASEPTR; - op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); + struct mmuext_op op; + op.cmd = MMUEXT_NEW_BASEPTR; + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); } void 
xen_new_user_pt(unsigned long ptr) { - struct mmuext_op op; - op.cmd = MMUEXT_NEW_USER_BASEPTR; - op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); + struct mmuext_op op; + op.cmd = MMUEXT_NEW_USER_BASEPTR; + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); } void xen_tlb_flush(void) { - struct mmuext_op op; - op.cmd = MMUEXT_TLB_FLUSH_LOCAL; - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); + struct mmuext_op op; + op.cmd = MMUEXT_TLB_FLUSH_LOCAL; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); } void xen_invlpg(unsigned long ptr) { - struct mmuext_op op; - op.cmd = MMUEXT_INVLPG_LOCAL; - op.linear_addr = ptr & PAGE_MASK; - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); + struct mmuext_op op; + op.cmd = MMUEXT_INVLPG_LOCAL; + op.linear_addr = ptr & PAGE_MASK; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); } #ifdef CONFIG_SMP void xen_tlb_flush_all(void) { - struct mmuext_op op; - op.cmd = MMUEXT_TLB_FLUSH_ALL; - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); + struct mmuext_op op; + op.cmd = MMUEXT_TLB_FLUSH_ALL; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); } void xen_tlb_flush_mask(cpumask_t *mask) { - struct mmuext_op op; - if ( cpus_empty(*mask) ) - return; - op.cmd = MMUEXT_TLB_FLUSH_MULTI; - op.vcpumask = mask->bits; - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); + struct mmuext_op op; + if ( cpus_empty(*mask) ) + return; + op.cmd = MMUEXT_TLB_FLUSH_MULTI; + op.vcpumask = mask->bits; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); } void xen_invlpg_all(unsigned long ptr) { - struct mmuext_op op; - op.cmd = MMUEXT_INVLPG_ALL; - op.linear_addr = ptr & PAGE_MASK; - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); + struct mmuext_op op; + op.cmd = MMUEXT_INVLPG_ALL; + op.linear_addr = ptr & PAGE_MASK; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, 
NULL, DOMID_SELF) < 0); } void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr) { - struct mmuext_op op; - if ( cpus_empty(*mask) ) - return; - op.cmd = MMUEXT_INVLPG_MULTI; - op.vcpumask = mask->bits; - op.linear_addr = ptr & PAGE_MASK; - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); + struct mmuext_op op; + if ( cpus_empty(*mask) ) + return; + op.cmd = MMUEXT_INVLPG_MULTI; + op.vcpumask = mask->bits; + op.linear_addr = ptr & PAGE_MASK; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); } #endif /* CONFIG_SMP */ @@ -183,181 +184,233 @@ #ifndef CONFIG_XEN_SHADOW_MODE void xen_pgd_pin(unsigned long ptr) { - struct mmuext_op op; + struct mmuext_op op; #ifdef CONFIG_X86_64 - op.cmd = MMUEXT_PIN_L4_TABLE; + op.cmd = MMUEXT_PIN_L4_TABLE; #elif defined(CONFIG_X86_PAE) - op.cmd = MMUEXT_PIN_L3_TABLE; + op.cmd = MMUEXT_PIN_L3_TABLE; #else - op.cmd = MMUEXT_PIN_L2_TABLE; + op.cmd = MMUEXT_PIN_L2_TABLE; #endif - op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); } void xen_pgd_unpin(unsigned long ptr) { - struct mmuext_op op; - op.cmd = MMUEXT_UNPIN_TABLE; - op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); + struct mmuext_op op; + op.cmd = MMUEXT_UNPIN_TABLE; + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); } void xen_pte_pin(unsigned long ptr) { - struct mmuext_op op; - op.cmd = MMUEXT_PIN_L1_TABLE; - op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); + struct mmuext_op op; + op.cmd = MMUEXT_PIN_L1_TABLE; + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); } void xen_pte_unpin(unsigned long ptr) { - struct mmuext_op op; - op.cmd = MMUEXT_UNPIN_TABLE; - op.mfn = pfn_to_mfn(ptr >> 
PAGE_SHIFT); - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); + struct mmuext_op op; + op.cmd = MMUEXT_UNPIN_TABLE; + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); } #ifdef CONFIG_X86_64 void xen_pud_pin(unsigned long ptr) { - struct mmuext_op op; - op.cmd = MMUEXT_PIN_L3_TABLE; - op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); + struct mmuext_op op; + op.cmd = MMUEXT_PIN_L3_TABLE; + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); } void xen_pud_unpin(unsigned long ptr) { - struct mmuext_op op; - op.cmd = MMUEXT_UNPIN_TABLE; - op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); + struct mmuext_op op; + op.cmd = MMUEXT_UNPIN_TABLE; + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); } void xen_pmd_pin(unsigned long ptr) { - struct mmuext_op op; - op.cmd = MMUEXT_PIN_L2_TABLE; - op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); + struct mmuext_op op; + op.cmd = MMUEXT_PIN_L2_TABLE; + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); } void xen_pmd_unpin(unsigned long ptr) { - struct mmuext_op op; - op.cmd = MMUEXT_UNPIN_TABLE; - op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); + struct mmuext_op op; + op.cmd = MMUEXT_UNPIN_TABLE; + op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT); + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); } #endif /* CONFIG_X86_64 */ #endif /* CONFIG_XEN_SHADOW_MODE */ void xen_set_ldt(unsigned long ptr, unsigned long len) { - struct mmuext_op op; - op.cmd = MMUEXT_SET_LDT; - op.linear_addr = ptr; - op.nr_ents = len; - BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); -} - -void 
xen_contig_memory(unsigned long vstart, unsigned int order) -{ - /* - * Ensure multi-page extents are contiguous in machine memory. This code - * could be cleaned up some, and the number of hypercalls reduced. - */ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - unsigned long mfn, i, flags; - - scrub_pages(vstart, 1 << order); - - balloon_lock(flags); - - /* 1. Zap current PTEs, giving away the underlying pages. */ - for (i = 0; i < (1<<order); i++) { - pgd = pgd_offset_k(vstart + (i*PAGE_SIZE)); - pud = pud_offset(pgd, (vstart + (i*PAGE_SIZE))); - pmd = pmd_offset(pud, (vstart + (i*PAGE_SIZE))); - pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE))); - mfn = pte_mfn(*pte); - HYPERVISOR_update_va_mapping( - vstart + (i*PAGE_SIZE), __pte_ma(0), 0); - phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] = - INVALID_P2M_ENTRY; - BUG_ON(HYPERVISOR_dom_mem_op( - MEMOP_decrease_reservation, &mfn, 1, 0) != 1); - } - - /* 2. Get a new contiguous memory extent. */ - BUG_ON(HYPERVISOR_dom_mem_op( - MEMOP_increase_reservation, &mfn, 1, order | (32<<8)) != 1); - - /* 3. Map the new extent in place of old pages. 
*/ - for (i = 0; i < (1<<order); i++) { - HYPERVISOR_update_va_mapping( - vstart + (i*PAGE_SIZE), - __pte_ma(((mfn+i)<<PAGE_SHIFT)|__PAGE_KERNEL), 0); - xen_machphys_update(mfn+i, (__pa(vstart)>>PAGE_SHIFT)+i); - phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] = mfn+i; - } - - flush_tlb_all(); - - balloon_unlock(flags); -} - -#ifdef CONFIG_XEN_PHYSDEV_ACCESS - -unsigned long allocate_empty_lowmem_region(unsigned long pages) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - unsigned long *pfn_array; - unsigned long vstart; - unsigned long i; - unsigned int order = get_order(pages*PAGE_SIZE); - - vstart = __get_free_pages(GFP_KERNEL, order); - if ( vstart == 0 ) - return 0UL; - - scrub_pages(vstart, 1 << order); - - pfn_array = vmalloc((1<<order) * sizeof(*pfn_array)); - if ( pfn_array == NULL ) - BUG(); - - for ( i = 0; i < (1<<order); i++ ) - { - pgd = pgd_offset_k( (vstart + (i*PAGE_SIZE))); - pud = pud_offset(pgd, (vstart + (i*PAGE_SIZE))); - pmd = pmd_offset(pud, (vstart + (i*PAGE_SIZE))); - pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE))); - pfn_array[i] = pte_mfn(*pte); -#ifdef CONFIG_X86_64 - xen_l1_entry_update(pte, __pte(0)); -#else - HYPERVISOR_update_va_mapping(vstart + (i*PAGE_SIZE), __pte_ma(0), 0); -#endif - phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] = - INVALID_P2M_ENTRY; - } - - flush_tlb_all(); - - balloon_put_pages(pfn_array, 1 << order); - - vfree(pfn_array); - - return vstart; -} - -#endif /* CONFIG_XEN_PHYSDEV_ACCESS */ + struct mmuext_op op; + op.cmd = MMUEXT_SET_LDT; + op.linear_addr = ptr; + op.nr_ents = len; + BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0); +} + +/* + * Bitmap is indexed by page number. If bit is set, the page is part of a + * xen_create_contiguous_region() area of memory. 
+ */ +unsigned long *contiguous_bitmap; + +static void contiguous_bitmap_set( + unsigned long first_page, unsigned long nr_pages) +{ + unsigned long start_off, end_off, curr_idx, end_idx; + + curr_idx = first_page / BITS_PER_LONG; + start_off = first_page & (BITS_PER_LONG-1); + end_idx = (first_page + nr_pages) / BITS_PER_LONG; + end_off = (first_page + nr_pages) & (BITS_PER_LONG-1); + + if (curr_idx == end_idx) { + contiguous_bitmap[curr_idx] |= + ((1UL<<end_off)-1) & -(1UL<<start_off); + } else { + contiguous_bitmap[curr_idx] |= -(1UL<<start_off); + while ( ++curr_idx < end_idx ) + contiguous_bitmap[curr_idx] = ~0UL; + contiguous_bitmap[curr_idx] |= (1UL<<end_off)-1; + } +} + +static void contiguous_bitmap_clear( + unsigned long first_page, unsigned long nr_pages) +{ + unsigned long start_off, end_off, curr_idx, end_idx; + + curr_idx = first_page / BITS_PER_LONG; + start_off = first_page & (BITS_PER_LONG-1); + end_idx = (first_page + nr_pages) / BITS_PER_LONG; + end_off = (first_page + nr_pages) & (BITS_PER_LONG-1); + + if (curr_idx == end_idx) { + contiguous_bitmap[curr_idx] &= + -(1UL<<end_off) | ((1UL<<start_off)-1); + } else { + contiguous_bitmap[curr_idx] &= (1UL<<start_off)-1; + while ( ++curr_idx != end_idx ) + contiguous_bitmap[curr_idx] = 0; + contiguous_bitmap[curr_idx] &= -(1UL<<end_off); + } +} + +/* Ensure multi-page extents are contiguous in machine memory. */ +void xen_create_contiguous_region(unsigned long vstart, unsigned int order) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned long mfn, i, flags; + + scrub_pages(vstart, 1 << order); + + balloon_lock(flags); + + /* 1. Zap current PTEs, giving away the underlying pages. 
*/ + for (i = 0; i < (1<<order); i++) { + pgd = pgd_offset_k(vstart + (i*PAGE_SIZE)); + pud = pud_offset(pgd, (vstart + (i*PAGE_SIZE))); + pmd = pmd_offset(pud, (vstart + (i*PAGE_SIZE))); + pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE))); + mfn = pte_mfn(*pte); + BUG_ON(HYPERVISOR_update_va_mapping( + vstart + (i*PAGE_SIZE), __pte_ma(0), 0)); + phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] = + INVALID_P2M_ENTRY; + BUG_ON(HYPERVISOR_dom_mem_op( + MEMOP_decrease_reservation, &mfn, 1, 0) != 1); + } + + /* 2. Get a new contiguous memory extent. */ + BUG_ON(HYPERVISOR_dom_mem_op( + MEMOP_increase_reservation, &mfn, 1, order | (32<<8)) != 1); + + /* 3. Map the new extent in place of old pages. */ + for (i = 0; i < (1<<order); i++) { + BUG_ON(HYPERVISOR_update_va_mapping( + vstart + (i*PAGE_SIZE), + pfn_pte_ma(mfn+i, PAGE_KERNEL), 0)); + xen_machphys_update(mfn+i, (__pa(vstart)>>PAGE_SHIFT)+i); + phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] = mfn+i; + } + + flush_tlb_all(); + + contiguous_bitmap_set(__pa(vstart) >> PAGE_SHIFT, 1UL << order); + + balloon_unlock(flags); +} + +void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned long mfn, i, flags; + + scrub_pages(vstart, 1 << order); + + balloon_lock(flags); + + contiguous_bitmap_clear(__pa(vstart) >> PAGE_SHIFT, 1UL << order); + + /* 1. Zap current PTEs, giving away the underlying pages. */ + for (i = 0; i < (1<<order); i++) { + pgd = pgd_offset_k(vstart + (i*PAGE_SIZE)); + pud = pud_offset(pgd, (vstart + (i*PAGE_SIZE))); + pmd = pmd_offset(pud, (vstart + (i*PAGE_SIZE))); + pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE))); + mfn = pte_mfn(*pte); + BUG_ON(HYPERVISOR_update_va_mapping( + vstart + (i*PAGE_SIZE), __pte_ma(0), 0)); + phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] = + INVALID_P2M_ENTRY; + BUG_ON(HYPERVISOR_dom_mem_op( + MEMOP_decrease_reservation, &mfn, 1, 0) != 1); + } + + /* 2. 
Map new pages in place of old pages. */ + for (i = 0; i < (1<<order); i++) { + BUG_ON(HYPERVISOR_dom_mem_op( + MEMOP_increase_reservation, &mfn, 1, 0) != 1); + BUG_ON(HYPERVISOR_update_va_mapping( + vstart + (i*PAGE_SIZE), + pfn_pte_ma(mfn, PAGE_KERNEL), 0)); + xen_machphys_update(mfn, (__pa(vstart)>>PAGE_SHIFT)+i); + phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] = mfn; + } + + flush_tlb_all(); + + balloon_unlock(flags); +} + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/i386/mm/init.c --- a/linux-2.6-xen-sparse/arch/xen/i386/mm/init.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/i386/mm/init.c Thu Aug 25 22:53:20 2005 @@ -41,6 +41,14 @@ #include <asm/sections.h> #include <asm-xen/hypervisor.h> +extern unsigned long *contiguous_bitmap; + +#if defined(CONFIG_SWIOTLB) +extern void swiotlb_init(void); +int swiotlb; +EXPORT_SYMBOL(swiotlb); +#endif + unsigned int __VMALLOC_RESERVE = 128 << 20; DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); @@ -334,18 +342,18 @@ extern void __init remap_numa_kva(void); #endif +pgd_t *swapper_pg_dir; + static void __init pagetable_init (void) { unsigned long vaddr; - pgd_t *pgd_base = swapper_pg_dir; - pgd_t *old_pgd = (pgd_t *)xen_start_info.pt_base; - -#ifdef CONFIG_X86_PAE + pgd_t *pgd_base = (pgd_t *)xen_start_info.pt_base; int i; - /* Init entries of the first-level page table to the zero page */ - for (i = 0; i < PTRS_PER_PGD; i++) - set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); -#endif + + swapper_pg_dir = pgd_base; + init_mm.pgd = pgd_base; + for (i = 0; i < NR_CPUS; i++) + per_cpu(cur_pgd, i) = pgd_base; /* Enable PSE if available */ if (cpu_has_pse) { @@ -358,44 +366,6 @@ __PAGE_KERNEL |= _PAGE_GLOBAL; __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL; } - - /* - * Switch to proper mm_init page directory. 
Initialise from the current - * page directory, write-protect the new page directory, then switch to - * it. We clean up by write-enabling and then freeing the old page dir. - */ -#ifndef CONFIG_X86_PAE - memcpy(pgd_base, old_pgd, PTRS_PER_PGD_NO_HV*sizeof(pgd_t)); - make_page_readonly(pgd_base); - xen_pgd_pin(__pa(pgd_base)); - load_cr3(pgd_base); - xen_pgd_unpin(__pa(old_pgd)); - make_page_writable(old_pgd); - __flush_tlb_all(); - free_bootmem(__pa(old_pgd), PAGE_SIZE); -#else - { - pud_t *old_pud = pud_offset(old_pgd+3, PAGE_OFFSET); - pmd_t *old_pmd = pmd_offset(old_pud, PAGE_OFFSET); - pmd_t *new_pmd = alloc_bootmem_low_pages(PAGE_SIZE); - - memcpy(new_pmd, old_pmd, PAGE_SIZE); - memcpy(pgd_base, old_pgd, PTRS_PER_PGD_NO_HV*sizeof(pgd_t)); - set_pgd(&pgd_base[3], __pgd(__pa(new_pmd) | _PAGE_PRESENT)); - - make_page_readonly(new_pmd); - make_page_readonly(pgd_base); - xen_pgd_pin(__pa(pgd_base)); - load_cr3(pgd_base); - xen_pgd_unpin(__pa(old_pgd)); - make_page_writable(old_pgd); - make_page_writable(old_pmd); - __flush_tlb_all(); - - free_bootmem(__pa(old_pgd), PAGE_SIZE); - free_bootmem(__pa(old_pmd), PAGE_SIZE); - } -#endif init_mm.context.pinned = 1; kernel_physical_mapping_init(pgd_base); @@ -409,17 +379,6 @@ page_table_range_init(vaddr, 0, pgd_base); permanent_kmaps_init(pgd_base); - -#if 0 /* def CONFIG_X86_PAE */ - /* - * Add low memory identity-mappings - SMP needs it when - * starting up on an AP from real-mode. In the non-PAE - * case we already have these mappings through head.S. - * All user-space mappings are explicitly cleared after - * SMP startup. 
- */ - set_pgd(&pgd_base[0], pgd_base[USER_PTRS_PER_PGD]); -#endif } #if defined(CONFIG_PM_DISK) || defined(CONFIG_SOFTWARE_SUSPEND) @@ -630,6 +589,15 @@ int tmp; int bad_ppro; unsigned long pfn; + + contiguous_bitmap = alloc_bootmem_low_pages( + (max_low_pfn + 2*BITS_PER_LONG) >> 3); + BUG_ON(!contiguous_bitmap); + memset(contiguous_bitmap, 0, (max_low_pfn + 2*BITS_PER_LONG) >> 3); + +#if defined(CONFIG_SWIOTLB) + swiotlb_init(); +#endif #ifndef CONFIG_DISCONTIGMEM if (!mem_map) diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/i386/mm/ioremap.c --- a/linux-2.6-xen-sparse/arch/xen/i386/mm/ioremap.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/i386/mm/ioremap.c Thu Aug 25 22:53:20 2005 @@ -36,6 +36,8 @@ { } +#ifdef __i386__ + void __init *bt_ioremap(unsigned long phys_addr, unsigned long size) { return NULL; @@ -44,6 +46,8 @@ void __init bt_iounmap(void *addr, unsigned long size) { } + +#endif /* __i386__ */ #else @@ -58,7 +62,7 @@ extern unsigned long max_low_pfn; unsigned long mfn = address >> PAGE_SHIFT; unsigned long pfn = mfn_to_pfn(mfn); - return ((pfn < max_low_pfn) && (pfn_to_mfn(pfn) == mfn)); + return ((pfn < max_low_pfn) && (phys_to_machine_mapping[pfn] == mfn)); } /* @@ -126,10 +130,12 @@ return NULL; area->phys_addr = phys_addr; addr = (void __iomem *) area->addr; + flags |= _PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED; +#ifdef __x86_64__ + flags |= _PAGE_USER; +#endif if (direct_remap_area_pages(&init_mm, (unsigned long) addr, phys_addr, - size, __pgprot(_PAGE_PRESENT | _PAGE_RW | - _PAGE_DIRTY | _PAGE_ACCESSED - | flags), domid)) { + size, __pgprot(flags), domid)) { vunmap((void __force *) addr); return NULL; } @@ -218,6 +224,8 @@ kfree(p); } +#ifdef __i386__ + void __init *bt_ioremap(unsigned long phys_addr, unsigned long size) { unsigned long offset, last_addr; @@ -289,6 +297,8 @@ } } +#endif /* __i386__ */ + #endif /* CONFIG_XEN_PHYSDEV_ACCESS */ /* These hacky macros avoid phys->machine translations. 
*/ @@ -298,90 +308,20 @@ #define direct_mk_pte_phys(physpage, pgprot) \ __direct_mk_pte((physpage) >> PAGE_SHIFT, pgprot) -static inline void direct_remap_area_pte(pte_t *pte, - unsigned long address, - unsigned long size, - mmu_update_t **v) -{ - unsigned long end; - - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; - if (address >= end) - BUG(); - - do { - (*v)->ptr = virt_to_machine(pte); - (*v)++; - address += PAGE_SIZE; - pte++; - } while (address && (address < end)); -} - -static inline int direct_remap_area_pmd(struct mm_struct *mm, - pmd_t *pmd, - unsigned long address, - unsigned long size, - mmu_update_t **v) -{ - unsigned long end; - - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - if (address >= end) - BUG(); - do { - pte_t *pte = (mm == &init_mm) ? - pte_alloc_kernel(mm, pmd, address) : - pte_alloc_map(mm, pmd, address); - if (!pte) - return -ENOMEM; - direct_remap_area_pte(pte, address, end - address, v); - pte_unmap(pte); - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); + +static int direct_remap_area_pte_fn(pte_t *pte, + struct page *pte_page, + unsigned long address, + void *data) +{ + mmu_update_t **v = (mmu_update_t **)data; + + (*v)->ptr = ((maddr_t)pfn_to_mfn(page_to_pfn(pte_page)) << + PAGE_SHIFT) | ((unsigned long)pte & ~PAGE_MASK); + (*v)++; + return 0; } - -int __direct_remap_area_pages(struct mm_struct *mm, - unsigned long address, - unsigned long size, - mmu_update_t *v) -{ - pgd_t * dir; - unsigned long end = address + size; - int error; - - dir = pgd_offset(mm, address); - if (address >= end) - BUG(); - spin_lock(&mm->page_table_lock); - do { - pud_t *pud; - pmd_t *pmd; - - error = -ENOMEM; - pud = pud_alloc(mm, dir, address); - if (!pud) - break; - pmd = pmd_alloc(mm, pud, address); - if (!pmd) - break; - error = 0; - direct_remap_area_pmd(mm, pmd, address, end - address, &v); - address = (address + 
PGDIR_SIZE) & PGDIR_MASK; - dir++; - - } while (address && (address < end)); - spin_unlock(&mm->page_table_lock); - return error; -} - int direct_remap_area_pages(struct mm_struct *mm, unsigned long address, @@ -393,7 +333,7 @@ int i; unsigned long start_address; #define MAX_DIRECTMAP_MMU_QUEUE 130 - mmu_update_t u[MAX_DIRECTMAP_MMU_QUEUE], *v = u; + mmu_update_t u[MAX_DIRECTMAP_MMU_QUEUE], *v = u, *w = u; start_address = address; @@ -402,11 +342,10 @@ for (i = 0; i < size; i += PAGE_SIZE) { if ((v - u) == MAX_DIRECTMAP_MMU_QUEUE) { /* Fill in the PTE pointers. */ - __direct_remap_area_pages(mm, - start_address, - address-start_address, - u); - + generic_page_range(mm, start_address, + address - start_address, + direct_remap_area_pte_fn, &w); + w = u; if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0) return -EFAULT; v = u; @@ -417,7 +356,7 @@ * Fill in the machine address: PTE ptr is done later by * __direct_remap_area_pages(). */ - v->val = (machine_addr & PAGE_MASK) | pgprot_val(prot); + v->val = pte_val_ma(pfn_pte_ma(machine_addr >> PAGE_SHIFT, prot)); machine_addr += PAGE_SIZE; address += PAGE_SIZE; @@ -426,10 +365,8 @@ if (v != u) { /* get the ptep's filled in */ - __direct_remap_area_pages(mm, - start_address, - address-start_address, - u); + generic_page_range(mm, start_address, address - start_address, + direct_remap_area_pte_fn, &w); if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)) return -EFAULT; } @@ -440,3 +377,48 @@ } EXPORT_SYMBOL(direct_remap_area_pages); + +static int lookup_pte_fn( + pte_t *pte, struct page *pte_page, unsigned long addr, void *data) +{ + unsigned long *ptep = (unsigned long *)data; + if (ptep) + *ptep = (pfn_to_mfn(page_to_pfn(pte_page)) << + PAGE_SHIFT) | + ((unsigned long)pte & ~PAGE_MASK); + return 0; +} + +int create_lookup_pte_addr(struct mm_struct *mm, + unsigned long address, + unsigned long *ptep) +{ + return generic_page_range(mm, address, PAGE_SIZE, lookup_pte_fn, ptep); +} + 
+EXPORT_SYMBOL(create_lookup_pte_addr); + +static int noop_fn( + pte_t *pte, struct page *pte_page, unsigned long addr, void *data) +{ + return 0; +} + +int touch_pte_range(struct mm_struct *mm, + unsigned long address, + unsigned long size) +{ + return generic_page_range(mm, address, size, noop_fn, NULL); +} + +EXPORT_SYMBOL(touch_pte_range); + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/i386/mm/pgtable.c --- a/linux-2.6-xen-sparse/arch/xen/i386/mm/pgtable.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/i386/mm/pgtable.c Thu Aug 25 22:53:20 2005 @@ -25,6 +25,7 @@ #include <asm/mmu_context.h> #include <asm-xen/foreign_page.h> +#include <asm-xen/hypervisor.h> void show_mem(void) { @@ -169,7 +170,7 @@ __flush_tlb_one(vaddr); } -void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags) +void __set_fixmap (enum fixed_addresses idx, maddr_t phys, pgprot_t flags) { unsigned long address = __fix_to_virt(idx); @@ -221,8 +222,8 @@ unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT); if (!pte_write(*virt_to_ptep(va))) - HYPERVISOR_update_va_mapping( - va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0); + BUG_ON(HYPERVISOR_update_va_mapping( + va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0)); ClearPageForeign(pte); set_page_count(pte, 1); @@ -274,6 +275,11 @@ { unsigned long flags; +#ifdef CONFIG_X86_PAE + /* this gives us a page below 4GB */ + xen_create_contiguous_region((unsigned long)pgd, 0); +#endif + if (!HAVE_SHARED_KERNEL_PMD) spin_lock_irqsave(&pgd_lock, flags); @@ -349,16 +355,17 @@ if (!pte_write(*ptep)) { xen_pgd_unpin(__pa(pgd)); - HYPERVISOR_update_va_mapping( + BUG_ON(HYPERVISOR_update_va_mapping( (unsigned long)pgd, pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL), - 0); + 0)); } /* in the PAE case user pgd entries are overwritten before 
usage */ if (PTRS_PER_PMD > 1) { for (i = 0; i < USER_PTRS_PER_PGD; ++i) { pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1); + make_page_writable(pmd); kmem_cache_free(pmd_cache, pmd); } if (!HAVE_SHARED_KERNEL_PMD) { @@ -444,9 +451,9 @@ if (PageHighMem(page)) return; - HYPERVISOR_update_va_mapping( + BUG_ON(HYPERVISOR_update_va_mapping( (unsigned long)__va(pfn << PAGE_SHIFT), - pfn_pte(pfn, flags), 0); + pfn_pte(pfn, flags), 0)); } static void mm_walk(struct mm_struct *mm, pgprot_t flags) @@ -485,10 +492,10 @@ spin_lock(&mm->page_table_lock); mm_walk(mm, PAGE_KERNEL_RO); - HYPERVISOR_update_va_mapping( + BUG_ON(HYPERVISOR_update_va_mapping( (unsigned long)mm->pgd, pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO), - UVMF_TLB_FLUSH); + UVMF_TLB_FLUSH)); xen_pgd_pin(__pa(mm->pgd)); mm->context.pinned = 1; spin_lock(&mm_unpinned_lock); @@ -503,9 +510,9 @@ spin_lock(&mm->page_table_lock); xen_pgd_unpin(__pa(mm->pgd)); - HYPERVISOR_update_va_mapping( + BUG_ON(HYPERVISOR_update_va_mapping( (unsigned long)mm->pgd, - pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0); + pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0)); mm_walk(mm, PAGE_KERNEL); xen_tlb_flush(); mm->context.pinned = 0; diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/i386/pci/Makefile --- a/linux-2.6-xen-sparse/arch/xen/i386/pci/Makefile Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/i386/pci/Makefile Thu Aug 25 22:53:20 2005 @@ -17,7 +17,7 @@ c-pci-$(CONFIG_X86_VISWS) := visws.o fixup.o pci-$(CONFIG_X86_VISWS) := c-pci-$(CONFIG_X86_NUMAQ) := numa.o -pci-$(CONFIG_X86_NUMAQ) := irq.o +l-pci-$(CONFIG_X86_NUMAQ) := irq.o obj-y += $(pci-y) c-obj-y += $(c-pci-y) common.o @@ -27,6 +27,7 @@ $(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)): @ln -fsn $(srctree)/arch/i386/pci/$(notdir $@) $@ -obj-y += $(c-obj-y) $(l-pci-y) +# Make sure irq.o gets linked in before common.o +obj-y += $(patsubst common.o,$(l-pci-y) common.o,$(c-obj-y)) clean-files += $(patsubst 
%.o,%.c,$(c-obj-y) $(c-obj-) $(c-link)) diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/kernel/evtchn.c --- a/linux-2.6-xen-sparse/arch/xen/kernel/evtchn.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/kernel/evtchn.c Thu Aug 25 22:53:20 2005 @@ -116,9 +116,9 @@ #elif defined (__x86_64__) #define IRQ_REG orig_rax #endif -#define do_IRQ(irq, regs) do { \ - (regs)->IRQ_REG = (irq); \ - do_IRQ((regs)); \ +#define do_IRQ(irq, regs) do { \ + (regs)->IRQ_REG = (irq); \ + do_IRQ((regs)); \ } while (0) #endif @@ -137,14 +137,14 @@ /* NB. Interrupts are disabled on entry. */ asmlinkage void evtchn_do_upcall(struct pt_regs *regs) { - u32 l1, l2; + u32 l1, l2; unsigned int l1i, l2i, port; int irq, cpu = smp_processor_id(); shared_info_t *s = HYPERVISOR_shared_info; vcpu_info_t *vcpu_info = &s->vcpu_data[cpu]; vcpu_info->evtchn_upcall_pending = 0; - + /* NB. No need for a barrier here -- XCHG is a barrier on x86. */ l1 = xchg(&vcpu_info->evtchn_pending_sel, 0); while ( l1 != 0 ) @@ -158,9 +158,9 @@ l2 &= ~(1 << l2i); port = (l1i << 5) + l2i; - if ( (irq = evtchn_to_irq[port]) != -1 ) + if ( (irq = evtchn_to_irq[port]) != -1 ) { do_IRQ(irq, regs); - else + } else evtchn_device_upcall(port); } } @@ -229,13 +229,14 @@ if ( HYPERVISOR_event_channel_op(&op) != 0 ) panic("Failed to unbind virtual IRQ %d\n", virq); - /* This is a slight hack. Interdomain ports can be allocated - directly by userspace, and at that point they get bound by - Xen to vcpu 0. We therefore need to make sure that if we - get an event on an event channel we don't know about vcpu 0 - handles it. Binding channels to vcpu 0 when closing them - achieves this. */ - bind_evtchn_to_cpu(evtchn, 0); + /* + * This is a slight hack. Interdomain ports can be allocated directly + * by userspace, and at that point they get bound by Xen to vcpu 0. We + * therefore need to make sure that if we get an event on an event + * channel we don't know about vcpu 0 handles it. 
Binding channels to + * vcpu 0 when closing them achieves this. + */ + bind_evtchn_to_cpu(evtchn, 0); evtchn_to_irq[evtchn] = -1; irq_to_evtchn[irq] = -1; per_cpu(virq_to_irq, cpu)[virq] = -1; @@ -244,7 +245,75 @@ spin_unlock(&irq_mapping_update_lock); } -int bind_ipi_on_cpu_to_irq(int ipi) +/* This is only used when a vcpu from an xm save. The ipi is expected + to have been bound before we suspended, and so all of the xenolinux + state is set up; we only need to restore the Xen side of things. + The irq number has to be the same, but the evtchn number can + change. */ +void _bind_ipi_to_irq(int ipi, int vcpu, int irq) +{ + evtchn_op_t op; + int evtchn; + + spin_lock(&irq_mapping_update_lock); + + op.cmd = EVTCHNOP_bind_ipi; + if ( HYPERVISOR_event_channel_op(&op) != 0 ) + panic("Failed to bind virtual IPI %d on cpu %d\n", ipi, vcpu); + evtchn = op.u.bind_ipi.port; + + printk("<0>IPI %d, old evtchn %d, evtchn %d.\n", + ipi, per_cpu(ipi_to_evtchn, vcpu)[ipi], + evtchn); + + evtchn_to_irq[irq_to_evtchn[irq]] = -1; + irq_to_evtchn[irq] = -1; + + evtchn_to_irq[evtchn] = irq; + irq_to_evtchn[irq] = evtchn; + + printk("<0>evtchn_to_irq[%d] = %d.\n", evtchn, + evtchn_to_irq[evtchn]); + per_cpu(ipi_to_evtchn, vcpu)[ipi] = evtchn; + + bind_evtchn_to_cpu(evtchn, vcpu); + + spin_unlock(&irq_mapping_update_lock); + + clear_bit(evtchn, (unsigned long *)HYPERVISOR_shared_info->evtchn_mask); + clear_bit(evtchn, (unsigned long *)HYPERVISOR_shared_info->evtchn_pending); +} + +void _bind_virq_to_irq(int virq, int cpu, int irq) +{ + evtchn_op_t op; + int evtchn; + + spin_lock(&irq_mapping_update_lock); + + op.cmd = EVTCHNOP_bind_virq; + op.u.bind_virq.virq = virq; + if ( HYPERVISOR_event_channel_op(&op) != 0 ) + panic("Failed to bind virtual IRQ %d\n", virq); + evtchn = op.u.bind_virq.port; + + evtchn_to_irq[irq_to_evtchn[irq]] = -1; + irq_to_evtchn[irq] = -1; + + evtchn_to_irq[evtchn] = irq; + irq_to_evtchn[irq] = evtchn; + + per_cpu(virq_to_irq, cpu)[virq] = irq; + + 
bind_evtchn_to_cpu(evtchn, cpu); + + spin_unlock(&irq_mapping_update_lock); + + clear_bit(evtchn, (unsigned long *)HYPERVISOR_shared_info->evtchn_mask); + clear_bit(evtchn, (unsigned long *)HYPERVISOR_shared_info->evtchn_pending); +} + +int bind_ipi_to_irq(int ipi) { evtchn_op_t op; int evtchn, irq; @@ -269,7 +338,7 @@ } else { - irq = evtchn_to_irq[evtchn]; + irq = evtchn_to_irq[evtchn]; } irq_bindcount[irq]++; @@ -284,29 +353,29 @@ evtchn_op_t op; int cpu = smp_processor_id(); int evtchn = per_cpu(ipi_to_evtchn, cpu)[ipi]; - int irq = irq_to_evtchn[evtchn]; + int irq = evtchn_to_irq[evtchn]; spin_lock(&irq_mapping_update_lock); if ( --irq_bindcount[irq] == 0 ) { - op.cmd = EVTCHNOP_close; - op.u.close.dom = DOMID_SELF; - op.u.close.port = evtchn; - if ( HYPERVISOR_event_channel_op(&op) != 0 ) - panic("Failed to unbind virtual IPI %d on cpu %d\n", ipi, cpu); - - /* See comments in unbind_virq_from_irq */ - bind_evtchn_to_cpu(evtchn, 0); + op.cmd = EVTCHNOP_close; + op.u.close.dom = DOMID_SELF; + op.u.close.port = evtchn; + if ( HYPERVISOR_event_channel_op(&op) != 0 ) + panic("Failed to unbind virtual IPI %d on cpu %d\n", ipi, cpu); + + /* See comments in unbind_virq_from_irq */ + bind_evtchn_to_cpu(evtchn, 0); evtchn_to_irq[evtchn] = -1; irq_to_evtchn[irq] = -1; - per_cpu(ipi_to_evtchn, cpu)[ipi] = 0; + per_cpu(ipi_to_evtchn, cpu)[ipi] = 0; } spin_unlock(&irq_mapping_update_lock); } -int bind_evtchn_to_irq(int evtchn) +int bind_evtchn_to_irq(unsigned int evtchn) { int irq; @@ -326,7 +395,7 @@ return irq; } -void unbind_evtchn_from_irq(int evtchn) +void unbind_evtchn_from_irq(unsigned int evtchn) { int irq = evtchn_to_irq[evtchn]; @@ -341,9 +410,36 @@ spin_unlock(&irq_mapping_update_lock); } +int bind_evtchn_to_irqhandler( + unsigned int evtchn, + irqreturn_t (*handler)(int, void *, struct pt_regs *), + unsigned long irqflags, + const char *devname, + void *dev_id) +{ + unsigned int irq; + int retval; + + irq = bind_evtchn_to_irq(evtchn); + retval = 
request_irq(irq, handler, irqflags, devname, dev_id); + if ( retval != 0 ) + unbind_evtchn_from_irq(evtchn); + + return retval; +} + +void unbind_evtchn_from_irqhandler(unsigned int evtchn, void *dev_id) +{ + unsigned int irq = evtchn_to_irq[evtchn]; + free_irq(irq, dev_id); + unbind_evtchn_from_irq(evtchn); +} + +#ifdef CONFIG_SMP static void do_nothing_function(void *ign) { } +#endif /* Rebind an evtchn so that it gets delivered to a specific cpu */ static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu) @@ -354,38 +450,37 @@ spin_lock(&irq_mapping_update_lock); evtchn = irq_to_evtchn[irq]; if (!VALID_EVTCHN(evtchn)) { - spin_unlock(&irq_mapping_update_lock); - return; - } - - /* Tell Xen to send future instances of this interrupt to the - other vcpu */ + spin_unlock(&irq_mapping_update_lock); + return; + } + + /* Tell Xen to send future instances of this interrupt to other vcpu. */ op.cmd = EVTCHNOP_bind_vcpu; op.u.bind_vcpu.port = evtchn; op.u.bind_vcpu.vcpu = tcpu; - /* If this fails, it usually just indicates that we're dealing - with a virq or IPI channel, which don't actually need to be - rebound. Ignore it, but don't do the xenlinux-level rebind - in that case. */ + /* + * If this fails, it usually just indicates that we're dealing with a virq + * or IPI channel, which don't actually need to be rebound. Ignore it, + * but don't do the xenlinux-level rebind in that case. + */ if (HYPERVISOR_event_channel_op(&op) >= 0) - bind_evtchn_to_cpu(evtchn, tcpu); + bind_evtchn_to_cpu(evtchn, tcpu); spin_unlock(&irq_mapping_update_lock); - /* Now send the new target processor a NOP IPI. When this - returns, it will check for any pending interrupts, and so - service any that got delivered to the wrong processor by - mistake. */ - /* XXX: The only time this is called with interrupts disabled is - from the hotplug/hotunplug path. In that case, all cpus are - stopped with interrupts disabled, and the missed interrupts - will be picked up when they start again. 
This is kind of a - hack. - */ - if (!irqs_disabled()) { - smp_call_function(do_nothing_function, NULL, 0, 0); - } + /* + * Now send the new target processor a NOP IPI. When this returns, it + * will check for any pending interrupts, and so service any that got + * delivered to the wrong processor by mistake. + * + * XXX: The only time this is called with interrupts disabled is from the + * hotplug/hotunplug path. In that case, all cpus are stopped with + * interrupts disabled, and the missed interrupts will be picked up when + * they start again. This is kind of a hack. + */ + if (!irqs_disabled()) + smp_call_function(do_nothing_function, NULL, 0, 0); } @@ -585,6 +680,16 @@ set_affinity_irq }; +void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i) +{ + int evtchn = irq_to_evtchn[i]; + shared_info_t *s = HYPERVISOR_shared_info; + if ( !VALID_EVTCHN(evtchn) ) + return; + BUG_ON(!synch_test_bit(evtchn, &s->evtchn_mask[0])); + synch_set_bit(evtchn, &s->evtchn_pending[0]); +} + void irq_suspend(void) { int pirq, virq, irq, evtchn; @@ -631,7 +736,7 @@ evtchn = op.u.bind_virq.port; /* Record the new mapping. */ - bind_evtchn_to_cpu(evtchn, 0); + bind_evtchn_to_cpu(evtchn, 0); evtchn_to_irq[evtchn] = irq; irq_to_evtchn[irq] = evtchn; @@ -655,9 +760,9 @@ #endif for ( cpu = 0; cpu < NR_CPUS; cpu++ ) { - /* No VIRQ -> IRQ mappings. */ - for ( i = 0; i < NR_VIRQS; i++ ) - per_cpu(virq_to_irq, cpu)[i] = -1; + /* No VIRQ -> IRQ mappings. */ + for ( i = 0; i < NR_VIRQS; i++ ) + per_cpu(virq_to_irq, cpu)[i] = -1; } /* No event-channel -> IRQ mappings. 
*/ diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/kernel/gnttab.c --- a/linux-2.6-xen-sparse/arch/xen/kernel/gnttab.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/kernel/gnttab.c Thu Aug 25 22:53:20 2005 @@ -34,44 +34,90 @@ EXPORT_SYMBOL(gnttab_grant_foreign_access); +EXPORT_SYMBOL(gnttab_end_foreign_access_ref); EXPORT_SYMBOL(gnttab_end_foreign_access); EXPORT_SYMBOL(gnttab_query_foreign_access); EXPORT_SYMBOL(gnttab_grant_foreign_transfer); +EXPORT_SYMBOL(gnttab_end_foreign_transfer_ref); EXPORT_SYMBOL(gnttab_end_foreign_transfer); EXPORT_SYMBOL(gnttab_alloc_grant_references); EXPORT_SYMBOL(gnttab_free_grant_references); +EXPORT_SYMBOL(gnttab_free_grant_reference); EXPORT_SYMBOL(gnttab_claim_grant_reference); EXPORT_SYMBOL(gnttab_release_grant_reference); EXPORT_SYMBOL(gnttab_grant_foreign_access_ref); EXPORT_SYMBOL(gnttab_grant_foreign_transfer_ref); -static grant_ref_t gnttab_free_list[NR_GRANT_ENTRIES]; +#define NR_GRANT_ENTRIES (NR_GRANT_FRAMES * PAGE_SIZE / sizeof(grant_entry_t)) +#define GNTTAB_LIST_END (NR_GRANT_ENTRIES + 1) + +static grant_ref_t gnttab_list[NR_GRANT_ENTRIES]; +static int gnttab_free_count = NR_GRANT_ENTRIES; static grant_ref_t gnttab_free_head; +static spinlock_t gnttab_list_lock = SPIN_LOCK_UNLOCKED; static grant_entry_t *shared; -/* - * Lock-free grant-entry allocator - */ - -static inline int -get_free_entry( - void) -{ - grant_ref_t fh, nfh = gnttab_free_head; - do { if ( unlikely((fh = nfh) == NR_GRANT_ENTRIES) ) return -1; } - while ( unlikely((nfh = cmpxchg(&gnttab_free_head, fh, - gnttab_free_list[fh])) != fh) ); - return fh; +static struct gnttab_free_callback *gnttab_free_callback_list = NULL; + +static int +get_free_entries(int count) +{ + unsigned long flags; + int ref; + grant_ref_t head; + spin_lock_irqsave(&gnttab_list_lock, flags); + if (gnttab_free_count < count) { + spin_unlock_irqrestore(&gnttab_list_lock, flags); + return -1; + } + ref = head = gnttab_free_head; + gnttab_free_count 
-= count; + while (count-- > 1) + head = gnttab_list[head]; + gnttab_free_head = gnttab_list[head]; + gnttab_list[head] = GNTTAB_LIST_END; + spin_unlock_irqrestore(&gnttab_list_lock, flags); + return ref; +} + +#define get_free_entry() get_free_entries(1) + +static void +do_free_callbacks(void) +{ + struct gnttab_free_callback *callback = gnttab_free_callback_list, *next; + gnttab_free_callback_list = NULL; + while (callback) { + next = callback->next; + if (gnttab_free_count >= callback->count) { + callback->next = NULL; + callback->fn(callback->arg); + } else { + callback->next = gnttab_free_callback_list; + gnttab_free_callback_list = callback; + } + callback = next; + } } static inline void -put_free_entry( - grant_ref_t ref) -{ - grant_ref_t fh, nfh = gnttab_free_head; - do { gnttab_free_list[ref] = fh = nfh; wmb(); } - while ( unlikely((nfh = cmpxchg(&gnttab_free_head, fh, ref)) != fh) ); +check_free_callbacks(void) +{ + if (unlikely(gnttab_free_callback_list)) + do_free_callbacks(); +} + +static void +put_free_entry(grant_ref_t ref) +{ + unsigned long flags; + spin_lock_irqsave(&gnttab_list_lock, flags); + gnttab_list[ref] = gnttab_free_head; + gnttab_free_head = ref; + gnttab_free_count++; + check_free_callbacks(); + spin_unlock_irqrestore(&gnttab_list_lock, flags); } /* @@ -79,8 +125,7 @@ */ int -gnttab_grant_foreign_access( - domid_t domid, unsigned long frame, int readonly) +gnttab_grant_foreign_access(domid_t domid, unsigned long frame, int readonly) { int ref; @@ -96,8 +141,8 @@ } void -gnttab_grant_foreign_access_ref( - grant_ref_t ref, domid_t domid, unsigned long frame, int readonly) +gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, + unsigned long frame, int readonly) { shared[ref].frame = frame; shared[ref].domid = domid; @@ -107,7 +152,7 @@ int -gnttab_query_foreign_access( grant_ref_t ref ) +gnttab_query_foreign_access(grant_ref_t ref) { u16 nflags; @@ -117,7 +162,7 @@ } void -gnttab_end_foreign_access( grant_ref_t ref, int 
readonly ) +gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) { u16 flags, nflags; @@ -127,13 +172,17 @@ printk(KERN_ALERT "WARNING: g.e. still in use!\n"); } while ( (nflags = synch_cmpxchg(&shared[ref].flags, flags, 0)) != flags ); - +} + +void +gnttab_end_foreign_access(grant_ref_t ref, int readonly) +{ + gnttab_end_foreign_access_ref(ref, readonly); put_free_entry(ref); } int -gnttab_grant_foreign_transfer( - domid_t domid, unsigned long pfn ) +gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn) { int ref; @@ -149,8 +198,8 @@ } void -gnttab_grant_foreign_transfer_ref( - grant_ref_t ref, domid_t domid, unsigned long pfn ) +gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid, + unsigned long pfn) { shared[ref].frame = pfn; shared[ref].domid = domid; @@ -159,21 +208,13 @@ } unsigned long -gnttab_end_foreign_transfer( - grant_ref_t ref) +gnttab_end_foreign_transfer_ref(grant_ref_t ref) { unsigned long frame = 0; u16 flags; flags = shared[ref].flags; -#ifdef CONFIG_XEN_NETDEV_GRANT_RX - /* - * But can't flags == (GTF_accept_transfer | GTF_transfer_completed) - * if gnttab_donate executes without interruption??? - */ -#else - ASSERT(flags == (GTF_accept_transfer | GTF_transfer_committed)); -#endif + /* * If a transfer is committed then wait for the frame address to appear. * Otherwise invalidate the grant entry against future use. @@ -183,65 +224,91 @@ while ( unlikely((frame = shared[ref].frame) == 0) ) cpu_relax(); + return frame; +} + +unsigned long +gnttab_end_foreign_transfer(grant_ref_t ref) +{ + unsigned long frame = gnttab_end_foreign_transfer_ref(ref); put_free_entry(ref); - return frame; } void -gnttab_free_grant_references( u16 count, grant_ref_t head ) -{ - /* TODO: O(N)...? 
*/ - grant_ref_t to_die = 0, next = head; - int i; - - for ( i = 0; i < count; i++ ) - { - to_die = next; - next = gnttab_free_list[next]; - put_free_entry( to_die ); +gnttab_free_grant_reference(grant_ref_t ref) +{ + + put_free_entry(ref); +} + +void +gnttab_free_grant_references(grant_ref_t head) +{ + grant_ref_t ref; + unsigned long flags; + int count = 1; + if (head == GNTTAB_LIST_END) + return; + spin_lock_irqsave(&gnttab_list_lock, flags); + ref = head; + while (gnttab_list[ref] != GNTTAB_LIST_END) { + ref = gnttab_list[ref]; + count++; } -} - -int -gnttab_alloc_grant_references( u16 count, - grant_ref_t *head, - grant_ref_t *terminal ) -{ - int i; - grant_ref_t h = gnttab_free_head; - - for ( i = 0; i < count; i++ ) - if ( unlikely(get_free_entry() == -1) ) - goto not_enough_refs; + gnttab_list[ref] = gnttab_free_head; + gnttab_free_head = head; + gnttab_free_count += count; + check_free_callbacks(); + spin_unlock_irqrestore(&gnttab_list_lock, flags); +} + +int +gnttab_alloc_grant_references(u16 count, grant_ref_t *head) +{ + int h = get_free_entries(count); + + if (h == -1) + return -ENOSPC; *head = h; - *terminal = gnttab_free_head; return 0; - -not_enough_refs: - gnttab_free_head = h; - return -ENOSPC; -} - -int -gnttab_claim_grant_reference( grant_ref_t *private_head, - grant_ref_t terminal ) -{ - grant_ref_t g; - if ( unlikely((g = *private_head) == terminal) ) +} + +int +gnttab_claim_grant_reference(grant_ref_t *private_head) +{ + grant_ref_t g = *private_head; + if (unlikely(g == GNTTAB_LIST_END)) return -ENOSPC; - *private_head = gnttab_free_list[g]; + *private_head = gnttab_list[g]; return g; } void -gnttab_release_grant_reference( grant_ref_t *private_head, - grant_ref_t release ) -{ - gnttab_free_list[release] = *private_head; +gnttab_release_grant_reference(grant_ref_t *private_head, grant_ref_t release) +{ + gnttab_list[release] = *private_head; *private_head = release; +} + +void +gnttab_request_free_callback(struct gnttab_free_callback 
*callback, + void (*fn)(void *), void *arg, u16 count) +{ + unsigned long flags; + spin_lock_irqsave(&gnttab_list_lock, flags); + if (callback->next) + goto out; + callback->fn = fn; + callback->arg = arg; + callback->count = count; + callback->next = gnttab_free_callback_list; + gnttab_free_callback_list = callback; + check_free_callbacks(); + out: + spin_unlock_irqrestore(&gnttab_list_lock, flags); } /* @@ -252,8 +319,9 @@ static struct proc_dir_entry *grant_pde; -static int grant_ioctl(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long data) +static int +grant_ioctl(struct inode *inode, struct file *file, unsigned int cmd, + unsigned long data) { int ret; privcmd_hypercall_t hypercall; @@ -291,8 +359,9 @@ ioctl: grant_ioctl, }; -static int grant_read(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int +grant_read(char *page, char **start, off_t off, int count, int *eof, + void *data) { int len; unsigned int i; @@ -321,8 +390,9 @@ return len; } -static int grant_write(struct file *file, const char __user *buffer, - unsigned long count, void *data) +static int +grant_write(struct file *file, const char __user *buffer, unsigned long count, + void *data) { /* TODO: implement this */ return -ENOSYS; @@ -330,7 +400,8 @@ #endif /* CONFIG_PROC_FS */ -int gnttab_resume(void) +int +gnttab_resume(void) { gnttab_setup_table_t setup; unsigned long frames[NR_GRANT_FRAMES]; @@ -349,7 +420,8 @@ return 0; } -int gnttab_suspend(void) +int +gnttab_suspend(void) { int i; @@ -359,7 +431,8 @@ return 0; } -static int __init gnttab_init(void) +static int __init +gnttab_init(void) { int i; @@ -368,7 +441,7 @@ shared = (grant_entry_t *)fix_to_virt(FIX_GNTTAB_END); for ( i = 0; i < NR_GRANT_ENTRIES; i++ ) - gnttab_free_list[i] = i + 1; + gnttab_list[i] = i + 1; #ifdef CONFIG_PROC_FS /* diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/kernel/reboot.c --- a/linux-2.6-xen-sparse/arch/xen/kernel/reboot.c Wed Aug 24 
02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/kernel/reboot.c Thu Aug 25 22:53:20 2005 @@ -1,7 +1,4 @@ - #define __KERNEL_SYSCALLS__ -static int errno; -#include <linux/errno.h> #include <linux/version.h> #include <linux/kernel.h> #include <linux/mm.h> @@ -9,13 +6,23 @@ #include <linux/module.h> #include <linux/reboot.h> #include <linux/sysrq.h> +#include <linux/stringify.h> #include <asm/irq.h> #include <asm/mmu_context.h> -#include <asm-xen/ctrl_if.h> +#include <asm-xen/evtchn.h> #include <asm-xen/hypervisor.h> #include <asm-xen/xen-public/dom0_ops.h> #include <asm-xen/linux-public/suspend.h> #include <asm-xen/queues.h> +#include <asm-xen/xenbus.h> +#include <asm-xen/ctrl_if.h> +#include <linux/cpu.h> +#include <linux/kthread.h> + +#define SHUTDOWN_INVALID -1 +#define SHUTDOWN_POWEROFF 0 +#define SHUTDOWN_REBOOT 1 +#define SHUTDOWN_SUSPEND 2 void machine_restart(char * __unused) { @@ -51,30 +58,76 @@ */ /* Ignore multiple shutdown requests. */ -static int shutting_down = -1; - -static void __do_suspend(void) +static int shutting_down = SHUTDOWN_INVALID; + +#ifndef CONFIG_HOTPLUG_CPU +#define cpu_down(x) (-EOPNOTSUPP) +#define cpu_up(x) (-EOPNOTSUPP) +#endif + +static void save_vcpu_context(int vcpu, vcpu_guest_context_t *ctxt) +{ + int r; + int gdt_pages; + r = HYPERVISOR_vcpu_pickle(vcpu, ctxt); + if (r != 0) + panic("pickling vcpu %d -> %d!\n", vcpu, r); + + /* Translate from machine to physical addresses where necessary, + so that they can be translated to our new machine address space + after resume. libxc is responsible for doing this to vcpu0, + but we do it to the others. 
*/ + gdt_pages = (ctxt->gdt_ents + 511) / 512; + ctxt->ctrlreg[3] = machine_to_phys(ctxt->ctrlreg[3]); + for (r = 0; r < gdt_pages; r++) + ctxt->gdt_frames[r] = mfn_to_pfn(ctxt->gdt_frames[r]); +} + +void _restore_vcpu(int cpu); + +atomic_t vcpus_rebooting; + +static int restore_vcpu_context(int vcpu, vcpu_guest_context_t *ctxt) +{ + int r; + int gdt_pages = (ctxt->gdt_ents + 511) / 512; + + /* This is kind of a hack, and implicitly relies on the fact that + the vcpu stops in a place where all of the call clobbered + registers are already dead. */ + ctxt->user_regs.esp -= 4; + ((unsigned long *)ctxt->user_regs.esp)[0] = ctxt->user_regs.eip; + ctxt->user_regs.eip = (unsigned long)_restore_vcpu; + + /* De-canonicalise. libxc handles this for vcpu 0, but we need + to do it for the other vcpus. */ + ctxt->ctrlreg[3] = phys_to_machine(ctxt->ctrlreg[3]); + for (r = 0; r < gdt_pages; r++) + ctxt->gdt_frames[r] = pfn_to_mfn(ctxt->gdt_frames[r]); + + atomic_set(&vcpus_rebooting, 1); + r = HYPERVISOR_boot_vcpu(vcpu, ctxt); + if (r != 0) { + printk(KERN_EMERG "Failed to reboot vcpu %d (%d)\n", vcpu, r); + return -1; + } + + /* Make sure we wait for the new vcpu to come up before trying to do + anything with it or starting the next one. */ + while (atomic_read(&vcpus_rebooting)) + barrier(); + + return 0; +} + +static int __do_suspend(void *ignore) { int i, j; suspend_record_t *suspend_record; + static vcpu_guest_context_t suspended_cpu_records[NR_CPUS]; /* Hmmm... a cleaner interface to suspend/resume blkdevs would be nice. 
*/ /* XXX SMH: yes it would :-( */ -#ifdef CONFIG_XEN_BLKDEV_FRONTEND - extern void blkdev_suspend(void); - extern void blkdev_resume(void); -#else -#define blkdev_suspend() do{}while(0) -#define blkdev_resume() do{}while(0) -#endif - -#ifdef CONFIG_XEN_NETDEV_FRONTEND - extern void netif_suspend(void); - extern void netif_resume(void); -#else -#define netif_suspend() do{}while(0) -#define netif_resume() do{}while(0) -#endif #ifdef CONFIG_XEN_USB_FRONTEND extern void usbif_resume(); @@ -82,37 +135,88 @@ #define usbif_resume() do{}while(0) #endif -#ifdef CONFIG_XEN_BLKDEV_GRANT extern int gnttab_suspend(void); extern int gnttab_resume(void); -#else -#define gnttab_suspend() do{}while(0) -#define gnttab_resume() do{}while(0) -#endif - + +#ifdef CONFIG_SMP + extern void smp_suspend(void); + extern void smp_resume(void); +#endif extern void time_suspend(void); extern void time_resume(void); extern unsigned long max_pfn; extern unsigned int *pfn_to_mfn_frame_list; + cpumask_t prev_online_cpus, prev_present_cpus; + int err = 0; + + BUG_ON(smp_processor_id() != 0); + BUG_ON(in_interrupt()); + +#if defined(CONFIG_SMP) && !defined(CONFIG_HOTPLUG_CPU) + if (num_online_cpus() > 1) { + printk(KERN_WARNING "Can't suspend SMP guests without CONFIG_HOTPLUG_CPU\n"); + return -EOPNOTSUPP; + } +#endif + suspend_record = (suspend_record_t *)__get_free_page(GFP_KERNEL); if ( suspend_record == NULL ) goto out; + /* Take all of the other cpus offline. We need to be careful not + to get preempted between the final test for num_online_cpus() + == 1 and disabling interrupts, since otherwise userspace could + bring another cpu online, and then we'd be stuffed. At the + same time, cpu_down can reschedule, so we need to enable + preemption while doing that. This kind of sucks, but should be + correct. 
*/ + /* (We don't need to worry about other cpus bringing stuff up, + since by the time num_online_cpus() == 1, there aren't any + other cpus) */ + cpus_clear(prev_online_cpus); + preempt_disable(); + while (num_online_cpus() > 1) { + preempt_enable(); + for_each_online_cpu(i) { + if (i == 0) + continue; + err = cpu_down(i); + if (err != 0) { + printk(KERN_CRIT "Failed to take all CPUs down: %d.\n", err); + goto out_reenable_cpus; + } + cpu_set(i, prev_online_cpus); + } + preempt_disable(); + } + suspend_record->nr_pfns = max_pfn; /* final number of pfns */ __cli(); + + preempt_enable(); + + cpus_clear(prev_present_cpus); + for_each_present_cpu(i) { + if (i == 0) + continue; + save_vcpu_context(i, &suspended_cpu_records[i]); + cpu_set(i, prev_present_cpus); + } #ifdef __i386__ mm_pin_all(); kmem_cache_shrink(pgd_cache); #endif - netif_suspend(); - - blkdev_suspend(); - time_suspend(); + +#ifdef CONFIG_SMP + smp_suspend(); +#endif + + xenbus_suspend(); ctrl_if_suspend(); @@ -126,9 +230,11 @@ memcpy(&suspend_record->resume_info, &xen_start_info, sizeof(xen_start_info)); - HYPERVISOR_suspend(virt_to_machine(suspend_record) >> PAGE_SHIFT); - - shutting_down = -1; + /* We'll stop somewhere inside this hypercall. When it returns, + we'll start resuming after the restore. 
*/ + HYPERVISOR_suspend(virt_to_mfn(suspend_record)); + + shutting_down = SHUTDOWN_INVALID; memcpy(&xen_start_info, &suspend_record->resume_info, sizeof(xen_start_info)); @@ -142,10 +248,10 @@ for ( i=0, j=0; i < max_pfn; i+=(PAGE_SIZE/sizeof(unsigned long)), j++ ) { pfn_to_mfn_frame_list[j] = - virt_to_machine(&phys_to_machine_mapping[i]) >> PAGE_SHIFT; + virt_to_mfn(&phys_to_machine_mapping[i]); } HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list = - virt_to_machine(pfn_to_mfn_frame_list) >> PAGE_SHIFT; + virt_to_mfn(pfn_to_mfn_frame_list); gnttab_resume(); @@ -153,19 +259,36 @@ ctrl_if_resume(); + xenbus_resume(); + +#ifdef CONFIG_SMP + smp_resume(); +#endif + time_resume(); - blkdev_resume(); - - netif_resume(); - usbif_resume(); + for_each_cpu_mask(i, prev_present_cpus) { + restore_vcpu_context(i, &suspended_cpu_records[i]); + } + __sti(); + + out_reenable_cpus: + for_each_cpu_mask(i, prev_online_cpus) { + j = cpu_up(i); + if (j != 0) { + printk(KERN_CRIT "Failed to bring cpu %d back up (%d).\n", + i, j); + err = j; + } + } out: if ( suspend_record != NULL ) free_page((unsigned long)suspend_record); + return err; } static int shutdown_process(void *__unused) @@ -186,7 +309,7 @@ switch ( shutting_down ) { - case CMSG_SHUTDOWN_POWEROFF: + case SHUTDOWN_POWEROFF: if ( execve("/sbin/poweroff", poweroff_argv, envp) < 0 ) { sys_reboot(LINUX_REBOOT_MAGIC1, @@ -196,7 +319,7 @@ } break; - case CMSG_SHUTDOWN_REBOOT: + case SHUTDOWN_REBOOT: if ( execve("/sbin/reboot", restart_argv, envp) < 0 ) { sys_reboot(LINUX_REBOOT_MAGIC1, @@ -207,16 +330,28 @@ break; } - shutting_down = -1; /* could try again */ + shutting_down = SHUTDOWN_INVALID; /* could try again */ return 0; } +static struct task_struct *kthread_create_on_cpu(int (*f)(void *arg), + void *arg, + const char *name, + int cpu) +{ + struct task_struct *p; + p = kthread_create(f, arg, name); + kthread_bind(p, cpu); + wake_up_process(p); + return p; +} + static void __shutdown_handler(void *unused) { int err; - if ( 
shutting_down != CMSG_SHUTDOWN_SUSPEND ) + if ( shutting_down != SHUTDOWN_SUSPEND ) { err = kernel_thread(shutdown_process, NULL, CLONE_FS | CLONE_FILES); if ( err < 0 ) @@ -224,46 +359,121 @@ } else { - __do_suspend(); - } -} - -static void shutdown_handler(ctrl_msg_t *msg, unsigned long id) + kthread_create_on_cpu(__do_suspend, NULL, "suspender", 0); + } +} + +static void shutdown_handler(struct xenbus_watch *watch, const char *node) { static DECLARE_WORK(shutdown_work, __shutdown_handler, NULL); - if ( msg->subtype == CMSG_SHUTDOWN_SYSRQ ) - { - int sysrq = ((shutdown_sysrq_t *)&msg->msg[0])->key; - + char *str; + + str = (char *)xenbus_read("control", "shutdown", NULL); + /* Ignore read errors. */ + if (IS_ERR(str)) + return; + if (strlen(str) == 0) { + kfree(str); + return; + } + + xenbus_write("control", "shutdown", "", O_CREAT); + + if (strcmp(str, "poweroff") == 0) + shutting_down = SHUTDOWN_POWEROFF; + else if (strcmp(str, "reboot") == 0) + shutting_down = SHUTDOWN_REBOOT; + else if (strcmp(str, "suspend") == 0) + shutting_down = SHUTDOWN_SUSPEND; + else { + printk("Ignoring shutdown request: %s\n", str); + shutting_down = SHUTDOWN_INVALID; + } + + kfree(str); + + if (shutting_down != SHUTDOWN_INVALID) + schedule_work(&shutdown_work); +} + #ifdef CONFIG_MAGIC_SYSRQ +static void sysrq_handler(struct xenbus_watch *watch, const char *node) +{ + char sysrq_key = '\0'; + + if (!xenbus_scanf("control", "sysrq", "%c", &sysrq_key)) { + printk(KERN_ERR "Unable to read sysrq code in control/sysrq\n"); + return; + } + + xenbus_printf("control", "sysrq", "%c", '\0'); + + if (sysrq_key != '\0') { + #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - handle_sysrq(sysrq, NULL, NULL); + handle_sysrq(sysrq_key, NULL, NULL); #else - handle_sysrq(sysrq, NULL, NULL, NULL); -#endif -#endif - } - else if ( (shutting_down == -1) && - ((msg->subtype == CMSG_SHUTDOWN_POWEROFF) || - (msg->subtype == CMSG_SHUTDOWN_REBOOT) || - (msg->subtype == CMSG_SHUTDOWN_SUSPEND)) ) - { - 
shutting_down = msg->subtype; - schedule_work(&shutdown_work); - } - else - { - printk("Ignore spurious shutdown request\n"); - } - - ctrl_if_send_response(msg); + handle_sysrq(sysrq_key, NULL, NULL, NULL); +#endif + } +} +#endif + +static struct xenbus_watch shutdown_watch = { + .node = "control/shutdown", + .callback = shutdown_handler +}; + +#ifdef CONFIG_MAGIC_SYSRQ +static struct xenbus_watch sysrq_watch = { + .node ="control/sysrq", + .callback = sysrq_handler +}; +#endif + +static struct notifier_block xenstore_notifier; + +/* Setup our watcher + NB: Assumes xenbus_lock is held! +*/ +static int setup_shutdown_watcher(struct notifier_block *notifier, + unsigned long event, + void *data) +{ + int err1 = 0; +#ifdef CONFIG_MAGIC_SYSRQ + int err2 = 0; +#endif + + BUG_ON(down_trylock(&xenbus_lock) == 0); + + err1 = register_xenbus_watch(&shutdown_watch); +#ifdef CONFIG_MAGIC_SYSRQ + err2 = register_xenbus_watch(&sysrq_watch); +#endif + + if (err1) { + printk(KERN_ERR "Failed to set shutdown watcher\n"); + } + +#ifdef CONFIG_MAGIC_SYSRQ + if (err2) { + printk(KERN_ERR "Failed to set sysrq watcher\n"); + } +#endif + + return NOTIFY_DONE; } static int __init setup_shutdown_event(void) { - ctrl_if_register_receiver(CMSG_SHUTDOWN, shutdown_handler, 0); + + xenstore_notifier.notifier_call = setup_shutdown_watcher; + + register_xenstore_notifier(&xenstore_notifier); + return 0; } -__initcall(setup_shutdown_event); +subsys_initcall(setup_shutdown_event); diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/kernel/skbuff.c --- a/linux-2.6-xen-sparse/arch/xen/kernel/skbuff.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/kernel/skbuff.c Thu Aug 25 22:53:20 2005 @@ -5,8 +5,6 @@ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/slab.h> -#include <linux/string.h> -#include <linux/errno.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/etherdevice.h> @@ -14,34 +12,86 @@ #include <linux/init.h> #include 
<asm/io.h> #include <asm/page.h> - -EXPORT_SYMBOL(__dev_alloc_skb); +#include <asm-xen/hypervisor.h> /* Referenced in netback.c. */ /*static*/ kmem_cache_t *skbuff_cachep; -/* Size must be cacheline-aligned (alloc_skb uses SKB_DATA_ALIGN). */ -#define XEN_SKB_SIZE \ - ((PAGE_SIZE - sizeof(struct skb_shared_info)) & ~(SMP_CACHE_BYTES - 1)) +#define MAX_SKBUFF_ORDER 2 +static kmem_cache_t *skbuff_order_cachep[MAX_SKBUFF_ORDER + 1]; struct sk_buff *__dev_alloc_skb(unsigned int length, int gfp_mask) { - struct sk_buff *skb; - skb = alloc_skb_from_cache(skbuff_cachep, length + 16, gfp_mask); - if ( likely(skb != NULL) ) - skb_reserve(skb, 16); - return skb; + struct sk_buff *skb; + int order; + + length = SKB_DATA_ALIGN(length + 16); + order = get_order(length + sizeof(struct skb_shared_info)); + if (order > MAX_SKBUFF_ORDER) { + printk(KERN_ALERT "Attempt to allocate order %d skbuff. " + "Increase MAX_SKBUFF_ORDER.\n", order); + return NULL; + } + + skb = alloc_skb_from_cache( + skbuff_order_cachep[order], length, gfp_mask); + if (skb != NULL) + skb_reserve(skb, 16); + + return skb; } static void skbuff_ctor(void *buf, kmem_cache_t *cachep, unsigned long unused) { - scrub_pages(buf, 1); + int order = 0; + + while (skbuff_order_cachep[order] != cachep) + order++; + + if (order != 0) + xen_create_contiguous_region((unsigned long)buf, order); + + scrub_pages(buf, 1 << order); +} + +static void skbuff_dtor(void *buf, kmem_cache_t *cachep, unsigned long unused) +{ + int order = 0; + + while (skbuff_order_cachep[order] != cachep) + order++; + + if (order != 0) + xen_destroy_contiguous_region((unsigned long)buf, order); } static int __init skbuff_init(void) { - skbuff_cachep = kmem_cache_create( - "xen-skb", PAGE_SIZE, PAGE_SIZE, 0, skbuff_ctor, NULL); - return 0; + static char name[MAX_SKBUFF_ORDER + 1][20]; + unsigned long size; + int order; + + for (order = 0; order <= MAX_SKBUFF_ORDER; order++) { + size = PAGE_SIZE << order; + sprintf(name[order], "xen-skb-%lu", size); + 
skbuff_order_cachep[order] = kmem_cache_create( + name[order], size, size, 0, skbuff_ctor, skbuff_dtor); + } + + skbuff_cachep = skbuff_order_cachep[0]; + + return 0; } __initcall(skbuff_init); + +EXPORT_SYMBOL(__dev_alloc_skb); + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/Kconfig --- a/linux-2.6-xen-sparse/arch/xen/x86_64/Kconfig Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/Kconfig Thu Aug 25 22:53:20 2005 @@ -125,6 +125,10 @@ config X86_IO_APIC bool default XEN_PRIVILEGED_GUEST + +config X86_XEN_GENAPIC + bool + default XEN_PRIVILEGED_GUEST || SMP config X86_LOCAL_APIC bool @@ -325,12 +329,12 @@ # need this always enabled with GART_IOMMU for the VIA workaround config SWIOTLB bool - depends on GART_IOMMU + depends on PCI default y config DUMMY_IOMMU bool - depends on !GART_IOMMU && !SWIOTLB + depends on !GART_IOMMU default y help Don't use IOMMU code. 
This will cause problems when you have more than 4GB diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/Makefile --- a/linux-2.6-xen-sparse/arch/xen/x86_64/Makefile Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/Makefile Thu Aug 25 22:53:20 2005 @@ -79,14 +79,15 @@ CFLAGS += $(xenflags-y) AFLAGS += $(xenflags-y) -prepare: include/asm-$(XENARCH)/asm_offset.h -CLEAN_FILES += include/asm-$(XENARCH)/asm_offset.h +prepare: include/asm-$(XENARCH)/asm_offsets.h +CLEAN_FILES += include/asm-$(XENARCH)/asm_offsets.h arch/$(XENARCH)/kernel/asm-offsets.s: include/asm include/.asm-ignore \ include/linux/version.h include/config/MARKER +include/asm-$(XENARCH)/offset.h: arch/$(XENARCH)/kernel/asm-offsets.s + $(call filechk,gen-asm-offsets) -include/asm-$(XENARCH)/asm_offset.h: arch/xen/x86_64/kernel/asm-offsets.s - $(call filechk,gen-asm-offsets) - ln -fsn asm_offset.h include/asm-$(XENARCH)/offset.h - +include/asm-$(XENARCH)/asm_offsets.h: include/asm-$(XENARCH)/offset.h + ln -fsn offset.h include/asm-$(XENARCH)/asm_offsets.h + ln -fsn offset.h include/asm-$(XENARCH)/asm_offset.h diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/ia32/Makefile --- a/linux-2.6-xen-sparse/arch/xen/x86_64/ia32/Makefile Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/ia32/Makefile Thu Aug 25 22:53:20 2005 @@ -36,8 +36,8 @@ $(call if_changed,syscall) AFLAGS_vsyscall-int80.o = -m32 -I$(obj) -AFLAGS_vsyscall-sysenter.o = -m32 -AFLAGS_vsyscall-syscall.o = -m32 +AFLAGS_vsyscall-sysenter.o = -m32 -I$(obj) +AFLAGS_vsyscall-syscall.o = -m32 -I$(obj) CFLAGS_ia32_ioctl.o += -Ifs/ s-link := vsyscall-syscall.o vsyscall-sysenter.o vsyscall-sigreturn.o @@ -48,13 +48,11 @@ $(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)) $(patsubst %.o,$(obj)/%.S,$(s-obj-y) $(s-link)): @ln -fsn $(srctree)/arch/x86_64/ia32/$(notdir $@) $@ -$(obj)/vsyscall-int80.S: $(obj)/vsyscall-sigreturn.S ../../i386/kernel/vsyscall-note.S 
-$(obj)/vsyscall-sysenter.S: $(obj)/vsyscall-sigreturn.S -$(obj)/vsyscall-syscall.S: $(obj)/vsyscall-sigreturn.S +$(obj)/vsyscall-int80.o $(obj)/vsyscall-sysenter.o $(obj)/vsyscall-syscall.o: \ + $(obj)/vsyscall-sigreturn.S $(obj)/../../i386/kernel/vsyscall-note.S -../../i386/kernel/vsyscall-note.S: - @ln -fsn $(srctree)/arch/i386/kernel/$(notdir $@) $(srctree)/arch/xen/i386/kernel/$(notdir $@) - make -C arch/xen/i386/kernel vsyscall-note.S +$(obj)/../../i386/kernel/vsyscall-note.S: + @ln -fsn $(srctree)/arch/i386/kernel/$(notdir $@) $@ obj-y += $(c-obj-y) $(s-obj-y) diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/ia32/syscall32.c --- a/linux-2.6-xen-sparse/arch/xen/x86_64/ia32/syscall32.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/ia32/syscall32.c Thu Aug 25 22:53:20 2005 @@ -128,8 +128,12 @@ #endif return 0; } - -__initcall(init_syscall32); + +/* + * This must be done early in case we have an initrd containing 32-bit + * binaries (e.g., hotplug). This could be pushed upstream to arch/x86_64. 
+ */ +core_initcall(init_syscall32); /* May not be __init: called during resume */ void syscall32_cpu_init(void) diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/Makefile --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/Makefile Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/Makefile Thu Aug 25 22:53:20 2005 @@ -25,30 +25,32 @@ c-obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_X86_CPUID) += cpuid.o -#obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o +obj-$(CONFIG_SMP) += smp.o smpboot.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o c-obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o -c-obj-$(CONFIG_X86_IO_APIC) += genapic.o genapic_cluster.o genapic_flat.o +obj-$(CONFIG_X86_XEN_GENAPIC) += genapic.o genapic_xen.o +c-obj-$(CONFIG_X86_IO_APIC) += genapic_cluster.o genapic_flat.o #obj-$(CONFIG_PM) += suspend.o #obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o #obj-$(CONFIG_CPU_FREQ) += cpufreq/ #obj-$(CONFIG_EARLY_PRINTK) += early_printk.o #obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o -c-obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o -#obj-$(CONFIG_SWIOTLB) += swiotlb.o +obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o +i386-obj-$(CONFIG_DUMMY_IOMMU) += pci-dma.o +i386-obj-$(CONFIG_SWIOTLB) += swiotlb.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer.o c-obj-$(CONFIG_MODULES) += module.o -#obj-y += topology.o +obj-y += topology.o c-obj-y += intel_cacheinfo.o bootflag-y += ../../../i386/kernel/bootflag.o cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../../i386/kernel/cpuid.o topology-y += ../../../i386/mach-default/topology.o -swiotlb-$(CONFIG_SWIOTLB) += ../../../ia64/lib/swiotlb.o +#swiotlb-$(CONFIG_SWIOTLB) += ../../../ia64/lib/swiotlb.o microcode-$(subst m,y,$(CONFIG_MICROCODE)) += ../../../i386/kernel/microcode.o intel_cacheinfo-y += ../../../i386/kernel/cpu/intel_cacheinfo.o quirks-y += ../../i386/kernel/quirks.o 
diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/apic.c --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/apic.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/apic.c Thu Aug 25 22:53:20 2005 @@ -45,10 +45,11 @@ void smp_local_timer_interrupt(struct pt_regs *regs) { + + profile_tick(CPU_PROFILING, regs); +#ifndef CONFIG_XEN int cpu = smp_processor_id(); - profile_tick(CPU_PROFILING, regs); -#if 0 if (--per_cpu(prof_counter, cpu) <= 0) { /* * The multiplier may have changed since the last time we got diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/e820.c --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/e820.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/e820.c Thu Aug 25 22:53:20 2005 @@ -46,7 +46,7 @@ e820.nr_map++; } -#if 0 +#ifndef CONFIG_XEN extern char _end[]; /* @@ -235,7 +235,6 @@ } } - void __init e820_print_map(char *who) { int i; @@ -516,9 +515,31 @@ printk(KERN_INFO "BIOS-provided physical RAM map:\n"); e820_print_map(who); } + +#else /* CONFIX_XEN */ +extern unsigned long xen_override_max_pfn; +extern union xen_start_info_union xen_start_info_union; +/* + * Guest physical starts from 0. 
+ */ +unsigned long __init e820_end_of_ram(void) +{ + unsigned long max_end_pfn = xen_start_info.nr_pages; + + if ( xen_override_max_pfn < max_end_pfn) + xen_override_max_pfn = max_end_pfn; + + return xen_override_max_pfn; +} + + + +void __init e820_reserve_resources(void) +{ + return; /* Xen won't have reserved entries */ +} + #endif - -extern unsigned long xen_override_max_pfn; void __init parse_memopt(char *p, char **from) { diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/early_printk.c --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/early_printk.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/early_printk.c Thu Aug 25 22:53:20 2005 @@ -5,6 +5,8 @@ #include <linux/string.h> #include <asm/io.h> #include <asm/processor.h> + +#ifndef CONFIG_XEN /* Simple VGA output */ @@ -59,7 +61,6 @@ .index = -1, }; -#ifndef CONFIG_XEN /* Serial functions loosely based on a similar package from Klaus P. Gerlicher */ static int early_serial_base = 0x3f8; /* ttyS0 */ @@ -148,7 +149,8 @@ outb((divisor >> 8) & 0xff, early_serial_base + DLH); outb(c & ~DLAB, early_serial_base + LCR); } -#else + +#else /* CONFIG_XEN */ static void early_serial_write(struct console *con, const char *s, unsigned count) @@ -167,6 +169,13 @@ static __init void early_serial_init(char *s) { } + +/* + * No early VGA console on Xen, as we do not have convenient ISA-space + * mappings. Someone should fix this for domain 0. For now, use fake serial. 
+ */ +#define early_vga_console early_serial_console + #endif static struct console early_serial_console = { diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/entry.S --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/entry.S Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/entry.S Thu Aug 25 22:53:20 2005 @@ -128,7 +128,6 @@ * #define VGCF_IN_SYSCALL (1<<8) */ .macro SWITCH_TO_USER flag - movl $0,%gs:pda_kernel_mode # change to user mode subq $8*4,%rsp # reuse rip, cs, rflags, rsp, ss in the stack movq %rax,(%rsp) movq %r11,1*8(%rsp) @@ -139,7 +138,6 @@ .endm .macro SWITCH_TO_KERNEL ssoff,adjust=0 - btsq $0,%gs:pda_kernel_mode jc 1f orb $1,\ssoff-\adjust+4(%rsp) 1: diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/head.S --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/head.S Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/head.S Thu Aug 25 22:53:20 2005 @@ -28,8 +28,6 @@ #include <asm/page.h> #include <asm/msr.h> #include <asm/cache.h> -/* #include <asm/thread_info.h> */ - /* we are not able to switch in one step to the final KERNEL ADRESS SPACE * because we need identity-mapped pages on setup so define __START_KERNEL to @@ -43,10 +41,9 @@ startup_64: ENTRY(_start) cld - movq init_rsp(%rip),%rsp /* Copy the necessary stuff from xen_start_info structure. */ movq $xen_start_info_union,%rdi - movq $64,%rcx /* sizeof (union xen_start_info_union) / sizeof (long) */ + movq $256,%rcx rep movsq #ifdef CONFIG_SMP @@ -54,6 +51,7 @@ cld #endif /* CONFIG_SMP */ + movq init_rsp(%rip),%rsp /* zero EFLAGS after setting rsp */ pushq $0 popfq @@ -116,15 +114,81 @@ ENTRY(init_level4_user_pgt) .fill 512,8,0 + /* + * In Xen the following pre-initialized pgt entries are re-initialized. 
+ */ +.org 0x3000 +ENTRY(level3_kernel_pgt) + .fill 510,8,0 + /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ + .quad 0x0000000000105007 /* -> level2_kernel_pgt */ + .fill 1,8,0 + +.org 0x4000 +ENTRY(level2_ident_pgt) + /* 40MB for bootup. */ + .quad 0x0000000000000283 + .quad 0x0000000000200183 + .quad 0x0000000000400183 + .quad 0x0000000000600183 + .quad 0x0000000000800183 + .quad 0x0000000000A00183 + .quad 0x0000000000C00183 + .quad 0x0000000000E00183 + .quad 0x0000000001000183 + .quad 0x0000000001200183 + .quad 0x0000000001400183 + .quad 0x0000000001600183 + .quad 0x0000000001800183 + .quad 0x0000000001A00183 + .quad 0x0000000001C00183 + .quad 0x0000000001E00183 + .quad 0x0000000002000183 + .quad 0x0000000002200183 + .quad 0x0000000002400183 + .quad 0x0000000002600183 + /* Temporary mappings for the super early allocator in arch/x86_64/mm/init.c */ + .globl temp_boot_pmds +temp_boot_pmds: + .fill 492,8,0 + +.org 0x5000 +ENTRY(level2_kernel_pgt) + /* 40MB kernel mapping. The kernel code cannot be bigger than that. + When you change this change KERNEL_TEXT_SIZE in page.h too. */ + /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */ + .quad 0x0000000000000183 + .quad 0x0000000000200183 + .quad 0x0000000000400183 + .quad 0x0000000000600183 + .quad 0x0000000000800183 + .quad 0x0000000000A00183 + .quad 0x0000000000C00183 + .quad 0x0000000000E00183 + .quad 0x0000000001000183 + .quad 0x0000000001200183 + .quad 0x0000000001400183 + .quad 0x0000000001600183 + .quad 0x0000000001800183 + .quad 0x0000000001A00183 + .quad 0x0000000001C00183 + .quad 0x0000000001E00183 + .quad 0x0000000002000183 + .quad 0x0000000002200183 + .quad 0x0000000002400183 + .quad 0x0000000002600183 + /* Module mapping starts here */ + .fill 492,8,0 + /* * This is used for vsyscall area mapping as we have a different * level4 page table for user. 
*/ -.org 0x3000 +.org 0x6000 ENTRY(level3_user_pgt) .fill 512,8,0 -.org 0x4000 +.org 0x7000 ENTRY(cpu_gdt_table) /* The TLS descriptors are currently at a different place compared to i386. Hopefully nobody expects them at a fixed place (Wine?) */ @@ -140,26 +204,34 @@ .quad 0,0 /* TSS */ .quad 0,0 /* LDT */ .quad 0,0,0 /* three TLS descriptors */ - -gdt_end: + .quad 0 /* unused now? __KERNEL16_CS - 16bit PM for S3 wakeup. */ + +gdt_end: +#if 0 /* asm/segment.h:GDT_ENTRIES must match this */ /* This should be a multiple of the cache line size */ /* GDTs of other CPUs: */ .fill (GDT_SIZE * NR_CPUS) - (gdt_end - cpu_gdt_table) - -.org 0x5000 +#endif + +.org 0x8000 ENTRY(empty_zero_page) -.org 0x6000 +.org 0x9000 ENTRY(empty_bad_page) -.org 0x7000 +.org 0xa000 ENTRY(empty_bad_pte_table) -.org 0x8000 +.org 0xb000 ENTRY(empty_bad_pmd_table) - .org 0x9000 +.org 0xc000 +ENTRY(level3_physmem_pgt) + .quad 0x0000000000105007 /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */ + + + .org 0xd000 #ifdef CONFIG_ACPI_SLEEP ENTRY(wakeup_level4_pgt) .quad 0x0000000000102007 /* -> level3_ident_pgt */ diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/ioport.c --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/ioport.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/ioport.c Thu Aug 25 22:53:20 2005 @@ -30,7 +30,6 @@ unsigned int old_io_pl = current->thread.io_pl; physdev_op_t op; - if (new_io_pl > 3) return -EINVAL; @@ -38,16 +37,12 @@ if ((new_io_pl > old_io_pl) && !capable(CAP_SYS_RAWIO)) return -EPERM; - /* Maintain OS privileges even if user attempts to relinquish them. */ - if (new_io_pl == 0) - new_io_pl = 1; - /* Change our version of the privilege levels. */ current->thread.io_pl = new_io_pl; /* Force the change at ring 0. */ op.cmd = PHYSDEVOP_SET_IOPL; - op.u.set_iopl.iopl = new_io_pl; + op.u.set_iopl.iopl = (new_io_pl == 0) ? 
1 : new_io_pl; HYPERVISOR_physdev_op(&op); return 0; diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/irq.c --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/irq.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/irq.c Thu Aug 25 22:53:20 2005 @@ -21,6 +21,11 @@ atomic_t irq_err_count; +#ifdef CONFIG_X86_IO_APIC +#ifdef APIC_MISMATCH_DEBUG +atomic_t irq_mis_count; +#endif +#endif /* * Generic, controller-independent functions: diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/pci-nommu.c --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/pci-nommu.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/pci-nommu.c Thu Aug 25 22:53:20 2005 @@ -61,6 +61,7 @@ EXPORT_SYMBOL(dma_free_coherent); #endif +#if 0 int dma_supported(struct device *hwdev, u64 mask) { /* @@ -76,6 +77,7 @@ return 1; } EXPORT_SYMBOL(dma_supported); +#endif int dma_get_cache_alignment(void) { diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/process.c --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/process.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/process.c Thu Aug 25 22:53:20 2005 @@ -148,7 +148,6 @@ if (cpu_is_offline(cpu)) play_dead(); - __IRQ_STAT(cpu,idle_timestamp) = jiffies; xen_idle(); } @@ -454,8 +453,8 @@ #define C(i) do { \ if (unlikely(next->tls_array[i] != prev->tls_array[i])) { \ mcl->op = __HYPERVISOR_update_descriptor; \ - mcl->args[0] = virt_to_machine(&get_cpu_gdt_table(cpu) \ - [GDT_ENTRY_TLS_MIN + i]); \ + mcl->args[0] = virt_to_machine( \ + &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]);\ mcl->args[1] = next->tls_array[i]; \ mcl++; \ } \ @@ -465,7 +464,7 @@ if (unlikely(prev->io_pl != next->io_pl)) { iopl_op.cmd = PHYSDEVOP_SET_IOPL; - iopl_op.u.set_iopl.iopl = next->io_pl; + iopl_op.u.set_iopl.iopl = (next->io_pl == 0) ? 
1 : next->io_pl; mcl->op = __HYPERVISOR_physdev_op; mcl->args[0] = (unsigned long)&iopl_op; mcl++; @@ -475,7 +474,7 @@ iobmp_op.cmd = PHYSDEVOP_SET_IOBITMAP; iobmp_op.u.set_iobitmap.bitmap = - (unsigned long)next->io_bitmap_ptr; + (char *)next->io_bitmap_ptr; iobmp_op.u.set_iobitmap.nr_ports = next->io_bitmap_ptr ? IO_BITMAP_BITS : 0; mcl->op = __HYPERVISOR_physdev_op; @@ -744,3 +743,9 @@ sp -= get_random_int() % 8192; return sp & ~0xf; } + +#ifndef CONFIG_SMP +void _restore_vcpu(void) +{ +} +#endif diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup.c --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup.c Thu Aug 25 22:53:20 2005 @@ -40,7 +40,6 @@ #include <linux/acpi.h> #include <linux/kallsyms.h> #include <linux/edd.h> -#include <linux/percpu.h> #include <asm/mtrr.h> #include <asm/uaccess.h> #include <asm/system.h> @@ -58,27 +57,36 @@ #include <asm/setup.h> #include <asm/mach_apic.h> #include <asm/numa.h> +#ifdef CONFIG_XEN +#include <linux/percpu.h> #include <asm-xen/xen-public/physdev.h> #include "setup_arch_pre.h" #include <asm-xen/hypervisor.h> - #define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) #define PFN_PHYS(x) ((x) << PAGE_SHIFT) - +#define end_pfn_map end_pfn #include <asm/mach-xen/setup_arch_post.h> extern unsigned long start_pfn; - -#if 0 -struct edid_info { - unsigned char dummy[128]; -}; -#endif - extern struct edid_info edid_info; + +shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page; +EXPORT_SYMBOL(HYPERVISOR_shared_info); /* Allows setting of maximum possible memory size */ unsigned long xen_override_max_pfn; + +u32 *phys_to_machine_mapping, *pfn_to_mfn_frame_list; + +EXPORT_SYMBOL(phys_to_machine_mapping); + +DEFINE_PER_CPU(multicall_entry_t, multicall_list[8]); +DEFINE_PER_CPU(int, nr_multicall_ents); + +/* Raw start-of-day parameters from the hypervisor. 
*/ +union xen_start_info_union xen_start_info_union; +#endif + /* * Machine setup.. */ @@ -166,7 +174,7 @@ #define IORESOURCE_ROM (IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM) -#ifdef CONFIG_XEN_PRIVILEGED_GUEST +#if defined(CONFIG_XEN_PRIVILEGED_GUEST) || !defined(CONFIG_XEN) static struct resource system_rom_resource = { .name = "System ROM", .start = 0xf0000, @@ -200,12 +208,14 @@ #define ADAPTER_ROM_RESOURCES \ (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0]) +#if defined(CONFIG_XEN_PRIVILEGED_GUEST) || !defined(CONFIG_XEN) static struct resource video_rom_resource = { .name = "Video ROM", .start = 0xc0000, .end = 0xc7fff, .flags = IORESOURCE_ROM, }; +#endif static struct resource video_ram_resource = { .name = "Video RAM area", @@ -214,7 +224,7 @@ .flags = IORESOURCE_RAM, }; -#ifdef CONFIG_XEN_PRIVILEGED_GUEST +#if defined(CONFIG_XEN_PRIVILEGED_GUEST) || !defined(CONFIG_XEN) #define romsignature(x) (*(unsigned short *)(x) == 0xaa55) static int __init romchecksum(unsigned char *rom, unsigned long length) @@ -292,33 +302,24 @@ } #endif -/* - * Point at the empty zero page to start with. We map the real shared_info - * page as soon as fixmap is up and running. - */ -shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page; -EXPORT_SYMBOL(HYPERVISOR_shared_info); - -u32 *phys_to_machine_mapping, *pfn_to_mfn_frame_list; - -EXPORT_SYMBOL(phys_to_machine_mapping); - -DEFINE_PER_CPU(multicall_entry_t, multicall_list[8]); -DEFINE_PER_CPU(int, nr_multicall_ents); - -/* Raw start-of-day parameters from the hypervisor. 
*/ -union xen_start_info_union xen_start_info_union; static __init void parse_cmdline_early (char ** cmdline_p) { char c = ' ', *to = command_line, *from = COMMAND_LINE; - int len = 0, max_cmdline; - + int len = 0; + + /* Save unparsed command line copy for /proc/cmdline */ +#ifdef CONFIG_XEN + int max_cmdline; + if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE) max_cmdline = COMMAND_LINE_SIZE; memcpy(saved_command_line, xen_start_info.cmd_line, max_cmdline); - /* Save unparsed command line copy for /proc/cmdline */ saved_command_line[max_cmdline-1] = '\0'; +#else + memcpy(saved_command_line, COMMAND_LINE, COMMAND_LINE_SIZE); + saved_command_line[COMMAND_LINE_SIZE-1] = '\0'; +#endif for (;;) { if (c != ' ') @@ -376,8 +377,7 @@ acpi_skip_timer_override = 1; #endif #endif - -#if 0 +#ifndef CONFIG_XEN if (!memcmp(from, "nolapic", 7) || !memcmp(from, "disableapic", 11)) disable_apic = 1; @@ -389,8 +389,7 @@ skip_ioapic_setup = 0; ioapic_force = 1; } -#endif - +#endif if (!memcmp(from, "mem=", 4)) parse_memopt(from+4, &from); @@ -424,34 +423,28 @@ } #ifndef CONFIG_DISCONTIGMEM +#ifdef CONFIG_XEN static void __init contig_initmem_init(void) { + unsigned long bootmap_size = init_bootmem(start_pfn, end_pfn); + free_bootmem(0, end_pfn << PAGE_SHIFT); + reserve_bootmem(HIGH_MEMORY, + (PFN_PHYS(start_pfn) + bootmap_size + PAGE_SIZE-1) + - HIGH_MEMORY); +} +#else +static void __init contig_initmem_init(void) +{ unsigned long bootmap_size, bootmap; - - /* - * partially used pages are not usable - thus - * we are rounding upwards: - */ - bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; - bootmap = start_pfn; - bootmap_size = init_bootmem(bootmap, end_pfn); + bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size); + if (bootmap == -1L) + panic("Cannot find bootmem map of size %ld\n",bootmap_size); + bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); + e820_bootmem_free(&contig_page_data, 0, end_pfn << PAGE_SHIFT); reserve_bootmem(bootmap, 
bootmap_size); - - free_bootmem(start_pfn << PAGE_SHIFT, (end_pfn - start_pfn) << PAGE_SHIFT); - printk("Registering memory for bootmem: from %lx, size = %lx\n", - start_pfn << PAGE_SHIFT, (end_pfn - start_pfn) << PAGE_SHIFT); - /* - * This should cover kernel_end - */ -#if 0 - reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) + - bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY)); -#endif - reserve_bootmem(0, (PFN_PHYS(start_pfn) + - bootmap_size + PAGE_SIZE-1)); - } +#endif /* !CONFIG_XEN */ #endif /* Use inline assembly to define this because the nops are defined @@ -543,7 +536,7 @@ } #endif -#if 0 +#ifndef CONFIG_XEN #define EBDA_ADDR_POINTER 0x40E static void __init reserve_ebda_region(void) { @@ -559,73 +552,17 @@ } #endif -/* - * Guest physical starts from 0. - */ - -unsigned long __init xen_end_of_ram(void) -{ - unsigned long max_end_pfn = xen_start_info.nr_pages; - - if ( xen_override_max_pfn < max_end_pfn) - xen_override_max_pfn = max_end_pfn; - - return xen_override_max_pfn; -} - -static void __init print_memory_map(char *who) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - early_printk(" %s: %016Lx - %016Lx ", who, - e820.map[i].addr, - e820.map[i].addr + e820.map[i].size); - switch (e820.map[i].type) { - case E820_RAM: early_printk("(usable)\n"); - break; - case E820_RESERVED: - early_printk("(reserved)\n"); - break; - case E820_ACPI: - early_printk("(ACPI data)\n"); - break; - case E820_NVS: - early_printk("(ACPI NVS)\n"); - break; - default: early_printk("type %u\n", e820.map[i].type); - break; - } - } -} - void __init setup_arch(char **cmdline_p) { - int i, j; - physdev_op_t op; - -#if 0 - ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); -#else + unsigned long kernel_end; + +#ifdef CONFIG_XEN ROOT_DEV = MKDEV(RAMDISK_MAJOR,0); -#endif drive_info = DRIVE_INFO; - + kernel_end = 0; /* dummy */ #ifdef CONFIG_XEN_PHYSDEV_ACCESS screen_info = SCREEN_INFO; -#endif - edid_info = EDID_INFO; - saved_video_mode = SAVED_VIDEO_MODE; - bootloader_type = LOADER_TYPE; - 
-#ifdef CONFIG_BLK_DEV_RAM - rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK; - rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0); - rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0); -#endif -/* register_console(&xen_console); */ - -#ifdef CONFIG_XEN_PHYSDEV_ACCESS + /* This is drawn from a dump from vgacon:startup in standard Linux. */ screen_info.orig_video_mode = 3; screen_info.orig_video_isVGA = 1; @@ -633,27 +570,56 @@ screen_info.orig_video_cols = 80; screen_info.orig_video_ega_bx = 3; screen_info.orig_video_points = 16; -#endif +#endif + edid_info = EDID_INFO; + saved_video_mode = SAVED_VIDEO_MODE; + bootloader_type = LOADER_TYPE; + +#ifdef CONFIG_BLK_DEV_RAM + rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK; + rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0); + rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0); + + +#endif + + HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_writable_pagetables); + ARCH_SETUP - print_memory_map(machine_specific_memory_setup()); - - /* copy_edd(); */ +#else + ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV); + drive_info = DRIVE_INFO; + screen_info = SCREEN_INFO; + edid_info = EDID_INFO; + saved_video_mode = SAVED_VIDEO_MODE; + bootloader_type = LOADER_TYPE; + +#ifdef CONFIG_BLK_DEV_RAM + rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK; + rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0); + rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0); +#endif + setup_memory_region(); + copy_edd(); +#endif /* !CONFIG_XEN */ if (!MOUNT_ROOT_RDONLY) root_mountflags &= ~MS_RDONLY; init_mm.start_code = (unsigned long) &_text; init_mm.end_code = (unsigned long) &_etext; init_mm.end_data = (unsigned long) &_edata; -/* init_mm.brk = (unsigned long) &_end; */ +#ifdef CONFIG_XEN init_mm.brk = start_pfn << PAGE_SHIFT; - - -#if 0 /* XEN: This is nonsense: kernel may not even be contiguous in RAM. 
*/ +#else + init_mm.brk = (unsigned long) &_end; + code_resource.start = virt_to_phys(&_text); code_resource.end = virt_to_phys(&_etext)-1; data_resource.start = virt_to_phys(&_etext); data_resource.end = virt_to_phys(&_edata)-1; #endif + parse_cmdline_early(cmdline_p); early_identify_cpu(&boot_cpu_data); @@ -662,15 +628,11 @@ * partially used pages are not usable - thus * we are rounding upwards: */ -#if 0 end_pfn = e820_end_of_ram(); -#else - end_pfn = xen_end_of_ram(); -#endif check_efer(); - init_memory_mapping(0, (end_pfn << PAGE_SHIFT)); + init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT)); #ifdef CONFIG_ACPI_NUMA /* @@ -685,24 +647,25 @@ contig_initmem_init(); #endif - /* Reserve direct mapping and shared info etc. */ -// reserve_bootmem_generic(table_start << PAGE_SHIFT, (table_end + 1 - table_start) << PAGE_SHIFT); - -// reserve_bootmem_generic(0, (table_end + 1) << PAGE_SHIFT); +#ifndef CONFIG_XEN + /* Reserve direct mapping */ + reserve_bootmem_generic(table_start << PAGE_SHIFT, + (table_end - table_start) << PAGE_SHIFT); /* reserve kernel */ -// kernel_end = round_up(__pa_symbol(&_end),PAGE_SIZE); - -#if 0 + kernel_end = round_up(__pa_symbol(&_end),PAGE_SIZE); + reserve_bootmem_generic(HIGH_MEMORY, kernel_end - HIGH_MEMORY); + /* * reserve physical page 0 - it's a special BIOS page on many boxes, * enabling clean reboots, SMP operation, laptop functions. 
*/ reserve_bootmem_generic(0, PAGE_SIZE); -#endif /* reserve ebda region */ -/* reserve_ebda_region(); */ + reserve_ebda_region(); +#endif + #ifdef CONFIG_SMP /* @@ -722,6 +685,7 @@ */ acpi_reserve_bootmem(); #endif +#ifdef CONFIG_XEN #ifdef CONFIG_BLK_DEV_INITRD if (xen_start_info.mod_start) { if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) { @@ -738,6 +702,25 @@ } } #endif +#else /* CONFIG_XEN */ +#ifdef CONFIG_BLK_DEV_INITRD + if (LOADER_TYPE && INITRD_START) { + if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) { + reserve_bootmem_generic(INITRD_START, INITRD_SIZE); + initrd_start = + INITRD_START ? INITRD_START + PAGE_OFFSET : 0; + initrd_end = initrd_start+INITRD_SIZE; + } + else { + printk(KERN_ERR "initrd extends beyond end of memory " + "(0x%08lx > 0x%08lx)\ndisabling initrd\n", + (unsigned long)(INITRD_START + INITRD_SIZE), + (unsigned long)(end_pfn << PAGE_SHIFT)); + initrd_start = 0; + } + } +#endif +#endif /* !CONFIG_XEN */ paging_init(); #ifdef CONFIG_X86_LOCAL_APIC /* @@ -745,30 +728,36 @@ */ find_smp_config(); #endif - /* Make sure we have a large enough P->M table. */ - if (end_pfn > xen_start_info.nr_pages) { - phys_to_machine_mapping = alloc_bootmem( - max_pfn * sizeof(unsigned long)); - memset(phys_to_machine_mapping, ~0, - max_pfn * sizeof(unsigned long)); - memcpy(phys_to_machine_mapping, - (unsigned long *)xen_start_info.mfn_list, - xen_start_info.nr_pages * sizeof(unsigned long)); - free_bootmem( - __pa(xen_start_info.mfn_list), - PFN_PHYS(PFN_UP(xen_start_info.nr_pages * - sizeof(unsigned long)))); - } - - pfn_to_mfn_frame_list = alloc_bootmem(PAGE_SIZE); - - for ( i=0, j=0; i < end_pfn; i+=(PAGE_SIZE/sizeof(unsigned long)), j++ ) - { - pfn_to_mfn_frame_list[j] = - virt_to_machine(&phys_to_machine_mapping[i]) >> PAGE_SHIFT; - } - -#if 0 +#ifdef CONFIG_XEN + { + int i, j; + /* Make sure we have a large enough P->M table. 
*/ + if (end_pfn > xen_start_info.nr_pages) { + phys_to_machine_mapping = alloc_bootmem( + max_pfn * sizeof(u32)); + memset(phys_to_machine_mapping, ~0, + max_pfn * sizeof(u32)); + memcpy(phys_to_machine_mapping, + (u32 *)xen_start_info.mfn_list, + xen_start_info.nr_pages * sizeof(u32)); + free_bootmem( + __pa(xen_start_info.mfn_list), + PFN_PHYS(PFN_UP(xen_start_info.nr_pages * + sizeof(u32)))); + } + + pfn_to_mfn_frame_list = alloc_bootmem(PAGE_SIZE); + + for ( i=0, j=0; i < end_pfn; i+=(PAGE_SIZE/sizeof(u32)), j++ ) + { + pfn_to_mfn_frame_list[j] = + virt_to_mfn(&phys_to_machine_mapping[i]); + } + + } +#endif + +#ifndef CONFIG_XEN check_ioapic(); #endif @@ -784,6 +773,7 @@ */ acpi_boot_init(); #endif + #ifdef CONFIG_X86_LOCAL_APIC /* * get boot-time SMP configuration: @@ -795,18 +785,14 @@ #endif #endif - /* XXX Disable irqdebug until we have a way to avoid interrupt - * conflicts. */ -/* noirqdebug_setup(""); */ - -#ifdef CONFIG_XEN_PRIVILEGED_GUEST +#if defined(CONFIG_XEN_PRIVILEGED_GUEST) || !defined(CONFIG_XEN) /* * Request address space for all standard RAM and ROM resources * and also for regions reported as reserved by the e820. 
*/ probe_roms(); -#endif -/* e820_reserve_resources(); */ + e820_reserve_resources(); +#endif request_resource(&iomem_resource, &video_ram_resource); @@ -823,14 +809,40 @@ iommu_hole_init(); #endif - op.cmd = PHYSDEVOP_SET_IOPL; - op.u.set_iopl.iopl = current->thread.io_pl = 1; - HYPERVISOR_physdev_op(&op); - - if (xen_start_info.flags & SIF_INITDOMAIN) { - if (!(xen_start_info.flags & SIF_PRIVILEGED)) - panic("Xen granted us console access " - "but not privileged status"); +#ifdef CONFIG_XEN + { + physdev_op_t op; + + op.cmd = PHYSDEVOP_SET_IOPL; + op.u.set_iopl.iopl = 1; + HYPERVISOR_physdev_op(&op); + + if (xen_start_info.flags & SIF_INITDOMAIN) { + if (!(xen_start_info.flags & SIF_PRIVILEGED)) + panic("Xen granted us console access " + "but not privileged status"); + +#ifdef CONFIG_VT +#if defined(CONFIG_VGA_CONSOLE) + conswitchp = &vga_con; +#elif defined(CONFIG_DUMMY_CONSOLE) + conswitchp = &dummy_con; +#endif +#endif + } else { +#ifdef CONFIG_XEN_PRIVILEGED_GUEST + extern const struct consw xennull_con; + extern int console_use_vt; +#if defined(CONFIG_VGA_CONSOLE) + /* disable VGA driver */ + ORIG_VIDEO_ISVGA = VIDEO_TYPE_VLFB; +#endif + conswitchp = &xennull_con; + console_use_vt = 0; +#endif + } + } +#else /* CONFIG_XEN */ #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) @@ -839,18 +851,8 @@ conswitchp = &dummy_con; #endif #endif - } else { -#ifdef CONFIG_XEN_PRIVILEGED_GUEST - extern const struct consw xennull_con; - extern int console_use_vt; -#if defined(CONFIG_VGA_CONSOLE) - /* disable VGA driver */ - ORIG_VIDEO_ISVGA = VIDEO_TYPE_VLFB; -#endif - conswitchp = &xennull_con; - console_use_vt = 0; -#endif - } + +#endif /* !CONFIG_XEN */ } static int __init get_model_name(struct cpuinfo_x86 *c) diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup64.c --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup64.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/setup64.c Thu Aug 25 22:53:20 2005 @@ 
-30,9 +30,9 @@ #include <asm/proto.h> #include <asm/mman.h> #include <asm/numa.h> - +#ifdef CONFIG_XEN #include <asm-xen/hypervisor.h> - +#endif char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,}; cpumask_t cpu_initialized __initdata = CPU_MASK_NONE; @@ -123,16 +123,61 @@ } } +#ifdef CONFIG_XEN +static void switch_pt(void) +{ + xen_pt_switch(__pa(init_level4_pgt)); + xen_new_user_pt(__pa(init_level4_user_pgt)); +} + +void __init cpu_gdt_init(struct desc_ptr *gdt_descr) +{ + unsigned long frames[16]; + unsigned long va; + int f; + + for (va = gdt_descr->address, f = 0; + va < gdt_descr->address + gdt_descr->size; + va += PAGE_SIZE, f++) { + frames[f] = virt_to_mfn(va); + make_page_readonly((void *)va); + } + if (HYPERVISOR_set_gdt(frames, gdt_descr->size / + sizeof (struct desc_struct))) + BUG(); +} +#else +static void switch_pt(void) +{ + asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt))); +} + +void __init cpu_gdt_init(struct desc_ptr *gdt_descr) +{ +#ifdef CONFIG_SMP + int cpu = stack_smp_processor_id(); +#else + int cpu = smp_processor_id(); +#endif + + asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu])); + asm volatile("lidt %0" :: "m" (idt_descr)); +} +#endif + + void pda_init(int cpu) { - pgd_t *old_level4 = (pgd_t *)xen_start_info.pt_base; struct x8664_pda *pda = &cpu_pda[cpu]; /* Setup up data that may be needed in __get_free_pages early */ asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); +#ifndef CONFIG_XEN + wrmsrl(MSR_GS_BASE, cpu_pda + cpu); +#else HYPERVISOR_set_segment_base(SEGBASE_GS_KERNEL, (unsigned long)(cpu_pda + cpu)); - +#endif pda->me = pda; pda->cpunumber = cpu; pda->irqcount = -1; @@ -140,21 +185,14 @@ (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; pda->active_mm = &init_mm; pda->mmu_state = 0; - pda->kernel_mode = 1; if (cpu == 0) { - memcpy((void *)init_level4_pgt, - (void *) xen_start_info.pt_base, PAGE_SIZE); +#ifdef CONFIG_XEN + xen_init_pt(); +#endif /* others are initialized in 
smpboot.c */ pda->pcurrent = &init_task; pda->irqstackptr = boot_cpu_stack; - make_page_readonly(init_level4_pgt); - make_page_readonly(init_level4_user_pgt); - make_page_readonly(level3_user_pgt); /* for vsyscall stuff */ - xen_pgd_pin(__pa_symbol(init_level4_user_pgt)); - xen_pud_pin(__pa_symbol(level3_user_pgt)); - set_pgd((pgd_t *)(init_level4_user_pgt + 511), - mk_kernel_pgd(__pa_symbol(level3_user_pgt))); } else { pda->irqstackptr = (char *) __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); @@ -162,20 +200,7 @@ panic("cannot allocate irqstack for cpu %d", cpu); } - xen_pt_switch(__pa(init_level4_pgt)); - xen_new_user_pt(__pa(init_level4_user_pgt)); - - if (cpu == 0) { - xen_pgd_unpin(__pa(old_level4)); -#if 0 - early_printk("__pa: %x, <machine_phys> old_level 4 %x\n", - __pa(xen_start_info.pt_base), - pfn_to_mfn(__pa(old_level4) >> PAGE_SHIFT)); -#endif -// make_page_writable(old_level4); -// free_bootmem(__pa(old_level4), PAGE_SIZE); - } - + switch_pt(); pda->irqstackptr += IRQSTACKSIZE-64; } @@ -185,6 +210,18 @@ /* May not be marked __init: used by software suspend */ void syscall_init(void) { +#ifndef CONFIG_XEN + /* + * LSTAR and STAR live in a bit strange symbiosis. + * They both write to the same internal register. STAR allows to set CS/DS + * but only a 32bit target. LSTAR sets the 64bit rip. 
+ */ + wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); + wrmsrl(MSR_LSTAR, system_call); + + /* Flags to clear on syscall */ + wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); +#endif #ifdef CONFIG_IA32_EMULATION syscall32_cpu_init (); #endif @@ -197,27 +234,8 @@ rdmsrl(MSR_EFER, efer); if (!(efer & EFER_NX) || do_not_nx) { __supported_pte_mask &= ~_PAGE_NX; - } } - -void __init cpu_gdt_init(struct desc_ptr *gdt_descr) -{ - unsigned long frames[16]; - unsigned long va; - int f; - - for (va = gdt_descr->address, f = 0; - va < gdt_descr->address + gdt_descr->size; - va += PAGE_SIZE, f++) { - frames[f] = virt_to_machine(va) >> PAGE_SHIFT; - make_page_readonly((void *)va); - } - if (HYPERVISOR_set_gdt(frames, gdt_descr->size / - sizeof (struct desc_struct))) - BUG(); -} - /* * cpu_init() initializes state that is per-CPU. Some data is already @@ -247,36 +265,32 @@ me = current; - if (test_and_set_bit(cpu, &cpu_initialized)) + if (cpu_test_and_set(cpu, cpu_initialized)) panic("CPU#%d already initialized!\n", cpu); printk("Initializing CPU#%d\n", cpu); -#if 0 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); -#endif + /* * Initialize the per-CPU GDT with the boot GDT, * and set up the GDT descriptor: */ +#ifndef CONFIG_XEN if (cpu) { memcpy(cpu_gdt_table[cpu], cpu_gdt_table[0], GDT_SIZE); } cpu_gdt_descr[cpu].size = GDT_SIZE; cpu_gdt_descr[cpu].address = (unsigned long)cpu_gdt_table[cpu]; -#if 0 - asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu])); - asm volatile("lidt %0" :: "m" (idt_descr)); -#endif - cpu_gdt_init(&cpu_gdt_descr[cpu]); - -#if 0 + memcpy(me->thread.tls_array, cpu_gdt_table[cpu], GDT_ENTRY_TLS_ENTRIES * 8); - -#endif - memcpy(me->thread.tls_array, &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN], +#else + memcpy(me->thread.tls_array, &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN], GDT_ENTRY_TLS_ENTRIES * 8); + + cpu_gdt_init(&cpu_gdt_descr[cpu]); +#endif /* * Delete NT @@ -284,12 +298,12 @@ asm volatile("pushfq ; popq %%rax ; btr 
$14,%%rax ; pushq %%rax ; popfq" ::: "eax"); - if (cpu == 0) - early_identify_cpu(&boot_cpu_data); - syscall_init(); + wrmsrl(MSR_FS_BASE, 0); + wrmsrl(MSR_KERNEL_GS_BASE, 0); barrier(); + check_efer(); /* @@ -321,19 +335,22 @@ BUG(); enter_lazy_tlb(&init_mm, me); +#ifndef CONFIG_XEN + set_tss_desc(cpu, t); + load_TR_desc(); +#endif load_LDT(&init_mm.context); /* * Clear all 6 debug registers: */ -#define CD(register) HYPERVISOR_set_debugreg(register, 0) - - CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7); - -#undef CD + + set_debug(0UL, 0); + set_debug(0UL, 1); + set_debug(0UL, 2); + set_debug(0UL, 3); + set_debug(0UL, 6); + set_debug(0UL, 7); + fpu_init(); - -#ifdef CONFIG_NUMA - numa_add_cpu(cpu); -#endif -} +} diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smp.c --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smp.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smp.c Thu Aug 25 22:53:20 2005 @@ -28,7 +28,12 @@ #include <asm/mmu_context.h> #include <asm/proto.h> #include <asm/apicdef.h> - +#ifdef CONFIG_XEN +#include <asm-xen/evtchn.h> + +#define xxprint(msg) HYPERVISOR_console_io(CONSOLEIO_write, strlen(msg), msg) + +#else /* * Smarter SMP flushing macros. * c/o Linus Torvalds. 
@@ -44,6 +49,7 @@ static unsigned long flush_va; static DEFINE_SPINLOCK(tlbstate_lock); #define FLUSH_ALL -1ULL +#endif /* * We cannot call mmdrop() because we are in interrupt context, @@ -57,6 +63,7 @@ load_cr3(swapper_pg_dir); } +#ifndef CONFIG_XEN /* * * The flush IPI assumes that a thread switch happens in this order: @@ -250,6 +257,18 @@ { on_each_cpu(do_flush_tlb_all, NULL, 1, 1); } +#else +asmlinkage void smp_invalidate_interrupt (void) +{ return; } +void flush_tlb_current_task(void) +{ xen_tlb_flush_mask(¤t->mm->cpu_vm_mask); } +void flush_tlb_mm (struct mm_struct * mm) +{ xen_tlb_flush_mask(&mm->cpu_vm_mask); } +void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) +{ xen_invlpg_mask(&vma->vm_mm->cpu_vm_mask, va); } +void flush_tlb_all(void) +{ xen_tlb_flush_all(); } +#endif /* Xen */ void smp_kdb_stop(void) { @@ -310,13 +329,21 @@ /* Wait for response */ while (atomic_read(&data.started) != cpus) +#ifndef CONFIG_XEN cpu_relax(); +#else + barrier(); +#endif if (!wait) return; while (atomic_read(&data.finished) != cpus) +#ifndef CONFIG_XEN cpu_relax(); +#else + barrier(); +#endif } /* @@ -350,7 +377,11 @@ */ cpu_clear(smp_processor_id(), cpu_online_map); local_irq_disable(); +#ifndef CONFIG_XEN disable_local_APIC(); +#else + xxprint("stop_this_cpu disable_local_APIC\n"); +#endif local_irq_enable(); } @@ -364,8 +395,10 @@ void smp_send_stop(void) { int nolock = 0; +#ifndef CONFIG_XEN if (reboot_force) return; +#endif /* Don't deadlock on the call lock in panic */ if (!spin_trylock(&call_lock)) { /* ignore locking because we have paniced anyways */ @@ -376,7 +409,11 @@ spin_unlock(&call_lock); local_irq_disable(); +#ifdef CONFIG_XEN + xxprint("stop_this_cpu disable_local_APIC\n"); +#else disable_local_APIC(); +#endif local_irq_enable(); } @@ -385,18 +422,32 @@ * all the work is done automatically when * we return from the interrupt. 
*/ +#ifndef CONFIG_XEN asmlinkage void smp_reschedule_interrupt(void) -{ +#else +asmlinkage irqreturn_t smp_reschedule_interrupt(void) +#endif +{ +#ifndef CONFIG_XEN ack_APIC_irq(); -} - +#else + return IRQ_HANDLED; +#endif +} + +#ifndef CONFIG_XEN asmlinkage void smp_call_function_interrupt(void) +#else +asmlinkage irqreturn_t smp_call_function_interrupt(void) +#endif { void (*func) (void *info) = call_data->func; void *info = call_data->info; int wait = call_data->wait; +#ifndef CONFIG_XEN ack_APIC_irq(); +#endif /* * Notify initiating CPU that I've grabbed the data and am * about to execute the function @@ -413,10 +464,16 @@ mb(); atomic_inc(&call_data->finished); } +#ifdef CONFIG_XEN + return IRQ_HANDLED; +#endif } int safe_smp_processor_id(void) { +#ifdef CONFIG_XEN + return smp_processor_id(); +#else int apicid, i; if (disable_apic) @@ -437,4 +494,5 @@ return 0; return 0; /* Should not happen */ -} +#endif +} diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smpboot.c --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smpboot.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smpboot.c Thu Aug 25 22:53:20 2005 @@ -47,6 +47,9 @@ #include <linux/bootmem.h> #include <linux/thread_info.h> #include <linux/module.h> +#ifdef CONFIG_XEN +#include <linux/interrupt.h> +#endif #include <linux/delay.h> #include <linux/mc146818rtc.h> @@ -57,11 +60,20 @@ #include <asm/tlbflush.h> #include <asm/proto.h> #include <asm/nmi.h> +#ifdef CONFIG_XEN +#include <asm/arch_hooks.h> + +#include <asm-xen/evtchn.h> +#endif /* Change for real CPU hotplug. Note other files need to be fixed first too. 
*/ #define __cpuinit __init #define __cpuinitdata __initdata + +#if defined(CONFIG_XEN) && !defined(CONFIG_XEN_PRIVILEGED_GUEST) + unsigned int maxcpus = NR_CPUS; +#endif /* Number of siblings per CPU package */ int smp_num_siblings = 1; @@ -96,6 +108,7 @@ cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned; EXPORT_SYMBOL(cpu_core_map); +#ifndef CONFIG_XEN /* * Trampoline 80x86 program as an array. */ @@ -115,6 +128,7 @@ memcpy(tramp, trampoline_data, trampoline_end - trampoline_data); return virt_to_phys(tramp); } +#endif /* * The bootstrap kernel entry code has set these up. Save them for @@ -130,6 +144,7 @@ print_cpu_info(c); } +#ifndef CONFIG_XEN /* * New Funky TSC sync algorithm borrowed from IA64. * Main advantage is that it doesn't reset the TSCs fully and @@ -331,6 +346,7 @@ return 0; } __setup("notscsync", notscsync_setup); +#endif static atomic_t init_deasserted __cpuinitdata; @@ -343,6 +359,7 @@ int cpuid, phys_id; unsigned long timeout; +#ifndef CONFIG_XEN /* * If waken up by an INIT in an 82489DX configuration * we may get here before an INIT-deassert IPI reaches @@ -352,10 +369,15 @@ while (!atomic_read(&init_deasserted)) cpu_relax(); +#endif /* * (This works even if the APIC is not enabled.) */ +#ifndef CONFIG_XEN phys_id = GET_APIC_ID(apic_read(APIC_ID)); +#else + phys_id = smp_processor_id(); +#endif cpuid = smp_processor_id(); if (cpu_isset(cpuid, cpu_callin_map)) { panic("smp_callin: phys CPU#%d, CPU#%d already present??\n", @@ -389,6 +411,7 @@ cpuid); } +#ifndef CONFIG_XEN /* * the boot CPU has finished the init stage and is spinning * on callin_map until we finish. We are free to set up this @@ -398,6 +421,7 @@ Dprintk("CALLIN, before setup_local_APIC().\n"); setup_local_APIC(); +#endif /* * Get our bogomips. 
@@ -405,7 +429,9 @@ calibrate_delay(); Dprintk("Stack at about %p\n",&cpuid); +#ifndef CONFIG_XEN disable_APIC_timer(); +#endif /* * Save our processor parameters @@ -417,6 +443,29 @@ */ cpu_set(cpuid, cpu_callin_map); } + +#ifdef CONFIG_XEN +static irqreturn_t ldebug_interrupt( + int irq, void *dev_id, struct pt_regs *regs) +{ + return IRQ_HANDLED; +} + +static DEFINE_PER_CPU(int, ldebug_irq); +static char ldebug_name[NR_CPUS][15]; + +void ldebug_setup(void) +{ + int cpu = smp_processor_id(); + + per_cpu(ldebug_irq, cpu) = bind_virq_to_irq(VIRQ_DEBUG); + sprintf(ldebug_name[cpu], "ldebug%d", cpu); + BUG_ON(request_irq(per_cpu(ldebug_irq, cpu), ldebug_interrupt, + SA_INTERRUPT, ldebug_name[cpu], NULL)); +} + +extern void local_setup_timer(void); +#endif /* * Setup code on secondary processor (after comming out of the trampoline) @@ -434,6 +483,7 @@ /* otherwise gcc will move up the smp_processor_id before the cpu_init */ barrier(); +#ifndef CONFIG_XEN Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); setup_secondary_APIC_clock(); @@ -446,6 +496,12 @@ } enable_APIC_timer(); +#else + local_setup_timer(); + ldebug_setup(); + smp_intr_init(); + local_irq_enable(); +#endif /* * Allow the master to continue. @@ -453,10 +509,12 @@ cpu_set(smp_processor_id(), cpu_online_map); mb(); +#ifndef CONFIG_XEN /* Wait for TSC sync to not schedule things before. We still process interrupts, which could see an inconsistent time in that window unfortunately. */ tsc_sync_wait(); +#endif cpu_idle(); } @@ -464,6 +522,7 @@ extern volatile unsigned long init_rsp; extern void (*initial_code)(void); +#ifndef CONFIG_XEN #if APIC_DEBUG static void inquire_remote_apic(int apicid) { @@ -627,6 +686,7 @@ return (send_status | accept_status); } +#endif /* * Boot one CPU. 
@@ -637,6 +697,14 @@ unsigned long boot_error; int timeout; unsigned long start_rip; +#ifdef CONFIG_XEN + vcpu_guest_context_t ctxt; + extern void startup_64_smp(void); + extern void hypervisor_callback(void); + extern void failsafe_callback(void); + extern void smp_trap_init(trap_info_t *); + int i; +#endif /* * We can't use kernel_thread since we must avoid to * reschedule the child. @@ -649,7 +717,11 @@ cpu_pda[cpu].pcurrent = idle; +#ifndef CONFIG_XEN start_rip = setup_trampoline(); +#else + start_rip = (unsigned long)startup_64_smp; +#endif init_rsp = idle->thread.rsp; per_cpu(init_tss,cpu).rsp0 = init_rsp; @@ -666,6 +738,95 @@ atomic_set(&init_deasserted, 0); +#ifdef CONFIG_XEN + cpu_gdt_descr[cpu].address = __get_free_page(GFP_KERNEL); + BUG_ON(cpu_gdt_descr[0].size > PAGE_SIZE); + cpu_gdt_descr[cpu].size = cpu_gdt_descr[0].size; + memcpy((void *)cpu_gdt_descr[cpu].address, + (void *)cpu_gdt_descr[0].address, cpu_gdt_descr[0].size); + + memset(&ctxt, 0, sizeof(ctxt)); + + ctxt.flags = VGCF_IN_KERNEL; + ctxt.user_regs.ds = __USER_DS; + ctxt.user_regs.es = __USER_DS; + ctxt.user_regs.fs = 0; + ctxt.user_regs.gs = 0; + ctxt.user_regs.ss = __KERNEL_DS|0x3; + ctxt.user_regs.cs = __KERNEL_CS|0x3; + ctxt.user_regs.rip = start_rip; + ctxt.user_regs.rsp = idle->thread.rsp; +#define X86_EFLAGS_IOPL_RING3 0x3000 + ctxt.user_regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_IOPL_RING3; + + /* FPU is set up to default initial state. */ + memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt)); + + /* Virtual IDT is empty at start-of-day. */ + for ( i = 0; i < 256; i++ ) + { + ctxt.trap_ctxt[i].vector = i; + ctxt.trap_ctxt[i].cs = FLAT_KERNEL_CS; + } + smp_trap_init(ctxt.trap_ctxt); + + /* No LDT. 
*/ + ctxt.ldt_ents = 0; + + { + unsigned long va; + int f; + + for (va = cpu_gdt_descr[cpu].address, f = 0; + va < cpu_gdt_descr[cpu].address + cpu_gdt_descr[cpu].size; + va += PAGE_SIZE, f++) { + ctxt.gdt_frames[f] = virt_to_mfn(va); + make_page_readonly((void *)va); + } + ctxt.gdt_ents = GDT_ENTRIES; + } + + /* Ring 1 stack is the initial stack. */ + ctxt.kernel_ss = __KERNEL_DS; + ctxt.kernel_sp = idle->thread.rsp; + + /* Callback handlers. */ + ctxt.event_callback_eip = (unsigned long)hypervisor_callback; + ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; + ctxt.syscall_callback_eip = (unsigned long)system_call; + + ctxt.ctrlreg[3] = virt_to_mfn(init_level4_pgt) << PAGE_SHIFT; + + boot_error = HYPERVISOR_boot_vcpu(cpu, &ctxt); + if (boot_error) + printk("boot error: %ld\n", boot_error); + + if (!boot_error) { + /* + * allow APs to start initializing. + */ + Dprintk("Before Callout %d.\n", cpu); + cpu_set(cpu, cpu_callout_map); + Dprintk("After Callout %d.\n", cpu); + + /* + * Wait 5s total for a response + */ + for (timeout = 0; timeout < 50000; timeout++) { + if (cpu_isset(cpu, cpu_callin_map)) + break; /* It has booted */ + udelay(100); + } + + if (cpu_isset(cpu, cpu_callin_map)) { + /* number CPUs logically, starting from 1 (BSP is 0) */ + Dprintk("CPU has booted.\n"); + } else { + boot_error= 1; + } + } + x86_cpu_to_apicid[cpu] = apicid; +#else Dprintk("Setting warm reset code and vector.\n"); CMOS_WRITE(0xa, 0xf); @@ -729,6 +890,7 @@ #endif } } +#endif if (boot_error) { cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ @@ -790,6 +952,7 @@ } } +#ifndef CONFIG_XEN /* * Cleanup possible dangling ends... */ @@ -817,6 +980,7 @@ free_page((unsigned long) __va(SMP_TRAMPOLINE_BASE)); #endif } +#endif /* * Fall back to non SMP mode after errors. 
@@ -827,10 +991,12 @@ { cpu_present_map = cpumask_of_cpu(0); cpu_possible_map = cpumask_of_cpu(0); +#ifndef CONFIG_XEN if (smp_found_config) phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id); else phys_cpu_present_map = physid_mask_of_physid(0); +#endif cpu_set(0, cpu_sibling_map[0]); cpu_set(0, cpu_core_map[0]); } @@ -857,6 +1023,7 @@ */ static int __cpuinit smp_sanity_check(unsigned max_cpus) { +#ifndef CONFIG_XEN if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { printk("weird, boot CPU (#%d) not listed by the BIOS.\n", hard_smp_processor_id()); @@ -896,13 +1063,19 @@ nr_ioapics = 0; return -1; } +#endif /* * If SMP should be disabled, then really disable it! */ if (!max_cpus) { +#ifdef CONFIG_XEN + HYPERVISOR_shared_info->n_vcpu = 1; +#endif printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n"); +#ifndef CONFIG_XEN nr_ioapics = 0; +#endif return -1; } @@ -917,7 +1090,10 @@ { int i; +#if defined(CONFIG_XEN) && !defined(CONFIG_XEN_PRIVILEGED_GUEST) +#else nmi_watchdog_default(); +#endif current_cpu_data = boot_cpu_data; current_thread_info()->cpu = 0; /* needed? */ @@ -927,8 +1103,12 @@ * Fill in cpu_present_mask */ for (i = 0; i < NR_CPUS; i++) { +#ifndef CONFIG_XEN int apicid = cpu_present_to_apicid(i); if (physid_isset(apicid, phys_cpu_present_map)) { +#else + if (i < HYPERVISOR_shared_info->n_vcpu) { +#endif cpu_set(i, cpu_present_map); /* possible map would be different if we supported real CPU hotplug. */ @@ -942,6 +1122,9 @@ return; } +#ifdef CONFIG_XEN + smp_intr_init(); +#else /* * Switch from PIC to APIC mode. @@ -954,20 +1137,26 @@ GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id); /* Or can we switch back to PIC here? */ } +#endif /* * Now start the IO-APICs */ +#if defined(CONFIG_XEN) && !defined(CONFIG_XEN_PRIVILEGED_GUEST) +#else if (!skip_ioapic_setup && nr_ioapics) setup_IO_APIC(); else nr_ioapics = 0; +#endif /* * Set up local APIC timer on boot CPU. 
*/ +#ifndef CONFIG_XEN setup_boot_APIC_clock(); +#endif } /* @@ -989,17 +1178,23 @@ int __cpuinit __cpu_up(unsigned int cpu) { int err; +#ifndef CONFIG_XEN int apicid = cpu_present_to_apicid(cpu); +#else + int apicid = cpu; +#endif WARN_ON(irqs_disabled()); Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu); +#ifndef CONFIG_XEN if (apicid == BAD_APICID || apicid == boot_cpu_id || !physid_isset(apicid, phys_cpu_present_map)) { printk("__cpu_up: bad cpu %d\n", cpu); return -EINVAL; } +#endif /* Boot it! */ err = do_boot_cpu(cpu, apicid); @@ -1021,15 +1216,82 @@ */ void __cpuinit smp_cpus_done(unsigned int max_cpus) { +#ifndef CONFIG_XEN zap_low_mappings(); smp_cleanup_boot(); #ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); #endif +#endif detect_siblings(); +#ifndef CONFIG_XEN time_init_gtod(); check_nmi_watchdog(); -} +#endif +} + +#ifdef CONFIG_XEN +extern int bind_ipi_to_irq(int ipi); +extern irqreturn_t smp_reschedule_interrupt(int, void *, struct pt_regs *); +extern irqreturn_t smp_call_function_interrupt(int, void *, struct pt_regs *); + +static DEFINE_PER_CPU(int, resched_irq); +static DEFINE_PER_CPU(int, callfunc_irq); +static char resched_name[NR_CPUS][15]; +static char callfunc_name[NR_CPUS][15]; + +void smp_intr_init(void) +{ + int cpu = smp_processor_id(); + + per_cpu(resched_irq, cpu) = + bind_ipi_to_irq(RESCHEDULE_VECTOR); + sprintf(resched_name[cpu], "resched%d", cpu); + BUG_ON(request_irq(per_cpu(resched_irq, cpu), smp_reschedule_interrupt, + SA_INTERRUPT, resched_name[cpu], NULL)); + + per_cpu(callfunc_irq, cpu) = + bind_ipi_to_irq(CALL_FUNCTION_VECTOR); + sprintf(callfunc_name[cpu], "callfunc%d", cpu); + BUG_ON(request_irq(per_cpu(callfunc_irq, cpu), + smp_call_function_interrupt, + SA_INTERRUPT, callfunc_name[cpu], NULL)); +} + +static void smp_intr_exit(void) +{ + int cpu = smp_processor_id(); + + free_irq(per_cpu(resched_irq, cpu), NULL); + unbind_ipi_from_irq(RESCHEDULE_VECTOR); + + free_irq(per_cpu(callfunc_irq, cpu), NULL); + 
unbind_ipi_from_irq(CALL_FUNCTION_VECTOR); +} + +extern void local_setup_timer_irq(void); +extern void local_teardown_timer_irq(void); + +void smp_suspend(void) +{ + /* XXX todo: take down time and ipi's on all cpus */ + local_teardown_timer_irq(); + smp_intr_exit(); +} + +void smp_resume(void) +{ + /* XXX todo: restore time and ipi's on all cpus */ + smp_intr_init(); + local_setup_timer_irq(); +} + +void _restore_vcpu(void) +{ + /* XXX need to write this */ +} + +#endif diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/traps.c --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/traps.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/traps.c Thu Aug 25 22:53:20 2005 @@ -953,6 +953,17 @@ cpu_init(); } +void smp_trap_init(trap_info_t *trap_ctxt) +{ + trap_info_t *t = trap_table; + + for (t = trap_table; t->address; t++) { + trap_ctxt[t->vector].flags = t->flags; + trap_ctxt[t->vector].cs = t->cs; + trap_ctxt[t->vector].address = t->address; + } +} + /* Actual parsing is done early in setup.c. 
*/ static int __init oops_dummy(char *s) diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/vsyscall.c --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/vsyscall.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/vsyscall.c Thu Aug 25 22:53:20 2005 @@ -210,15 +210,16 @@ __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); } -extern void __set_fixmap_user (enum fixed_addresses, unsigned long, pgprot_t); - +#ifdef CONFIG_XEN static void __init map_vsyscall_user(void) { + extern void __set_fixmap_user(enum fixed_addresses, unsigned long, pgprot_t); extern char __vsyscall_0; unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); __set_fixmap_user(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); } +#endif static int __init vsyscall_init(void) { @@ -227,7 +228,10 @@ BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); map_vsyscall(); - map_vsyscall_user(); /* establish tranlation for user address space */ +#ifdef CONFIG_XEN + map_vsyscall_user(); + sysctl_vsyscall = 0; /* disable vgettimeofay() */ +#endif #ifdef CONFIG_SYSCTL register_sysctl_table(kernel_root_table2, 0); #endif diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/xen_entry.S --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/xen_entry.S Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/xen_entry.S Thu Aug 25 22:53:20 2005 @@ -8,11 +8,14 @@ #define sizeof_vcpu_shift 3 #ifdef CONFIG_SMP -#define preempt_disable(reg) incl threadinfo_preempt_count(reg) -#define preempt_enable(reg) decl threadinfo_preempt_count(reg) +//#define preempt_disable(reg) incl threadinfo_preempt_count(reg) +//#define preempt_enable(reg) decl threadinfo_preempt_count(reg) +#define preempt_disable(reg) +#define preempt_enable(reg) #define XEN_GET_VCPU_INFO(reg) preempt_disable(%rbp) ; \ movq %gs:pda_cpunumber,reg ; \ - shl 
$sizeof_vcpu_shift,reg ; \ + shl $32, reg ; \ + shr $32-sizeof_vcpu_shift,reg ; \ addq HYPERVISOR_shared_info,reg #define XEN_PUT_VCPU_INFO(reg) preempt_enable(%rbp) ; \ #define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/mm/Makefile --- a/linux-2.6-xen-sparse/arch/xen/x86_64/mm/Makefile Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/mm/Makefile Thu Aug 25 22:53:20 2005 @@ -6,10 +6,10 @@ CFLAGS += -Iarch/$(XENARCH)/mm -obj-y := init.o fault.o ioremap.o pageattr.o +obj-y := init.o fault.o pageattr.o c-obj-y := extable.o -i386-obj-y := hypervisor.o +i386-obj-y := hypervisor.o ioremap.o #obj-y := init.o fault.o ioremap.o extable.o pageattr.o #c-obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/mm/fault.c --- a/linux-2.6-xen-sparse/arch/xen/x86_64/mm/fault.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/mm/fault.c Thu Aug 25 22:53:20 2005 @@ -250,7 +250,11 @@ happen within a race in page table update. In the later case just flush. */ - pgd = pgd_offset(current->mm ?: &init_mm, address); + /* On Xen the line below does not always work. Needs investigating! */ + /*pgd = pgd_offset(current->mm ?: &init_mm, address);*/ + pgd = (pgd_t *)per_cpu(cur_pgd, smp_processor_id()); + pgd += pgd_index(address); + pgd_ref = pgd_offset_k(address); if (pgd_none(*pgd_ref)) return -1; diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c --- a/linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c Thu Aug 25 22:53:20 2005 @@ -40,12 +40,14 @@ #include <asm/proto.h> #include <asm/smp.h> +extern unsigned long *contiguous_bitmap; + +#if defined(CONFIG_SWIOTLB) +extern void swiotlb_init(void); +#endif + #ifndef Dprintk #define Dprintk(x...) 
-#endif - -#ifdef CONFIG_GART_IOMMU -extern int swiotlb; #endif extern char _stext[]; @@ -280,7 +282,7 @@ if (!pte_none(*pte) && pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask)) pte_ERROR(*pte); - xen_l1_entry_update(pte, new_pte); + set_pte(pte, new_pte); /* * It's enough to flush this one mapping. @@ -439,6 +441,31 @@ *dst = val; } +static inline int make_readonly(unsigned long paddr) +{ + int readonly = 0; + + /* Make new page tables read-only. */ + if ((paddr < ((table_start << PAGE_SHIFT) + tables_space)) && + (paddr >= (table_start << PAGE_SHIFT))) + readonly = 1; + + /* Make old page tables read-only. */ + if ((paddr < ((xen_start_info.pt_base - __START_KERNEL_map) + + (xen_start_info.nr_pt_frames << PAGE_SHIFT))) && + (paddr >= (xen_start_info.pt_base - __START_KERNEL_map))) + readonly = 1; + + /* + * No need for writable mapping of kernel image. This also ensures that + * page and descriptor tables embedded inside don't have writable mappings. + */ + if ((paddr >= __pa_symbol(&_text)) && (paddr < __pa_symbol(&_end))) + readonly = 1; + + return readonly; +} + void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end) { long i, j, k; @@ -475,9 +502,7 @@ pte = alloc_low_page(&pte_phys); pte_save = pte; for (k = 0; k < PTRS_PER_PTE; pte++, k++, paddr += PTE_SIZE) { - if (paddr < (table_start << PAGE_SHIFT) - + tables_space) - { + if (make_readonly(paddr)) { __set_pte(pte, __pte(paddr | (_KERNPG_TABLE & ~_PAGE_RW))); continue; @@ -511,75 +536,106 @@ round_up(ptes * 8, PAGE_SIZE); } +void __init xen_init_pt(void) +{ + unsigned long addr, *page; + int i; + + for (i = 0; i < NR_CPUS; i++) + per_cpu(cur_pgd, i) = init_mm.pgd; + + memset((void *)init_level4_pgt, 0, PAGE_SIZE); + memset((void *)level3_kernel_pgt, 0, PAGE_SIZE); + memset((void *)level2_kernel_pgt, 0, PAGE_SIZE); + + /* Find the initial pte page that was built for us. 
*/ + page = (unsigned long *)xen_start_info.pt_base; + addr = page[pgd_index(__START_KERNEL_map)]; + addr_to_page(addr, page); + addr = page[pud_index(__START_KERNEL_map)]; + addr_to_page(addr, page); + + /* Construct mapping of initial pte page in our own directories. */ + init_level4_pgt[pgd_index(__START_KERNEL_map)] = + mk_kernel_pgd(__pa_symbol(level3_kernel_pgt)); + level3_kernel_pgt[pud_index(__START_KERNEL_map)] = + __pud(__pa_symbol(level2_kernel_pgt) | + _KERNPG_TABLE | _PAGE_USER); + memcpy((void *)level2_kernel_pgt, page, PAGE_SIZE); + + make_page_readonly(init_level4_pgt); + make_page_readonly(init_level4_user_pgt); + make_page_readonly(level3_kernel_pgt); + make_page_readonly(level3_user_pgt); + make_page_readonly(level2_kernel_pgt); + + xen_pgd_pin(__pa_symbol(init_level4_pgt)); + xen_pgd_pin(__pa_symbol(init_level4_user_pgt)); + xen_pud_pin(__pa_symbol(level3_kernel_pgt)); + xen_pud_pin(__pa_symbol(level3_user_pgt)); + xen_pmd_pin(__pa_symbol(level2_kernel_pgt)); + + set_pgd((pgd_t *)(init_level4_user_pgt + 511), + mk_kernel_pgd(__pa_symbol(level3_user_pgt))); +} /* * Extend kernel mapping to access pages for page tables. The initial * mapping done by Xen is minimal (e.g. 8MB) and we need to extend the * mapping for early initialization. 
*/ - -#define MIN_INIT_SIZE 0x800000 static unsigned long current_size, extended_size; void __init extend_init_mapping(void) { unsigned long va = __START_KERNEL_map; - unsigned long addr, *pte_page; - - unsigned long phys; + unsigned long phys, addr, *pte_page; pmd_t *pmd; pte_t *pte, new_pte; unsigned long *page = (unsigned long *) init_level4_pgt; int i; - addr = (unsigned long) page[pgd_index(va)]; + addr = page[pgd_index(va)]; addr_to_page(addr, page); - addr = page[pud_index(va)]; addr_to_page(addr, page); for (;;) { + pmd = (pmd_t *)&page[pmd_index(va)]; + if (!pmd_present(*pmd)) + break; + addr = page[pmd_index(va)]; + addr_to_page(addr, pte_page); + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = (pte_t *) &pte_page[pte_index(va)]; + if (!pte_present(*pte)) + break; + va += PAGE_SIZE; + current_size += PAGE_SIZE; + } + } + + while (va < __START_KERNEL_map + current_size + tables_space) { pmd = (pmd_t *) &page[pmd_index(va)]; - if (pmd_present(*pmd)) { - /* - * if pmd is valid, check pte. 
- */ - addr = page[pmd_index(va)]; - addr_to_page(addr, pte_page); - - for (i = 0; i < PTRS_PER_PTE; i++) { - pte = (pte_t *) &pte_page[pte_index(va)]; - - if (pte_present(*pte)) { - va += PAGE_SIZE; - current_size += PAGE_SIZE; - } else - break; - } - - } else - break; - } - - for (; va < __START_KERNEL_map + current_size + tables_space; ) { - pmd = (pmd_t *) &page[pmd_index(va)]; - - if (pmd_none(*pmd)) { - pte_page = (unsigned long *) alloc_static_page(&phys); - make_page_readonly(pte_page); - xen_pte_pin(phys); - set_pmd(pmd, __pmd(phys | _KERNPG_TABLE | _PAGE_USER)); - - for (i = 0; i < PTRS_PER_PTE; i++, va += PAGE_SIZE) { - new_pte = pfn_pte((va - __START_KERNEL_map) >> PAGE_SHIFT, - __pgprot(_KERNPG_TABLE | _PAGE_USER)); - - pte = (pte_t *) &pte_page[pte_index(va)]; - xen_l1_entry_update(pte, new_pte); - extended_size += PAGE_SIZE; - } - } - } + if (!pmd_none(*pmd)) + continue; + pte_page = (unsigned long *) alloc_static_page(&phys); + make_page_readonly(pte_page); + xen_pte_pin(phys); + set_pmd(pmd, __pmd(phys | _KERNPG_TABLE | _PAGE_USER)); + for (i = 0; i < PTRS_PER_PTE; i++, va += PAGE_SIZE) { + new_pte = pfn_pte( + (va - __START_KERNEL_map) >> PAGE_SHIFT, + __pgprot(_KERNPG_TABLE | _PAGE_USER)); + pte = (pte_t *)&pte_page[pte_index(va)]; + xen_l1_entry_update(pte, new_pte); + extended_size += PAGE_SIZE; + } + } + + /* Kill mapping of low 1MB. 
*/ + for (va = __START_KERNEL_map; va < (unsigned long)&_text; va += PAGE_SIZE) + HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0); } @@ -620,10 +676,6 @@ start_pfn = ((current_size + extended_size) >> PAGE_SHIFT); - /* - * TBD: Need to calculate at runtime - */ - __flush_tlb_all(); init_mapping_done = 1; } @@ -670,7 +722,7 @@ set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE); else __set_fixmap(FIX_ISAMAP_BEGIN - i, - virt_to_machine(empty_zero_page), + virt_to_mfn(empty_zero_page) << PAGE_SHIFT, PAGE_KERNEL_RO); } #endif @@ -720,8 +772,6 @@ return 1; } -extern int swiotlb_force; - static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, kcore_vsyscall; @@ -730,14 +780,13 @@ int codesize, reservedpages, datasize, initsize; int tmp; -#ifdef CONFIG_SWIOTLB - if (swiotlb_force) - swiotlb = 1; - if (!iommu_aperture && - (end_pfn >= 0xffffffff>>PAGE_SHIFT || force_iommu)) - swiotlb = 1; - if (swiotlb) - swiotlb_init(); + contiguous_bitmap = alloc_bootmem_low_pages( + (end_pfn + 2*BITS_PER_LONG) >> 3); + BUG_ON(!contiguous_bitmap); + memset(contiguous_bitmap, 0, (end_pfn + 2*BITS_PER_LONG) >> 3); + +#if defined(CONFIG_SWIOTLB) + swiotlb_init(); #endif /* How many end-of-memory variables you have, grandma! 
*/ diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/pci/Makefile --- a/linux-2.6-xen-sparse/arch/xen/x86_64/pci/Makefile Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/pci/Makefile Thu Aug 25 22:53:20 2005 @@ -30,8 +30,9 @@ $(patsubst %.o,$(obj)/%.c,$(c-i386-obj-y)): @ln -fsn $(srctree)/arch/i386/pci/$(notdir $@) $@ -obj-y += $(c-i386-obj-y) $(c-obj-y) -obj-y += $(c-xen-obj-y) +# Make sure irq.o gets linked in before common.o +obj-y += $(patsubst common.o,$(c-xen-obj-y) common.o,$(c-i386-obj-y)) +obj-y += $(c-obj-y) clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link)) clean-files += $(patsubst %.o,%.c,$(c-i386-obj-y) $(c-i386-obj-)) diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c --- a/linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/balloon/balloon.c Thu Aug 25 22:53:20 2005 @@ -81,20 +81,17 @@ static DECLARE_WORK(balloon_worker, balloon_process, NULL); static struct timer_list balloon_timer; -/* Flag for dom0 xenstore workaround */ -static int balloon_xenbus_init=0; - -/* Init Function */ -void balloon_init_watcher(void); - #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) /* Use the private and mapping fields of struct page as a list. */ -#define PAGE_TO_LIST(p) ( (struct list_head *)&p->private ) -#define LIST_TO_PAGE(l) ( list_entry( ((unsigned long *)l), \ - struct page, private ) ) -#define UNLIST_PAGE(p) do { list_del(PAGE_TO_LIST(p)); \ - p->mapping = NULL; \ - p->private = 0; } while(0) +#define PAGE_TO_LIST(p) ((struct list_head *)&p->private) +#define LIST_TO_PAGE(l) \ + (list_entry(((unsigned long *)l), struct page, private)) +#define UNLIST_PAGE(p) \ + do { \ + list_del(PAGE_TO_LIST(p)); \ + p->mapping = NULL; \ + p->private = 0; \ + } while(0) #else /* There's a dedicated list field in struct page we can use. 
*/ #define PAGE_TO_LIST(p) ( &p->list ) @@ -110,56 +107,53 @@ #endif #define IPRINTK(fmt, args...) \ - printk(KERN_INFO "xen_mem: " fmt, ##args) + printk(KERN_INFO "xen_mem: " fmt, ##args) #define WPRINTK(fmt, args...) \ - printk(KERN_WARNING "xen_mem: " fmt, ##args) + printk(KERN_WARNING "xen_mem: " fmt, ##args) /* balloon_append: add the given page to the balloon. */ static void balloon_append(struct page *page) { - /* Low memory is re-populated first, so highmem pages go at list tail. */ - if ( PageHighMem(page) ) - { - list_add_tail(PAGE_TO_LIST(page), &ballooned_pages); - balloon_high++; - } - else - { - list_add(PAGE_TO_LIST(page), &ballooned_pages); - balloon_low++; - } + /* Lowmem is re-populated first, so highmem pages go at list tail. */ + if (PageHighMem(page)) { + list_add_tail(PAGE_TO_LIST(page), &ballooned_pages); + balloon_high++; + } else { + list_add(PAGE_TO_LIST(page), &ballooned_pages); + balloon_low++; + } } /* balloon_retrieve: rescue a page from the balloon, if it is not empty. 
*/ static struct page *balloon_retrieve(void) { - struct page *page; - - if ( list_empty(&ballooned_pages) ) - return NULL; - - page = LIST_TO_PAGE(ballooned_pages.next); - UNLIST_PAGE(page); - - if ( PageHighMem(page) ) - balloon_high--; - else - balloon_low--; - - return page; + struct page *page; + + if (list_empty(&ballooned_pages)) + return NULL; + + page = LIST_TO_PAGE(ballooned_pages.next); + UNLIST_PAGE(page); + + if (PageHighMem(page)) + balloon_high--; + else + balloon_low--; + + return page; } static void balloon_alarm(unsigned long unused) { - schedule_work(&balloon_worker); + schedule_work(&balloon_worker); } static unsigned long current_target(void) { - unsigned long target = min(target_pages, hard_limit); - if ( target > (current_pages + balloon_low + balloon_high) ) - target = current_pages + balloon_low + balloon_high; - return target; + unsigned long target = min(target_pages, hard_limit); + if (target > (current_pages + balloon_low + balloon_high)) + target = current_pages + balloon_low + balloon_high; + return target; } /* @@ -170,353 +164,336 @@ */ static void balloon_process(void *unused) { - unsigned long *mfn_list, pfn, i, flags; - struct page *page; - long credit, debt, rc; - void *v; - - down(&balloon_mutex); + unsigned long *mfn_list, pfn, i, flags; + struct page *page; + long credit, debt, rc; + void *v; + + down(&balloon_mutex); retry: - mfn_list = NULL; - - if ( (credit = current_target() - current_pages) > 0 ) - { - mfn_list = (unsigned long *)vmalloc(credit * sizeof(*mfn_list)); - if ( mfn_list == NULL ) - goto out; - - balloon_lock(flags); - rc = HYPERVISOR_dom_mem_op( - MEMOP_increase_reservation, mfn_list, credit, 0); - balloon_unlock(flags); - if ( rc < credit ) - { - /* We hit the Xen hard limit: reprobe. 
*/ - if ( HYPERVISOR_dom_mem_op( - MEMOP_decrease_reservation, mfn_list, rc, 0) != rc ) - BUG(); - hard_limit = current_pages + rc - driver_pages; - vfree(mfn_list); - goto retry; - } - - for ( i = 0; i < credit; i++ ) - { - if ( (page = balloon_retrieve()) == NULL ) - BUG(); - - pfn = page - mem_map; - if ( phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY ) - BUG(); - - /* Update P->M and M->P tables. */ - phys_to_machine_mapping[pfn] = mfn_list[i]; - xen_machphys_update(mfn_list[i], pfn); + mfn_list = NULL; + + if ((credit = current_target() - current_pages) > 0) { + mfn_list = vmalloc(credit * sizeof(*mfn_list)); + if (mfn_list == NULL) + goto out; + + balloon_lock(flags); + rc = HYPERVISOR_dom_mem_op( + MEMOP_increase_reservation, mfn_list, credit, 0); + balloon_unlock(flags); + if (rc < credit) { + /* We hit the Xen hard limit: reprobe. */ + BUG_ON(HYPERVISOR_dom_mem_op( + MEMOP_decrease_reservation, + mfn_list, rc, 0) != rc); + hard_limit = current_pages + rc - driver_pages; + vfree(mfn_list); + goto retry; + } + + for (i = 0; i < credit; i++) { + page = balloon_retrieve(); + BUG_ON(page == NULL); + + pfn = page - mem_map; + if (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY) + BUG(); + + /* Update P->M and M->P tables. */ + phys_to_machine_mapping[pfn] = mfn_list[i]; + xen_machphys_update(mfn_list[i], pfn); - /* Link back into the page tables if it's not a highmem page. */ - if ( pfn < max_low_pfn ) - { - HYPERVISOR_update_va_mapping( - (unsigned long)__va(pfn << PAGE_SHIFT), - __pte_ma((mfn_list[i] << PAGE_SHIFT) | - pgprot_val(PAGE_KERNEL)), - 0); - } - - /* Finally, relinquish the memory back to the system allocator. 
*/ - ClearPageReserved(page); - set_page_count(page, 1); - __free_page(page); - } - - current_pages += credit; - } - else if ( credit < 0 ) - { - debt = -credit; - - mfn_list = (unsigned long *)vmalloc(debt * sizeof(*mfn_list)); - if ( mfn_list == NULL ) - goto out; - - for ( i = 0; i < debt; i++ ) - { - if ( (page = alloc_page(GFP_HIGHUSER)) == NULL ) - { - debt = i; - break; - } - - pfn = page - mem_map; - mfn_list[i] = phys_to_machine_mapping[pfn]; - - if ( !PageHighMem(page) ) - { - v = phys_to_virt(pfn << PAGE_SHIFT); - scrub_pages(v, 1); - HYPERVISOR_update_va_mapping( - (unsigned long)v, __pte_ma(0), 0); - } + /* Link back into the page tables if not highmem. */ + if (pfn < max_low_pfn) + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + pfn_pte_ma(mfn_list[i], PAGE_KERNEL), + 0)); + + /* Relinquish the page back to the allocator. */ + ClearPageReserved(page); + set_page_count(page, 1); + __free_page(page); + } + + current_pages += credit; + } else if (credit < 0) { + debt = -credit; + + mfn_list = vmalloc(debt * sizeof(*mfn_list)); + if (mfn_list == NULL) + goto out; + + for (i = 0; i < debt; i++) { + if ((page = alloc_page(GFP_HIGHUSER)) == NULL) { + debt = i; + break; + } + + pfn = page - mem_map; + mfn_list[i] = phys_to_machine_mapping[pfn]; + + if (!PageHighMem(page)) { + v = phys_to_virt(pfn << PAGE_SHIFT); + scrub_pages(v, 1); + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)v, __pte_ma(0), 0)); + } #ifdef CONFIG_XEN_SCRUB_PAGES - else - { - v = kmap(page); - scrub_pages(v, 1); - kunmap(page); - } + else { + v = kmap(page); + scrub_pages(v, 1); + kunmap(page); + } #endif - } - - /* Ensure that ballooned highmem pages don't have cached mappings. */ - kmap_flush_unused(); - flush_tlb_all(); - - /* No more mappings: invalidate pages in P2M and add to balloon. 
*/ - for ( i = 0; i < debt; i++ ) - { - pfn = mfn_to_pfn(mfn_list[i]); - phys_to_machine_mapping[pfn] = INVALID_P2M_ENTRY; - balloon_append(pfn_to_page(pfn)); - } - - if ( HYPERVISOR_dom_mem_op( - MEMOP_decrease_reservation, mfn_list, debt, 0) != debt ) - BUG(); - - current_pages -= debt; - } + } + + /* Ensure that ballooned highmem pages don't have kmaps. */ + kmap_flush_unused(); + flush_tlb_all(); + + /* No more mappings: invalidate P2M and add to balloon. */ + for (i = 0; i < debt; i++) { + pfn = mfn_to_pfn(mfn_list[i]); + phys_to_machine_mapping[pfn] = INVALID_P2M_ENTRY; + balloon_append(pfn_to_page(pfn)); + } + + BUG_ON(HYPERVISOR_dom_mem_op( + MEMOP_decrease_reservation,mfn_list, debt, 0) != debt); + + current_pages -= debt; + } out: - if ( mfn_list != NULL ) - vfree(mfn_list); - - /* Schedule more work if there is some still to be done. */ - if ( current_target() != current_pages ) - mod_timer(&balloon_timer, jiffies + HZ); - - up(&balloon_mutex); + if (mfn_list != NULL) + vfree(mfn_list); + + /* Schedule more work if there is some still to be done. */ + if (current_target() != current_pages) + mod_timer(&balloon_timer, jiffies + HZ); + + up(&balloon_mutex); } /* Resets the Xen limit, sets new target, and kicks off processing. */ static void set_new_target(unsigned long target) { - /* No need for lock. Not read-modify-write updates. */ - hard_limit = ~0UL; - target_pages = target; - schedule_work(&balloon_worker); -} - -static struct xenbus_watch xb_watch = -{ - .node = "memory" -}; - -/* FIXME: This is part of a dom0 sequencing workaround */ -static struct xenbus_watch root_watch = -{ - .node = "/" + /* No need for lock. Not read-modify-write updates. 
*/ + hard_limit = ~0UL; + target_pages = target; + schedule_work(&balloon_worker); +} + +static struct xenbus_watch target_watch = +{ + .node = "memory/target" }; /* React to a change in the target key */ static void watch_target(struct xenbus_watch *watch, const char *node) { - unsigned long new_target; - int err; - - if(watch == &root_watch) - { - /* FIXME: This is part of a dom0 sequencing workaround */ - if(register_xenbus_watch(&xb_watch) == 0) - { - /* - We successfully set a watch on memory/target: - now we can stop watching root - */ - unregister_xenbus_watch(&root_watch); - balloon_xenbus_init=1; - } - else - { - return; - } - } - - err = xenbus_scanf("memory", "target", "%lu", &new_target); + unsigned long new_target; + int err; + + err = xenbus_scanf("memory", "target", "%lu", &new_target); + if (err != 1) { + printk(KERN_ERR "Unable to read memory/target\n"); + return; + } - if(err != 1) - { - IPRINTK("Unable to read memory/target\n"); - return; - } - - set_new_target(new_target >> PAGE_SHIFT); + set_new_target(new_target >> PAGE_SHIFT); } -/* - Try to set up our watcher, if not already set - +/* Setup our watcher + NB: Assumes xenbus_lock is held! */ -void balloon_init_watcher(void) -{ - int err; - - if(!xen_start_info.store_evtchn) - { - IPRINTK("Delaying watcher init until xenstore is available\n"); - return; - } - - down(&xenbus_lock); - - if(! balloon_xenbus_init) - { - err = register_xenbus_watch(&xb_watch); - if(err) - { - /* BIG FAT FIXME: dom0 sequencing workaround - * dom0 can't set a watch on memory/target until - * after the tools create it. So, we have to watch - * the whole store until that happens. 
- * - * This will go away when we have the ability to watch - * non-existant keys - */ - register_xenbus_watch(&root_watch); - } - else - { - IPRINTK("Balloon xenbus watcher initialized\n"); - balloon_xenbus_init = 1; - } - } - - up(&xenbus_lock); - -} - -EXPORT_SYMBOL(balloon_init_watcher); +int balloon_init_watcher(struct notifier_block *notifier, + unsigned long event, + void *data) +{ + int err; + + BUG_ON(down_trylock(&xenbus_lock) == 0); + + err = register_xenbus_watch(&target_watch); + if (err) + printk(KERN_ERR "Failed to set balloon watcher\n"); + + return NOTIFY_DONE; + +} static int balloon_write(struct file *file, const char __user *buffer, unsigned long count, void *data) { - char memstring[64], *endchar; - unsigned long long target_bytes; - - if ( !capable(CAP_SYS_ADMIN) ) - return -EPERM; - - if ( count <= 1 ) - return -EBADMSG; /* runt */ - if ( count > sizeof(memstring) ) - return -EFBIG; /* too long */ - - if ( copy_from_user(memstring, buffer, count) ) - return -EFAULT; - memstring[sizeof(memstring)-1] = '\0'; - - target_bytes = memparse(memstring, &endchar); - set_new_target(target_bytes >> PAGE_SHIFT); - - return count; + char memstring[64], *endchar; + unsigned long long target_bytes; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (count <= 1) + return -EBADMSG; /* runt */ + if (count > sizeof(memstring)) + return -EFBIG; /* too long */ + + if (copy_from_user(memstring, buffer, count)) + return -EFAULT; + memstring[sizeof(memstring)-1] = '\0'; + + target_bytes = memparse(memstring, &endchar); + set_new_target(target_bytes >> PAGE_SHIFT); + + return count; } static int balloon_read(char *page, char **start, off_t off, int count, int *eof, void *data) { - int len; - - len = sprintf( - page, - "Current allocation: %8lu kB\n" - "Requested target: %8lu kB\n" - "Low-mem balloon: %8lu kB\n" - "High-mem balloon: %8lu kB\n" - "Xen hard limit: ", - PAGES2KB(current_pages), PAGES2KB(target_pages), - PAGES2KB(balloon_low), 
PAGES2KB(balloon_high)); - - if ( hard_limit != ~0UL ) - len += sprintf( - page + len, - "%8lu kB (inc. %8lu kB driver headroom)\n", - PAGES2KB(hard_limit), PAGES2KB(driver_pages)); - else - len += sprintf( - page + len, - " ??? kB\n"); - - *eof = 1; - return len; -} + int len; + + len = sprintf( + page, + "Current allocation: %8lu kB\n" + "Requested target: %8lu kB\n" + "Low-mem balloon: %8lu kB\n" + "High-mem balloon: %8lu kB\n" + "Xen hard limit: ", + PAGES2KB(current_pages), PAGES2KB(target_pages), + PAGES2KB(balloon_low), PAGES2KB(balloon_high)); + + if (hard_limit != ~0UL) { + len += sprintf( + page + len, + "%8lu kB (inc. %8lu kB driver headroom)\n", + PAGES2KB(hard_limit), PAGES2KB(driver_pages)); + } else { + len += sprintf( + page + len, + " ??? kB\n"); + } + + *eof = 1; + return len; +} + +static struct notifier_block xenstore_notifier; static int __init balloon_init(void) { - unsigned long pfn; - struct page *page; - - IPRINTK("Initialising balloon driver.\n"); - - current_pages = min(xen_start_info.nr_pages, max_pfn); - target_pages = current_pages; - balloon_low = 0; - balloon_high = 0; - driver_pages = 0UL; - hard_limit = ~0UL; - - init_timer(&balloon_timer); - balloon_timer.data = 0; - balloon_timer.function = balloon_alarm; + unsigned long pfn; + struct page *page; + + IPRINTK("Initialising balloon driver.\n"); + + current_pages = min(xen_start_info.nr_pages, max_pfn); + target_pages = current_pages; + balloon_low = 0; + balloon_high = 0; + driver_pages = 0UL; + hard_limit = ~0UL; + + init_timer(&balloon_timer); + balloon_timer.data = 0; + balloon_timer.function = balloon_alarm; - if ( (balloon_pde = create_xen_proc_entry("balloon", 0644)) == NULL ) - { - WPRINTK("Unable to create /proc/xen/balloon.\n"); - return -1; - } - - balloon_pde->read_proc = balloon_read; - balloon_pde->write_proc = balloon_write; + if ((balloon_pde = create_xen_proc_entry("balloon", 0644)) == NULL) { + WPRINTK("Unable to create /proc/xen/balloon.\n"); + return -1; + } + + 
balloon_pde->read_proc = balloon_read; + balloon_pde->write_proc = balloon_write; - /* Initialise the balloon with excess memory space. */ - for ( pfn = xen_start_info.nr_pages; pfn < max_pfn; pfn++ ) - { - page = &mem_map[pfn]; - if ( !PageReserved(page) ) - balloon_append(page); - } - - xb_watch.callback = watch_target; - root_watch.callback = watch_target; - - balloon_init_watcher(); - - return 0; + /* Initialise the balloon with excess memory space. */ + for (pfn = xen_start_info.nr_pages; pfn < max_pfn; pfn++) { + page = &mem_map[pfn]; + if (!PageReserved(page)) + balloon_append(page); + } + + target_watch.callback = watch_target; + xenstore_notifier.notifier_call = balloon_init_watcher; + + register_xenstore_notifier(&xenstore_notifier); + + return 0; } subsys_initcall(balloon_init); void balloon_update_driver_allowance(long delta) { - unsigned long flags; - balloon_lock(flags); - driver_pages += delta; /* non-atomic update */ - balloon_unlock(flags); -} - -void balloon_put_pages(unsigned long *mfn_list, unsigned long nr_mfns) -{ - unsigned long flags; - - balloon_lock(flags); - if ( HYPERVISOR_dom_mem_op(MEMOP_decrease_reservation, - mfn_list, nr_mfns, 0) != nr_mfns ) - BUG(); - current_pages -= nr_mfns; /* non-atomic update */ - balloon_unlock(flags); - - schedule_work(&balloon_worker); + unsigned long flags; + balloon_lock(flags); + driver_pages += delta; /* non-atomic update */ + balloon_unlock(flags); +} + +static int dealloc_pte_fn( + pte_t *pte, struct page *pte_page, unsigned long addr, void *data) +{ + unsigned long mfn = pte_mfn(*pte); + set_pte(pte, __pte_ma(0)); + phys_to_machine_mapping[__pa(addr) >> PAGE_SHIFT] = + INVALID_P2M_ENTRY; + BUG_ON(HYPERVISOR_dom_mem_op( + MEMOP_decrease_reservation, &mfn, 1, 0) != 1); + return 0; +} + +struct page *balloon_alloc_empty_page_range(unsigned long nr_pages) +{ + unsigned long vstart, flags; + unsigned int order = get_order(nr_pages * PAGE_SIZE); + + vstart = __get_free_pages(GFP_KERNEL, order); + if 
(vstart == 0) + return NULL; + + scrub_pages(vstart, 1 << order); + + balloon_lock(flags); + BUG_ON(generic_page_range( + &init_mm, vstart, PAGE_SIZE << order, dealloc_pte_fn, NULL)); + current_pages -= 1UL << order; + balloon_unlock(flags); + + schedule_work(&balloon_worker); + + flush_tlb_all(); + + return virt_to_page(vstart); +} + +void balloon_dealloc_empty_page_range( + struct page *page, unsigned long nr_pages) +{ + unsigned long i, flags; + unsigned int order = get_order(nr_pages * PAGE_SIZE); + + balloon_lock(flags); + for (i = 0; i < (1UL << order); i++) + balloon_append(page + i); + balloon_unlock(flags); + + schedule_work(&balloon_worker); } EXPORT_SYMBOL(balloon_update_driver_allowance); -EXPORT_SYMBOL(balloon_put_pages); +EXPORT_SYMBOL(balloon_alloc_empty_page_range); +EXPORT_SYMBOL(balloon_dealloc_empty_page_range); + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/blkback/Makefile --- a/linux-2.6-xen-sparse/drivers/xen/blkback/Makefile Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/Makefile Thu Aug 25 22:53:20 2005 @@ -1,2 +1,2 @@ -obj-y := blkback.o control.o interface.o vbd.o +obj-y := blkback.o xenbus.o interface.o vbd.o diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c --- a/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c Thu Aug 25 22:53:20 2005 @@ -11,11 +11,9 @@ * Copyright (c) 2005, Christopher Clark */ +#include <linux/spinlock.h> +#include <asm-xen/balloon.h> #include "common.h" -#include <asm-xen/evtchn.h> -#ifdef CONFIG_XEN_BLKDEV_GRANT -#include <asm-xen/xen-public/grant_table.h> -#endif /* * These are rather arbitrary. 
They are fairly large because adjacent requests @@ -67,9 +65,6 @@ static PEND_RING_IDX pending_prod, pending_cons; #define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons) -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) -static kmem_cache_t *buffer_head_cachep; -#else static request_queue_t *plugged_queue; static inline void flush_plugged_queue(void) { @@ -82,9 +77,7 @@ plugged_queue = NULL; } } -#endif - -#ifdef CONFIG_XEN_BLKDEV_GRANT + /* When using grant tables to map a frame for device access then the * handle returned must be used to unmap the frame. This is needed to * drop the ref count on the frame. @@ -93,7 +86,6 @@ #define pending_handle(_idx, _i) \ (pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)]) #define BLKBACK_INVALID_HANDLE (0xFFFF) -#endif #ifdef CONFIG_XEN_BLKDEV_TAP_BE /* @@ -108,14 +100,12 @@ #endif static int do_block_io_op(blkif_t *blkif, int max_to_do); -static void dispatch_probe(blkif_t *blkif, blkif_request_t *req); static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req); static void make_response(blkif_t *blkif, unsigned long id, unsigned short op, int st); static void fast_flush_area(int idx, int nr_pages) { -#ifdef CONFIG_XEN_BLKDEV_GRANT struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; unsigned int i, invcount = 0; u16 handle; @@ -124,31 +114,16 @@ { if ( BLKBACK_INVALID_HANDLE != ( handle = pending_handle(idx, i) ) ) { - unmap[i].host_virt_addr = MMAP_VADDR(idx, i); + unmap[i].host_addr = MMAP_VADDR(idx, i); unmap[i].dev_bus_addr = 0; unmap[i].handle = handle; - pending_handle(idx, i) = BLKBACK_INVALID_HANDLE; + pending_handle(idx, i) = BLKBACK_INVALID_HANDLE; invcount++; } } if ( unlikely(HYPERVISOR_grant_table_op( GNTTABOP_unmap_grant_ref, unmap, invcount))) BUG(); -#else - - multicall_entry_t mcl[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - int i; - - for ( i = 0; i < nr_pages; i++ ) - { - MULTI_update_va_mapping(mcl+i, MMAP_VADDR(idx, i), - __pte(0), 0); - } - - 
mcl[nr_pages-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL; - if ( unlikely(HYPERVISOR_multicall(mcl, nr_pages) != 0) ) - BUG(); -#endif } @@ -205,11 +180,7 @@ blkif_t *blkif; struct list_head *ent; - daemonize( -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - "xenblkd" -#endif - ); + daemonize("xenblkd"); for ( ; ; ) { @@ -236,11 +207,7 @@ } /* Push the batch through to disc. */ -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) - run_task_queue(&tq_disk); -#else flush_plugged_queue(); -#endif } } @@ -289,13 +256,6 @@ } } -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) -static void end_block_io_op(struct buffer_head *bh, int uptodate) -{ - __end_block_io_op(bh->b_private, uptodate); - kmem_cache_free(buffer_head_cachep, bh); -} -#else static int end_block_io_op(struct bio *bio, unsigned int done, int error) { if ( bio->bi_size != 0 ) @@ -304,7 +264,6 @@ bio_put(bio); return error; } -#endif /****************************************************************************** @@ -351,10 +310,6 @@ case BLKIF_OP_READ: case BLKIF_OP_WRITE: dispatch_rw_block_io(blkif, req); - break; - - case BLKIF_OP_PROBE: - dispatch_probe(blkif, req); break; default: @@ -369,72 +324,6 @@ return more_to_do; } -static void dispatch_probe(blkif_t *blkif, blkif_request_t *req) -{ - int rsp = BLKIF_RSP_ERROR; - int pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; - - /* We expect one buffer only. */ - if ( unlikely(req->nr_segments != 1) ) - goto out; - - /* Make sure the buffer is page-sized. 
*/ - if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) || - (blkif_last_sect(req->frame_and_sects[0]) != 7) ) - goto out; - -#ifdef CONFIG_XEN_BLKDEV_GRANT - { - struct gnttab_map_grant_ref map; - - map.host_virt_addr = MMAP_VADDR(pending_idx, 0); - map.flags = GNTMAP_host_map; - map.ref = blkif_gref_from_fas(req->frame_and_sects[0]); - map.dom = blkif->domid; - - if ( unlikely(HYPERVISOR_grant_table_op( - GNTTABOP_map_grant_ref, &map, 1))) - BUG(); - - if ( map.handle < 0 ) - goto out; - - pending_handle(pending_idx, 0) = map.handle; - } -#else /* else CONFIG_XEN_BLKDEV_GRANT */ - -#ifdef CONFIG_XEN_BLKDEV_TAP_BE - /* Grab the real frontend out of the probe message. */ - if (req->frame_and_sects[1] == BLKTAP_COOKIE) - blkif->is_blktap = 1; -#endif - - -#ifdef CONFIG_XEN_BLKDEV_TAP_BE - if ( HYPERVISOR_update_va_mapping_otherdomain( - MMAP_VADDR(pending_idx, 0), - (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL }, - 0, (blkif->is_blktap ? ID_TO_DOM(req->id) : blkif->domid) ) ) - - goto out; -#else - if ( HYPERVISOR_update_va_mapping_otherdomain( - MMAP_VADDR(pending_idx, 0), - (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL }, - 0, blkif->domid) ) - - goto out; -#endif -#endif /* endif CONFIG_XEN_BLKDEV_GRANT */ - - rsp = vbd_probe(blkif, (vdisk_t *)MMAP_VADDR(pending_idx, 0), - PAGE_SIZE / sizeof(vdisk_t)); - - out: - fast_flush_area(pending_idx, 1); - make_response(blkif, req->id, req->operation, rsp); -} - static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req) { extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); @@ -442,24 +331,15 @@ unsigned long fas = 0; int i, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; pending_req_t *pending_req; -#ifdef CONFIG_XEN_BLKDEV_GRANT struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; -#else - unsigned long remap_prot; - multicall_entry_t mcl[BLKIF_MAX_SEGMENTS_PER_REQUEST]; -#endif struct phys_req preq; struct { unsigned long buf; 
unsigned int nsec; } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; unsigned int nseg; -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) - struct buffer_head *bh; -#else struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST]; int nbio = 0; request_queue_t *q; -#endif /* Check that number of segments is sane. */ nseg = req->nr_segments; @@ -470,11 +350,10 @@ goto bad_descriptor; } - preq.dev = req->device; + preq.dev = req->handle; preq.sector_number = req->sector_number; preq.nr_sects = 0; -#ifdef CONFIG_XEN_BLKDEV_GRANT for ( i = 0; i < nseg; i++ ) { fas = req->frame_and_sects[i]; @@ -484,7 +363,7 @@ goto bad_descriptor; preq.nr_sects += seg[i].nsec; - map[i].host_virt_addr = MMAP_VADDR(pending_idx, i); + map[i].host_addr = MMAP_VADDR(pending_idx, i); map[i].dom = blkif->domid; map[i].ref = blkif_gref_from_fas(fas); map[i].flags = GNTMAP_host_map; @@ -506,25 +385,15 @@ } phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] = - FOREIGN_FRAME(map[i].dev_bus_addr); + FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT); pending_handle(pending_idx, i) = map[i].handle; } -#endif for ( i = 0; i < nseg; i++ ) { fas = req->frame_and_sects[i]; -#ifdef CONFIG_XEN_BLKDEV_GRANT - seg[i].buf = (map[i].dev_bus_addr << PAGE_SHIFT) | - (blkif_first_sect(fas) << 9); -#else - seg[i].buf = (fas & PAGE_MASK) | (blkif_first_sect(fas) << 9); - seg[i].nsec = blkif_last_sect(fas) - blkif_first_sect(fas) + 1; - if ( seg[i].nsec <= 0 ) - goto bad_descriptor; - preq.nr_sects += seg[i].nsec; -#endif + seg[i].buf = map[i].dev_bus_addr | (blkif_first_sect(fas) << 9); } if ( vbd_translate(&preq, blkif, operation) != 0 ) @@ -534,40 +403,6 @@ preq.sector_number + preq.nr_sects, preq.dev); goto bad_descriptor; } - -#ifndef CONFIG_XEN_BLKDEV_GRANT - if ( operation == READ ) - remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW; - else - remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED; - - - for ( i = 0; i < nseg; i++ ) - { - MULTI_update_va_mapping_otherdomain( - mcl+i, 
MMAP_VADDR(pending_idx, i), - pfn_pte_ma(seg[i].buf >> PAGE_SHIFT, __pgprot(remap_prot)), - 0, blkif->domid); -#ifdef CONFIG_XEN_BLKDEV_TAP_BE - if ( blkif->is_blktap ) - mcl[i].args[MULTI_UVMDOMID_INDEX] = ID_TO_DOM(req->id); -#endif - phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] = - FOREIGN_FRAME(seg[i].buf >> PAGE_SHIFT); - } - - BUG_ON(HYPERVISOR_multicall(mcl, nseg) != 0); - - for ( i = 0; i < nseg; i++ ) - { - if ( unlikely(mcl[i].result != 0) ) - { - DPRINTK("invalid buffer -- could not remap it\n"); - fast_flush_area(pending_idx, nseg); - goto bad_descriptor; - } - } -#endif /* end ifndef CONFIG_XEN_BLKDEV_GRANT */ pending_req = &pending_reqs[pending_idx]; pending_req->blkif = blkif; @@ -575,49 +410,6 @@ pending_req->operation = operation; pending_req->status = BLKIF_RSP_OKAY; pending_req->nr_pages = nseg; - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) - - atomic_set(&pending_req->pendcnt, nseg); - pending_cons++; - blkif_get(blkif); - - for ( i = 0; i < nseg; i++ ) - { - bh = kmem_cache_alloc(buffer_head_cachep, GFP_KERNEL); - if ( unlikely(bh == NULL) ) - { - __end_block_io_op(pending_req, 0); - continue; - } - - memset(bh, 0, sizeof (struct buffer_head)); - - init_waitqueue_head(&bh->b_wait); - bh->b_size = seg[i].nsec << 9; - bh->b_dev = preq.dev; - bh->b_rdev = preq.dev; - bh->b_rsector = (unsigned long)preq.sector_number; - bh->b_data = (char *)MMAP_VADDR(pending_idx, i) + - (seg[i].buf & ~PAGE_MASK); - bh->b_page = virt_to_page(MMAP_VADDR(pending_idx, i)); - bh->b_end_io = end_block_io_op; - bh->b_private = pending_req; - - bh->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | - (1 << BH_Req) | (1 << BH_Launder); - if ( operation == WRITE ) - bh->b_state |= (1 << BH_JBD) | (1 << BH_Req) | (1 << BH_Uptodate); - - atomic_set(&bh->b_count, 1); - - /* Dispatch a single request. We'll flush it to disc later. 
*/ - generic_make_request(operation, bh); - - preq.sector_number += seg[i].nsec; - } - -#else for ( i = 0; i < nseg; i++ ) { @@ -667,8 +459,6 @@ for ( i = 0; i < nbio; i++ ) submit_bio(operation, biolist[i]); -#endif - return; bad_descriptor: @@ -712,6 +502,7 @@ static int __init blkif_init(void) { int i; + struct page *page; if ( !(xen_start_info.flags & SIF_INITDOMAIN) && !(xen_start_info.flags & SIF_BLK_BE_DOMAIN) ) @@ -719,8 +510,9 @@ blkif_interface_init(); - if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 ) - BUG(); + page = balloon_alloc_empty_page_range(MMAP_PAGES); + BUG_ON(page == NULL); + mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); pending_cons = 0; pending_prod = MAX_PENDING_REQS; @@ -734,18 +526,9 @@ if ( kernel_thread(blkio_schedule, 0, CLONE_FS | CLONE_FILES) < 0 ) BUG(); -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) - buffer_head_cachep = kmem_cache_create( - "buffer_head_cache", sizeof(struct buffer_head), - 0, SLAB_HWCACHE_ALIGN, NULL, NULL); -#endif - - blkif_ctrlif_init(); - -#ifdef CONFIG_XEN_BLKDEV_GRANT + blkif_xenbus_init(); + memset( pending_grant_handles, BLKBACK_INVALID_HANDLE, MMAP_PAGES ); - printk(KERN_ALERT "Blkif backend is using grant tables.\n"); -#endif #ifdef CONFIG_XEN_BLKDEV_TAP_BE printk(KERN_ALERT "NOTE: Blkif backend is running with tap support on!\n"); diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/blkback/common.h --- a/linux-2.6-xen-sparse/drivers/xen/blkback/common.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/common.h Thu Aug 25 22:53:20 2005 @@ -5,17 +5,18 @@ #include <linux/config.h> #include <linux/version.h> #include <linux/module.h> -#include <linux/rbtree.h> #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/blkdev.h> +#include <linux/vmalloc.h> #include <asm/io.h> #include <asm/setup.h> #include <asm/pgalloc.h> -#include <asm-xen/ctrl_if.h> +#include <asm-xen/evtchn.h> #include <asm-xen/hypervisor.h> 
#include <asm-xen/xen-public/io/blkif.h> #include <asm-xen/xen-public/io/ring.h> +#include <asm-xen/gnttab.h> #if 0 #define ASSERT(_p) \ @@ -28,12 +29,13 @@ #define DPRINTK(_f, _a...) ((void)0) #endif -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) -typedef struct rb_root rb_root_t; -typedef struct rb_node rb_node_t; -#else -struct block_device; -#endif +struct vbd { + blkif_vdev_t handle; /* what the domain refers to this vbd as */ + unsigned char readonly; /* Non-zero -> read-only */ + unsigned char type; /* VDISK_xxx */ + blkif_pdev_t pdevice; /* phys device that this vbd maps to */ + struct block_device *bdev; +}; typedef struct blkif_st { /* Unique identifier for this interface. */ @@ -42,34 +44,25 @@ /* Physical parameters of the comms window. */ unsigned long shmem_frame; unsigned int evtchn; - int irq; + unsigned int remote_evtchn; /* Comms information. */ blkif_back_ring_t blk_ring; /* VBDs attached to this interface. */ - rb_root_t vbd_rb; /* Mapping from 16-bit vdevices to VBDs.*/ - spinlock_t vbd_lock; /* Protects VBD mapping. */ + struct vbd vbd; /* Private fields. */ - enum { DISCONNECTED, DISCONNECTING, CONNECTED } status; - /* - * DISCONNECT response is deferred until pending requests are ack'ed. - * We therefore need to store the id from the original request. 
- */ - u8 disconnect_rspid; + enum { DISCONNECTED, CONNECTED } status; #ifdef CONFIG_XEN_BLKDEV_TAP_BE /* Is this a blktap frontend */ unsigned int is_blktap; #endif - struct blkif_st *hash_next; struct list_head blkdev_list; spinlock_t blk_ring_lock; atomic_t refcnt; - struct work_struct work; -#ifdef CONFIG_XEN_BLKDEV_GRANT + struct work_struct free_work; u16 shmem_handle; - memory_t shmem_vaddr; + unsigned long shmem_vaddr; grant_ref_t shmem_ref; -#endif } blkif_t; void blkif_create(blkif_be_create_t *create); @@ -77,18 +70,25 @@ void blkif_connect(blkif_be_connect_t *connect); int blkif_disconnect(blkif_be_disconnect_t *disconnect, u8 rsp_id); void blkif_disconnect_complete(blkif_t *blkif); -blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle); +blkif_t *alloc_blkif(domid_t domid); +void free_blkif_callback(blkif_t *blkif); +int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn); + #define blkif_get(_b) (atomic_inc(&(_b)->refcnt)) #define blkif_put(_b) \ do { \ if ( atomic_dec_and_test(&(_b)->refcnt) ) \ - blkif_disconnect_complete(_b); \ + free_blkif_callback(_b); \ } while (0) -void vbd_create(blkif_be_vbd_create_t *create); -void vbd_destroy(blkif_be_vbd_destroy_t *delete); -int vbd_probe(blkif_t *blkif, vdisk_t *vbd_info, int max_vbds); -void destroy_all_vbds(blkif_t *blkif); +/* Create a vbd. 
*/ +int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, blkif_pdev_t pdevice, + int readonly); +void vbd_free(struct vbd *vbd); + +unsigned long vbd_size(struct vbd *vbd); +unsigned int vbd_info(struct vbd *vbd); +unsigned long vbd_secsize(struct vbd *vbd); struct phys_req { unsigned short dev; @@ -100,9 +100,10 @@ int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation); void blkif_interface_init(void); -void blkif_ctrlif_init(void); void blkif_deschedule(blkif_t *blkif); + +void blkif_xenbus_init(void); irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs); diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/blkback/interface.c --- a/linux-2.6-xen-sparse/drivers/xen/blkback/interface.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/interface.c Thu Aug 25 22:53:20 2005 @@ -7,289 +7,137 @@ */ #include "common.h" - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) -#define VMALLOC_VMADDR(x) ((unsigned long)(x)) -#endif - -#define BLKIF_HASHSZ 1024 -#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1)) +#include <asm-xen/evtchn.h> static kmem_cache_t *blkif_cachep; -static blkif_t *blkif_hash[BLKIF_HASHSZ]; -blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle) +blkif_t *alloc_blkif(domid_t domid) { - blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)]; - while ( (blkif != NULL) && - ((blkif->domid != domid) || (blkif->handle != handle)) ) - blkif = blkif->hash_next; + blkif_t *blkif; + + blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL); + if (!blkif) + return ERR_PTR(-ENOMEM); + + memset(blkif, 0, sizeof(*blkif)); + blkif->domid = domid; + blkif->status = DISCONNECTED; + spin_lock_init(&blkif->blk_ring_lock); + atomic_set(&blkif->refcnt, 1); + return blkif; } -static void __blkif_disconnect_complete(void *arg) +static int map_frontend_page(blkif_t *blkif, unsigned long localaddr, + unsigned long shared_page) { - blkif_t *blkif = (blkif_t *)arg; - ctrl_msg_t cmsg; - 
blkif_be_disconnect_t disc; + struct gnttab_map_grant_ref op; + op.host_addr = localaddr; + op.flags = GNTMAP_host_map; + op.ref = shared_page; + op.dom = blkif->domid; - /* - * These can't be done in blkif_disconnect() because at that point there - * may be outstanding requests at the disc whose asynchronous responses - * must still be notified to the remote driver. - */ - unbind_evtchn_from_irq(blkif->evtchn); + BUG_ON( HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1) ); -#ifdef CONFIG_XEN_BLKDEV_GRANT - { - /* - * Release the shared memory page. - */ - struct gnttab_unmap_grant_ref op; + if (op.handle < 0) { + DPRINTK(" Grant table operation failure !\n"); + return op.handle; + } - op.host_virt_addr = blkif->shmem_vaddr; - op.handle = blkif->shmem_handle; - op.dev_bus_addr = 0; - - if(unlikely(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))) { - BUG(); - } - } -#endif - vfree(blkif->blk_ring.sring); - - /* Construct the deferred response message. */ - cmsg.type = CMSG_BLKIF_BE; - cmsg.subtype = CMSG_BLKIF_BE_DISCONNECT; - cmsg.id = blkif->disconnect_rspid; - cmsg.length = sizeof(blkif_be_disconnect_t); - disc.domid = blkif->domid; - disc.blkif_handle = blkif->handle; - disc.status = BLKIF_BE_STATUS_OKAY; - memcpy(cmsg.msg, &disc, sizeof(disc)); - - /* - * Make sure message is constructed /before/ status change, because - * after the status change the 'blkif' structure could be deallocated at - * any time. Also make sure we send the response /after/ status change, - * as otherwise a subsequent CONNECT request could spuriously fail if - * another CPU doesn't see the status change yet. - */ - mb(); - if ( blkif->status != DISCONNECTING ) - BUG(); - blkif->status = DISCONNECTED; - mb(); - - /* Send the successful response. 
*/ - ctrl_if_send_response(&cmsg); + blkif->shmem_ref = shared_page; + blkif->shmem_handle = op.handle; + blkif->shmem_vaddr = localaddr; + return 0; } -void blkif_disconnect_complete(blkif_t *blkif) +static void unmap_frontend_page(blkif_t *blkif) { - INIT_WORK(&blkif->work, __blkif_disconnect_complete, (void *)blkif); - schedule_work(&blkif->work); + struct gnttab_unmap_grant_ref op; + + op.host_addr = blkif->shmem_vaddr; + op.handle = blkif->shmem_handle; + op.dev_bus_addr = 0; + BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)); } -void blkif_create(blkif_be_create_t *create) +int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn) { - domid_t domid = create->domid; - unsigned int handle = create->blkif_handle; - blkif_t **pblkif, *blkif; + struct vm_struct *vma; + blkif_sring_t *sring; + evtchn_op_t op = { .cmd = EVTCHNOP_bind_interdomain }; + int err; - if ( (blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL)) == NULL ) - { - DPRINTK("Could not create blkif: out of memory\n"); - create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; - return; + BUG_ON(blkif->remote_evtchn); + + if ( (vma = get_vm_area(PAGE_SIZE, VM_IOREMAP)) == NULL ) + return -ENOMEM; + + err = map_frontend_page(blkif, (unsigned long)vma->addr, shared_page); + if (err) { + vfree(vma->addr); + return err; } - memset(blkif, 0, sizeof(*blkif)); - blkif->domid = domid; - blkif->handle = handle; - blkif->status = DISCONNECTED; - spin_lock_init(&blkif->vbd_lock); - spin_lock_init(&blkif->blk_ring_lock); - atomic_set(&blkif->refcnt, 0); - - pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; - while ( *pblkif != NULL ) - { - if ( ((*pblkif)->domid == domid) && ((*pblkif)->handle == handle) ) - { - DPRINTK("Could not create blkif: already exists\n"); - create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS; - kmem_cache_free(blkif_cachep, blkif); - return; - } - pblkif = &(*pblkif)->hash_next; + op.u.bind_interdomain.dom1 = DOMID_SELF; + op.u.bind_interdomain.dom2 = 
blkif->domid; + op.u.bind_interdomain.port1 = 0; + op.u.bind_interdomain.port2 = evtchn; + err = HYPERVISOR_event_channel_op(&op); + if (err) { + unmap_frontend_page(blkif); + vfree(vma->addr); + return err; } - blkif->hash_next = *pblkif; - *pblkif = blkif; + blkif->evtchn = op.u.bind_interdomain.port1; + blkif->remote_evtchn = evtchn; - DPRINTK("Successfully created blkif\n"); - create->status = BLKIF_BE_STATUS_OKAY; -} - -void blkif_destroy(blkif_be_destroy_t *destroy) -{ - domid_t domid = destroy->domid; - unsigned int handle = destroy->blkif_handle; - blkif_t **pblkif, *blkif; - - pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; - while ( (blkif = *pblkif) != NULL ) - { - if ( (blkif->domid == domid) && (blkif->handle == handle) ) - { - if ( blkif->status != DISCONNECTED ) - goto still_connected; - goto destroy; - } - pblkif = &blkif->hash_next; - } - - destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; - return; - - still_connected: - destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED; - return; - - destroy: - *pblkif = blkif->hash_next; - destroy_all_vbds(blkif); - kmem_cache_free(blkif_cachep, blkif); - destroy->status = BLKIF_BE_STATUS_OKAY; -} - -void blkif_connect(blkif_be_connect_t *connect) -{ - domid_t domid = connect->domid; - unsigned int handle = connect->blkif_handle; - unsigned int evtchn = connect->evtchn; - unsigned long shmem_frame = connect->shmem_frame; - struct vm_struct *vma; -#ifdef CONFIG_XEN_BLKDEV_GRANT - int ref = connect->shmem_ref; -#else - pgprot_t prot; - int error; -#endif - blkif_t *blkif; - blkif_sring_t *sring; - - blkif = blkif_find_by_handle(domid, handle); - if ( unlikely(blkif == NULL) ) - { - DPRINTK("blkif_connect attempted for non-existent blkif (%u,%u)\n", - connect->domid, connect->blkif_handle); - connect->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; - return; - } - - if ( (vma = get_vm_area(PAGE_SIZE, VM_IOREMAP)) == NULL ) - { - connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; - return; - } - -#ifndef 
CONFIG_XEN_BLKDEV_GRANT - prot = __pgprot(_KERNPG_TABLE); - error = direct_remap_area_pages(&init_mm, VMALLOC_VMADDR(vma->addr), - shmem_frame<<PAGE_SHIFT, PAGE_SIZE, - prot, domid); - if ( error != 0 ) - { - if ( error == -ENOMEM ) - connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; - else if ( error == -EFAULT ) - connect->status = BLKIF_BE_STATUS_MAPPING_ERROR; - else - connect->status = BLKIF_BE_STATUS_ERROR; - vfree(vma->addr); - return; - } -#else - { /* Map: Use the Grant table reference */ - struct gnttab_map_grant_ref op; - op.host_virt_addr = VMALLOC_VMADDR(vma->addr); - op.flags = GNTMAP_host_map; - op.ref = ref; - op.dom = domid; - - BUG_ON( HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1) ); - - handle = op.handle; - - if (op.handle < 0) { - DPRINTK(" Grant table operation failure !\n"); - connect->status = BLKIF_BE_STATUS_MAPPING_ERROR; - vfree(vma->addr); - return; - } - - blkif->shmem_ref = ref; - blkif->shmem_handle = handle; - blkif->shmem_vaddr = VMALLOC_VMADDR(vma->addr); - } -#endif - - if ( blkif->status != DISCONNECTED ) - { - connect->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED; - vfree(vma->addr); - return; - } sring = (blkif_sring_t *)vma->addr; SHARED_RING_INIT(sring); BACK_RING_INIT(&blkif->blk_ring, sring, PAGE_SIZE); - - blkif->evtchn = evtchn; - blkif->irq = bind_evtchn_to_irq(evtchn); - blkif->shmem_frame = shmem_frame; + + bind_evtchn_to_irqhandler(blkif->evtchn, blkif_be_int, 0, "blkif-backend", + blkif); blkif->status = CONNECTED; - blkif_get(blkif); + blkif->shmem_frame = shared_page; - request_irq(blkif->irq, blkif_be_int, 0, "blkif-backend", blkif); - - connect->status = BLKIF_BE_STATUS_OKAY; + return 0; } -int blkif_disconnect(blkif_be_disconnect_t *disconnect, u8 rsp_id) +static void free_blkif(void *arg) { - domid_t domid = disconnect->domid; - unsigned int handle = disconnect->blkif_handle; - blkif_t *blkif; + evtchn_op_t op = { .cmd = EVTCHNOP_close }; + blkif_t *blkif = (blkif_t *)arg; - blkif = 
blkif_find_by_handle(domid, handle); - if ( unlikely(blkif == NULL) ) - { - DPRINTK("blkif_disconnect attempted for non-existent blkif" - " (%u,%u)\n", disconnect->domid, disconnect->blkif_handle); - disconnect->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; - return 1; /* Caller will send response error message. */ + op.u.close.port = blkif->evtchn; + op.u.close.dom = DOMID_SELF; + HYPERVISOR_event_channel_op(&op); + op.u.close.port = blkif->remote_evtchn; + op.u.close.dom = blkif->domid; + HYPERVISOR_event_channel_op(&op); + + vbd_free(&blkif->vbd); + + if (blkif->evtchn) + unbind_evtchn_from_irqhandler(blkif->evtchn, blkif); + + if (blkif->blk_ring.sring) { + unmap_frontend_page(blkif); + vfree(blkif->blk_ring.sring); + blkif->blk_ring.sring = NULL; } - if ( blkif->status == CONNECTED ) - { - blkif->status = DISCONNECTING; - blkif->disconnect_rspid = rsp_id; - wmb(); /* Let other CPUs see the status change. */ - free_irq(blkif->irq, blkif); - blkif_deschedule(blkif); - blkif_put(blkif); - return 0; /* Caller should not send response message. */ - } + kmem_cache_free(blkif_cachep, blkif); +} - disconnect->status = BLKIF_BE_STATUS_OKAY; - return 1; +void free_blkif_callback(blkif_t *blkif) +{ + INIT_WORK(&blkif->free_work, free_blkif, (void *)blkif); + schedule_work(&blkif->free_work); } void __init blkif_interface_init(void) { blkif_cachep = kmem_cache_create("blkif_cache", sizeof(blkif_t), 0, 0, NULL, NULL); - memset(blkif_hash, 0, sizeof(blkif_hash)); } diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/blkback/vbd.c --- a/linux-2.6-xen-sparse/drivers/xen/blkback/vbd.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/vbd.c Thu Aug 25 22:53:20 2005 @@ -3,104 +3,61 @@ * * Routines for managing virtual block devices (VBDs). * - * NOTE: vbd_lock protects updates to the rb_tree against concurrent lookups - * in vbd_translate. 
All other lookups are implicitly protected because the - * only caller (the control message dispatch routine) serializes the calls. - * * Copyright (c) 2003-2005, Keir Fraser & Steve Hand */ #include "common.h" +#include <asm-xen/xenbus.h> -struct vbd { - blkif_vdev_t vdevice; /* what the domain refers to this vbd as */ - unsigned char readonly; /* Non-zero -> read-only */ - unsigned char type; /* VDISK_xxx */ - blkif_pdev_t pdevice; /* phys device that this vbd maps to */ - struct block_device *bdev; - rb_node_t rb; /* for linking into R-B tree lookup struct */ -}; - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) static inline dev_t vbd_map_devnum(blkif_pdev_t cookie) -{ return MKDEV(cookie>>8, cookie&0xff); } +{ + return MKDEV(BLKIF_MAJOR(cookie), BLKIF_MINOR(cookie)); +} #define vbd_sz(_v) ((_v)->bdev->bd_part ? \ (_v)->bdev->bd_part->nr_sects : (_v)->bdev->bd_disk->capacity) #define bdev_put(_b) blkdev_put(_b) -#else -#define vbd_sz(_v) (blk_size[MAJOR((_v)->pdevice)][MINOR((_v)->pdevice)]*2) -#define bdev_put(_b) ((void)0) -#define bdev_hardsect_size(_b) 512 -#endif -void vbd_create(blkif_be_vbd_create_t *create) +unsigned long vbd_size(struct vbd *vbd) { - struct vbd *vbd; - rb_node_t **rb_p, *rb_parent = NULL; - blkif_t *blkif; - blkif_vdev_t vdevice = create->vdevice; + return vbd_sz(vbd); +} - blkif = blkif_find_by_handle(create->domid, create->blkif_handle); - if ( unlikely(blkif == NULL) ) - { - DPRINTK("vbd_create attempted for non-existent blkif (%u,%u)\n", - create->domid, create->blkif_handle); - create->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; - return; - } +unsigned int vbd_info(struct vbd *vbd) +{ + return vbd->type | (vbd->readonly?VDISK_READONLY:0); +} - rb_p = &blkif->vbd_rb.rb_node; - while ( *rb_p != NULL ) - { - rb_parent = *rb_p; - vbd = rb_entry(rb_parent, struct vbd, rb); - if ( vdevice < vbd->vdevice ) - { - rb_p = &rb_parent->rb_left; - } - else if ( vdevice > vbd->vdevice ) - { - rb_p = &rb_parent->rb_right; - } - else - { - 
DPRINTK("vbd_create attempted for already existing vbd\n"); - create->status = BLKIF_BE_STATUS_VBD_EXISTS; - return; - } - } +unsigned long vbd_secsize(struct vbd *vbd) +{ + return bdev_hardsect_size(vbd->bdev); +} - if ( unlikely((vbd = kmalloc(sizeof(struct vbd), GFP_KERNEL)) == NULL) ) - { - DPRINTK("vbd_create: out of memory\n"); - create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; - return; - } +int vbd_create(blkif_t *blkif, blkif_vdev_t handle, + blkif_pdev_t pdevice, int readonly) +{ + struct vbd *vbd; - vbd->vdevice = vdevice; - vbd->readonly = create->readonly; + vbd = &blkif->vbd; + vbd->handle = handle; + vbd->readonly = readonly; vbd->type = 0; - /* Mask to 16-bit for compatibility with old tools */ - vbd->pdevice = create->pdevice & 0xffff; + vbd->pdevice = pdevice; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) vbd->bdev = open_by_devnum( vbd_map_devnum(vbd->pdevice), vbd->readonly ? FMODE_READ : FMODE_WRITE); if ( IS_ERR(vbd->bdev) ) { DPRINTK("vbd_creat: device %08x doesn't exist.\n", vbd->pdevice); - create->status = BLKIF_BE_STATUS_PHYSDEV_NOT_FOUND; - return; + return -ENOENT; } if ( (vbd->bdev->bd_disk == NULL) ) { DPRINTK("vbd_creat: device %08x doesn't exist.\n", vbd->pdevice); - create->status = BLKIF_BE_STATUS_PHYSDEV_NOT_FOUND; - bdev_put(vbd->bdev); - return; + vbd_free(vbd); + return -ENOENT; } if ( vbd->bdev->bd_disk->flags & GENHD_FL_CD ) @@ -108,181 +65,27 @@ if ( vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE ) vbd->type |= VDISK_REMOVABLE; -#else - if ( (blk_size[MAJOR(vbd->pdevice)] == NULL) || (vbd_sz(vbd) == 0) ) - { - DPRINTK("vbd_creat: device %08x doesn't exist.\n", vbd->pdevice); - create->status = BLKIF_BE_STATUS_PHYSDEV_NOT_FOUND; - return; - } -#endif - - spin_lock(&blkif->vbd_lock); - rb_link_node(&vbd->rb, rb_parent, rb_p); - rb_insert_color(&vbd->rb, &blkif->vbd_rb); - spin_unlock(&blkif->vbd_lock); - - DPRINTK("Successful creation of vdev=%04x (dom=%u)\n", - vdevice, create->domid); - create->status = 
BLKIF_BE_STATUS_OKAY; + DPRINTK("Successful creation of handle=%04x (dom=%u)\n", + handle, blkif->domid); + return 0; } - -void vbd_destroy(blkif_be_vbd_destroy_t *destroy) +void vbd_free(struct vbd *vbd) { - blkif_t *blkif; - struct vbd *vbd; - rb_node_t *rb; - blkif_vdev_t vdevice = destroy->vdevice; - - blkif = blkif_find_by_handle(destroy->domid, destroy->blkif_handle); - if ( unlikely(blkif == NULL) ) - { - DPRINTK("vbd_destroy attempted for non-existent blkif (%u,%u)\n", - destroy->domid, destroy->blkif_handle); - destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; - return; - } - - rb = blkif->vbd_rb.rb_node; - while ( rb != NULL ) - { - vbd = rb_entry(rb, struct vbd, rb); - if ( vdevice < vbd->vdevice ) - rb = rb->rb_left; - else if ( vdevice > vbd->vdevice ) - rb = rb->rb_right; - else - goto found; - } - - destroy->status = BLKIF_BE_STATUS_VBD_NOT_FOUND; - return; - - found: - spin_lock(&blkif->vbd_lock); - rb_erase(rb, &blkif->vbd_rb); - spin_unlock(&blkif->vbd_lock); - bdev_put(vbd->bdev); - kfree(vbd); + if (vbd->bdev) + bdev_put(vbd->bdev); + vbd->bdev = NULL; } - - -void destroy_all_vbds(blkif_t *blkif) -{ - struct vbd *vbd; - rb_node_t *rb; - - spin_lock(&blkif->vbd_lock); - - while ( (rb = blkif->vbd_rb.rb_node) != NULL ) - { - vbd = rb_entry(rb, struct vbd, rb); - rb_erase(rb, &blkif->vbd_rb); - spin_unlock(&blkif->vbd_lock); - bdev_put(vbd->bdev); - kfree(vbd); - spin_lock(&blkif->vbd_lock); - } - - spin_unlock(&blkif->vbd_lock); -} - - -static void vbd_probe_single( - blkif_t *blkif, vdisk_t *vbd_info, struct vbd *vbd) -{ - vbd_info->device = vbd->vdevice; - vbd_info->info = vbd->type | (vbd->readonly ? 
VDISK_READONLY : 0); - vbd_info->capacity = vbd_sz(vbd); - vbd_info->sector_size = bdev_hardsect_size(vbd->bdev); -} - - -int vbd_probe(blkif_t *blkif, vdisk_t *vbd_info, int max_vbds) -{ - int rc = 0, nr_vbds = 0; - rb_node_t *rb; - - spin_lock(&blkif->vbd_lock); - - if ( (rb = blkif->vbd_rb.rb_node) == NULL ) - goto out; - - new_subtree: - /* STEP 1. Find least node (it'll be left-most). */ - while ( rb->rb_left != NULL ) - rb = rb->rb_left; - - for ( ; ; ) - { - /* STEP 2. Dealt with left subtree. Now process current node. */ - vbd_probe_single(blkif, &vbd_info[nr_vbds], - rb_entry(rb, struct vbd, rb)); - if ( ++nr_vbds == max_vbds ) - goto out; - - /* STEP 3. Process right subtree, if any. */ - if ( rb->rb_right != NULL ) - { - rb = rb->rb_right; - goto new_subtree; - } - - /* STEP 4. Done both subtrees. Head back through ancesstors. */ - for ( ; ; ) - { - /* We're done when we get back to the root node. */ - if ( rb->rb_parent == NULL ) - goto out; - /* If we are left of parent, then parent is next to process. */ - if ( rb->rb_parent->rb_left == rb ) - break; - /* If we are right of parent, then we climb to grandparent. */ - rb = rb->rb_parent; - } - - rb = rb->rb_parent; - } - - out: - spin_unlock(&blkif->vbd_lock); - return (rc == 0) ? nr_vbds : rc; -} - int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation) { - struct vbd *vbd; - rb_node_t *rb; - int rc = -EACCES; + struct vbd *vbd = &blkif->vbd; + int rc = -EACCES; - /* Take the vbd_lock because another thread could be updating the tree. 
*/ - spin_lock(&blkif->vbd_lock); - - rb = blkif->vbd_rb.rb_node; - while ( rb != NULL ) - { - vbd = rb_entry(rb, struct vbd, rb); - if ( req->dev < vbd->vdevice ) - rb = rb->rb_left; - else if ( req->dev > vbd->vdevice ) - rb = rb->rb_right; - else - goto found; - } - - DPRINTK("vbd_translate; domain %u attempted to access " - "non-existent VBD.\n", blkif->domid); - rc = -ENODEV; - goto out; - - found: - - if ( (operation == WRITE) && vbd->readonly ) + if ((operation == WRITE) && vbd->readonly) goto out; - if ( unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd)) ) + if (unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd))) goto out; req->dev = vbd->pdevice; @@ -290,6 +93,5 @@ rc = 0; out: - spin_unlock(&blkif->vbd_lock); return rc; } diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c --- a/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c Thu Aug 25 22:53:20 2005 @@ -53,47 +53,26 @@ #include <linux/sched.h> #include <linux/interrupt.h> #include <scsi/scsi.h> -#include <asm-xen/ctrl_if.h> #include <asm-xen/evtchn.h> -#ifdef CONFIG_XEN_BLKDEV_GRANT +#include <asm-xen/xenbus.h> #include <asm-xen/xen-public/grant_table.h> #include <asm-xen/gnttab.h> -#endif typedef unsigned char byte; /* from linux/ide.h */ /* Control whether runtime update of vbds is enabled. 
*/ #define ENABLE_VBD_UPDATE 1 -#if ENABLE_VBD_UPDATE -static void vbd_update(void); -#else -static void vbd_update(void){}; -#endif - -#define BLKIF_STATE_CLOSED 0 -#define BLKIF_STATE_DISCONNECTED 1 -#define BLKIF_STATE_CONNECTED 2 - -static int blkif_handle = 0; -static unsigned int blkif_state = BLKIF_STATE_CLOSED; -static unsigned int blkif_evtchn = 0; -static unsigned int blkif_irq = 0; - -static int blkif_control_rsp_valid; -static blkif_response_t blkif_control_rsp; - -static blkif_front_ring_t blk_ring; +#define BLKIF_STATE_DISCONNECTED 0 +#define BLKIF_STATE_CONNECTED 1 + +static unsigned int blkif_state = BLKIF_STATE_DISCONNECTED; #define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE) -#ifdef CONFIG_XEN_BLKDEV_GRANT -static domid_t rdomid = 0; -static grant_ref_t gref_head, gref_terminal; #define MAXIMUM_OUTSTANDING_BLOCK_REQS \ (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLKIF_RING_SIZE) #define GRANTREF_INVALID (1<<15) -#endif static struct blk_shadow { blkif_request_t req; @@ -104,9 +83,9 @@ static int recovery = 0; /* Recovery in progress: protected by blkif_io_lock */ -static void kick_pending_request_queues(void); - -int __init xlblk_init(void); +static void kick_pending_request_queues(struct blkfront_info *info); + +static int __init xlblk_init(void); static void blkif_completion(struct blk_shadow *s); @@ -131,7 +110,7 @@ /* Kernel-specific definitions used in the common code */ #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) -#define DISABLE_SCATTERGATHER() +#define DISABLE_SCATTERGATHER() #else static int sg_operation = -1; #define DISABLE_SCATTERGATHER() (sg_operation = -1) @@ -139,38 +118,22 @@ static inline void pickle_request(struct blk_shadow *s, blkif_request_t *r) { -#ifndef CONFIG_XEN_BLKDEV_GRANT - int i; -#endif s->req = *r; - -#ifndef CONFIG_XEN_BLKDEV_GRANT - for ( i = 0; i < r->nr_segments; i++ ) - s->req.frame_and_sects[i] = machine_to_phys(r->frame_and_sects[i]); -#endif } static inline void unpickle_request(blkif_request_t *r, 
struct blk_shadow *s) { -#ifndef CONFIG_XEN_BLKDEV_GRANT - int i; -#endif *r = s->req; - -#ifndef CONFIG_XEN_BLKDEV_GRANT - for ( i = 0; i < s->req.nr_segments; i++ ) - r->frame_and_sects[i] = phys_to_machine(s->req.frame_and_sects[i]); -#endif -} - - -static inline void flush_requests(void) +} + + +static inline void flush_requests(struct blkfront_info *info) { DISABLE_SCATTERGATHER(); - RING_PUSH_REQUESTS(&blk_ring); - notify_via_evtchn(blkif_evtchn); + RING_PUSH_REQUESTS(&info->ring); + notify_via_evtchn(info->evtchn); } @@ -180,58 +143,45 @@ module_init(xlblk_init); -#if ENABLE_VBD_UPDATE -static void update_vbds_task(void *unused) -{ - xlvbd_update_vbds(); -} - -static void vbd_update(void) -{ - static DECLARE_WORK(update_tq, update_vbds_task, NULL); - schedule_work(&update_tq); -} -#endif /* ENABLE_VBD_UPDATE */ - -static struct xlbd_disk_info *head_waiting = NULL; -static void kick_pending_request_queues(void) -{ - struct xlbd_disk_info *di; - while ( ((di = head_waiting) != NULL) && !RING_FULL(&blk_ring) ) - { - head_waiting = di->next_waiting; - di->next_waiting = NULL; - /* Re-enable calldowns. */ - blk_start_queue(di->rq); - /* Kick things off immediately. */ - do_blkif_request(di->rq); - } +static void kick_pending_request_queues(struct blkfront_info *info) +{ + if (!RING_FULL(&info->ring)) { + /* Re-enable calldowns. */ + blk_start_queue(info->rq); + /* Kick things off immediately. 
*/ + do_blkif_request(info->rq); + } +} + +static void blkif_restart_queue(void *arg) +{ + struct blkfront_info *info = (struct blkfront_info *)arg; + spin_lock_irq(&blkif_io_lock); + kick_pending_request_queues(info); + spin_unlock_irq(&blkif_io_lock); +} + +static void blkif_restart_queue_callback(void *arg) +{ + struct blkfront_info *info = (struct blkfront_info *)arg; + schedule_work(&info->work); } int blkif_open(struct inode *inode, struct file *filep) { - struct gendisk *gd = inode->i_bdev->bd_disk; - struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data; - - /* Update of usage count is protected by per-device semaphore. */ - di->mi->usage++; - - return 0; + // struct gendisk *gd = inode->i_bdev->bd_disk; + // struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data; + + /* Update of usage count is protected by per-device semaphore. */ + // di->mi->usage++; + + return 0; } int blkif_release(struct inode *inode, struct file *filep) { - struct gendisk *gd = inode->i_bdev->bd_disk; - struct xlbd_disk_info *di = (struct xlbd_disk_info *)gd->private_data; - - /* - * When usage drops to zero it may allow more VBD updates to occur. - * Update of usage count is protected by a per-device semaphore. - */ - if ( --di->mi->usage == 0 ) - vbd_update(); - + /* FIXME: This is where we can actually free up majors, etc. --RR */ return 0; } @@ -242,8 +192,8 @@ int i; DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n", - command, (long)argument, inode->i_rdev); - + command, (long)argument, inode->i_rdev); + switch ( command ) { case HDIO_GETGEO: @@ -269,7 +219,7 @@ /* * blkif_queue_request * - * request block io + * request block io * * id: for guest use only. 
* operation: BLKIF_OP_{READ,WRITE,PROBE} @@ -278,7 +228,7 @@ */ static int blkif_queue_request(struct request *req) { - struct xlbd_disk_info *di = req->rq_disk->private_data; + struct blkfront_info *info = req->rq_disk->private_data; unsigned long buffer_ma; blkif_request_t *ring_req; struct bio *bio; @@ -286,23 +236,29 @@ int idx; unsigned long id; unsigned int fsect, lsect; -#ifdef CONFIG_XEN_BLKDEV_GRANT int ref; -#endif - - if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) ) + grant_ref_t gref_head; + + if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) return 1; + if (gnttab_alloc_grant_references(BLKIF_MAX_SEGMENTS_PER_REQUEST, + &gref_head) < 0) { + gnttab_request_free_callback(&info->callback, + blkif_restart_queue_callback, info, + BLKIF_MAX_SEGMENTS_PER_REQUEST); + return 1; + } + /* Fill out a communications ring structure. */ - ring_req = RING_GET_REQUEST(&blk_ring, blk_ring.req_prod_pvt); + ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); id = GET_ID_FROM_FREELIST(); blk_shadow[id].request = (unsigned long)req; ring_req->id = id; - ring_req->operation = rq_data_dir(req) ? BLKIF_OP_WRITE : - BLKIF_OP_READ; + ring_req->operation = rq_data_dir(req) ? BLKIF_OP_WRITE : BLKIF_OP_READ; ring_req->sector_number = (blkif_sector_t)req->sector; - ring_req->device = di->xd_device; + ring_req->handle = info->handle; ring_req->nr_segments = 0; rq_for_each_bio(bio, req) @@ -314,38 +270,35 @@ buffer_ma = page_to_phys(bvec->bv_page); fsect = bvec->bv_offset >> 9; lsect = fsect + (bvec->bv_len >> 9) - 1; -#ifdef CONFIG_XEN_BLKDEV_GRANT /* install a grant reference. 
*/ - ref = gnttab_claim_grant_reference(&gref_head, gref_terminal); + ref = gnttab_claim_grant_reference(&gref_head); ASSERT( ref != -ENOSPC ); gnttab_grant_foreign_access_ref( ref, - rdomid, + info->backend_id, buffer_ma >> PAGE_SHIFT, rq_data_dir(req) ); blk_shadow[id].frame[ring_req->nr_segments] = buffer_ma >> PAGE_SHIFT; - ring_req->frame_and_sects[ring_req->nr_segments++] = + ring_req->frame_and_sects[ring_req->nr_segments] = blkif_fas_from_gref(ref, fsect, lsect); -#else - ring_req->frame_and_sects[ring_req->nr_segments++] = - blkif_fas(buffer_ma, fsect, lsect); -#endif + ring_req->nr_segments++; } } - blk_ring.req_prod_pvt++; - + info->ring.req_prod_pvt++; + /* Keep a private copy so we can reissue requests when recovering. */ pickle_request(&blk_shadow[id], ring_req); + gnttab_free_grant_references(gref_head); + return 0; } - /* * do_blkif_request @@ -353,24 +306,26 @@ */ void do_blkif_request(request_queue_t *rq) { - struct xlbd_disk_info *di; + struct blkfront_info *info = NULL; struct request *req; int queued; - DPRINTK("Entered do_blkif_request\n"); + DPRINTK("Entered do_blkif_request\n"); queued = 0; while ( (req = elv_next_request(rq)) != NULL ) { + info = req->rq_disk->private_data; + if ( !blk_fs_request(req) ) { end_request(req, 0); continue; } - if ( RING_FULL(&blk_ring) ) - goto wait; + if (RING_FULL(&info->ring)) + goto wait; DPRINTK("do_blk_req %p: cmd %p, sec %lx, (%u/%li) buffer:%p [%s]\n", req, req->cmd, req->sector, req->current_nr_sectors, @@ -378,25 +333,19 @@ rq_data_dir(req) ? "write" : "read"); blkdev_dequeue_request(req); - if ( blkif_queue_request(req) ) - { + if (blkif_queue_request(req)) { + blk_requeue_request(rq, req); wait: - di = req->rq_disk->private_data; - if ( di->next_waiting == NULL ) - { - di->next_waiting = head_waiting; - head_waiting = di; - /* Avoid pointless unplugs. */ - blk_stop_queue(rq); - } - break; + /* Avoid pointless unplugs. 
*/ + blk_stop_queue(rq); + break; } queued++; } if ( queued != 0 ) - flush_requests(); + flush_requests(info); } @@ -405,25 +354,24 @@ struct request *req; blkif_response_t *bret; RING_IDX i, rp; - unsigned long flags; - - spin_lock_irqsave(&blkif_io_lock, flags); - - if ( unlikely(blkif_state == BLKIF_STATE_CLOSED) || - unlikely(recovery) ) - { + unsigned long flags; + struct blkfront_info *info = (struct blkfront_info *)dev_id; + + spin_lock_irqsave(&blkif_io_lock, flags); + + if (unlikely(info->connected != BLKIF_STATE_CONNECTED || recovery)) { spin_unlock_irqrestore(&blkif_io_lock, flags); return IRQ_HANDLED; } - - rp = blk_ring.sring->rsp_prod; + + rp = info->ring.sring->rsp_prod; rmb(); /* Ensure we see queued responses up to 'rp'. */ - for ( i = blk_ring.rsp_cons; i != rp; i++ ) + for ( i = info->ring.rsp_cons; i != rp; i++ ) { unsigned long id; - bret = RING_GET_RESPONSE(&blk_ring, i); + bret = RING_GET_RESPONSE(&info->ring, i); id = bret->id; req = (struct request *)blk_shadow[id].request; @@ -440,25 +388,21 @@ bret->status); if ( unlikely(end_that_request_first - (req, + (req, (bret->status == BLKIF_RSP_OKAY), req->hard_nr_sectors)) ) BUG(); end_that_request_last(req); break; - case BLKIF_OP_PROBE: - memcpy(&blkif_control_rsp, bret, sizeof(*bret)); - blkif_control_rsp_valid = 1; - break; default: BUG(); } } - blk_ring.rsp_cons = i; - - kick_pending_request_queues(); + info->ring.rsp_cons = i; + + kick_pending_request_queues(info); spin_unlock_irqrestore(&blkif_io_lock, flags); @@ -484,56 +428,34 @@ #define blkif_io_lock io_request_lock /*============================================================================*/ -#if ENABLE_VBD_UPDATE - -/* - * blkif_update_int/update-vbds_task - handle VBD update events. - * Schedule a task for keventd to run, which will update the VBDs and perform - * the corresponding updates to our view of VBD state. 
- */ -static void update_vbds_task(void *unused) -{ - xlvbd_update_vbds(); -} - -static void vbd_update(void) -{ - static struct tq_struct update_tq; - update_tq.routine = update_vbds_task; - schedule_task(&update_tq); -} - -#endif /* ENABLE_VBD_UPDATE */ -/*============================================================================*/ - static void kick_pending_request_queues(void) { /* We kick pending request queues if the ring is reasonably empty. */ - if ( (nr_pending != 0) && - (RING_PENDING_REQUESTS(&blk_ring) < (BLK_RING_SIZE >> 1)) ) + if ( (nr_pending != 0) && + (RING_PENDING_REQUESTS(&info->ring) < (BLK_RING_SIZE >> 1)) ) { /* Attempt to drain the queue, but bail if the ring becomes full. */ - while ( (nr_pending != 0) && !RING_FULL(&blk_ring) ) + while ( (nr_pending != 0) && !RING_FULL(&info->ring) ) do_blkif_request(pending_queues[--nr_pending]); } } int blkif_open(struct inode *inode, struct file *filep) { - short xldev = inode->i_rdev; + short xldev = inode->i_rdev; struct gendisk *gd = get_gendisk(xldev); xl_disk_t *disk = xldev_to_xldisk(inode->i_rdev); - short minor = MINOR(xldev); + short minor = MINOR(xldev); if ( gd->part[minor].nr_sects == 0 ) - { + { /* * Device either doesn't exist, or has zero capacity; we use a few * cheesy heuristics to return the relevant error code */ if ( (gd->sizes[minor >> gd->minor_shift] != 0) || ((minor & (gd->max_p - 1)) != 0) ) - { + { /* * We have a real device, but no such partition, or we just have a * partition number so guess this is the problem. @@ -542,16 +464,16 @@ } else if ( gd->flags[minor >> gd->minor_shift] & GENHD_FL_REMOVABLE ) { - /* This is a removable device => assume that media is missing. */ + /* This is a removable device => assume that media is missing. */ return -ENOMEDIUM; /* media not present (this is a guess) */ - } + } else - { + { /* Just go for the general 'no such device' error. 
*/ return -ENODEV; /* no such device */ } } - + /* Update of usage count is protected by per-device semaphore. */ disk->usage++; @@ -580,24 +502,24 @@ { kdev_t dev = inode->i_rdev; struct hd_geometry *geo = (struct hd_geometry *)argument; - struct gendisk *gd; - struct hd_struct *part; + struct gendisk *gd; + struct hd_struct *part; int i; unsigned short cylinders; byte heads, sectors; /* NB. No need to check permissions. That is done for us. */ - + DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n", - command, (long) argument, dev); - + command, (long) argument, dev); + gd = get_gendisk(dev); - part = &gd->part[MINOR(dev)]; + part = &gd->part[MINOR(dev)]; switch ( command ) { case BLKGETSIZE: - DPRINTK_IOCTL(" BLKGETSIZE: %x %lx\n", BLKGETSIZE, part->nr_sects); + DPRINTK_IOCTL(" BLKGETSIZE: %x %lx\n", BLKGETSIZE, part->nr_sects); return put_user(part->nr_sects, (unsigned long *) argument); case BLKGETSIZE64: @@ -610,7 +532,7 @@ return blkif_revalidate(dev); case BLKSSZGET: - return hardsect_size[MAJOR(dev)][MINOR(dev)]; + return hardsect_size[MAJOR(dev)][MINOR(dev)]; case BLKBSZGET: /* get block size */ DPRINTK_IOCTL(" BLKBSZGET: %x\n", BLKBSZGET); @@ -636,7 +558,7 @@ values consistent with the size of the device */ heads = 0xff; - sectors = 0x3f; + sectors = 0x3f; cylinders = part->nr_sects / (heads * sectors); if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT; @@ -646,7 +568,7 @@ return 0; - case HDIO_GETGEO_BIG: + case HDIO_GETGEO_BIG: DPRINTK_IOCTL(" HDIO_GETGEO_BIG: %x\n", HDIO_GETGEO_BIG); if (!argument) return -EINVAL; @@ -654,7 +576,7 @@ values consistent with the size of the device */ heads = 0xff; - sectors = 0x3f; + sectors = 0x3f; cylinders = part->nr_sects / (heads * sectors); if (put_user(0x00, (unsigned long *) &geo->start)) return -EFAULT; @@ -678,7 +600,7 @@ WPRINTK("ioctl %08x not supported by XL blkif\n", command); return -ENOSYS; } - + return 0; } @@ -698,7 +620,7 @@ xl_disk_t *disk; unsigned long capacity; int i, rc 
= 0; - + if ( (bd = bdget(dev)) == NULL ) return -EINVAL; @@ -746,7 +668,7 @@ /* * blkif_queue_request * - * request block io + * request block io * * id: for guest use only. * operation: BLKIF_OP_{READ,WRITE,PROBE} @@ -758,7 +680,8 @@ char * buffer, unsigned long sector_number, unsigned short nr_sectors, - kdev_t device) + kdev_t device, + blkif_vdev_t handle) { unsigned long buffer_ma = virt_to_bus(buffer); unsigned long xid; @@ -766,9 +689,7 @@ blkif_request_t *req; struct buffer_head *bh; unsigned int fsect, lsect; -#ifdef CONFIG_XEN_BLKDEV_GRANT int ref; -#endif fsect = (buffer_ma & ~PAGE_MASK) >> 9; lsect = fsect + nr_sectors - 1; @@ -776,12 +697,12 @@ /* Buffer must be sector-aligned. Extent mustn't cross a page boundary. */ if ( unlikely((buffer_ma & ((1<<9)-1)) != 0) ) BUG(); - if ( lsect > 7 ) + if ( lsect > ((PAGE_SIZE/512)-1) ) BUG(); buffer_ma &= PAGE_MASK; - if ( unlikely(blkif_state != BLKIF_STATE_CONNECTED) ) + if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) return 1; switch ( operation ) @@ -789,7 +710,7 @@ case BLKIF_OP_READ: case BLKIF_OP_WRITE: - gd = get_gendisk(device); + gd = get_gendisk(device); /* * Update the sector_number we'll pass down as appropriate; note that @@ -799,10 +720,10 @@ sector_number += gd->part[MINOR(device)].start_sect; /* - * If this unit doesn't consist of virtual partitions then we clear + * If this unit doesn't consist of virtual partitions then we clear * the partn bits from the device number. 
*/ - if ( !(gd->flags[MINOR(device)>>gd->minor_shift] & + if ( !(gd->flags[MINOR(device)>>gd->minor_shift] & GENHD_FL_VIRT_PARTNS) ) device &= ~(gd->max_p - 1); @@ -810,21 +731,20 @@ (sg_dev == device) && (sg_next_sect == sector_number) ) { - req = RING_GET_REQUEST(&blk_ring, - blk_ring.req_prod_pvt - 1); + req = RING_GET_REQUEST(&info->ring, + info->ring.req_prod_pvt - 1); bh = (struct buffer_head *)id; - + bh->b_reqnext = (struct buffer_head *)blk_shadow[req->id].request; blk_shadow[req->id].request = (unsigned long)id; -#ifdef CONFIG_XEN_BLKDEV_GRANT /* install a grant reference. */ - ref = gnttab_claim_grant_reference(&gref_head, gref_terminal); + ref = gnttab_claim_grant_reference(&gref_head); ASSERT( ref != -ENOSPC ); gnttab_grant_foreign_access_ref( ref, - rdomid, + info->backend_id, buffer_ma >> PAGE_SHIFT, ( operation == BLKIF_OP_WRITE ? 1 : 0 ) ); @@ -833,10 +753,6 @@ req->frame_and_sects[req->nr_segments] = blkif_fas_from_gref(ref, fsect, lsect); -#else - req->frame_and_sects[req->nr_segments] = - blkif_fas(buffer_ma, fsect, lsect); -#endif if ( ++req->nr_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST ) sg_next_sect += nr_sectors; else @@ -847,7 +763,7 @@ return 0; } - else if ( RING_FULL(&blk_ring) ) + else if ( RING_FULL(&info->ring) ) { return 1; } @@ -864,7 +780,7 @@ } /* Fill out a communications ring structure. */ - req = RING_GET_REQUEST(&blk_ring, blk_ring.req_prod_pvt); + req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); xid = GET_ID_FROM_FREELIST(); blk_shadow[xid].request = (unsigned long)id; @@ -872,31 +788,27 @@ req->id = xid; req->operation = operation; req->sector_number = (blkif_sector_t)sector_number; - req->device = device; + req->handle = handle; req->nr_segments = 1; -#ifdef CONFIG_XEN_BLKDEV_GRANT /* install a grant reference. 
*/ - ref = gnttab_claim_grant_reference(&gref_head, gref_terminal); + ref = gnttab_claim_grant_reference(&gref_head); ASSERT( ref != -ENOSPC ); gnttab_grant_foreign_access_ref( ref, - rdomid, + info->backend_id, buffer_ma >> PAGE_SHIFT, ( operation == BLKIF_OP_WRITE ? 1 : 0 ) ); blk_shadow[xid].frame[0] = buffer_ma >> PAGE_SHIFT; req->frame_and_sects[0] = blkif_fas_from_gref(ref, fsect, lsect); -#else - req->frame_and_sects[0] = blkif_fas(buffer_ma, fsect, lsect); -#endif - - /* Keep a private copy so we can reissue requests when recovering. */ + + /* Keep a private copy so we can reissue requests when recovering. */ pickle_request(&blk_shadow[xid], req); - blk_ring.req_prod_pvt++; - + info->ring.req_prod_pvt++; + return 0; } @@ -911,13 +823,13 @@ struct buffer_head *bh, *next_bh; int rw, nsect, full, queued = 0; - DPRINTK("Entered do_blkif_request\n"); + DPRINTK("Entered do_blkif_request\n"); while ( !rq->plugged && !list_empty(&rq->queue_head)) { - if ( (req = blkdev_entry_next_request(&rq->queue_head)) == NULL ) + if ( (req = blkdev_entry_next_request(&rq->queue_head)) == NULL ) goto out; - + DPRINTK("do_blkif_request %p: cmd %i, sec %lx, (%li/%li) bh:%p\n", req, req->cmd, req->sector, req->current_nr_sectors, req->nr_sectors, req->bh); @@ -938,16 +850,16 @@ full = blkif_queue_request( (unsigned long)bh, - (rw == READ) ? BLKIF_OP_READ : BLKIF_OP_WRITE, + (rw == READ) ? BLKIF_OP_READ : BLKIF_OP_WRITE, bh->b_data, bh->b_rsector, bh->b_size>>9, bh->b_rdev); if ( full ) - { + { bh->b_reqnext = next_bh; pending_queues[nr_pending++] = rq; if ( unlikely(nr_pending >= MAX_PENDING) ) BUG(); - goto out; + goto out; } queued++; @@ -955,7 +867,7 @@ /* Dequeue the buffer head from the request. */ nsect = bh->b_size >> 9; bh = req->bh = next_bh; - + if ( bh != NULL ) { /* There's another buffer head to do. Update the request. 
*/ @@ -985,27 +897,27 @@ static void blkif_int(int irq, void *dev_id, struct pt_regs *ptregs) { - RING_IDX i, rp; - unsigned long flags; + RING_IDX i, rp; + unsigned long flags; struct buffer_head *bh, *next_bh; - - spin_lock_irqsave(&io_request_lock, flags); - - if ( unlikely(blkif_state == BLKIF_STATE_CLOSED || recovery) ) + + spin_lock_irqsave(&io_request_lock, flags); + + if ( unlikely(info->connected != BLKIF_STATE_CONNECTED || recovery) ) { spin_unlock_irqrestore(&io_request_lock, flags); return; } - rp = blk_ring.sring->rsp_prod; + rp = info->ring.sring->rsp_prod; rmb(); /* Ensure we see queued responses up to 'rp'. */ - for ( i = blk_ring.rsp_cons; i != rp; i++ ) + for ( i = info->ring.rsp_cons; i != rp; i++ ) { unsigned long id; blkif_response_t *bret; - - bret = RING_GET_RESPONSE(&blk_ring, i); + + bret = RING_GET_RESPONSE(&info->ring, i); id = bret->id; bh = (struct buffer_head *)blk_shadow[id].request; @@ -1037,8 +949,8 @@ } } - blk_ring.rsp_cons = i; - + info->ring.rsp_cons = i; + kick_pending_request_queues(); spin_unlock_irqrestore(&io_request_lock, flags); @@ -1048,157 +960,29 @@ /***************************** COMMON CODE *******************************/ -#ifdef CONFIG_XEN_BLKDEV_GRANT -void blkif_control_probe_send(blkif_request_t *req, blkif_response_t *rsp, - unsigned long address) -{ - int ref = gnttab_claim_grant_reference(&gref_head, gref_terminal); - ASSERT( ref != -ENOSPC ); - - gnttab_grant_foreign_access_ref( ref, rdomid, address >> PAGE_SHIFT, 0 ); - - req->frame_and_sects[0] = blkif_fas_from_gref(ref, 0, (PAGE_SIZE/512)-1); - - blkif_control_send(req, rsp); -} -#endif - -void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp) -{ - unsigned long flags, id; - blkif_request_t *req_d; - - retry: - while ( RING_FULL(&blk_ring) ) - { - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(1); - } - - spin_lock_irqsave(&blkif_io_lock, flags); - if ( RING_FULL(&blk_ring) ) - { - spin_unlock_irqrestore(&blkif_io_lock, flags); - 
goto retry; - } - - DISABLE_SCATTERGATHER(); - req_d = RING_GET_REQUEST(&blk_ring, blk_ring.req_prod_pvt); - *req_d = *req; - - id = GET_ID_FROM_FREELIST(); - req_d->id = id; - blk_shadow[id].request = (unsigned long)req; - - pickle_request(&blk_shadow[id], req); - - blk_ring.req_prod_pvt++; - flush_requests(); - - spin_unlock_irqrestore(&blkif_io_lock, flags); - - while ( !blkif_control_rsp_valid ) - { - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(1); - } - - memcpy(rsp, &blkif_control_rsp, sizeof(*rsp)); - blkif_control_rsp_valid = 0; -} - - -/* Send a driver status notification to the domain controller. */ -static void send_driver_status(int ok) -{ - ctrl_msg_t cmsg = { - .type = CMSG_BLKIF_FE, - .subtype = CMSG_BLKIF_FE_DRIVER_STATUS, - .length = sizeof(blkif_fe_driver_status_t), - }; - blkif_fe_driver_status_t *msg = (void*)cmsg.msg; - - msg->status = (ok ? BLKIF_DRIVER_STATUS_UP : BLKIF_DRIVER_STATUS_DOWN); - - ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); -} - -/* Tell the controller to bring up the interface. */ -static void blkif_send_interface_connect(void) -{ - ctrl_msg_t cmsg = { - .type = CMSG_BLKIF_FE, - .subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT, - .length = sizeof(blkif_fe_interface_connect_t), - }; - blkif_fe_interface_connect_t *msg = (void*)cmsg.msg; - - msg->handle = 0; - msg->shmem_frame = (virt_to_machine(blk_ring.sring) >> PAGE_SHIFT); - -#ifdef CONFIG_XEN_BLKDEV_GRANT - msg->shmem_ref = gnttab_claim_grant_reference( &gref_head, gref_terminal ); - ASSERT( msg->shmem_ref != -ENOSPC ); - gnttab_grant_foreign_access_ref ( msg->shmem_ref , rdomid, msg->shmem_frame, 0 ); -#endif - - ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); -} - -static void blkif_free(void) +static void blkif_free(struct blkfront_info *info) { /* Prevent new requests being issued until we fix things up. 
*/ spin_lock_irq(&blkif_io_lock); - recovery = 1; - blkif_state = BLKIF_STATE_DISCONNECTED; + info->connected = BLKIF_STATE_DISCONNECTED; spin_unlock_irq(&blkif_io_lock); /* Free resources associated with old device channel. */ - if ( blk_ring.sring != NULL ) - { - free_page((unsigned long)blk_ring.sring); - blk_ring.sring = NULL; - } - free_irq(blkif_irq, NULL); - blkif_irq = 0; - - unbind_evtchn_from_irq(blkif_evtchn); - blkif_evtchn = 0; -} - -static void blkif_close(void) -{ -} - -/* Move from CLOSED to DISCONNECTED state. */ -static void blkif_disconnect(void) -{ - blkif_sring_t *sring; - - if ( blk_ring.sring != NULL ) - free_page((unsigned long)blk_ring.sring); - - sring = (blkif_sring_t *)__get_free_page(GFP_KERNEL); - SHARED_RING_INIT(sring); - FRONT_RING_INIT(&blk_ring, sring, PAGE_SIZE); - blkif_state = BLKIF_STATE_DISCONNECTED; - blkif_send_interface_connect(); -} - -static void blkif_reset(void) -{ - blkif_free(); - blkif_disconnect(); -} - -static void blkif_recover(void) + if ( info->ring.sring != NULL ) + { + free_page((unsigned long)info->ring.sring); + info->ring.sring = NULL; + } + unbind_evtchn_from_irqhandler(info->evtchn, NULL); + info->evtchn = 0; +} + +static void blkif_recover(struct blkfront_info *info) { int i; blkif_request_t *req; struct blk_shadow *copy; -#ifdef CONFIG_XEN_BLKDEV_GRANT int j; -#endif /* Stage 1: Make a safe copy of the shadow state. */ copy = (struct blk_shadow *)kmalloc(sizeof(blk_shadow), GFP_KERNEL); @@ -1209,7 +993,7 @@ memset(&blk_shadow, 0, sizeof(blk_shadow)); for ( i = 0; i < BLK_RING_SIZE; i++ ) blk_shadow[i].req.id = i+1; - blk_shadow_free = blk_ring.req_prod_pvt; + blk_shadow_free = info->ring.req_prod_pvt; blk_shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff; /* Stage 3: Find pending requests and requeue them. */ @@ -1221,195 +1005,339 @@ /* Grab a request slot and unpickle shadow state into it. 
*/ req = RING_GET_REQUEST( - &blk_ring, blk_ring.req_prod_pvt); + &info->ring, info->ring.req_prod_pvt); unpickle_request(req, &copy[i]); /* We get a new request id, and must reset the shadow state. */ req->id = GET_ID_FROM_FREELIST(); memcpy(&blk_shadow[req->id], &copy[i], sizeof(copy[i])); -#ifdef CONFIG_XEN_BLKDEV_GRANT /* Rewrite any grant references invalidated by suspend/resume. */ for ( j = 0; j < req->nr_segments; j++ ) { if ( req->frame_and_sects[j] & GRANTREF_INVALID ) gnttab_grant_foreign_access_ref( blkif_gref_from_fas(req->frame_and_sects[j]), - rdomid, + info->backend_id, blk_shadow[req->id].frame[j], rq_data_dir((struct request *) blk_shadow[req->id].request)); req->frame_and_sects[j] &= ~GRANTREF_INVALID; } blk_shadow[req->id].req = *req; -#endif - - blk_ring.req_prod_pvt++; + + info->ring.req_prod_pvt++; } kfree(copy); recovery = 0; - /* blk_ring->req_prod will be set when we flush_requests().*/ + /* info->ring->req_prod will be set when we flush_requests().*/ wmb(); /* Kicks things back into life. */ - flush_requests(); + flush_requests(info); /* Now safe to left other people use the interface. */ - blkif_state = BLKIF_STATE_CONNECTED; -} - -static void blkif_connect(blkif_fe_interface_status_t *status) + info->connected = BLKIF_STATE_CONNECTED; +} + +static void blkif_connect(struct blkfront_info *info, u16 evtchn) { int err = 0; - blkif_evtchn = status->evtchn; - blkif_irq = bind_evtchn_to_irq(blkif_evtchn); - - err = request_irq(blkif_irq, blkif_int, SA_SAMPLE_RANDOM, "blkif", NULL); - if ( err ) - { - WPRINTK("request_irq failed (err=%d)\n", err); + info->evtchn = evtchn; + + err = bind_evtchn_to_irqhandler( + info->evtchn, blkif_int, SA_SAMPLE_RANDOM, "blkif", info); + if ( err != 0 ) + { + WPRINTK("bind_evtchn_to_irqhandler failed (err=%d)\n", err); return; } - - if ( recovery ) - { - blkif_recover(); - } - else - { - /* Transition to connected in case we need to do - * a partition probe on a whole disk. 
*/ - blkif_state = BLKIF_STATE_CONNECTED; - - /* Probe for discs attached to the interface. */ - xlvbd_init(); - } - - /* Kick pending requests. */ - spin_lock_irq(&blkif_io_lock); - kick_pending_request_queues(); - spin_unlock_irq(&blkif_io_lock); -} - -static void unexpected(blkif_fe_interface_status_t *status) -{ - DPRINTK(" Unexpected blkif status %u in state %u\n", - status->status, blkif_state); -} - -static void blkif_status(blkif_fe_interface_status_t *status) -{ -#ifdef CONFIG_XEN_BLKDEV_GRANT - rdomid = status->domid; /* need to set rdomid early */ -#endif - - if ( status->handle != blkif_handle ) - { - WPRINTK(" Invalid blkif: handle=%u\n", status->handle); - unexpected(status); - return; - } - - switch ( status->status ) - { - case BLKIF_INTERFACE_STATUS_CLOSED: - switch ( blkif_state ) - { - case BLKIF_STATE_CLOSED: - unexpected(status); - break; - case BLKIF_STATE_DISCONNECTED: - case BLKIF_STATE_CONNECTED: - unexpected(status); - blkif_close(); - break; - } - break; - - case BLKIF_INTERFACE_STATUS_DISCONNECTED: - switch ( blkif_state ) - { - case BLKIF_STATE_CLOSED: - blkif_disconnect(); - break; - case BLKIF_STATE_DISCONNECTED: - case BLKIF_STATE_CONNECTED: - /* unexpected(status); */ /* occurs during suspend/resume */ - blkif_reset(); - break; - } - break; - - case BLKIF_INTERFACE_STATUS_CONNECTED: - switch ( blkif_state ) - { - case BLKIF_STATE_CLOSED: - unexpected(status); - blkif_disconnect(); - blkif_connect(status); - break; - case BLKIF_STATE_DISCONNECTED: - blkif_connect(status); - break; - case BLKIF_STATE_CONNECTED: - unexpected(status); - blkif_connect(status); - break; - } - break; - - case BLKIF_INTERFACE_STATUS_CHANGED: - switch ( blkif_state ) - { - case BLKIF_STATE_CLOSED: - case BLKIF_STATE_DISCONNECTED: - unexpected(status); - break; - case BLKIF_STATE_CONNECTED: - vbd_update(); - break; - } - break; - - default: - WPRINTK(" Invalid blkif status: %d\n", status->status); - break; - } -} - - -static void blkif_ctrlif_rx(ctrl_msg_t 
*msg, unsigned long id) -{ - switch ( msg->subtype ) - { - case CMSG_BLKIF_FE_INTERFACE_STATUS: - blkif_status((blkif_fe_interface_status_t *) - &msg->msg[0]); - break; - default: - msg->length = 0; - break; - } - - ctrl_if_send_response(msg); -} - -int wait_for_blkif(void) +} + + +static struct xenbus_device_id blkfront_ids[] = { + { "vbd" }, + { "" } +}; + +static void watch_for_status(struct xenbus_watch *watch, const char *node) +{ + struct blkfront_info *info; + unsigned int binfo; + unsigned long sectors, sector_size; + int err; + + info = container_of(watch, struct blkfront_info, watch); + node += strlen(watch->node); + + /* FIXME: clean up when error on the other end. */ + if (info->connected == BLKIF_STATE_CONNECTED) + return; + + err = xenbus_gather(watch->node, + "sectors", "%lu", &sectors, + "info", "%u", &binfo, + "sector-size", "%lu", &sector_size, + NULL); + if (err) { + xenbus_dev_error(info->xbdev, err, + "reading backend fields at %s", watch->node); + return; + } + + xlvbd_add(sectors, info->vdevice, binfo, sector_size, info); + info->connected = BLKIF_STATE_CONNECTED; + + blkif_state = BLKIF_STATE_CONNECTED; + + xenbus_dev_ok(info->xbdev); + + /* Kick pending requests. 
*/ + spin_lock_irq(&blkif_io_lock); + kick_pending_request_queues(info); + spin_unlock_irq(&blkif_io_lock); +} + +static int setup_blkring(struct xenbus_device *dev, struct blkfront_info *info) +{ + blkif_sring_t *sring; + evtchn_op_t op = { .cmd = EVTCHNOP_alloc_unbound }; + int err; + + sring = (void *)__get_free_page(GFP_KERNEL); + if (!sring) { + xenbus_dev_error(dev, -ENOMEM, "allocating shared ring"); + return -ENOMEM; + } + SHARED_RING_INIT(sring); + FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); + + err = gnttab_grant_foreign_access(info->backend_id, + virt_to_mfn(info->ring.sring), 0); + if (err == -ENOSPC) { + free_page((unsigned long)info->ring.sring); + info->ring.sring = 0; + xenbus_dev_error(dev, err, "granting access to ring page"); + return err; + } + info->ring_ref = err; + + op.u.alloc_unbound.dom = info->backend_id; + err = HYPERVISOR_event_channel_op(&op); + if (err) { + gnttab_end_foreign_access(info->ring_ref, 0); + free_page((unsigned long)info->ring.sring); + info->ring.sring = 0; + xenbus_dev_error(dev, err, "allocating event channel"); + return err; + } + blkif_connect(info, op.u.alloc_unbound.port); + return 0; +} + +/* Common code used when first setting up, and when resuming. */ +static int talk_to_backend(struct xenbus_device *dev, + struct blkfront_info *info) +{ + char *backend; + const char *message; + int err; + + backend = NULL; + err = xenbus_gather(dev->nodename, + "backend-id", "%i", &info->backend_id, + "backend", NULL, &backend, + NULL); + if (XENBUS_EXIST_ERR(err)) + goto out; + if (backend && strlen(backend) == 0) { + err = -ENOENT; + goto out; + } + if (err < 0) { + xenbus_dev_error(dev, err, "reading %s/backend or backend-id", + dev->nodename); + goto out; + } + + /* Create shared ring, alloc event channel. 
*/ + err = setup_blkring(dev, info); + if (err) { + xenbus_dev_error(dev, err, "setting up block ring"); + goto out; + } + + err = xenbus_transaction_start(dev->nodename); + if (err) { + xenbus_dev_error(dev, err, "starting transaction"); + goto destroy_blkring; + } + + err = xenbus_printf(dev->nodename, "ring-ref","%u", info->ring_ref); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + err = xenbus_printf(dev->nodename, + "event-channel", "%u", info->evtchn); + if (err) { + message = "writing event-channel"; + goto abort_transaction; + } + + info->backend = backend; + backend = NULL; + + info->watch.node = info->backend; + info->watch.callback = watch_for_status; + err = register_xenbus_watch(&info->watch); + if (err) { + message = "registering watch on backend"; + goto abort_transaction; + } + + err = xenbus_transaction_end(0); + if (err) { + xenbus_dev_error(dev, err, "completing transaction"); + goto destroy_blkring; + } + + out: + if (backend) + kfree(backend); + return err; + + abort_transaction: + xenbus_transaction_end(1); + /* Have to do this *outside* transaction. */ + xenbus_dev_error(dev, err, "%s", message); + destroy_blkring: + blkif_free(info); + goto out; +} + +/* Setup supplies the backend dir, virtual device. + + We place an event channel and shared frame entries. + We watch backend to wait if it's ok. */ +static int blkfront_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int err; + struct blkfront_info *info; + int vdevice; + + /* FIXME: Use dynamic device id if this is not set. 
*/ + err = xenbus_scanf(dev->nodename, "virtual-device", "%i", &vdevice); + if (XENBUS_EXIST_ERR(err)) + return err; + if (err < 0) { + xenbus_dev_error(dev, err, "reading virtual-device"); + return err; + } + + info = kmalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + xenbus_dev_error(dev, err, "allocating info structure"); + return err; + } + info->xbdev = dev; + info->vdevice = vdevice; + info->connected = BLKIF_STATE_DISCONNECTED; + info->mi = NULL; + INIT_WORK(&info->work, blkif_restart_queue, (void *)info); + + /* Front end dir is a number, which is used as the id. */ + info->handle = simple_strtoul(strrchr(dev->nodename,'/')+1, NULL, 0); + dev->data = info; + + err = talk_to_backend(dev, info); + if (err) { + kfree(info); + dev->data = NULL; + return err; + } + + /* Call once in case entries already there. */ + watch_for_status(&info->watch, info->watch.node); + return 0; +} + +static int blkfront_remove(struct xenbus_device *dev) +{ + struct blkfront_info *info = dev->data; + + if (info->backend) + unregister_xenbus_watch(&info->watch); + + if (info->mi) + xlvbd_del(info); + + blkif_free(info); + + kfree(info->backend); + kfree(info); + + return 0; +} + +static int blkfront_suspend(struct xenbus_device *dev) +{ + struct blkfront_info *info = dev->data; + + unregister_xenbus_watch(&info->watch); + kfree(info->backend); + info->backend = NULL; + + recovery = 1; + blkif_free(info); + + return 0; +} + +static int blkfront_resume(struct xenbus_device *dev) +{ + struct blkfront_info *info = dev->data; + int err; + + /* FIXME: Check geometry hasn't changed here... 
*/ + err = talk_to_backend(dev, info); + if (!err) { + blkif_recover(info); + } + return err; +} + +static struct xenbus_driver blkfront = { + .name = "vbd", + .owner = THIS_MODULE, + .ids = blkfront_ids, + .probe = blkfront_probe, + .remove = blkfront_remove, + .resume = blkfront_resume, + .suspend = blkfront_suspend, +}; + +static void __init init_blk_xenbus(void) +{ + xenbus_register_device(&blkfront); +} + +static int wait_for_blkif(void) { int err = 0; int i; - send_driver_status(1); /* - * We should read 'nr_interfaces' from response message and wait - * for notifications before proceeding. For now we assume that we - * will be notified of exactly one interface. + * We should figure out how many and which devices we need to + * proceed and only wait for those. For now, continue once the + * first device is around. */ - for ( i=0; (blkif_state != BLKIF_STATE_CONNECTED) && (i < 10*HZ); i++ ) + for ( i=0; blkif_state != BLKIF_STATE_CONNECTED && (i < 10*HZ); i++ ) { set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(1); @@ -1423,17 +1351,9 @@ return err; } -int __init xlblk_init(void) +static int __init xlblk_init(void) { int i; - -#ifdef CONFIG_XEN_BLKDEV_GRANT - /* A grant for every ring slot, plus one for the ring itself. 
*/ - if ( 0 > gnttab_alloc_grant_references(MAXIMUM_OUTSTANDING_BLOCK_REQS + 1, - &gref_head, &gref_terminal) ) - return 1; - printk(KERN_ALERT "Blkif frontend is using grant tables.\n"); -#endif if ( (xen_start_info.flags & SIF_INITDOMAIN) || (xen_start_info.flags & SIF_BLK_BE_DOMAIN) ) @@ -1447,46 +1367,17 @@ blk_shadow[i].req.id = i+1; blk_shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff; - (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx, - CALLBACK_IN_BLOCKING_CONTEXT); + init_blk_xenbus(); wait_for_blkif(); return 0; } -void blkdev_suspend(void) -{ -} - -void blkdev_resume(void) -{ -#ifdef CONFIG_XEN_BLKDEV_GRANT - int i, j; - for ( i = 0; i < BLK_RING_SIZE; i++ ) - for ( j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++ ) - blk_shadow[i].req.frame_and_sects[j] |= GRANTREF_INVALID; -#endif - send_driver_status(1); -} - static void blkif_completion(struct blk_shadow *s) { int i; -#ifdef CONFIG_XEN_BLKDEV_GRANT for ( i = 0; i < s->req.nr_segments; i++ ) - gnttab_release_grant_reference( - &gref_head, blkif_gref_from_fas(s->req.frame_and_sects[i])); -#else - /* This is a hack to get the dirty logging bits set */ - if ( s->req.operation == BLKIF_OP_READ ) - { - for ( i = 0; i < s->req.nr_segments; i++ ) - { - unsigned long pfn = s->req.frame_and_sects[i] >> PAGE_SHIFT; - unsigned long mfn = phys_to_machine_mapping[pfn]; - xen_machphys_update(mfn, pfn); - } - } -#endif -} + gnttab_free_grant_reference( + blkif_gref_from_fas(s->req.frame_and_sects[i])); +} diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/blkfront/block.h --- a/linux-2.6-xen-sparse/drivers/xen/blkfront/block.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/block.h Thu Aug 25 22:53:20 2005 @@ -33,6 +33,7 @@ #define __XEN_DRIVERS_BLOCK_H__ #include <linux/config.h> +#include <linux/version.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/sched.h> @@ -44,6 +45,8 @@ #include <linux/blkdev.h> #include <linux/major.h> #include 
<linux/devfs_fs_kernel.h> +#include <asm-xen/hypervisor.h> +#include <asm-xen/xenbus.h> #include <asm-xen/xen-public/xen.h> #include <asm-xen/xen-public/io/blkif.h> #include <asm-xen/xen-public/io/ring.h> @@ -77,11 +80,20 @@ #define DPRINTK_IOCTL(_f, _a...) ((void)0) #endif -struct xlbd_type_info { - int partn_shift; - int disks_per_major; - char *devname; - char *diskname; +struct xlbd_type_info +{ + int partn_shift; + int disks_per_major; + char *devname; + char *diskname; +}; + +struct xlbd_major_info +{ + int major; + int index; + int usage; + struct xlbd_type_info *type; }; /* @@ -89,25 +101,27 @@ * hang in private_data off the gendisk structure. We may end up * putting all kinds of interesting stuff here :-) */ -struct xlbd_major_info { - int major; - int index; - int usage; - struct xlbd_type_info *type; +struct blkfront_info +{ + struct xenbus_device *xbdev; + /* We watch the backend */ + struct xenbus_watch watch; + dev_t dev; + int vdevice; + blkif_vdev_t handle; + int connected; + char *backend; + int backend_id; + int ring_ref; + blkif_front_ring_t ring; + unsigned int evtchn; + struct xlbd_major_info *mi; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + request_queue_t *rq; +#endif + struct work_struct work; + struct gnttab_free_callback callback; }; - -struct xlbd_disk_info { - int xd_device; - struct xlbd_major_info *mi; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - struct xlbd_disk_info *next_waiting; - request_queue_t *rq; -#endif -}; - -typedef struct xen_block { - int usage; -} xen_block_t; extern spinlock_t blkif_io_lock; @@ -117,17 +131,10 @@ unsigned command, unsigned long argument); extern int blkif_check(dev_t dev); extern int blkif_revalidate(dev_t dev); -extern void blkif_control_send(blkif_request_t *req, blkif_response_t *rsp); -#ifdef CONFIG_XEN_BLKDEV_GRANT -extern void blkif_control_probe_send( - blkif_request_t *req, blkif_response_t *rsp, unsigned long address); -#endif extern void do_blkif_request (request_queue_t *rq); 
-extern void xlvbd_update_vbds(void); - /* Virtual block-device subsystem. */ -extern int xlvbd_init(void); -extern void xlvbd_cleanup(void); - +int xlvbd_add(blkif_sector_t capacity, int device, + u16 vdisk_info, u16 sector_size, struct blkfront_info *info); +void xlvbd_del(struct blkfront_info *info); #endif /* __XEN_DRIVERS_BLOCK_H__ */ diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c --- a/linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/vbd.c Thu Aug 25 22:53:20 2005 @@ -43,458 +43,269 @@ #define NUM_SCSI_MAJORS 9 #define NUM_VBD_MAJORS 1 -struct lvdisk -{ - blkif_sector_t capacity; /* 0: Size in terms of 512-byte sectors. */ - blkif_vdev_t device; /* 8: Device number (opaque 16 bit value). */ - u16 info; - struct list_head list; +static struct xlbd_type_info xlbd_ide_type = { + .partn_shift = 6, + .disks_per_major = 2, + .devname = "ide", + .diskname = "hd", }; -static struct xlbd_type_info xlbd_ide_type = { - .partn_shift = 6, - .disks_per_major = 2, - .devname = "ide", - .diskname = "hd", +static struct xlbd_type_info xlbd_scsi_type = { + .partn_shift = 4, + .disks_per_major = 16, + .devname = "sd", + .diskname = "sd", }; -static struct xlbd_type_info xlbd_scsi_type = { - .partn_shift = 4, - .disks_per_major = 16, - .devname = "sd", - .diskname = "sd", +static struct xlbd_type_info xlbd_vbd_type = { + .partn_shift = 4, + .disks_per_major = 16, + .devname = "xvd", + .diskname = "xvd", }; -static struct xlbd_type_info xlbd_vbd_type = { - .partn_shift = 4, - .disks_per_major = 16, - .devname = "xvd", - .diskname = "xvd", -}; - static struct xlbd_major_info *major_info[NUM_IDE_MAJORS + NUM_SCSI_MAJORS + - NUM_VBD_MAJORS]; - -#define XLBD_MAJOR_IDE_START 0 -#define XLBD_MAJOR_SCSI_START (NUM_IDE_MAJORS) -#define XLBD_MAJOR_VBD_START (NUM_IDE_MAJORS + NUM_SCSI_MAJORS) - -#define XLBD_MAJOR_IDE_RANGE XLBD_MAJOR_IDE_START ... 
XLBD_MAJOR_SCSI_START - 1 -#define XLBD_MAJOR_SCSI_RANGE XLBD_MAJOR_SCSI_START ... XLBD_MAJOR_VBD_START - 1 -#define XLBD_MAJOR_VBD_RANGE XLBD_MAJOR_VBD_START ... XLBD_MAJOR_VBD_START + NUM_VBD_MAJORS - 1 + NUM_VBD_MAJORS]; + +#define XLBD_MAJOR_IDE_START 0 +#define XLBD_MAJOR_SCSI_START (NUM_IDE_MAJORS) +#define XLBD_MAJOR_VBD_START (NUM_IDE_MAJORS + NUM_SCSI_MAJORS) + +#define XLBD_MAJOR_IDE_RANGE XLBD_MAJOR_IDE_START ... XLBD_MAJOR_SCSI_START - 1 +#define XLBD_MAJOR_SCSI_RANGE XLBD_MAJOR_SCSI_START ... XLBD_MAJOR_VBD_START - 1 +#define XLBD_MAJOR_VBD_RANGE XLBD_MAJOR_VBD_START ... XLBD_MAJOR_VBD_START + NUM_VBD_MAJORS - 1 /* Information about our VBDs. */ #define MAX_VBDS 64 -struct list_head vbds_list; - -#define MAJOR_XEN(dev) ((dev)>>8) -#define MINOR_XEN(dev) ((dev) & 0xff) - -static struct block_device_operations xlvbd_block_fops = -{ - .owner = THIS_MODULE, - .open = blkif_open, - .release = blkif_release, - .ioctl = blkif_ioctl, +static LIST_HEAD(vbds_list); + +static struct block_device_operations xlvbd_block_fops = +{ + .owner = THIS_MODULE, + .open = blkif_open, + .release = blkif_release, + .ioctl = blkif_ioctl, }; spinlock_t blkif_io_lock = SPIN_LOCK_UNLOCKED; -static struct lvdisk *xlvbd_device_alloc(void) -{ - struct lvdisk *disk; - - disk = kmalloc(sizeof(*disk), GFP_KERNEL); - if (disk != NULL) { - memset(disk, 0, sizeof(*disk)); - INIT_LIST_HEAD(&disk->list); - } - return disk; -} - -static void xlvbd_device_free(struct lvdisk *disk) -{ - list_del(&disk->list); - kfree(disk); -} - -static vdisk_t *xlvbd_probe(int *ret) -{ - blkif_response_t rsp; - blkif_request_t req; - vdisk_t *disk_info = NULL; - unsigned long buf; - int nr; - - buf = __get_free_page(GFP_KERNEL); - if ((void *)buf == NULL) - goto out; - - memset(&req, 0, sizeof(req)); - req.operation = BLKIF_OP_PROBE; - req.nr_segments = 1; -#ifdef CONFIG_XEN_BLKDEV_GRANT - blkif_control_probe_send(&req, &rsp, - (unsigned long)(virt_to_machine(buf))); -#else - req.frame_and_sects[0] = 
blkif_fas(virt_to_machine(buf), 0, ((PAGE_SIZE/512)-1); - - blkif_control_send(&req, &rsp); -#endif - if ( rsp.status <= 0 ) { - WPRINTK("Could not probe disks (%d)\n", rsp.status); - goto out; - } - nr = rsp.status; - if ( nr > MAX_VBDS ) - nr = MAX_VBDS; - - disk_info = kmalloc(nr * sizeof(vdisk_t), GFP_KERNEL); - if (disk_info != NULL) - memcpy(disk_info, (void *) buf, nr * sizeof(vdisk_t)); - - if (ret != NULL) - *ret = nr; - -out: - free_page(buf); - return disk_info; -} - -static struct xlbd_major_info *xlbd_alloc_major_info( - int major, int minor, int index) -{ - struct xlbd_major_info *ptr; - - ptr = kmalloc(sizeof(struct xlbd_major_info), GFP_KERNEL); - if (ptr == NULL) - return NULL; - - memset(ptr, 0, sizeof(struct xlbd_major_info)); - - ptr->major = major; - - switch (index) { - case XLBD_MAJOR_IDE_RANGE: - ptr->type = &xlbd_ide_type; - ptr->index = index - XLBD_MAJOR_IDE_START; - break; - case XLBD_MAJOR_SCSI_RANGE: - ptr->type = &xlbd_scsi_type; - ptr->index = index - XLBD_MAJOR_SCSI_START; - break; - case XLBD_MAJOR_VBD_RANGE: - ptr->type = &xlbd_vbd_type; - ptr->index = index - XLBD_MAJOR_VBD_START; - break; - } - - if (register_blkdev(ptr->major, ptr->type->devname)) { - WPRINTK("can't get major %d with name %s\n", - ptr->major, ptr->type->devname); - kfree(ptr); - return NULL; - } - - devfs_mk_dir(ptr->type->devname); - major_info[index] = ptr; - return ptr; -} - -static struct xlbd_major_info *xlbd_get_major_info(int device) -{ - int major, minor, index; - - major = MAJOR_XEN(device); - minor = MINOR_XEN(device); - - switch (major) { - case IDE0_MAJOR: index = 0; break; - case IDE1_MAJOR: index = 1; break; - case IDE2_MAJOR: index = 2; break; - case IDE3_MAJOR: index = 3; break; - case IDE4_MAJOR: index = 4; break; - case IDE5_MAJOR: index = 5; break; - case IDE6_MAJOR: index = 6; break; - case IDE7_MAJOR: index = 7; break; - case IDE8_MAJOR: index = 8; break; - case IDE9_MAJOR: index = 9; break; - case SCSI_DISK0_MAJOR: index = 10; break; - 
case SCSI_DISK1_MAJOR ... SCSI_DISK7_MAJOR: - index = 11 + major - SCSI_DISK1_MAJOR; - break; - case SCSI_CDROM_MAJOR: index = 18; break; - default: index = 19; break; - } - - return ((major_info[index] != NULL) ? major_info[index] : - xlbd_alloc_major_info(major, minor, index)); -} - -static int xlvbd_init_blk_queue(struct gendisk *gd, vdisk_t *disk) -{ - request_queue_t *rq; - - rq = blk_init_queue(do_blkif_request, &blkif_io_lock); - if (rq == NULL) - return -1; - - elevator_init(rq, "noop"); - - /* Hard sector size and max sectors impersonate the equiv. hardware. */ - blk_queue_hardsect_size(rq, disk->sector_size); - blk_queue_max_sectors(rq, 512); - - /* Each segment in a request is up to an aligned page in size. */ - blk_queue_segment_boundary(rq, PAGE_SIZE - 1); - blk_queue_max_segment_size(rq, PAGE_SIZE); - - /* Ensure a merged request will fit in a single I/O ring slot. */ - blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); - blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); - - /* Make sure buffer addresses are sector-aligned. 
*/ - blk_queue_dma_alignment(rq, 511); - - gd->queue = rq; - - return 0; -} - -struct gendisk *xlvbd_alloc_gendisk( - struct xlbd_major_info *mi, int minor, vdisk_t *disk) -{ - struct gendisk *gd; - struct xlbd_disk_info *di; - int nr_minors = 1; - - di = kmalloc(sizeof(struct xlbd_disk_info), GFP_KERNEL); - if (di == NULL) - return NULL; - memset(di, 0, sizeof(*di)); - di->mi = mi; - di->xd_device = disk->device; - - if ((minor & ((1 << mi->type->partn_shift) - 1)) == 0) - nr_minors = 1 << mi->type->partn_shift; - - gd = alloc_disk(nr_minors); - if (gd == NULL) - goto out; - - if (nr_minors > 1) - sprintf(gd->disk_name, "%s%c", mi->type->diskname, - 'a' + mi->index * mi->type->disks_per_major + - (minor >> mi->type->partn_shift)); - else - sprintf(gd->disk_name, "%s%c%d", mi->type->diskname, - 'a' + mi->index * mi->type->disks_per_major + - (minor >> mi->type->partn_shift), - minor & ((1 << mi->type->partn_shift) - 1)); - - gd->major = mi->major; - gd->first_minor = minor; - gd->fops = &xlvbd_block_fops; - gd->private_data = di; - set_capacity(gd, disk->capacity); - - if (xlvbd_init_blk_queue(gd, disk)) { - del_gendisk(gd); - goto out; - } - - di->rq = gd->queue; - - if (disk->info & VDISK_READONLY) - set_disk_ro(gd, 1); - - if (disk->info & VDISK_REMOVABLE) - gd->flags |= GENHD_FL_REMOVABLE; - - if (disk->info & VDISK_CDROM) - gd->flags |= GENHD_FL_CD; - - add_disk(gd); - - return gd; - -out: - kfree(di); - return NULL; -} - -static int xlvbd_device_add(struct list_head *list, vdisk_t *disk) -{ - struct lvdisk *new; - int minor; - dev_t device; - struct block_device *bd; - struct gendisk *gd; - struct xlbd_major_info *mi; - - mi = xlbd_get_major_info(disk->device); - if (mi == NULL) - return -EPERM; - - new = xlvbd_device_alloc(); - if (new == NULL) - return -1; - new->capacity = disk->capacity; - new->device = disk->device; - new->info = disk->info; - - minor = MINOR_XEN(disk->device); - device = MKDEV(mi->major, minor); - - bd = bdget(device); - if (bd == NULL) 
- goto out; - - gd = xlvbd_alloc_gendisk(mi, minor, disk); - if (gd == NULL) - goto out_bd; - - list_add(&new->list, list); -out_bd: - bdput(bd); -out: - return 0; -} - -static int xlvbd_device_del(struct lvdisk *disk) -{ - dev_t device; - struct block_device *bd; - struct gendisk *gd; - struct xlbd_disk_info *di; - int ret = 0, unused; - request_queue_t *rq; - - device = MKDEV(MAJOR_XEN(disk->device), MINOR_XEN(disk->device)); - - bd = bdget(device); - if (bd == NULL) - return -1; - - gd = get_gendisk(device, &unused); - di = gd->private_data; - - if (di->mi->usage != 0) { - WPRINTK("disk removal failed: used [dev=%x]\n", device); - ret = -1; - goto out; - } - - rq = gd->queue; - del_gendisk(gd); - put_disk(gd); - blk_cleanup_queue(rq); - - xlvbd_device_free(disk); -out: - bdput(bd); - return ret; -} - -static int xlvbd_device_update(struct lvdisk *ldisk, vdisk_t *disk) -{ - dev_t device; - struct block_device *bd; - struct gendisk *gd; - int unused; - - if ((ldisk->capacity == disk->capacity) && (ldisk->info == disk->info)) - return 0; - - device = MKDEV(MAJOR_XEN(ldisk->device), MINOR_XEN(ldisk->device)); - - bd = bdget(device); - if (bd == NULL) - return -1; - - gd = get_gendisk(device, &unused); - set_capacity(gd, disk->capacity); - ldisk->capacity = disk->capacity; - - bdput(bd); - - return 0; -} - -void xlvbd_refresh(void) -{ - vdisk_t *newdisks; - struct list_head *tmp, *tmp2; - struct lvdisk *disk; - int i, nr; - - newdisks = xlvbd_probe(&nr); - if (newdisks == NULL) { - WPRINTK("failed to probe\n"); - return; - } - - i = 0; - list_for_each_safe(tmp, tmp2, &vbds_list) { - disk = list_entry(tmp, struct lvdisk, list); - - for (i = 0; i < nr; i++) { - if ( !newdisks[i].device ) - continue; - if ( disk->device == newdisks[i].device ) { - xlvbd_device_update(disk, &newdisks[i]); - newdisks[i].device = 0; - break; - } - } - if (i == nr) { - xlvbd_device_del(disk); - newdisks[i].device = 0; - } - } - for (i = 0; i < nr; i++) - if ( newdisks[i].device ) - 
xlvbd_device_add(&vbds_list, &newdisks[i]); - kfree(newdisks); -} - -/* - * xlvbd_update_vbds - reprobes the VBD status and performs updates driver - * state. The VBDs need to be updated in this way when the domain is - * initialised and also each time we receive an XLBLK_UPDATE event. - */ -void xlvbd_update_vbds(void) -{ - xlvbd_refresh(); -} - -/* - * Set up all the linux device goop for the virtual block devices - * (vbd's) that we know about. Note that although from the backend - * driver's p.o.v. VBDs are addressed simply an opaque 16-bit device - * number, the domain creation tools conventionally allocate these - * numbers to correspond to those used by 'real' linux -- this is just - * for convenience as it means e.g. that the same /etc/fstab can be - * used when booting with or without Xen. - */ -int xlvbd_init(void) -{ - int i, nr; - vdisk_t *disks; - - INIT_LIST_HEAD(&vbds_list); - - memset(major_info, 0, sizeof(major_info)); - - disks = xlvbd_probe(&nr); - if (disks == NULL) { - WPRINTK("failed to probe\n"); - return -1; - } - - for (i = 0; i < nr; i++) - xlvbd_device_add(&vbds_list, &disks[i]); - - kfree(disks); - return 0; -} +static struct xlbd_major_info * +xlbd_alloc_major_info(int major, int minor, int index) +{ + struct xlbd_major_info *ptr; + + ptr = kmalloc(sizeof(struct xlbd_major_info), GFP_KERNEL); + if (ptr == NULL) + return NULL; + + memset(ptr, 0, sizeof(struct xlbd_major_info)); + + ptr->major = major; + + switch (index) { + case XLBD_MAJOR_IDE_RANGE: + ptr->type = &xlbd_ide_type; + ptr->index = index - XLBD_MAJOR_IDE_START; + break; + case XLBD_MAJOR_SCSI_RANGE: + ptr->type = &xlbd_scsi_type; + ptr->index = index - XLBD_MAJOR_SCSI_START; + break; + case XLBD_MAJOR_VBD_RANGE: + ptr->type = &xlbd_vbd_type; + ptr->index = index - XLBD_MAJOR_VBD_START; + break; + } + + printk("Registering block device major %i\n", ptr->major); + if (register_blkdev(ptr->major, ptr->type->devname)) { + WPRINTK("can't get major %d with name %s\n", + 
ptr->major, ptr->type->devname); + kfree(ptr); + return NULL; + } + + devfs_mk_dir(ptr->type->devname); + major_info[index] = ptr; + return ptr; +} + +static struct xlbd_major_info * +xlbd_get_major_info(int vdevice) +{ + struct xlbd_major_info *mi; + int major, minor, index; + + major = BLKIF_MAJOR(vdevice); + minor = BLKIF_MINOR(vdevice); + + switch (major) { + case IDE0_MAJOR: index = 0; break; + case IDE1_MAJOR: index = 1; break; + case IDE2_MAJOR: index = 2; break; + case IDE3_MAJOR: index = 3; break; + case IDE4_MAJOR: index = 4; break; + case IDE5_MAJOR: index = 5; break; + case IDE6_MAJOR: index = 6; break; + case IDE7_MAJOR: index = 7; break; + case IDE8_MAJOR: index = 8; break; + case IDE9_MAJOR: index = 9; break; + case SCSI_DISK0_MAJOR: index = 10; break; + case SCSI_DISK1_MAJOR ... SCSI_DISK7_MAJOR: + index = 11 + major - SCSI_DISK1_MAJOR; + break; + case SCSI_CDROM_MAJOR: index = 18; break; + default: index = 19; break; + } + + mi = ((major_info[index] != NULL) ? major_info[index] : + xlbd_alloc_major_info(major, minor, index)); + mi->usage++; + return mi; +} + +static void +xlbd_put_major_info(struct xlbd_major_info *mi) +{ + mi->usage--; + /* XXX: release major if 0 */ +} + +static int +xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) +{ + request_queue_t *rq; + + rq = blk_init_queue(do_blkif_request, &blkif_io_lock); + if (rq == NULL) + return -1; + + elevator_init(rq, "noop"); + + /* Hard sector size and max sectors impersonate the equiv. hardware. */ + blk_queue_hardsect_size(rq, sector_size); + blk_queue_max_sectors(rq, 512); + + /* Each segment in a request is up to an aligned page in size. */ + blk_queue_segment_boundary(rq, PAGE_SIZE - 1); + blk_queue_max_segment_size(rq, PAGE_SIZE); + + /* Ensure a merged request will fit in a single I/O ring slot. 
*/ + blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); + blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); + + /* Make sure buffer addresses are sector-aligned. */ + blk_queue_dma_alignment(rq, 511); + + gd->queue = rq; + + return 0; +} + +static int +xlvbd_alloc_gendisk(int minor, blkif_sector_t capacity, int vdevice, + u16 vdisk_info, u16 sector_size, + struct blkfront_info *info) +{ + struct gendisk *gd; + struct xlbd_major_info *mi; + int nr_minors = 1; + int err = -ENODEV; + + mi = xlbd_get_major_info(vdevice); + if (mi == NULL) + goto out; + info->mi = mi; + + if ((minor & ((1 << mi->type->partn_shift) - 1)) == 0) + nr_minors = 1 << mi->type->partn_shift; + + gd = alloc_disk(nr_minors); + if (gd == NULL) + goto out; + + if (nr_minors > 1) + sprintf(gd->disk_name, "%s%c", mi->type->diskname, + 'a' + mi->index * mi->type->disks_per_major + + (minor >> mi->type->partn_shift)); + else + sprintf(gd->disk_name, "%s%c%d", mi->type->diskname, + 'a' + mi->index * mi->type->disks_per_major + + (minor >> mi->type->partn_shift), + minor & ((1 << mi->type->partn_shift) - 1)); + + gd->major = mi->major; + gd->first_minor = minor; + gd->fops = &xlvbd_block_fops; + gd->private_data = info; + set_capacity(gd, capacity); + + if (xlvbd_init_blk_queue(gd, sector_size)) { + del_gendisk(gd); + goto out; + } + + info->rq = gd->queue; + + if (vdisk_info & VDISK_READONLY) + set_disk_ro(gd, 1); + + if (vdisk_info & VDISK_REMOVABLE) + gd->flags |= GENHD_FL_REMOVABLE; + + if (vdisk_info & VDISK_CDROM) + gd->flags |= GENHD_FL_CD; + + add_disk(gd); + + return 0; + + out: + if (mi) + xlbd_put_major_info(mi); + return err; +} + +int +xlvbd_add(blkif_sector_t capacity, int vdevice, u16 vdisk_info, + u16 sector_size, struct blkfront_info *info) +{ + struct block_device *bd; + int err = 0; + + info->dev = MKDEV(BLKIF_MAJOR(vdevice), BLKIF_MINOR(vdevice)); + + bd = bdget(info->dev); + if (bd == NULL) + return -ENODEV; + + err = 
xlvbd_alloc_gendisk(BLKIF_MINOR(vdevice), capacity, vdevice, + vdisk_info, sector_size, info); + + bdput(bd); + return err; +} + +void +xlvbd_del(struct blkfront_info *info) +{ + struct block_device *bd; + struct gendisk *gd; + int unused; + request_queue_t *rq; + + bd = bdget(info->dev); + if (bd == NULL) + return; + + gd = get_gendisk(info->dev, &unused); + rq = gd->queue; + + del_gendisk(gd); + put_disk(gd); + xlbd_put_major_info(info->mi); + info->mi = NULL; + blk_cleanup_queue(rq); + + bdput(bd); +} diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c --- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.c Thu Aug 25 22:53:20 2005 @@ -23,6 +23,9 @@ blkif_be_driver_status_t be_st; printk(KERN_INFO "Initialising Xen block tap device\n"); +#ifdef CONFIG_XEN_BLKDEV_GRANT + printk(KERN_INFO "Block tap is using grant tables.\n"); +#endif DPRINTK(" tap - Backend connection init:\n"); diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/blktap/blktap.h --- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap.h Thu Aug 25 22:53:20 2005 @@ -71,7 +71,6 @@ /* Physical parameters of the comms window. */ unsigned long shmem_frame; unsigned int evtchn; - int irq; /* Comms information. 
*/ blkif_back_ring_t blk_ring; @@ -86,6 +85,11 @@ spinlock_t blk_ring_lock; atomic_t refcnt; struct work_struct work; +#ifdef CONFIG_XEN_BLKDEV_GRANT + u16 shmem_handle; + unsigned long shmem_vaddr; + grant_ref_t shmem_ref; +#endif } blkif_t; blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle); @@ -104,8 +108,6 @@ blkif_t *blkif; unsigned long id; int nr_pages; - unsigned long mach_fas[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - unsigned long virt_fas[BLKIF_MAX_SEGMENTS_PER_REQUEST]; int next_free; } active_req_t; @@ -173,32 +175,7 @@ /* -------[ Mappings to User VMA ]------------------------------------ */ -#define MAX_PENDING_REQS 64 #define BATCH_PER_DOMAIN 16 -extern struct vm_area_struct *blktap_vma; - -/* The following are from blkback.c and should probably be put in a - * header and included from there. - * The mmap area described here is where attached data pages eill be mapped. - */ - -extern unsigned long mmap_vstart; -#define MMAP_PAGES_PER_REQUEST \ - (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1) -#define MMAP_PAGES \ - (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST) -#define MMAP_VADDR(_req,_seg) \ - (mmap_vstart + \ - ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \ - ((_seg) * PAGE_SIZE)) - -/* immediately before the mmap area, we have a bunch of pages reserved - * for shared memory rings. 
- */ - -#define RING_PAGES 3 /* Ctrl, Front, and Back */ -extern unsigned long rings_vstart; - /* -------[ Here be globals ]----------------------------------------- */ extern unsigned long blktap_mode; diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/blktap/blktap_controlmsg.c --- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_controlmsg.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_controlmsg.c Thu Aug 25 22:53:20 2005 @@ -9,6 +9,7 @@ */ #include "blktap.h" +#include <asm-xen/evtchn.h> static char *blkif_state_name[] = { [BLKIF_STATE_CLOSED] = "closed", @@ -16,16 +17,15 @@ [BLKIF_STATE_CONNECTED] = "connected", }; -static char * blkif_status_name[] = { +static char *blkif_status_name[] = { [BLKIF_INTERFACE_STATUS_CLOSED] = "closed", [BLKIF_INTERFACE_STATUS_DISCONNECTED] = "disconnected", [BLKIF_INTERFACE_STATUS_CONNECTED] = "connected", [BLKIF_INTERFACE_STATUS_CHANGED] = "changed", }; -static unsigned blktap_be_irq; -unsigned int blktap_be_state = BLKIF_STATE_CLOSED; -unsigned int blktap_be_evtchn; +unsigned int blktap_be_state = BLKIF_STATE_CLOSED; +unsigned int blktap_be_evtchn; /*-----[ Control Messages to/from Frontend VMs ]--------------------------*/ @@ -49,13 +49,21 @@ blkif_t *blkif = (blkif_t *)arg; ctrl_msg_t cmsg; blkif_be_disconnect_t disc; +#ifdef CONFIG_XEN_BLKDEV_GRANT + struct gnttab_unmap_grant_ref op; +#endif /* * These can't be done in blkif_disconnect() because at that point there * may be outstanding requests at the disc whose asynchronous responses * must still be notified to the remote driver. */ - unbind_evtchn_from_irq(blkif->evtchn); +#ifdef CONFIG_XEN_BLKDEV_GRANT + op.host_addr = blkif->shmem_vaddr; + op.handle = blkif->shmem_handle; + op.dev_bus_addr = 0; + BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)); +#endif vfree(blkif->blk_ring.sring); /* Construct the deferred response message. 
*/ @@ -179,8 +187,12 @@ unsigned int evtchn = connect->evtchn; unsigned long shmem_frame = connect->shmem_frame; struct vm_struct *vma; +#ifdef CONFIG_XEN_BLKDEV_GRANT + int ref = connect->shmem_ref; +#else pgprot_t prot; int error; +#endif blkif_t *blkif; blkif_sring_t *sring; @@ -201,24 +213,46 @@ return; } - prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED); +#ifndef CONFIG_XEN_BLKDEV_GRANT + prot = __pgprot(_KERNPG_TABLE); error = direct_remap_area_pages(&init_mm, VMALLOC_VMADDR(vma->addr), shmem_frame<<PAGE_SHIFT, PAGE_SIZE, prot, domid); if ( error != 0 ) { - WPRINTK("BE_CONNECT: error! (%d)\n", error); if ( error == -ENOMEM ) connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; - else if ( error == -EFAULT ) { + else if ( error == -EFAULT ) connect->status = BLKIF_BE_STATUS_MAPPING_ERROR; - WPRINTK("BE_CONNECT: MAPPING error!\n"); - } else connect->status = BLKIF_BE_STATUS_ERROR; vfree(vma->addr); return; } +#else + { /* Map: Use the Grant table reference */ + struct gnttab_map_grant_ref op; + op.host_addr = VMALLOC_VMADDR(vma->addr); + op.flags = GNTMAP_host_map; + op.ref = ref; + op.dom = domid; + + BUG_ON( HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1) ); + + handle = op.handle; + + if (op.handle < 0) { + DPRINTK(" Grant table operation failure !\n"); + connect->status = BLKIF_BE_STATUS_MAPPING_ERROR; + vfree(vma->addr); + return; + } + + blkif->shmem_ref = ref; + blkif->shmem_handle = handle; + blkif->shmem_vaddr = VMALLOC_VMADDR(vma->addr); + } +#endif if ( blkif->status != DISCONNECTED ) { @@ -232,12 +266,12 @@ BACK_RING_INIT(&blkif->blk_ring, sring, PAGE_SIZE); blkif->evtchn = evtchn; - blkif->irq = bind_evtchn_to_irq(evtchn); blkif->shmem_frame = shmem_frame; blkif->status = CONNECTED; blkif_get(blkif); - request_irq(blkif->irq, blkif_ptfe_int, 0, "blkif-pt-backend", blkif); + bind_evtchn_to_irqhandler( + evtchn, blkif_ptfe_int, 0, "blkif-pt-backend", blkif); connect->status = BLKIF_BE_STATUS_OKAY; } @@ -264,7 +298,7 @@ 
blkif->status = DISCONNECTING; blkif->disconnect_rspid = rsp_id; wmb(); /* Let other CPUs see the status change. */ - free_irq(blkif->irq, blkif); + unbind_evtchn_from_irqhandler(blkif->evtchn, blkif); blkif_deschedule(blkif); blkif_put(blkif); return 0; /* Caller should not send response message. */ @@ -286,7 +320,7 @@ }; blkif_fe_interface_connect_t *msg = (void*)cmsg.msg; msg->handle = 0; - msg->shmem_frame = virt_to_machine(blktap_be_ring.sring) >> PAGE_SHIFT; + msg->shmem_frame = virt_to_mfn(blktap_be_ring.sring); ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); } @@ -313,12 +347,11 @@ int err = 0; blktap_be_evtchn = status->evtchn; - blktap_be_irq = bind_evtchn_to_irq(blktap_be_evtchn); - - err = request_irq(blktap_be_irq, blkif_ptbe_int, - SA_SAMPLE_RANDOM, "blkif", NULL); + + err = bind_evtchn_to_irqhandler( + blktap_be_evtchn, blkif_ptbe_int, SA_SAMPLE_RANDOM, "blkif", NULL); if ( err ) { - WPRINTK("blkfront request_irq failed (%d)\n", err); + WPRINTK("blkfront bind_evtchn_to_irqhandler failed (%d)\n", err); return; } else { /* transtion to connected in case we need to do a diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/blktap/blktap_datapath.c --- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_datapath.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_datapath.c Thu Aug 25 22:53:20 2005 @@ -280,8 +280,6 @@ int more_to_do = 0; int notify_be = 0, notify_user = 0; - if (NR_ACTIVE_REQS == MAX_ACTIVE_REQS) return 1; - /* lock both rings */ spin_lock_irqsave(&blkif_io_lock, flags); diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/blktap/blktap_userdev.c --- a/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_userdev.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blktap/blktap_userdev.c Thu Aug 25 22:53:20 2005 @@ -5,7 +5,6 @@ * Control interface between the driver and a character device. 
* * Copyright (c) 2004, Andrew Warfield - * */ #include <linux/config.h> @@ -19,7 +18,11 @@ #include <linux/gfp.h> #include <linux/poll.h> #include <asm/pgalloc.h> +#include <asm/tlbflush.h> #include <asm-xen/xen-public/io/blkif.h> /* for control ring. */ +#ifdef CONFIG_XEN_BLKDEV_GRANT +#include <asm-xen/xen-public/grant_table.h> +#endif #include "blktap.h" @@ -32,11 +35,6 @@ /* for poll: */ static wait_queue_head_t blktap_wait; - -/* Where things are inside the device mapping. */ -struct vm_area_struct *blktap_vma = NULL; -unsigned long mmap_vstart; -unsigned long rings_vstart; /* Rings up to user space. */ static blkif_front_ring_t blktap_ufe_ring; @@ -47,6 +45,61 @@ static int blktap_read_fe_ring(void); static int blktap_read_be_ring(void); + +/* -------[ mmap region ]--------------------------------------------- */ +/* + * We use a big chunk of address space to map in-flight requests into, + * and export this region up to user-space. See the comments in blkback + * about this -- the two must be kept in sync if the tap is used as a + * passthrough. + */ + +#define MAX_PENDING_REQS 64 + +/* immediately before the mmap area, we have a bunch of pages reserved + * for shared memory rings. + */ +#define RING_PAGES 3 /* Ctrl, Front, and Back */ + +/* Where things are inside the device mapping. */ +struct vm_area_struct *blktap_vma = NULL; +unsigned long mmap_vstart; /* Kernel pages for mapping in data. 
*/ +unsigned long rings_vstart; /* start of mmaped vma */ +unsigned long user_vstart; /* start of user mappings */ + +#define MMAP_PAGES_PER_REQUEST \ + (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1) +#define MMAP_PAGES \ + (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST) +#define MMAP_VADDR(_start, _req,_seg) \ + ( _start + \ + ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \ + ((_seg) * PAGE_SIZE)) + +/* -------[ grant handles ]------------------------------------------- */ + +#ifdef CONFIG_XEN_BLKDEV_GRANT +/* When using grant tables to map a frame for device access then the + * handle returned must be used to unmap the frame. This is needed to + * drop the ref count on the frame. + */ +struct grant_handle_pair +{ + u16 kernel; + u16 user; +}; +static struct grant_handle_pair pending_grant_handles[MMAP_PAGES]; +#define pending_handle(_idx, _i) \ + (pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)]) +#define BLKTAP_INVALID_HANDLE(_g) \ + (((_g->kernel) == 0xFFFF) && ((_g->user) == 0xFFFF)) +#define BLKTAP_INVALIDATE_HANDLE(_g) do { \ + (_g)->kernel = 0xFFFF; (_g)->user = 0xFFFF; \ + } while(0) + +#endif + + /* -------[ blktap vm ops ]------------------------------------------- */ static struct page *blktap_nopage(struct vm_area_struct *vma, @@ -76,8 +129,6 @@ if ( test_and_set_bit(0, &blktap_dev_inuse) ) return -EBUSY; - - printk(KERN_ALERT "blktap open.\n"); /* Allocate the ctrl ring. */ csring = (ctrl_sring_t *)get_zeroed_page(GFP_KERNEL); @@ -128,7 +179,7 @@ blktap_dev_inuse = 0; blktap_ring_ok = 0; - printk(KERN_ALERT "blktap closed.\n"); + DPRINTK(KERN_ALERT "blktap closed.\n"); /* Free the ring page. */ ClearPageReserved(virt_to_page(blktap_uctrl_ring.sring)); @@ -140,7 +191,7 @@ ClearPageReserved(virt_to_page(blktap_ube_ring.sring)); free_page((unsigned long) blktap_ube_ring.sring); - /* Clear any active mappings. 
*/ + /* Clear any active mappings and free foreign map table */ if (blktap_vma != NULL) { zap_page_range(blktap_vma, blktap_vma->vm_start, blktap_vma->vm_end - blktap_vma->vm_start, NULL); @@ -151,21 +202,36 @@ } /* Note on mmap: - * remap_pfn_range sets VM_IO on vma->vm_flags. In trying to make libaio - * work to do direct page access from userspace, this ended up being a - * problem. The bigger issue seems to be that there is no way to map - * a foreign page in to user space and have the virtual address of that - * page map sanely down to a mfn. - * Removing the VM_IO flag results in a loop in get_user_pages, as - * pfn_valid() always fails on a foreign page. + * We need to map pages to user space in a way that will allow the block + * subsystem set up direct IO to them. This couldn't be done before, because + * there isn't really a sane way to make a user virtual address down to a + * physical address when the page belongs to another domain. + * + * My first approach was to map the page in to kernel memory, add an entry + * for it in the physical frame list (using alloc_lomem_region as in blkback) + * and then attempt to map that page up to user space. This is disallowed + * by xen though, which realizes that we don't really own the machine frame + * underlying the physical page. + * + * The new approach is to provide explicit support for this in xen linux. + * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages + * mapped from other vms. vma->vm_private_data is set up as a mapping + * from pages to actual page structs. There is a new clause in get_user_pages + * that does the right thing for this sort of mapping. + * + * blktap_mmap sets up this mapping. Most of the real work is done in + * blktap_write_fe_ring below. 
*/ static int blktap_mmap(struct file *filp, struct vm_area_struct *vma) { int size; - - printk(KERN_ALERT "blktap mmap (%lx, %lx)\n", + struct page **map; + int i; + + DPRINTK(KERN_ALERT "blktap mmap (%lx, %lx)\n", vma->vm_start, vma->vm_end); + vma->vm_flags |= VM_RESERVED; vma->vm_ops = &blktap_vm_ops; size = vma->vm_end - vma->vm_start; @@ -177,10 +243,10 @@ } size >>= PAGE_SHIFT; - printk(KERN_INFO "blktap: 2 rings + %d pages.\n", size-1); + DPRINTK(KERN_INFO "blktap: 2 rings + %d pages.\n", size-1); rings_vstart = vma->vm_start; - mmap_vstart = rings_vstart + (RING_PAGES << PAGE_SHIFT); + user_vstart = rings_vstart + (RING_PAGES << PAGE_SHIFT); /* Map the ring pages to the start of the region and reserve it. */ @@ -190,29 +256,44 @@ DPRINTK("Mapping ctrl_ring page %lx.\n", __pa(blktap_uctrl_ring.sring)); if (remap_pfn_range(vma, vma->vm_start, __pa(blktap_uctrl_ring.sring) >> PAGE_SHIFT, - PAGE_SIZE, vma->vm_page_prot)) { - WPRINTK("ctrl_ring: remap_pfn_range failure!\n"); - } + PAGE_SIZE, vma->vm_page_prot)) + goto fail; DPRINTK("Mapping be_ring page %lx.\n", __pa(blktap_ube_ring.sring)); if (remap_pfn_range(vma, vma->vm_start + PAGE_SIZE, __pa(blktap_ube_ring.sring) >> PAGE_SHIFT, - PAGE_SIZE, vma->vm_page_prot)) { - WPRINTK("be_ring: remap_pfn_range failure!\n"); - } + PAGE_SIZE, vma->vm_page_prot)) + goto fail; DPRINTK("Mapping fe_ring page %lx.\n", __pa(blktap_ufe_ring.sring)); if (remap_pfn_range(vma, vma->vm_start + ( 2 * PAGE_SIZE ), __pa(blktap_ufe_ring.sring) >> PAGE_SHIFT, - PAGE_SIZE, vma->vm_page_prot)) { - WPRINTK("fe_ring: remap_pfn_range failure!\n"); - } - + PAGE_SIZE, vma->vm_page_prot)) + goto fail; + + /* Mark this VM as containing foreign pages, and set up mappings. 
*/ + map = kmalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) + * sizeof(struct page_struct*), + GFP_KERNEL); + if (map == NULL) goto fail; + + for (i=0; i<((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++) + map[i] = NULL; + + vma->vm_private_data = map; + vma->vm_flags |= VM_FOREIGN; + blktap_vma = vma; blktap_ring_ok = 1; return 0; + fail: + /* Clear any active mappings. */ + zap_page_range(vma, vma->vm_start, + vma->vm_end - vma->vm_start, NULL); + + return -ENOMEM; } static int blktap_ioctl(struct inode *inode, struct file *filp, @@ -263,6 +344,8 @@ RING_HAS_UNPUSHED_REQUESTS(&blktap_ufe_ring) || RING_HAS_UNPUSHED_RESPONSES(&blktap_ube_ring) ) { + flush_tlb_all(); + RING_PUSH_REQUESTS(&blktap_uctrl_ring); RING_PUSH_REQUESTS(&blktap_ufe_ring); RING_PUSH_RESPONSES(&blktap_ube_ring); @@ -289,11 +372,71 @@ /*-----[ Data to/from user space ]----------------------------------------*/ +static void fast_flush_area(int idx, int nr_pages) +{ +#ifdef CONFIG_XEN_BLKDEV_GRANT + struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2]; + unsigned int i, op = 0; + struct grant_handle_pair *handle; + unsigned long ptep; + + for (i=0; i<nr_pages; i++) + { + handle = &pending_handle(idx, i); + if (!BLKTAP_INVALID_HANDLE(handle)) + { + + unmap[op].host_addr = MMAP_VADDR(mmap_vstart, idx, i); + unmap[op].dev_bus_addr = 0; + unmap[op].handle = handle->kernel; + op++; + + if (create_lookup_pte_addr(blktap_vma->vm_mm, + MMAP_VADDR(user_vstart, idx, i), + &ptep) !=0) { + DPRINTK("Couldn't get a pte addr!\n"); + return; + } + unmap[op].host_addr = ptep; + unmap[op].dev_bus_addr = 0; + unmap[op].handle = handle->user; + op++; + + BLKTAP_INVALIDATE_HANDLE(handle); + } + } + if ( unlikely(HYPERVISOR_grant_table_op( + GNTTABOP_unmap_grant_ref, unmap, op))) + BUG(); +#else + multicall_entry_t mcl[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + int i; + + for ( i = 0; i < nr_pages; i++ ) + { + MULTI_update_va_mapping(mcl+i, MMAP_VADDR(mmap_vstart, idx, i), + __pte(0), 0); + } + + 
mcl[nr_pages-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL; + if ( unlikely(HYPERVISOR_multicall(mcl, nr_pages) != 0) ) + BUG(); +#endif +} + int blktap_write_fe_ring(blkif_request_t *req) { blkif_request_t *target; - int error, i; + int i, ret = 0; +#ifdef CONFIG_XEN_BLKDEV_GRANT + struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2]; + int op; +#else + unsigned long remap_prot; + multicall_entry_t mcl[BLKIF_MAX_SEGMENTS_PER_REQUEST+1]; + mmu_update_t mmu[BLKIF_MAX_SEGMENTS_PER_REQUEST]; +#endif /* * This is called to pass a request from the real frontend domain's @@ -310,29 +453,184 @@ return 0; } - target = RING_GET_REQUEST(&blktap_ufe_ring, - blktap_ufe_ring.req_prod_pvt); + flush_cache_all(); /* a noop on intel... */ + + target = RING_GET_REQUEST(&blktap_ufe_ring, blktap_ufe_ring.req_prod_pvt); memcpy(target, req, sizeof(*req)); - /* Attempt to map the foreign pages directly in to the application */ + /* Map the foreign pages directly in to the application */ +#ifdef CONFIG_XEN_BLKDEV_GRANT + op = 0; for (i=0; i<target->nr_segments; i++) { - error = direct_remap_area_pages(blktap_vma->vm_mm, - MMAP_VADDR(ID_TO_IDX(req->id), i), - target->frame_and_sects[i] & PAGE_MASK, - PAGE_SIZE, - blktap_vma->vm_page_prot, - ID_TO_DOM(req->id)); - if ( error != 0 ) { - printk(KERN_INFO "remapping attached page failed! (%d)\n", error); - /* the request is now dropped on the floor. */ - return 0; - } - } - + unsigned long uvaddr; + unsigned long kvaddr; + unsigned long ptep; + + uvaddr = MMAP_VADDR(user_vstart, ID_TO_IDX(req->id), i); + kvaddr = MMAP_VADDR(mmap_vstart, ID_TO_IDX(req->id), i); + + /* Map the remote page to kernel. 
*/ + map[op].host_addr = kvaddr; + map[op].dom = ID_TO_DOM(req->id); + map[op].ref = blkif_gref_from_fas(target->frame_and_sects[i]); + map[op].flags = GNTMAP_host_map; + /* This needs a bit more thought in terms of interposition: + * If we want to be able to modify pages during write using + * grant table mappings, the guest will either need to allow + * it, or we'll need to incur a copy. */ + if (req->operation == BLKIF_OP_WRITE) + map[op].flags |= GNTMAP_readonly; + op++; + + /* Now map it to user. */ + ret = create_lookup_pte_addr(blktap_vma->vm_mm, uvaddr, &ptep); + if (ret) + { + DPRINTK("Couldn't get a pte addr!\n"); + goto fail; + } + + map[op].host_addr = ptep; + map[op].dom = ID_TO_DOM(req->id); + map[op].ref = blkif_gref_from_fas(target->frame_and_sects[i]); + map[op].flags = GNTMAP_host_map | GNTMAP_application_map + | GNTMAP_contains_pte; + /* Above interposition comment applies here as well. */ + if (req->operation == BLKIF_OP_WRITE) + map[op].flags |= GNTMAP_readonly; + op++; + } + + if ( unlikely(HYPERVISOR_grant_table_op( + GNTTABOP_map_grant_ref, map, op))) + BUG(); + + op = 0; + for (i=0; i<(target->nr_segments*2); i+=2) { + unsigned long uvaddr; + unsigned long kvaddr; + unsigned long offset; + int cancel = 0; + + uvaddr = MMAP_VADDR(user_vstart, ID_TO_IDX(req->id), i/2); + kvaddr = MMAP_VADDR(mmap_vstart, ID_TO_IDX(req->id), i/2); + + if ( unlikely(map[i].handle < 0) ) { + DPRINTK("Error on kernel grant mapping (%d)\n", map[i].handle); + ret = map[i].handle; + cancel = 1; + } + + if ( unlikely(map[i+1].handle < 0) ) { + DPRINTK("Error on user grant mapping (%d)\n", map[i+1].handle); + ret = map[i+1].handle; + cancel = 1; + } + + if (cancel) + goto fail; + + /* Set the necessary mappings in p2m and in the VM_FOREIGN + * vm_area_struct to allow user vaddr -> struct page lookups + * to work. This is needed for direct IO to foreign pages. 
*/ + phys_to_machine_mapping[__pa(kvaddr) >> PAGE_SHIFT] = + FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT); + + offset = (uvaddr - blktap_vma->vm_start) >> PAGE_SHIFT; + ((struct page **)blktap_vma->vm_private_data)[offset] = + pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); + + /* Save handles for unmapping later. */ + pending_handle(ID_TO_IDX(req->id), i/2).kernel = map[i].handle; + pending_handle(ID_TO_IDX(req->id), i/2).user = map[i+1].handle; + } + +#else + + remap_prot = _PAGE_PRESENT|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW; + + for (i=0; i<target->nr_segments; i++) { + unsigned long buf; + unsigned long uvaddr; + unsigned long kvaddr; + unsigned long offset; + unsigned long ptep; + + buf = target->frame_and_sects[i] & PAGE_MASK; + uvaddr = MMAP_VADDR(user_vstart, ID_TO_IDX(req->id), i); + kvaddr = MMAP_VADDR(mmap_vstart, ID_TO_IDX(req->id), i); + + MULTI_update_va_mapping_otherdomain( + mcl+i, + kvaddr, + pfn_pte_ma(buf >> PAGE_SHIFT, __pgprot(remap_prot)), + 0, + ID_TO_DOM(req->id)); + + phys_to_machine_mapping[__pa(kvaddr)>>PAGE_SHIFT] = + FOREIGN_FRAME(buf >> PAGE_SHIFT); + + ret = create_lookup_pte_addr(blktap_vma->vm_mm, uvaddr, &ptep); + if (ret) + { + DPRINTK("error getting pte\n"); + goto fail; + } + + mmu[i].ptr = ptep; + mmu[i].val = (target->frame_and_sects[i] & PAGE_MASK) + | pgprot_val(blktap_vma->vm_page_prot); + + offset = (uvaddr - blktap_vma->vm_start) >> PAGE_SHIFT; + ((struct page **)blktap_vma->vm_private_data)[offset] = + pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); + } + + /* Add the mmu_update call. */ + mcl[i].op = __HYPERVISOR_mmu_update; + mcl[i].args[0] = (unsigned long)mmu; + mcl[i].args[1] = target->nr_segments; + mcl[i].args[2] = 0; + mcl[i].args[3] = ID_TO_DOM(req->id); + + BUG_ON(HYPERVISOR_multicall(mcl, target->nr_segments+1) != 0); + + /* Make sure it all worked. 
*/ + for ( i = 0; i < target->nr_segments; i++ ) + { + if ( unlikely(mcl[i].result != 0) ) + { + DPRINTK("invalid buffer -- could not remap it\n"); + ret = mcl[i].result; + goto fail; + } + } + if ( unlikely(mcl[i].result != 0) ) + { + DPRINTK("direct remapping of pages to /dev/blktap failed.\n"); + ret = mcl[i].result; + goto fail; + } +#endif /* CONFIG_XEN_BLKDEV_GRANT */ + + /* Mark mapped pages as reserved: */ + for ( i = 0; i < target->nr_segments; i++ ) + { + unsigned long kvaddr; + + kvaddr = MMAP_VADDR(mmap_vstart, ID_TO_IDX(req->id), i); + SetPageReserved(pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT)); + } + + blktap_ufe_ring.req_prod_pvt++; return 0; + + fail: + fast_flush_area(ID_TO_IDX(req->id), target->nr_segments); + return ret; } int blktap_write_be_ring(blkif_response_t *rsp) @@ -366,7 +664,7 @@ { /* This is called to read responses from the UFE ring. */ - RING_IDX i, rp; + RING_IDX i, j, rp; blkif_response_t *resp_s; blkif_t *blkif; active_req_t *ar; @@ -387,7 +685,21 @@ DPRINTK("resp->fe_ring\n"); ar = lookup_active_req(ID_TO_IDX(resp_s->id)); blkif = ar->blkif; - zap_page_range(blktap_vma, MMAP_VADDR(ID_TO_IDX(resp_s->id), 0), + for (j = 0; j < ar->nr_pages; j++) { + unsigned long vaddr; + struct page **map = blktap_vma->vm_private_data; + int offset; + + vaddr = MMAP_VADDR(user_vstart, ID_TO_IDX(resp_s->id), j); + offset = (vaddr - blktap_vma->vm_start) >> PAGE_SHIFT; + + ClearPageReserved(virt_to_page(vaddr)); + map[offset] = NULL; + } + + fast_flush_area(ID_TO_IDX(resp_s->id), ar->nr_pages); + zap_page_range(blktap_vma, + MMAP_VADDR(user_vstart, ID_TO_IDX(resp_s->id), 0), ar->nr_pages << PAGE_SHIFT, NULL); write_resp_to_fe_ring(blkif, resp_s); blktap_ufe_ring.rsp_cons = i + 1; @@ -462,7 +774,18 @@ int blktap_init(void) { - int err; + int err, i, j; + struct page *page; + + page = balloon_alloc_empty_page_range(MMAP_PAGES); + BUG_ON(page == NULL); + mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); + +#ifdef CONFIG_XEN_BLKDEV_GRANT + for 
(i=0; i<MAX_PENDING_REQS ; i++) + for (j=0; j<BLKIF_MAX_SEGMENTS_PER_REQUEST; j++) + BLKTAP_INVALIDATE_HANDLE(&pending_handle(i, j)); +#endif err = misc_register(&blktap_miscdev); if ( err != 0 ) diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/console/console.c --- a/linux-2.6-xen-sparse/drivers/xen/console/console.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/console/console.c Thu Aug 25 22:53:20 2005 @@ -240,7 +240,11 @@ #endif /*** Useful function for console debugging -- goes straight to Xen. ***/ +#ifdef CONFIG_XEN_PRIVILEGED_GUEST asmlinkage int xprintk(const char *fmt, ...) +#else +asmlinkage int xprintk(const char *fmt, ...) +#endif { va_list args; int printk_len; diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/netback/Makefile --- a/linux-2.6-xen-sparse/drivers/xen/netback/Makefile Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/netback/Makefile Thu Aug 25 22:53:20 2005 @@ -1,2 +1,2 @@ -obj-y := netback.o control.o interface.o loopback.o +obj-y := netback.o xenbus.o interface.o loopback.o diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/netback/common.h --- a/linux-2.6-xen-sparse/drivers/xen/netback/common.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/netback/common.h Thu Aug 25 22:53:20 2005 @@ -15,9 +15,17 @@ #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <asm-xen/ctrl_if.h> +#include <asm-xen/evtchn.h> #include <asm-xen/xen-public/io/netif.h> #include <asm/io.h> #include <asm/pgalloc.h> + +#if defined(CONFIG_XEN_NETDEV_GRANT_TX) || defined(CONFIG_XEN_NETDEV_GRANT_RX) +#include <asm-xen/xen-public/grant_table.h> +#include <asm-xen/gnttab.h> +#endif + + #if 0 #define ASSERT(_p) \ @@ -39,9 +47,19 @@ /* Physical parameters of the comms window. 
*/ unsigned long tx_shmem_frame; +#ifdef CONFIG_XEN_NETDEV_GRANT_TX + u16 tx_shmem_handle; + unsigned long tx_shmem_vaddr; + grant_ref_t tx_shmem_ref; +#endif unsigned long rx_shmem_frame; +#ifdef CONFIG_XEN_NETDEV_GRANT_RX + u16 rx_shmem_handle; + unsigned long rx_shmem_vaddr; + grant_ref_t rx_shmem_ref; +#endif unsigned int evtchn; - int irq; + unsigned int remote_evtchn; /* The shared rings and indexes. */ netif_tx_interface_t *tx; @@ -65,36 +83,30 @@ /* Miscellaneous private stuff. */ enum { DISCONNECTED, DISCONNECTING, CONNECTED } status; int active; - /* - * DISCONNECT response is deferred until pending requests are ack'ed. - * We therefore need to store the id from the original request. - */ - u8 disconnect_rspid; - struct netif_st *hash_next; struct list_head list; /* scheduling list */ atomic_t refcnt; struct net_device *dev; struct net_device_stats stats; - struct work_struct work; + struct work_struct free_work; } netif_t; -void netif_create(netif_be_create_t *create); -void netif_destroy(netif_be_destroy_t *destroy); -void netif_creditlimit(netif_be_creditlimit_t *creditlimit); -void netif_connect(netif_be_connect_t *connect); -int netif_disconnect(netif_be_disconnect_t *disconnect, u8 rsp_id); -void netif_disconnect_complete(netif_t *netif); -netif_t *netif_find_by_handle(domid_t domid, unsigned int handle); +void netif_creditlimit(netif_t *netif); +int netif_disconnect(netif_t *netif); + +netif_t *alloc_netif(domid_t domid, unsigned int handle, u8 be_mac[ETH_ALEN]); +void free_netif_callback(netif_t *netif); +int netif_map(netif_t *netif, unsigned long tx_ring_ref, + unsigned long rx_ring_ref, unsigned int evtchn); + #define netif_get(_b) (atomic_inc(&(_b)->refcnt)) #define netif_put(_b) \ do { \ if ( atomic_dec_and_test(&(_b)->refcnt) ) \ - netif_disconnect_complete(_b); \ + free_netif_callback(_b); \ } while (0) -void netif_interface_init(void); -void netif_ctrlif_init(void); +void netif_xenbus_init(void); void netif_schedule_work(netif_t *netif); 
void netif_deschedule_work(netif_t *netif); diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/netback/interface.c --- a/linux-2.6-xen-sparse/drivers/xen/netback/interface.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/netback/interface.c Thu Aug 25 22:53:20 2005 @@ -9,31 +9,14 @@ #include "common.h" #include <linux/rtnetlink.h> -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) -#define VMALLOC_VMADDR(x) ((unsigned long)(x)) -#endif - -#define NETIF_HASHSZ 1024 -#define NETIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(NETIF_HASHSZ-1)) - -static netif_t *netif_hash[NETIF_HASHSZ]; - -netif_t *netif_find_by_handle(domid_t domid, unsigned int handle) -{ - netif_t *netif = netif_hash[NETIF_HASH(domid, handle)]; - while ( (netif != NULL) && - ((netif->domid != domid) || (netif->handle != handle)) ) - netif = netif->hash_next; - return netif; -} - static void __netif_up(netif_t *netif) { struct net_device *dev = netif->dev; spin_lock_bh(&dev->xmit_lock); netif->active = 1; spin_unlock_bh(&dev->xmit_lock); - (void)request_irq(netif->irq, netif_be_int, 0, dev->name, netif); + (void)bind_evtchn_to_irqhandler( + netif->evtchn, netif_be_int, 0, dev->name, netif); netif_schedule_work(netif); } @@ -43,14 +26,14 @@ spin_lock_bh(&dev->xmit_lock); netif->active = 0; spin_unlock_bh(&dev->xmit_lock); - free_irq(netif->irq, netif); + unbind_evtchn_from_irqhandler(netif->evtchn, netif); netif_deschedule_work(netif); } static int net_open(struct net_device *dev) { netif_t *netif = netdev_priv(dev); - if ( netif->status == CONNECTED ) + if (netif->status == CONNECTED) __netif_up(netif); netif_start_queue(dev); return 0; @@ -60,74 +43,23 @@ { netif_t *netif = netdev_priv(dev); netif_stop_queue(dev); - if ( netif->status == CONNECTED ) + if (netif->status == CONNECTED) __netif_down(netif); return 0; } -static void __netif_disconnect_complete(void *arg) -{ - netif_t *netif = (netif_t *)arg; - ctrl_msg_t cmsg; - netif_be_disconnect_t disc; - - /* - * These can't 
be done in netif_disconnect() because at that point there - * may be outstanding requests in the network stack whose asynchronous - * responses must still be notified to the remote driver. - */ - unbind_evtchn_from_irq(netif->evtchn); - vfree(netif->tx); /* Frees netif->rx as well. */ - - /* Construct the deferred response message. */ - cmsg.type = CMSG_NETIF_BE; - cmsg.subtype = CMSG_NETIF_BE_DISCONNECT; - cmsg.id = netif->disconnect_rspid; - cmsg.length = sizeof(netif_be_disconnect_t); - disc.domid = netif->domid; - disc.netif_handle = netif->handle; - disc.status = NETIF_BE_STATUS_OKAY; - memcpy(cmsg.msg, &disc, sizeof(disc)); - - /* - * Make sure message is constructed /before/ status change, because - * after the status change the 'netif' structure could be deallocated at - * any time. Also make sure we send the response /after/ status change, - * as otherwise a subsequent CONNECT request could spuriously fail if - * another CPU doesn't see the status change yet. - */ - mb(); - if ( netif->status != DISCONNECTING ) - BUG(); - netif->status = DISCONNECTED; - mb(); - - /* Send the successful response. 
*/ - ctrl_if_send_response(&cmsg); -} - -void netif_disconnect_complete(netif_t *netif) -{ - INIT_WORK(&netif->work, __netif_disconnect_complete, (void *)netif); - schedule_work(&netif->work); -} - -void netif_create(netif_be_create_t *create) -{ - int err = 0; - domid_t domid = create->domid; - unsigned int handle = create->netif_handle; +netif_t *alloc_netif(domid_t domid, unsigned int handle, u8 be_mac[ETH_ALEN]) +{ + int err = 0, i; struct net_device *dev; - netif_t **pnetif, *netif; - char name[IFNAMSIZ] = {}; + netif_t *netif; + char name[IFNAMSIZ] = {}; snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle); dev = alloc_netdev(sizeof(netif_t), name, ether_setup); - if ( dev == NULL ) - { + if (dev == NULL) { DPRINTK("Could not create netif: out of memory\n"); - create->status = NETIF_BE_STATUS_OUT_OF_MEMORY; - return; + return NULL; } netif = netdev_priv(dev); @@ -142,19 +74,6 @@ netif->credit_usec = 0UL; init_timer(&netif->credit_timeout); - pnetif = &netif_hash[NETIF_HASH(domid, handle)]; - while ( *pnetif != NULL ) - { - if ( ((*pnetif)->domid == domid) && ((*pnetif)->handle == handle) ) - { - DPRINTK("Could not create netif: already exists\n"); - create->status = NETIF_BE_STATUS_INTERFACE_EXISTS; - free_netdev(dev); - return; - } - pnetif = &(*pnetif)->hash_next; - } - dev->hard_start_xmit = netif_be_start_xmit; dev->get_stats = netif_be_get_stats; dev->open = net_open; @@ -164,10 +83,10 @@ /* Disable queuing. */ dev->tx_queue_len = 0; - if ( (create->be_mac[0] == 0) && (create->be_mac[1] == 0) && - (create->be_mac[2] == 0) && (create->be_mac[3] == 0) && - (create->be_mac[4] == 0) && (create->be_mac[5] == 0) ) - { + for (i = 0; i < ETH_ALEN; i++) + if (be_mac[i] != 0) + break; + if (i == ETH_ALEN) { /* * Initialise a dummy MAC address. 
We choose the numerically largest * non-broadcast address to prevent the address getting stolen by an @@ -175,87 +94,200 @@ */ memset(dev->dev_addr, 0xFF, ETH_ALEN); dev->dev_addr[0] &= ~0x01; - } - else - { - memcpy(dev->dev_addr, create->be_mac, ETH_ALEN); - } - - memcpy(netif->fe_dev_addr, create->mac, ETH_ALEN); + } else + memcpy(dev->dev_addr, be_mac, ETH_ALEN); rtnl_lock(); err = register_netdevice(dev); rtnl_unlock(); - - if ( err != 0 ) - { + if (err) { DPRINTK("Could not register new net device %s: err=%d\n", dev->name, err); - create->status = NETIF_BE_STATUS_OUT_OF_MEMORY; free_netdev(dev); - return; - } - - netif->hash_next = *pnetif; - *pnetif = netif; + return NULL; + } DPRINTK("Successfully created netif\n"); - create->status = NETIF_BE_STATUS_OKAY; -} - -void netif_destroy(netif_be_destroy_t *destroy) -{ - domid_t domid = destroy->domid; - unsigned int handle = destroy->netif_handle; - netif_t **pnetif, *netif; - - pnetif = &netif_hash[NETIF_HASH(domid, handle)]; - while ( (netif = *pnetif) != NULL ) + return netif; +} + +static int map_frontend_page(netif_t *netif, unsigned long localaddr, + unsigned long tx_ring_ref, unsigned long rx_ring_ref) +{ +#if !defined(CONFIG_XEN_NETDEV_GRANT_TX)||!defined(CONFIG_XEN_NETDEV_GRANT_RX) + pgprot_t prot = __pgprot(_KERNPG_TABLE); + int err; +#endif +#if defined(CONFIG_XEN_NETDEV_GRANT_TX) { - if ( (netif->domid == domid) && (netif->handle == handle) ) - { - if ( netif->status != DISCONNECTED ) - goto still_connected; - goto destroy; + struct gnttab_map_grant_ref op; + + /* Map: Use the Grant table reference */ + op.host_addr = localaddr; + op.flags = GNTMAP_host_map; + op.ref = tx_ring_ref; + op.dom = netif->domid; + + BUG_ON( HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1) ); + if (op.handle < 0) { + DPRINTK(" Grant table operation failure !\n"); + return op.handle; } - pnetif = &netif->hash_next; - } - - destroy->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND; - return; - - still_connected: - 
destroy->status = NETIF_BE_STATUS_INTERFACE_CONNECTED; - return; - - destroy: - *pnetif = netif->hash_next; + + netif->tx_shmem_ref = tx_ring_ref; + netif->tx_shmem_handle = op.handle; + netif->tx_shmem_vaddr = localaddr; + } +#else + err = direct_remap_area_pages(&init_mm, localaddr, + tx_ring_ref<<PAGE_SHIFT, PAGE_SIZE, + prot, netif->domid); + if (err) + return err; +#endif + +#if defined(CONFIG_XEN_NETDEV_GRANT_RX) + { + struct gnttab_map_grant_ref op; + + /* Map: Use the Grant table reference */ + op.host_addr = localaddr + PAGE_SIZE; + op.flags = GNTMAP_host_map; + op.ref = rx_ring_ref; + op.dom = netif->domid; + + BUG_ON( HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1) ); + if (op.handle < 0) { + DPRINTK(" Grant table operation failure !\n"); + return op.handle; + } + + netif->rx_shmem_ref = rx_ring_ref; + netif->rx_shmem_handle = op.handle; + netif->rx_shmem_vaddr = localaddr + PAGE_SIZE; + } +#else + err = direct_remap_area_pages(&init_mm, localaddr + PAGE_SIZE, + rx_ring_ref<<PAGE_SHIFT, PAGE_SIZE, + prot, netif->domid); + if (err) + return err; +#endif + + return 0; +} + +static void unmap_frontend_page(netif_t *netif) +{ +#if defined(CONFIG_XEN_NETDEV_GRANT_RX) || defined(CONFIG_XEN_NETDEV_GRANT_TX) + struct gnttab_unmap_grant_ref op; +#endif + +#ifdef CONFIG_XEN_NETDEV_GRANT_TX + op.host_addr = netif->tx_shmem_vaddr; + op.handle = netif->tx_shmem_handle; + op.dev_bus_addr = 0; + BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)); +#endif + +#ifdef CONFIG_XEN_NETDEV_GRANT_RX + op.host_addr = netif->rx_shmem_vaddr; + op.handle = netif->rx_shmem_handle; + op.dev_bus_addr = 0; + BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)); +#endif +} + +int netif_map(netif_t *netif, unsigned long tx_ring_ref, + unsigned long rx_ring_ref, unsigned int evtchn) +{ + struct vm_struct *vma; + evtchn_op_t op = { .cmd = EVTCHNOP_bind_interdomain }; + int err; + + vma = get_vm_area(2*PAGE_SIZE, VM_IOREMAP); + if (vma == NULL) + 
return -ENOMEM; + + err = map_frontend_page(netif, (unsigned long)vma->addr, tx_ring_ref, + rx_ring_ref); + if (err) { + vfree(vma->addr); + return err; + } + + op.u.bind_interdomain.dom1 = DOMID_SELF; + op.u.bind_interdomain.dom2 = netif->domid; + op.u.bind_interdomain.port1 = 0; + op.u.bind_interdomain.port2 = evtchn; + err = HYPERVISOR_event_channel_op(&op); + if (err) { + unmap_frontend_page(netif); + vfree(vma->addr); + return err; + } + + netif->evtchn = op.u.bind_interdomain.port1; + netif->remote_evtchn = evtchn; + + netif->tx = (netif_tx_interface_t *)vma->addr; + netif->rx = (netif_rx_interface_t *)((char *)vma->addr + PAGE_SIZE); + netif->tx->resp_prod = netif->rx->resp_prod = 0; + netif_get(netif); + wmb(); /* Other CPUs see new state before interface is started. */ + + rtnl_lock(); + netif->status = CONNECTED; + wmb(); + if (netif_running(netif->dev)) + __netif_up(netif); + rtnl_unlock(); + + return 0; +} + +static void free_netif(void *arg) +{ + evtchn_op_t op = { .cmd = EVTCHNOP_close }; + netif_t *netif = (netif_t *)arg; + + /* + * These can't be done in netif_disconnect() because at that point there + * may be outstanding requests in the network stack whose asynchronous + * responses must still be notified to the remote driver. + */ + + op.u.close.port = netif->evtchn; + op.u.close.dom = DOMID_SELF; + HYPERVISOR_event_channel_op(&op); + op.u.close.port = netif->remote_evtchn; + op.u.close.dom = netif->domid; + HYPERVISOR_event_channel_op(&op); + unregister_netdev(netif->dev); + + if (netif->tx) { + unmap_frontend_page(netif); + vfree(netif->tx); /* Frees netif->rx as well. 
*/ + } + free_netdev(netif->dev); - destroy->status = NETIF_BE_STATUS_OKAY; -} - -void netif_creditlimit(netif_be_creditlimit_t *creditlimit) -{ - domid_t domid = creditlimit->domid; - unsigned int handle = creditlimit->netif_handle; - netif_t *netif; - - netif = netif_find_by_handle(domid, handle); - if ( unlikely(netif == NULL) ) - { - DPRINTK("netif_creditlimit attempted for non-existent netif" - " (%u,%u)\n", creditlimit->domid, creditlimit->netif_handle); - creditlimit->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND; - return; - } - +} + +void free_netif_callback(netif_t *netif) +{ + INIT_WORK(&netif->free_work, free_netif, (void *)netif); + schedule_work(&netif->free_work); +} + +void netif_creditlimit(netif_t *netif) +{ +#if 0 /* Set the credit limit (reset remaining credit to new limit). */ netif->credit_bytes = netif->remaining_credit = creditlimit->credit_bytes; netif->credit_usec = creditlimit->period_usec; - if ( netif->status == CONNECTED ) - { + if (netif->status == CONNECTED) { /* * Schedule work so that any packets waiting under previous credit * limit are dealt with (acts like a replenishment point). 
@@ -263,119 +295,22 @@ netif->credit_timeout.expires = jiffies; netif_schedule_work(netif); } - - creditlimit->status = NETIF_BE_STATUS_OKAY; -} - -void netif_connect(netif_be_connect_t *connect) -{ - domid_t domid = connect->domid; - unsigned int handle = connect->netif_handle; - unsigned int evtchn = connect->evtchn; - unsigned long tx_shmem_frame = connect->tx_shmem_frame; - unsigned long rx_shmem_frame = connect->rx_shmem_frame; - struct vm_struct *vma; - pgprot_t prot; - int error; - netif_t *netif; - - netif = netif_find_by_handle(domid, handle); - if ( unlikely(netif == NULL) ) - { - DPRINTK("netif_connect attempted for non-existent netif (%u,%u)\n", - connect->domid, connect->netif_handle); - connect->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND; - return; - } - - if ( netif->status != DISCONNECTED ) - { - connect->status = NETIF_BE_STATUS_INTERFACE_CONNECTED; - return; - } - - if ( (vma = get_vm_area(2*PAGE_SIZE, VM_IOREMAP)) == NULL ) - { - connect->status = NETIF_BE_STATUS_OUT_OF_MEMORY; - return; - } - - prot = __pgprot(_KERNPG_TABLE); - error = direct_remap_area_pages(&init_mm, - VMALLOC_VMADDR(vma->addr), - tx_shmem_frame<<PAGE_SHIFT, PAGE_SIZE, - prot, domid); - error |= direct_remap_area_pages(&init_mm, - VMALLOC_VMADDR(vma->addr) + PAGE_SIZE, - rx_shmem_frame<<PAGE_SHIFT, PAGE_SIZE, - prot, domid); - if ( error != 0 ) - { - if ( error == -ENOMEM ) - connect->status = NETIF_BE_STATUS_OUT_OF_MEMORY; - else if ( error == -EFAULT ) - connect->status = NETIF_BE_STATUS_MAPPING_ERROR; - else - connect->status = NETIF_BE_STATUS_ERROR; - vfree(vma->addr); - return; - } - - netif->evtchn = evtchn; - netif->irq = bind_evtchn_to_irq(evtchn); - netif->tx_shmem_frame = tx_shmem_frame; - netif->rx_shmem_frame = rx_shmem_frame; - netif->tx = - (netif_tx_interface_t *)vma->addr; - netif->rx = - (netif_rx_interface_t *)((char *)vma->addr + PAGE_SIZE); - netif->tx->resp_prod = netif->rx->resp_prod = 0; - netif_get(netif); - wmb(); /* Other CPUs see new state before 
interface is started. */ - - rtnl_lock(); - netif->status = CONNECTED; - wmb(); - if ( netif_running(netif->dev) ) - __netif_up(netif); - rtnl_unlock(); - - connect->status = NETIF_BE_STATUS_OKAY; -} - -int netif_disconnect(netif_be_disconnect_t *disconnect, u8 rsp_id) -{ - domid_t domid = disconnect->domid; - unsigned int handle = disconnect->netif_handle; - netif_t *netif; - - netif = netif_find_by_handle(domid, handle); - if ( unlikely(netif == NULL) ) - { - DPRINTK("netif_disconnect attempted for non-existent netif" - " (%u,%u)\n", disconnect->domid, disconnect->netif_handle); - disconnect->status = NETIF_BE_STATUS_INTERFACE_NOT_FOUND; - return 1; /* Caller will send response error message. */ - } - - if ( netif->status == CONNECTED ) - { +#endif +} + +int netif_disconnect(netif_t *netif) +{ + + if (netif->status == CONNECTED) { rtnl_lock(); netif->status = DISCONNECTING; - netif->disconnect_rspid = rsp_id; wmb(); - if ( netif_running(netif->dev) ) + if (netif_running(netif->dev)) __netif_down(netif); rtnl_unlock(); netif_put(netif); return 0; /* Caller should not send response message. 
*/ } - disconnect->status = NETIF_BE_STATUS_OKAY; return 1; } - -void netif_interface_init(void) -{ - memset(netif_hash, 0, sizeof(netif_hash)); -} diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/netback/netback.c --- a/linux-2.6-xen-sparse/drivers/xen/netback/netback.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/netback/netback.c Thu Aug 25 22:53:20 2005 @@ -12,11 +12,6 @@ #include "common.h" #include <asm-xen/balloon.h> -#include <asm-xen/evtchn.h> - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) -#include <linux/delay.h> -#endif #if defined(CONFIG_XEN_NETDEV_GRANT_TX) || defined(CONFIG_XEN_NETDEV_GRANT_RX) #include <asm-xen/xen-public/grant_table.h> @@ -44,7 +39,7 @@ static int make_rx_response(netif_t *netif, u16 id, s8 st, - memory_t addr, + unsigned long addr, u16 size, u16 csum_valid); @@ -55,11 +50,15 @@ static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0); static struct timer_list net_timer; + +#define MAX_PENDING_REQS 256 static struct sk_buff_head rx_queue; static multicall_entry_t rx_mcl[NETIF_RX_RING_SIZE*2+1]; static mmu_update_t rx_mmu[NETIF_RX_RING_SIZE]; -#ifndef CONFIG_XEN_NETDEV_GRANT_RX +#ifdef CONFIG_XEN_NETDEV_GRANT_RX +static gnttab_donate_t grant_rx_op[MAX_PENDING_REQS]; +#else static struct mmuext_op rx_mmuext[NETIF_RX_RING_SIZE]; #endif static unsigned char rx_notify[NR_EVENT_CHANNELS]; @@ -67,7 +66,6 @@ /* Don't currently gate addition of an interface to the tx scheduling list. 
*/ #define tx_work_exists(_if) (1) -#define MAX_PENDING_REQS 256 static unsigned long mmap_vstart; #define MMAP_VADDR(_req) (mmap_vstart + ((_req) * PAGE_SIZE)) @@ -91,11 +89,9 @@ #ifdef CONFIG_XEN_NETDEV_GRANT_TX static u16 grant_tx_ref[MAX_PENDING_REQS]; -#endif -#ifdef CONFIG_XEN_NETDEV_GRANT_RX -static gnttab_donate_t grant_rx_op[MAX_PENDING_REQS]; -#endif -#ifndef CONFIG_XEN_NETDEV_GRANT_TX +static gnttab_unmap_grant_ref_t tx_unmap_ops[MAX_PENDING_REQS]; +static gnttab_map_grant_ref_t tx_map_ops[MAX_PENDING_REQS]; +#else static multicall_entry_t tx_mcl[MAX_PENDING_REQS]; #endif @@ -153,11 +149,7 @@ static inline int is_xen_skb(struct sk_buff *skb) { extern kmem_cache_t *skbuff_cachep; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) kmem_cache_t *cp = (kmem_cache_t *)virt_to_page(skb->head)->lru.next; -#else - kmem_cache_t *cp = (kmem_cache_t *)virt_to_page(skb->head)->list.next; -#endif return (cp == skbuff_cachep); } @@ -251,7 +243,7 @@ #else struct mmuext_op *mmuext; #endif - unsigned long vdata, mdata, new_mfn; + unsigned long vdata, old_mfn, new_mfn; struct sk_buff_head rxq; struct sk_buff *skb; u16 notify_list[NETIF_RX_RING_SIZE]; @@ -271,7 +263,7 @@ { netif = netdev_priv(skb->dev); vdata = (unsigned long)skb->data; - mdata = virt_to_machine(vdata); + old_mfn = virt_to_mfn(vdata); /* Memory squeeze? Back off for an arbitrary while. 
*/ if ( (new_mfn = alloc_mfn()) == 0 ) @@ -293,7 +285,7 @@ mcl++; #ifdef CONFIG_XEN_NETDEV_GRANT_RX - gop->mfn = mdata >> PAGE_SHIFT; + gop->mfn = old_mfn; gop->domid = netif->domid; gop->handle = netif->rx->ring[ MASK_NETIF_RX_IDX(netif->rx_resp_prod_copy)].req.gref; @@ -308,7 +300,7 @@ mcl++; mmuext->cmd = MMUEXT_REASSIGN_PAGE; - mmuext->mfn = mdata >> PAGE_SHIFT; + mmuext->mfn = old_mfn; mmuext++; #endif mmu->ptr = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; @@ -318,7 +310,7 @@ __skb_queue_tail(&rxq, skb); #ifdef DEBUG_GRANT - dump_packet('a', mdata, vdata); + dump_packet('a', old_mfn, vdata); #endif /* Filled the batch queue? */ if ( (mcl - rx_mcl) == ARRAY_SIZE(rx_mcl) ) @@ -345,10 +337,8 @@ mcl = rx_mcl; #ifdef CONFIG_XEN_NETDEV_GRANT_RX - if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_donate, - grant_rx_op, gop - grant_rx_op))) { - BUG(); - } + BUG_ON(HYPERVISOR_grant_table_op( + GNTTABOP_donate, grant_rx_op, gop - grant_rx_op)); gop = grant_rx_op; #else mmuext = rx_mmuext; @@ -361,10 +351,9 @@ /* Rederive the machine addresses. */ new_mfn = mcl[0].args[1] >> PAGE_SHIFT; #ifdef CONFIG_XEN_NETDEV_GRANT_RX - mdata = (unsigned long)skb->data & ~PAGE_MASK; -#else - mdata = ((mmuext[0].mfn << PAGE_SHIFT) | - ((unsigned long)skb->data & ~PAGE_MASK)); + old_mfn = 0; /* XXX Fix this so we can free_mfn() on error! */ +#else + old_mfn = mmuext[0].mfn; #endif atomic_set(&(skb_shinfo(skb)->dataref), 1); skb_shinfo(skb)->nr_frags = 0; @@ -379,18 +368,20 @@ /* Check the reassignment error code. 
*/ status = NETIF_RSP_OKAY; #ifdef CONFIG_XEN_NETDEV_GRANT_RX - BUG_ON(gop->status != 0); + BUG_ON(gop->status != 0); /* XXX */ #else if ( unlikely(mcl[1].result != 0) ) { DPRINTK("Failed MMU update transferring to DOM%u\n", netif->domid); - free_mfn(mdata >> PAGE_SHIFT); + free_mfn(old_mfn); status = NETIF_RSP_ERROR; } #endif evtchn = netif->evtchn; id = netif->rx->ring[MASK_NETIF_RX_IDX(netif->rx_resp_prod)].req.id; - if ( make_rx_response(netif, id, status, mdata, + if ( make_rx_response(netif, id, status, + (old_mfn << PAGE_SHIFT) | /* XXX */ + ((unsigned long)skb->data & ~PAGE_MASK), size, skb->proto_csum_valid) && (rx_notify[evtchn] == 0) ) { @@ -493,7 +484,6 @@ inline static void net_tx_action_dealloc(void) { #ifdef CONFIG_XEN_NETDEV_GRANT_TX - gnttab_unmap_grant_ref_t unmap_ops[MAX_PENDING_REQS]; gnttab_unmap_grant_ref_t *gop; #else multicall_entry_t *mcl; @@ -509,19 +499,18 @@ /* * Free up any grants we have finished using */ - gop = unmap_ops; - while (dc != dp) { + gop = tx_unmap_ops; + while ( dc != dp ) + { pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)]; - gop->host_virt_addr = MMAP_VADDR(pending_idx); + gop->host_addr = MMAP_VADDR(pending_idx); gop->dev_bus_addr = 0; - gop->handle = grant_tx_ref[pending_idx]; + gop->handle = grant_tx_ref[pending_idx]; grant_tx_ref[pending_idx] = GRANT_INVALID_REF; gop++; } - if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, - unmap_ops, gop - unmap_ops))) { - BUG(); - } + BUG_ON(HYPERVISOR_grant_table_op( + GNTTABOP_unmap_grant_ref, tx_unmap_ops, gop - tx_unmap_ops)); #else mcl = tx_mcl; while ( dc != dp ) @@ -584,7 +573,6 @@ u16 pending_idx; NETIF_RING_IDX i; #ifdef CONFIG_XEN_NETDEV_GRANT_TX - gnttab_map_grant_ref_t map_ops[MAX_PENDING_REQS]; gnttab_map_grant_ref_t *mop; #else multicall_entry_t *mcl; @@ -595,7 +583,7 @@ net_tx_action_dealloc(); #ifdef CONFIG_XEN_NETDEV_GRANT_TX - mop = map_ops; + mop = tx_map_ops; #else mcl = tx_mcl; #endif @@ -646,11 +634,7 @@ netif->credit_timeout.expires = 
next_credit; netif->credit_timeout.data = (unsigned long)netif; netif->credit_timeout.function = tx_credit_callback; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) add_timer_on(&netif->credit_timeout, smp_processor_id()); -#else - add_timer(&netif->credit_timeout); -#endif break; } } @@ -700,10 +684,10 @@ /* Packets passed to netif_rx() must have some headroom. */ skb_reserve(skb, 16); #ifdef CONFIG_XEN_NETDEV_GRANT_TX - mop->host_virt_addr = MMAP_VADDR(pending_idx); - mop->dom = netif->domid; - mop->ref = txreq.addr >> PAGE_SHIFT; - mop->flags = GNTMAP_host_map | GNTMAP_readonly; + mop->host_addr = MMAP_VADDR(pending_idx); + mop->dom = netif->domid; + mop->ref = txreq.addr >> PAGE_SHIFT; + mop->flags = GNTMAP_host_map | GNTMAP_readonly; mop++; #else MULTI_update_va_mapping_otherdomain( @@ -723,7 +707,7 @@ pending_cons++; #ifdef CONFIG_XEN_NETDEV_GRANT_TX - if ((mop - map_ops) >= ARRAY_SIZE(map_ops)) + if ( (mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops) ) break; #else /* Filled the batch queue? */ @@ -733,20 +717,18 @@ } #ifdef CONFIG_XEN_NETDEV_GRANT_TX - if (mop == map_ops) { + if ( mop == tx_map_ops ) return; - } - if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, - map_ops, mop - map_ops))) { - BUG(); - } - mop = map_ops; + + BUG_ON(HYPERVISOR_grant_table_op( + GNTTABOP_map_grant_ref, tx_map_ops, mop - tx_map_ops)); + + mop = tx_map_ops; #else if ( mcl == tx_mcl ) return; - if ( unlikely(HYPERVISOR_multicall(tx_mcl, mcl - tx_mcl) != 0) ) - BUG(); + BUG_ON(HYPERVISOR_multicall(tx_mcl, mcl - tx_mcl) != 0); mcl = tx_mcl; #endif @@ -758,7 +740,13 @@ /* Check the remap error code. */ #ifdef CONFIG_XEN_NETDEV_GRANT_TX - if (unlikely(mop->dev_bus_addr == 0)) { + /* + XXX SMH: error returns from grant operations are pretty poorly + specified/thought out, but the below at least conforms with + what the rest of the code uses. 
+ */ + if ( unlikely(mop->handle < 0) ) + { printk(KERN_ALERT "#### netback grant fails\n"); make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); netif_put(netif); @@ -768,7 +756,7 @@ continue; } phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT] = - FOREIGN_FRAME(mop->dev_bus_addr); + FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT); grant_tx_ref[pending_idx] = mop->handle; #else if ( unlikely(mcl[0].result != 0) ) @@ -887,7 +875,7 @@ static int make_rx_response(netif_t *netif, u16 id, s8 st, - memory_t addr, + unsigned long addr, u16 size, u16 csum_valid) { @@ -966,10 +954,9 @@ net_timer.data = 0; net_timer.function = net_alarm; - netif_interface_init(); - - mmap_vstart = allocate_empty_lowmem_region(MAX_PENDING_REQS); - BUG_ON(mmap_vstart == 0); + page = balloon_alloc_empty_page_range(MAX_PENDING_REQS); + BUG_ON(page == NULL); + mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); for ( i = 0; i < MAX_PENDING_REQS; i++ ) { @@ -986,7 +973,7 @@ spin_lock_init(&net_schedule_list_lock); INIT_LIST_HEAD(&net_schedule_list); - netif_ctrlif_init(); + netif_xenbus_init(); (void)request_irq(bind_virq_to_irq(VIRQ_DEBUG), netif_be_dbg, SA_SHIRQ, diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c --- a/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c Thu Aug 25 22:53:20 2005 @@ -48,7 +48,7 @@ #include <asm/io.h> #include <asm/uaccess.h> #include <asm-xen/evtchn.h> -#include <asm-xen/ctrl_if.h> +#include <asm-xen/xenbus.h> #include <asm-xen/xen-public/io/netif.h> #include <asm-xen/balloon.h> #include <asm/page.h> @@ -59,7 +59,7 @@ #include <asm-xen/gnttab.h> #ifdef GRANT_DEBUG static void -dump_packet(int tag, u32 addr, u32 ap) +dump_packet(int tag, void *addr, u32 ap) { unsigned char *p = (unsigned char *)ap; int i; @@ -102,19 +102,23 @@ #endif #ifdef CONFIG_XEN_NETDEV_GRANT_TX -static grant_ref_t gref_tx_head, 
gref_tx_terminal; +static grant_ref_t gref_tx_head; static grant_ref_t grant_tx_ref[NETIF_TX_RING_SIZE + 1]; #endif #ifdef CONFIG_XEN_NETDEV_GRANT_RX -static grant_ref_t gref_rx_head, gref_rx_terminal; +static grant_ref_t gref_rx_head; static grant_ref_t grant_rx_ref[NETIF_RX_RING_SIZE + 1]; #endif #if defined(CONFIG_XEN_NETDEV_GRANT_TX) || defined(CONFIG_XEN_NETDEV_GRANT_RX) -static domid_t rdomid = 0; #define GRANT_INVALID_REF (0xFFFF) #endif + +#define NETIF_STATE_DISCONNECTED 0 +#define NETIF_STATE_CONNECTED 1 + +static unsigned int netif_state = NETIF_STATE_DISCONNECTED; static void network_tx_buf_gc(struct net_device *dev); static void network_alloc_rx_buffers(struct net_device *dev); @@ -133,12 +137,11 @@ #define xennet_proc_delif(d) ((void)0) #endif -static struct list_head dev_list; - +#define netfront_info net_private struct net_private { struct list_head list; - struct net_device *dev; + struct net_device *netdev; struct net_device_stats stats; NETIF_RING_IDX rx_resp_cons, tx_resp_cons; @@ -152,7 +155,6 @@ unsigned int handle; unsigned int evtchn; - unsigned int irq; /* What is the status of our connection to the remote backend? */ #define BEST_CLOSED 0 @@ -177,6 +179,14 @@ */ struct sk_buff *tx_skbs[NETIF_TX_RING_SIZE+1]; struct sk_buff *rx_skbs[NETIF_RX_RING_SIZE+1]; + + struct xenbus_device *xbdev; + char *backend; + int backend_id; + struct xenbus_watch watch; + int tx_ring_ref; + int rx_ring_ref; + u8 mac[ETH_ALEN]; }; /* Access macros for acquiring freeing slots in {tx,rx}_skbs[]. 
*/ @@ -188,20 +198,15 @@ (_list)[0] = (_list)[_id]; \ (unsigned short)_id; }) -static char *status_name[] = { - [NETIF_INTERFACE_STATUS_CLOSED] = "closed", - [NETIF_INTERFACE_STATUS_DISCONNECTED] = "disconnected", - [NETIF_INTERFACE_STATUS_CONNECTED] = "connected", - [NETIF_INTERFACE_STATUS_CHANGED] = "changed", -}; - +#ifdef DEBUG static char *be_state_name[] = { [BEST_CLOSED] = "closed", [BEST_DISCONNECTED] = "disconnected", [BEST_CONNECTED] = "connected", }; - -#if DEBUG +#endif + +#ifdef DEBUG #define DPRINTK(fmt, args...) \ printk(KERN_ALERT "xen_net (%s:%d) " fmt, __FUNCTION__, __LINE__, ##args) #else @@ -211,89 +216,6 @@ printk(KERN_INFO "xen_net: " fmt, ##args) #define WPRINTK(fmt, args...) \ printk(KERN_WARNING "xen_net: " fmt, ##args) - -static struct net_device *find_dev_by_handle(unsigned int handle) -{ - struct list_head *ent; - struct net_private *np; - list_for_each (ent, &dev_list) { - np = list_entry(ent, struct net_private, list); - if (np->handle == handle) - return np->dev; - } - return NULL; -} - -/** Network interface info. */ -struct netif_ctrl { - /** Number of interfaces. */ - int interface_n; - /** Number of connected interfaces. */ - int connected_n; - /** Error code. */ - int err; - int up; -}; - -static struct netif_ctrl netctrl; - -static void netctrl_init(void) -{ - memset(&netctrl, 0, sizeof(netctrl)); - netctrl.up = NETIF_DRIVER_STATUS_DOWN; -} - -/** Get or set a network interface error. - */ -static int netctrl_err(int err) -{ - if ((err < 0) && !netctrl.err) - netctrl.err = err; - return netctrl.err; -} - -/** Test if all network interfaces are connected. - * - * @return 1 if all connected, 0 if not, negative error code otherwise - */ -static int netctrl_connected(void) -{ - int ok; - - if (netctrl.err) - ok = netctrl.err; - else if (netctrl.up == NETIF_DRIVER_STATUS_UP) - ok = (netctrl.connected_n == netctrl.interface_n); - else - ok = 0; - - return ok; -} - -/** Count the connected network interfaces. 
- * - * @return connected count - */ -static int netctrl_connected_count(void) -{ - - struct list_head *ent; - struct net_private *np; - unsigned int connected; - - connected = 0; - - list_for_each(ent, &dev_list) { - np = list_entry(ent, struct net_private, list); - if (np->backend_state == BEST_CONNECTED) - connected++; - } - - netctrl.connected_n = connected; - DPRINTK("> connected_n=%d interface_n=%d\n", - netctrl.connected_n, netctrl.interface_n); - return connected; -} /** Send a packet on a net device to encourage switches to learn the * MAC. We send a fake ARP request. @@ -357,10 +279,14 @@ id = np->tx->ring[MASK_NETIF_TX_IDX(i)].resp.id; skb = np->tx_skbs[id]; #ifdef CONFIG_XEN_NETDEV_GRANT_TX - if (gnttab_query_foreign_access(grant_tx_ref[id]) != 0) { - printk(KERN_ALERT "netfront: query foreign access\n"); + if (unlikely(gnttab_query_foreign_access(grant_tx_ref[id]) != 0)) { + /* other domain is still using this grant - shouldn't happen + but if it does, we'll try to reclaim the grant later */ + printk(KERN_ALERT "network_tx_buf_gc: warning -- grant " + "still in use by backend domain.\n"); + goto out; } - gnttab_end_foreign_access(grant_tx_ref[id], GNTMAP_readonly); + gnttab_end_foreign_access_ref(grant_tx_ref[id], GNTMAP_readonly); gnttab_release_grant_reference(&gref_tx_head, grant_tx_ref[id]); grant_tx_ref[id] = GRANT_INVALID_REF; #endif @@ -383,6 +309,10 @@ mb(); } while (prod != np->tx->resp_prod); +#ifdef CONFIG_XEN_NETDEV_GRANT_TX + out: +#endif + if (np->tx_full && ((np->tx->req_prod - prod) < NETIF_TX_RING_SIZE)) { np->tx_full = 0; if (np->user_state == UST_OPEN) @@ -434,16 +364,17 @@ np->rx->ring[MASK_NETIF_RX_IDX(req_prod + i)].req.id = id; #ifdef CONFIG_XEN_NETDEV_GRANT_RX - if ((ref = gnttab_claim_grant_reference(&gref_rx_head, gref_rx_terminal)) < 0) { + ref = gnttab_claim_grant_reference(&gref_rx_head); + if (unlikely(ref < 0)) { printk(KERN_ALERT "#### netfront can't claim rx reference\n"); BUG(); } grant_rx_ref[id] = ref; - 
gnttab_grant_foreign_transfer_ref(ref, rdomid, - virt_to_machine(skb->head) >> PAGE_SHIFT); + gnttab_grant_foreign_transfer_ref(ref, np->backend_id, + virt_to_mfn(skb->head)); np->rx->ring[MASK_NETIF_RX_IDX(req_prod + i)].req.gref = ref; #endif - rx_pfn_array[i] = virt_to_machine(skb->head) >> PAGE_SHIFT; + rx_pfn_array[i] = virt_to_mfn(skb->head); /* Remove this page from pseudo phys map before passing back to Xen. */ phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] @@ -529,17 +460,19 @@ tx->id = id; #ifdef CONFIG_XEN_NETDEV_GRANT_TX - if ((ref = gnttab_claim_grant_reference(&gref_tx_head, gref_tx_terminal)) < 0) { + ref = gnttab_claim_grant_reference(&gref_tx_head); + if (unlikely(ref < 0)) { printk(KERN_ALERT "#### netfront can't claim tx grant reference\n"); BUG(); } - mfn = virt_to_machine(skb->data) >> PAGE_SHIFT; - gnttab_grant_foreign_access_ref(ref, rdomid, mfn, GNTMAP_readonly); - tx->addr = (ref << PAGE_SHIFT) | ((unsigned long)skb->data & ~PAGE_MASK); + mfn = virt_to_mfn(skb->data); + gnttab_grant_foreign_access_ref(ref, np->backend_id, mfn, GNTMAP_readonly); + tx->addr = ref << PAGE_SHIFT; grant_tx_ref[id] = ref; #else - tx->addr = virt_to_machine(skb->data); -#endif + tx->addr = virt_to_mfn(skb->data) << PAGE_SHIFT; +#endif + tx->addr |= (unsigned long)skb->data & ~PAGE_MASK; tx->size = skb->len; tx->csum_blank = (skb->ip_summed == CHECKSUM_HW); @@ -639,8 +572,7 @@ #ifdef CONFIG_XEN_NETDEV_GRANT_RX ref = grant_rx_ref[rx->id]; grant_rx_ref[rx->id] = GRANT_INVALID_REF; - - mfn = gnttab_end_foreign_transfer(ref); + mfn = gnttab_end_foreign_transfer_ref(ref); gnttab_release_grant_reference(&gref_rx_head, ref); #endif @@ -675,18 +607,20 @@ pfn_pte_ma(mfn, PAGE_KERNEL), 0); #else MULTI_update_va_mapping(mcl, (unsigned long)skb->head, - pfn_pte_ma(rx->addr >> PAGE_SHIFT, PAGE_KERNEL), 0); + pfn_pte_ma(rx->addr >> PAGE_SHIFT, + PAGE_KERNEL), 0); #endif mcl++; +#ifdef CONFIG_XEN_NETDEV_GRANT_RX + phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] = 
mfn; +#else phys_to_machine_mapping[__pa(skb->head) >> PAGE_SHIFT] = -#ifdef CONFIG_XEN_NETDEV_GRANT_RX - mfn; -#else rx->addr >> PAGE_SHIFT; #endif + #ifdef GRANT_DEBUG - printk(KERN_ALERT "#### rx_poll enqueue vdata=%08x mfn=%08x ref=%04x\n", + printk(KERN_ALERT "#### rx_poll enqueue vdata=%p mfn=%lu ref=%x\n", skb->data, mfn, ref); #endif __skb_queue_tail(&rxq, skb); @@ -708,9 +642,9 @@ while ((skb = __skb_dequeue(&rxq)) != NULL) { #ifdef GRANT_DEBUG - printk(KERN_ALERT "#### rx_poll dequeue vdata=%08x mfn=%08x\n", - skb->data, virt_to_machine(skb->data)>>PAGE_SHIFT); - dump_packet('d', skb->data, (unsigned long)skb->data); + printk(KERN_ALERT "#### rx_poll dequeue vdata=%p mfn=%lu\n", + skb->data, virt_to_mfn(skb->data)); + dump_packet('d', skb->data, (unsigned long)skb->data); #endif /* * Enough room in skbuff for the data we were passed? Also, Linux @@ -797,7 +731,7 @@ { struct net_private *np = netdev_priv(dev); np->user_state = UST_CLOSED; - netif_stop_queue(np->dev); + netif_stop_queue(np->netdev); return 0; } @@ -809,8 +743,7 @@ } -static void network_connect(struct net_device *dev, - netif_fe_interface_status_t *status) +static void network_connect(struct net_device *dev) { struct net_private *np; int i, requeue_idx; @@ -843,18 +776,23 @@ * interface has been down. 
*/ for (requeue_idx = 0, i = 1; i <= NETIF_TX_RING_SIZE; i++) { - if ((unsigned long)np->tx_skbs[i] >= __PAGE_OFFSET) { - struct sk_buff *skb = np->tx_skbs[i]; - - tx = &np->tx->ring[requeue_idx++].req; - - tx->id = i; - tx->addr = virt_to_machine(skb->data); - tx->size = skb->len; - - np->stats.tx_bytes += skb->len; - np->stats.tx_packets++; - } + if ((unsigned long)np->tx_skbs[i] >= __PAGE_OFFSET) { + struct sk_buff *skb = np->tx_skbs[i]; + + tx = &np->tx->ring[requeue_idx++].req; + + tx->id = i; +#ifdef CONFIG_XEN_NETDEV_GRANT_TX + tx->addr = 0; /*(ref << PAGE_SHIFT) |*/ +#else + tx->addr = virt_to_mfn(skb->data) << PAGE_SHIFT; +#endif + tx->addr |= (unsigned long)skb->data & ~PAGE_MASK; + tx->size = skb->len; + + np->stats.tx_bytes += skb->len; + np->stats.tx_packets++; + } } wmb(); np->tx->req_prod = requeue_idx; @@ -873,7 +811,7 @@ */ np->backend_state = BEST_CONNECTED; wmb(); - notify_via_evtchn(status->evtchn); + notify_via_evtchn(np->evtchn); network_tx_buf_gc(dev); if (np->user_state == UST_OPEN) @@ -883,132 +821,21 @@ spin_unlock_irq(&np->tx_lock); } -static void vif_show(struct net_private *np) -{ -#if DEBUG - if (np) { - IPRINTK("<vif handle=%u %s(%s) evtchn=%u irq=%u tx=%p rx=%p>\n", - np->handle, - be_state_name[np->backend_state], - np->user_state ? "open" : "closed", - np->evtchn, - np->irq, - np->tx, - np->rx); - } else { - IPRINTK("<vif NULL>\n"); - } -#endif -} - -/* Send a connect message to xend to tell it to bring up the interface. 
*/ -static void send_interface_connect(struct net_private *np) -{ - ctrl_msg_t cmsg = { - .type = CMSG_NETIF_FE, - .subtype = CMSG_NETIF_FE_INTERFACE_CONNECT, - .length = sizeof(netif_fe_interface_connect_t), - }; - netif_fe_interface_connect_t *msg = (void*)cmsg.msg; - - msg->handle = np->handle; - msg->tx_shmem_frame = (virt_to_machine(np->tx) >> PAGE_SHIFT); - msg->rx_shmem_frame = (virt_to_machine(np->rx) >> PAGE_SHIFT); - - ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); -} - -/* Send a driver status notification to the domain controller. */ -static int send_driver_status(int ok) -{ - int err = 0; - ctrl_msg_t cmsg = { - .type = CMSG_NETIF_FE, - .subtype = CMSG_NETIF_FE_DRIVER_STATUS, - .length = sizeof(netif_fe_driver_status_t), - }; - netif_fe_driver_status_t *msg = (void*)cmsg.msg; - - msg->status = (ok ? NETIF_DRIVER_STATUS_UP : NETIF_DRIVER_STATUS_DOWN); - err = ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); - return err; -} - -/* Stop network device and free tx/rx queues and irq. - */ -static void vif_release(struct net_private *np) -{ - /* Stop old i/f to prevent errors whilst we rebuild the state. */ - spin_lock_irq(&np->tx_lock); - spin_lock(&np->rx_lock); - netif_stop_queue(np->dev); - /* np->backend_state = BEST_DISCONNECTED; */ - spin_unlock(&np->rx_lock); - spin_unlock_irq(&np->tx_lock); - - /* Free resources. */ - if(np->tx != NULL){ - free_irq(np->irq, np->dev); - unbind_evtchn_from_irq(np->evtchn); - free_page((unsigned long)np->tx); - free_page((unsigned long)np->rx); - np->irq = 0; - np->evtchn = 0; - np->tx = NULL; - np->rx = NULL; - } -} - -/* Release vif resources and close it down completely. - */ -static void vif_close(struct net_private *np) -{ - WPRINTK("Unexpected netif-CLOSED message in state %s\n", - be_state_name[np->backend_state]); - vif_release(np); - np->backend_state = BEST_CLOSED; - /* todo: take dev down and free. */ - vif_show(np); -} - -/* Move the vif into disconnected state. 
- * Allocates tx/rx pages. - * Sends connect message to xend. - */ -static void vif_disconnect(struct net_private *np) -{ - if(np->tx) free_page((unsigned long)np->tx); - if(np->rx) free_page((unsigned long)np->rx); - // Before this np->tx and np->rx had better be null. - np->tx = (netif_tx_interface_t *)__get_free_page(GFP_KERNEL); - np->rx = (netif_rx_interface_t *)__get_free_page(GFP_KERNEL); - memset(np->tx, 0, PAGE_SIZE); - memset(np->rx, 0, PAGE_SIZE); - np->backend_state = BEST_DISCONNECTED; - send_interface_connect(np); - vif_show(np); -} - -/* Begin interface recovery. - * - * NB. Whilst we're recovering, we turn the carrier state off. We - * take measures to ensure that this device isn't used for - * anything. We also stop the queue for this device. Various - * different approaches (e.g. continuing to buffer packets) have - * been tested but don't appear to improve the overall impact on - * TCP connections. - * - * TODO: (MAW) Change the Xend<->Guest protocol so that a recovery - * is initiated by a special "RESET" message - disconnect could - * just mean we're not allowed to use this interface any more. - */ -static void vif_reset(struct net_private *np) -{ - IPRINTK("Attempting to reconnect network interface: handle=%u\n", - np->handle); - vif_release(np); - vif_disconnect(np); - vif_show(np); +static void show_device(struct net_private *np) +{ +#ifdef DEBUG + if (np) { + IPRINTK("<vif handle=%u %s(%s) evtchn=%u tx=%p rx=%p>\n", + np->handle, + be_state_name[np->backend_state], + np->user_state ? "open" : "closed", + np->evtchn, + np->tx, + np->rx); + } else { + IPRINTK("<vif NULL>\n"); + } +#endif } /* Move the vif into connected state. @@ -1016,26 +843,22 @@ * Binds the irq to the event channel. 
*/ static void -vif_connect(struct net_private *np, netif_fe_interface_status_t *status) -{ - struct net_device *dev = np->dev; - memcpy(dev->dev_addr, status->mac, ETH_ALEN); - network_connect(dev, status); - np->evtchn = status->evtchn; - np->irq = bind_evtchn_to_irq(np->evtchn); -#if defined(CONFIG_XEN_NETDEV_GRANT_TX) || defined(CONFIG_XEN_NETDEV_GRANT_RX) - rdomid = status->domid; -#endif - (void)request_irq(np->irq, netif_int, SA_SAMPLE_RANDOM, dev->name, dev); - netctrl_connected_count(); - (void)send_fake_arp(dev); - vif_show(np); +connect_device(struct net_private *np, unsigned int evtchn) +{ + struct net_device *dev = np->netdev; + memcpy(dev->dev_addr, np->mac, ETH_ALEN); + np->evtchn = evtchn; + network_connect(dev); + (void)bind_evtchn_to_irqhandler( + np->evtchn, netif_int, SA_SAMPLE_RANDOM, dev->name, dev); + (void)send_fake_arp(dev); + show_device(np); } static struct ethtool_ops network_ethtool_ops = { - .get_tx_csum = ethtool_op_get_tx_csum, - .set_tx_csum = ethtool_op_set_tx_csum, + .get_tx_csum = ethtool_op_get_tx_csum, + .set_tx_csum = ethtool_op_set_tx_csum, }; /** Create a network device. 
@@ -1043,22 +866,24 @@ * @param val return parameter for created device * @return 0 on success, error code otherwise */ -static int create_netdev(int handle, struct net_device **val) +static int create_netdev(int handle, struct xenbus_device *dev, + struct net_device **val) { int i, err = 0; - struct net_device *dev = NULL; + struct net_device *netdev = NULL; struct net_private *np = NULL; - if ((dev = alloc_etherdev(sizeof(struct net_private))) == NULL) { + if ((netdev = alloc_etherdev(sizeof(struct net_private))) == NULL) { printk(KERN_WARNING "%s> alloc_etherdev failed.\n", __FUNCTION__); err = -ENOMEM; goto exit; } - np = netdev_priv(dev); + np = netdev_priv(netdev); np->backend_state = BEST_CLOSED; np->user_state = UST_CLOSED; np->handle = handle; + np->xbdev = dev; spin_lock_init(&np->tx_lock); spin_lock_init(&np->rx_lock); @@ -1082,268 +907,47 @@ #endif } - dev->open = network_open; - dev->hard_start_xmit = network_start_xmit; - dev->stop = network_close; - dev->get_stats = network_get_stats; - dev->poll = netif_poll; - dev->weight = 64; - dev->features = NETIF_F_IP_CSUM; - - SET_ETHTOOL_OPS(dev, &network_ethtool_ops); - - if ((err = register_netdev(dev)) != 0) { + netdev->open = network_open; + netdev->hard_start_xmit = network_start_xmit; + netdev->stop = network_close; + netdev->get_stats = network_get_stats; + netdev->poll = netif_poll; + netdev->weight = 64; + netdev->features = NETIF_F_IP_CSUM; + + SET_ETHTOOL_OPS(netdev, &network_ethtool_ops); + + if ((err = register_netdev(netdev)) != 0) { printk(KERN_WARNING "%s> register_netdev err=%d\n", __FUNCTION__, err); goto exit; } - if ((err = xennet_proc_addif(dev)) != 0) { - unregister_netdev(dev); + if ((err = xennet_proc_addif(netdev)) != 0) { + unregister_netdev(netdev); goto exit; } - np->dev = dev; - list_add(&np->list, &dev_list); + np->netdev = netdev; exit: - if ((err != 0) && (dev != NULL)) - kfree(dev); + if ((err != 0) && (netdev != NULL)) + kfree(netdev); else if (val != NULL) - *val = dev; + 
*val = netdev; return err; } -/* Get the target interface for a status message. - * Creates the interface when it makes sense. - * The returned interface may be null when there is no error. - * - * @param status status message - * @param np return parameter for interface state - * @return 0 on success, error code otherwise - */ -static int -target_vif(netif_fe_interface_status_t *status, struct net_private **np) -{ - int err = 0; - struct net_device *dev; - - DPRINTK("> handle=%d\n", status->handle); - if (status->handle < 0) { - err = -EINVAL; - goto exit; - } - - if ((dev = find_dev_by_handle(status->handle)) != NULL) - goto exit; - - if (status->status == NETIF_INTERFACE_STATUS_CLOSED) - goto exit; - if (status->status == NETIF_INTERFACE_STATUS_CHANGED) - goto exit; - - /* It's a new interface in a good state - create it. */ - DPRINTK("> create device...\n"); - if ((err = create_netdev(status->handle, &dev)) != 0) - goto exit; - - netctrl.interface_n++; - - exit: - if (np != NULL) - *np = ((dev && !err) ? netdev_priv(dev) : NULL); - DPRINTK("< err=%d\n", err); - return err; -} - -/* Handle an interface status message. 
*/ -static void netif_interface_status(netif_fe_interface_status_t *status) -{ - int err = 0; - struct net_private *np = NULL; - - DPRINTK("> status=%s handle=%d\n", - status_name[status->status], status->handle); - - if ((err = target_vif(status, &np)) != 0) { - WPRINTK("Invalid netif: handle=%u\n", status->handle); - return; - } - - if (np == NULL) { - DPRINTK("> no vif\n"); - return; - } - - switch (status->status) { - case NETIF_INTERFACE_STATUS_CLOSED: - switch (np->backend_state) { - case BEST_CLOSED: - case BEST_DISCONNECTED: - case BEST_CONNECTED: - vif_close(np); - break; - } - break; - - case NETIF_INTERFACE_STATUS_DISCONNECTED: - switch (np->backend_state) { - case BEST_CLOSED: - vif_disconnect(np); - break; - case BEST_DISCONNECTED: - case BEST_CONNECTED: - vif_reset(np); - break; - } - break; - - case NETIF_INTERFACE_STATUS_CONNECTED: - switch (np->backend_state) { - case BEST_CLOSED: - WPRINTK("Unexpected netif status %s in state %s\n", - status_name[status->status], - be_state_name[np->backend_state]); - vif_disconnect(np); - vif_connect(np, status); - break; - case BEST_DISCONNECTED: - vif_connect(np, status); - break; - } - break; - - case NETIF_INTERFACE_STATUS_CHANGED: - /* - * The domain controller is notifying us that a device has been - * added or removed. - */ - break; - - default: - WPRINTK("Invalid netif status code %d\n", status->status); - break; - } - - vif_show(np); -} - -/* - * Initialize the network control interface. - */ -static void netif_driver_status(netif_fe_driver_status_t *status) -{ - netctrl.up = status->status; - netctrl_connected_count(); -} - -/* Receive handler for control messages. 
*/ -static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) -{ - - switch (msg->subtype) { - case CMSG_NETIF_FE_INTERFACE_STATUS: - netif_interface_status((netif_fe_interface_status_t *) &msg->msg[0]); - break; - - case CMSG_NETIF_FE_DRIVER_STATUS: - netif_driver_status((netif_fe_driver_status_t *) &msg->msg[0]); - break; - - default: - msg->length = 0; - break; - } - - ctrl_if_send_response(msg); -} - - -#if 1 -/* Wait for all interfaces to be connected. - * - * This works OK, but we'd like to use the probing mode (see below). - */ -static int probe_interfaces(void) -{ - int err = 0, conn = 0; - int wait_i, wait_n = 100; - - DPRINTK(">\n"); - - for (wait_i = 0; wait_i < wait_n; wait_i++) { - DPRINTK("> wait_i=%d\n", wait_i); - conn = netctrl_connected(); - if(conn) break; - DPRINTK("> schedule_timeout...\n"); - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(10); - } - - DPRINTK("> wait finished...\n"); - if (conn <= 0) { - err = netctrl_err(-ENETDOWN); - WPRINTK("Failed to connect all virtual interfaces: err=%d\n", err); - } - - DPRINTK("< err=%d\n", err); - - return err; -} -#else -/* Probe for interfaces until no more are found. - * - * This is the mode we'd like to use, but at the moment it panics the kernel. 
-*/ -static int probe_interfaces(void) -{ - int err = 0; - int wait_i, wait_n = 100; - ctrl_msg_t cmsg = { - .type = CMSG_NETIF_FE, - .subtype = CMSG_NETIF_FE_INTERFACE_STATUS, - .length = sizeof(netif_fe_interface_status_t), - }; - netif_fe_interface_status_t msg = {}; - ctrl_msg_t rmsg = {}; - netif_fe_interface_status_t *reply = (void*)rmsg.msg; - int state = TASK_UNINTERRUPTIBLE; - u32 query = -1; - - DPRINTK(">\n"); - - netctrl.interface_n = 0; - for (wait_i = 0; wait_i < wait_n; wait_i++) { - DPRINTK("> wait_i=%d query=%d\n", wait_i, query); - msg.handle = query; - memcpy(cmsg.msg, &msg, sizeof(msg)); - DPRINTK("> set_current_state...\n"); - set_current_state(state); - DPRINTK("> rmsg=%p msg=%p, reply=%p\n", &rmsg, rmsg.msg, reply); - DPRINTK("> sending...\n"); - err = ctrl_if_send_message_and_get_response(&cmsg, &rmsg, state); - DPRINTK("> err=%d\n", err); - if(err) goto exit; - DPRINTK("> rmsg=%p msg=%p, reply=%p\n", &rmsg, rmsg.msg, reply); - if((int)reply->handle < 0) { - // No more interfaces. - break; - } - query = -reply->handle - 2; - DPRINTK(">netif_interface_status ...\n"); - netif_interface_status(reply); - } - - exit: - if (err) { - err = netctrl_err(-ENETDOWN); - WPRINTK("Connecting virtual network interfaces failed: err=%d\n", err); - } - - DPRINTK("< err=%d\n", err); - return err; -} - -#endif +static int destroy_netdev(struct net_device *netdev) +{ + +#ifdef CONFIG_PROC_FS + xennet_proc_delif(netdev); +#endif + + unregister_netdev(netdev); + + return 0; +} /* * We use this notifier to send out a fake ARP reply to reset switches and @@ -1354,19 +958,11 @@ { struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; struct net_device *dev = ifa->ifa_dev->dev; - struct list_head *ent; - struct net_private *np; - - if (event != NETDEV_UP) - goto out; - - list_for_each (ent, &dev_list) { - np = list_entry(ent, struct net_private, list); - if (np->dev == dev) - (void)send_fake_arp(dev); - } + + /* UP event and is it one of our devices? 
*/ + if (event == NETDEV_UP && dev->open == network_open) + (void)send_fake_arp(dev); - out: return NOTIFY_DONE; } @@ -1376,66 +972,315 @@ .priority = 0 }; -static int __init netif_init(void) -{ - int err = 0; - - if (xen_start_info.flags & SIF_INITDOMAIN) - return 0; +static struct xenbus_device_id netfront_ids[] = { + { "vif" }, + { "" } +}; + +static void watch_for_status(struct xenbus_watch *watch, const char *node) +{ +} + +static int setup_device(struct xenbus_device *dev, struct netfront_info *info) +{ + evtchn_op_t op = { .cmd = EVTCHNOP_alloc_unbound }; + int err; + #ifdef CONFIG_XEN_NETDEV_GRANT_TX - if (gnttab_alloc_grant_references(NETIF_TX_RING_SIZE, - &gref_tx_head, &gref_tx_terminal) < 0) { - printk(KERN_ALERT "#### netfront can't alloc tx grant refs\n"); - return 1; - } - printk(KERN_ALERT "#### netfront tx using grant tables\n"); + info->tx_ring_ref = GRANT_INVALID_REF; #endif #ifdef CONFIG_XEN_NETDEV_GRANT_RX - if (gnttab_alloc_grant_references(NETIF_RX_RING_SIZE, - &gref_rx_head, &gref_rx_terminal) < 0) { - printk(KERN_ALERT "#### netfront can't alloc rx grant refs\n"); - return 1; - } - printk(KERN_ALERT "#### netfront rx using grant tables\n"); -#endif - - if ((err = xennet_proc_init()) != 0) - return err; - - IPRINTK("Initialising virtual ethernet driver.\n"); - INIT_LIST_HEAD(&dev_list); - (void)register_inetaddr_notifier(&notifier_inetdev); - netctrl_init(); - (void)ctrl_if_register_receiver(CMSG_NETIF_FE, netif_ctrlif_rx, - CALLBACK_IN_BLOCKING_CONTEXT); - send_driver_status(1); - err = probe_interfaces(); - if (err) - ctrl_if_unregister_receiver(CMSG_NETIF_FE, netif_ctrlif_rx); - - DPRINTK("< err=%d\n", err); - return err; -} - -static void netif_exit(void) -{ + info->rx_ring_ref = GRANT_INVALID_REF; +#endif + + info->tx = (netif_tx_interface_t *)__get_free_page(GFP_KERNEL); + if (info->tx == 0) { + err = -ENOMEM; + xenbus_dev_error(dev, err, "allocating tx ring page"); + goto out; + } + info->rx = (netif_rx_interface_t 
*)__get_free_page(GFP_KERNEL); + if (info->rx == 0) { + err = -ENOMEM; + xenbus_dev_error(dev, err, "allocating rx ring page"); + goto out; + } + memset(info->tx, 0, PAGE_SIZE); + memset(info->rx, 0, PAGE_SIZE); + info->backend_state = BEST_DISCONNECTED; + #ifdef CONFIG_XEN_NETDEV_GRANT_TX - gnttab_free_grant_references(NETIF_TX_RING_SIZE, gref_tx_head); -#endif + err = gnttab_grant_foreign_access(info->backend_id, + virt_to_mfn(info->tx), 0); + if (err < 0) { + xenbus_dev_error(dev, err, "granting access to tx ring page"); + goto out; + } + info->tx_ring_ref = err; +#else + info->tx_ring_ref = virt_to_mfn(info->tx); +#endif + #ifdef CONFIG_XEN_NETDEV_GRANT_RX - gnttab_free_grant_references(NETIF_RX_RING_SIZE, gref_rx_head); -#endif -} - -static void vif_suspend(struct net_private *np) -{ + err = gnttab_grant_foreign_access(info->backend_id, + virt_to_mfn(info->rx), 0); + if (err < 0) { + xenbus_dev_error(dev, err, "granting access to rx ring page"); + goto out; + } + info->rx_ring_ref = err; +#else + info->rx_ring_ref = virt_to_mfn(info->rx); +#endif + + op.u.alloc_unbound.dom = info->backend_id; + err = HYPERVISOR_event_channel_op(&op); + if (err) { + xenbus_dev_error(dev, err, "allocating event channel"); + goto out; + } + connect_device(info, op.u.alloc_unbound.port); + return 0; + + out: + if (info->tx) + free_page((unsigned long)info->tx); + info->tx = 0; + if (info->rx) + free_page((unsigned long)info->rx); + info->rx = 0; +#ifdef CONFIG_XEN_NETDEV_GRANT_TX + if (info->tx_ring_ref != GRANT_INVALID_REF) + gnttab_end_foreign_access(info->tx_ring_ref, 0); + info->tx_ring_ref = GRANT_INVALID_REF; +#endif +#ifdef CONFIG_XEN_NETDEV_GRANT_RX + if (info->rx_ring_ref != GRANT_INVALID_REF) + gnttab_end_foreign_access(info->rx_ring_ref, 0); + info->rx_ring_ref = GRANT_INVALID_REF; +#endif + return err; +} + +static void netif_free(struct netfront_info *info) +{ + if (info->tx) + free_page((unsigned long)info->tx); + info->tx = 0; + if (info->rx) + free_page((unsigned 
long)info->rx); + info->rx = 0; +#ifdef CONFIG_XEN_NETDEV_GRANT_TX + if (info->tx_ring_ref != GRANT_INVALID_REF) + gnttab_end_foreign_access(info->tx_ring_ref, 0); + info->tx_ring_ref = GRANT_INVALID_REF; +#endif +#ifdef CONFIG_XEN_NETDEV_GRANT_RX + if (info->rx_ring_ref != GRANT_INVALID_REF) + gnttab_end_foreign_access(info->rx_ring_ref, 0); + info->rx_ring_ref = GRANT_INVALID_REF; +#endif + unbind_evtchn_from_irqhandler(info->evtchn, info->netdev); + info->evtchn = 0; +} + +/* Stop network device and free tx/rx queues and irq. + */ +static void shutdown_device(struct net_private *np) +{ + /* Stop old i/f to prevent errors whilst we rebuild the state. */ + spin_lock_irq(&np->tx_lock); + spin_lock(&np->rx_lock); + netif_stop_queue(np->netdev); + /* np->backend_state = BEST_DISCONNECTED; */ + spin_unlock(&np->rx_lock); + spin_unlock_irq(&np->tx_lock); + + /* Free resources. */ + netif_free(np); +} + +/* Common code used when first setting up, and when resuming. */ +static int talk_to_backend(struct xenbus_device *dev, + struct netfront_info *info) +{ + char *backend, *mac, *e, *s; + const char *message; + int err, i; + + backend = NULL; + err = xenbus_gather(dev->nodename, + "backend-id", "%i", &info->backend_id, + "backend", NULL, &backend, + NULL); + if (XENBUS_EXIST_ERR(err)) + goto out; + if (backend && strlen(backend) == 0) { + err = -ENOENT; + goto out; + } + if (err < 0) { + xenbus_dev_error(dev, err, "reading %s/backend or backend-id", + dev->nodename); + goto out; + } + + mac = xenbus_read(dev->nodename, "mac", NULL); + if (IS_ERR(mac)) { + err = PTR_ERR(mac); + xenbus_dev_error(dev, err, "reading %s/mac", + dev->nodename); + goto out; + } + s = mac; + for (i = 0; i < ETH_ALEN; i++) { + info->mac[i] = simple_strtoul(s, &e, 16); + if (s == e || (e[0] != ':' && e[0] != 0)) { + kfree(mac); + err = -ENOENT; + xenbus_dev_error(dev, err, "parsing %s/mac", + dev->nodename); + goto out; + } + s = &e[1]; + } + kfree(mac); + + /* Create shared ring, alloc event 
channel. */ + err = setup_device(dev, info); + if (err) { + xenbus_dev_error(dev, err, "setting up ring"); + goto out; + } + + err = xenbus_transaction_start(dev->nodename); + if (err) { + xenbus_dev_error(dev, err, "starting transaction"); + goto destroy_ring; + } + + err = xenbus_printf(dev->nodename, "tx-ring-ref","%u", + info->tx_ring_ref); + if (err) { + message = "writing tx ring-ref"; + goto abort_transaction; + } + err = xenbus_printf(dev->nodename, "rx-ring-ref","%u", + info->rx_ring_ref); + if (err) { + message = "writing rx ring-ref"; + goto abort_transaction; + } + err = xenbus_printf(dev->nodename, + "event-channel", "%u", info->evtchn); + if (err) { + message = "writing event-channel"; + goto abort_transaction; + } + + info->backend = backend; + backend = NULL; + + info->watch.node = info->backend; + info->watch.callback = watch_for_status; + err = register_xenbus_watch(&info->watch); + if (err) { + message = "registering watch on backend"; + goto abort_transaction; + } + + err = xenbus_transaction_end(0); + if (err) { + xenbus_dev_error(dev, err, "completing transaction"); + goto destroy_ring; + } + + netif_state = NETIF_STATE_CONNECTED; + + out: + if (backend) + kfree(backend); + return err; + + abort_transaction: + xenbus_transaction_end(1); + /* Have to do this *outside* transaction. */ + xenbus_dev_error(dev, err, "%s", message); + destroy_ring: + shutdown_device(info); + goto out; +} + +/* Setup supplies the backend dir, virtual device. + + We place an event channel and shared frame entries. + We watch backend to wait if it's ok. 
*/ +static int netfront_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int err; + struct net_device *netdev; + struct netfront_info *info; + unsigned int handle; + + err = xenbus_scanf(dev->nodename, "handle", "%u", &handle); + if (XENBUS_EXIST_ERR(err)) + return err; + if (err < 0) { + xenbus_dev_error(dev, err, "reading handle"); + return err; + } + + err = create_netdev(handle, dev, &netdev); + if (err) { + xenbus_dev_error(dev, err, "creating netdev"); + return err; + } + + info = netdev_priv(netdev); + dev->data = info; + + err = talk_to_backend(dev, info); + if (err) { + destroy_netdev(netdev); + kfree(netdev); + dev->data = NULL; + return err; + } + + + /* Call once in case entries already there. */ + watch_for_status(&info->watch, info->watch.node); + + return 0; +} + +static int netfront_remove(struct xenbus_device *dev) +{ + struct netfront_info *info = dev->data; + + if (info->backend) + unregister_xenbus_watch(&info->watch); + + netif_free(info); + + kfree(info->backend); + kfree(info); + + return 0; +} + +static int netfront_suspend(struct xenbus_device *dev) +{ + struct net_private *np = dev->data; /* Avoid having tx/rx stuff happen until we're ready. */ - free_irq(np->irq, np->dev); - unbind_evtchn_from_irq(np->evtchn); -} - -static void vif_resume(struct net_private *np) -{ + unbind_evtchn_from_irqhandler(np->evtchn, np->netdev); + return 0; +} + +static int netfront_resume(struct xenbus_device *dev) +{ + struct net_private *np = dev->data; /* * Connect regardless of whether IFF_UP flag set. * Stop bad things from happening until we're back up. 
@@ -1444,29 +1289,96 @@ memset(np->tx, 0, PAGE_SIZE); memset(np->rx, 0, PAGE_SIZE); - send_interface_connect(np); -} - -void netif_suspend(void) -{ - struct list_head *ent; - struct net_private *np; - - list_for_each (ent, &dev_list) { - np = list_entry(ent, struct net_private, list); - vif_suspend(np); - } -} - -void netif_resume(void) -{ - struct list_head *ent; - struct net_private *np; - - list_for_each (ent, &dev_list) { - np = list_entry(ent, struct net_private, list); - vif_resume(np); - } + // send_interface_connect(np); + return 0; +} + +static struct xenbus_driver netfront = { + .name = "vif", + .owner = THIS_MODULE, + .ids = netfront_ids, + .probe = netfront_probe, + .remove = netfront_remove, + .resume = netfront_resume, + .suspend = netfront_suspend, +}; + +static void __init init_net_xenbus(void) +{ + xenbus_register_device(&netfront); +} + +static int wait_for_netif(void) +{ + int err = 0; + int i; + + /* + * We should figure out how many and which devices we need to + * proceed and only wait for those. For now, continue once the + * first device is around. 
+ */ + for ( i=0; netif_state != NETIF_STATE_CONNECTED && (i < 10*HZ); i++ ) + { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(1); + } + + if (netif_state != NETIF_STATE_CONNECTED) { + WPRINTK("Timeout connecting to device!\n"); + err = -ENOSYS; + } + return err; +} + +static int __init netif_init(void) +{ + int err = 0; + + if (xen_start_info.flags & SIF_INITDOMAIN) + return 0; + +#ifdef CONFIG_XEN_NETDEV_GRANT_TX + /* A grant for every ring slot */ + if (gnttab_alloc_grant_references(NETIF_TX_RING_SIZE, + &gref_tx_head) < 0) { + printk(KERN_ALERT "#### netfront can't alloc tx grant refs\n"); + return 1; + } + printk(KERN_ALERT "Netdev frontend (TX) is using grant tables.\n"); +#endif +#ifdef CONFIG_XEN_NETDEV_GRANT_RX + /* A grant for every ring slot */ + if (gnttab_alloc_grant_references(NETIF_RX_RING_SIZE, + &gref_rx_head) < 0) { + printk(KERN_ALERT "#### netfront can't alloc rx grant refs\n"); + return 1; + } + printk(KERN_ALERT "Netdev frontend (RX) is using grant tables.\n"); +#endif + + if ((err = xennet_proc_init()) != 0) + return err; + + IPRINTK("Initialising virtual ethernet driver.\n"); + + (void)register_inetaddr_notifier(&notifier_inetdev); + + init_net_xenbus(); + + wait_for_netif(); + + return err; +} + +static void netif_exit(void) +{ +#ifdef CONFIG_XEN_NETDEV_GRANT_TX + gnttab_free_grant_references(gref_tx_head); +#endif +#ifdef CONFIG_XEN_NETDEV_GRANT_RX + gnttab_free_grant_references(gref_rx_head); +#endif } #ifdef CONFIG_PROC_FS diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c --- a/linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c Thu Aug 25 22:53:20 2005 @@ -139,7 +139,7 @@ privcmd_mmapbatch_t m; struct vm_area_struct *vma = NULL; unsigned long *p, addr; - unsigned long mfn; + unsigned long mfn, ptep; int i; if ( copy_from_user(&m, (void *)data, sizeof(m)) ) @@ -163,12 +163,12 @@ if ( get_user(mfn, p) ) return
-EFAULT; - u.val = (mfn << PAGE_SHIFT) | pgprot_val(vma->vm_page_prot); - - __direct_remap_area_pages(vma->vm_mm, - addr, - PAGE_SIZE, - &u); + ret = create_lookup_pte_addr(vma->vm_mm, addr, &ptep); + if (ret) + goto batch_err; + + u.val = pte_val_ma(pfn_pte_ma(mfn, vma->vm_page_prot)); + u.ptr = ptep; if ( unlikely(HYPERVISOR_mmu_update(&u, 1, NULL, m.dom) < 0) ) put_user(0xF0000000 | mfn, p); diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/usbback/common.h --- a/linux-2.6-xen-sparse/drivers/xen/usbback/common.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/usbback/common.h Thu Aug 25 22:53:20 2005 @@ -37,7 +37,6 @@ /* Physical parameters of the comms window. */ unsigned long shmem_frame; unsigned int evtchn; - int irq; /* Comms Information */ usbif_back_ring_t usb_ring; /* Private fields. */ diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/usbback/interface.c --- a/linux-2.6-xen-sparse/drivers/xen/usbback/interface.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/usbback/interface.c Thu Aug 25 22:53:20 2005 @@ -6,15 +6,6 @@ * by Mark Williamson, Copyright (c) 2004 */ - -/****************************************************************************** - * arch/xen/drivers/blkif/backend/interface.c - * - * Block-device interface management. - * - * Copyright (c) 2004, Keir Fraser - */ - #include "common.h" #define USBIF_HASHSZ 1024 @@ -42,7 +33,6 @@ * may be outstanding requests at the device whose asynchronous responses * must still be notified to the remote driver. */ - unbind_evtchn_from_irq(usbif->evtchn); vfree(usbif->usb_ring.sring); /* Construct the deferred response message. 
*/ @@ -198,12 +188,12 @@ BACK_RING_INIT(&up->usb_ring, sring, PAGE_SIZE); up->evtchn = evtchn; - up->irq = bind_evtchn_to_irq(evtchn); up->shmem_frame = shmem_frame; up->status = CONNECTED; usbif_get(up); - request_irq(up->irq, usbif_be_int, 0, "usbif-backend", up); + (void)bind_evtchn_to_irqhandler( + evtchn, usbif_be_int, 0, "usbif-backend", up); connect->status = USBIF_BE_STATUS_OKAY; } @@ -233,7 +223,7 @@ up->status = DISCONNECTING; up->disconnect_rspid = rsp_id; wmb(); /* Let other CPUs see the status change. */ - free_irq(up->irq, up); + unbind_evtchn_from_irqhandler(up->evtchn, up); usbif_deschedule(up); usbif_put(up); return 0; /* Caller should not send response message. */ diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/usbback/usbback.c --- a/linux-2.6-xen-sparse/drivers/xen/usbback/usbback.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/usbback/usbback.c Thu Aug 25 22:53:20 2005 @@ -657,8 +657,8 @@ phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] = FOREIGN_FRAME((buffer_mach + offset) >> PAGE_SHIFT); - ASSERT(virt_to_machine(MMAP_VADDR(pending_idx, i)) - == buffer_mach + i << PAGE_SHIFT); + ASSERT(virt_to_mfn(MMAP_VADDR(pending_idx, i)) + == ((buffer_mach >> PAGE_SHIFT) + i)); } if ( req->pipe_type == 0 && req->num_iso > 0 ) /* Maybe schedule ISO... 
*/ @@ -1027,13 +1027,15 @@ static int __init usbif_init(void) { int i; + struct page *page; if ( !(xen_start_info.flags & SIF_INITDOMAIN) && !(xen_start_info.flags & SIF_USB_BE_DOMAIN) ) return 0; - - if ( (mmap_vstart = allocate_empty_lowmem_region(MMAP_PAGES)) == 0 ) - BUG(); + + page = balloon_alloc_empty_page_range(MMAP_PAGES); + BUG_ON(page == NULL); + mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); pending_cons = 0; pending_prod = MAX_PENDING_REQS; diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/usbfront/usbfront.c --- a/linux-2.6-xen-sparse/drivers/xen/usbfront/usbfront.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/usbfront/usbfront.c Thu Aug 25 22:53:20 2005 @@ -195,7 +195,7 @@ } urb_priv->schedule = schedule; - req->iso_schedule = virt_to_machine(schedule); + req->iso_schedule = virt_to_mfn(schedule) << PAGE_SHIFT; return 0; } @@ -212,7 +212,7 @@ #if DEBUG printk(KERN_DEBUG "usbif = %p, req_prod = %d (@ 0x%lx), resp_prod = %d, resp_cons = %d\n", - usbif, usbif->req_prod, virt_to_machine(&usbif->req_prod), + usbif, usbif->req_prod, virt_to_mfn(&usbif->req_prod), usbif->resp_prod, xhci->usb_resp_cons); #endif @@ -232,7 +232,7 @@ req->operation = USBIF_OP_IO; req->port = 0; /* We don't care what the port is. */ req->id = (unsigned long) urb->hcpriv; - req->transfer_buffer = virt_to_machine(urb->transfer_buffer); + req->transfer_buffer = virt_to_mfn(urb->transfer_buffer) << PAGE_SHIFT; req->devnum = usb_pipedevice(urb->pipe); req->direction = usb_pipein(urb->pipe); req->speed = usb_pipeslow(urb->pipe); @@ -280,7 +280,7 @@ printk(KERN_DEBUG "queuing probe: req_prod = %d (@ 0x%lx), resp_prod = %d, " "resp_cons = %d\n", usbif->req_prod, - virt_to_machine(&usbif->req_prod), + virt_to_mfn(&usbif->req_prod), usbif->resp_prod, xhci->usb_resp_cons); #endif @@ -1536,8 +1536,7 @@ /* Clean up resources. 
*/ free_page((unsigned long)xhci->usb_ring.sring); - free_irq(xhci->irq, xhci); - unbind_evtchn_from_irq(xhci->evtchn); + unbind_evtchn_from_irqhandler(xhci->evtchn, xhci); /* Plug the ring. */ xhci->recovery = 1; @@ -1556,7 +1555,7 @@ cmsg.type = CMSG_USBIF_FE; cmsg.subtype = CMSG_USBIF_FE_INTERFACE_CONNECT; cmsg.length = sizeof(usbif_fe_interface_connect_t); - up.shmem_frame = virt_to_machine(sring) >> PAGE_SHIFT; + up.shmem_frame = virt_to_mfn(sring); memcpy(cmsg.msg, &up, sizeof(up)); /* Tell the controller to bring up the interface. */ @@ -1572,7 +1571,6 @@ } xhci->evtchn = status->evtchn; - xhci->irq = bind_evtchn_to_irq(xhci->evtchn); xhci->bandwidth = status->bandwidth; xhci->rh.numports = status->num_ports; @@ -1595,14 +1593,14 @@ usb_claim_bandwidth(xhci->rh.dev, xhci->rh.urb, 1000 - xhci->bandwidth, 0); - if ( (rc = request_irq(xhci->irq, xhci_interrupt, + if ( (rc = bind_evtchn_to_irqhandler(xhci->evtchn, xhci_interrupt, SA_SAMPLE_RANDOM, "usbif", xhci)) ) printk(KERN_ALERT"usbfront request_irq failed (%ld)\n",rc); DPRINTK(KERN_INFO __FILE__ - ": USB XHCI: SHM at %p (0x%lx), EVTCHN %d IRQ %d\n", - xhci->usb_ring.sring, virt_to_machine(xhci->usbif), - xhci->evtchn, xhci->irq); + ": USB XHCI: SHM at %p (0x%lx), EVTCHN %d\n", + xhci->usb_ring.sring, virt_to_mfn(xhci->usbif), + xhci->evtchn); xhci->state = USBIF_STATE_CONNECTED; diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/usbfront/xhci.h --- a/linux-2.6-xen-sparse/drivers/xen/usbfront/xhci.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/usbfront/xhci.h Thu Aug 25 22:53:20 2005 @@ -54,7 +54,6 @@ #endif int evtchn; /* Interdom channel to backend */ - int irq; /* Bound to evtchn */ enum { USBIF_STATE_CONNECTED = 2, USBIF_STATE_DISCONNECTED = 1, diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/xenbus/Makefile --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/Makefile Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/Makefile Thu 
Aug 25 22:53:20 2005 @@ -4,7 +4,3 @@ xenbus-objs += xenbus_comms.o xenbus-objs += xenbus_xs.o xenbus-objs += xenbus_probe.o - -XEN_TOOLS_DIR := "../tools" -vpath %.h $(XEN_TOOLS_DIR) -EXTRA_CFLAGS += -I $(XEN_TOOLS_DIR) diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_comms.c --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_comms.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_comms.c Thu Aug 25 22:53:20 2005 @@ -26,7 +26,6 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ -//#define DEBUG #include <asm-xen/hypervisor.h> #include <asm-xen/evtchn.h> @@ -49,13 +48,12 @@ static inline struct ringbuf_head *outbuf(void) { - return machine_to_virt(xen_start_info.store_mfn << PAGE_SHIFT); + return mfn_to_virt(xen_start_info.store_mfn); } static inline struct ringbuf_head *inbuf(void) { - return machine_to_virt(xen_start_info.store_mfn << PAGE_SHIFT) - + PAGE_SIZE/2; + return mfn_to_virt(xen_start_info.store_mfn) + PAGE_SIZE/2; } static irqreturn_t wake_waiting(int irq, void *unused, struct pt_regs *regs) @@ -202,14 +200,17 @@ return 0; } -/* Set up interrpt handler off store event channel. */ +/* Set up interrupt handler off store event channel. 
*/ int xb_init_comms(void) { - int err, irq; - - irq = bind_evtchn_to_irq(xen_start_info.store_evtchn); - - err = request_irq(irq, wake_waiting, SA_SHIRQ, "xenbus", &xb_waitq); + int err; + + if (!xen_start_info.store_evtchn) + return 0; + + err = bind_evtchn_to_irqhandler( + xen_start_info.store_evtchn, wake_waiting, + 0, "xenbus", &xb_waitq); if (err) { printk(KERN_ERR "XENBUS request irq failed %i\n", err); unbind_evtchn_from_irq(xen_start_info.store_evtchn); @@ -217,8 +218,16 @@ } /* FIXME zero out page -- domain builder should probably do this*/ - memset(machine_to_virt(xen_start_info.store_mfn << PAGE_SHIFT), - 0, PAGE_SIZE); + memset(mfn_to_virt(xen_start_info.store_mfn), 0, PAGE_SIZE); return 0; } + +void xb_suspend_comms(void) +{ + + if (!xen_start_info.store_evtchn) + return; + + unbind_evtchn_from_irqhandler(xen_start_info.store_evtchn, &xb_waitq); +} diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_comms.h --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_comms.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_comms.h Thu Aug 25 22:53:20 2005 @@ -1,8 +1,36 @@ -/* Private include for xenbus communications. */ +/* + * Private include for xenbus communications. 
+ * + * Copyright (C) 2005 Rusty Russell, IBM Corporation + * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + #ifndef _XENBUS_COMMS_H #define _XENBUS_COMMS_H + int xs_init(void); int xb_init_comms(void); +void xb_suspend_comms(void); /* Low level routines. 
*/ int xb_write(const void *data, unsigned len); diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_probe.c Thu Aug 25 22:53:20 2005 @@ -29,30 +29,26 @@ #include <asm-xen/hypervisor.h> #include <asm-xen/xenbus.h> +#include <asm-xen/balloon.h> #include <linux/kernel.h> #include <linux/err.h> #include <linux/string.h> #include <linux/ctype.h> #include <linux/fcntl.h> #include <stdarg.h> +#include <linux/notifier.h> #include "xenbus_comms.h" #define streq(a, b) (strcmp((a), (b)) == 0) + +static struct notifier_block *xenstore_chain; /* If something in array of ids matches this device, return it. */ static const struct xenbus_device_id * match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev) { for (; !streq(arr->devicetype, ""); arr++) { - if (!streq(arr->devicetype, dev->devicetype)) - continue; - - /* If they don't care what subtype, it's a match. */ - if (streq(arr->subtype, "")) - return arr; - - /* If they care, device must have (same) subtype. 
*/ - if (dev->subtype && streq(arr->subtype, dev->subtype)) + if (streq(arr->devicetype, dev->devicetype)) return arr; } return NULL; @@ -68,10 +64,102 @@ return match_device(drv->ids, to_xenbus_device(_dev)) != NULL; } +struct xen_bus_type +{ + char *root; + unsigned int levels; + int (*get_bus_id)(char bus_id[BUS_ID_SIZE], const char *nodename); + int (*probe)(const char *type, const char *dir); + struct bus_type bus; + struct device dev; +}; + +/* device/<type>/<id> => <type>-<id> */ +static int frontend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename) +{ + nodename = strchr(nodename, '/'); + if (!nodename || strlen(nodename + 1) >= BUS_ID_SIZE) { + printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename); + return -EINVAL; + } + + strlcpy(bus_id, nodename + 1, BUS_ID_SIZE); + if (!strchr(bus_id, '/')) { + printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id); + return -EINVAL; + } + *strchr(bus_id, '/') = '-'; + return 0; +} + /* Bus type for frontend drivers. */ -static struct bus_type xenbus_type = { - .name = "xenbus", - .match = xenbus_match, +static int xenbus_probe_frontend(const char *type, const char *name); +static struct xen_bus_type xenbus_frontend = { + .root = "device", + .levels = 2, /* device/type/<id> */ + .get_bus_id = frontend_bus_id, + .probe = xenbus_probe_frontend, + .bus = { + .name = "xen", + .match = xenbus_match, + }, + .dev = { + .bus_id = "xen", + }, +}; + +/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */ +static int backend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename) +{ + int domid, err; + const char *devid, *type, *frontend; + unsigned int typelen; + + type = strchr(nodename, '/'); + if (!type) + return -EINVAL; + type++; + typelen = strcspn(type, "/"); + if (!typelen || type[typelen] != '/') + return -EINVAL; + + devid = strrchr(nodename, '/') + 1; + + err = xenbus_gather(nodename, "frontend-id", "%i", &domid, + "frontend", NULL, &frontend, + NULL); + if (err) + return err; + if 
(strlen(frontend) == 0) + err = -ERANGE; + + if (!err && !xenbus_exists(frontend, "")) + err = -ENOENT; + + if (err) { + kfree(frontend); + return err; + } + + if (snprintf(bus_id, BUS_ID_SIZE, + "%.*s-%i-%s", typelen, type, domid, devid) >= BUS_ID_SIZE) + return -ENOSPC; + return 0; +} + +static int xenbus_probe_backend(const char *type, const char *uuid); +static struct xen_bus_type xenbus_backend = { + .root = "backend", + .levels = 3, /* backend/type/<frontend>/<id> */ + .get_bus_id = backend_bus_id, + .probe = xenbus_probe_backend, + .bus = { + .name = "xen-backend", + .match = xenbus_match, + }, + .dev = { + .bus_id = "xen-backend", + }, }; static int xenbus_dev_probe(struct device *_dev) @@ -100,12 +188,13 @@ return drv->remove(dev); } -int xenbus_register_driver(struct xenbus_driver *drv) +static int xenbus_register_driver(struct xenbus_driver *drv, + struct xen_bus_type *bus) { int err; drv->driver.name = drv->name; - drv->driver.bus = &xenbus_type; + drv->driver.bus = &bus->bus; drv->driver.owner = drv->owner; drv->driver.probe = xenbus_dev_probe; drv->driver.remove = xenbus_dev_remove; @@ -116,6 +205,16 @@ return err; } +int xenbus_register_device(struct xenbus_driver *drv) +{ + return xenbus_register_driver(drv, &xenbus_frontend); +} + +int xenbus_register_backend(struct xenbus_driver *drv) +{ + return xenbus_register_driver(drv, &xenbus_backend); +} + void xenbus_unregister_driver(struct xenbus_driver *drv) { down(&xenbus_lock); @@ -126,52 +225,98 @@ struct xb_find_info { struct xenbus_device *dev; - const char *busid; + const char *nodename; }; static int cmp_dev(struct device *dev, void *data) { + struct xenbus_device *xendev = to_xenbus_device(dev); struct xb_find_info *info = data; - if (streq(dev->bus_id, info->busid)) { - info->dev = container_of(get_device(dev), - struct xenbus_device, dev); + if (streq(xendev->nodename, info->nodename)) { + info->dev = xendev; + get_device(dev); return 1; } return 0; } -/* FIXME: device_find is fixed in 
2.6.13-rc2 according to Greg KH --RR */ -struct xenbus_device *xenbus_device_find(const char *busid) -{ - struct xb_find_info info = { .dev = NULL, .busid = busid }; - - bus_for_each_dev(&xenbus_type, NULL, &info, cmp_dev); +struct xenbus_device *xenbus_device_find(const char *nodename, + struct bus_type *bus) +{ + struct xb_find_info info = { .dev = NULL, .nodename = nodename }; + + bus_for_each_dev(bus, NULL, &info, cmp_dev); return info.dev; } +static int cleanup_dev(struct device *dev, void *data) +{ + struct xenbus_device *xendev = to_xenbus_device(dev); + struct xb_find_info *info = data; + int len = strlen(info->nodename); + + if (!strncmp(xendev->nodename, info->nodename, len)) { + info->dev = xendev; + get_device(dev); + return 1; + } + return 0; +} + +static void xenbus_cleanup_devices(const char *path, struct bus_type *bus) +{ + struct xb_find_info info = { .nodename = path }; + + do { + info.dev = NULL; + bus_for_each_dev(bus, NULL, &info, cleanup_dev); + if (info.dev) { + device_unregister(&info.dev->dev); + put_device(&info.dev->dev); + } + } while (info.dev); +} static void xenbus_release_device(struct device *dev) { if (dev) { struct xenbus_device *xendev = to_xenbus_device(dev); - kfree(xendev->subtype); kfree(xendev); } } -/* devices/<typename>/<name> */ -static int xenbus_probe_device(const char *dirpath, const char *devicetype, - const char *name) + +/* Simplified asprintf. */ +static char *kasprintf(const char *fmt, ...) 
+{ + va_list ap; + unsigned int len; + char *p, dummy[1]; + + va_start(ap, fmt); + /* FIXME: vsnprintf has a bug, NULL should work */ + len = vsnprintf(dummy, 0, fmt, ap); + va_end(ap); + + p = kmalloc(len + 1, GFP_KERNEL); + if (!p) + return NULL; + va_start(ap, fmt); + vsprintf(p, fmt, ap); + va_end(ap); + return p; +} + +static int xenbus_probe_node(struct xen_bus_type *bus, + const char *type, + const char *nodename) { int err; struct xenbus_device *xendev; unsigned int stringlen; - /* Nodename: /device/<typename>/<name>/ */ - stringlen = strlen(dirpath) + strlen(devicetype) + strlen(name) + 3; - /* Typename */ - stringlen += strlen(devicetype) + 1; + stringlen = strlen(nodename) + 1 + strlen(type) + 1; xendev = kmalloc(sizeof(*xendev) + stringlen, GFP_KERNEL); if (!xendev) return -ENOMEM; @@ -179,38 +324,103 @@ /* Copy the strings into the extra space. */ xendev->nodename = (char *)(xendev + 1); - sprintf(xendev->nodename, "%s/%s/%s", dirpath, devicetype, name); + strcpy(xendev->nodename, nodename); xendev->devicetype = xendev->nodename + strlen(xendev->nodename) + 1; - strcpy(xendev->devicetype, devicetype); - - /* FIXME: look for "subtype" field. */ - snprintf(xendev->dev.bus_id, BUS_ID_SIZE, "%s-%s", devicetype, name); - xendev->dev.bus = &xenbus_type; + strcpy(xendev->devicetype, type); + + xendev->dev.parent = &bus->dev; + xendev->dev.bus = &bus->bus; xendev->dev.release = xenbus_release_device; + + err = bus->get_bus_id(xendev->dev.bus_id, xendev->nodename); + if (err) { + kfree(xendev); + return err; + } /* Register with generic device framework. 
*/ err = device_register(&xendev->dev); if (err) { - printk("XENBUS: Registering device %s: error %i\n", - xendev->dev.bus_id, err); + printk("XENBUS: Registering %s device %s: error %i\n", + bus->bus.name, xendev->dev.bus_id, err); kfree(xendev); } return err; } -static int xenbus_probe_device_type(const char *dirpath, const char *typename) +/* device/<typename>/<name> */ +static int xenbus_probe_frontend(const char *type, const char *name) +{ + char *nodename; + int err; + + nodename = kasprintf("%s/%s/%s", xenbus_frontend.root, type, name); + if (!nodename) + return -ENOMEM; + + err = xenbus_probe_node(&xenbus_frontend, type, nodename); + kfree(nodename); + return err; +} + +/* backend/<typename>/<frontend-uuid>/<name> */ +static int xenbus_probe_backend_unit(const char *dir, + const char *type, + const char *name) +{ + char *nodename; + int err; + + nodename = kasprintf("%s/%s", dir, name); + if (!nodename) + return -ENOMEM; + + err = xenbus_probe_node(&xenbus_backend, type, nodename); + kfree(nodename); + return err; +} + +/* backend/<typename>/<frontend-uuid> */ +static int xenbus_probe_backend(const char *type, const char *uuid) +{ + char *nodename; + int err = 0; + char **dir; + unsigned int i, dir_n = 0; + + nodename = kasprintf("%s/%s/%s", xenbus_backend.root, type, uuid); + if (!nodename) + return -ENOMEM; + + dir = xenbus_directory(nodename, "", &dir_n); + if (IS_ERR(dir)) { + kfree(nodename); + return PTR_ERR(dir); + } + + for (i = 0; i < dir_n; i++) { + err = xenbus_probe_backend_unit(nodename, type, dir[i]); + if (err) + break; + } + kfree(dir); + kfree(nodename); + return err; +} + +static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type) { int err = 0; char **dir; unsigned int dir_n = 0; int i; - dir = xenbus_directory(dirpath, typename, &dir_n); + dir = xenbus_directory(bus->root, type, &dir_n); if (IS_ERR(dir)) return PTR_ERR(dir); for (i = 0; i < dir_n; i++) { - err = xenbus_probe_device(dirpath, typename, dir[i]); + err = 
bus->probe(type, dir[i]); if (err) break; } @@ -218,18 +428,18 @@ return err; } -static int xenbus_probe_devices(const char *path) +static int xenbus_probe_devices(struct xen_bus_type *bus) { int err = 0; char **dir; unsigned int i, dir_n; - dir = xenbus_directory(path, "", &dir_n); + dir = xenbus_directory(bus->root, "", &dir_n); if (IS_ERR(dir)) return PTR_ERR(dir); for (i = 0; i < dir_n; i++) { - err = xenbus_probe_device_type(path, dir[i]); + err = xenbus_probe_device_type(bus, dir[i]); if (err) break; } @@ -247,53 +457,154 @@ return ret; } -static void dev_changed(struct xenbus_watch *watch, const char *node) -{ - char busid[BUS_ID_SIZE]; - int exists; +static int strsep_len(const char *str, char c, unsigned int len) +{ + unsigned int i; + + for (i = 0; str[i]; i++) + if (str[i] == c) { + if (len == 0) + return i; + len--; + } + return (len == 0) ? i : -ERANGE; +} + +static void dev_changed(const char *node, struct xen_bus_type *bus) +{ + int exists, rootlen; struct xenbus_device *dev; - char *p; - - /* Node is of form device/<type>/<identifier>[/...] */ - if (char_count(node, '/') != 2) + char type[BUS_ID_SIZE]; + const char *p, *root; + + if (char_count(node, '/') < 2) + return; + + exists = xenbus_exists(node, ""); + if (!exists) { + xenbus_cleanup_devices(node, &bus->bus); return; - - /* Created or deleted? */ - exists = xenbus_exists(node, ""); - + } + + /* backend/<type>/... or device/<type>/... */ p = strchr(node, '/') + 1; - if (strlen(p) + 1 > BUS_ID_SIZE) { - printk("Device for node %s is too big!\n", node); + snprintf(type, BUS_ID_SIZE, "%.*s", strcspn(p, "/"), p); + type[BUS_ID_SIZE-1] = '\0'; + + rootlen = strsep_len(node, '/', bus->levels); + if (rootlen < 0) return; - } - /* Bus ID is name with / changed to - */ - strcpy(busid, p); - *strchr(busid, '/') = '-'; - - dev = xenbus_device_find(busid); - printk("xenbus: device %s %s\n", busid, dev ? 
"exists" : "new"); - if (dev && !exists) { - printk("xenbus: Unregistering device %s\n", busid); - /* FIXME: free? */ - device_unregister(&dev->dev); - } else if (!dev && exists) { - printk("xenbus: Adding device %s\n", busid); - /* Hack bus id back into two strings. */ - *strrchr(busid, '-') = '\0'; - xenbus_probe_device("device", busid, busid+strlen(busid)+1); - } else - printk("xenbus: strange, %s already %s\n", busid, - exists ? "exists" : "gone"); - if (dev) + root = kasprintf("%.*s", rootlen, node); + if (!root) + return; + + dev = xenbus_device_find(root, &bus->bus); + if (!dev) + xenbus_probe_node(bus, type, root); + else put_device(&dev->dev); + + kfree(root); +} + +static void frontend_changed(struct xenbus_watch *watch, const char *node) +{ + dev_changed(node, &xenbus_frontend); +} + +static void backend_changed(struct xenbus_watch *watch, const char *node) +{ + dev_changed(node, &xenbus_backend); } /* We watch for devices appearing and vanishing. */ -static struct xenbus_watch dev_watch = { - /* FIXME: Ideally we'd only watch for changes 2 levels deep... 
*/ +static struct xenbus_watch fe_watch = { .node = "device", - .callback = dev_changed, + .callback = frontend_changed, }; + +static struct xenbus_watch be_watch = { + .node = "backend", + .callback = backend_changed, +}; + +static int suspend_dev(struct device *dev, void *data) +{ + int err = 0; + struct xenbus_driver *drv; + struct xenbus_device *xdev; + + if (dev->driver == NULL) + return 0; + drv = to_xenbus_driver(dev->driver); + xdev = container_of(dev, struct xenbus_device, dev); + if (drv->suspend) + err = drv->suspend(xdev); + if (err) + printk("xenbus: suspend %s failed: %i\n", dev->bus_id, err); + return 0; +} + +static int resume_dev(struct device *dev, void *data) +{ + int err = 0; + struct xenbus_driver *drv; + struct xenbus_device *xdev; + + if (dev->driver == NULL) + return 0; + drv = to_xenbus_driver(dev->driver); + xdev = container_of(dev, struct xenbus_device, dev); + if (drv->resume) + err = drv->resume(xdev); + if (err) + printk("xenbus: resume %s failed: %i\n", dev->bus_id, err); + return 0; +} + +void xenbus_suspend(void) +{ + /* We keep lock, so no comms can happen as page moves. 
*/ + down(&xenbus_lock); + bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev); + bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, suspend_dev); + xb_suspend_comms(); +} + +void xenbus_resume(void) +{ + xb_init_comms(); + reregister_xenbus_watches(); + bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev); + bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, resume_dev); + up(&xenbus_lock); +} + +int register_xenstore_notifier(struct notifier_block *nb) +{ + int ret = 0; + + down(&xenbus_lock); + + if (xen_start_info.store_evtchn) { + ret = nb->notifier_call(nb, 0, NULL); + } else { + notifier_chain_register(&xenstore_chain, nb); + } + + up(&xenbus_lock); + + return ret; +} +EXPORT_SYMBOL(register_xenstore_notifier); + +void unregister_xenstore_notifier(struct notifier_block *nb) +{ + down(&xenbus_lock); + notifier_chain_unregister(&xenstore_chain, nb); + up(&xenbus_lock); +} +EXPORT_SYMBOL(unregister_xenstore_notifier); /* called from a thread in privcmd/privcmd.c */ int do_xenbus_probe(void *unused) @@ -309,21 +620,25 @@ return err; } - /* Initialize non-xenbus drivers */ - balloon_init_watcher(); - down(&xenbus_lock); /* Enumerate devices in xenstore. */ - xenbus_probe_devices("device"); + xenbus_probe_devices(&xenbus_frontend); + xenbus_probe_devices(&xenbus_backend); /* Watch for changes. 
*/ - register_xenbus_watch(&dev_watch); + register_xenbus_watch(&fe_watch); + register_xenbus_watch(&be_watch); + /* Notify others that xenstore is up */ + notifier_call_chain(&xenstore_chain, 0, 0); up(&xenbus_lock); return 0; } static int __init xenbus_probe_init(void) { - bus_register(&xenbus_type); + bus_register(&xenbus_frontend.bus); + bus_register(&xenbus_backend.bus); + device_register(&xenbus_frontend.dev); + device_register(&xenbus_backend.dev); if (!xen_start_info.store_evtchn) return 0; diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c --- a/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/xenbus/xenbus_xs.c Thu Aug 25 22:53:20 2005 @@ -30,7 +30,6 @@ #include <linux/errno.h> #include <linux/types.h> -#include "xenstore/xenstored.h" #include <linux/uio.h> #include <linux/kernel.h> #include <linux/string.h> @@ -39,6 +38,7 @@ #include <linux/fcntl.h> #include <linux/kthread.h> #include <asm-xen/xenbus.h> +#include "xenstored.h" #include "xenbus_comms.h" #define streq(a, b) (strcmp((a), (b)) == 0) @@ -187,6 +187,7 @@ static char buffer[4096]; BUG_ON(down_trylock(&xenbus_lock) == 0); + /* XXX FIXME: might not be correct if name == "" */ BUG_ON(strlen(dir) + strlen("/") + strlen(name) + 1 > sizeof(buffer)); strcpy(buffer, dir); @@ -399,9 +400,12 @@ ret = PTR_ERR(p); break; } - if (sscanf(p, fmt, result) == 0) - ret = -EINVAL; - kfree(p); + if (fmt) { + if (sscanf(p, fmt, result) == 0) + ret = -EINVAL; + kfree(p); + } else + *(char **)result = p; } va_end(ap); return ret; @@ -494,6 +498,18 @@ printk(KERN_WARNING "XENBUS Failed to release watch %s: %i\n", watch->node, err); +} + +/* Re-register callbacks to all watches. 
*/ +void reregister_xenbus_watches(void) +{ + struct xenbus_watch *watch; + char token[sizeof(watch) * 2 + 1]; + + list_for_each_entry(watch, &watches, list) { + sprintf(token, "%lX", (long)watch); + xs_watch(watch->node, token); + } } static int watch_thread(void *unused) diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-generic/pgtable.h --- a/linux-2.6-xen-sparse/include/asm-generic/pgtable.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-generic/pgtable.h Thu Aug 25 22:53:20 2005 @@ -37,7 +37,7 @@ */ #define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \ do { \ - set_pte_at((__vma)>vm_mm, (__address), __ptep, __entry); \ + set_pte_at((__vma)->vm_mm, (__address), __ptep, __entry); \ flush_tlb_page(__vma, __address); \ } while (0) #endif diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/asm-i386/desc.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/desc.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/desc.h Thu Aug 25 22:53:20 2005 @@ -93,7 +93,7 @@ static inline void load_TLS(struct thread_struct *t, unsigned int cpu) { -#define C(i) HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), ((u32 *)&t->tls_array[i])[0], ((u32 *)&t->tls_array[i])[1]) +#define C(i) HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), *(u64 *)&t->tls_array[i]) C(0); C(1); C(2); #undef C } diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/asm-i386/dma-mapping.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/dma-mapping.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/dma-mapping.h Thu Aug 25 22:53:20 2005 @@ -1,11 +1,35 @@ #ifndef _ASM_I386_DMA_MAPPING_H #define _ASM_I386_DMA_MAPPING_H +/* + * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for + * documentation. 
+ */ + +#include <linux/config.h> #include <linux/mm.h> - #include <asm/cache.h> #include <asm/io.h> #include <asm/scatterlist.h> +#include <asm-i386/swiotlb.h> + +static inline int +address_needs_mapping(struct device *hwdev, dma_addr_t addr) +{ + dma_addr_t mask = 0xffffffff; + /* If the device has a mask, use it, otherwise default to 32 bits */ + if (hwdev && hwdev->dma_mask) + mask = *hwdev->dma_mask; + return (addr & ~mask) != 0; +} + +static inline int +range_straddles_page_boundary(void *p, size_t size) +{ + extern unsigned long *contiguous_bitmap; + return (((((unsigned long)p & ~PAGE_MASK) + size) > PAGE_SIZE) && + !test_bit(__pa(p) >> PAGE_SHIFT, contiguous_bitmap)); +} #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h) @@ -24,46 +48,18 @@ dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, enum dma_data_direction direction); -static inline int -dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction direction) -{ - int i; +extern int dma_map_sg(struct device *hwdev, struct scatterlist *sg, + int nents, enum dma_data_direction direction); +extern void dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, + int nents, enum dma_data_direction direction); - BUG_ON(direction == DMA_NONE); +extern dma_addr_t +dma_map_page(struct device *dev, struct page *page, unsigned long offset, + size_t size, enum dma_data_direction direction); - for (i = 0; i < nents; i++ ) { - BUG_ON(!sg[i].page); - - sg[i].dma_address = page_to_phys(sg[i].page) + sg[i].offset; - } - - flush_write_buffers(); - return nents; -} - -static inline dma_addr_t -dma_map_page(struct device *dev, struct page *page, unsigned long offset, - size_t size, enum dma_data_direction direction) -{ - BUG_ON(direction == DMA_NONE); - return page_to_phys(page) + offset; -} - -static inline void +extern void dma_unmap_page(struct device *dev, dma_addr_t 
dma_address, size_t size, - enum dma_data_direction direction) -{ - BUG_ON(direction == DMA_NONE); -} - - -static inline void -dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries, - enum dma_data_direction direction) -{ - BUG_ON(direction == DMA_NONE); -} + enum dma_data_direction direction); extern void dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, @@ -93,34 +89,25 @@ dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction direction) { + if (swiotlb) + swiotlb_sync_sg_for_cpu(dev,sg,nelems,direction); + flush_write_buffers(); } static inline void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction direction) { + if (swiotlb) + swiotlb_sync_sg_for_device(dev,sg,nelems,direction); flush_write_buffers(); } -static inline int -dma_mapping_error(dma_addr_t dma_addr) -{ - return 0; -} +extern int +dma_mapping_error(dma_addr_t dma_addr); -static inline int -dma_supported(struct device *dev, u64 mask) -{ - /* - * we fall back to GFP_DMA when the mask isn't all 1s, - * so we can't guarantee allocations that must be - * within a tighter range than GFP_DMA.. 
- */ - if(mask < 0x00ffffff) - return 0; - - return 1; -} +extern int +dma_supported(struct device *dev, u64 mask); static inline int dma_set_mask(struct device *dev, u64 mask) @@ -133,6 +120,7 @@ return 0; } +#ifdef __i386__ static inline int dma_get_cache_alignment(void) { @@ -140,6 +128,9 @@ * maximum possible, to be safe */ return (1 << L1_CACHE_SHIFT_MAX); } +#else +extern int dma_get_cache_alignment(void); +#endif #define dma_is_consistent(d) (1) diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/asm-i386/fixmap.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/fixmap.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/fixmap.h Thu Aug 25 22:53:20 2005 @@ -102,8 +102,8 @@ __end_of_fixed_addresses }; -extern void __set_fixmap (enum fixed_addresses idx, - unsigned long phys, pgprot_t flags); +extern void __set_fixmap( + enum fixed_addresses idx, maddr_t phys, pgprot_t flags); #define set_fixmap(idx, phys) \ __set_fixmap(idx, phys, PAGE_KERNEL) diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/asm-i386/hypercall.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/hypercall.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/hypercall.h Thu Aug 25 22:53:20 2005 @@ -163,7 +163,7 @@ TRAP_INSTR : "=a" (ret), "=b" (ign) : "0" (__HYPERVISOR_sched_op), "1" (SCHEDOP_yield) - : "memory" ); + : "memory", "ecx" ); return ret; } @@ -178,7 +178,7 @@ TRAP_INSTR : "=a" (ret), "=b" (ign1) : "0" (__HYPERVISOR_sched_op), "1" (SCHEDOP_block) - : "memory" ); + : "memory", "ecx" ); return ret; } @@ -194,7 +194,7 @@ : "=a" (ret), "=b" (ign1) : "0" (__HYPERVISOR_sched_op), "1" (SCHEDOP_shutdown | (SHUTDOWN_poweroff << SCHEDOP_reasonshift)) - : "memory" ); + : "memory", "ecx" ); return ret; } @@ -210,7 +210,7 @@ : "=a" (ret), "=b" (ign1) : "0" (__HYPERVISOR_sched_op), "1" (SCHEDOP_shutdown | (SHUTDOWN_reboot << SCHEDOP_reasonshift)) - : "memory" ); + : "memory", "ecx" ); return 
ret; } @@ -228,7 +228,7 @@ : "=a" (ret), "=b" (ign1), "=S" (ign2) : "0" (__HYPERVISOR_sched_op), "b" (SCHEDOP_shutdown | (SHUTDOWN_suspend << SCHEDOP_reasonshift)), - "S" (srec) : "memory"); + "S" (srec) : "memory", "ecx"); return ret; } @@ -244,7 +244,7 @@ : "=a" (ret), "=b" (ign1) : "0" (__HYPERVISOR_sched_op), "1" (SCHEDOP_shutdown | (SHUTDOWN_crash << SCHEDOP_reasonshift)) - : "memory" ); + : "memory", "ecx" ); return ret; } @@ -316,16 +316,17 @@ static inline int HYPERVISOR_update_descriptor( - unsigned long ma, unsigned long word1, unsigned long word2) -{ - int ret; - unsigned long ign1, ign2, ign3; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3) - : "0" (__HYPERVISOR_update_descriptor), "1" (ma), "2" (word1), - "3" (word2) + u64 ma, u64 desc) +{ + int ret; + unsigned long ign1, ign2, ign3, ign4; + + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4) + : "0" (__HYPERVISOR_update_descriptor), + "1" ((unsigned long)ma), "2" ((unsigned long)(ma>>32)), + "3" ((unsigned long)desc), "4" ((unsigned long)(desc>>32)) : "memory" ); return ret; @@ -385,13 +386,6 @@ #endif "4" (flags) : "memory" ); - - if ( unlikely(ret < 0) ) - { - printk(KERN_ALERT "Failed update VA mapping: %08lx, %08lx, %08lx\n", - va, (new_val).pte_low, flags); - BUG(); - } return ret; } @@ -536,12 +530,15 @@ { int ret; unsigned long ign1; + /* Yes, I really do want to clobber edx here: when we resume a + vcpu after unpickling a multi-processor domain, it returns + here, but clobbers all of the call clobbered registers. 
*/ __asm__ __volatile__ ( TRAP_INSTR : "=a" (ret), "=b" (ign1) : "0" (__HYPERVISOR_sched_op), "1" (SCHEDOP_vcpu_down | (vcpu << SCHEDOP_vcpushift)) - : "memory" ); + : "memory", "ecx", "edx" ); return ret; } @@ -557,8 +554,26 @@ : "=a" (ret), "=b" (ign1) : "0" (__HYPERVISOR_sched_op), "1" (SCHEDOP_vcpu_up | (vcpu << SCHEDOP_vcpushift)) + : "memory", "ecx" ); + + return ret; +} + +static inline int +HYPERVISOR_vcpu_pickle( + int vcpu, vcpu_guest_context_t *ctxt) +{ + int ret; + unsigned long ign1, ign2; + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1), "=c" (ign2) + : "0" (__HYPERVISOR_sched_op), + "1" (SCHEDOP_vcpu_pickle | (vcpu << SCHEDOP_vcpushift)), + "2" (ctxt) : "memory" ); return ret; } + #endif /* __HYPERCALL_H__ */ diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/asm-i386/mach-xen/irq_vectors.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mach-xen/irq_vectors.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mach-xen/irq_vectors.h Thu Aug 25 22:53:20 2005 @@ -124,17 +124,4 @@ #define dynirq_to_irq(_x) ((_x) + DYNIRQ_BASE) #define irq_to_dynirq(_x) ((_x) - DYNIRQ_BASE) -#ifndef __ASSEMBLY__ -/* Dynamic binding of event channels and VIRQ sources to Linux IRQ space. 
*/ -extern int bind_virq_to_irq(int virq); -extern void unbind_virq_from_irq(int virq); -extern int bind_ipi_to_irq(int ipi); -extern void unbind_ipi_from_irq(int ipi); -extern int bind_evtchn_to_irq(int evtchn); -extern void unbind_evtchn_from_irq(int evtchn); - -extern void irq_suspend(void); -extern void irq_resume(void); -#endif /* __ASSEMBLY__ */ - #endif /* _ASM_IRQ_VECTORS_H */ diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/asm-i386/mach-xen/setup_arch_post.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mach-xen/setup_arch_post.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mach-xen/setup_arch_post.h Thu Aug 25 22:53:20 2005 @@ -8,23 +8,12 @@ static char * __init machine_specific_memory_setup(void) { - char *who; - unsigned long start_pfn, max_pfn; - - who = "Xen"; - - /* In dom0, we have to start the fake e820 map above the first - * 1MB, in other domains, it can start at 0. */ - if (xen_start_info.flags & SIF_INITDOMAIN) - start_pfn = 0x100; - else - start_pfn = 0; - max_pfn = xen_start_info.nr_pages; + unsigned long max_pfn = xen_start_info.nr_pages; e820.nr_map = 0; - add_memory_region(PFN_PHYS(start_pfn), PFN_PHYS(max_pfn) - PFN_PHYS(start_pfn), E820_RAM); + add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM); - return who; + return "Xen"; } void __init machine_specific_modify_cpu_capabilities(struct cpuinfo_x86 *c) diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu_context.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu_context.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu_context.h Thu Aug 25 22:53:20 2005 @@ -34,10 +34,10 @@ * are always kernel segments while inside the kernel. Must * happen before reload of cr3/ldt (i.e., not in __switch_to). 
*/ - __asm__ __volatile__ ( "mov %%fs,%0 ; mov %%gs,%1" + asm volatile ( "mov %%fs,%0 ; mov %%gs,%1" : "=m" (*(int *)&current->thread.fs), "=m" (*(int *)&current->thread.gs)); - __asm__ __volatile__ ( "mov %0,%%fs ; mov %0,%%gs" + asm volatile ( "mov %0,%%fs ; mov %0,%%gs" : : "r" (0) ); } @@ -100,7 +100,7 @@ } #define deactivate_mm(tsk, mm) \ - asm("mov %0,%%fs ; mov %0,%%gs": :"r" (0)) + asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0)) #define activate_mm(prev, next) \ switch_mm((prev),(next),NULL) diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/asm-i386/page.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/page.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/page.h Thu Aug 25 22:53:20 2005 @@ -60,18 +60,50 @@ #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) /**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ +#define INVALID_P2M_ENTRY (~0U) +#define FOREIGN_FRAME(m) ((m) | 0x80000000U) extern unsigned int *phys_to_machine_mapping; -#define pfn_to_mfn(_pfn) ((unsigned long)(phys_to_machine_mapping[(_pfn)])) -#define mfn_to_pfn(_mfn) ((unsigned long)(machine_to_phys_mapping[(_mfn)])) -static inline unsigned long phys_to_machine(unsigned long phys) -{ - unsigned long machine = pfn_to_mfn(phys >> PAGE_SHIFT); +#define pfn_to_mfn(pfn) \ +((unsigned long)phys_to_machine_mapping[(unsigned int)(pfn)] & 0x7FFFFFFFUL) +static inline unsigned long mfn_to_pfn(unsigned long mfn) +{ + unsigned int pfn; + + /* + * The array access can fail (e.g., device space beyond end of RAM). + * In such cases it doesn't matter what we return (we return garbage), + * but we must handle the fault without crashing! + */ + asm ( + "1: movl %1,%0\n" + "2:\n" + ".section __ex_table,\"a\"\n" + " .align 4\n" + " .long 1b,2b\n" + ".previous" + : "=r" (pfn) : "m" (machine_to_phys_mapping[mfn]) ); + + return (unsigned long)pfn; +} + +/* Definitions for machine and pseudophysical addresses. 
*/ +#ifdef CONFIG_X86_PAE +typedef unsigned long long paddr_t; +typedef unsigned long long maddr_t; +#else +typedef unsigned long paddr_t; +typedef unsigned long maddr_t; +#endif + +static inline maddr_t phys_to_machine(paddr_t phys) +{ + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT); machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK); return machine; } -static inline unsigned long machine_to_phys(unsigned long machine) -{ - unsigned long phys = mfn_to_pfn(machine >> PAGE_SHIFT); +static inline paddr_t machine_to_phys(maddr_t machine) +{ + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT); phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK); return phys; } @@ -86,8 +118,9 @@ typedef struct { unsigned long long pmd; } pmd_t; typedef struct { unsigned long long pgd; } pgd_t; typedef struct { unsigned long long pgprot; } pgprot_t; -#define __pte(x) ({ unsigned long long _x = (x); \ - (((_x)&1) ? ((pte_t) {phys_to_machine(_x)}) : ((pte_t) {(_x)})); }) +#define __pte(x) ({ unsigned long long _x = (x); \ + if (_x & 1) _x = phys_to_machine(_x); \ + ((pte_t) {(unsigned long)(_x), (unsigned long)(_x>>32)}); }) #define __pgd(x) ({ unsigned long long _x = (x); \ (((_x)&1) ? 
((pgd_t) {phys_to_machine(_x)}) : ((pgd_t) {(_x)})); }) #define __pmd(x) ({ unsigned long long _x = (x); \ @@ -227,8 +260,10 @@ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) /* VIRT <-> MACHINE conversion */ -#define virt_to_machine(_a) (phys_to_machine(__pa(_a))) -#define machine_to_virt(_m) (__va(machine_to_phys(_m))) +#define virt_to_machine(v) (phys_to_machine(__pa(v))) +#define machine_to_virt(m) (__va(machine_to_phys(m))) +#define virt_to_mfn(v) (pfn_to_mfn(__pa(v) >> PAGE_SHIFT)) +#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) #endif /* __KERNEL__ */ diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/asm-i386/pci.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pci.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pci.h Thu Aug 25 22:53:20 2005 @@ -43,11 +43,8 @@ struct pci_dev; -/* The PCI address space does equal the physical memory - * address space. The networking and block device layers use - * this boolean for bounce buffer decisions. - */ -#define PCI_DMA_BUS_IS_PHYS (1) +/* On Xen we use SWIOTLB instead of blk-specific bounce buffers. */ +#define PCI_DMA_BUS_IS_PHYS (0) /* pci_unmap_{page,single} is a nop so... 
*/ #define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgalloc.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgalloc.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgalloc.h Thu Aug 25 22:53:20 2005 @@ -14,9 +14,9 @@ do { \ if (unlikely((mm)->context.pinned)) { \ if (!PageHighMem(pte)) \ - HYPERVISOR_update_va_mapping( \ + BUG_ON(HYPERVISOR_update_va_mapping( \ (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT),\ - pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0);\ + pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));\ set_pmd(pmd, __pmd(_PAGE_TABLE + \ ((unsigned long long)page_to_pfn(pte) << \ (unsigned long long) PAGE_SHIFT))); \ diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable-2level.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable-2level.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable-2level.h Thu Aug 25 22:53:20 2005 @@ -14,7 +14,28 @@ * hook is made available. */ #define set_pte(pteptr, pteval) (*(pteptr) = pteval) -#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval) + +inline static void set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t val ) +{ + if ( ((mm != current->mm) && (mm != &init_mm)) || + HYPERVISOR_update_va_mapping( (addr), (val), 0 ) ) + { + set_pte(ptep, val); + } +} + +inline static void set_pte_at_sync(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t val ) +{ + if ( ((mm != current->mm) && (mm != &init_mm)) || + HYPERVISOR_update_va_mapping( (addr), (val), UVMF_INVLPG ) ) + { + set_pte(ptep, val); + xen_invlpg(addr); + } +} + #define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval) #ifndef CONFIG_XEN_SHADOW_MODE @@ -42,17 +63,15 @@ * * NB2. When deliberately mapping foreign pages into the p2m table, you *must* * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we - * require. 
In all the cases we care about, the high bit gets shifted out - * (e.g., phys_to_machine()) so behaviour there is correct. + * require. In all the cases we care about, the FOREIGN_FRAME bit is + * masked (e.g., pfn_to_mfn()) so behaviour there is correct. */ -#define INVALID_P2M_ENTRY (~0U) -#define FOREIGN_FRAME(_m) ((_m) | (1UL<<((sizeof(unsigned long)*8)-1))) #define pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT) #define pte_pfn(_pte) \ ({ \ unsigned long mfn = pte_mfn(_pte); \ unsigned long pfn = mfn_to_pfn(mfn); \ - if ((pfn >= max_mapnr) || (pfn_to_mfn(pfn) != mfn)) \ + if ((pfn >= max_mapnr) || (phys_to_machine_mapping[pfn] != mfn))\ pfn = max_mapnr; /* special: force !pfn_valid() */ \ pfn; \ }) diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable-3level.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable-3level.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable-3level.h Thu Aug 25 22:53:20 2005 @@ -68,7 +68,27 @@ xen_l1_entry_update((pteptr), (pteval)) # define set_pte_atomic(pteptr,pteval) set_pte(pteptr,pteval) #endif -#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval) + +inline static void set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t val ) +{ + if ( ((mm != current->mm) && (mm != &init_mm)) || + HYPERVISOR_update_va_mapping( (addr), (val), 0 ) ) + { + set_pte(ptep, val); + } +} + +inline static void set_pte_at_sync(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t val ) +{ + if ( ((mm != current->mm) && (mm != &init_mm)) || + HYPERVISOR_update_va_mapping( (addr), (val), UVMF_INVLPG ) ) + { + set_pte(ptep, val); + xen_invlpg(addr); + } +} #ifdef CONFIG_XEN_SHADOW_MODE # define set_pmd(pmdptr,pmdval) \ @@ -130,14 +150,13 @@ return !pte.pte_low && !pte.pte_high; } -#define INVALID_P2M_ENTRY (~0U) -#define FOREIGN_FRAME(_m) ((_m) | (1UL<<((sizeof(unsigned long)*8)-1))) -#define pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT) /* 
FIXME */ +#define pte_mfn(_pte) ( ((_pte).pte_low >> PAGE_SHIFT) |\ + (((_pte).pte_high & 0xfff) << (32-PAGE_SHIFT)) ) #define pte_pfn(_pte) \ ({ \ unsigned long mfn = pte_mfn(_pte); \ unsigned long pfn = mfn_to_pfn(mfn); \ - if ((pfn >= max_mapnr) || (pfn_to_mfn(pfn) != mfn)) \ + if ((pfn >= max_mapnr) || (phys_to_machine_mapping[pfn] != mfn))\ pfn = max_mapnr; /* special: force !pfn_valid() */ \ pfn; \ }) diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgtable.h Thu Aug 25 22:53:20 2005 @@ -32,7 +32,7 @@ */ #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) extern unsigned long empty_zero_page[1024]; -extern pgd_t swapper_pg_dir[1024]; +extern pgd_t *swapper_pg_dir; extern kmem_cache_t *pgd_cache; extern kmem_cache_t *pmd_cache; extern spinlock_t pgd_lock; @@ -398,7 +398,7 @@ do { \ if (__dirty) { \ if ( likely((__vma)->vm_mm == current->mm) ) { \ - HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG|UVMF_MULTI|(unsigned long)((__vma)->vm_mm->cpu_vm_mask.bits)); \ + BUG_ON(HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG|UVMF_MULTI|(unsigned long)((__vma)->vm_mm->cpu_vm_mask.bits))); \ } else { \ xen_l1_entry_update((__ptep), (__entry)); \ flush_tlb_page((__vma), (__address)); \ @@ -416,8 +416,8 @@ #define ptep_establish_new(__vma, __address, __ptep, __entry) \ do { \ if (likely((__vma)->vm_mm == current->mm)) { \ - HYPERVISOR_update_va_mapping((__address), \ - __entry, 0); \ + BUG_ON(HYPERVISOR_update_va_mapping((__address), \ + __entry, 0)); \ } else { \ xen_l1_entry_update((__ptep), (__entry)); \ } \ @@ -450,7 +450,7 @@ #define arbitrary_virt_to_machine(__va) \ ({ \ pte_t *__pte = virt_to_ptep(__va); \ - unsigned long __pa = (*(unsigned long *)__pte) & PAGE_MASK; \ + maddr_t __pa = (maddr_t)pte_mfn(*__pte) << PAGE_SHIFT; \ __pa | ((unsigned 
long)(__va) & (PAGE_SIZE-1)); \ }) @@ -466,10 +466,12 @@ unsigned long size, pgprot_t prot, domid_t domid); -int __direct_remap_area_pages(struct mm_struct *mm, - unsigned long address, - unsigned long size, - mmu_update_t *v); +int create_lookup_pte_addr(struct mm_struct *mm, + unsigned long address, + unsigned long *ptep); +int touch_pte_range(struct mm_struct *mm, + unsigned long address, + unsigned long size); #define io_remap_page_range(vma,from,phys,size,prot) \ direct_remap_area_pages(vma->vm_mm,from,phys,size,prot,DOMID_IO) diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/dma-mapping.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/dma-mapping.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/dma-mapping.h Thu Aug 25 22:53:20 2005 @@ -1,89 +1,1 @@ -#ifndef _X8664_DMA_MAPPING_H -#define _X8664_DMA_MAPPING_H 1 - -/* - * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for - * documentation. - */ - -#include <linux/config.h> - -#include <asm/scatterlist.h> -#include <asm/io.h> -#include <asm/swiotlb.h> - -extern dma_addr_t bad_dma_address; -#define dma_mapping_error(x) \ - (swiotlb ? 
swiotlb_dma_mapping_error(x) : ((x) == bad_dma_address)) - -void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, - unsigned gfp); -void dma_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle); - -extern dma_addr_t dma_map_single(struct device *hwdev, void *ptr, size_t size, - enum dma_data_direction direction); -extern void dma_unmap_single(struct device *dev, dma_addr_t addr,size_t size, - enum dma_data_direction direction); - -#define dma_map_page(dev,page,offset,size,dir) \ - dma_map_single((dev), page_address(page)+(offset), (size), (dir)) - -extern void -dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction); - -extern void -dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction); - -static inline void dma_sync_sg_for_cpu(struct device *hwdev, - struct scatterlist *sg, - int nelems, int direction) -{ - if (direction == DMA_NONE) - out_of_line_bug(); - - if (swiotlb) - return swiotlb_sync_sg_for_cpu(hwdev,sg,nelems,direction); - - flush_write_buffers(); -} - -static inline void dma_sync_sg_for_device(struct device *hwdev, - struct scatterlist *sg, - int nelems, int direction) -{ - if (direction == DMA_NONE) - out_of_line_bug(); - - if (swiotlb) - return swiotlb_sync_sg_for_device(hwdev,sg,nelems,direction); - - flush_write_buffers(); -} - -extern int dma_map_sg(struct device *hwdev, struct scatterlist *sg, - int nents, int direction); -extern void dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, - int nents, int direction); - -#define dma_unmap_page dma_unmap_single - -extern int dma_supported(struct device *hwdev, u64 mask); -extern int dma_get_cache_alignment(void); -#define dma_is_consistent(h) 1 - -static inline int dma_set_mask(struct device *dev, u64 mask) -{ - if (!dev->dma_mask || !dma_supported(dev, mask)) - return -EIO; - *dev->dma_mask = mask; - return 0; 
-} - -static inline void dma_cache_sync(void *vaddr, size_t size, enum dma_data_direction dir) -{ - flush_write_buffers(); -} -#endif +#include <asm-i386/dma-mapping.h> diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/hypercall.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/hypercall.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/hypercall.h Thu Aug 25 22:53:20 2005 @@ -502,4 +502,21 @@ return ret; } +static inline int +HYPERVISOR_vcpu_pickle( + int vcpu, vcpu_guest_context_t *ctxt) +{ + int ret; + + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret) + : "0" ((unsigned long)__HYPERVISOR_sched_op), + "D" ((unsigned long)SCHEDOP_vcpu_pickle | (vcpu << SCHEDOP_vcpushift)), + "S" ((unsigned long)ctxt) + : __syscall_clobber ); + + return ret; +} + #endif /* __HYPERCALL_H__ */ diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mach-xen/irq_vectors.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mach-xen/irq_vectors.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mach-xen/irq_vectors.h Thu Aug 25 22:53:20 2005 @@ -122,17 +122,4 @@ #define dynirq_to_irq(_x) ((_x) + DYNIRQ_BASE) #define irq_to_dynirq(_x) ((_x) - DYNIRQ_BASE) -#ifndef __ASSEMBLY__ -/* Dynamic binding of event channels and VIRQ sources to Linux IRQ space. 
*/ -extern int bind_virq_to_irq(int virq); -extern void unbind_virq_from_irq(int virq); -extern int bind_ipi_to_irq(int ipi); -extern void unbind_ipi_from_irq(int ipi); -extern int bind_evtchn_to_irq(int evtchn); -extern void unbind_evtchn_from_irq(int evtchn); - -extern void irq_suspend(void); -extern void irq_resume(void); -#endif /* __ASSEMBLY__ */ - #endif /* _ASM_IRQ_VECTORS_H */ diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/page.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/page.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/page.h Thu Aug 25 22:53:20 2005 @@ -62,19 +62,46 @@ #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE /**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ +#define INVALID_P2M_ENTRY (~0U) +#define FOREIGN_FRAME(m) ((m) | 0x80000000U) extern u32 *phys_to_machine_mapping; -#define pfn_to_mfn(_pfn) ((unsigned long) phys_to_machine_mapping[(unsigned int)(_pfn)]) -#define mfn_to_pfn(_mfn) ((unsigned long) machine_to_phys_mapping[(unsigned int)(_mfn)]) -static inline unsigned long phys_to_machine(unsigned long phys) -{ - unsigned long machine = pfn_to_mfn(phys >> PAGE_SHIFT); +#define pfn_to_mfn(pfn) \ +((unsigned long)phys_to_machine_mapping[(unsigned int)(pfn)] & 0x7FFFFFFFUL) +static inline unsigned long mfn_to_pfn(unsigned long mfn) +{ + unsigned int pfn; + + /* + * The array access can fail (e.g., device space beyond end of RAM). + * In such cases it doesn't matter what we return (we return garbage), + * but we must handle the fault without crashing! + */ + asm ( + "1: movl %1,%k0\n" + "2:\n" + ".section __ex_table,\"a\"\n" + " .align 8\n" + " .quad 1b,2b\n" + ".previous" + : "=r" (pfn) : "m" (machine_to_phys_mapping[mfn]) ); + + return (unsigned long)pfn; +} + +/* Definitions for machine and pseudophysical addresses. 
*/ +typedef unsigned long paddr_t; +typedef unsigned long maddr_t; + +static inline maddr_t phys_to_machine(paddr_t phys) +{ + maddr_t machine = pfn_to_mfn(phys >> PAGE_SHIFT); machine = (machine << PAGE_SHIFT) | (phys & ~PAGE_MASK); return machine; } -static inline unsigned long machine_to_phys(unsigned long machine) -{ - unsigned long phys = mfn_to_pfn(machine >> PAGE_SHIFT); +static inline paddr_t machine_to_phys(maddr_t machine) +{ + paddr_t phys = mfn_to_pfn(machine >> PAGE_SHIFT); phys = (phys << PAGE_SHIFT) | (machine & ~PAGE_MASK); return phys; } @@ -211,8 +238,10 @@ #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) /* VIRT <-> MACHINE conversion */ -#define virt_to_machine(_a) (phys_to_machine(__pa(_a))) -#define machine_to_virt(_m) (__va(machine_to_phys(_m))) +#define virt_to_machine(v) (phys_to_machine(__pa(v))) +#define machine_to_virt(m) (__va(machine_to_phys(m))) +#define virt_to_mfn(v) (pfn_to_mfn(__pa(v) >> PAGE_SHIFT)) +#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) #define VM_DATA_DEFAULT_FLAGS \ (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \ diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pci.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pci.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pci.h Thu Aug 25 22:53:20 2005 @@ -79,7 +79,9 @@ #else /* No IOMMU */ -#define PCI_DMA_BUS_IS_PHYS 1 +/* On Xen we use SWIOTLB instead of blk-specific bounce buffers. 
*/ +#define PCI_DMA_BUS_IS_PHYS (0) + #define pci_dac_dma_supported(pci_dev, mask) 1 #define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgtable.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgtable.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgtable.h Thu Aug 25 22:53:20 2005 @@ -4,31 +4,20 @@ /* * This file contains the functions and defines necessary to modify and use * the x86-64 page table tree. - * - * x86-64 has a 4 level table setup. Generic linux MM only supports - * three levels. The fourth level is currently a single static page that - * is shared by everybody and just contains a pointer to the current - * three level page setup on the beginning and some kernel mappings at - * the end. For more details see Documentation/x86_64/mm.txt */ #include <asm/processor.h> #include <asm/fixmap.h> #include <asm/bitops.h> #include <linux/threads.h> +#include <linux/sched.h> #include <asm/pda.h> +#ifdef CONFIG_XEN #include <asm-xen/hypervisor.h> + extern pud_t level3_user_pgt[512]; -extern pud_t init_level4_pgt[]; extern pud_t init_level4_user_pgt[]; -extern unsigned long __supported_pte_mask; - -#define swapper_pg_dir NULL - -extern int nonx_setup(char *str); -extern void paging_init(void); -extern void clear_kernel_mapping(unsigned long addr, unsigned long size); - -extern unsigned long pgkern_mask; + +extern void xen_init_pt(void); #define virt_to_ptep(__va) \ ({ \ @@ -44,6 +33,22 @@ unsigned long __pa = (*(unsigned long *)__pte) & PAGE_MASK; \ __pa | ((unsigned long)(__va) & (PAGE_SIZE-1)); \ }) +#endif + +extern pud_t level3_kernel_pgt[512]; +extern pud_t level3_physmem_pgt[512]; +extern pud_t level3_ident_pgt[512]; +extern pmd_t level2_kernel_pgt[512]; +extern pgd_t init_level4_pgt[]; +extern unsigned long __supported_pte_mask; + +#define swapper_pg_dir init_level4_pgt + +extern int nonx_setup(char *str); +extern void paging_init(void); 
+extern void clear_kernel_mapping(unsigned long addr, unsigned long size); + +extern unsigned long pgkern_mask; /* * ZERO_PAGE is a global shared page that is always zero: used @@ -52,11 +57,14 @@ extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)]; #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) +/* + * PGDIR_SHIFT determines what a top-level page table entry can map + */ #define PGDIR_SHIFT 39 #define PTRS_PER_PGD 512 /* - * PUDIR_SHIFT determines what a top-level page table entry can map + * 3rd level page */ #define PUD_SHIFT 30 #define PTRS_PER_PUD 512 @@ -80,7 +88,7 @@ #define pud_ERROR(e) \ printk("%s:%d: bad pud %p(%016lx).\n", __FILE__, __LINE__, &(e), pud_val(e)) #define pgd_ERROR(e) \ - printk("%s:%d: bad pgd %p(%016lx).\n", __FILE__, __LINE__, &(e), pgd_val(e)) + printk("%s:%d: bad pgd %p(%016lx).\n", __FILE__, __LINE__, &(e), pgd_val(e)) #define pgd_none(x) (!pgd_val(x)) #define pud_none(x) (!pud_val(x)) @@ -90,18 +98,10 @@ extern inline int pud_present(pud_t pud) { return !pud_none(pud); } -#ifdef CONFIG_SMP -#define set_pte(pteptr, pteval) xen_l1_entry_update(pteptr, (pteval)) - -#else -#define set_pte(pteptr, pteval) xen_l1_entry_update(pteptr, (pteval)) -#if 0 static inline void set_pte(pte_t *dst, pte_t val) { *dst = val; } -#endif -#endif #define set_pmd(pmdptr, pmdval) xen_l2_entry_update(pmdptr, (pmdval)) #define set_pud(pudptr, pudval) xen_l3_entry_update(pudptr, (pudval)) @@ -132,6 +132,9 @@ * each domain will have separate page tables, with their own versions of * accessed & dirty state. 
*/ +#define ptep_get_and_clear(mm,addr,xp) __pte_ma(xchg(&(xp)->pte, 0)) + +#if 0 static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *xp) { pte_t pte = *xp; @@ -139,21 +142,22 @@ set_pte(xp, __pte_ma(0)); return pte; } +#endif #define pte_same(a, b) ((a).pte == (b).pte) -#define PMD_SIZE (1UL << PMD_SHIFT) -#define PMD_MASK (~(PMD_SIZE-1)) -#define PUD_SIZE (1UL << PUD_SHIFT) -#define PUD_MASK (~(PUD_SIZE-1)) -#define PGDIR_SIZE (1UL << PGDIR_SHIFT) -#define PGDIR_MASK (~(PGDIR_SIZE-1)) - -#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) +#define PMD_SIZE (1UL << PMD_SHIFT) +#define PMD_MASK (~(PMD_SIZE-1)) +#define PUD_SIZE (1UL << PUD_SHIFT) +#define PUD_MASK (~(PUD_SIZE-1)) +#define PGDIR_SIZE (1UL << PGDIR_SHIFT) +#define PGDIR_MASK (~(PGDIR_SIZE-1)) + +#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) #define FIRST_USER_ADDRESS 0 #ifndef __ASSEMBLY__ -#define MAXMEM 0x3fffffffffffUL +#define MAXMEM 0x3fffffffffffUL #define VMALLOC_START 0xffffc20000000000UL #define VMALLOC_END 0xffffe1ffffffffffUL #define MODULES_VADDR 0xffffffff88000000UL @@ -262,7 +266,16 @@ val &= ~(_PAGE_USER | _PAGE_DIRTY); return val & ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED); } -#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval) + +inline static void set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t val ) +{ + if ( ((mm != current->mm) && (mm != &init_mm)) || + HYPERVISOR_update_va_mapping( (addr), (val), 0 ) ) + { + set_pte(ptep, val); + } +} #define pte_none(x) (!(x).pte) #define pte_present(x) ((x).pte & (_PAGE_PRESENT | _PAGE_PROTNONE)) @@ -287,17 +300,15 @@ * * NB2. When deliberately mapping foreign pages into the p2m table, you *must* * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we - * require. In all the cases we care about, the high bit gets shifted out - * (e.g., phys_to_machine()) so behaviour there is correct. 
- */ -#define INVALID_P2M_ENTRY (~0U) -#define FOREIGN_FRAME(_m) ((_m) | (1UL<<((sizeof(unsigned long)*8)-1))) + * require. In all the cases we care about, the FOREIGN_FRAME bit is + * masked (e.g., pfn_to_mfn()) so behaviour there is correct. + */ #define pte_mfn(_pte) (((_pte).pte & PTE_MASK) >> PAGE_SHIFT) #define pte_pfn(_pte) \ ({ \ unsigned long mfn = pte_mfn(_pte); \ unsigned pfn = mfn_to_pfn(mfn); \ - if ((pfn >= max_mapnr) || (pfn_to_mfn(pfn) != mfn)) \ + if ((pfn >= max_mapnr) || (phys_to_machine_mapping[pfn] != mfn))\ pfn = max_mapnr; /* special: force !pfn_valid() */ \ pfn; \ }) @@ -347,7 +358,7 @@ pte_t pte = *ptep; int ret = pte_dirty(pte); if (ret) - xen_l1_entry_update(ptep, pte_mkclean(pte)); + set_pte(ptep, pte_mkclean(pte)); return ret; } @@ -356,7 +367,7 @@ pte_t pte = *ptep; int ret = pte_young(pte); if (ret) - xen_l1_entry_update(ptep, pte_mkold(pte)); + set_pte(ptep, pte_mkold(pte)); return ret; } @@ -398,7 +409,7 @@ /* PUD - Level3 access */ /* to find an entry in a page-table-directory. */ -#define pud_index(address) ((address >> PUD_SHIFT) & (PTRS_PER_PUD-1)) +#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) #define pud_offset(pgd, address) ((pud_t *) pgd_page(*(pgd)) + pud_index(address)) static inline pud_t *__pud_offset_k(pud_t *pud, unsigned long address) { @@ -413,7 +424,7 @@ { unsigned long addr; - addr = pud_val(init_level4_pgt[pud_index(address)]); + addr = pgd_val(init_level4_pgt[pud_index(address)]); addr &= PHYSICAL_PAGE_MASK; /* machine physical */ addr = machine_to_phys(addr); return __pud_offset_k((pud_t *)__va(addr), address); @@ -427,9 +438,11 @@ #define pmd_offset(dir, address) ((pmd_t *) pud_page(*(dir)) + \ pmd_index(address)) #define pmd_none(x) (!pmd_val(x)) -#define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) +/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t. + can temporarily clear it. 
*/ +#define pmd_present(x) (pmd_val(x)) #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) -#define pmd_bad(x) ((pmd_val(x) & ~PTE_MASK) != _KERNPG_TABLE ) +#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT)) #define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot))) #define pmd_pfn(x) ((pmd_val(x) >> PAGE_SHIFT) & __PHYSICAL_MASK) @@ -479,11 +492,24 @@ * race with other CPU's that might be updating the dirty * bit at the same time. */ #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +#if 0 #define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \ do { \ if (__dirty) { \ set_pte(__ptep, __entry); \ flush_tlb_page(__vma, __address); \ + } \ + } while (0) +#endif +#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \ + do { \ + if (__dirty) { \ + if ( likely((__vma)->vm_mm == current->mm) ) { \ + BUG_ON(HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG|UVMF_MULTI|(unsigned long)((__vma)->vm_mm->cpu_vm_mask.bits))); \ + } else { \ + xen_l1_entry_update((__ptep), (__entry)); \ + flush_tlb_page((__vma), (__address)); \ + } \ } \ } while (0) @@ -510,12 +536,18 @@ unsigned long address, unsigned long size, mmu_update_t *v); +int create_lookup_pte_addr(struct mm_struct *mm, + unsigned long address, + unsigned long *ptep); +int touch_pte_range(struct mm_struct *mm, + unsigned long address, + unsigned long size); #define io_remap_page_range(vma, vaddr, paddr, size, prot) \ - remap_pfn_range(vma, vaddr, (paddr) >> PAGE_SHIFT, size, prot) + direct_remap_area_pages((vma)->vm_mm,vaddr,paddr,size,prot,DOMID_IO) #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ - remap_pfn_range(vma, vaddr, pfn, size, prot) + direct_remap_area_pages((vma)->vm_mm,vaddr,(pfn)<<PAGE_SHIFT,size,prot,DOMID_IO) #define MK_IOSPACE_PFN(space, pfn) (pfn) #define GET_IOSPACE(pfn) 0 diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/processor.h 
--- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/processor.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/processor.h Thu Aug 25 22:53:20 2005 @@ -153,6 +153,20 @@ } while (0); } } + + +static inline void clear_in_cr4 (unsigned long mask) +{ +#ifndef CONFIG_XEN + mmu_cr4_features &= ~mask; + __asm__("movq %%cr4,%%rax\n\t" + "andq %0,%%rax\n\t" + "movq %%rax,%%cr4\n" + : : "irg" (~mask) + :"ax"); +#endif +} + #define load_cr3(pgdir) do { \ xen_pt_switch(__pa(pgdir)); \ @@ -283,9 +297,9 @@ load_gs_index(0); \ (regs)->rip = (new_rip); \ (regs)->rsp = (new_rsp); \ - write_pda(oldrsp, (new_rsp)); \ - (regs)->cs = __USER_CS; \ - (regs)->ss = __USER_DS; \ + write_pda(oldrsp, (new_rsp)); \ + (regs)->cs = __USER_CS; \ + (regs)->ss = __USER_DS; \ (regs)->eflags = 0x200; \ set_fs(USER_DS); \ } while(0) diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/balloon.h --- a/linux-2.6-xen-sparse/include/asm-xen/balloon.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/balloon.h Thu Aug 25 22:53:20 2005 @@ -35,10 +35,19 @@ * Inform the balloon driver that it should allow some slop for device-driver * memory activities. */ -extern void balloon_update_driver_allowance(long delta); +extern void +balloon_update_driver_allowance( + long delta); -/* Give up unmapped pages to the balloon driver. */ -extern void balloon_put_pages(unsigned long *mfn_list, unsigned long nr_mfns); +/* Allocate an empty low-memory page range. */ +extern struct page * +balloon_alloc_empty_page_range( + unsigned long nr_pages); + +/* Deallocate an empty page range, adding to the balloon. 
*/ +extern void +balloon_dealloc_empty_page_range( + struct page *page, unsigned long nr_pages); /* * Prevent the balloon driver from changing the memory reservation during diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/evtchn.h --- a/linux-2.6-xen-sparse/include/asm-xen/evtchn.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/evtchn.h Thu Aug 25 22:53:20 2005 @@ -32,6 +32,7 @@ #define __ASM_EVTCHN_H__ #include <linux/config.h> +#include <linux/interrupt.h> #include <asm-xen/hypervisor.h> #include <asm/ptrace.h> #include <asm-xen/synch_bitops.h> @@ -41,6 +42,34 @@ /* * LOW-LEVEL DEFINITIONS */ + +/* Dynamically bind a VIRQ source to Linux IRQ space. */ +extern int bind_virq_to_irq(int virq); +extern void unbind_virq_from_irq(int virq); + +/* Dynamically bind an IPI source to Linux IRQ space. */ +extern int bind_ipi_to_irq(int ipi); +extern void unbind_ipi_from_irq(int ipi); + +/* Dynamically bind an event-channel port to Linux IRQ space. */ +extern int bind_evtchn_to_irq(unsigned int evtchn); +extern void unbind_evtchn_from_irq(unsigned int evtchn); + +/* + * Dynamically bind an event-channel port to an IRQ-like callback handler. + * On some platforms this may not be implemented via the Linux IRQ subsystem. + * You *cannot* trust the irq argument passed to the callback handler. + */ +extern int bind_evtchn_to_irqhandler( + unsigned int evtchn, + irqreturn_t (*handler)(int, void *, struct pt_regs *), + unsigned long irqflags, + const char *devname, + void *dev_id); +extern void unbind_evtchn_from_irqhandler(unsigned int evtchn, void *dev_id); + +extern void irq_suspend(void); +extern void irq_resume(void); /* Entry point for notifications into Linux subsystems. 
*/ asmlinkage void evtchn_do_upcall(struct pt_regs *regs); diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/gnttab.h --- a/linux-2.6-xen-sparse/include/asm-xen/gnttab.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/gnttab.h Thu Aug 25 22:53:20 2005 @@ -19,54 +19,48 @@ /* NR_GRANT_FRAMES must be less than or equal to that configured in Xen */ #define NR_GRANT_FRAMES 4 -#define NR_GRANT_ENTRIES (NR_GRANT_FRAMES * PAGE_SIZE / sizeof(grant_entry_t)) -int -gnttab_grant_foreign_access( - domid_t domid, unsigned long frame, int readonly); +struct gnttab_free_callback { + struct gnttab_free_callback *next; + void (*fn)(void *); + void *arg; + u16 count; +}; -void -gnttab_end_foreign_access( - grant_ref_t ref, int readonly); +int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, + int readonly); -int -gnttab_grant_foreign_transfer( - domid_t domid, unsigned long pfn); +void gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly); +void gnttab_end_foreign_access(grant_ref_t ref, int readonly); -unsigned long -gnttab_end_foreign_transfer( - grant_ref_t ref); +int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn); -int -gnttab_query_foreign_access( - grant_ref_t ref ); +unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref); +unsigned long gnttab_end_foreign_transfer(grant_ref_t ref); + +int gnttab_query_foreign_access(grant_ref_t ref); /* * operations on reserved batches of grant references */ -int -gnttab_alloc_grant_references( - u16 count, grant_ref_t *pprivate_head, grant_ref_t *private_terminal ); +int gnttab_alloc_grant_references(u16 count, grant_ref_t *pprivate_head); -void -gnttab_free_grant_references( - u16 count, grant_ref_t private_head ); +void gnttab_free_grant_reference(grant_ref_t ref); -int -gnttab_claim_grant_reference( grant_ref_t *pprivate_head, grant_ref_t terminal -); +void gnttab_free_grant_references(grant_ref_t head); -void -gnttab_release_grant_reference( - 
grant_ref_t *private_head, grant_ref_t release ); +int gnttab_claim_grant_reference(grant_ref_t *pprivate_head); -void -gnttab_grant_foreign_access_ref( - grant_ref_t ref, domid_t domid, unsigned long frame, int readonly); +void gnttab_release_grant_reference(grant_ref_t *private_head, + grant_ref_t release); -void -gnttab_grant_foreign_transfer_ref( - grant_ref_t, domid_t domid, unsigned long pfn); +void gnttab_request_free_callback(struct gnttab_free_callback *callback, + void (*fn)(void *), void *arg, u16 count); +void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, + unsigned long frame, int readonly); + +void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid, + unsigned long pfn); #endif /* __ASM_GNTTAB_H__ */ diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/hypervisor.h --- a/linux-2.6-xen-sparse/include/asm-xen/hypervisor.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/hypervisor.h Thu Aug 25 22:53:20 2005 @@ -134,12 +134,8 @@ #define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var) #endif /* linux < 2.6.0 */ -void xen_contig_memory(unsigned long vstart, unsigned int order); - -#ifdef CONFIG_XEN_PHYSDEV_ACCESS -/* Allocate a contiguous empty region of low memory. Return virtual start. 
*/ -unsigned long allocate_empty_lowmem_region(unsigned long pages); -#endif +void xen_create_contiguous_region(unsigned long vstart, unsigned int order); +void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order); #include <asm/hypercall.h> diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/xenbus.h --- a/linux-2.6-xen-sparse/include/asm-xen/xenbus.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/xenbus.h Thu Aug 25 22:53:20 2005 @@ -1,5 +1,3 @@ -#ifndef _ASM_XEN_XENBUS_H -#define _ASM_XEN_XENBUS_H /****************************************************************************** * xenbus.h * @@ -28,13 +26,17 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ + +#ifndef _ASM_XEN_XENBUS_H +#define _ASM_XEN_XENBUS_H + #include <linux/device.h> +#include <linux/notifier.h> #include <asm/semaphore.h> /* A xenbus device. */ struct xenbus_device { char *devicetype; - char *subtype; char *nodename; struct device dev; int has_error; @@ -50,7 +52,6 @@ { /* .../device/<device_type>/<identifier> */ char devicetype[32]; /* General class of device. */ - char subtype[32]; /* Contents of "subtype" for this device */ }; /* A xenbus driver. 
*/ @@ -58,9 +59,11 @@ char *name; struct module *owner; const struct xenbus_device_id *ids; - int (*probe) (struct xenbus_device * dev, - const struct xenbus_device_id * id); - int (*remove) (struct xenbus_device * dev); + int (*probe)(struct xenbus_device *dev, + const struct xenbus_device_id *id); + int (*remove)(struct xenbus_device *dev); + int (*suspend)(struct xenbus_device *dev); + int (*resume)(struct xenbus_device *dev); struct device_driver driver; }; @@ -69,7 +72,8 @@ return container_of(drv, struct xenbus_driver, driver); } -int xenbus_register_driver(struct xenbus_driver *drv); +int xenbus_register_device(struct xenbus_driver *drv); +int xenbus_register_backend(struct xenbus_driver *drv); void xenbus_unregister_driver(struct xenbus_driver *drv); /* Caller must hold this lock to call these functions: it's also held @@ -112,7 +116,26 @@ void (*callback)(struct xenbus_watch *, const char *node); }; +/* notifer routines for when the xenstore comes up */ +int register_xenstore_notifier(struct notifier_block *nb); +void unregister_xenstore_notifier(struct notifier_block *nb); + int register_xenbus_watch(struct xenbus_watch *watch); void unregister_xenbus_watch(struct xenbus_watch *watch); +void reregister_xenbus_watches(void); + +/* Called from xen core code. 
*/ +void xenbus_suspend(void); +void xenbus_resume(void); + +#define XENBUS_IS_ERR_READ(str) ({ \ + if (!IS_ERR(str) && strlen(str) == 0) { \ + kfree(str); \ + str = ERR_PTR(-ERANGE); \ + } \ + IS_ERR(str); \ +}) + +#define XENBUS_EXIST_ERR(err) ((err) == -ENOENT || (err) == -ERANGE) #endif /* _ASM_XEN_XENBUS_H */ diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/linux/mm.h --- a/linux-2.6-xen-sparse/include/linux/mm.h Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/linux/mm.h Thu Aug 25 22:53:20 2005 @@ -817,6 +817,12 @@ int remap_pfn_range(struct vm_area_struct *, unsigned long, unsigned long, unsigned long, pgprot_t); +typedef int (*pte_fn_t)(pte_t *pte, struct page *pte_page, unsigned long addr, + void *data); +extern int generic_page_range(struct mm_struct *mm, unsigned long address, + unsigned long size, pte_fn_t fn, void *data); + + #ifdef CONFIG_PROC_FS void __vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); #else diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/mkbuildtree --- a/linux-2.6-xen-sparse/mkbuildtree Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/mkbuildtree Thu Aug 25 22:53:20 2005 @@ -102,10 +102,11 @@ relative_lndir ${RS} rm -f mkbuildtree - # Create links to the shared definitions of the Xen interfaces. 
rm -rf ${AD}/include/asm-xen/xen-public mkdir ${AD}/include/asm-xen/xen-public cd ${AD}/include/asm-xen/xen-public relative_lndir ../../../${RS}/../xen/include/public +cd ${AD}/drivers/xen/xenbus +ln -sf ../../../${RS}/../tools/xenstore/xenstored.h diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/mm/memory.c --- a/linux-2.6-xen-sparse/mm/memory.c Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/mm/memory.c Thu Aug 25 22:53:20 2005 @@ -954,8 +954,10 @@ i++; start += PAGE_SIZE; len--; +printk(KERN_ALERT "HIT 0x%lx\n", start); continue; - } + } +else printk(KERN_ALERT "MISS 0x%lx\n", start); } if (!vma || (vma->vm_flags & VM_IO) @@ -1213,6 +1215,104 @@ } EXPORT_SYMBOL(remap_pfn_range); +static inline int generic_pte_range(struct mm_struct *mm, + pmd_t *pmd, + unsigned long addr, + unsigned long end, + pte_fn_t fn, void *data) +{ + pte_t *pte; + int err; + struct page *pte_page; + + pte = (mm == &init_mm) ? + pte_alloc_kernel(mm, pmd, addr) : + pte_alloc_map(mm, pmd, addr); + if (!pte) + return -ENOMEM; + + pte_page = pmd_page(*pmd); + + do { + err = fn(pte, pte_page, addr, data); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != end); + + if (mm != &init_mm) + pte_unmap(pte-1); + return err; + +} + +static inline int generic_pmd_range(struct mm_struct *mm, + pud_t *pud, + unsigned long addr, + unsigned long end, + pte_fn_t fn, void *data) +{ + pmd_t *pmd; + unsigned long next; + int err; + + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return -ENOMEM; + do { + next = pmd_addr_end(addr, end); + err = generic_pte_range(mm, pmd, addr, next, fn, data); + if (err) + break; + } while (pmd++, addr = next, addr != end); + return err; +} + +static inline int generic_pud_range(struct mm_struct *mm, pgd_t *pgd, + unsigned long addr, + unsigned long end, + pte_fn_t fn, void *data) +{ + pud_t *pud; + unsigned long next; + int err; + + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return -ENOMEM; + do { + next = pud_addr_end(addr, end); + err = 
generic_pmd_range(mm, pud, addr, next, fn, data); + if (err) + break; + } while (pud++, addr = next, addr != end); + return err; +} + +/* + * Scan a region of virtual memory, filling in page tables as necessary + * and calling a provided function on each leaf page table. + */ +int generic_page_range(struct mm_struct *mm, unsigned long addr, + unsigned long size, pte_fn_t fn, void *data) +{ + pgd_t *pgd; + unsigned long next; + unsigned long end = addr + size; + int err; + + BUG_ON(addr >= end); + pgd = pgd_offset(mm, addr); + spin_lock(&mm->page_table_lock); + do { + next = pgd_addr_end(addr, end); + err = generic_pud_range(mm, pgd, addr, next, fn, data); + if (err) + break; + } while (pgd++, addr = next, addr != end); + spin_unlock(&mm->page_table_lock); + return err; +} + /* * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when * servicing faults for write access. In the normal case, do always want diff -r 5f1ed597f107 -r 8799d14bef77 tools/Makefile --- a/tools/Makefile Wed Aug 24 02:43:18 2005 +++ b/tools/Makefile Thu Aug 25 22:53:20 2005 @@ -13,7 +13,8 @@ #SUBDIRS += pygrub SUBDIRS += firmware SUBDIRS += security -#SUBDIRS += consoled +SUBDIRS += console +SUBDIRS += xenstat .PHONY: all install clean check check_clean ioemu eioemuinstall ioemuclean diff -r 5f1ed597f107 -r 8799d14bef77 tools/Rules.mk --- a/tools/Rules.mk Wed Aug 24 02:43:18 2005 +++ b/tools/Rules.mk Thu Aug 25 22:53:20 2005 @@ -6,6 +6,7 @@ XEN_LIBXC = $(XEN_ROOT)/tools/libxc XEN_XCS = $(XEN_ROOT)/tools/xcs XEN_XENSTORE = $(XEN_ROOT)/tools/xenstore +XEN_LIBXENSTAT = $(XEN_ROOT)/tools/xenstat/libxenstat/src ifeq ($(XEN_TARGET_ARCH),x86_32) CFLAGS += -m32 -march=i686 diff -r 5f1ed597f107 -r 8799d14bef77 tools/blktap/blktaplib.c --- a/tools/blktap/blktaplib.c Wed Aug 24 02:43:18 2005 +++ b/tools/blktap/blktaplib.c Thu Aug 25 22:53:20 2005 @@ -34,7 +34,7 @@ #else #define DPRINTF(_f, _a...) 
((void)0) #endif -#define DEBUG_RING_IDXS 0 +#define DEBUG_RING_IDXS 1 #define POLLRDNORM 0x040 diff -r 5f1ed597f107 -r 8799d14bef77 tools/blktap/blktaplib.h --- a/tools/blktap/blktaplib.h Wed Aug 24 02:43:18 2005 +++ b/tools/blktap/blktaplib.h Thu Aug 25 22:53:20 2005 @@ -7,7 +7,7 @@ #ifndef __BLKTAPLIB_H__ #define __BLKTAPLIB_H__ -#include <xc.h> +#include <xenctrl.h> #include <sys/user.h> #include <xen/xen.h> #include <xen/io/blkif.h> diff -r 5f1ed597f107 -r 8799d14bef77 tools/blktap/parallax/block-async.h --- a/tools/blktap/parallax/block-async.h Wed Aug 24 02:43:18 2005 +++ b/tools/blktap/parallax/block-async.h Thu Aug 25 22:53:20 2005 @@ -7,7 +7,7 @@ #define _BLOCKASYNC_H_ #include <assert.h> -#include <xc.h> +#include <xenctrl.h> #include "vdi.h" struct io_ret diff -r 5f1ed597f107 -r 8799d14bef77 tools/blktap/parallax/blockstore.h --- a/tools/blktap/parallax/blockstore.h Wed Aug 24 02:43:18 2005 +++ b/tools/blktap/parallax/blockstore.h Thu Aug 25 22:53:20 2005 @@ -10,7 +10,7 @@ #define __BLOCKSTORE_H__ #include <netinet/in.h> -#include <xc.h> +#include <xenctrl.h> #define BLOCK_SIZE 4096 #define BLOCK_SHIFT 12 diff -r 5f1ed597f107 -r 8799d14bef77 tools/debugger/gdb/gdb-6.2.1-xen-sparse/gdb/gdbserver/linux-xen-low.c --- a/tools/debugger/gdb/gdb-6.2.1-xen-sparse/gdb/gdbserver/linux-xen-low.c Wed Aug 24 02:43:18 2005 +++ b/tools/debugger/gdb/gdb-6.2.1-xen-sparse/gdb/gdbserver/linux-xen-low.c Thu Aug 25 22:53:20 2005 @@ -35,7 +35,7 @@ #include <stdlib.h> #include <unistd.h> #include <errno.h> -#include <xc.h> +#include <xenctrl.h> #define TRACE_ENTER /* printf("enter %s\n", __FUNCTION__) */ long (*myptrace)(enum __ptrace_request, pid_t, long, long); int (*myxcwait)(int domain, int *status, int options) ; diff -r 5f1ed597f107 -r 8799d14bef77 tools/debugger/libxendebug/Makefile --- a/tools/debugger/libxendebug/Makefile Wed Aug 24 02:43:18 2005 +++ b/tools/debugger/libxendebug/Makefile Thu Aug 25 22:53:20 2005 @@ -20,7 +20,7 @@ CFLAGS += -Wp,-MD,.$(@F).d DEPS = 
.*.d -LDFLAGS += -L$(XEN_ROOT)/tools/libxc -lxc +LDFLAGS += -L$(XEN_ROOT)/tools/libxc -lxenctrl LIB_OBJS := $(patsubst %.c,%.o,$(SRCS)) PIC_OBJS := $(patsubst %.c,%.opic,$(SRCS)) diff -r 5f1ed597f107 -r 8799d14bef77 tools/debugger/libxendebug/xendebug.c --- a/tools/debugger/libxendebug/xendebug.c Wed Aug 24 02:43:18 2005 +++ b/tools/debugger/libxendebug/xendebug.c Thu Aug 25 22:53:20 2005 @@ -12,7 +12,7 @@ #include <string.h> #include <errno.h> #include <sys/mman.h> -#include <xc.h> +#include <xenctrl.h> #include "list.h" #if defined(__i386__) @@ -40,7 +40,7 @@ typedef struct bwcpoint /* break/watch/catch point */ { struct list_head list; - memory_t address; + unsigned long address; u32 domain; u8 old_value; /* old value for software bkpt */ } bwcpoint_t, *bwcpoint_p; @@ -311,7 +311,7 @@ /* access to one page */ static int xendebug_memory_page (domain_context_p ctxt, int xc_handle, u32 vcpu, - int protection, memory_t address, int length, u8 *buffer) + int protection, unsigned long address, int length, u8 *buffer) { vcpu_guest_context_t *vcpu_ctxt = &ctxt->context[vcpu]; unsigned long pde, page; @@ -407,7 +407,7 @@ /* divide a memory operation into accesses to individual pages */ static int xendebug_memory_op (domain_context_p ctxt, int xc_handle, u32 vcpu, - int protection, memory_t address, int length, u8 *buffer) + int protection, unsigned long address, int length, u8 *buffer) { int remain; /* number of bytes to touch past this page */ int bytes = 0; @@ -431,7 +431,7 @@ xendebug_read_memory(int xc_handle, u32 domid, u32 vcpu, - memory_t address, + unsigned long address, u32 length, u8 *data) { @@ -451,7 +451,7 @@ xendebug_write_memory(int xc_handle, u32 domid, u32 vcpu, - memory_t address, + unsigned long address, u32 length, u8 *data) { @@ -471,7 +471,7 @@ xendebug_insert_memory_breakpoint(int xc_handle, u32 domid, u32 vcpu, - memory_t address, + unsigned long address, u32 length) { bwcpoint_p bkpt; @@ -517,7 +517,7 @@ xendebug_remove_memory_breakpoint(int 
xc_handle, u32 domid, u32 vcpu, - memory_t address, + unsigned long address, u32 length) { bwcpoint_p bkpt = NULL; diff -r 5f1ed597f107 -r 8799d14bef77 tools/debugger/libxendebug/xendebug.h --- a/tools/debugger/libxendebug/xendebug.h Wed Aug 24 02:43:18 2005 +++ b/tools/debugger/libxendebug/xendebug.h Thu Aug 25 22:53:20 2005 @@ -9,7 +9,7 @@ #ifndef _XENDEBUG_H_DEFINED #define _XENDEBUG_H_DEFINED -#include <xc.h> +#include <xenctrl.h> int xendebug_attach(int xc_handle, u32 domid, @@ -45,7 +45,7 @@ int xendebug_read_memory(int xc_handle, u32 domid, u32 vcpu, - memory_t address, + unsigned long address, u32 length, u8 *data); @@ -53,7 +53,7 @@ int xendebug_write_memory(int xc_handle, u32 domid, u32 vcpu, - memory_t address, + unsigned long address, u32 length, u8 *data); @@ -61,13 +61,13 @@ int xendebug_insert_memory_breakpoint(int xc_handle, u32 domid, u32 vcpu, - memory_t address, + unsigned long address, u32 length); int xendebug_remove_memory_breakpoint(int xc_handle, u32 domid, u32 vcpu, - memory_t address, + unsigned long address, u32 length); int xendebug_query_domain_stop(int xc_handle, diff -r 5f1ed597f107 -r 8799d14bef77 tools/debugger/pdb/Domain.ml --- a/tools/debugger/pdb/Domain.ml Wed Aug 24 02:43:18 2005 +++ b/tools/debugger/pdb/Domain.ml Thu Aug 25 22:53:20 2005 @@ -36,6 +36,7 @@ Printf.sprintf "{domain} domain: %d, vcpu: %d" ctx.domain ctx.vcpu +external read_register : context_t -> int -> int32 = "dom_read_register" external read_registers : context_t -> registers = "dom_read_registers" external write_register : context_t -> register -> int32 -> unit = "dom_write_register" diff -r 5f1ed597f107 -r 8799d14bef77 tools/debugger/pdb/Domain.mli --- a/tools/debugger/pdb/Domain.mli Wed Aug 24 02:43:18 2005 +++ b/tools/debugger/pdb/Domain.mli Thu Aug 25 22:53:20 2005 @@ -22,6 +22,7 @@ val string_of_context : context_t -> string +val read_register : context_t -> int -> int32 val read_registers : context_t -> registers val write_register : context_t -> register 
-> int32 -> unit val read_memory : context_t -> int32 -> int -> int list diff -r 5f1ed597f107 -r 8799d14bef77 tools/debugger/pdb/Makefile --- a/tools/debugger/pdb/Makefile Wed Aug 24 02:43:18 2005 +++ b/tools/debugger/pdb/Makefile Thu Aug 25 22:53:20 2005 @@ -33,7 +33,8 @@ LIBS += unix str # bc = byte-code, dc = debug byte-code -all : patches dc +# patches = patch linux domU source code +all : dc SOURCES += pdb_caml_xc.c SOURCES += pdb_caml_domain.c pdb_caml_process.c diff -r 5f1ed597f107 -r 8799d14bef77 tools/debugger/pdb/PDB.ml --- a/tools/debugger/pdb/PDB.ml Wed Aug 24 02:43:18 2005 +++ b/tools/debugger/pdb/PDB.ml Thu Aug 25 22:53:20 2005 @@ -219,6 +219,17 @@ (***************************************************************************) +let read_register ctx register = (* register is int32 because of sscanf *) + match ctx with + | Void -> 0l (* default for startup *) + | Domain d -> Domain.read_register d register + | Process p -> + begin + Process.read_register p register; + raise No_reply + end + | _ -> raise (Unimplemented "read registers") + let read_registers ctx = match ctx with | Void -> Intel.null_registers (* default for startup *) @@ -278,14 +289,42 @@ let insert_memory_breakpoint ctx addr len = match ctx with | Domain d -> Domain.insert_memory_breakpoint d addr len - | Process p -> Process.insert_memory_breakpoint p addr len + | Process p -> + begin + Process.insert_memory_breakpoint p addr len; + raise No_reply + end | _ -> raise (Unimplemented "insert memory breakpoint") let remove_memory_breakpoint ctx addr len = match ctx with | Domain d -> Domain.remove_memory_breakpoint d addr len - | Process p -> Process.remove_memory_breakpoint p addr len + | Process p -> + begin + Process.remove_memory_breakpoint p addr len; + raise No_reply + end | _ -> raise (Unimplemented "remove memory breakpoint") + +let insert_watchpoint ctx kind addr len = + match ctx with +(* | Domain d -> Domain.insert_watchpoint d kind addr len TODO *) + | Process p -> + begin + 
Process.insert_watchpoint p kind addr len; + raise No_reply + end + | _ -> raise (Unimplemented "insert watchpoint") + +let remove_watchpoint ctx kind addr len = + match ctx with +(* | Domain d -> Domain.remove_watchpoint d kind addr len TODO *) + | Process p -> + begin + Process.remove_watchpoint p kind addr len; + raise No_reply + end + | _ -> raise (Unimplemented "remove watchpoint") let pause ctx = diff -r 5f1ed597f107 -r 8799d14bef77 tools/debugger/pdb/Process.ml --- a/tools/debugger/pdb/Process.ml Wed Aug 24 02:43:18 2005 +++ b/tools/debugger/pdb/Process.ml Thu Aug 25 22:53:20 2005 @@ -54,6 +54,7 @@ proc_ctx.ring <- Xen_domain.get_ring dom_ctx; _attach_debugger proc_ctx +external read_register : context_t -> int -> unit = "proc_read_register" external read_registers : context_t -> unit = "proc_read_registers" external write_register : context_t -> register -> int32 -> unit = "proc_write_register" @@ -69,6 +70,10 @@ "proc_insert_memory_breakpoint" external remove_memory_breakpoint : context_t -> int32 -> int -> unit = "proc_remove_memory_breakpoint" +external insert_watchpoint : context_t -> int -> int32 -> int -> unit = + "proc_insert_watchpoint" +external remove_watchpoint : context_t -> int -> int32 -> int -> unit = + "proc_remove_watchpoint" let pause ctx = pause_target ctx diff -r 5f1ed597f107 -r 8799d14bef77 tools/debugger/pdb/Process.mli --- a/tools/debugger/pdb/Process.mli Wed Aug 24 02:43:18 2005 +++ b/tools/debugger/pdb/Process.mli Thu Aug 25 22:53:20 2005 @@ -26,7 +26,7 @@ val detach_debugger : context_t -> unit val pause : context_t -> unit - +val read_register : context_t -> int -> unit val read_registers : context_t -> unit val write_register : context_t -> register -> int32 -> unit val read_memory : context_t -> int32 -> int -> unit @@ -37,3 +37,5 @@ val insert_memory_breakpoint : context_t -> int32 -> int -> unit val remove_memory_breakpoint : context_t -> int32 -> int -> unit +val insert_watchpoint : context_t -> int -> int32 -> int -> unit 
+val remove_watchpoint : context_t -> int -> int32 -> int -> unit diff -r 5f1ed597f107 -r 8799d14bef77 tools/debugger/pdb/debugger.ml --- a/tools/debugger/pdb/debugger.ml Wed Aug 24 02:43:18 2005 +++ b/tools/debugger/pdb/debugger.ml Thu Aug 25 22:53:20 2005 @@ -53,10 +53,20 @@ PDB.step ctx; raise No_reply +(** + Read Register Command. + return register as a 4-byte value. + *) +let gdb_read_register ctx command = + let read_reg register = + (Printf.sprintf "%08lx" (Util.flip_int32 (PDB.read_register ctx register))) + in + Scanf.sscanf command "p%x" read_reg + (** Read Registers Command. - returns 16 4-byte registers in a particular defined by gdb. + returns 16 4-byte registers in a particular format defined by gdb. *) let gdb_read_registers ctx = let regs = PDB.read_registers ctx in @@ -100,7 +110,7 @@ with Failure s -> "E02" in - Scanf.sscanf command "m%lx,%d" read_mem + Scanf.sscanf command "m%lx,%x" read_mem @@ -218,16 +228,24 @@ (** Insert Breakpoint or Watchpoint Packet *) + +let bwc_watch_write = 102 (* from pdb_module.h *) +let bwc_watch_read = 103 +let bwc_watch_access = 104 + let gdb_insert_bwcpoint ctx command = let insert cmd addr length = try match cmd with | 0 -> PDB.insert_memory_breakpoint ctx addr length; "OK" + | 2 -> PDB.insert_watchpoint ctx bwc_watch_write addr length; "OK" + | 3 -> PDB.insert_watchpoint ctx bwc_watch_read addr length; "OK" + | 4 -> PDB.insert_watchpoint ctx bwc_watch_access addr length; "OK" | _ -> "" with Failure s -> "E03" in - Scanf.sscanf command "Z%d,%lx,%d" insert + Scanf.sscanf command "Z%d,%lx,%x" insert (** Remove Breakpoint or Watchpoint Packet @@ -237,6 +255,9 @@ try match cmd with | 0 -> PDB.remove_memory_breakpoint ctx addr length; "OK" + | 2 -> PDB.remove_watchpoint ctx bwc_watch_write addr length; "OK" + | 3 -> PDB.remove_watchpoint ctx bwc_watch_read addr length; "OK" + | 4 -> PDB.remove_watchpoint ctx bwc_watch_access addr length; "OK" | _ -> "" with Failure s -> "E04" @@ -260,6 +281,7 @@ | 'k' -> gdb_kill () | 
'm' -> gdb_read_memory ctx command | 'M' -> gdb_write_memory ctx command + | 'p' -> gdb_read_register ctx command | 'P' -> gdb_write_register ctx command | 'q' -> gdb_query command | 's' -> gdb_step ctx @@ -270,7 +292,7 @@ | 'Z' -> gdb_insert_bwcpoint ctx command | _ -> print_endline (Printf.sprintf "unknown gdb command [%s]" command); - "E02" + "" with Unimplemented s -> print_endline (Printf.sprintf "loser. unimplemented command [%s][%s]" diff -r 5f1ed597f107 -r 8799d14bef77 tools/debugger/pdb/linux-2.6-module/debug.c --- a/tools/debugger/pdb/linux-2.6-module/debug.c Wed Aug 24 02:43:18 2005 +++ b/tools/debugger/pdb/linux-2.6-module/debug.c Thu Aug 25 22:53:20 2005 @@ -9,33 +9,143 @@ #include <asm-i386/kdebug.h> #include <asm-xen/asm-i386/processor.h> #include <asm-xen/asm-i386/ptrace.h> +#include <asm-xen/asm-i386/tlbflush.h> #include <asm-xen/xen-public/xen.h> #include "pdb_module.h" #include "pdb_debug.h" -#define BWC_DEBUG 1 -#define BWC_INT3 3 + +static int pdb_debug_fn (struct pt_regs *regs, long error_code, + unsigned int condition); +static int pdb_int3_fn (struct pt_regs *regs, long error_code); +static int pdb_page_fault_fn (struct pt_regs *regs, long error_code, + unsigned int condition); + +/***********************************************************************/ + typedef struct bwcpoint /* break/watch/catch point */ { struct list_head list; - memory_t address; - u32 domain; + unsigned long address; + int length; + + u8 type; /* BWC_??? */ + u8 mode; /* for BWC_PAGE, the current protection mode */ u32 process; - u8 old_value; /* old value for software bkpt */ - u8 type; /* BWC_??? */ + u8 error; /* error occured when enabling: don't disable. 
*/ + + /* original values */ + u8 orig_bkpt; /* single byte breakpoint */ + pte_t orig_pte; + + struct list_head watchpt_read_list; /* read watchpoints on this page */ + struct list_head watchpt_write_list; /* write */ + struct list_head watchpt_access_list; /* access */ + struct list_head watchpt_disabled_list; /* disabled */ + + struct bwcpoint *parent; /* watchpoint: bwc_watch (the page) */ + struct bwcpoint *watchpoint; /* bwc_watch_step: original watchpoint */ } bwcpoint_t, *bwcpoint_p; -static bwcpoint_t bwcpoint_list; +static struct list_head bwcpoint_list = LIST_HEAD_INIT(bwcpoint_list); + +#define _pdb_bwcpoint_alloc(_var) \ +{ \ + if ( (_var = kmalloc(sizeof(bwcpoint_t), GFP_KERNEL)) == NULL ) \ + printk("error: unable to allocate memory %d\n", __LINE__); \ + else { \ + memset(_var, 0, sizeof(bwcpoint_t)); \ + INIT_LIST_HEAD(&_var->watchpt_read_list); \ + INIT_LIST_HEAD(&_var->watchpt_write_list); \ + INIT_LIST_HEAD(&_var->watchpt_access_list); \ + INIT_LIST_HEAD(&_var->watchpt_disabled_list); \ + } \ +} + +/***********************************************************************/ + +static void _pdb_bwc_print_list (struct list_head *, char *, int); + +static void +_pdb_bwc_print (bwcpoint_p bwc, char *label, int level) +{ + printk("%s%03d 0x%08lx:0x%02x %c\n", label, bwc->type, + bwc->address, bwc->length, bwc->error ? 
'e' : '-'); + + if ( !list_empty(&bwc->watchpt_read_list) ) + _pdb_bwc_print_list(&bwc->watchpt_read_list, "r", level); + if ( !list_empty(&bwc->watchpt_write_list) ) + _pdb_bwc_print_list(&bwc->watchpt_write_list, "w", level); + if ( !list_empty(&bwc->watchpt_access_list) ) + _pdb_bwc_print_list(&bwc->watchpt_access_list, "a", level); + if ( !list_empty(&bwc->watchpt_disabled_list) ) + _pdb_bwc_print_list(&bwc->watchpt_disabled_list, "d", level); +} + +static void +_pdb_bwc_print_list (struct list_head *bwc_list, char *label, int level) +{ + struct list_head *ptr; + int counter = 0; + + list_for_each(ptr, bwc_list) + { + bwcpoint_p bwc = list_entry(ptr, bwcpoint_t, list); + printk(" %s[%02d]%s ", level > 0 ? " " : "", counter++, + level > 0 ? "" : " "); + _pdb_bwc_print(bwc, label, level+1); + } + + if (counter == 0) + { + printk(" empty list\n"); + } +} void -pdb_initialize_bwcpoint (void) -{ - memset((void *) &bwcpoint_list, 0, sizeof(bwcpoint_t)); - INIT_LIST_HEAD(&bwcpoint_list.list); - - return; -} - +pdb_bwc_print_list (void) +{ + _pdb_bwc_print_list(&bwcpoint_list, " ", 0); +} + +bwcpoint_p +pdb_search_watchpoint (u32 process, unsigned long address) +{ + bwcpoint_p bwc_watch = (bwcpoint_p) 0; + bwcpoint_p bwc_entry = (bwcpoint_p) 0; + struct list_head *ptr; + + list_for_each(ptr, &bwcpoint_list) /* find bwc page entry */ + { + bwc_watch = list_entry(ptr, bwcpoint_t, list); + if (bwc_watch->address == (address & PAGE_MASK)) break; + } + + if ( !bwc_watch ) + { + return (bwcpoint_p) 0; + } + +#define __pdb_search_watchpoint_list(__list) \ + list_for_each(ptr, (__list)) \ + { \ + bwc_entry = list_entry(ptr, bwcpoint_t, list); \ + if ( bwc_entry->process == process && \ + bwc_entry->address <= address && \ + bwc_entry->address + bwc_entry->length > address ) \ + return bwc_entry; \ + } + + __pdb_search_watchpoint_list(&bwc_watch->watchpt_read_list); + __pdb_search_watchpoint_list(&bwc_watch->watchpt_write_list); + 
__pdb_search_watchpoint_list(&bwc_watch->watchpt_access_list); + +#undef __pdb_search_watchpoint_list + + return (bwcpoint_p) 0; +} + +/*************************************************************/ int pdb_suspend (struct task_struct *target) @@ -134,6 +244,35 @@ *(unsigned long *) stack = value; return; +} + +int +pdb_read_register (struct task_struct *target, pdb_op_rd_reg_p op) +{ + int rc = 0; + + switch (op->reg) + { + case 0: op->value = _pdb_get_register(target, LINUX_EAX); break; + case 1: op->value = _pdb_get_register(target, LINUX_ECX); break; + case 2: op->value = _pdb_get_register(target, LINUX_EDX); break; + case 3: op->value = _pdb_get_register(target, LINUX_EBX); break; + case 4: op->value = _pdb_get_register(target, LINUX_ESP); break; + case 5: op->value = _pdb_get_register(target, LINUX_EBP); break; + case 6: op->value = _pdb_get_register(target, LINUX_ESI); break; + case 7: op->value = _pdb_get_register(target, LINUX_EDI); break; + case 8: op->value = _pdb_get_register(target, LINUX_EIP); break; + case 9: op->value = _pdb_get_register(target, LINUX_EFL); break; + + case 10: op->value = _pdb_get_register(target, LINUX_CS); break; + case 11: op->value = _pdb_get_register(target, LINUX_SS); break; + case 12: op->value = _pdb_get_register(target, LINUX_DS); break; + case 13: op->value = _pdb_get_register(target, LINUX_ES); break; + case 14: op->value = _pdb_get_register(target, LINUX_FS); break; + case 15: op->value = _pdb_get_register(target, LINUX_GS); break; + } + + return rc; } int @@ -209,18 +348,14 @@ eflags |= X86_EFLAGS_TF; _pdb_set_register(target, LINUX_EFL, eflags); - bkpt = kmalloc(sizeof(bwcpoint_t), GFP_KERNEL); - if ( bkpt == NULL ) - { - printk("error: unable to allocation memory\n"); - return -1; - } + _pdb_bwcpoint_alloc(bkpt); + if ( bkpt == NULL ) return -1; bkpt->process = target->pid; bkpt->address = 0; bkpt->type = BWC_DEBUG; - list_add(&bkpt->list, &bwcpoint_list.list); + list_add_tail(&bkpt->list, &bwcpoint_list); 
wake_up_process(target); @@ -229,7 +364,7 @@ int pdb_insert_memory_breakpoint (struct task_struct *target, - memory_t address, u32 length) + unsigned long address, u32 length) { int rc = 0; bwcpoint_p bkpt; @@ -237,38 +372,34 @@ printk("insert breakpoint %d:%lx len: %d\n", target->pid, address, length); - bkpt = kmalloc(sizeof(bwcpoint_t), GFP_KERNEL); - if ( bkpt == NULL ) - { - printk("error: unable to allocation memory\n"); + if ( length != 1 ) + { + printk("error: breakpoint length should be 1\n"); return -1; } - if ( length != 1 ) - { - printk("error: breakpoint length should be 1\n"); - kfree(bkpt); - return -1; - } + _pdb_bwcpoint_alloc(bkpt); + if ( bkpt == NULL ) return -1; bkpt->process = target->pid; bkpt->address = address; bkpt->type = BWC_INT3; - pdb_access_memory(target, address, &bkpt->old_value, 1, 0); - pdb_access_memory(target, address, &breakpoint_opcode, 1, 1); + pdb_access_memory(target, address, &bkpt->orig_bkpt, 1, PDB_MEM_READ); + pdb_access_memory(target, address, &breakpoint_opcode, 1, PDB_MEM_WRITE); - list_add(&bkpt->list, &bwcpoint_list.list); + list_add_tail(&bkpt->list, &bwcpoint_list); printk("breakpoint_set %d:%lx OLD: 0x%x\n", - target->pid, address, bkpt->old_value); + target->pid, address, bkpt->orig_bkpt); + pdb_bwc_print_list(); return rc; } int pdb_remove_memory_breakpoint (struct task_struct *target, - memory_t address, u32 length) + unsigned long address, u32 length) { int rc = 0; bwcpoint_p bkpt = NULL; @@ -276,7 +407,7 @@ printk ("remove breakpoint %d:%lx\n", target->pid, address); struct list_head *entry; - list_for_each(entry, &bwcpoint_list.list) + list_for_each(entry, &bwcpoint_list) { bkpt = list_entry(entry, bwcpoint_t, list); if ( target->pid == bkpt->process && @@ -285,17 +416,223 @@ break; } - if (bkpt == &bwcpoint_list || bkpt == NULL) + if (entry == &bwcpoint_list) { printk ("error: no breakpoint found\n"); return -1; } + pdb_access_memory(target, address, &bkpt->orig_bkpt, 1, PDB_MEM_WRITE); + 
list_del(&bkpt->list); - - pdb_access_memory(target, address, &bkpt->old_value, 1, 1); - kfree(bkpt); + + pdb_bwc_print_list(); + + return rc; +} + +#define PDB_PTE_UPDATE 1 +#define PDB_PTE_RESTORE 2 + +int +pdb_change_pte (struct task_struct *target, bwcpoint_p bwc, int mode) +{ + int rc = 0; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep; + + pgd = pgd_offset(target->mm, bwc->address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) return -1; + + pud = pud_offset(pgd, bwc->address); + if (pud_none(*pud) || unlikely(pud_bad(*pud))) return -2; + + pmd = pmd_offset(pud, bwc->address); + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) return -3; + + ptep = pte_offset_map(pmd, bwc->address); + if (!ptep) return -4; + + switch ( mode ) + { + case PDB_PTE_UPDATE: /* added or removed a watchpoint. update pte. */ + { + pte_t new_pte; + + if ( pte_val(bwc->parent->orig_pte) == 0 ) /* new watchpoint page */ + { + bwc->parent->orig_pte = *ptep; + } + + new_pte = bwc->parent->orig_pte; + + if ( !list_empty(&bwc->parent->watchpt_read_list) || + !list_empty(&bwc->parent->watchpt_access_list) ) + { + new_pte = pte_rdprotect(new_pte); + } + + if ( !list_empty(&bwc->parent->watchpt_write_list) || + !list_empty(&bwc->parent->watchpt_access_list) ) + { + new_pte = pte_wrprotect(new_pte); + } + + if ( pte_val(new_pte) != pte_val(*ptep) ) + { + *ptep = new_pte; + flush_tlb_mm(target->mm); + } + break; + } + case PDB_PTE_RESTORE : /* suspend watchpoint by restoring original pte */ + { + *ptep = bwc->parent->orig_pte; + flush_tlb_mm(target->mm); + break; + } + default : + { + printk("(linux) unknown mode %d %d\n", mode, __LINE__); + break; + } + } + + pte_unmap(ptep); /* can i flush the tlb before pte_unmap? 
*/ + + return rc; +} + +int +pdb_insert_watchpoint (struct task_struct *target, pdb_op_watchpt_p watchpt) +{ + int rc = 0; + + bwcpoint_p bwc_watch; + bwcpoint_p bwc_entry; + struct list_head *ptr; + unsigned long page = watchpt->address & PAGE_MASK; + struct list_head *watchpoint_list; + + printk("insert watchpoint: %d %x %x\n", + watchpt->type, watchpt->address, watchpt->length); + + list_for_each(ptr, &bwcpoint_list) /* find existing bwc page entry */ + { + bwc_watch = list_entry(ptr, bwcpoint_t, list); + + if (bwc_watch->address == page) goto got_bwc_watch; + } + + _pdb_bwcpoint_alloc(bwc_watch); /* create new bwc:watch */ + if ( bwc_watch == NULL ) return -1; + + bwc_watch->type = BWC_WATCH; + bwc_watch->process = target->pid; + bwc_watch->address = page; + + list_add_tail(&bwc_watch->list, &bwcpoint_list); + + got_bwc_watch: + + switch (watchpt->type) + { + case BWC_WATCH_READ: + watchpoint_list = &bwc_watch->watchpt_read_list; break; + case BWC_WATCH_WRITE: + watchpoint_list = &bwc_watch->watchpt_write_list; break; + case BWC_WATCH_ACCESS: + watchpoint_list = &bwc_watch->watchpt_access_list; break; + default: + printk("unknown type %d\n", watchpt->type); return -2; + } + + _pdb_bwcpoint_alloc(bwc_entry); /* create new bwc:entry */ + if ( bwc_entry == NULL ) return -1; + + bwc_entry->process = target->pid; + bwc_entry->address = watchpt->address; + bwc_entry->length = watchpt->length; + bwc_entry->type = watchpt->type; + bwc_entry->parent = bwc_watch; + + list_add_tail(&bwc_entry->list, watchpoint_list); + pdb_change_pte(target, bwc_entry, PDB_PTE_UPDATE); + + pdb_bwc_print_list(); + + return rc; +} + +int +pdb_remove_watchpoint (struct task_struct *target, pdb_op_watchpt_p watchpt) +{ + int rc = 0; + bwcpoint_p bwc_watch = (bwcpoint_p) NULL; + bwcpoint_p bwc_entry = (bwcpoint_p) NULL; + unsigned long page = watchpt->address & PAGE_MASK; + struct list_head *ptr; + struct list_head *watchpoint_list; + + printk("remove watchpoint: %d %x %x\n", + watchpt->type, 
watchpt->address, watchpt->length); + + list_for_each(ptr, &bwcpoint_list) /* find bwc page entry */ + { + bwc_watch = list_entry(ptr, bwcpoint_t, list); + if (bwc_watch->address == page) break; + } + + if ( !bwc_watch ) + { + printk("(linux) delete watchpoint: can't find bwc page 0x%08x\n", + watchpt->address); + return -1; + } + + switch (watchpt->type) + { + case BWC_WATCH_READ: + watchpoint_list = &bwc_watch->watchpt_read_list; break; + case BWC_WATCH_WRITE: + watchpoint_list = &bwc_watch->watchpt_write_list; break; + case BWC_WATCH_ACCESS: + watchpoint_list = &bwc_watch->watchpt_access_list; break; + default: + printk("unknown type %d\n", watchpt->type); return -2; + } + + list_for_each(ptr, watchpoint_list) /* find watchpoint */ + { + bwc_entry = list_entry(ptr, bwcpoint_t, list); + if ( bwc_entry->address == watchpt->address && + bwc_entry->length == watchpt->length ) break; + } + + if ( !bwc_entry ) /* or ptr == watchpoint_list */ + { + printk("(linux) delete watchpoint: can't find watchpoint 0x%08x\n", + watchpt->address); + return -1; + } + + list_del(&bwc_entry->list); + pdb_change_pte(target, bwc_entry, PDB_PTE_UPDATE); + kfree(bwc_entry); + + + if ( list_empty(&bwc_watch->watchpt_read_list) && + list_empty(&bwc_watch->watchpt_write_list) && + list_empty(&bwc_watch->watchpt_access_list) ) + { + list_del(&bwc_watch->list); + kfree(bwc_watch); + } + + pdb_bwc_print_list(); return rc; } @@ -312,16 +649,24 @@ switch (val) { case DIE_DEBUG: - if (pdb_debug_fn(args->regs, args->trapnr, args->err)) + if ( pdb_debug_fn(args->regs, args->trapnr, args->err) ) return NOTIFY_STOP; break; case DIE_TRAP: - if (args->trapnr == 3 && pdb_int3_fn(args->regs, args->err)) + if ( args->trapnr == 3 && pdb_int3_fn(args->regs, args->err) ) return NOTIFY_STOP; break; case DIE_INT3: /* without kprobes, we should never see DIE_INT3 */ + if ( pdb_int3_fn(args->regs, args->err) ) + return NOTIFY_STOP; + break; + case DIE_PAGE_FAULT: + if ( pdb_page_fault_fn(args->regs, 
args->trapnr, args->err) ) + return NOTIFY_STOP; + break; case DIE_GPF: - case DIE_PAGE_FAULT: + printk("---------------GPF\n"); + break; default: break; } @@ -330,70 +675,110 @@ } -int +static int pdb_debug_fn (struct pt_regs *regs, long error_code, unsigned int condition) { pdb_response_t resp; bwcpoint_p bkpt = NULL; - struct list_head *entry; - list_for_each(entry, &bwcpoint_list.list) + + printk("pdb_debug_fn\n"); + + list_for_each(entry, &bwcpoint_list) { bkpt = list_entry(entry, bwcpoint_t, list); if ( current->pid == bkpt->process && - bkpt->type == BWC_DEBUG ) + (bkpt->type == BWC_DEBUG || /* single step */ + bkpt->type == BWC_WATCH_STEP)) /* single step over watchpoint */ break; } - if (bkpt == &bwcpoint_list || bkpt == NULL) + if (entry == &bwcpoint_list) { printk("not my debug 0x%x 0x%lx\n", current->pid, regs->eip); return 0; } - list_del(&bkpt->list); - pdb_suspend(current); - printk("(pdb) debug pid: %d, eip: 0x%08lx\n", current->pid, regs->eip); + printk("(pdb) %s pid: %d, eip: 0x%08lx\n", + bkpt->type == BWC_DEBUG ? 
"debug" : "watch-step", + current->pid, regs->eip); regs->eflags &= ~X86_EFLAGS_TF; set_tsk_thread_flag(current, TIF_SINGLESTEP); - resp.operation = PDB_OPCODE_STEP; + switch (bkpt->type) + { + case BWC_DEBUG: + resp.operation = PDB_OPCODE_STEP; + break; + case BWC_WATCH_STEP: + { + struct list_head *watchpoint_list; + bwcpoint_p watch_page = bkpt->watchpoint->parent; + + switch (bkpt->watchpoint->type) + { + case BWC_WATCH_READ: + watchpoint_list = &watch_page->watchpt_read_list; break; + case BWC_WATCH_WRITE: + watchpoint_list = &watch_page->watchpt_write_list; break; + case BWC_WATCH_ACCESS: + watchpoint_list = &watch_page->watchpt_access_list; break; + default: + printk("unknown type %d\n", bkpt->watchpoint->type); return 0; + } + + resp.operation = PDB_OPCODE_WATCHPOINT; + list_del_init(&bkpt->watchpoint->list); + list_add_tail(&bkpt->watchpoint->list, watchpoint_list); + pdb_change_pte(current, bkpt->watchpoint, PDB_PTE_UPDATE); + pdb_bwc_print_list(); + break; + } + default: + printk("unknown breakpoint type %d %d\n", __LINE__, bkpt->type); + return 0; + } + resp.process = current->pid; resp.status = PDB_RESPONSE_OKAY; pdb_send_response(&resp); + list_del(&bkpt->list); + kfree(bkpt); + return 1; } -int +static int pdb_int3_fn (struct pt_regs *regs, long error_code) { pdb_response_t resp; bwcpoint_p bkpt = NULL; + unsigned long address = regs->eip - 1; struct list_head *entry; - list_for_each(entry, &bwcpoint_list.list) + list_for_each(entry, &bwcpoint_list) { bkpt = list_entry(entry, bwcpoint_t, list); if ( current->pid == bkpt->process && - regs->eip == bkpt->address && + address == bkpt->address && bkpt->type == BWC_INT3 ) break; } - if (bkpt == &bwcpoint_list || bkpt == NULL) - { - printk("not my int3 bkpt 0x%x 0x%lx\n", current->pid, regs->eip); + if (entry == &bwcpoint_list) + { + printk("not my int3 bkpt 0x%x 0x%lx\n", current->pid, address); return 0; } - printk("(pdb) int3 pid: %d, eip: 0x%08lx\n", current->pid, regs->eip); + printk("(pdb) int3 pid: 
%d, eip: 0x%08lx\n", current->pid, address); pdb_suspend(current); @@ -405,6 +790,54 @@ return 1; } + +static int +pdb_page_fault_fn (struct pt_regs *regs, long error_code, + unsigned int condition) +{ + unsigned long cr2; + unsigned long cr3; + bwcpoint_p bwc; + bwcpoint_p watchpt; + bwcpoint_p bkpt; + + __asm__ __volatile__ ("movl %%cr3,%0" : "=r" (cr3) : ); + __asm__ __volatile__ ("movl %%cr2,%0" : "=r" (cr2) : ); + + bwc = pdb_search_watchpoint(current->pid, cr2); + if ( !bwc ) + { + return 0; /* not mine */ + } + + printk("page_fault cr2:%08lx err:%lx eip:%08lx\n", + cr2, error_code, regs->eip); + + /* disable the watchpoint */ + watchpt = bwc->watchpoint; + list_del_init(&bwc->list); + list_add_tail(&bwc->list, &bwc->parent->watchpt_disabled_list); + pdb_change_pte(current, bwc, PDB_PTE_RESTORE); + + /* single step the faulting instruction */ + regs->eflags |= X86_EFLAGS_TF; + + /* create a bwcpoint entry so we know what to do once we regain control */ + _pdb_bwcpoint_alloc(bkpt); + if ( bkpt == NULL ) return -1; + + bkpt->process = current->pid; + bkpt->address = 0; + bkpt->type = BWC_WATCH_STEP; + bkpt->watchpoint = bwc; + + /* add to head so we see it first the next time we break */ + list_add(&bkpt->list, &bwcpoint_list); + + pdb_bwc_print_list(); + return 1; +} + /* * Local variables: diff -r 5f1ed597f107 -r 8799d14bef77 tools/debugger/pdb/linux-2.6-module/module.c --- a/tools/debugger/pdb/linux-2.6-module/module.c Wed Aug 24 02:43:18 2005 +++ b/tools/debugger/pdb/linux-2.6-module/module.c Thu Aug 25 22:53:20 2005 @@ -98,6 +98,11 @@ printk("(linux) detach 0x%x\n", request->process); resp.status = PDB_RESPONSE_OKAY; break; + case PDB_OPCODE_RD_REG : + resp.u.rd_reg.reg = request->u.rd_reg.reg; + pdb_read_register(target, &resp.u.rd_reg); + resp.status = PDB_RESPONSE_OKAY; + break; case PDB_OPCODE_RD_REGS : pdb_read_registers(target, &resp.u.rd_regs); resp.status = PDB_RESPONSE_OKAY; @@ -108,14 +113,16 @@ break; case PDB_OPCODE_RD_MEM : 
pdb_access_memory(target, request->u.rd_mem.address, - &resp.u.rd_mem.data, request->u.rd_mem.length, 0); + &resp.u.rd_mem.data, request->u.rd_mem.length, + PDB_MEM_READ); resp.u.rd_mem.address = request->u.rd_mem.address; resp.u.rd_mem.length = request->u.rd_mem.length; resp.status = PDB_RESPONSE_OKAY; break; case PDB_OPCODE_WR_MEM : pdb_access_memory(target, request->u.wr_mem.address, - &request->u.wr_mem.data, request->u.wr_mem.length, 1); + &request->u.wr_mem.data, request->u.wr_mem.length, + PDB_MEM_WRITE); resp.status = PDB_RESPONSE_OKAY; break; case PDB_OPCODE_CONTINUE : @@ -137,6 +144,14 @@ request->u.bkpt.length); resp.status = PDB_RESPONSE_OKAY; break; + case PDB_OPCODE_SET_WATCHPT : + pdb_insert_watchpoint(target, &request->u.watchpt); + resp.status = PDB_RESPONSE_OKAY; + break; + case PDB_OPCODE_CLR_WATCHPT : + pdb_remove_watchpoint(target, &request->u.watchpt); + resp.status = PDB_RESPONSE_OKAY; + break; default: printk("(pdb) unknown request operation %d\n", request->operation); resp.status = PDB_RESPONSE_ERROR; @@ -184,7 +199,7 @@ } static void -pdb_send_connection_status(int status, memory_t ring) +pdb_send_connection_status(int status, unsigned long ring) { ctrl_msg_t cmsg = { @@ -248,8 +263,6 @@ pdb_sring_t *sring; printk("----\npdb initialize %s %s\n", __DATE__, __TIME__); - - pdb_initialize_bwcpoint(); /* if ( xen_start_info.flags & SIF_INITDOMAIN ) diff -r 5f1ed597f107 -r 8799d14bef77 tools/debugger/pdb/linux-2.6-module/pdb_debug.h --- a/tools/debugger/pdb/linux-2.6-module/pdb_debug.h Wed Aug 24 02:43:18 2005 +++ b/tools/debugger/pdb/linux-2.6-module/pdb_debug.h Thu Aug 25 22:53:20 2005 @@ -6,6 +6,7 @@ void pdb_initialize_bwcpoint (void); int pdb_suspend (struct task_struct *target); int pdb_resume (struct task_struct *target); +int pdb_read_register (struct task_struct *target, pdb_op_rd_reg_p op); int pdb_read_registers (struct task_struct *target, pdb_op_rd_regs_p op); int pdb_write_register (struct task_struct *target, pdb_op_wr_reg_p op); 
int pdb_read_memory (struct task_struct *target, pdb_op_rd_mem_req_p req, @@ -17,16 +18,16 @@ int pdb_step (struct task_struct *target); int pdb_insert_memory_breakpoint (struct task_struct *target, - memory_t address, u32 length); + unsigned long address, u32 length); int pdb_remove_memory_breakpoint (struct task_struct *target, - memory_t address, u32 length); + unsigned long address, u32 length); +int pdb_insert_watchpoint (struct task_struct *target, + pdb_op_watchpt_p watchpt); +int pdb_remove_watchpoint (struct task_struct *target, + pdb_op_watchpt_p watchpt); int pdb_exceptions_notify (struct notifier_block *self, unsigned long val, void *data); - -int pdb_debug_fn (struct pt_regs *regs, long error_code, - unsigned int condition); -int pdb_int3_fn (struct pt_regs *regs, long error_code); /* module.c */ void pdb_send_response (pdb_response_t *response); diff -r 5f1ed597f107 -r 8799d14bef77 tools/debugger/pdb/linux-2.6-module/pdb_module.h --- a/tools/debugger/pdb/linux-2.6-module/pdb_module.h Wed Aug 24 02:43:18 2005 +++ b/tools/debugger/pdb/linux-2.6-module/pdb_module.h Thu Aug 25 22:53:20 2005 @@ -14,20 +14,27 @@ #define PDB_OPCODE_DETACH 3 -#define PDB_OPCODE_RD_REGS 4 +#define PDB_OPCODE_RD_REG 4 +typedef struct pdb_op_rd_reg +{ + u32 reg; + u32 value; +} pdb_op_rd_reg_t, *pdb_op_rd_reg_p; + +#define PDB_OPCODE_RD_REGS 5 typedef struct pdb_op_rd_regs { u32 reg[GDB_REGISTER_FRAME_SIZE]; } pdb_op_rd_regs_t, *pdb_op_rd_regs_p; -#define PDB_OPCODE_WR_REG 5 +#define PDB_OPCODE_WR_REG 6 typedef struct pdb_op_wr_reg { u32 reg; u32 value; } pdb_op_wr_reg_t, *pdb_op_wr_reg_p; -#define PDB_OPCODE_RD_MEM 6 +#define PDB_OPCODE_RD_MEM 7 typedef struct pdb_op_rd_mem_req { u32 address; @@ -41,7 +48,7 @@ u8 data[1024]; } pdb_op_rd_mem_resp_t, *pdb_op_rd_mem_resp_p; -#define PDB_OPCODE_WR_MEM 7 +#define PDB_OPCODE_WR_MEM 8 typedef struct pdb_op_wr_mem { u32 address; @@ -49,16 +56,33 @@ u8 data[1024]; /* arbitrary */ } pdb_op_wr_mem_t, *pdb_op_wr_mem_p; -#define 
PDB_OPCODE_CONTINUE 8 -#define PDB_OPCODE_STEP 9 +#define PDB_OPCODE_CONTINUE 9 +#define PDB_OPCODE_STEP 10 -#define PDB_OPCODE_SET_BKPT 10 -#define PDB_OPCODE_CLR_BKPT 11 +#define PDB_OPCODE_SET_BKPT 11 +#define PDB_OPCODE_CLR_BKPT 12 typedef struct pdb_op_bkpt { u32 address; u32 length; } pdb_op_bkpt_t, *pdb_op_bkpt_p; + +#define PDB_OPCODE_SET_WATCHPT 13 +#define PDB_OPCODE_CLR_WATCHPT 14 +#define PDB_OPCODE_WATCHPOINT 15 +typedef struct pdb_op_watchpt +{ +#define BWC_DEBUG 1 +#define BWC_INT3 3 +#define BWC_WATCH 100 /* pdb: watchpoint page */ +#define BWC_WATCH_STEP 101 /* pdb: watchpoint single step */ +#define BWC_WATCH_WRITE 102 +#define BWC_WATCH_READ 103 +#define BWC_WATCH_ACCESS 104 + u32 type; + u32 address; + u32 length; +} pdb_op_watchpt_t, *pdb_op_watchpt_p; typedef struct @@ -68,10 +92,12 @@ union { pdb_op_attach_t attach; + pdb_op_rd_reg_t rd_reg; pdb_op_wr_reg_t wr_reg; pdb_op_rd_mem_req_t rd_mem; pdb_op_wr_mem_t wr_mem; pdb_op_bkpt_t bkpt; + pdb_op_watchpt_t watchpt; } u; } pdb_request_t, *pdb_request_p; @@ -87,6 +113,7 @@ s16 status; /* PDB_RESPONSE_??? 
*/ union { + pdb_op_rd_reg_t rd_reg; pdb_op_rd_regs_t rd_regs; pdb_op_rd_mem_resp_t rd_mem; } u; @@ -94,6 +121,11 @@ DEFINE_RING_TYPES(pdb, pdb_request_t, pdb_response_t); + + +/* from access_process_vm */ +#define PDB_MEM_READ 0 +#define PDB_MEM_WRITE 1 #endif diff -r 5f1ed597f107 -r 8799d14bef77 tools/debugger/pdb/linux-2.6-patches/i386_ksyms.patch --- a/tools/debugger/pdb/linux-2.6-patches/i386_ksyms.patch Wed Aug 24 02:43:18 2005 +++ b/tools/debugger/pdb/linux-2.6-patches/i386_ksyms.patch Thu Aug 25 22:53:20 2005 @@ -1,7 +1,15 @@ diff -u linux-2.6.12/arch/xen/i386/kernel/i386_ksyms.c linux-2.6.12-pdb/arch/xen/i386/kernel/i386_ksyms.c --- linux-2.6.12/arch/xen/i386/kernel/i386_ksyms.c 2005-07-31 22:36:50.000000000 +0100 +++ linux-2.6.12-pdb/arch/xen/i386/kernel/i386_ksyms.c 2005-08-01 10:57:31.000000000 +0100 -@@ -172,6 +172,7 @@ +@@ -151,6 +151,7 @@ + /* TLB flushing */ + EXPORT_SYMBOL(flush_tlb_page); + #endif ++EXPORT_SYMBOL(flush_tlb_mm); + + #ifdef CONFIG_X86_IO_APIC + EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); +@@ -172,6 +173,7 @@ EXPORT_SYMBOL_GPL(unset_nmi_callback); EXPORT_SYMBOL(register_die_notifier); diff -r 5f1ed597f107 -r 8799d14bef77 tools/debugger/pdb/pdb_caml_domain.c --- a/tools/debugger/pdb/pdb_caml_domain.c Wed Aug 24 02:43:18 2005 +++ b/tools/debugger/pdb/pdb_caml_domain.c Thu Aug 25 22:53:20 2005 @@ -6,7 +6,7 @@ * PDB's OCaml interface library for debugging domains */ -#include <xc.h> +#include <xenctrl.h> #include <xendebug.h> #include <errno.h> #include <stdio.h> @@ -41,6 +41,54 @@ /****************************************************************************/ + +/* + * dom_read_register : context_t -> int -> int32 + */ +value +dom_read_register (value context, value reg) +{ + CAMLparam2(context, reg); + CAMLlocal1(result); + + int my_reg = Int_val(reg); + cpu_user_regs_t *regs; + context_t ctx; + + decode_context(&ctx, context); + + if ( xendebug_read_registers(xc_handle, ctx.domain, ctx.vcpu, &regs) ) + { + printf("(pdb) read registers
error!\n"); fflush(stdout); + failwith("read registers error"); + } + + dump_regs(regs); + + result = caml_alloc_tuple(16); + + switch (my_reg) + { + case GDB_EAX: result = caml_copy_int32(regs->eax); break; + case GDB_ECX: result = caml_copy_int32(regs->ecx); break; + case GDB_EDX: result = caml_copy_int32(regs->edx); break; + case GDB_EBX: result = caml_copy_int32(regs->ebx); break; + case GDB_ESP: result = caml_copy_int32(regs->esp); break; + case GDB_EBP: result = caml_copy_int32(regs->ebp); break; + case GDB_ESI: result = caml_copy_int32(regs->esi); break; + case GDB_EDI: result = caml_copy_int32(regs->edi); break; + case GDB_EIP: result = caml_copy_int32(regs->eip); break; + case GDB_EFL: result = caml_copy_int32(regs->eflags); break; + case GDB_CS: result = caml_copy_int32(regs->cs); break; + case GDB_SS: result = caml_copy_int32(regs->ss); break; + case GDB_DS: result = caml_copy_int32(regs->ds); break; + case GDB_ES: result = caml_copy_int32(regs->es); break; + case GDB_FS: result = caml_copy_int32(regs->fs); break; + case GDB_GS: result = caml_copy_int32(regs->gs); break; + } + + CAMLreturn(result); +} /* * dom_read_registers : context_t -> int32 @@ -155,7 +203,7 @@ context_t ctx; int loop; char *buffer; - memory_t my_address = Int32_val(address); + unsigned long my_address = Int32_val(address); u32 my_length = Int_val(length); printf ("(pdb) read memory\n"); @@ -211,7 +259,7 @@ context_t ctx; char buffer[4096]; /* a big buffer */ - memory_t my_address; + unsigned long my_address; u32 length = 0; printf ("(pdb) write memory\n"); @@ -231,7 +279,7 @@ } buffer[length++] = Int_val(Field(node, 0)); - my_address = (memory_t) Int32_val(address); + my_address = (unsigned long) Int32_val(address); if ( xendebug_write_memory(xc_handle, ctx.domain, ctx.vcpu, my_address, length, buffer) ) @@ -296,7 +344,7 @@ CAMLparam3(context, address, length); context_t ctx; - memory_t my_address = (memory_t) Int32_val(address); + unsigned long my_address = (unsigned long) 
Int32_val(address); int my_length = Int_val(length); decode_context(&ctx, context); @@ -325,7 +373,7 @@ context_t ctx; - memory_t my_address = (memory_t) Int32_val(address); + unsigned long my_address = (unsigned long) Int32_val(address); int my_length = Int_val(length); printf ("(pdb) remove memory breakpoint 0x%lx %d\n", diff -r 5f1ed597f107 -r 8799d14bef77 tools/debugger/pdb/pdb_caml_evtchn.c --- a/tools/debugger/pdb/pdb_caml_evtchn.c Wed Aug 24 02:43:18 2005 +++ b/tools/debugger/pdb/pdb_caml_evtchn.c Thu Aug 25 22:53:20 2005 @@ -6,7 +6,7 @@ * PDB's OCaml interface library for event channels */ -#include <xc.h> +#include <xenctrl.h> #include <stdio.h> #include <stdlib.h> #include <string.h> diff -r 5f1ed597f107 -r 8799d14bef77 tools/debugger/pdb/pdb_caml_process.c --- a/tools/debugger/pdb/pdb_caml_process.c Wed Aug 24 02:43:18 2005 +++ b/tools/debugger/pdb/pdb_caml_process.c Thu Aug 25 22:53:20 2005 @@ -15,7 +15,7 @@ #include <caml/memory.h> #include <caml/mlvalues.h> -#include <xc.h> +#include <xenctrl.h> #include <xen/xen.h> #include <xen/io/domain_controller.h> #include <xen/linux/privcmd.h> @@ -113,6 +113,12 @@ case PDB_OPCODE_DETACH : break; + case PDB_OPCODE_RD_REG : + { + sprintf(&msg[0], "%08x", _flip(resp->u.rd_reg.value)); + break; + } + case PDB_OPCODE_RD_REGS : { int loop; @@ -161,16 +167,22 @@ } case PDB_OPCODE_SET_BKPT : - { - break; - } case PDB_OPCODE_CLR_BKPT : - { + case PDB_OPCODE_SET_WATCHPT : + case PDB_OPCODE_CLR_WATCHPT : + { + break; + } + + case PDB_OPCODE_WATCHPOINT : + { + sprintf(msg, "S05"); break; } default : - printf("(linux) UNKNOWN MESSAGE TYPE IN RESPONSE\n"); + printf("(linux) UNKNOWN MESSAGE TYPE IN RESPONSE %d\n", + resp->operation); break; } @@ -258,6 +270,32 @@ CAMLreturn(Val_unit); } + + +/* + * proc_read_register : context_t -> int -> unit + */ +value +proc_read_register (value context, value reg) +{ + CAMLparam1(context); + + pdb_request_t req; + context_t ctx; + int my_reg = Int_val(reg); + + decode_context(&ctx, 
context); + + req.operation = PDB_OPCODE_RD_REG; + req.process = ctx.process; + req.u.rd_reg.reg = my_reg; + req.u.rd_reg.value = 0; + + send_request (ctx.ring, ctx.evtchn, &req); + + CAMLreturn(Val_unit); +} + /* @@ -443,7 +481,7 @@ /* - * proc_insert_memory_breakpoint : context_t -> int32 -> int list -> unit + * proc_insert_memory_breakpoint : context_t -> int32 -> int -> unit */ value proc_insert_memory_breakpoint (value context, value address, value length) @@ -457,7 +495,7 @@ req.operation = PDB_OPCODE_SET_BKPT; req.process = ctx.process; - req.u.bkpt.address = (memory_t) Int32_val(address); + req.u.bkpt.address = (unsigned long) Int32_val(address); req.u.bkpt.length = Int_val(length); send_request(ctx.ring, ctx.evtchn, &req); @@ -466,7 +504,7 @@ } /* - * proc_remove_memory_breakpoint : context_t -> int32 -> int list -> unit + * proc_remove_memory_breakpoint : context_t -> int32 -> int -> unit */ value proc_remove_memory_breakpoint (value context, value address, value length) @@ -480,8 +518,56 @@ req.operation = PDB_OPCODE_CLR_BKPT; req.process = ctx.process; - req.u.bkpt.address = (memory_t) Int32_val(address); + req.u.bkpt.address = (unsigned long) Int32_val(address); req.u.bkpt.length = Int_val(length); + + send_request(ctx.ring, ctx.evtchn, &req); + + CAMLreturn(Val_unit); +} + +/* + * proc_insert_watchpoint : context_t -> bwcpoint_t -> int32 -> int -> unit + */ +value +proc_insert_watchpoint (value context, value kind, value address, value length) +{ + CAMLparam3(context, address, length); + + context_t ctx; + pdb_request_t req; + + decode_context(&ctx, context); + + req.operation = PDB_OPCODE_SET_WATCHPT; + req.process = ctx.process; + req.u.watchpt.type = Int_val(kind); + req.u.watchpt.address = (unsigned long) Int32_val(address); + req.u.watchpt.length = Int_val(length); + + send_request(ctx.ring, ctx.evtchn, &req); + + CAMLreturn(Val_unit); +} + +/* + * proc_remove_watchpoint : context_t -> bwcpoint_t -> int32 -> int -> unit + */ +value 
+proc_remove_watchpoint (value context, value kind, value address, value length) +{ + CAMLparam3(context, address, length); + + context_t ctx; + pdb_request_t req; + + decode_context(&ctx, context); + + req.operation = PDB_OPCODE_CLR_WATCHPT; + req.process = ctx.process; + req.u.watchpt.type = Int_val(kind); + req.u.watchpt.address = (unsigned long) Int32_val(address); + req.u.watchpt.length = Int_val(length); send_request(ctx.ring, ctx.evtchn, &req); diff -r 5f1ed597f107 -r 8799d14bef77 tools/debugger/pdb/pdb_caml_xc.c --- a/tools/debugger/pdb/pdb_caml_xc.c Wed Aug 24 02:43:18 2005 +++ b/tools/debugger/pdb/pdb_caml_xc.c Thu Aug 25 22:53:20 2005 @@ -6,7 +6,7 @@ * PDB's OCaml interface library for debugging domains */ -#include <xc.h> +#include <xenctrl.h> #include <xendebug.h> #include <errno.h> #include <stdio.h> diff -r 5f1ed597f107 -r 8799d14bef77 tools/debugger/pdb/pdb_caml_xcs.c --- a/tools/debugger/pdb/pdb_caml_xcs.c Wed Aug 24 02:43:18 2005 +++ b/tools/debugger/pdb/pdb_caml_xcs.c Thu Aug 25 22:53:20 2005 @@ -17,7 +17,7 @@ #include <sys/types.h> #include <sys/socket.h> #include <errno.h> -#include <xc.h> +#include <xenctrl.h> #include <xen/xen.h> #include <xen/io/domain_controller.h> @@ -50,7 +50,7 @@ { CAMLparam2(domain, ring); int my_domain = Int_val(domain); - memory_t my_ring = Int32_val(ring); + unsigned long my_ring = Int32_val(ring); pdb_front_ring_t *front_ring; pdb_sring_t *sring; diff -r 5f1ed597f107 -r 8799d14bef77 tools/debugger/pdb/pdb_xen.c --- a/tools/debugger/pdb/pdb_xen.c Wed Aug 24 02:43:18 2005 +++ b/tools/debugger/pdb/pdb_xen.c Thu Aug 25 22:53:20 2005 @@ -7,7 +7,7 @@ * PDB interface library for accessing Xen */ -#include <xc.h> +#include <xenctrl.h> #include <stdio.h> #include <stdlib.h> #include <errno.h> diff -r 5f1ed597f107 -r 8799d14bef77 tools/debugger/pdb/readme --- a/tools/debugger/pdb/readme Wed Aug 24 02:43:18 2005 +++ b/tools/debugger/pdb/readme Thu Aug 25 22:53:20 2005 @@ -1,9 +1,9 @@ -PDB 0.3 +PDB 0.3.3 
http://www.cl.cam.ac.uk/netos/pdb Alex Ho -June 2005 +August 2005 This is the latest incarnation of the pervasive debugger. @@ -79,6 +79,11 @@ Process PDB can also debug a process running in a Linux 2.6 domain. + You will need to patch the Linux 2.6 domain U tree to export some + additional symbols for the pdb module + + % make -C linux-2.6-patches + After running PDB in domain 0, insert the pdb module in dom u: % insmod linux-2.6-module/pdb.ko @@ -87,7 +92,14 @@ (gdb) maint packet x context = process <domid> <pid> + Read, write, and access watchpoint should also work for processes, + use the "rwatch", "watch" and "awatch" gdb commands respectively. + + If you are having trouble with GDB 5.3 (i386-redhat-linux-gnu), + try GDB 6.3 (configured with --target=i386-linux-gnu). + + To Do -- watchpoints +- watchpoints for domains - support for SMP diff -r 5f1ed597f107 -r 8799d14bef77 tools/examples/Makefile --- a/tools/examples/Makefile Wed Aug 24 02:43:18 2005 +++ b/tools/examples/Makefile Thu Aug 25 22:53:20 2005 @@ -16,7 +16,7 @@ # Xen script dir and scripts to go there. 
XEN_SCRIPT_DIR = /etc/xen/scripts -XEN_SCRIPTS = network vif-bridge +XEN_SCRIPTS = network-bridge vif-bridge XEN_SCRIPTS += network-route vif-route XEN_SCRIPTS += block-file XEN_SCRIPTS += block-enbd @@ -24,10 +24,14 @@ XEN_BOOT_DIR = /usr/lib/xen/boot XEN_BOOT = mem-map.sxp +XEN_HOTPLUG_DIR = /etc/hotplug.d/xen-backend +XEN_HOTPLUG_SCRIPTS = backend.hotplug + all: build: -install: all install-initd install-configs install-scripts install-boot +install: all install-initd install-configs install-scripts install-boot \ + install-hotplug install-initd: [ -d $(DESTDIR)/etc/init.d ] || $(INSTALL_DIR) $(DESTDIR)/etc/init.d @@ -60,4 +64,12 @@ $(INSTALL_PROG) $$i $(DESTDIR)$(XEN_BOOT_DIR); \ done +install-hotplug: + [ -d $(DESTDIR)$(XEN_HOTPLUG_DIR) ] || \ + $(INSTALL_DIR) $(DESTDIR)$(XEN_HOTPLUG_DIR) + for i in $(XEN_HOTPLUG_SCRIPTS); \ + do [ -a $(DESTDIR)$(XEN_HOTPLUG_DIR)/$$i ] || \ + $(INSTALL_PROG) $$i $(DESTDIR)$(XEN_HOTPLUG_DIR); \ + done + clean: diff -r 5f1ed597f107 -r 8799d14bef77 tools/examples/README --- a/tools/examples/README Wed Aug 24 02:43:18 2005 +++ b/tools/examples/README Thu Aug 25 22:53:20 2005 @@ -9,9 +9,20 @@ send it (preferably with a little summary to go in this file) to <xen-devel@xxxxxxxxxxxxxxxxxxxxx> so we can add it to this directory. +block-enbd - binds/unbinds network block devices +block-file - binds/unbinds file to loopback device +mem-map.sxp - memory map xend configuration file. network - default network setup script called by xend at startup. +network-route - default xen network start/stop script. +network-nat - default xen network start/stop script when using NAT. vif-bridge - default virtual network interface setup script. +vif-route - default xen virtual network start/stop script +vif-nat - configures vif in routed-nat mode. xend-config.sxp - default xend configuration file. xmexample1 - example configuration script for 'xm create'. xmexample2 - a more complex configuration script for 'xm create'. 
+xmexample3 - an advanced configuration script for 'xm create' + that utilizes the vmid. +xmexample.vmx - a configuration script for creating a vmx domain with + 'xm create'. diff -r 5f1ed597f107 -r 8799d14bef77 tools/examples/vif-bridge --- a/tools/examples/vif-bridge Wed Aug 24 02:43:18 2005 +++ b/tools/examples/vif-bridge Thu Aug 25 22:53:20 2005 @@ -74,8 +74,10 @@ exit fi -# Add/remove vif to/from bridge. -brctl ${brcmd} ${bridge} ${vif} +# Add vif to bridge. vifs are auto-removed from bridge. +if [ "${brcmd}" == "addif" ] ; then + brctl ${brcmd} ${bridge} ${vif} +fi ifconfig ${vif} $OP if [ ${ip} ] ; then diff -r 5f1ed597f107 -r 8799d14bef77 tools/examples/xend-config.sxp --- a/tools/examples/xend-config.sxp Wed Aug 24 02:43:18 2005 +++ b/tools/examples/xend-config.sxp Thu Aug 25 22:53:20 2005 @@ -28,7 +28,7 @@ ## Use the following if VIF traffic is bridged. # The script used to start/stop networking for xend. -(network-script network) +(network-script network-bridge) # The default bridge that virtual interfaces should be connected to. (vif-bridge xen-br0) # The default script used to control virtual interfaces. diff -r 5f1ed597f107 -r 8799d14bef77 tools/examples/xmexample.vmx --- a/tools/examples/xmexample.vmx Wed Aug 24 02:43:18 2005 +++ b/tools/examples/xmexample.vmx Thu Aug 25 22:53:20 2005 @@ -10,13 +10,8 @@ # Kernel image file. kernel = "/usr/lib/xen/boot/vmxloader" -# Optional ramdisk. -#ramdisk = "/boot/initrd.gz" - -# The domain build function. Default is 'linux'. +# The domain build function. VMX domain uses 'vmx'. builder='vmx' -#builder='linux' -#builder='netbsd' # Initial memory allocation (in megabytes) for the new domain. memory = 128 @@ -26,13 +21,6 @@ # Which CPU to start domain on? #cpu = -1 # leave to Xen to pick - -#---------------------------------------------------------------------------- -# Define network interfaces. - -# Number of network interfaces. Default is 1. 
-#nics=1 -nics=0 # Optionally define mac and/or bridge for the network interfaces. # Random MACs are assigned if not given. @@ -46,37 +34,7 @@ # and MODE is r for read-only, w for read-write. #disk = [ 'phy:hda1,hda1,r' ] -disk = [ 'file:/var/images/min-el3-i386.img,hda,w' ] - -#---------------------------------------------------------------------------- -# Set the kernel command line for the new domain. -# You only need to define the IP parameters and hostname if the domain's -# IP config doesn't, e.g. in ifcfg-eth0 or via DHCP. -# You can use 'extra' to set the runlevel and custom environment -# variables used by custom rc scripts (e.g. VMID=, usr= ). - -# Set if you want dhcp to allocate the IP address. -#dhcp="dhcp" -# Set netmask. -#netmask= -# Set default gateway. -#gateway= -# Set the hostname. -#hostname= "vm%d" % vmid - -# Set root device. -#root = "/dev/ram0" -root = "/dev/hda1 ro" - -# Root device for nfs. -#root = "/dev/nfs" -# The nfs server. -#nfs_server = '169.254.1.0' -# Root directory on the nfs server. -#nfs_root = '/full/path/to/root/directory' - -# Sets runlevel 4. -#extra = "acpi=off console=ttyS0 console=tty0 1" +disk = [ 'file:/var/images/min-el3-i386.img,ioemu:hda,w' ] #---------------------------------------------------------------------------- # Set according to whether you want the domain restarted when it exits. @@ -125,15 +83,10 @@ #----------------------------------------------------------------------------- -# set the real time clock to local time [default=utc] -#localtime='utc' +# set the real time clock to local time [default=0 i.e. 
set to utc] +#localtime=1 #----------------------------------------------------------------------------- # start in full screen #full-screen=1 - -#----------------------------------------------------------------------------- -# set the mac address of the first interface -#macaddr= - diff -r 5f1ed597f107 -r 8799d14bef77 tools/firmware/acpi/acpi2_0.h --- a/tools/firmware/acpi/acpi2_0.h Wed Aug 24 02:43:18 2005 +++ b/tools/firmware/acpi/acpi2_0.h Thu Aug 25 22:53:20 2005 @@ -18,7 +18,7 @@ #ifndef _ACPI_2_0_H_ #define _ACPI_2_0_H_ -#include "xc.h" // for u8, u16, u32, u64 definition +#include "xenctrl.h" // for u8, u16, u32, u64 definition #pragma pack (1) diff -r 5f1ed597f107 -r 8799d14bef77 tools/ioemu/hw/i8254.c --- a/tools/ioemu/hw/i8254.c Wed Aug 24 02:43:18 2005 +++ b/tools/ioemu/hw/i8254.c Thu Aug 25 22:53:20 2005 @@ -22,7 +22,7 @@ * THE SOFTWARE. */ #include "vl.h" -#include "xc.h" +#include "xenctrl.h" #include <io/ioreq.h> //#define DEBUG_PIT diff -r 5f1ed597f107 -r 8799d14bef77 tools/ioemu/hw/i8259.c --- a/tools/ioemu/hw/i8259.c Wed Aug 24 02:43:18 2005 +++ b/tools/ioemu/hw/i8259.c Thu Aug 25 22:53:20 2005 @@ -22,7 +22,7 @@ * THE SOFTWARE. 
*/ #include "vl.h" -#include "xc.h" +#include "xenctrl.h" #include <io/ioreq.h> /* debug PIC */ diff -r 5f1ed597f107 -r 8799d14bef77 tools/ioemu/hw/ide.c --- a/tools/ioemu/hw/ide.c Wed Aug 24 02:43:18 2005 +++ b/tools/ioemu/hw/ide.c Thu Aug 25 22:53:20 2005 @@ -430,6 +430,7 @@ put_le16(p + 59, 0x100 | s->mult_sectors); put_le16(p + 60, s->nb_sectors); put_le16(p + 61, s->nb_sectors >> 16); + put_le16(p + 63, 0x07); put_le16(p + 80, (1 << 1) | (1 << 2)); put_le16(p + 82, (1 << 14)); put_le16(p + 83, (1 << 14)); @@ -460,7 +461,7 @@ put_le16(p + 48, 1); /* dword I/O (XXX: should not be set on CDROM) */ put_le16(p + 49, 1 << 9); /* LBA supported, no DMA */ put_le16(p + 53, 3); /* words 64-70, 54-58 valid */ - put_le16(p + 63, 0x103); /* DMA modes XXX: may be incorrect */ + put_le16(p + 63, 0x07); /* Multi-word DMA mode 2 */ put_le16(p + 64, 1); /* PIO modes */ put_le16(p + 65, 0xb4); /* minimum DMA multiword tx cycle time */ put_le16(p + 66, 0xb4); /* recommended DMA multiword tx cycle time */ diff -r 5f1ed597f107 -r 8799d14bef77 tools/ioemu/hw/ioapic.h --- a/tools/ioemu/hw/ioapic.h Wed Aug 24 02:43:18 2005 +++ b/tools/ioemu/hw/ioapic.h Thu Aug 25 22:53:20 2005 @@ -26,7 +26,7 @@ #ifndef __IOAPIC_H #define __IOAPIC_H -#include "xc.h" +#include "xenctrl.h" #include <io/ioreq.h> #include <io/vmx_vlapic.h> diff -r 5f1ed597f107 -r 8799d14bef77 tools/ioemu/monitor.c --- a/tools/ioemu/monitor.c Wed Aug 24 02:43:18 2005 +++ b/tools/ioemu/monitor.c Thu Aug 25 22:53:20 2005 @@ -225,14 +225,10 @@ } } +extern void destroy_vmx_domain(void); static void do_quit(void) { - extern int domid; - extern FILE* logfile; - char destroy_cmd[20]; - sprintf(destroy_cmd, "xm destroy %d", domid); - if (system(destroy_cmd) == -1) - fprintf(logfile, "%s failed.!\n", destroy_cmd); + destroy_vmx_domain(); exit(0); } diff -r 5f1ed597f107 -r 8799d14bef77 tools/ioemu/target-i386-dm/Makefile --- a/tools/ioemu/target-i386-dm/Makefile Wed Aug 24 02:43:18 2005 +++ b/tools/ioemu/target-i386-dm/Makefile Thu 
Aug 25 22:53:20 2005 @@ -188,7 +188,7 @@ ######################################################### DEFINES+=-D_GNU_SOURCE -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -DAPIC_SUPPORT -LIBS+=-lm -L../../libxc -lxc +LIBS+=-lm -L../../libxc -lxenctrl ifndef CONFIG_USER_ONLY LIBS+=-lz endif @@ -376,10 +376,10 @@ $(CC) $(DEFINES) -c -o $@ $< clean: - rm -rf *.o *.a *~ $(PROGS) gen-op.h opc.h op.h nwfpe slirp qemu-vgaram-bin + rm -rf *.o *.a *~ $(PROGS) gen-op.h opc.h op.h nwfpe slirp distclean: - rm -rf *.o *.a *~ $(PROGS) gen-op.h opc.h op.h nwfpe slirp qemu-vgaram-bin + rm -rf *.o *.a *~ $(PROGS) gen-op.h opc.h op.h nwfpe slirp install: all if [ ! -d $(INSTALL_DIR) ];then mkdir -p $(INSTALL_DIR);fi @@ -387,8 +387,6 @@ install -m 755 -s $(PROGS) "$(INSTALL_DIR)" install -m 755 qemu-dm.debug "$(INSTALL_DIR)" install -m 755 qemu-ifup "$(DESTDIR)$(configdir)" - gunzip -c qemu-vgaram-bin.gz >qemu-vgaram-bin - install -m 755 qemu-vgaram-bin "$(DESTDIR)$(configdir)" ifneq ($(wildcard .depend),) include .depend endif diff -r 5f1ed597f107 -r 8799d14bef77 tools/ioemu/target-i386-dm/helper2.c --- a/tools/ioemu/target-i386-dm/helper2.c Wed Aug 24 02:43:18 2005 +++ b/tools/ioemu/target-i386-dm/helper2.c Thu Aug 25 22:53:20 2005 @@ -47,7 +47,7 @@ #include <fcntl.h> #include <sys/ioctl.h> -#include "xc.h" +#include "xenctrl.h" #include <io/ioreq.h> #include "cpu.h" @@ -55,6 +55,7 @@ #include "vl.h" shared_iopage_t *shared_page = NULL; +extern int reset_requested; CPUX86State *cpu_86_init(void) { @@ -327,7 +328,16 @@ env->send_event = 1; } -//static unsigned long tsc_per_tick = 1; /* XXX: calibrate */ +void +destroy_vmx_domain(void) +{ + extern int domid; + extern FILE* logfile; + char destroy_cmd[20]; + sprintf(destroy_cmd, "xm destroy %d", domid); + if (system(destroy_cmd) == -1) + fprintf(logfile, "%s failed.!\n", destroy_cmd); +} int main_loop(void) { @@ -348,6 +358,10 @@ if (vm_running) { if (shutdown_requested) { break; + } + if (reset_requested){ + qemu_system_reset(); + 
reset_requested = 0; } } @@ -391,7 +405,21 @@ } } } + destroy_vmx_domain(); return 0; +} + +static void +qemu_vmx_reset(void *unused) +{ + char cmd[255]; + extern int domid; + + /* pause domain first, to avoid repeated reboot request*/ + xc_domain_pause (xc_handle, domid); + + sprintf(cmd,"xm shutdown -R %d", domid); + system (cmd); } CPUState * @@ -400,7 +428,7 @@ CPUX86State *env; cpu_exec_init(); - + qemu_register_reset(qemu_vmx_reset, NULL); env = malloc(sizeof(CPUX86State)); if (!env) return NULL; @@ -427,3 +455,4 @@ return env; } + diff -r 5f1ed597f107 -r 8799d14bef77 tools/ioemu/vl.c --- a/tools/ioemu/vl.c Wed Aug 24 02:43:18 2005 +++ b/tools/ioemu/vl.c Thu Aug 25 22:53:20 2005 @@ -72,7 +72,7 @@ #endif #endif /* CONFIG_SDL */ -#include "xc.h" +#include "xenctrl.h" #include "exec-all.h" //#define DO_TB_FLUSH @@ -2030,7 +2030,7 @@ } QEMUResetEntry; static QEMUResetEntry *first_reset_entry; -static int reset_requested; +int reset_requested; int shutdown_requested; void qemu_register_reset(QEMUResetHandler *func, void *opaque) diff -r 5f1ed597f107 -r 8799d14bef77 tools/ioemu/vl.h --- a/tools/ioemu/vl.h Wed Aug 24 02:43:18 2005 +++ b/tools/ioemu/vl.h Thu Aug 25 22:53:20 2005 @@ -107,6 +107,7 @@ void qemu_register_reset(QEMUResetHandler *func, void *opaque); void qemu_system_reset_request(void); +void qemu_system_reset(void); void qemu_system_shutdown_request(void); void main_loop_wait(int timeout); diff -r 5f1ed597f107 -r 8799d14bef77 tools/libxc/Makefile --- a/tools/libxc/Makefile Wed Aug 24 02:43:18 2005 +++ b/tools/libxc/Makefile Thu Aug 25 22:53:20 2005 @@ -12,28 +12,32 @@ XEN_ROOT = ../.. 
include $(XEN_ROOT)/tools/Rules.mk -SRCS := -SRCS += xc_sedf.c -SRCS += xc_bvtsched.c -SRCS += xc_core.c -SRCS += xc_domain.c -SRCS += xc_evtchn.c -SRCS += xc_gnttab.c -SRCS += xc_load_bin.c -SRCS += xc_load_elf.c -SRCS += xc_linux_build.c -SRCS += xc_misc.c -SRCS += xc_physdev.c -SRCS += xc_private.c +SRCS := +BUILD_SRCS := +SRCS += xc_bvtsched.c +SRCS += xc_core.c +SRCS += xc_domain.c +SRCS += xc_evtchn.c +SRCS += xc_gnttab.c +SRCS += xc_misc.c +SRCS += xc_physdev.c +SRCS += xc_private.c +SRCS += xc_sedf.c + ifeq ($(XEN_TARGET_ARCH),ia64) -SRCS += xc_ia64_stubs.c +BUILD_SRCS += xc_ia64_stubs.c else -SRCS += xc_load_aout9.c -SRCS += xc_linux_restore.c -SRCS += xc_linux_save.c -SRCS += xc_vmx_build.c -SRCS += xc_ptrace.c -SRCS += xc_ptrace_core.c +SRCS += xc_ptrace.c +SRCS += xc_ptrace_core.c + +BUILD_SRCS := xc_load_aout9.c +BUILD_SRCS += xc_load_bin.c +BUILD_SRCS += xc_load_elf.c +BUILD_SRCS += xc_linux_build.c +BUILD_SRCS += xc_linux_restore.c +BUILD_SRCS += xc_linux_save.c +BUILD_SRCS += xc_vmx_build.c +BUILD_SRCS += xg_private.c endif CFLAGS += -Wall @@ -43,13 +47,20 @@ CFLAGS += $(INCLUDES) -I. # Get gcc to generate the dependencies for us. CFLAGS += -Wp,-MD,.$(@F).d +LDFLAGS += -L. 
DEPS = .*.d LIB_OBJS := $(patsubst %.c,%.o,$(SRCS)) PIC_OBJS := $(patsubst %.c,%.opic,$(SRCS)) -LIB := libxc.a libxc-pic.a -LIB += libxc.so libxc.so.$(MAJOR) libxc.so.$(MAJOR).$(MINOR) +LIB_BUILD_OBJS := $(patsubst %.c,%.o,$(BUILD_SRCS)) +PIC_BUILD_OBJS := $(patsubst %.c,%.opic,$(BUILD_SRCS)) + +LIB := libxenctrl.a +LIB += libxenctrl.so libxenctrl.so.$(MAJOR) libxenctrl.so.$(MAJOR).$(MINOR) + +LIB += libxenguest.a +LIB += libxenguest.so libxenguest.so.$(MAJOR) libxenguest.so.$(MAJOR).$(MINOR) all: build build: check-for-zlib mk-symlinks @@ -77,11 +88,16 @@ install: build [ -d $(DESTDIR)/usr/$(LIBDIR) ] || $(INSTALL_DIR) $(DESTDIR)/usr/$(LIBDIR) [ -d $(DESTDIR)/usr/include ] || $(INSTALL_DIR) $(DESTDIR)/usr/include - $(INSTALL_PROG) libxc.so.$(MAJOR).$(MINOR) $(DESTDIR)/usr/$(LIBDIR) - $(INSTALL_DATA) libxc.a $(DESTDIR)/usr/$(LIBDIR) - ln -sf libxc.so.$(MAJOR).$(MINOR) $(DESTDIR)/usr/$(LIBDIR)/libxc.so.$(MAJOR) - ln -sf libxc.so.$(MAJOR) $(DESTDIR)/usr/$(LIBDIR)/libxc.so - $(INSTALL_DATA) xc.h $(DESTDIR)/usr/include + $(INSTALL_PROG) libxenctrl.so.$(MAJOR).$(MINOR) $(DESTDIR)/usr/$(LIBDIR) + $(INSTALL_DATA) libxenctrl.a $(DESTDIR)/usr/$(LIBDIR) + ln -sf libxenctrl.so.$(MAJOR).$(MINOR) $(DESTDIR)/usr/$(LIBDIR)/libxenctrl.so.$(MAJOR) + ln -sf libxenctrl.so.$(MAJOR) $(DESTDIR)/usr/$(LIBDIR)/libxenctrl.so + $(INSTALL_DATA) xenctrl.h $(DESTDIR)/usr/include + + $(INSTALL_PROG) libxenguest.so.$(MAJOR).$(MINOR) $(DESTDIR)/usr/$(LIBDIR) + $(INSTALL_DATA) libxenguest.a $(DESTDIR)/usr/$(LIBDIR) + ln -sf libxenguest.so.$(MAJOR).$(MINOR) $(DESTDIR)/usr/$(LIBDIR)/libxenguest.so.$(MAJOR) + ln -sf libxenguest.so.$(MAJOR) $(DESTDIR)/usr/$(LIBDIR)/libxenguest.so .PHONY: TAGS clean rpm install all @@ -100,18 +116,30 @@ mv staging/i386/*.rpm . 
rm -rf staging -libxc.a: $(LIB_OBJS) +# libxenctrl + +libxenctrl.a: $(LIB_OBJS) $(AR) rc $@ $^ -libxc-pic.a: $(PIC_OBJS) +libxenctrl.so: libxenctrl.so.$(MAJOR) + ln -sf $< $@ +libxenctrl.so.$(MAJOR): libxenctrl.so.$(MAJOR).$(MINOR) + ln -sf $< $@ + +libxenctrl.so.$(MAJOR).$(MINOR): $(PIC_OBJS) + $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-soname -Wl,libxenctrl.so.$(MAJOR) -shared -o $@ $^ + +# libxenguest + +libxenguest.a: $(LIB_BUILD_OBJS) $(AR) rc $@ $^ -libxc.so: libxc.so.$(MAJOR) +libxenguest.so: libxenguest.so.$(MAJOR) ln -sf $< $@ -libxc.so.$(MAJOR): libxc.so.$(MAJOR).$(MINOR) +libxenguest.so.$(MAJOR): libxenguest.so.$(MAJOR).$(MINOR) ln -sf $< $@ -libxc.so.$(MAJOR).$(MINOR): $(PIC_OBJS) - $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-soname -Wl,libxc.so.$(MAJOR) -shared -o $@ $^ -lz +libxenguest.so.$(MAJOR).$(MINOR): $(PIC_BUILD_OBJS) + $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-soname -Wl,libxenguest.so.$(MAJOR) -shared -o $@ $^ -lz -lxenctrl -include $(DEPS) diff -r 5f1ed597f107 -r 8799d14bef77 tools/libxc/linux_boot_params.h --- a/tools/libxc/linux_boot_params.h Wed Aug 24 02:43:18 2005 +++ b/tools/libxc/linux_boot_params.h Thu Aug 25 22:53:20 2005 @@ -17,6 +17,7 @@ #define E820_NVS 4 #define E820_IO 16 #define E820_SHARED 17 +#define E820_XENSTORE 18 u32 caching_attr; /* used by hypervisor */ #define MEMMAP_UC 0 diff -r 5f1ed597f107 -r 8799d14bef77 tools/libxc/xc_core.c --- a/tools/libxc/xc_core.c Wed Aug 24 02:43:18 2005 +++ b/tools/libxc/xc_core.c Thu Aug 25 22:53:20 2005 @@ -1,4 +1,4 @@ -#include "xc_private.h" +#include "xg_private.h" #define ELFSIZE 32 #include "xc_elf.h" #include <stdlib.h> @@ -43,7 +43,7 @@ goto error_out; } - if ((dump_mem_start = malloc(DUMP_INCREMENT*PAGE_SIZE)) == 0) { + if ((dump_mem_start = malloc(DUMP_INCREMENT*PAGE_SIZE)) == NULL) { PERROR("Could not allocate dump_mem"); goto error_out; } @@ -108,9 +108,8 @@ free(dump_mem_start); return 0; error_out: - if (dump_fd) + if (dump_fd != -1) close(dump_fd); - if (dump_mem_start) - free(dump_mem_start); + 
free(dump_mem_start); return -1; } diff -r 5f1ed597f107 -r 8799d14bef77 tools/libxc/xc_domain.c --- a/tools/libxc/xc_domain.c Wed Aug 24 02:43:18 2005 +++ b/tools/libxc/xc_domain.c Thu Aug 25 22:53:20 2005 @@ -266,7 +266,7 @@ int err; unsigned int npages = mem_kb / (PAGE_SIZE/1024); - err = do_dom_mem_op(xc_handle, MEMOP_increase_reservation, NULL, + err = xc_dom_mem_op(xc_handle, MEMOP_increase_reservation, NULL, npages, 0, domid); if (err == npages) return 0; diff -r 5f1ed597f107 -r 8799d14bef77 tools/libxc/xc_gnttab.c --- a/tools/libxc/xc_gnttab.c Wed Aug 24 02:43:18 2005 +++ b/tools/libxc/xc_gnttab.c Thu Aug 25 22:53:20 2005 @@ -40,17 +40,17 @@ int xc_gnttab_map_grant_ref(int xc_handle, - memory_t host_virt_addr, + u64 host_virt_addr, u32 dom, u16 ref, u16 flags, s16 *handle, - memory_t *dev_bus_addr) + u64 *dev_bus_addr) { struct gnttab_map_grant_ref op; int rc; - op.host_virt_addr = host_virt_addr; + op.host_addr = host_virt_addr; op.dom = (domid_t)dom; op.ref = ref; op.flags = flags; @@ -67,15 +67,15 @@ int xc_gnttab_unmap_grant_ref(int xc_handle, - memory_t host_virt_addr, - memory_t dev_bus_addr, + u64 host_virt_addr, + u64 dev_bus_addr, u16 handle, s16 *status) { struct gnttab_unmap_grant_ref op; int rc; - op.host_virt_addr = host_virt_addr; + op.host_addr = host_virt_addr; op.dev_bus_addr = dev_bus_addr; op.handle = handle; @@ -92,7 +92,7 @@ u32 dom, u16 nr_frames, s16 *status, - memory_t **frame_list) + unsigned long **frame_list) { struct gnttab_setup_table op; int rc, i; diff -r 5f1ed597f107 -r 8799d14bef77 tools/libxc/xc_linux_build.c --- a/tools/libxc/xc_linux_build.c Wed Aug 24 02:43:18 2005 +++ b/tools/libxc/xc_linux_build.c Thu Aug 25 22:53:20 2005 @@ -2,7 +2,8 @@ * xc_linux_build.c */ -#include "xc_private.h" +#include "xg_private.h" +#include <xenctrl.h> #if defined(__i386__) #define ELFSIZE 32 @@ -318,8 +319,7 @@ return 0; error_out: - if ( page_array != NULL ) - free(page_array); + free(page_array); return -1; } #else /* x86 */ @@ -341,7 
+341,7 @@ unsigned long count, i; start_info_t *start_info; shared_info_t *shared_info; - mmu_t *mmu = NULL; + xc_mmu_t *mmu = NULL; int rc; unsigned long nr_pt_pages; @@ -491,7 +491,7 @@ } } - if ( (mmu = init_mmu_updates(xc_handle, dom)) == NULL ) + if ( (mmu = xc_init_mmu_updates(xc_handle, dom)) == NULL ) goto error_out; /* setup page tables */ @@ -521,9 +521,9 @@ page_array[physmap_pfn++]); for ( count = 0; count < nr_pages; count++ ) { - if ( add_mmu_update(xc_handle, mmu, - (page_array[count] << PAGE_SHIFT) | - MMU_MACHPHYS_UPDATE, count) ) + if ( xc_add_mmu_update(xc_handle, mmu, + (page_array[count] << PAGE_SHIFT) | + MMU_MACHPHYS_UPDATE, count) ) { munmap(physmap, PAGE_SIZE); goto error_out; @@ -603,7 +603,7 @@ munmap(shared_info, PAGE_SIZE); /* Send the page update requests down to the hypervisor. */ - if ( finish_mmu_updates(xc_handle, mmu) ) + if ( xc_finish_mmu_updates(xc_handle, mmu) ) goto error_out; free(mmu); @@ -616,10 +616,8 @@ return 0; error_out: - if ( mmu != NULL ) - free(mmu); - if ( page_array != NULL ) - free(page_array); + free(mmu); + free(page_array); return -1; } #endif @@ -679,7 +677,7 @@ op.cmd = DOM0_GETDOMAININFO; op.u.getdomaininfo.domain = (domid_t)domid; - if ( (do_dom0_op(xc_handle, &op) < 0) || + if ( (xc_dom0_op(xc_handle, &op) < 0) || ((u16)op.u.getdomaininfo.domain != domid) ) { PERROR("Could not get info on domain"); @@ -719,8 +717,7 @@ close(initrd_fd); if ( initrd_gfd ) gzclose(initrd_gfd); - if ( image != NULL ) - free(image); + free(image); #ifdef __ia64__ /* based on new_thread in xen/arch/ia64/domain.c */ @@ -797,7 +794,7 @@ launch_op.u.setdomaininfo.ctxt = ctxt; launch_op.cmd = DOM0_SETDOMAININFO; - rc = do_dom0_op(xc_handle, &launch_op); + rc = xc_dom0_op(xc_handle, &launch_op); return rc; @@ -806,8 +803,7 @@ gzclose(initrd_gfd); else if ( initrd_fd >= 0 ) close(initrd_fd); - if ( image != NULL ) - free(image); + free(image); return -1; } diff -r 5f1ed597f107 -r 8799d14bef77 tools/libxc/xc_linux_restore.c --- 
a/tools/libxc/xc_linux_restore.c Wed Aug 24 02:43:18 2005 +++ b/tools/libxc/xc_linux_restore.c Thu Aug 25 22:53:20 2005 @@ -6,7 +6,12 @@ * Copyright (c) 2003, K A Fraser. */ -#include "xc_private.h" +#include <stdlib.h> +#include <unistd.h> + +#include "xg_private.h" +#include <xenctrl.h> + #include <xen/linux/suspend.h> #define MAX_BATCH_SIZE 1024 @@ -32,7 +37,7 @@ #define PPRINTF(_f, _a...) #endif -ssize_t +static ssize_t read_exact(int fd, void *buf, size_t count) { int r = 0, s; @@ -48,7 +53,8 @@ return r; } -int xc_linux_restore(int xc_handle, int io_fd, u32 dom, unsigned long nr_pfns) +int xc_linux_restore(int xc_handle, int io_fd, u32 dom, unsigned long nr_pfns, + unsigned int store_evtchn, unsigned long *store_mfn) { dom0_op_t op; int rc = 1, i, n, k; @@ -88,7 +94,7 @@ char *region_base; - mmu_t *mmu = NULL; + xc_mmu_t *mmu = NULL; /* used by debug verify code */ unsigned long buf[PAGE_SIZE/sizeof(unsigned long)]; @@ -131,7 +137,7 @@ /* Get the domain's shared-info frame. */ op.cmd = DOM0_GETDOMAININFO; op.u.getdomaininfo.domain = (domid_t)dom; - if (do_dom0_op(xc_handle, &op) < 0) { + if (xc_dom0_op(xc_handle, &op) < 0) { ERR("Could not get information on new domain"); goto out; } @@ -157,7 +163,7 @@ goto out; } - mmu = init_mmu_updates(xc_handle, dom); + mmu = xc_init_mmu_updates(xc_handle, dom); if (mmu == NULL) { ERR("Could not initialise for MMU updates"); goto out; @@ -354,8 +360,9 @@ } } - if ( add_mmu_update(xc_handle, mmu, - (mfn<<PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, pfn) ) + if ( xc_add_mmu_update(xc_handle, mmu, + (mfn<<PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, + pfn) ) { printf("machpys mfn=%ld pfn=%ld\n",mfn,pfn); goto out; @@ -369,7 +376,7 @@ DPRINTF("Received all pages\n"); - if ( finish_mmu_updates(xc_handle, mmu) ) + if ( xc_finish_mmu_updates(xc_handle, mmu) ) goto out; /* @@ -387,14 +394,14 @@ pin[nr_pins].mfn = pfn_to_mfn_table[i]; if ( ++nr_pins == MAX_PIN_BATCH ) { - if ( do_mmuext_op(xc_handle, pin, nr_pins, dom) < 0 ) + if ( 
xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0 ) goto out; nr_pins = 0; } } if ( (nr_pins != 0) && - (do_mmuext_op(xc_handle, pin, nr_pins, dom) < 0) ) + (xc_mmuext_op(xc_handle, pin, nr_pins, dom) < 0) ) goto out; DPRINTF("\b\b\b\b100%%\n"); @@ -434,7 +441,7 @@ if ( count > 0 ) { - if ( (rc = do_dom_mem_op( xc_handle, + if ( (rc = xc_dom_mem_op( xc_handle, MEMOP_decrease_reservation, pfntab, count, 0, dom )) <0 ) { @@ -464,10 +471,13 @@ } ctxt.user_regs.esi = mfn = pfn_to_mfn_table[pfn]; p_srec = xc_map_foreign_range( - xc_handle, dom, PAGE_SIZE, PROT_WRITE, mfn); + xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn); p_srec->resume_info.nr_pages = nr_pfns; p_srec->resume_info.shared_info = shared_info_frame << PAGE_SHIFT; p_srec->resume_info.flags = 0; + *store_mfn = p_srec->resume_info.store_mfn = + pfn_to_mfn_table[p_srec->resume_info.store_mfn]; + p_srec->resume_info.store_evtchn = store_evtchn; munmap(p_srec, PAGE_SIZE); /* Uncanonicalise each GDT frame number. */ @@ -582,7 +592,7 @@ op.u.setdomaininfo.domain = (domid_t)dom; op.u.setdomaininfo.vcpu = 0; op.u.setdomaininfo.ctxt = &ctxt; - rc = do_dom0_op(xc_handle, &op); + rc = xc_dom0_op(xc_handle, &op); if ( rc != 0 ) { @@ -593,7 +603,7 @@ DPRINTF("Domain ready to be unpaused\n"); op.cmd = DOM0_UNPAUSEDOMAIN; op.u.unpausedomain.domain = (domid_t)dom; - rc = do_dom0_op(xc_handle, &op); + rc = xc_dom0_op(xc_handle, &op); if (rc == 0) { /* Success: print the domain id. 
*/ DPRINTF("DOM=%u\n", dom); @@ -603,12 +613,9 @@ out: if ( (rc != 0) && (dom != 0) ) xc_domain_destroy(xc_handle, dom); - if ( mmu != NULL ) - free(mmu); - if ( pfn_to_mfn_table != NULL ) - free(pfn_to_mfn_table); - if ( pfn_type != NULL ) - free(pfn_type); + free(mmu); + free(pfn_to_mfn_table); + free(pfn_type); DPRINTF("Restore exit with rc=%d\n", rc); return rc; diff -r 5f1ed597f107 -r 8799d14bef77 tools/libxc/xc_linux_save.c --- a/tools/libxc/xc_linux_save.c Wed Aug 24 02:43:18 2005 +++ b/tools/libxc/xc_linux_save.c Thu Aug 25 22:53:20 2005 @@ -7,11 +7,15 @@ */ #include <inttypes.h> +#include <time.h> +#include <stdlib.h> +#include <unistd.h> #include <sys/time.h> -#include "xc_private.h" + +#include "xg_private.h" + #include <xen/linux/suspend.h> #include <xen/io/domain_controller.h> -#include <time.h> #define BATCH_SIZE 1024 /* 1024 pages (4MB) at a time */ @@ -20,7 +24,7 @@ #define DEBUG 0 #if 1 -#define ERR(_f, _a...) fprintf ( stderr, _f , ## _a ) +#define ERR(_f, _a...) do { fprintf(stderr, _f , ## _a); fflush(stderr); } while (0) #else #define ERR(_f, _a...) ((void)0) #endif @@ -136,7 +140,7 @@ return (new->tv_sec * 1000000) + new->tv_usec; } -static long long llgettimeofday() +static long long llgettimeofday( void ) { struct timeval now; gettimeofday(&now, NULL); @@ -312,9 +316,9 @@ } -int suspend_and_state(int xc_handle, int io_fd, int dom, - xc_dominfo_t *info, - vcpu_guest_context_t *ctxt) +static int suspend_and_state(int xc_handle, int io_fd, int dom, + xc_dominfo_t *info, + vcpu_guest_context_t *ctxt) { int i=0; char ans[30]; @@ -429,7 +433,7 @@ - that should be sent this iteration (unless later marked as skip); - to skip this iteration because already dirty; - to fixup by sending at the end if not already resent; */ - unsigned long *to_send, *to_skip, *to_fix; + unsigned long *to_send = NULL, *to_skip = NULL, *to_fix = NULL; xc_shadow_control_stats_t stats; @@ -643,6 +647,22 @@ goto out; } + /* Map the suspend-record MFN to pin it. 
The page must be owned by + dom for this to succeed. */ + p_srec = xc_map_foreign_range(xc_handle, dom, + sizeof(*p_srec), PROT_READ | PROT_WRITE, + ctxt.user_regs.esi); + if (!p_srec){ + ERR("Couldn't map suspend record"); + goto out; + } + + /* Canonicalize store mfn. */ + if ( !translate_mfn_to_pfn(&p_srec->resume_info.store_mfn) ) { + ERR("Store frame is not in range of pseudophys map"); + goto out; + } + print_stats( xc_handle, dom, 0, &stats, 0 ); /* Now write out each data page, canonicalising page tables as we go... */ @@ -756,7 +776,7 @@ goto out; } - if ( get_pfn_type_batch(xc_handle, dom, batch, pfn_type) ){ + if ( xc_get_pfn_type_batch(xc_handle, dom, batch, pfn_type) ){ ERR("get_pfn_type_batch failed"); goto out; } @@ -983,16 +1003,6 @@ } } - /* Map the suspend-record MFN to pin it. The page must be owned by - dom for this to succeed. */ - p_srec = xc_map_foreign_range(xc_handle, dom, - sizeof(*p_srec), PROT_READ, - ctxt.user_regs.esi); - if (!p_srec){ - ERR("Couldn't map suspend record"); - goto out; - } - if (nr_pfns != p_srec->nr_pfns ) { ERR("Suspend record nr_pfns unexpected (%ld != %ld)", @@ -1045,8 +1055,11 @@ if(live_mfn_to_pfn_table) munmap(live_mfn_to_pfn_table, PAGE_SIZE*1024); - if (pfn_type != NULL) - free(pfn_type); + free(pfn_type); + free(pfn_batch); + free(to_send); + free(to_fix); + free(to_skip); DPRINTF("Save exit rc=%d\n",rc); return !!rc; diff -r 5f1ed597f107 -r 8799d14bef77 tools/libxc/xc_load_aout9.c --- a/tools/libxc/xc_load_aout9.c Wed Aug 24 02:43:18 2005 +++ b/tools/libxc/xc_load_aout9.c Thu Aug 25 22:53:20 2005 @@ -1,5 +1,5 @@ -#include "xc_private.h" +#include "xg_private.h" #include "xc_aout9.h" #if defined(__i386__) diff -r 5f1ed597f107 -r 8799d14bef77 tools/libxc/xc_load_bin.c --- a/tools/libxc/xc_load_bin.c Wed Aug 24 02:43:18 2005 +++ b/tools/libxc/xc_load_bin.c Thu Aug 25 22:53:20 2005 @@ -66,7 +66,7 @@ * Free Software Foundation, Inc. 
*/ -#include "xc_private.h" +#include "xg_private.h" #include <stdlib.h> #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED) diff -r 5f1ed597f107 -r 8799d14bef77 tools/libxc/xc_load_elf.c --- a/tools/libxc/xc_load_elf.c Wed Aug 24 02:43:18 2005 +++ b/tools/libxc/xc_load_elf.c Thu Aug 25 22:53:20 2005 @@ -2,7 +2,7 @@ * xc_elf_load.c */ -#include "xc_private.h" +#include "xg_private.h" #if defined(__i386__) #define ELFSIZE 32 @@ -309,8 +309,7 @@ dsi->v_end = round_pgup(maxva); out: - if ( p != NULL ) - free(p); + free(p); return 0; } diff -r 5f1ed597f107 -r 8799d14bef77 tools/libxc/xc_private.c --- a/tools/libxc/xc_private.c Wed Aug 24 02:43:18 2005 +++ b/tools/libxc/xc_private.c Thu Aug 25 22:53:20 2005 @@ -64,8 +64,8 @@ /*******************/ /* NB: arr must be mlock'ed */ -int get_pfn_type_batch(int xc_handle, - u32 dom, int num, unsigned long *arr) +int xc_get_pfn_type_batch(int xc_handle, + u32 dom, int num, unsigned long *arr) { dom0_op_t op; op.cmd = DOM0_GETPAGEFRAMEINFO2; @@ -92,25 +92,40 @@ return op.u.getpageframeinfo.type; } - - -/*******************/ - -int pin_table( - int xc_handle, unsigned int type, unsigned long mfn, domid_t dom) -{ - struct mmuext_op op; - - op.cmd = type; - op.mfn = mfn; - - if ( do_mmuext_op(xc_handle, &op, 1, dom) < 0 ) - return 1; - - return 0; -} - -static int flush_mmu_updates(int xc_handle, mmu_t *mmu) +int xc_mmuext_op( + int xc_handle, + struct mmuext_op *op, + unsigned int nr_ops, + domid_t dom) +{ + privcmd_hypercall_t hypercall; + long ret = -EINVAL; + + hypercall.op = __HYPERVISOR_mmuext_op; + hypercall.arg[0] = (unsigned long)op; + hypercall.arg[1] = (unsigned long)nr_ops; + hypercall.arg[2] = (unsigned long)0; + hypercall.arg[3] = (unsigned long)dom; + + if ( mlock(op, nr_ops*sizeof(*op)) != 0 ) + { + PERROR("Could not lock memory for Xen hypercall"); + goto out1; + } + + if ( (ret = do_xen_hypercall(xc_handle, &hypercall)) < 0 ) + { + fprintf(stderr, "Dom_mem operation failed (rc=%ld errno=%d)-- need to" + " 
rebuild the user-space tool set?\n",ret,errno); + } + + safe_munlock(op, nr_ops*sizeof(*op)); + + out1: + return ret; +} + +static int flush_mmu_updates(int xc_handle, xc_mmu_t *mmu) { int err = 0; privcmd_hypercall_t hypercall; @@ -145,9 +160,9 @@ return err; } -mmu_t *init_mmu_updates(int xc_handle, domid_t dom) -{ - mmu_t *mmu = malloc(sizeof(mmu_t)); +xc_mmu_t *xc_init_mmu_updates(int xc_handle, domid_t dom) +{ + xc_mmu_t *mmu = malloc(sizeof(xc_mmu_t)); if ( mmu == NULL ) return mmu; mmu->idx = 0; @@ -155,8 +170,8 @@ return mmu; } -int add_mmu_update(int xc_handle, mmu_t *mmu, - unsigned long ptr, unsigned long val) +int xc_add_mmu_update(int xc_handle, xc_mmu_t *mmu, + unsigned long ptr, unsigned long val) { mmu->updates[mmu->idx].ptr = ptr; mmu->updates[mmu->idx].val = val; @@ -167,10 +182,47 @@ return 0; } -int finish_mmu_updates(int xc_handle, mmu_t *mmu) +int xc_finish_mmu_updates(int xc_handle, xc_mmu_t *mmu) { return flush_mmu_updates(xc_handle, mmu); } + +int xc_dom_mem_op(int xc_handle, + unsigned int memop, + unsigned int *extent_list, + unsigned int nr_extents, + unsigned int extent_order, + domid_t domid) +{ + privcmd_hypercall_t hypercall; + long ret = -EINVAL; + + hypercall.op = __HYPERVISOR_dom_mem_op; + hypercall.arg[0] = (unsigned long)memop; + hypercall.arg[1] = (unsigned long)extent_list; + hypercall.arg[2] = (unsigned long)nr_extents; + hypercall.arg[3] = (unsigned long)extent_order; + hypercall.arg[4] = (unsigned long)domid; + + if ( (extent_list != NULL) && + (mlock(extent_list, nr_extents*sizeof(unsigned long)) != 0) ) + { + PERROR("Could not lock memory for Xen hypercall"); + goto out1; + } + + if ( (ret = do_xen_hypercall(xc_handle, &hypercall)) < 0 ) + { + fprintf(stderr, "Dom_mem operation failed (rc=%ld errno=%d)-- need to" + " rebuild the user-space tool set?\n",ret,errno); + } + + if ( extent_list != NULL ) + safe_munlock(extent_list, nr_extents*sizeof(unsigned long)); + + out1: + return ret; +} long long xc_domain_get_cpu_usage( 
int xc_handle, domid_t domid, int vcpu ) @@ -189,19 +241,6 @@ return op.u.getvcpucontext.cpu_time; } - -/* This is shared between save and restore, and may generally be useful. */ -unsigned long csum_page (void * page) -{ - int i; - unsigned long *p = page; - unsigned long long sum=0; - - for ( i = 0; i < (PAGE_SIZE/sizeof(unsigned long)); i++ ) - sum += p[i]; - - return sum ^ (sum>>32); -} unsigned long xc_get_m2p_start_mfn ( int xc_handle ) { @@ -332,53 +371,6 @@ return sz; } -char *xc_read_kernel_image(const char *filename, unsigned long *size) -{ - int kernel_fd = -1; - gzFile kernel_gfd = NULL; - char *image = NULL; - unsigned int bytes; - - if ( (kernel_fd = open(filename, O_RDONLY)) < 0 ) - { - PERROR("Could not open kernel image"); - goto out; - } - - if ( (*size = xc_get_filesz(kernel_fd)) == 0 ) - { - PERROR("Could not read kernel image"); - goto out; - } - - if ( (kernel_gfd = gzdopen(kernel_fd, "rb")) == NULL ) - { - PERROR("Could not allocate decompression state for state file"); - goto out; - } - - if ( (image = malloc(*size)) == NULL ) - { - PERROR("Could not allocate memory for kernel image"); - goto out; - } - - if ( (bytes = gzread(kernel_gfd, image, *size)) != *size ) - { - PERROR("Error reading kernel image, could not" - " read the whole image (%d != %ld).", bytes, *size); - free(image); - image = NULL; - } - - out: - if ( kernel_gfd != NULL ) - gzclose(kernel_gfd); - else if ( kernel_fd >= 0 ) - close(kernel_fd); - return image; -} - void xc_map_memcpy(unsigned long dst, char *src, unsigned long size, int xch, u32 dom, unsigned long *parray, unsigned long vstart) diff -r 5f1ed597f107 -r 8799d14bef77 tools/libxc/xc_private.h --- a/tools/libxc/xc_private.h Wed Aug 24 02:43:18 2005 +++ b/tools/libxc/xc_private.h Thu Aug 25 22:53:20 2005 @@ -1,123 +1,25 @@ -#ifndef __XC_PRIVATE_H__ -#define __XC_PRIVATE_H__ +#ifndef XC_PRIVATE_H +#define XC_PRIVATE_H #include <unistd.h> #include <stdio.h> #include <errno.h> #include <fcntl.h> +#include <string.h> 
#include <sys/mman.h> #include <sys/types.h> #include <sys/stat.h> #include <stdlib.h> #include <sys/ioctl.h> -#include <errno.h> -#include <string.h> -#include "xc.h" +#include "xenctrl.h" #include <xen/linux/privcmd.h> -#define _PAGE_PRESENT 0x001 -#define _PAGE_RW 0x002 -#define _PAGE_USER 0x004 -#define _PAGE_PWT 0x008 -#define _PAGE_PCD 0x010 -#define _PAGE_ACCESSED 0x020 -#define _PAGE_DIRTY 0x040 -#define _PAGE_PAT 0x080 -#define _PAGE_PSE 0x080 -#define _PAGE_GLOBAL 0x100 - -#if defined(__i386__) -#define L1_PAGETABLE_SHIFT 12 -#define L2_PAGETABLE_SHIFT 22 -#define L1_PAGETABLE_SHIFT_PAE 12 -#define L2_PAGETABLE_SHIFT_PAE 21 -#define L3_PAGETABLE_SHIFT_PAE 30 -#elif defined(__x86_64__) -#define L1_PAGETABLE_SHIFT 12 -#define L2_PAGETABLE_SHIFT 21 -#define L3_PAGETABLE_SHIFT 30 -#define L4_PAGETABLE_SHIFT 39 -#endif - -#if defined(__i386__) -#define ENTRIES_PER_L1_PAGETABLE 1024 -#define ENTRIES_PER_L2_PAGETABLE 1024 -#define L1_PAGETABLE_ENTRIES_PAE 512 -#define L2_PAGETABLE_ENTRIES_PAE 512 -#define L3_PAGETABLE_ENTRIES_PAE 4 -#elif defined(__x86_64__) -#define L1_PAGETABLE_ENTRIES 512 -#define L2_PAGETABLE_ENTRIES 512 -#define L3_PAGETABLE_ENTRIES 512 -#define L4_PAGETABLE_ENTRIES 512 -#endif - #define PAGE_SHIFT XC_PAGE_SHIFT #define PAGE_SIZE (1UL << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) - -typedef u32 l1_pgentry_32_t; -typedef u32 l2_pgentry_32_t; -typedef u64 l1_pgentry_64_t; -typedef u64 l2_pgentry_64_t; -typedef u64 l3_pgentry_64_t; -typedef unsigned long l1_pgentry_t; -typedef unsigned long l2_pgentry_t; -#if defined(__x86_64__) -typedef unsigned long l3_pgentry_t; -typedef unsigned long l4_pgentry_t; -#endif - -#if defined(__i386__) -#define l1_table_offset(_a) \ - (((_a) >> L1_PAGETABLE_SHIFT) & (ENTRIES_PER_L1_PAGETABLE - 1)) -#define l2_table_offset(_a) \ - ((_a) >> L2_PAGETABLE_SHIFT) -#define l1_table_offset_pae(_a) \ - (((_a) >> L1_PAGETABLE_SHIFT_PAE) & (L1_PAGETABLE_ENTRIES_PAE - 1)) -#define l2_table_offset_pae(_a) \ - (((_a) >> 
L2_PAGETABLE_SHIFT_PAE) & (L2_PAGETABLE_ENTRIES_PAE - 1)) -#define l3_table_offset_pae(_a) \ - (((_a) >> L3_PAGETABLE_SHIFT_PAE) & (L3_PAGETABLE_ENTRIES_PAE - 1)) -#elif defined(__x86_64__) -#define l1_table_offset(_a) \ - (((_a) >> L1_PAGETABLE_SHIFT) & (L1_PAGETABLE_ENTRIES - 1)) -#define l2_table_offset(_a) \ - (((_a) >> L2_PAGETABLE_SHIFT) & (L2_PAGETABLE_ENTRIES - 1)) -#define l3_table_offset(_a) \ - (((_a) >> L3_PAGETABLE_SHIFT) & (L3_PAGETABLE_ENTRIES - 1)) -#define l4_table_offset(_a) \ - (((_a) >> L4_PAGETABLE_SHIFT) & (L4_PAGETABLE_ENTRIES - 1)) -#endif - -struct domain_setup_info -{ - unsigned long v_start; - unsigned long v_end; - unsigned long v_kernstart; - unsigned long v_kernend; - unsigned long v_kernentry; - - unsigned int load_symtab; - unsigned int pae_kernel; - unsigned long symtab_addr; - unsigned long symtab_len; -}; - -typedef int (*parseimagefunc)(char *image, unsigned long image_size, - struct domain_setup_info *dsi); -typedef int (*loadimagefunc)(char *image, unsigned long image_size, int xch, - u32 dom, unsigned long *parray, - struct domain_setup_info *dsi); - -struct load_funcs -{ - parseimagefunc parseimage; - loadimagefunc loadimage; -}; #define ERROR(_m, _a...) 
\ do { \ @@ -186,97 +88,6 @@ return ret; } -static inline int do_dom_mem_op(int xc_handle, - unsigned int memop, - unsigned int *extent_list, - unsigned int nr_extents, - unsigned int extent_order, - domid_t domid) -{ - privcmd_hypercall_t hypercall; - long ret = -EINVAL; - - hypercall.op = __HYPERVISOR_dom_mem_op; - hypercall.arg[0] = (unsigned long)memop; - hypercall.arg[1] = (unsigned long)extent_list; - hypercall.arg[2] = (unsigned long)nr_extents; - hypercall.arg[3] = (unsigned long)extent_order; - hypercall.arg[4] = (unsigned long)domid; - - if ( (extent_list != NULL) && - (mlock(extent_list, nr_extents*sizeof(unsigned long)) != 0) ) - { - PERROR("Could not lock memory for Xen hypercall"); - goto out1; - } - - if ( (ret = do_xen_hypercall(xc_handle, &hypercall)) < 0 ) - { - fprintf(stderr, "Dom_mem operation failed (rc=%ld errno=%d)-- need to" - " rebuild the user-space tool set?\n",ret,errno); - } - - if ( extent_list != NULL ) - safe_munlock(extent_list, nr_extents*sizeof(unsigned long)); - - out1: - return ret; -} - -static inline int do_mmuext_op( - int xc_handle, - struct mmuext_op *op, - unsigned int nr_ops, - domid_t dom) -{ - privcmd_hypercall_t hypercall; - long ret = -EINVAL; - - hypercall.op = __HYPERVISOR_mmuext_op; - hypercall.arg[0] = (unsigned long)op; - hypercall.arg[1] = (unsigned long)nr_ops; - hypercall.arg[2] = (unsigned long)0; - hypercall.arg[3] = (unsigned long)dom; - - if ( mlock(op, nr_ops*sizeof(*op)) != 0 ) - { - PERROR("Could not lock memory for Xen hypercall"); - goto out1; - } - - if ( (ret = do_xen_hypercall(xc_handle, &hypercall)) < 0 ) - { - fprintf(stderr, "Dom_mem operation failed (rc=%ld errno=%d)-- need to" - " rebuild the user-space tool set?\n",ret,errno); - } - - safe_munlock(op, nr_ops*sizeof(*op)); - - out1: - return ret; -} - - -/* - * PFN mapping. - */ -int get_pfn_type_batch(int xc_handle, u32 dom, int num, unsigned long *arr); -unsigned long csum_page (void * page); - -/* - * MMU updates. 
- */ -#define MAX_MMU_UPDATES 1024 -typedef struct { - mmu_update_t updates[MAX_MMU_UPDATES]; - int idx; - domid_t subject; -} mmu_t; -mmu_t *init_mmu_updates(int xc_handle, domid_t dom); -int add_mmu_update(int xc_handle, mmu_t *mmu, - unsigned long ptr, unsigned long val); -int finish_mmu_updates(int xc_handle, mmu_t *mmu); - /* * ioctl-based mfn mapping interface @@ -296,38 +107,4 @@ } privcmd_mmap_t; */ -#define mfn_mapper_queue_size 128 - -typedef struct mfn_mapper { - int xc_handle; - int size; - int prot; - int error; - int max_queue_size; - void * addr; - privcmd_mmap_t ioctl; - -} mfn_mapper_t; - -unsigned long xc_get_m2p_start_mfn (int xc_handle); - -int xc_copy_to_domain_page(int xc_handle, u32 domid, - unsigned long dst_pfn, void *src_page); - -unsigned long xc_get_filesz(int fd); - -char *xc_read_kernel_image(const char *filename, unsigned long *size); - -void xc_map_memcpy(unsigned long dst, char *src, unsigned long size, - int xch, u32 dom, unsigned long *parray, - unsigned long vstart); - -int pin_table(int xc_handle, unsigned int type, unsigned long mfn, - domid_t dom); - -/* image loading */ -int probe_elf(char *image, unsigned long image_size, struct load_funcs *funcs); -int probe_bin(char *image, unsigned long image_size, struct load_funcs *funcs); -int probe_aout9(char *image, unsigned long image_size, struct load_funcs *funcs); - #endif /* __XC_PRIVATE_H__ */ diff -r 5f1ed597f107 -r 8799d14bef77 tools/libxc/xc_ptrace.c --- a/tools/libxc/xc_ptrace.c Wed Aug 24 02:43:18 2005 +++ b/tools/libxc/xc_ptrace.c Thu Aug 25 22:53:20 2005 @@ -221,7 +221,7 @@ return (void *)(((unsigned long)page_virt[cpu]) | (va & BSD_PAGE_MASK)); error_out: - return 0; + return NULL; } int diff -r 5f1ed597f107 -r 8799d14bef77 tools/libxc/xc_vmx_build.c --- a/tools/libxc/xc_vmx_build.c Wed Aug 24 02:43:18 2005 +++ b/tools/libxc/xc_vmx_build.c Thu Aug 25 22:53:20 2005 @@ -3,7 +3,7 @@ */ #include <stddef.h> -#include "xc_private.h" +#include "xg_private.h" #define ELFSIZE 32 
#include "xc_elf.h" #include <stdlib.h> @@ -37,58 +37,70 @@ int nr_map = 0; /* XXX: Doesn't work for > 4GB yet */ - mem_mapp->map[0].addr = 0x0; - mem_mapp->map[0].size = 0x9F800; - mem_mapp->map[0].type = E820_RAM; - mem_mapp->map[0].caching_attr = MEMMAP_WB; + mem_mapp->map[nr_map].addr = 0x0; + mem_mapp->map[nr_map].size = 0x9F800; + mem_mapp->map[nr_map].type = E820_RAM; + mem_mapp->map[nr_map].caching_attr = MEMMAP_WB; nr_map++; - mem_mapp->map[1].addr = 0x9F800; - mem_mapp->map[1].size = 0x800; - mem_mapp->map[1].type = E820_RESERVED; - mem_mapp->map[1].caching_attr = MEMMAP_UC; + mem_mapp->map[nr_map].addr = 0x9F800; + mem_mapp->map[nr_map].size = 0x800; + mem_mapp->map[nr_map].type = E820_RESERVED; + mem_mapp->map[nr_map].caching_attr = MEMMAP_UC; nr_map++; - mem_mapp->map[2].addr = 0xA0000; - mem_mapp->map[2].size = 0x20000; - mem_mapp->map[2].type = E820_IO; - mem_mapp->map[2].caching_attr = MEMMAP_UC; + mem_mapp->map[nr_map].addr = 0xA0000; + mem_mapp->map[nr_map].size = 0x20000; + mem_mapp->map[nr_map].type = E820_IO; + mem_mapp->map[nr_map].caching_attr = MEMMAP_UC; nr_map++; - mem_mapp->map[3].addr = 0xF0000; - mem_mapp->map[3].size = 0x10000; - mem_mapp->map[3].type = E820_RESERVED; - mem_mapp->map[3].caching_attr = MEMMAP_UC; + mem_mapp->map[nr_map].addr = 0xF0000; + mem_mapp->map[nr_map].size = 0x10000; + mem_mapp->map[nr_map].type = E820_RESERVED; + mem_mapp->map[nr_map].caching_attr = MEMMAP_UC; nr_map++; - mem_mapp->map[4].addr = 0x100000; - mem_mapp->map[4].size = mem_size - 0x100000 - PAGE_SIZE; - mem_mapp->map[4].type = E820_RAM; - mem_mapp->map[4].caching_attr = MEMMAP_WB; +#define STATIC_PAGES 2 /* for ioreq_t and store_mfn */ + /* Most of the ram goes here */ + mem_mapp->map[nr_map].addr = 0x100000; + mem_mapp->map[nr_map].size = mem_size - 0x100000 - STATIC_PAGES*PAGE_SIZE; + mem_mapp->map[nr_map].type = E820_RAM; + mem_mapp->map[nr_map].caching_attr = MEMMAP_WB; nr_map++; - mem_mapp->map[5].addr = mem_size - PAGE_SIZE; - 
mem_mapp->map[5].size = PAGE_SIZE; - mem_mapp->map[5].type = E820_SHARED; - mem_mapp->map[5].caching_attr = MEMMAP_WB; + /* Statically allocated special pages */ + + /* Shared ioreq_t page */ + mem_mapp->map[nr_map].addr = mem_size - PAGE_SIZE; + mem_mapp->map[nr_map].size = PAGE_SIZE; + mem_mapp->map[nr_map].type = E820_SHARED; + mem_mapp->map[nr_map].caching_attr = MEMMAP_WB; nr_map++; - mem_mapp->map[6].addr = mem_size; - mem_mapp->map[6].size = 0x3 * PAGE_SIZE; - mem_mapp->map[6].type = E820_NVS; - mem_mapp->map[6].caching_attr = MEMMAP_UC; + /* For xenstore */ + mem_mapp->map[nr_map].addr = mem_size - 2*PAGE_SIZE; + mem_mapp->map[nr_map].size = PAGE_SIZE; + mem_mapp->map[nr_map].type = E820_XENSTORE; + mem_mapp->map[nr_map].caching_attr = MEMMAP_WB; nr_map++; - mem_mapp->map[7].addr = mem_size + 0x3 * PAGE_SIZE; - mem_mapp->map[7].size = 0xA * PAGE_SIZE; - mem_mapp->map[7].type = E820_ACPI; - mem_mapp->map[7].caching_attr = MEMMAP_WB; + mem_mapp->map[nr_map].addr = mem_size; + mem_mapp->map[nr_map].size = 0x3 * PAGE_SIZE; + mem_mapp->map[nr_map].type = E820_NVS; + mem_mapp->map[nr_map].caching_attr = MEMMAP_UC; nr_map++; - mem_mapp->map[8].addr = 0xFEC00000; - mem_mapp->map[8].size = 0x1400000; - mem_mapp->map[8].type = E820_IO; - mem_mapp->map[8].caching_attr = MEMMAP_UC; + mem_mapp->map[nr_map].addr = mem_size + 0x3 * PAGE_SIZE; + mem_mapp->map[nr_map].size = 0xA * PAGE_SIZE; + mem_mapp->map[nr_map].type = E820_ACPI; + mem_mapp->map[nr_map].caching_attr = MEMMAP_WB; + nr_map++; + + mem_mapp->map[nr_map].addr = 0xFEC00000; + mem_mapp->map[nr_map].size = 0x1400000; + mem_mapp->map[nr_map].type = E820_IO; + mem_mapp->map[nr_map].caching_attr = MEMMAP_UC; nr_map++; mem_mapp->nr_map = nr_map; @@ -212,7 +224,11 @@ unsigned long shared_info_frame, unsigned int control_evtchn, unsigned long flags, - struct mem_map * mem_mapp) + unsigned int vcpus, + unsigned int store_evtchn, + unsigned long *store_mfn, + struct mem_map *mem_mapp + ) { l1_pgentry_t *vl1tab=NULL, 
*vl1e=NULL; l2_pgentry_t *vl2tab=NULL, *vl2e=NULL; @@ -227,7 +243,7 @@ shared_info_t *shared_info; struct linux_boot_params * boot_paramsp; __u16 * boot_gdtp; - mmu_t *mmu = NULL; + xc_mmu_t *mmu = NULL; int rc; unsigned long nr_pt_pages; @@ -342,7 +358,7 @@ } } - if ( (mmu = init_mmu_updates(xc_handle, dom)) == NULL ) + if ( (mmu = xc_init_mmu_updates(xc_handle, dom)) == NULL ) goto error_out; #ifdef __i386__ @@ -443,9 +459,9 @@ /* Write the machine->phys table entries. */ for ( count = 0; count < nr_pages; count++ ) { - if ( add_mmu_update(xc_handle, mmu, - (page_array[count] << PAGE_SHIFT) | - MMU_MACHPHYS_UPDATE, count) ) + if ( xc_add_mmu_update(xc_handle, mmu, + (page_array[count] << PAGE_SHIFT) | + MMU_MACHPHYS_UPDATE, count) ) goto error_out; } @@ -510,7 +526,10 @@ boot_paramsp->drive_info.dummy[14] = 32; /* memsize is in megabytes */ + /* If you need to create a special e820map, comment this line + and use mem-map.sxp */ build_e820map(mem_mapp, memsize << 20); + *store_mfn = page_array[(v_end-2) >> PAGE_SHIFT]; #if defined (__i386__) if (zap_mmio_ranges(xc_handle, dom, l2tab, mem_mapp) == -1) #else @@ -568,7 +587,7 @@ #endif /* Send the page update requests down to the hypervisor. 
*/ - if ( finish_mmu_updates(xc_handle, mmu) ) + if ( xc_finish_mmu_updates(xc_handle, mmu) ) goto error_out; free(mmu); @@ -597,17 +616,15 @@ return 0; error_out: - if ( mmu != NULL ) - free(mmu); - if ( page_array != NULL ) - free(page_array); + free(mmu); + free(page_array); return -1; } #define VMX_FEATURE_FLAG 0x20 -int vmx_identify(void) +static int vmx_identify(void) { int eax, ecx; @@ -637,7 +654,10 @@ const char *ramdisk_name, const char *cmdline, unsigned int control_evtchn, - unsigned long flags) + unsigned long flags, + unsigned int vcpus, + unsigned int store_evtchn, + unsigned long *store_mfn) { dom0_op_t launch_op, op; int initrd_fd = -1; @@ -688,7 +708,7 @@ op.cmd = DOM0_GETDOMAININFO; op.u.getdomaininfo.domain = (domid_t)domid; - if ( (do_dom0_op(xc_handle, &op) < 0) || + if ( (xc_dom0_op(xc_handle, &op) < 0) || ((u16)op.u.getdomaininfo.domain != domid) ) { PERROR("Could not get info on domain"); @@ -712,7 +732,8 @@ initrd_gfd, initrd_size, nr_pages, ctxt, cmdline, op.u.getdomaininfo.shared_info_frame, - control_evtchn, flags, mem_mapp) < 0 ) + control_evtchn, flags, vcpus, store_evtchn, store_mfn, + mem_mapp) < 0 ) { ERROR("Error constructing guest OS"); goto error_out; @@ -722,8 +743,7 @@ close(initrd_fd); if ( initrd_gfd ) gzclose(initrd_gfd); - if ( image != NULL ) - free(image); + free(image); ctxt->flags = VGCF_VMX_GUEST; /* FPU is set up to default initial state. 
*/ @@ -769,7 +789,7 @@ launch_op.u.setdomaininfo.ctxt = ctxt; launch_op.cmd = DOM0_SETDOMAININFO; - rc = do_dom0_op(xc_handle, &launch_op); + rc = xc_dom0_op(xc_handle, &launch_op); return rc; @@ -778,8 +798,7 @@ gzclose(initrd_gfd); else if ( initrd_fd >= 0 ) close(initrd_fd); - if ( image != NULL ) - free(image); + free(image); return -1; } diff -r 5f1ed597f107 -r 8799d14bef77 tools/misc/Makefile --- a/tools/misc/Makefile Wed Aug 24 02:43:18 2005 +++ b/tools/misc/Makefile Thu Aug 25 22:53:20 2005 @@ -50,4 +50,4 @@ $(CC) -c $(CFLAGS) -o $@ $< $(TARGETS): %: %.o Makefile - $(CC) $(CFLAGS) -o $@ $< -L$(XEN_LIBXC) -lxc + $(CC) $(CFLAGS) -o $@ $< -L$(XEN_LIBXC) -lxenctrl diff -r 5f1ed597f107 -r 8799d14bef77 tools/misc/cpuperf/Makefile --- a/tools/misc/cpuperf/Makefile Wed Aug 24 02:43:18 2005 +++ b/tools/misc/cpuperf/Makefile Thu Aug 25 22:53:20 2005 @@ -37,7 +37,7 @@ $(CC) $(CFLAGS) -o $@ $< cpuperf-xen: cpuperf.c $(HDRS) Makefile - $(CC) $(CFLAGS) -I $(XEN_LIBXC) -L$(XEN_LIBXC) -lxc -DXENO -o $@ $< + $(CC) $(CFLAGS) -I $(XEN_LIBXC) -L$(XEN_LIBXC) -lxenctrl -DXENO -o $@ $< cpuperf-perfcntr: cpuperf.c $(HDRS) Makefile $(CC) $(CFLAGS) -DPERFCNTR -o $@ $< diff -r 5f1ed597f107 -r 8799d14bef77 tools/misc/cpuperf/cpuperf_xeno.h --- a/tools/misc/cpuperf/cpuperf_xeno.h Wed Aug 24 02:43:18 2005 +++ b/tools/misc/cpuperf/cpuperf_xeno.h Thu Aug 25 22:53:20 2005 @@ -9,7 +9,7 @@ * */ -#include <xc.h> +#include <xenctrl.h> static int xc_handle; diff -r 5f1ed597f107 -r 8799d14bef77 tools/misc/xc_shadow.c --- a/tools/misc/xc_shadow.c Wed Aug 24 02:43:18 2005 +++ b/tools/misc/xc_shadow.c Thu Aug 25 22:53:20 2005 @@ -11,7 +11,7 @@ */ -#include <xc.h> +#include <xenctrl.h> #include <stdio.h> #include <stdlib.h> #include <sys/mman.h> diff -r 5f1ed597f107 -r 8799d14bef77 tools/misc/xend --- a/tools/misc/xend Wed Aug 24 02:43:18 2005 +++ b/tools/misc/xend Thu Aug 25 22:53:20 2005 @@ -24,6 +24,7 @@ import socket import signal import time +import commands XCS_PATH = "/var/lib/xen/xcs_socket" 
XCS_EXEC = "/usr/sbin/xcs" @@ -114,6 +115,17 @@ xcs_pidfile.close() except: return + +def start_xenstored(): + XENSTORED_TRACE = os.getenv("XENSTORED_TRACE") + cmd = "/usr/sbin/xenstored --pid-file=/var/run/xenstore.pid" + if XENSTORED_TRACE: + cmd += " -T /var/log/xenstored-trace.log" + s,o = commands.getstatusoutput(cmd) + +def start_consoled(): + if os.fork() == 0: + os.execvp('/usr/sbin/xenconsoled', ['/usr/sbin/xenconsoled']) def main(): try: @@ -130,9 +142,13 @@ return status >> 8 elif sys.argv[1] == 'start': start_xcs() + start_xenstored() + start_consoled() return daemon.start() elif sys.argv[1] == 'trace_start': start_xcs() + start_xenstored() + start_consoled() return daemon.start(trace=1) elif sys.argv[1] == 'stop': stop_xcs() @@ -140,6 +156,8 @@ elif sys.argv[1] == 'restart': stop_xcs() start_xcs() + start_xenstored() + start_consoled() return daemon.stop() or daemon.start() elif sys.argv[1] == 'status': return daemon.status() diff -r 5f1ed597f107 -r 8799d14bef77 tools/misc/xenperf.c --- a/tools/misc/xenperf.c Wed Aug 24 02:43:18 2005 +++ b/tools/misc/xenperf.c Thu Aug 25 22:53:20 2005 @@ -11,7 +11,7 @@ */ -#include <xc.h> +#include <xenctrl.h> #include <stdio.h> #include <stdlib.h> #include <sys/mman.h> diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/setup.py --- a/tools/python/setup.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/setup.py Thu Aug 25 22:53:20 2005 @@ -17,7 +17,7 @@ XEN_ROOT + "/tools/xenstore", ] -libraries = [ "xc", "xenstore-pic" ] +libraries = [ "xenctrl", "xenguest", "xenstore" ] xc = Extension("xc", extra_compile_args = extra_compile_args, @@ -41,7 +41,7 @@ sources = [ "xen/lowlevel/xs/xs.c" ]) setup(name = 'xen', - version = '2.0', + version = '3.0', description = 'Xen', packages = ['xen', 'xen.lowlevel', diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/lowlevel/xc/xc.c --- a/tools/python/xen/lowlevel/xc/xc.c Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/lowlevel/xc/xc.c Thu Aug 25 22:53:20 2005 @@ -5,7 +5,8 @@ */ 
#include <Python.h> -#include <xc.h> +#include <xenctrl.h> +#include <xenguest.h> #include <zlib.h> #include <fcntl.h> #include <netinet/in.h> @@ -297,22 +298,23 @@ u32 dom; char *image, *ramdisk = NULL, *cmdline = ""; PyObject *memmap; - int control_evtchn, flags = 0; + int control_evtchn, store_evtchn; + int flags = 0, vcpus = 1; int numItems, i; int memsize; struct mem_map mem_map; - - static char *kwd_list[] = { "dom", "control_evtchn", - "memsize", - "image", "memmap", + unsigned long store_mfn = 0; + + static char *kwd_list[] = { "dom", "control_evtchn", "store_evtchn", + "memsize", "image", "memmap", "ramdisk", "cmdline", "flags", - NULL }; - - if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iiisO!|ssi", kwd_list, - &dom, &control_evtchn, + "vcpus", NULL }; + + if ( !PyArg_ParseTupleAndKeywords(args, kwds, "iiiisO!|ssii", kwd_list, + &dom, &control_evtchn, &store_evtchn, &memsize, &image, &PyList_Type, &memmap, - &ramdisk, &cmdline, &flags) ) + &ramdisk, &cmdline, &flags, &vcpus) ) return NULL; memset(&mem_map, 0, sizeof(mem_map)); @@ -321,7 +323,6 @@ /* get the number of lines passed to us */ numItems = PyList_Size(memmap) - 1; /* removing the line containing "memmap" */ - printf ("numItems: %d\n", numItems); mem_map.nr_map = numItems; /* should raise an error here. 
*/ @@ -365,11 +366,11 @@ } if ( xc_vmx_build(xc->xc_handle, dom, memsize, image, &mem_map, - ramdisk, cmdline, control_evtchn, flags) != 0 ) - return PyErr_SetFromErrno(xc_error); - - Py_INCREF(zero); - return zero; + ramdisk, cmdline, control_evtchn, flags, + vcpus, store_evtchn, &store_mfn) != 0 ) + return PyErr_SetFromErrno(xc_error); + + return Py_BuildValue("{s:i}", "store_mfn", store_mfn); } static PyObject *pyxc_bvtsched_global_set(PyObject *self, diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/lowlevel/xs/xs.c --- a/tools/python/xen/lowlevel/xs/xs.c Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/lowlevel/xs/xs.c Thu Aug 25 22:53:20 2005 @@ -1,6 +1,21 @@ /* * Python interface to the Xen Store Daemon. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General Public + * License as published by the Free Software Foundation. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * * Copyright (C) 2005 Mike Wray Hewlett-Packard + * */ #include <Python.h> @@ -253,12 +268,10 @@ } val = PyList_New(perms_n); for (i = 0; i < perms_n; i++, perms++) { - PyObject *p = Py_BuildValue("{s:i,s:i,s:i,s:i,s:i}", - "dom", perms->id, - "read", (perms->perms & XS_PERM_READ), - "write", (perms->perms & XS_PERM_WRITE), - "create", (perms->perms & XS_PERM_CREATE), - "owner", (perms->perms & XS_PERM_OWNER)); + PyObject *p = Py_BuildValue("{s:i,s:i,s:i}", + "dom", perms->id, + "read", (perms->perms & XS_PERM_READ), + "write", (perms->perms & XS_PERM_WRITE)); PyList_SetItem(val, i, p); } exit: @@ -281,8 +294,7 @@ static char *arg_spec = "sO"; char *path = NULL; PyObject *perms = NULL; - static char *perm_names[] = { "dom", "read", "write", "create", "owner", - NULL }; + static char *perm_names[] = { "dom", "read", "write", NULL }; static char *perm_spec = "i|iiii"; struct xs_handle *xh = xshandle(self); @@ -315,15 +327,9 @@ int dom = 0; /* Read/write perms. Set these. */ int p_read = 0, p_write = 0; - /* Create/owner perms. Ignore them. - * This is so the output from get_permissions() can be used - * as input to set_permissions(). 
- */ - int p_create = 0, p_owner = 0; PyObject *p = PyList_GetItem(perms, i); if (!PyArg_ParseTupleAndKeywords(tuple0, p, perm_spec, perm_names, - &dom, &p_read, &p_write, &p_create, - &p_owner)) + &dom, &p_read, &p_write)) goto exit; xsperms[i].id = dom; if (p_read) @@ -343,7 +349,6 @@ #define xspy_watch_doc "\n" \ "Watch a path, get notifications when it changes.\n" \ " path [string] : xenstore path.\n" \ - " priority [int] : watch priority (default 0).\n" \ " token [string] : returned in watch notification.\n" \ "\n" \ "Returns: [int] 0 on success.\n" \ @@ -352,10 +357,9 @@ static PyObject *xspy_watch(PyObject *self, PyObject *args, PyObject *kwds) { - static char *kwd_spec[] = { "path", "priority", "token", NULL }; + static char *kwd_spec[] = { "path", "token", NULL }; static char *arg_spec = "s|is"; char *path = NULL; - int priority = 0; char *token = ""; struct xs_handle *xh = xshandle(self); @@ -365,7 +369,7 @@ if (!xh) goto exit; if (!PyArg_ParseTupleAndKeywords(args, kwds, arg_spec, kwd_spec, - &path, &priority, &token)) + &path, &token)) goto exit; xsval = xs_watch(xh, path, token); val = pyvalue_int(xsval); diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/lowlevel/xu/xu.c --- a/tools/python/xen/lowlevel/xu/xu.c Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/lowlevel/xu/xu.c Thu Aug 25 22:53:20 2005 @@ -21,7 +21,7 @@ #include <unistd.h> #include <errno.h> #include <signal.h> -#include <xc.h> +#include <xenctrl.h> #include <xen/xen.h> #include <xen/io/domain_controller.h> @@ -655,7 +655,9 @@ case TYPE(CMSG_NETIF_FE, CMSG_NETIF_FE_INTERFACE_CONNECT): C2P(netif_fe_interface_connect_t, handle, Int, Long); C2P(netif_fe_interface_connect_t, tx_shmem_frame, Int, Long); + C2P(netif_fe_interface_connect_t, tx_shmem_ref, Int, Long); C2P(netif_fe_interface_connect_t, rx_shmem_frame, Int, Long); + C2P(netif_fe_interface_connect_t, rx_shmem_ref, Int, Long); return dict; case TYPE(CMSG_NETIF_FE, CMSG_NETIF_FE_INTERFACE_DISCONNECT): 
C2P(netif_fe_interface_disconnect_t, handle, Int, Long); @@ -681,7 +683,9 @@ C2P(netif_be_connect_t, domid, Int, Long); C2P(netif_be_connect_t, netif_handle, Int, Long); C2P(netif_be_connect_t, tx_shmem_frame, Int, Long); + C2P(netif_be_connect_t, tx_shmem_ref, Int, Long); C2P(netif_be_connect_t, rx_shmem_frame, Int, Long); + C2P(netif_be_connect_t, rx_shmem_ref, Int, Long); C2P(netif_be_connect_t, evtchn, Int, Long); C2P(netif_be_connect_t, status, Int, Long); return dict; @@ -840,7 +844,7 @@ case TYPE(CMSG_BLKIF_BE, CMSG_BLKIF_BE_CONNECT): P2C(blkif_be_connect_t, domid, u32); P2C(blkif_be_connect_t, blkif_handle, u32); - P2C(blkif_be_connect_t, shmem_frame, memory_t); + P2C(blkif_be_connect_t, shmem_frame, unsigned long); P2C(blkif_be_connect_t, shmem_ref, u32); P2C(blkif_be_connect_t, evtchn, u16); break; @@ -902,9 +906,11 @@ case TYPE(CMSG_NETIF_BE, CMSG_NETIF_BE_CONNECT): P2C(netif_be_connect_t, domid, u32); P2C(netif_be_connect_t, netif_handle, u32); - P2C(netif_be_connect_t, tx_shmem_frame, memory_t); - P2C(netif_be_connect_t, rx_shmem_frame, memory_t); - P2C(netif_be_connect_t, evtchn, u16); + P2C(netif_be_connect_t, tx_shmem_frame, unsigned long); + P2C(netif_be_connect_t, tx_shmem_ref, u32); + P2C(netif_be_connect_t, rx_shmem_frame, unsigned long); + P2C(netif_be_connect_t, rx_shmem_ref, u32); + P2C(netif_be_connect_t, evtchn, u16); break; case TYPE(CMSG_NETIF_BE, CMSG_NETIF_BE_DISCONNECT): P2C(netif_be_disconnect_t, domid, u32); @@ -936,7 +942,7 @@ P2C(usbif_fe_driver_status_changed_t, status, u32); break; case TYPE(CMSG_USBIF_FE, CMSG_USBIF_FE_INTERFACE_CONNECT): - P2C(usbif_fe_interface_connect_t, shmem_frame, memory_t); + P2C(usbif_fe_interface_connect_t, shmem_frame, unsigned long); break; case TYPE(CMSG_USBIF_FE, CMSG_USBIF_FE_INTERFACE_DISCONNECT): break; @@ -950,7 +956,7 @@ break; case TYPE(CMSG_USBIF_BE, CMSG_USBIF_BE_CONNECT): P2C(usbif_be_connect_t, domid, domid_t); - P2C(usbif_be_connect_t, shmem_frame, memory_t); + P2C(usbif_be_connect_t, 
shmem_frame, unsigned long); P2C(usbif_be_connect_t, evtchn, u32); P2C(usbif_be_connect_t, bandwidth, u32); P2C(usbif_be_connect_t, status, u32); diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/sv/CreateDomain.py --- a/tools/python/xen/sv/CreateDomain.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/sv/CreateDomain.py Thu Aug 25 22:53:20 2005 @@ -17,26 +17,56 @@ CreateFinish ] Wizard.__init__( self, urlWriter, "Create Domain", sheets ) - + + def op_finish( self, request ): + pass + class CreatePage0( Sheet ): + title = "General" + def __init__( self, urlWriter ): Sheet.__init__( self, urlWriter, "General", 0 ) self.addControl( InputControl( 'name', 'VM Name', 'VM Name:', "[\\w|\\S]+", "You must enter a name in this field" ) ) self.addControl( InputControl( 'memory', '64', 'Memory (Mb):', "[\\d]+", "You must enter a number in this field" ) ) self.addControl( InputControl( 'cpu', '0', 'CPU:', "[\\d]+", "You must enter a number in this feild" ) ) self.addControl( InputControl( 'cpu_weight', '1', 'CPU Weight:', "[\\d]+", "You must enter a number in this feild" ) ) + self.addControl( InputControl( 'vcpus', '1', 'Virtual CPUs:', '[\\d]+', "You must enter a number in this feild") ) class CreatePage1( Sheet ): + title = "Setup Kernel Image" + def __init__( self, urlWriter ): Sheet.__init__( self, urlWriter, "Setup Kernel Image", 1 ) -# For now we don't need to select a builder... 
-# self.addControl( ListControl( 'builder', [('linux', 'Linux'), ('netbsd', 'NetBSD')], 'Kernel Type:' ) ) - self.addControl( FileControl( 'kernel', '/boot/vmlinuz-2.6.9-xenU', 'Kernel Image:' ) ) + self.addControl( ListControl( 'builder', [('linux', 'Linux'), ('netbsd', 'NetBSD')], 'Domain Builder:' ) ) + self.addControl( FileControl( 'kernel', '/boot/vmlinuz-2.6.12-xenU', 'Kernel Image:' ) ) self.addControl( InputControl( 'extra', '', 'Kernel Command Line Parameters:' ) ) + self.addControl( ListControl( 'use-initrd', [('yes', 'Yes'), ('no', 'No')], 'Use an Initial Ram Disk?:' ) ) + self.addControl( FileControl( 'initrd', '/boot/initrd-2.6.12-xenU.img', 'Initial Ram Disk:' ) ) + + def validate( self, request ): + if not self.passback: self.parseForm( request ) + check = True + request.write( previous_values.get( '>>>>>use-initrd' ) ) + previous_values = ssxp2hash( string2sxp( self.passback ) ) #get the map for quick reference + if DEBUG: print previous_values + for (feild, control) in self.feilds: + if feild == 'initrd' and previous_values.get( 'use-initrd' ) != 'no': + request.write( previous_values.get( '>>>>>use-initrd' ) ) + if control.validate( previous_values.get( feild ) ): + check = False + elif not control.validate( previous_values.get( feild ) ): + check = False + + if DEBUG: print "> %s = %s" % (feild, previous_values.get( feild )) + + return check + class CreatePage2( Sheet ): + + title = "Choose number of VBDS" def __init__( self, urlWriter ): Sheet.__init__( self, urlWriter, "Setup Virtual Block Device", 2 ) @@ -44,10 +74,12 @@ class CreatePage3( Sheet ): + title = "Setup VBDS" + def __init__( self, urlWriter ): Sheet.__init__( self, urlWriter, "Setup Virtual Block Device", 3 ) - def write_BODY( self, request, err ): + def write_BODY( self, request ): if not self.passback: self.parseForm( request ) previous_values = sxp2hash( string2sxp( self.passback ) ) #get the hash for quick reference @@ -61,9 +93,11 @@ self.addControl( InputControl( 'root', 
'/dev/sda1', 'Root device (in VM):' ) ) - Sheet.write_BODY( self, request, err ) + Sheet.write_BODY( self, request ) class CreatePage4( Sheet ): + + title = "Network Setting" def __init__( self, urlWriter ): Sheet.__init__( self, urlWriter, "Network settings", 4 ) @@ -76,26 +110,27 @@ class CreateFinish( Sheet ): + title = "Finish" + def __init__( self, urlWriter ): Sheet.__init__( self, urlWriter, "All Done", 5 ) - def write_BODY( self, request, err ): + def write_BODY( self, request ): if not self.passback: self.parseForm( request ) xend_sxp = self.translate_sxp( string2sxp( self.passback ) ) + + request.write( "<pre>%s</pre>" % sxp2prettystring( xend_sxp ) ) try: - dom_sxp = server.xend_domain_create( xend_sxp ) - success = "Your domain was successfully created.\n" - except: - success = "There was an error creating your domain.\nThe configuration used is as follows:\n" - dom_sxp = xend_sxp - - - - pt = PreTab( success + sxp2prettystring( dom_sxp ) ) - pt.write_BODY( request ) + server.xend_domain_create( xend_sxp ) + request.write( "<p>You domain had been successfully created.</p>" ) + except Exception, e: + request.write( "<p>There was an error creating your domain.<br/>The configuration used is as follows:\n</p>" ) + request.write( "<pre>%s</pre>" % sxp2prettystring( xend_sxp ) ) + request.write( "<p>The error was:</p>" ) + request.write( "<pre>%s</pre>" % str( e ) ) request.write( "<input type='hidden' name='passback' value=\"%s\"></p>" % self.passback ) request.write( "<input type='hidden' name='sheet' value='%s'></p>" % self.location ) @@ -117,6 +152,7 @@ vals.maxmem = get( 'maxmem' ) vals.cpu = get( 'cpu' ) vals.cpu_weight = get( 'cpu_weight' ) + vals.vcpus = get( 'vcpus' ) vals.builder = get( 'builder' ) vals.kernel = get( 'kernel' ) @@ -128,7 +164,7 @@ vbds = [] for i in range( int( get( 'num_vbds' ) ) ): - vbds.append( ( get( 'vbd%s_dom0' % i ), get('vbd%s_domU' % i ), get( 'vbd%s_mode' % i ) ) ) + vbds.append( ( get( 'vbd%s_dom0' % i ), 
get('vbd%s_domU' % i ), get( 'vbd%s_mode' % i ), None ) ) vals.disk = vbds @@ -141,6 +177,9 @@ vals.restart = None vals.console = None vals.ramdisk = None + vals.ssidref = -1 + vals.bootloader = None + vals.usb = [] #setup vifs @@ -155,9 +194,11 @@ dhcp = get( 'dhcp' ) vals.cmdline_ip = "%s:%s:%s:%s:%s:eth0:%s" % (ip, nfs, gate, mask, host, dhcp) + + opts = None try: - return make_config( vals ) - except: - return [["Error creating domain config."]] - + return make_config( opts, vals ) + except Exception, e: + return [["There was an error creating the domain config SXP. This is typically due to an interface change in xm/create.py:make_config", e]] + diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/sv/DomInfo.py --- a/tools/python/xen/sv/DomInfo.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/sv/DomInfo.py Thu Aug 25 22:53:20 2005 @@ -4,6 +4,7 @@ from xen.sv.HTMLBase import HTMLBase from xen.sv.util import * from xen.sv.GenTabbed import * +from xen.sv.Wizard import * DEBUG=1 @@ -12,33 +13,69 @@ def __init__( self, urlWriter ): self.dom = 0; - - def tabUrlWriter( tab ): - return urlWriter( "&dom=%s%s" % ( self.dom, tab ) ) - - GenTabbed.__init__( self, "Domain Info", tabUrlWriter, [ 'General', 'SXP', 'Devices' ], [ DomGeneralTab, DomSXPTab, NullTab ] ) + + GenTabbed.__init__( self, "Domain Info", urlWriter, [ 'General', 'SXP', 'Devices', 'Migrate', 'Save' ], [ DomGeneralTab, DomSXPTab, DomDeviceTab, DomMigrateTab, DomSaveTab ] ) def write_BODY( self, request ): - dom = request.args.get('dom') - - if dom is None or len(dom) != 1: + try: + dom = int( getVar( 'dom', request ) ) + except: request.write( "<p>Please Select a Domain</p>" ) return None - else: - self.dom = dom[0] - + GenTabbed.write_BODY( self, request ) def write_MENU( self, request ): - pass - + domains = [] + + try: + domains = server.xend_domains() + domains.sort() + except: + pass + + request.write( "\n<table style='border:0px solid white' cellspacing='0' cellpadding='0' border='0' 
width='100%'>\n" ) + request.write( "<tr class='domainInfoHead'>" ) + request.write( "<td class='domainInfoHead' align='center'>Domain</td>\n" ) + request.write( "<td class='domainInfoHead' align='center'>Name</td>\n" ) + request.write( "<td class='domainInfoHead' align='center'>State</td>\n" ) + request.write( "<td class='domainInfoHead' align='center'></td>\n" ) + request.write( "</tr>" ) + + odd = True + if not domains is None: + for domain in domains: + odd = not odd; + if odd: + request.write( "<tr class='domainInfoOdd'>\n" ) + else: + request.write( "<tr class='domainInfoEven'>\n" ) + domInfo = getDomInfo( domain ) + request.write( "<td class='domainInfo' align='center'>%(id)s</td>\n" % domInfo ) + url = self.urlWriter( "&dom=%(id)s" % domInfo ) + request.write( "<td class='domainInfo' align='center'><a href='%s'>%s</a></td>\n" % ( url, domInfo['name'] ) ) + request.write( "<td class='domainInfo' align='center'>%(state)5s</td>\n" % domInfo ) + if domInfo[ 'id' ] != "0": + request.write( "<td class='domainInfo' align='center'>" ) + if domInfo[ 'state' ][ 2 ] == "-": + request.write( "<img src='images/small-pause.png' onclick='doOp2( \"pause\", \"%(dom)-4s\" )'>" % domInfo ) + else: + request.write( "<img src='images/small-unpause.png' onclick='doOp2( \"unpause\", \"%(dom)-4s\" )'>" % domInfo ) + request.write( "<img src='images/small-destroy.png' onclick='doOp2( \"destroy\", \"%(dom)-4s\" )'></td>" % domInfo ) + else: + request.write( "<td> </td>" ) + request.write( "</tr>\n" ) + else: + request.write( "<tr colspan='10'><p class='small'>Error getting domain list<br/>Perhaps XenD not running?</p></tr>") + request.write( "</table>" ) + class DomGeneralTab( CompositeTab ): - def __init__( self ): - CompositeTab.__init__( self, [ DomGenTab, DomActionTab ] ) - + def __init__( self, urlWriter ): + CompositeTab.__init__( self, [ DomGenTab, DomActionTab ], urlWriter ) + class DomGenTab( GeneralTab ): - def __init__( self ): + def __init__( self, urlWriter ): titles = 
{} @@ -60,13 +97,13 @@ request.write( "<p>Please Select a Domain</p>" ) return None - self.dict = getDomInfoHash( self.dom ) + self.dict = getDomInfo( self.dom ) GeneralTab.write_BODY( self, request ) class DomSXPTab( PreTab ): - def __init__( self ): + def __init__( self, urlWriter ): self.dom = 0 PreTab.__init__( self, "" ) @@ -86,15 +123,15 @@ self.source = sxp2prettystring( domInfo ) PreTab.write_BODY( self, request ) - + class DomActionTab( ActionTab ): - def __init__( self ): - actions = { "shutdown" : "shutdown", - "reboot" : "reboot", - "pause" : "pause", - "unpause" : "unpause", - "destroy" : "destroy" } + def __init__( self, urlWriter ): + actions = { "shutdown" : "Shutdown", + "reboot" : "Reboot", + "pause" : "Pause", + "unpause" : "Unpause", + "destroy" : "Destroy" } ActionTab.__init__( self, actions ) def op_shutdown( self, request ): @@ -141,8 +178,91 @@ server.xend_domain_destroy( int( dom ), "halt" ) except: pass - - - - - + +class DomDeviceTab( CompositeTab ): + + def __init__( self, urlWriter ): + CompositeTab.__init__( self, [ DomDeviceListTab, DomDeviceOptionsTab, DomDeviceActionTab ], urlWriter ) + +class DomDeviceListTab( NullTab ): + + title = "Device List" + + def __init__( self, urlWriter ): + pass + +class DomDeviceOptionsTab( NullTab ): + + title = "Device Options" + + def __init__( self, urlWriter ): + pass + +class DomDeviceActionTab( ActionTab ): + + def __init__( self, urlWriter ): + ActionTab.__init__( self, { "addvcpu" : "Add VCPU", "addvbd" : "Add VBD", "addvif" : "Add VIF" } ) + +class DomMigrateTab( CompositeTab ): + + def __init__( self, urlWriter ): + CompositeTab.__init__( self, [ DomMigrateExtraTab, DomMigrateActionTab ], urlWriter ) + +class DomMigrateExtraTab( Sheet ): + + def __init__( self, urlWriter ): + Sheet.__init__( self, urlWriter, "Configure Migration", 0) + self.addControl( TickControl('live', 'True', 'Live migrate:') ) + self.addControl( InputControl('rate', '0', 'Rate limit:') ) + self.addControl( InputControl( 
'dest', 'host.domain', 'Name or IP address:', ".*") ) + +class DomMigrateActionTab( ActionTab ): + + def __init__( self, urlWriter ): + actions = { "migrate" : "Migrate" } + ActionTab.__init__( self, actions ) + + def op_migrate( self, request ): + try: + domid = int( getVar( 'dom', request ) ) + live = getVar( 'live', request ) + rate = getVar( 'rate', request ) + dest = getVar( 'dest', request ) + dom_sxp = server.xend_domain_migrate( domid, dest, live == 'True', rate ) + success = "Your domain was successfully Migrated.\n" + except Exception, e: + success = "There was an error migrating your domain\n" + dom_sxp = str(e) + +class DomSaveTab( CompositeTab ): + + def __init__( self, urlWriter ): + CompositeTab.__init__( self, [ DomSaveExtraTab, DomSaveActionTab ], urlWriter ) + +class DomSaveExtraTab( Sheet ): + + title = "Save location" + + def __init__( self, urlWriter ): + Sheet.__init__( self, urlWriter, "Save Domain to file", 0 ) + self.addControl( InputControl( 'file', '', 'Suspend file name:', ".*") ) + +class DomSaveActionTab( ActionTab ): + + def __init__( self, urlWriter ): + actions = { "save" : "Save" } + ActionTab.__init__( self, actions ) + + def op_save( self, request ): + + try: + dom_sxp = server.xend_domain_save( config['domid'], config['file'] ) + success = "Your domain was successfully saved.\n" + except Exception, e: + success = "There was an error saving your domain\n" + dom_sxp = str(e) + + try: + dom = int( getVar( 'dom', request ) ) + except: + pass diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/sv/GenTabbed.py --- a/tools/python/xen/sv/GenTabbed.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/sv/GenTabbed.py Thu Aug 25 22:53:20 2005 @@ -1,7 +1,6 @@ import types from xen.sv.HTMLBase import HTMLBase -from xen.sv.TabView import TabView from xen.sv.util import getVar class GenTabbed( HTMLBase ): @@ -12,39 +11,44 @@ self.tabObjects = tabObjects self.urlWriter = urlWriter self.title = title + + def write_BODY( self, request ): + if not 
self.__dict__.has_key( "tab" ): + try: + self.tab = int( getVar( 'tab', request, 0 ) ) + except: + self.tab = 0 + + request.write( "\n<div class='title'>%s</div>" % self.title ) + + TabView( self.tab, self.tabStrings, self.urlWriter ).write_BODY( request ) + + try: + request.write( "\n<div class='tab'>" ) + render_tab = self.tabObjects[ self.tab ] + render_tab( self.urlWriter ).write_BODY( request ) + request.write( "\n</div>" ) + except Exception, e: + request.write( "\n<p>Error Rendering Tab</p>" ) + request.write( "\n<p>%s</p>" % str( e ) ) - def write_BODY( self, request, urlWriter = None ): - try: - tab = int( getVar( 'tab', request, 0 ) ) - except: - tab = 0 - - request.write( "<table style='' width='100%' border='0' cellspacing='0' cellpadding='0'>" ) - request.write( "<tr><td>" ) - request.write( "<p align='center'><u>%s</u></p>" % self.title ) - - TabView( tab, self.tabStrings, self.urlWriter ).write_BODY( request ) - - request.write( "</td></tr><tr><td>" ) + request.write( "\n<input type=\"hidden\" name=\"tab\" value=\"%d\">" % self.tab ) + + def perform( self, request ): + request.write( "Tab> perform" ) + request.write( "<br/>op: " + str( getVar( 'op', request ) ) ) + request.write( "<br/>args: " + str( getVar( 'args', request ) ) ) + request.write( "<br/>tab: " + str( getVar( 'tab', request ) ) ) try: - render_tab = self.tabObjects[ tab ] - render_tab().write_BODY( request ) + action = getVar( 'op', request, 0 ) + if action == "tab": + self.tab = int( getVar( 'args', request ) ) + else: + this.tab = int( getVar( 'tab', request, 0 ) ) + self.tabObjects[ self.tab ]( self.urlWriter ).perform( request ) except: - request.write( "<p>Error Rendering Tab</p>" ) - - request.write( "</td></tr></table>" ) - - def perform( self, request ): - try: - tab = int( getVar( 'tab', request, 0 ) ) - except: - tab = 0; - - op_tab = self.tabObjects[ tab ] - - if op_tab: - op_tab().perform( request ) + pass class PreTab( HTMLBase ): @@ -53,12 +57,9 @@ self.source = source 
def write_BODY( self, request ): - - request.write( "<div style='display: block; overflow: auto; border: 0px solid black; width: 540px; padding: 5px; z-index:0; align: center'><pre>" ) - + request.write( "\n<pre>" ) request.write( self.source ) - - request.write( "</pre></div>" ) + request.write( "\n</pre>" ) class GeneralTab( HTMLBase ): @@ -69,7 +70,7 @@ def write_BODY( self, request ): - request.write( "<table width='100%' cellspacing='0' cellpadding='0' border='0'>" ) + request.write( "\n<table width='100%' cellspacing='0' cellpadding='0' border='0'>" ) def writeAttr( niceName, attr, formatter=None ): if type( attr ) is types.TupleType: @@ -80,7 +81,7 @@ temp = formatter( self.dict[ attr ] ) else: temp = str( self.dict[ attr ] ) - request.write( "<tr><td width='50%%'><p>%s:</p></td><td width='50%%'><p>%s</p></td></tr>" % ( niceName, temp ) ) + request.write( "\n<tr><td width='50%%'><p>%s:</p></td><td width='50%%'><p>%s</p></td></tr>" % ( niceName, temp ) ) for niceName, attr in self.titles.items(): writeAttr( niceName, attr ) @@ -89,16 +90,12 @@ class NullTab( HTMLBase ): - def __init__( self ): - HTMLBase.__init__( self ) - self.title = "Null Tab" - - def __init__( self, title ): + def __init__( self, title="Null Tab" ): HTMLBase.__init__( self ) self.title = title - + def write_BODY( self, request ): - request.write( "<p>%s</p>" % self.title ) + request.write( "\n<p>%s</p>" % self.title ) class ActionTab( HTMLBase ): @@ -107,29 +104,44 @@ HTMLBase.__init__( self ) def write_BODY( self, request ): - request.write( "<p align='center'><table cellspacing='3' cellpadding='2' border='0'><tr>" ) - - for ( command, text ) in self.actions.items(): - request.write( "<td style='border: 1px solid black; background-color: grey' onmouseover='buttonMouseOver( this )' onmouseout='buttonMouseOut( this )'>" ) - request.write( "<p><a href='javascript: doOp( \"%s\" );'>%s</a></p></td>" % (command, text) ) - - request.write("</table></p>") - + for item in self.actions.items(): + 
try: + ((op, attr), title) = item + except: + (op, title) = item + attr = "" + request.write( "\n<div class='button' onclick=\"doOp2( '%s', '%s' )\">%s</a></div>" % (op, attr, title) ) + class CompositeTab( HTMLBase ): - def __init__( self, tabs ): + def __init__( self, tabs, urlWriter ): HTMLBase.__init__( self ) self.tabs = tabs + self.urlWriter = urlWriter def write_BODY( self, request ): for tab in self.tabs: - request.write( "<br/>" ) - tab().write_BODY( request ) + tab( self.urlWriter ).write_BODY( request ) def perform( self, request ): for tab in self.tabs: - tab().perform( request ) - - - - + tab( self.urlWriter ).perform( request ) + +class TabView( HTMLBase ): + + # tab - int, id into tabs of selected tab + # tabs - list of strings, tab names + # urlWriter - + def __init__( self, tab, tabs, urlWriter ): + HTMLBase.__init__(self) + self.tab = tab + self.tabs = tabs + self.urlWriter = urlWriter + + def write_BODY( self, request ): + for i in range( len( self.tabs ) ): + if self.tab == i: + at = " id='activeTab'" + else: + at = "" + request.write( "\n<div%s class='tabButton' onclick=\"doOp2( 'tab', '%d' )\">%s</div>" % ( at, i, self.tabs[ i ] ) ) diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/sv/HTMLBase.py --- a/tools/python/xen/sv/HTMLBase.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/sv/HTMLBase.py Thu Aug 25 22:53:20 2005 @@ -12,26 +12,17 @@ return self.render_GET( request ) def render_GET( self, request ): - self.write_TOP( request ) - self.write_BODY( request ) - self.write_BOTTOM( request ) - return '' - + pass + def write_BODY( self, request ): - request.write( "BODY" ) + pass def write_TOP( self, request ): - request.write( '<html><head><title>Xen</title><link rel="stylesheet" type="text/css" href="inc/style.css" />' ) - request.write( '<script src="inc/script.js"></script>' ) - request.write( '</head><body>' ) - request.write('<form method="post" action="%s">' % request.uri) - + pass + def write_BOTTOM( self, request ): - 
request.write('<input type="hidden" name="op" value="">') - request.write('<input type="hidden" name="args" value="">') - request.write('</form>') - request.write( "</body></html>" ) - + pass + def get_op_method(self, op): """Get the method for an operation. For operation 'foo' looks for 'op_foo'. diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/sv/Main.py --- a/tools/python/xen/sv/Main.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/sv/Main.py Thu Aug 25 22:53:20 2005 @@ -1,113 +1,90 @@ + from xen.sv.HTMLBase import HTMLBase -from xen.sv.DomList import DomList from xen.sv.NodeInfo import NodeInfo from xen.sv.DomInfo import DomInfo from xen.sv.CreateDomain import CreateDomain -from xen.sv.MigrateDomain import MigrateDomain -from xen.sv.SaveDomain import SaveDomain from xen.sv.RestoreDomain import RestoreDomain - -from xen.xend.XendClient import server from xen.sv.util import getVar -class Main( HTMLBase ): - - isLeaf = True +# adapter to make this all work with mod_python +# as opposed to Twisted +# (c) Tom Wilkie 2005 - def __init__( self, urlWriter = None ): +class Args: + def __init__( self, req ): + from mod_python.util import FieldStorage + self.fieldStorage = FieldStorage( req, True ) + + # return a list of values for the given key, + # or None if key not there + def get( self, var ): + retVar = self.fieldStorage.getlist( var ) + if len( retVar ) == 0: + return None + else: + return retVar + + # return a list of tuples, + # (key, value) where value is a list of values + def items( self ): + result = []; + for key in self.fieldStorage.keys(): + result.append( (key, self.fieldStorage.getlist( key ) ) ) + return result + +class TwistedAdapter: + def __init__( self, req ): + self.args = Args( req ) + self.uri = req.unparsed_uri + self.url = req.uri + self.write = req.write + +# This is the Main class +# It peices together all the modules + +class Main: + def __init__( self ): self.modules = { "node": NodeInfo, - "list": DomList, - "info": DomInfo, 
"create": CreateDomain, - "migrate" : MigrateDomain, - "save" : SaveDomain, - "restore" : RestoreDomain } + "restore" : RestoreDomain, + "info": DomInfo } - # ordered list of module menus to display - self.module_menus = [ "node", "create", "migrate", "save", - "restore", "list" ] - HTMLBase.__init__(self) - - def render_POST( self, request ): - - #decide what module post'd the action - - args = getVar( 'args', request ) + self.init_done = False - mod = getVar( 'mod', request ) - - if not mod is None and args is None: - module = self.modules[ mod ] - #check module exists - if module: - module( self.mainUrlWriter ).perform( request ) - else: - self.perform( request ) - - return self.render_GET( request ) + def init_modules( self, request ): + for moduleName, module in self.modules.iteritems(): + self.modules[ moduleName ] = module( self.urlWriter( moduleName, request.url ) ) - def mainUrlWriter( self, module ): - def fun( f ): - return "Main.rpy?mod=%s%s" % ( module, f ) - return fun - - def write_BODY( self, request ): - - request.write( "\n<table style='border:0px solid black; background: url(images/orb_01.jpg) no-repeat' cellspacing='0' cellpadding='0' border='0' width='780px' height='536px'>\n" ) - request.write( "<tr>\n" ) - request.write( " <td width='15px'> </td>" ) - request.write( " <td width='175px' align='center' valign'center'>" ) - request.write( " <table cellspacing='0' cellpadding='0' border='0' width='100%' height='100%'>" ) - request.write( " <tr><td height='140px' align='center' valign='bottom'><a href='http://www.cl.cam.ac.uk/Research/SRG/netos/xen/'>" ) - request.write( " <img src='images/xen.png' width='150' height='75' border='0'/></a><br/></td></tr>" ) - request.write( " <tr><td height='60px' align='center'><p class='small'>SV Web Interface<br/>(C) <a href='mailto:tw275@xxxxxxxxx'>Tom Wilkie</a> 2004</p></td></tr>") - request.write( " <tr><td align='center' valign='top'>" ) + def render_menu( self, request ): + if not self.init_done: + 
self.init_modules( request ) + self.init_done = True + + for moduleName, module in self.modules.iteritems(): + module.write_MENU( request ) + request.write( "\n" ) - for modName in self.module_menus: - self.modules[modName]( self.mainUrlWriter( modName ) ).write_MENU( request ) - - request.write( " </td></tr>" ) - request.write( " </table>" ) - request.write( " " ) - request.write( " </td>\n" ) - request.write( " <td width='15px'> </td>" ) - request.write( " <td width='558px' align='left' valign='top'>" ) - request.write( " <table cellspacing='0' cellpadding='0' border='0' width='100%' height='100%'>" ) - request.write( " <tr><td height='20px'></td></tr>" ) - request.write( " <tr><td align='center' valign='top'>" ) - - modName = getVar('mod', request) - - if modName is None: + def render_main( self, request ): + if not self.init_done: + self.init_modules( request ) + self.init_done = True + + moduleName = getVar('mod', request) + if moduleName not in self.modules: request.write( '<p>Please select a module</p>' ) else: - module = self.modules[ modName ] - if module: - module( self.mainUrlWriter( modName ) ).write_BODY( request ) - else: - request.write( '<p>Invalid module. 
Please select another</p>' ) - - request.write( " </td></tr>" ) - request.write( " </table>" ) - request.write( " </td>\n" ) - request.write( " <td width='17px'> </td>" ) - request.write( "</tr>\n" ) + module = self.modules[ moduleName ] + module.write_BODY( request ) + + def do_POST( self, request ): + if not self.init_done: + self.init_modules( request ) + self.init_done = True - request.write( "</table>\n" ) - - - def op_destroy( self, request ): - dom = getVar( 'dom', request ) - if not dom is None and dom != "0": - server.xend_domain_destroy( int( dom ), "halt" ) - - def op_pause( self, request ): - dom = getVar( 'dom', request ) - if not dom is None and dom != "0": - server.xend_domain_pause( int( dom ) ) - - def op_unpause( self, request ): - dom = getVar( 'dom', request ) - if not dom is None and dom != "0": - server.xend_domain_unpause( int( dom ) ) + moduleName = getVar( 'mod', request ) + if moduleName in self.modules: + self.modules[ moduleName ].perform( request ) + + def urlWriter( self, module, url ): + return lambda x: "%s?mod=%s%s" % ( url, module, x ) diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/sv/NodeInfo.py --- a/tools/python/xen/sv/NodeInfo.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/sv/NodeInfo.py Thu Aug 25 22:53:20 2005 @@ -6,18 +6,18 @@ class NodeInfo( GenTabbed ): def __init__( self, urlWriter ): - GenTabbed.__init__( self, "Node Details", urlWriter, [ 'General', 'Dmesg', ], [ NodeGeneralTab, NodeDmesgTab ] ) + GenTabbed.__init__( self, "Node Details", urlWriter, [ 'General', 'Dmesg', 'SXP' ], [ NodeGeneralTab, NodeDmesgTab, NodeSXPTab ] ) def write_MENU( self, request ): request.write( "<p class='small'><a href='%s'>Node details</a></p>" % self.urlWriter( '' ) ) class NodeGeneralTab( CompositeTab ): - def __init__( self ): - CompositeTab.__init__( self, [ NodeInfoTab, NodeActionTab ] ) + def __init__( self, urlWriter ): + CompositeTab.__init__( self, [ NodeInfoTab, NodeActionTab ], urlWriter ) class NodeInfoTab( 
GeneralTab ): - def __init__( self ): + def __init__( self, urlWriter ): nodeInfo = {} try: @@ -41,7 +41,7 @@ class NodeDmesgTab( PreTab ): - def __init__( self ): + def __init__( self, urlWriter ): try: dmesg = server.xend_node_get_dmesg() except: @@ -50,7 +50,7 @@ class NodeActionTab( ActionTab ): - def __init__( self ): + def __init__( self, urlWriter ): ActionTab.__init__( self, { "shutdown" : "shutdown", "reboot" : "reboot" } ) @@ -61,3 +61,13 @@ def op_reboot( self, request ): if debug: print ">NodeReboot" server.xend_node_reboot() + +class NodeSXPTab( PreTab ): + + def __init__( self, urlWriter ): + try: + nodeSXP = sxp2string( server.xend_node() ) + except: + nodeSXP = 'Error getting node sxp' + + PreTab.__init__( self, nodeSXP ) diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/sv/RestoreDomain.py --- a/tools/python/xen/sv/RestoreDomain.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/sv/RestoreDomain.py Thu Aug 25 22:53:20 2005 @@ -16,6 +16,8 @@ class ChooseRestoreDomain( Sheet ): + title = "Configure Restore" + def __init__( self, urlWriter ): Sheet.__init__( self, urlWriter, "Configure Restore", 0) @@ -24,6 +26,8 @@ ".*") ) class DoRestore( Sheet ): + title = "Restore Done" + def __init__(self, urlWriter ): Sheet.__init__(self, urlWriter, "Restore Done", 1) diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/sv/Wizard.py --- a/tools/python/xen/sv/Wizard.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/sv/Wizard.py Thu Aug 25 22:53:20 2005 @@ -1,71 +1,44 @@ from xen.sv.util import * from xen.sv.HTMLBase import HTMLBase +from xen.sv.GenTabbed import GenTabbed, ActionTab from xen.xend import sxp import re DEBUG = 0 -class Wizard( HTMLBase ): +class Wizard( GenTabbed ): def __init__( self, urlWriter, title, sheets ): - HTMLBase.__init__( self ) self.title = title self.sheets = sheets self.urlWriter = urlWriter + self.offset = 0 + GenTabbed.__init__( self, title, urlWriter, map( lambda x: x.title, sheets ), sheets ) def write_MENU( self, request ): 
request.write( "<p class='small'><a href='%s'>%s</a></p>" % (self.urlWriter( '' ), self.title) ) def write_BODY( self, request ): - - request.write( "<table width='100%' border='0' cellspacing='0' cellpadding='0'><tr><td>" ) - request.write( "<p align='center'><u>%s</u></p></td></tr><tr><td>" % self.title ) - - currSheet = getVar( 'sheet', request ) - - if not currSheet is None: - currSheet = int( currSheet ) - else: - currSheet = 0 - - sheet = self.sheets[ currSheet ]( self.urlWriter ) - - err = not sheet.validate( request ) - - if not err: - op = getVar( 'op', request ) - - if op == 'next': - currSheet += 1 - elif op == 'prev': - currSheet -= 1 - - sheet = self.sheets[ currSheet ]( self.urlWriter ) - - if getVar( 'visited-sheet%s' % currSheet, request ): - sheet.write_BODY( request, err ) - else: - sheet.write_BODY( request, False ) - - - request.write( "</td></tr><tr><td><table width='100%' border='0' cellspacing='0' cellpadding='0'><tr>" ) - request.write( "<td width='80%'></td><td width='20%' align='center'><p align='center'>" ) - if currSheet > 0: - request.write( "<img src='images/previous.png' onclick='doOp( \"prev\" )' onmouseover='update( \"wizText\", \"Previous\" )' onmouseout='update( \"wizText\", \" \" )'> " ) - if currSheet < ( len( self.sheets ) - 2 ): - request.write( "<img src='images/next.png' onclick='doOp( \"next\" )' onmouseover='update( \"wizText\", \"Next\" )' onmouseout='update( \"wizText\", \" \" )'>" ) - elif currSheet == ( len( self.sheets ) - 2 ): - request.write( "<img src='images/finish.png' onclick='doOp( \"next\" )' onmouseover='update( \"wizText\", \"Finish\" )' onmouseout='update( \"wizText\", \" \" )'>" ) - request.write( "</p><p align='center'><span id='wizText'></span></p></td></tr></table>" ) - request.write( "</td></tr></table>" ) - - def op_next( self, request ): - pass - - def op_prev( self, request ): - pass - + GenTabbed.write_BODY( self, request ) + actionTab = ActionTab( { ("tab", str(self.tab-1)) : "< Prev", ("tab", 
str(self.tab+1)) : "Next >", "finish" : "Finish" } ) + actionTab.write_BODY( request ) + + def perform( self, request ): + try: + action = getVar( 'op', request, 0 ) + if action == "tab": + self.tab = int( getVar( 'args', request ) ) + oldtab = int( getVar( 'tab', request ) ) + if not self.tabObjects[ oldtab ]( self.urlWriter ).validate( request ): + self.tab = oldtab + else: + self.tab = int( getVar( 'tab', request, 0 ) ) + self.tabObjects[ self.tab ]( self.urlWriter ).perform( request ) + getattr( self, "op_" + getVar( "op", request ), None )( request ) + except: + pass + def op_finish( self, request ): pass @@ -80,7 +53,7 @@ self.passback = None def parseForm( self, request ): - do_not_parse = [ 'mod', 'op', 'sheet', 'passback' ] + do_not_parse = [ 'mod', 'op', 'passback' ] passed_back = request.args @@ -103,7 +76,7 @@ if DEBUG: print self.passback - def write_BODY( self, request, err ): + def write_BODY( self, request ): if not self.passback: self.parseForm( request ) @@ -115,14 +88,13 @@ for (feild, control) in self.feilds: control.write_Control( request, previous_values.get( feild ) ) - if err and not control.validate( previous_values.get( feild ) ): + if previous_values.get( feild ) is not None and not control.validate( previous_values.get( feild ) ): control.write_Help( request ) request.write( "</table>" ) request.write( "<input type='hidden' name='passback' value=\"%s\"></p>" % self.passback ) - request.write( "<input type='hidden' name='sheet' value='%s'></p>" % self.location ) - request.write( "<input type='hidden' name='visited-sheet%s' value='True'></p>" % self.location ) + #request.write( "<input type='hidden' name='visited-sheet%s' value='True'></p>" % self.location ) def addControl( self, control ): self.feilds.append( [ control.getName(), control ] ) @@ -133,7 +105,7 @@ check = True - previous_values = ssxp2hash( string2sxp( self.passback ) ) #get the hash for quick reference + previous_values = ssxp2hash( string2sxp( self.passback ) ) #get the 
map for quick reference if DEBUG: print previous_values for (feild, control) in self.feilds: @@ -258,12 +230,16 @@ def write_Control( self, request, persistedValue ): request.write( "<tr><td width='50%%'><p>%s</p></td><td width='50%%'>" % self.humanText ) + + #request.write( str( persistedValue ) ) + + #TODO: Theres a problem with this: it doesn't persist an untick, because the browsers don't pass it back. Need a fix... if persistedValue == 'True': request.write( "<input type='checkbox' name='%s' value='True' checked>" % self.getName() ) else: request.write( "<input type='checkbox' name='%s' value='True'>" % self.getName() ) - request.write( "</select></td></tr>" ) + request.write( "</td></tr>" ) diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/sv/util.py --- a/tools/python/xen/sv/util.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/sv/util.py Thu Aug 25 22:53:20 2005 @@ -4,7 +4,7 @@ import types -def getDomInfoHash( domain ): +def getDomInfo( domain ): domInfoHash = {} try: domInfoHash = sxp2hash( server.xend_domain( domain ) ) diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/web/SrvBase.py --- a/tools/python/xen/web/SrvBase.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/web/SrvBase.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,19 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ import types diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/web/SrvDir.py --- a/tools/python/xen/web/SrvDir.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/web/SrvDir.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,19 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ import types diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/web/__init__.py --- a/tools/python/xen/web/__init__.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/web/__init__.py Thu Aug 25 22:53:20 2005 @@ -1,1 +1,17 @@ +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/web/connection.py --- a/tools/python/xen/web/connection.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/web/connection.py Thu Aug 25 22:53:20 2005 @@ -1,7 +1,26 @@ +#============================================================================ +# This library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or +# (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public License +# along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ + import sys import threading import select import socket +import fcntl from errno import EAGAIN, EINTR, EWOULDBLOCK @@ -133,6 +152,9 @@ def createSocket(self): raise NotImplementedError() + + def setCloExec(self): + fcntl.fcntl(self.sock.fileno(), fcntl.F_SETFD, fcntl.FD_CLOEXEC) def acceptConnection(self, sock, protocol, addr): return SocketServerConnection(sock, protocol, addr, self) diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/web/httpserver.py --- a/tools/python/xen/web/httpserver.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/web/httpserver.py Thu Aug 25 22:53:20 2005 @@ -1,3 +1,19 @@ +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ import threading import string diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/web/protocol.py --- a/tools/python/xen/web/protocol.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/web/protocol.py Thu Aug 25 22:53:20 2005 @@ -1,3 +1,20 @@ +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ + class Factory: """Generic protocol factory. 
""" diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/web/reactor.py --- a/tools/python/xen/web/reactor.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/web/reactor.py Thu Aug 25 22:53:20 2005 @@ -1,2 +1,19 @@ +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ + from unix import listenUNIX, connectUNIX -from tcp import listenTCP, connectTCP +from tcp import listenTCP, connectTCP, SetCloExec diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/web/resource.py --- a/tools/python/xen/web/resource.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/web/resource.py Thu Aug 25 22:53:20 2005 @@ -1,3 +1,20 @@ +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ + import http def findResource(resource, request): diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/web/static.py --- a/tools/python/xen/web/static.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/web/static.py Thu Aug 25 22:53:20 2005 @@ -1,3 +1,19 @@ +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ import os from resource import Resource diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/web/tcp.py --- a/tools/python/xen/web/tcp.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/web/tcp.py Thu Aug 25 22:53:20 2005 @@ -1,3 +1,20 @@ +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ + import sys import socket import types @@ -68,6 +85,9 @@ l.startListening() return l +def SetCloExec(SocketListener): + SocketListener.SetCloExec() + def connectTCP(host, port, factory, timeout=None, bindAddress=None): c = TCPConnector(host, port, factory, timeout=timeout, bindAddress=bindAddress) c.connect() diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/web/unix.py --- a/tools/python/xen/web/unix.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/web/unix.py Thu Aug 25 22:53:20 2005 @@ -1,3 +1,20 @@ +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ + import sys import socket import os diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/Args.py --- a/tools/python/xen/xend/Args.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/Args.py Thu Aug 25 22:53:20 2005 @@ -1,3 +1,20 @@ +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ + import types import StringIO diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/EventServer.py --- a/tools/python/xen/xend/EventServer.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/EventServer.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,20 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ + """Simple publish/subscribe event server. 
""" diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/PrettyPrint.py --- a/tools/python/xen/xend/PrettyPrint.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/PrettyPrint.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,19 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ """General pretty-printer, including support for SXP. diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/Vifctl.py --- a/tools/python/xen/xend/Vifctl.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/Vifctl.py Thu Aug 25 22:53:20 2005 @@ -1,3 +1,20 @@ +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ + """Xend interface to networking control scripts. """ import os diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/XendCheckpoint.py --- a/tools/python/xen/xend/XendCheckpoint.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/XendCheckpoint.py Thu Aug 25 22:53:20 2005 @@ -6,6 +6,7 @@ import errno import os +import re import select import sxp from string import join @@ -64,6 +65,13 @@ if l.rstrip() == "suspend": log.info("suspending %d" % dominfo.id) xd.domain_shutdown(dominfo.id, reason='suspend') + if dominfo.store_channel: + try: + dominfo.db.releaseDomain(dominfo.id) + except Exception, ex: + log.warning("error in domain release on xenstore: %s", + ex) + pass dominfo.state_wait("suspended") log.info("suspend %d done" % dominfo.id) child.tochild.write("done\n") @@ -76,6 +84,11 @@ if child.wait() != 0: raise XendError("xc_save failed: %s" % lasterr) + if dominfo.store_channel: + dominfo.store_channel.close() + dominfo.db['store_channel'].delete() + dominfo.db.saveDB(save=True) + dominfo.store_channel = None xd.domain_destroy(dominfo.id) return None @@ -107,8 +120,13 @@ raise XendError( "not a valid guest state file: pfn count out of range") + if dominfo.store_channel: + evtchn = dominfo.store_channel.port2 + else: + evtchn = 0 + cmd = [PATH_XC_RESTORE, str(xc.handle()), str(fd), - str(dominfo.id), str(nr_pfns)] + str(dominfo.id), str(nr_pfns), str(evtchn)] log.info("[xc_restore] " + join(cmd)) child = xPopen3(cmd, True, -1, [fd, xc.handle()]) child.tochild.close() 
@@ -128,7 +146,21 @@ lasterr = l.rstrip() if fd == child.fromchild.fileno(): l = child.fromchild.readline() - log.info(l.rstrip()) + while l: + m = re.match(r"^(store-mfn) (\d+)\n$", l) + if m: + if dominfo.store_channel: + dominfo.store_mfn = int(m.group(2)) + if dominfo.store_mfn >= 0: + dominfo.db.introduceDomain(dominfo.id, + dominfo.store_mfn, + dominfo.store_channel) + dominfo.exportToDB(save=True, sync=True) + log.info(l.rstrip()) + try: + l = child.fromchild.readline() + except: + l = None if filter(lambda (fd, event): event & select.POLLHUP, r): break diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/XendClient.py --- a/tools/python/xen/xend/XendClient.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/XendClient.py Thu Aug 25 22:53:20 2005 @@ -1,13 +1,27 @@ #!/usr/bin/env python -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ + """Client API for the HTTP interface on xend. Callable as a script - see main(). Supports inet or unix connection to xend. This API is the 'control-plane' for xend. 
-The 'data-plane' is done separately. For example, consoles -are accessed via sockets on xend, but the list of consoles -is accessible via this API. +The 'data-plane' is done separately. """ import os import sys @@ -145,9 +159,6 @@ def domainurl(self, id=''): return self.url.relative('domain/' + str(id)) - - def consoleurl(self, id=''): - return self.url.relative('console/' + str(id)) def deviceurl(self, id=''): return self.url.relative('device/' + str(id)) @@ -213,11 +224,15 @@ return self.xendPost(self.domainurl(id), {'op' : 'pause' }) - def xend_domain_shutdown(self, id, reason, key=0): + def xend_domain_shutdown(self, id, reason): return self.xendPost(self.domainurl(id), {'op' : 'shutdown', - 'reason' : reason, - 'key' : key }) + 'reason' : reason}) + + def xend_domain_sysrq(self, id, key): + return self.xendPost(self.domainurl(id), + {'op' : 'sysrq', + 'key' : key}) def xend_domain_destroy(self, id, reason): return self.xendPost(self.domainurl(id), @@ -317,16 +332,6 @@ {'op' : 'device_configure', 'idx' : idx, 'config' : fileof(config) }) - - def xend_consoles(self): - return self.xendGet(self.consoleurl()) - - def xend_console(self, id): - return self.xendGet(self.consoleurl(id)) - - def xend_console_disconnect(self, id): - return self.xendPost(self.consoleurl(id), - {'op' : 'disconnect'}) def xend_vnets(self): return self.xendGet(self.vneturl()) diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/XendDB.py --- a/tools/python/xen/xend/XendDB.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/XendDB.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,19 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. 
+# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ import os import os.path diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/XendDmesg.py --- a/tools/python/xen/xend/XendDmesg.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/XendDmesg.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,19 @@ - # Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ """Get dmesg output for this node. 
""" diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/XendDomain.py --- a/tools/python/xen/xend/XendDomain.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/XendDomain.py Thu Aug 25 22:53:20 2005 @@ -1,5 +1,20 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> # Copyright (C) 2005 Christian Limpach <Christian.Limpach@xxxxxxxxxxxx> +#============================================================================ """Handler for domain operations. Nothing here is persistent (across reboots). @@ -305,8 +320,7 @@ @param vmconfig: vm configuration """ config = sxp.child_value(vmconfig, 'config') - uuid = sxp.child_value(vmconfig, 'uuid') - dominfo = XendDomainInfo.restore(self.dbmap, config, uuid=uuid) + dominfo = XendDomainInfo.restore(self.dbmap, config) return dominfo def domain_restore(self, src, progress=False): @@ -386,7 +400,7 @@ except Exception, ex: raise XendError(str(ex)) - def domain_shutdown(self, id, reason='poweroff', key=0): + def domain_shutdown(self, id, reason='poweroff'): """Shutdown domain (nicely). 
- poweroff: restart according to exit code and restart mode - reboot: restart on exit @@ -402,9 +416,16 @@ eserver.inject('xend.domain.shutdown', [dominfo.name, dominfo.id, reason]) if reason == 'halt': reason = 'poweroff' - val = dominfo.shutdown(reason, key=key) - if not reason in ['suspend', 'sysrq']: + val = dominfo.shutdown(reason) + if not reason in ['suspend']: self.domain_shutdowns() + return val + + def domain_sysrq(self, id, key): + """Send a SysRq to a domain + """ + dominfo = self.domain_lookup(id) + val = dominfo.send_sysrq(key) return val def domain_shutdowns(self): diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/XendDomainInfo.py --- a/tools/python/xen/xend/XendDomainInfo.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/XendDomainInfo.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,19 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ """Representation of a single domain. 
Includes support for domain construction, using @@ -8,7 +23,7 @@ """ -import string +import string, re import os import time import threading @@ -21,8 +36,10 @@ from xen.xend.server import SrvDaemon; xend = SrvDaemon.instance() from xen.xend.server import messages from xen.xend.server.channel import EventChannel, channelFactory +from xen.util.blkif import blkdev_name_to_number, expand_dev_name from xen.xend import sxp +from xen.xend import Blkctl from xen.xend.PrettyPrint import prettyprintstring from xen.xend.XendBootloader import bootloader from xen.xend.XendLogging import log @@ -52,15 +69,6 @@ DOMAIN_CRASH : "crash", } -"""Map shutdown reasons to the message type to use. -""" -shutdown_messages = { - 'poweroff' : 'shutdown_poweroff_t', - 'reboot' : 'shutdown_reboot_t', - 'suspend' : 'shutdown_suspend_t', - 'sysrq' : 'shutdown_sysrq_t', - } - RESTART_ALWAYS = 'always' RESTART_ONREBOOT = 'onreboot' RESTART_NEVER = 'never' @@ -132,7 +140,7 @@ if domlist and dom == domlist[0]['dom']: return domlist[0] return None - + class XendDomainInfo: """Virtual machine object.""" @@ -152,8 +160,6 @@ vm = cls(db) vm.construct(config) vm.saveToDB(sync=True) - # Flush info to xenstore immediately - vm.exportToDB() return vm @@ -191,19 +197,22 @@ recreate = classmethod(recreate) - def restore(cls, parentdb, config, uuid): + def restore(cls, parentdb, config, uuid=None): """Create a domain and a VM object to do a restore. 
@param parentdb: parent db @param config: domain configuration @param uuid: uuid to use """ + if not uuid: + uuid = getUuid() db = parentdb.addChild(uuid) vm = cls(db) ssidref = int(sxp.child_value(config, 'ssidref')) log.debug('restoring with ssidref='+str(ssidref)) id = xc.domain_create(ssidref = ssidref) vm.setdom(id) + vm.clear_shutdown() try: vm.restore = True vm.construct(config) @@ -227,6 +236,7 @@ DBVar('restart_time', ty='float'), DBVar('restart_count', ty='int'), DBVar('target', ty='long', path="memory/target"), + DBVar('device_model_pid', ty='int'), ] def __init__(self, db): @@ -255,6 +265,8 @@ self.info = None self.blkif_backend = False self.netif_backend = False + self.netif_idx = 0 + #todo: state: running, suspended self.state = STATE_VM_OK self.state_updated = threading.Condition() @@ -268,9 +280,10 @@ self.restart_time = None self.restart_count = 0 - self.console_port = None self.vcpus = 1 + self.vcpusdb = {} self.bootloader = None + self.device_model_pid = 0 def setDB(self, db): self.db = db @@ -344,9 +357,6 @@ s += " name=" + self.name s += " memory=" + str(self.memory) s += " ssidref=" + str(self.ssidref) - console = self.getConsole() - if console: - s += " console=" + str(console.console_port) s += ">" return s @@ -374,6 +384,71 @@ return ctrl def createDevice(self, type, devconfig, change=False): + if type == 'vbd': + typedev = sxp.child_value(devconfig, 'dev') + if re.match('^ioemu:', typedev): + return; + backdom = domain_exists(sxp.child_value(devconfig, 'backend', '0')) + + devnum = blkdev_name_to_number(sxp.child_value(devconfig, 'dev')) + + # create backend db + backdb = backdom.db.addChild("/backend/%s/%s/%d" % + (type, self.uuid, devnum)) + + # create frontend db + db = self.db.addChild("/device/%s/%d" % (type, devnum)) + + db['virtual-device'] = "%i" % devnum + #db['backend'] = sxp.child_value(devconfig, 'backend', '0') + db['backend'] = backdb.getPath() + db['backend-id'] = "%i" % backdom.id + + backdb['frontend'] = db.getPath() + 
(type, params) = string.split(sxp.child_value(devconfig, 'uname'), ':', 1) + node = Blkctl.block('bind', type, params) + backdb['frontend-id'] = "%i" % self.id + backdb['physical-device'] = "%li" % blkdev_name_to_number(node) + backdb.saveDB(save=True) + + # Ok, super gross, this really doesn't belong in the frontend db... + db['type'] = type + db['node'] = node + db['params'] = params + db.saveDB(save=True) + + return + + if type == 'vif': + backdom = domain_exists(sxp.child_value(devconfig, 'backend', '0')) + + log.error(devconfig) + + devnum = self.netif_idx + self.netif_idx += 1 + + # create backend db + backdb = backdom.db.addChild("/backend/%s/%s/%d" % + (type, self.uuid, devnum)) + + # create frontend db + db = self.db.addChild("/device/%s/%d" % (type, devnum)) + + backdb['frontend'] = db.getPath() + backdb['frontend-id'] = "%i" % self.id + backdb['handle'] = "%i" % devnum + backdb.saveDB(save=True) + + db['backend'] = backdb.getPath() + db['backend-id'] = "%i" % backdom.id + db['handle'] = "%i" % devnum + log.error(sxp.child_value(devconfig, 'mac')) + db['mac'] = sxp.child_value(devconfig, 'mac') + + db.saveDB(save=True) + + return + ctrl = self.findDeviceController(type) return ctrl.createDevice(devconfig, recreate=self.recreate, change=change) @@ -443,9 +518,6 @@ sxpr.append(self.store_channel.sxpr()) if self.store_mfn: sxpr.append(['store_mfn', self.store_mfn]) - console = self.getConsole() - if console: - sxpr.append(console.sxpr()) if self.restart_count: sxpr.append(['restart_count', self.restart_count]) @@ -459,6 +531,8 @@ sxpr.append(devs) if self.config: sxpr.append(['config', self.config]) + if self.device_model_pid: + sxpr.append(['device_model_pid',self.device_model_pid]) return sxpr def sxpr_devices(self): @@ -519,7 +593,6 @@ # Create domain devices. 
self.configure_backends() - self.configure_console() self.configure_restart() self.construct_image() self.configure() @@ -558,6 +631,16 @@ except: raise VmError('invalid vcpus value') + def exportVCPUSToDB(self, vcpus): + for v in range(0,vcpus): + path = "/cpu/%d"%(v) + if not self.vcpusdb.has_key(path): + self.vcpusdb[path] = self.db.addChild(path) + db = self.vcpusdb[path] + log.debug("writing key availability=online to path %s in store"%(path)) + db['availability'] = "online" + db.saveDB(save=True) + def init_image(self): """Create boot image handler for the domain. """ @@ -572,15 +655,17 @@ self.create_channel() self.image.createImage() self.exportToDB() - if self.store_channel: + if self.store_channel and self.store_mfn >= 0: self.db.introduceDomain(self.id, self.store_mfn, self.store_channel) + # get the configured value of vcpus and update store + self.exportVCPUSToDB(self.vcpus) def delete(self): """Delete the vm's db. """ - if self.dom_get(self.id): + if dom_get(self.id): return self.id = None self.saveToDB(sync=True) @@ -629,6 +714,7 @@ pass if self.image: try: + self.device_model_pid = 0 self.image.destroy() self.image = None except: @@ -654,6 +740,21 @@ for ctrl in self.getDeviceControllers(): if ctrl.isDestroyed(): continue ctrl.destroyController(reboot=reboot) + ddb = self.db.addChild("/device") + for type in ddb.keys(): + if type == 'vbd': + typedb = ddb.addChild(type) + for dev in typedb.keys(): + devdb = typedb.addChild(str(dev)) + Blkctl.block('unbind', devdb['type'].getData(), + devdb['node'].getData()) + typedb[dev].delete() + typedb.saveDB(save=True) + if type == 'vif': + typedb = ddb.addChild(type) + for dev in typedb.keys(): + typedb[dev].delete() + typedb.saveDB(save=True) def show(self): """Print virtual machine info. 
@@ -730,7 +831,8 @@ ctrl.initController(reboot=True) else: self.create_configured_devices() - self.image.createDeviceModel() + if not self.device_model_pid: + self.device_model_pid = self.image.createDeviceModel() def device_create(self, dev_config): """Create a new device. @@ -738,7 +840,7 @@ @param dev_config: device configuration """ dev_type = sxp.name(dev_config) - dev = self.createDevice(self, dev_config, change=True) + dev = self.createDevice(dev_type, dev_config, change=True) self.config.append(['device', dev.getConfig()]) return dev.sxpr() @@ -785,17 +887,6 @@ """ self.bootloader = sxp.child_value(self.config, "bootloader") - def configure_console(self): - """Configure the vm console port. - """ - x = sxp.child_value(self.config, 'console') - if x: - try: - port = int(x) - except: - raise VmError('invalid console:' + str(x)) - self.console_port = port - def configure_restart(self): """Configure the vm restart mode. """ @@ -855,7 +946,7 @@ def restart(self): """Restart the domain after it has exited. - Reuses the domain id and console port. + Reuses the domain id """ try: @@ -910,24 +1001,8 @@ """ self.configure_fields() - self.create_console() self.create_devices() self.create_blkif() - - def create_console(self): - console = self.getConsole() - if not console: - config = ['console'] - if self.console_port: - config.append(['console_port', self.console_port]) - console = self.createDevice('console', config) - return console - - def getConsole(self): - console_ctrl = self.getDeviceController("console", error=False) - if console_ctrl: - return console_ctrl.getDevice(0) - return None def create_blkif(self): """Create the block device interface (blkif) for the vm. @@ -935,6 +1010,7 @@ at creation time, for example when it uses NFS root. """ + return blkif = self.getDeviceController("vbd", error=False) if not blkif: blkif = self.createDeviceController("vbd") @@ -967,28 +1043,39 @@ def vcpu_hotplug(self, vcpu, state): """Disable or enable VCPU in domain. 
""" - log.error("Holly Shit! %d %d\n" % (vcpu, state)) - if self.channel: + db = "" + try: + db = self.vcpusdb['/cpu/%d'%(vcpu)] + except: + log.error("Invalid VCPU") + return + + if self.store_channel: if int(state) == 0: - msg = messages.packMsg('vcpu_hotplug_off_t', { 'vcpu' : vcpu} ) + db['availability'] = "offline" else: - msg = messages.packMsg('vcpu_hotplug_on_t', { 'vcpu' : vcpu} ) - - self.channel.writeRequest(msg) - - def shutdown(self, reason, key=0): - msgtype = shutdown_messages.get(reason) - if not msgtype: + db['availability'] = "online" + + db.saveDB(save=True) + + def shutdown(self, reason): + if not reason in shutdown_reasons.values(): raise XendError('invalid reason:' + reason) - extra = {} - if reason == 'sysrq': - extra['key'] = key - if self.channel: - msg = messages.packMsg(msgtype, extra) - self.channel.writeRequest(msg) - if not reason in ['suspend', 'sysrq']: - self.shutdown_pending = {'start':time.time(), 'reason':reason, - 'key':key} + db = self.db.addChild("/control"); + db['shutdown'] = reason; + db.saveDB(save=True); + if not reason in ['suspend']: + self.shutdown_pending = {'start':time.time(), 'reason':reason} + + def clear_shutdown(self): + db = self.db.addChild("/control") + db['shutdown'] = "" + db.saveDB(save=True) + + def send_sysrq(self, key=0): + db = self.db.addChild("/control"); + db['sysrq'] = '%c' % key; + db.saveDB(save=True); def shutdown_time_left(self, timeout): if not self.shutdown_pending: @@ -1003,6 +1090,8 @@ self.db.introduceDomain(self.id, self.store_mfn, self.store_channel) self.exportToDB(save=True, sync=True) + # get run-time value of vcpus and update store + self.exportVCPUSToDB(dom_get(self.id)['vcpus']) def vm_field_ignore(vm, config, val, index): """Dummy config field handler used for fields with built-in handling. 
@@ -1048,7 +1137,6 @@ add_config_handler('ssidref', vm_field_ignore) add_config_handler('cpu', vm_field_ignore) add_config_handler('cpu_weight', vm_field_ignore) -add_config_handler('console', vm_field_ignore) add_config_handler('restart', vm_field_ignore) add_config_handler('image', vm_field_ignore) add_config_handler('device', vm_field_ignore) @@ -1062,9 +1150,6 @@ #============================================================================ # Register device controllers and their device config types. -from server import console -controller.addDevControllerClass("console", console.ConsoleController) - from server import blkif controller.addDevControllerClass("vbd", blkif.BlkifController) add_device_handler("vbd", "vbd") diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/XendError.py --- a/tools/python/xen/xend/XendError.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/XendError.py Thu Aug 25 22:53:20 2005 @@ -1,3 +1,19 @@ +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ class XendError(ValueError): diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/XendLogging.py --- a/tools/python/xen/xend/XendLogging.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/XendLogging.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,19 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ import types import logging diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/XendNode.py --- a/tools/python/xen/xend/XendNode.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/XendNode.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,19 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ """Handler for node operations. 
Has some persistent state: diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/XendProtocol.py --- a/tools/python/xen/xend/XendProtocol.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/XendProtocol.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,19 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ import socket import httplib diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/XendRoot.py --- a/tools/python/xen/xend/XendRoot.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/XendRoot.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,19 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. 
+# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ """Xend root class. Creates the event server and handles configuration. @@ -69,12 +84,6 @@ """Default path the unix-domain server listens at.""" xend_unix_path_default = '/var/lib/xend/xend-socket' - - """Default interface address xend listens at for consoles.""" - console_address_default = 'localhost' - - """Default port xend serves consoles at. """ - console_port_base_default = '9600' dom0_min_mem_default = '0' @@ -302,24 +311,11 @@ """ return self.get_config_value("xend-unix-path", self.xend_unix_path_default) - def get_console_address(self): - """Get the address xend listens at for its console ports. - This defaults to 'localhost', allowing only the localhost to connect - to the console ports. Setting this to the empty string, allows all - hosts to connect. - """ - return self.get_config_value('console-address', self.console_address_default) - - def get_console_port_base(self): - """Get the base port number used to generate console ports for domains. 
- """ - return self.get_config_int('console-port-base', self.console_port_base_default) - def get_block_script(self, type): return self.get_config_value('block-%s' % type, '') def get_network_script(self): - return self.get_config_value('network-script', 'network') + return self.get_config_value('network-script', '') def get_enable_dump(self): return self.get_config_bool('enable-dump', 'no') diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/XendVnet.py --- a/tools/python/xen/xend/XendVnet.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/XendVnet.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,19 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ """Handler for vnet operations. 
""" diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/encode.py --- a/tools/python/xen/xend/encode.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/encode.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,20 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ + """Encoding for arguments to HTTP calls. Uses the url-encoding with MIME type 'application/x-www-form-urlencoded' if the data does not include files. Otherwise it uses the encoding with diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/image.py --- a/tools/python/xen/xend/image.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/image.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,22 @@ +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. 
+# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ + import os, string +import re import xen.lowlevel.xc; xc = xen.lowlevel.xc.new() from xen.xend import sxp @@ -245,7 +263,7 @@ memmap = None memmap_value = [] device_channel = None - + pid = 0 def createImage(self): """Create a VM for the VMX environment. """ @@ -257,14 +275,24 @@ # Create an event channel self.device_channel = channel.eventChannel(0, self.vm.getDomain()) log.info("VMX device model port: %d", self.device_channel.port2) - return xc.vmx_build(dom = self.vm.getDomain(), + if self.vm.store_channel: + store_evtchn = self.vm.store_channel.port2 + else: + store_evtchn = 0 + ret = xc.vmx_build(dom = self.vm.getDomain(), image = self.kernel, control_evtchn = self.device_channel.port2, + store_evtchn = store_evtchn, memsize = self.vm.memory, memmap = self.memmap_value, cmdline = self.cmdline, ramdisk = self.ramdisk, - flags = self.flags) + flags = self.flags, + vcpus = self.vm.vcpus) + if isinstance(ret, dict): + self.vm.store_mfn = ret.get('store_mfn') + return 0 + return ret def parseMemmap(self): self.memmap = sxp.child_value(self.vm.config, "memmap") @@ -278,7 +306,7 @@ # xm config file def parseDeviceModelArgs(self): dmargs = [ 'cdrom', 'boot', 'fda', 'fdb', - 'localtime', 'serial', 'macaddr', 'stdvga', 'isa' ] + 'localtime', 'serial', 'stdvga', 'isa' ] ret = [] for a in dmargs: v = 
sxp.child_value(self.vm.config, a) @@ -295,20 +323,32 @@ ret.append("-%s" % a) ret.append("%s" % v) - # Handle hd img related options + # Handle disk/network related options devices = sxp.children(self.vm.config, 'device') for device in devices: - vbdinfo = sxp.child(device, 'vbd') - if not vbdinfo: - raise VmError("vmx: missing vbd configuration") - uname = sxp.child_value(vbdinfo, 'uname') - vbddev = sxp.child_value(vbdinfo, 'dev') - (vbdtype, vbdparam) = string.split(uname, ':', 1) - vbddev_list = ['hda', 'hdb', 'hdc', 'hdd'] - if vbdtype != 'file' or vbddev not in vbddev_list: - raise VmError("vmx: for qemu vbd type=file&dev=hda~hdd") - ret.append("-%s" % vbddev) - ret.append("%s" % vbdparam) + name = sxp.name(sxp.child0(device)) + if name == 'vbd': + vbdinfo = sxp.child(device, 'vbd') + uname = sxp.child_value(vbdinfo, 'uname') + typedev = sxp.child_value(vbdinfo, 'dev') + (vbdtype, vbdparam) = string.split(uname, ':', 1) + if re.match('^ioemu:', typedev): + (emtype, vbddev) = string.split(typedev, ':', 1) + else: + emtype = 'vbd' + vbddev = typedev + if emtype != 'ioemu': + continue; + vbddev_list = ['hda', 'hdb', 'hdc', 'hdd'] + if vbddev not in vbddev_list: + raise VmError("vmx: for qemu vbd type=file&dev=hda~hdd") + ret.append("-%s" % vbddev) + ret.append("%s" % vbdparam) + if name == 'vif': + vifinfo = sxp.child(device, 'vif') + mac = sxp.child_value(vifinfo, 'mac') + ret.append("-macaddr") + ret.append("%s" % mac) # Handle graphics library related options vnc = sxp.child_value(self.vm.config, 'vnc') @@ -347,6 +387,7 @@ log.info("spawning device models: %s %s", device_model, args) self.pid = os.spawnve(os.P_NOWAIT, device_model, args, env) log.info("device model pid: %d", self.pid) + return self.pid def vncParams(self): # see if a vncviewer was specified @@ -366,11 +407,16 @@ def destroy(self): channel.eventChannelClose(self.device_channel) import signal + if not self.pid: + self.pid = self.vm.device_model_pid os.kill(self.pid, signal.SIGKILL) (pid, 
status) = os.waitpid(self.pid, 0) + self.pid = 0 def getDomainMemory(self, mem_mb): - return (mem_mb * 1024) + self.getPageTableSize(mem_mb) + # for ioreq_t and xenstore + static_pages = 2 + return (mem_mb * 1024) + self.getPageTableSize(mem_mb) + 4 * static_pages def getPageTableSize(self, mem_mb): """Return the size of memory needed for 1:1 page tables for physical diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/scheduler.py --- a/tools/python/xen/xend/scheduler.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/scheduler.py Thu Aug 25 22:53:20 2005 @@ -1,3 +1,20 @@ +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ + import threading def later(delay, fn, args=(), kwargs={}): diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/server/SrvDaemon.py --- a/tools/python/xen/xend/server/SrvDaemon.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/server/SrvDaemon.py Thu Aug 25 22:53:20 2005 @@ -126,12 +126,8 @@ def cleanup_xend(self, kill=False): return self.cleanup_process(XEND_PID_FILE, "xend", kill) - def cleanup_xenstored(self, kill=False): - return self.cleanup_process(XENSTORED_PID_FILE, "xenstored", kill) - def cleanup(self, kill=False): self.cleanup_xend(kill=kill) - #self.cleanup_xenstored(kill=kill) def status(self): """Returns the status of the xend daemon. @@ -167,31 +163,6 @@ pidfile.write(str(pid)) pidfile.close() return pid - - def start_xenstored(self): - """Fork and exec xenstored, writing its pid to XENSTORED_PID_FILE. 
- """ - def mkdirs(p): - try: - os.makedirs(p) - except: - pass - mkdirs(XENSTORED_RUN_DIR) - mkdirs(XENSTORED_LIB_DIR) - - pid = self.fork_pid(XENSTORED_PID_FILE) - if pid: - # Parent - log.info("Started xenstored, pid=%d", pid) - else: - # Child - if XEND_DAEMONIZE: - self.daemonize() - if XENSTORED_DEBUG: - os.execl("/usr/sbin/xenstored", "xenstored", "--no-fork", - "-T", "/var/log/xenstored-trace.log") - else: - os.execl("/usr/sbin/xenstored", "xenstored", "--no-fork") def daemonize(self): if not XEND_DAEMONIZE: return @@ -223,14 +194,10 @@ 4 Insufficient privileges """ xend_pid = self.cleanup_xend() - xenstored_pid = self.cleanup_xenstored() if self.set_user(): return 4 os.chdir("/") - - if xenstored_pid == 0: - self.start_xenstored() if xend_pid > 0: # Trying to run an already-running service is a success. diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/server/SrvDmesg.py --- a/tools/python/xen/xend/server/SrvDmesg.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/server/SrvDmesg.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,19 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ import os diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/server/SrvDomain.py --- a/tools/python/xen/xend/server/SrvDomain.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/server/SrvDomain.py Thu Aug 25 22:53:20 2005 @@ -1,10 +1,24 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ from xen.web import http from xen.xend import sxp from xen.xend import XendDomain -from xen.xend import XendConsole from xen.xend import PrettyPrint from xen.xend.Args import FormFn @@ -18,7 +32,6 @@ SrvDir.__init__(self) self.dom = dom self.xd = XendDomain.instance() - self.xconsole = XendConsole.instance() def op_configure(self, op, req): """Configure an existing domain. @@ -41,9 +54,17 @@ def op_shutdown(self, op, req): fn = FormFn(self.xd.domain_shutdown, [['dom', 'int'], - ['reason', 'str'], + ['reason', 'str']]) + val = fn(req.args, {'dom': self.dom.id}) + req.setResponseCode(http.ACCEPTED) + req.setHeader("Location", "%s/.." % req.prePathURL()) + return val + + def op_sysrq(self, op, req): + fn = FormFn(self.xd.domain_sysrq, + [['dom', 'int'], ['key', 'int']]) - val = fn(req.args, {'dom': self.dom.id}) + val = fn(req.args, {'dom' : self.dom.id}) req.setResponseCode(http.ACCEPTED) req.setHeader("Location", "%s/.." % req.prePathURL()) return val @@ -208,14 +229,6 @@ self.print_path(req) #self.ls() req.write('<p>%s</p>' % self.dom) - if self.dom.console: - cinfo = self.dom.console - cid = str(cinfo.console_port) - #todo: Local xref: need to know server prefix. 
- req.write('<p><a href="/xend/console/%s">Console %s</a></p>' - % (cid, cid)) - req.write('<p><a href="%s">Connect to console</a></p>' - % cinfo.uri()) if self.dom.config: req.write("<code><pre>") PrettyPrint.prettyprint(self.dom.config, out=req) diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/server/SrvDomainDir.py --- a/tools/python/xen/xend/server/SrvDomainDir.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/server/SrvDomainDir.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,19 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ import traceback from StringIO import StringIO diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/server/SrvNode.py --- a/tools/python/xen/xend/server/SrvNode.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/server/SrvNode.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,19 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ import os diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/server/SrvRoot.py --- a/tools/python/xen/xend/server/SrvRoot.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/server/SrvRoot.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,19 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ from xen.xend import XendRoot xroot = XendRoot.instance() @@ -15,7 +30,6 @@ subdirs = [ ('node', 'SrvNode' ), ('domain', 'SrvDomainDir' ), - ('console', 'SrvConsoleDir' ), ('vnet', 'SrvVnetDir' ), ] diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/server/SrvServer.py --- a/tools/python/xen/xend/server/SrvServer.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/server/SrvServer.py Thu Aug 25 22:53:20 2005 @@ -1,7 +1,22 @@ #!/usr/bin/python -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ -"""Example xend HTTP and console server. 
+"""Example xend HTTP Can be accessed from a browser or from a program. Do 'python SrvServer.py' to run the server. diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/server/SrvVnetDir.py --- a/tools/python/xen/xend/server/SrvVnetDir.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/server/SrvVnetDir.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,19 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ from xen.xend import sxp from xen.xend.Args import FormFn diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/server/SrvXendLog.py --- a/tools/python/xen/xend/server/SrvXendLog.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/server/SrvXendLog.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,19 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software 
Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ from xen.web import static diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/server/blkif.py --- a/tools/python/xen/xend/server/blkif.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/server/blkif.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,20 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ + """Support for virtual block devices. """ import string diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/server/channel.py --- a/tools/python/xen/xend/server/channel.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/server/channel.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,19 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ import threading import select diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/server/controller.py --- a/tools/python/xen/xend/server/controller.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/server/controller.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,20 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ + """General support for controllers, which handle devices for a domain. 
""" @@ -126,7 +142,7 @@ def createDevController(self, type, vm, recreate=False): cls = self.getDevControllerClass(type) if not cls: - raise XendError("unknown device type: " + type) + raise XendError("unknown device type: " + str(type)) return cls.createDevController(vm, recreate=recreate) def getDevControllerTable(): @@ -267,6 +283,8 @@ dev.attach(recreate=recreate, change=change) dev.exportToDB() + return dev + def configureDevice(self, id, config, change=False): """Reconfigure an existing device. May be defined in subclass.""" @@ -307,9 +325,9 @@ return self.destroyed def getDevice(self, id, error=False): - dev = self.devices.get(id) + dev = self.devices.get(int(id)) if error and not dev: - raise XendError("invalid device id: " + id) + raise XendError("invalid device id: " + str(id)) return dev def getDeviceIds(self): diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/server/event.py --- a/tools/python/xen/xend/server/event.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/server/event.py Thu Aug 25 22:53:20 2005 @@ -1,3 +1,20 @@ +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ + import sys import StringIO @@ -33,7 +50,7 @@ def dataReceived(self, data): try: self.parser.input(data) - if self.parser.ready(): + while(self.parser.ready()): val = self.parser.get_val() res = self.dispatch(val) self.send_result(res) @@ -128,16 +145,8 @@ def op_pretty(self, name, req): self.pretty = 1 - def op_console_disconnect(self, name, req): - id = sxp.child_value(req, 'id') - if not id: - raise XendError('Missing console id') - id = int(id) - self.daemon.console_disconnect(id) - def op_info(self, name, req): val = ['info'] - #val += self.daemon.consoles() #val += self.daemon.blkifs() #val += self.daemon.netifs() #val += self.daemon.usbifs() diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/server/messages.py --- a/tools/python/xen/xend/server/messages.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/server/messages.py Thu Aug 25 22:53:20 2005 @@ -1,3 +1,20 @@ +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ + import sys import struct import types @@ -16,16 +33,6 @@ See below. """ msg_formats = {} - -#============================================================================ -# Console message types. -#============================================================================ - -CMSG_CONSOLE = 0 - -console_formats = { 'console_data': (CMSG_CONSOLE, 0) } - -msg_formats.update(console_formats) #============================================================================ # Block interface message types. diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/server/netif.py --- a/tools/python/xen/xend/server/netif.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/server/netif.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,20 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ + """Support for virtual network interfaces. """ @@ -405,7 +421,9 @@ 'netif_handle' : self.vif, 'evtchn' : self.getEventChannelBackend(), 'tx_shmem_frame' : val['tx_shmem_frame'], - 'rx_shmem_frame' : val['rx_shmem_frame'] }) + 'tx_shmem_ref' : val['tx_shmem_ref'], + 'rx_shmem_frame' : val['rx_shmem_frame'], + 'rx_shmem_ref' : val['rx_shmem_ref'] }) msg = self.backendChannel.requestResponse(msg) #todo: check return status self.status = NETIF_INTERFACE_STATUS_CONNECTED diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/server/params.py --- a/tools/python/xen/xend/server/params.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/server/params.py Thu Aug 25 22:53:20 2005 @@ -1,3 +1,20 @@ +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ + import os def getenv(var, val, conv=None): diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/server/pciif.py --- a/tools/python/xen/xend/server/pciif.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/server/pciif.py Thu Aug 25 22:53:20 2005 @@ -1,3 +1,20 @@ +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ + import types import xen.lowlevel.xc; xc = xen.lowlevel.xc.new() diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/server/relocate.py --- a/tools/python/xen/xend/server/relocate.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/server/relocate.py Thu Aug 25 22:53:20 2005 @@ -1,3 +1,19 @@ +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ import socket import sys @@ -26,7 +42,7 @@ def dataReceived(self, data): try: self.parser.input(data) - if self.parser.ready(): + while(self.parser.ready()): val = self.parser.get_val() res = self.dispatch(val) self.send_result(res) @@ -124,7 +140,8 @@ if xroot.get_xend_relocation_server(): port = xroot.get_xend_relocation_port() interface = xroot.get_xend_relocation_address() - reactor.listenTCP(port, factory, interface=interface) + l = reactor.listenTCP(port, factory, interface=interface) + l.setCloExec() def setupRelocation(dst, port): try: diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/sxp.py --- a/tools/python/xen/xend/sxp.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/sxp.py Thu Aug 25 22:53:20 2005 @@ -1,5 +1,21 @@ #!/usr/bin/python -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ + """ Input-driven parsing for s-expression (sxp) format. Create a parser: pin = Parser(); diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/uuid.py --- a/tools/python/xen/xend/uuid.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/uuid.py Thu Aug 25 22:53:20 2005 @@ -1,3 +1,20 @@ +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ + """Universal(ly) Unique Identifiers (UUIDs). 
""" import commands diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/xenstore/__init__.py --- a/tools/python/xen/xend/xenstore/__init__.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/xenstore/__init__.py Thu Aug 25 22:53:20 2005 @@ -1,2 +1,18 @@ +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ from xsnode import * from xsobj import * diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/xenstore/xsnode.py --- a/tools/python/xen/xend/xenstore/xsnode.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/xenstore/xsnode.py Thu Aug 25 22:53:20 2005 @@ -1,3 +1,19 @@ +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ import errno import os import os.path diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/xenstore/xsobj.py --- a/tools/python/xen/xend/xenstore/xsobj.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/xenstore/xsobj.py Thu Aug 25 22:53:20 2005 @@ -1,3 +1,19 @@ +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ import string import types @@ -307,24 +323,24 @@ db = self.__db__ if path is None: path = db.relPath() - print 'DBMap>introduceDomain>', dom, page, evtchn, path + log.info("DBMap>introduceDomain> %d %d %s %s" %(dom, page, evtchn, path)) try: db.introduceDomain(dom, page, evtchn, path) except Exception, ex: import traceback traceback.print_exc() - print 'DBMap>introduceDomain>', ex + log.info("DBMap>introduceDomain> %s" %ex) pass # todo: don't ignore def releaseDomain(self, dom): db = self.__db__ - print 'DBMap>releaseDomain>', dom + log.info("DBMap>releaseDomain> %d" %dom) try: db.releaseDomain(dom) except Exception, ex: import traceback traceback.print_exc() - print 'DBMap>releaseDomain>', ex + log.info("DBMap>releaseDomain> %s" %ex) pass # todo: don't ignore def watch(self, fn, path=""): diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xend/xenstore/xsresource.py --- a/tools/python/xen/xend/xenstore/xsresource.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xend/xenstore/xsresource.py Thu Aug 25 22:53:20 2005 @@ -1,3 +1,16 @@ +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA #============================================================================ # Copyright (C) 2005 Mike Wray <mike.wray@xxxxxx> #============================================================================ diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xm/create.py --- a/tools/python/xen/xm/create.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xm/create.py Thu Aug 25 22:53:20 2005 @@ -1,5 +1,20 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> # Copyright (C) 2005 Nguyen Anh Quynh <aquynh@xxxxxxxxx> +#============================================================================ """Domain creation. 
""" @@ -7,6 +22,8 @@ import string import sys import socket +import commands +import time import xen.lowlevel.xc @@ -16,8 +33,6 @@ from xen.xend.XendBootloader import bootloader from xen.xend import XendRoot; xroot = XendRoot.instance() from xen.util import blkif - -from xen.util import console_client from xen.xm.opts import * @@ -144,10 +159,6 @@ fn=set_float, default=None, use="""Set the new domain's cpu weight. WEIGHT is a float that controls the domain's share of the cpu.""") - -gopts.var('console', val='PORT', - fn=set_int, default=None, - use="Console port to use. Default is 9600 + domain id.") gopts.var('restart', val='onreboot|always|never', fn=set_value, default=None, @@ -370,7 +381,6 @@ @return: MAC address string """ - random.seed() mac = [ 0xaa, 0x00, 0x00, random.randint(0x00, 0x7f), random.randint(0x00, 0xff), @@ -471,8 +481,6 @@ config.append(['backend', ['netif']]) if vals.restart: config.append(['restart', vals.restart]) - if vals.console: - config.append(['console', vals.console]) if vals.bootloader: run_bootloader(opts, config, vals) @@ -584,9 +592,14 @@ return d return None +vncpid = None + def spawn_vnc(display): - os.system("vncviewer -log *:stdout:0 -listen %d &" % - (VNC_BASE_PORT + display)) + vncargs = (["vncviewer" + "-log", "*:stdout:0", + "-listen", "%d" % (VNC_BASE_PORT + display) ]) + global vncpid + vncpid = os.spawnvp(os.P_NOWAIT, "vncviewer", vncargs) + return VNC_BASE_PORT + display def preprocess_vnc(opts, vals): @@ -620,8 +633,8 @@ @param opts: options @param config: configuration - @return: domain id, console port - @rtype: (int, int) + @return: domain id + @rtype: int """ try: @@ -631,22 +644,19 @@ else: dominfo = server.xend_domain_create(config) except XendError, ex: + import signal + if vncpid: + os.kill(vncpid, signal.SIGKILL) opts.err(str(ex)) dom = sxp.child_value(dominfo, 'name') - console_info = sxp.child(dominfo, 'console') - if console_info: - console_port = int(sxp.child_value(console_info, 'console_port')) - else: 
- console_port = None if not opts.vals.paused: if server.xend_domain_unpause(dom) < 0: server.xend_domain_destroy(dom) opts.err("Failed to unpause domain %s" % dom) - opts.info("Started domain %s, console on port %d" - % (dom, console_port)) - return (dom, console_port) + opts.info("Started domain %s" % (dom)) + return int(sxp.child_value(dominfo, 'id')) def get_dom0_alloc(): """Return current allocation memory of dom0 (in MB). Return 0 on error""" @@ -665,20 +675,38 @@ return 0 def balloon_out(dom0_min_mem, opts): - """Balloon out to get memory for domU, if necessarily""" + """Balloon out memory from dom0 if necessary""" SLACK = 4 + timeout = 20 # 2s + ret = 0 xc = xen.lowlevel.xc.new() pinfo = xc.physinfo() - free_mem = pinfo['free_pages']/256 - if free_mem < opts.vals.memory + SLACK: - need_mem = opts.vals.memory + SLACK - free_mem - cur_alloc = get_dom0_alloc() - if cur_alloc - need_mem >= dom0_min_mem: - server.xend_domain_mem_target_set(0, cur_alloc - need_mem) + free_mem = pinfo['free_pages'] / 256 + domU_need_mem = opts.vals.memory + SLACK + + dom0_cur_alloc = get_dom0_alloc() + dom0_new_alloc = dom0_cur_alloc - (domU_need_mem - free_mem) + + if free_mem < domU_need_mem and dom0_new_alloc < dom0_min_mem: + ret = 1 + if free_mem < domU_need_mem and ret == 0: + + server.xend_domain_mem_target_set(0, dom0_new_alloc) + + while dom0_cur_alloc > dom0_new_alloc and timeout > 0: + time.sleep(0.1) # sleep 100ms + dom0_cur_alloc = get_dom0_alloc() + timeout -= 1 + + if dom0_cur_alloc > dom0_new_alloc: + ret = 1 + del xc + return ret def main(argv): + random.seed() opts = gopts args = opts.parse(argv) if opts.vals.help: @@ -707,12 +735,14 @@ else: dom0_min_mem = xroot.get_dom0_min_mem() if dom0_min_mem != 0: - balloon_out(dom0_min_mem, opts) - - (dom, console) = make_domain(opts, config) + if balloon_out(dom0_min_mem, opts): + print >>sys.stderr, "error: cannot allocate enough memory for domain" + sys.exit(1) + + dom = make_domain(opts, config) if 
opts.vals.console_autoconnect: - path = "/var/lib/xend/console-%s" % console - console_client.connect('localhost', console, path=path) + cmd = "/usr/libexec/xen/xenconsole %d" % dom + os.execvp('/usr/libexec/xen/xenconsole', cmd.split()) if __name__ == '__main__': main(sys.argv) diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xm/destroy.py --- a/tools/python/xen/xm/destroy.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xm/destroy.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,19 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ """Destroy a domain. 
""" diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xm/help.py --- a/tools/python/xen/xm/help.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xm/help.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,19 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ """Variable definition and help support for Python defconfig files. """ diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xm/main.py --- a/tools/python/xen/xm/main.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xm/main.py Thu Aug 25 22:53:20 2005 @@ -1,27 +1,124 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +# (C) Copyright IBM Corp. 2005 +# Copyright (C) 2004 Mike Wray +# +# Authors: +# Sean Dague <sean at dague dot net> +# Mike Wray <mike dot wray at hp dot com> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. 
+# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + """Grand unified management application for Xen. """ import os import os.path import sys +import commands +import re from getopt import getopt import socket import warnings warnings.filterwarnings('ignore', category=FutureWarning) - from xen.xend import PrettyPrint from xen.xend import sxp -# this is a nasty place to stick this in, but required because -# log file access is set up via a 5 deep import chain. This -# ensures the user sees a useful message instead of a stack trace -if os.getuid() != 0: - print "xm requires root access to execute, please try again as root" - sys.exit(1) - -from xen.xend.XendClient import XendError, server -from xen.xend.XendClient import main as xend_client_main -from xen.xm import create, destroy, migrate, shutdown, sysrq from xen.xm.opts import * +shorthelp = """Usage: xm <subcommand> [args] + Control, list, and manipulate Xen guest instances + +xm common subcommands: + console <DomId> attach to console of DomId + create <CfgFile> create a domain based on Config File + destroy <DomId> terminate a domain immediately + help display this message + list [DomId, ...] 
list information about domains + mem-max <DomId> <Mem> set the maximum memory reservation for a domain + mem-set <DomId> <Mem> adjust the current memory usage for a domain + migrate <DomId> <Host> migrate a domain to another machine + pause <DomId> pause execution of a domain + reboot <DomId> reboot a domain + restore <File> create a domain from a saved state file + save <DomId> <File> save domain state (and config) to file + shutdown <DomId> shutdown a domain + top monitor system and domains in real-time + unpause <DomId> unpause a paused domain + +For a complete list of subcommands run 'xm help --long' +For more help on xm see the xm(1) man page +For more help on xm create, see the xmdomain.cfg(5) man page""" + +longhelp = """Usage: xm <subcommand> [args] + Control, list, and manipulate Xen guest instances + +xm full list of subcommands: + + Domain Commands: + console <DomId> attach to console of DomId + cpus-list <DomId> <VCpu> get the list of cpus for a VCPU + cpus-set <DomId> <VCpu> <CPUS> set which cpus a VCPU can use. 
+ create <ConfigFile> create a domain + destroy <DomId> terminate a domain immediately + domid <DomName> convert a domain name to a domain id + domname <DomId> convert a domain id to a domain name + list list information about domains + mem-max <DomId> <Mem> set domain maximum memory limit + mem-set <DomId> <Mem> set the domain's memory dynamically + migrate <DomId> <Host> migrate a domain to another machine + pause <DomId> pause execution of a domain + reboot [-w|-a] <DomId> reboot a domain + restore <File> create a domain from a saved state file + save <DomId> <File> save domain state (and config) to file + shutdown [-w|-a] <DomId> shutdown a domain + sysrq <DomId> <letter> send a sysrq to a domain + unpause <DomId> unpause a paused domain + vcpu-enable <DomId> <VCPU> disable VCPU in a domain + vcpu-disable <DomId> <VCPU> enable VCPU in a domain + vcpu-list <DomId> get the list of VCPUs for a domain + + Xen Host Commands: + dmesg [--clear] read or clear Xen's message buffer + info get information about the xen host + log print the xend log + top monitor system and domains in real-time + + Scheduler Commands: + bvt <options> set BVT scheduler parameters + bvt_ctxallow <Allow> set the BVT scheduler context switch allowance + sedf <options> set simple EDF parameters + + Virtual Device Commands: + block-create <DomId> <BackDev> <FrontDev> <Mode> [BackDomId] + Create a new virtual block device + block-destroy <DomId> <DevId> Destroy a domain's virtual block device + block-list <DomId> List virtual block devices for a domain + block-refresh <DomId> <DevId> Refresh a virtual block device for a domain + network-limit <DomId> <Vif> <Credit> <Period> + Limit the transmission rate of a virtual network interface + network-list <DomId> List virtual network interfaces for a domain + +For a short list of subcommands run 'xm help' +For more help on xm see the xm(1) man page +For more help on xm create, see the xmdomain.cfg(5) man page""" + 
+#################################################################### +# +# Utility functions +# +#################################################################### + +def arg_check(args,num,name): + if len(args) < num: + err("'xm %s' requires %s argument(s)!\n" % (name, num)) + usage(name) def unit(c): if not c.isalpha(): @@ -48,724 +145,325 @@ else: return value * (base / dst_base) -class Group: - - name = "" - info = "" - - def __init__(self, xm): - self.xm = xm - self.progs = {} - - def addprog(self, prog): - self.progs[prog.name] = prog - - def getprog(self, name): - return self.progs.get(name) - - def proglist(self): - kl = self.progs.keys() - kl.sort() - return [ self.getprog(k) for k in kl ] - - def help(self, args): - if self.info: - print - print self.info - print - else: - print - - def shortHelp(self, args): - self.help(args) - for p in self.proglist(): - p.shortHelp(args) - -class Prog: - """Base class for sub-programs. - """ - - """Program group it belongs to""" - group = 'all' - """Program name.""" - name = '??' - """Short program info.""" - info = '' - - def __init__(self, xm): - self.xm = xm - - def err(self, msg): - self.xm.err(msg) - - def help(self, args): - self.shortHelp(args) - - def shortHelp(self, args): - print "%-14s %s" % (self.name, self.info) - - def main(self, args): - """Program main entry point. - """ - pass - - -class ProgUnknown(Prog): - - name = 'unknown' - info = '' - - def help(self, args): - self.xm.err("Unknown command: %s\nTry '%s help' for more information." - % (args[0], self.xm.name)) - - main = help - -class Xm: - """Main application. 
- """ - - def __init__(self): - self.name = 'xm' - self.unknown = ProgUnknown(self) - self.progs = {} - self.groups = {} - - def err(self, msg): - print >>sys.stderr, "Error:", msg +def err(msg): + print >>sys.stderr, "Error:", msg + +def handle_xend_error(cmd, dom, ex): + error = str(ex) + if error == "Not found" and dom != None: + err("Domain '%s' not found when running 'xm %s'" % (dom, cmd)) sys.exit(1) - - def main(self, args): - try: - self.main_call(args) - except socket.error, ex: - print >>sys.stderr, ex - self.err("Error connecting to xend, is xend running?") - except XendError, ex: - self.err(str(ex)) - - def main_call(self, args): - """Main entry point. Dispatches to the progs. - """ - self.name = args[0] - if len(args) < 2: - args.append('help') - help = self.helparg(args) - p = self.getprog(args[1], self.unknown) - if help or len(args) < 2: - p.help(args[1:]) - else: - p.main(args[1:]) - - def helparg(self, args): - for a in args: - if a in ['-h', '--help']: - return 1 - return 0 - - def prog(self, pklass): - """Add a sub-program. - - pklass program class (Prog subclass) - """ - p = pklass(self) - self.progs[p.name] = p - self.getgroup(p.group).addprog(p) - return p - - def getprog(self, name, val=None): - """Get a sub-program. - name Name of the sub-program (or optionally, an unambiguous - prefix of its name) - val Default return value if no (unique) match is found - """ - - match = None - for progname in self.progs.keys(): - if progname == name: - match = progname - break - if progname.startswith(name): - if not match: - match = progname - else: - return val # name is ambiguous - bail out - - return self.progs.get(match, val) - - def group(self, klass): - g = klass(self) - self.groups[g.name] = g - return g - - def getgroup(self, name): - return self.groups[name] - - def grouplist(self): - kl = self.groups.keys() - kl.sort() - return [ self.getgroup(k) for k in kl ] - -# Create the application object, then add the sub-program classes. 
-xm = Xm() - -class GroupAll(Group): - - name = "all" - info = "" - -xm.group(GroupAll) - -class GroupDomain(Group): - - name = "domain" - info = "Commands on domains:" - -xm.group(GroupDomain) - -class GroupScheduler(Group): - - name = "scheduler" - info = "Comands controlling scheduling:" - -xm.group(GroupScheduler) - -class GroupHost(Group): - - name = "host" - info = "Commands related to the xen host (node):" - -xm.group(GroupHost) - -class GroupConsole(Group): - - name = "console" - info = "Commands related to consoles:" - -xm.group(GroupConsole) - -class GroupVbd(Group): - - name = "vbd" - info = "Commands related to virtual block devices:" - -xm.group(GroupVbd) - -class GroupVif(Group): - - name = "vif" - info = "Commands related to virtual network interfaces:" - -xm.group(GroupVif) - -class ProgHelp(Prog): - - name = "help" - info = "Print help." - - def help(self, args): - if len(args) == 2: - name = args[1] - p = self.xm.getprog(name) - if p: - p.help(args[1:]) - else: - print '%s: Unknown command: %s' % (self.name, name) - else: - for g in self.xm.grouplist(): - g.shortHelp(args) - print "\nTry '%s help CMD' for help on CMD" % self.xm.name - - main = help - -xm.prog(ProgHelp) - -class ProgCreate(Prog): - - group = 'domain' - name = "create" - info = """Create a domain.""" - - def help(self, args): - create.main([args[0], '-h']) - - def main(self, args): - create.main(args) - -xm.prog(ProgCreate) - -class ProgSave(Prog): - group = 'domain' - name = "save" - info = """Save domain state (and config) to file.""" - - def help(self, args): - print args[0], "DOM FILE" - print """\nSave domain with id DOM to FILE.""" - - def main(self, args): - if len(args) < 3: self.err("%s: Missing arguments" % args[0]) - dom = args[1] - savefile = os.path.abspath(args[2]) - server.xend_domain_save(dom, savefile) - -xm.prog(ProgSave) - -class ProgRestore(Prog): - group = 'domain' - name = "restore" - info = """Create a domain from a saved state.""" - - def help(self, args): - 
print args[0], "FILE" - print "\nRestore a domain from FILE." - - def main(self, args): - if len(args) < 2: self.err("%s: Missing arguments" % args[0]) - savefile = os.path.abspath(args[1]) - info = server.xend_domain_restore(savefile) - PrettyPrint.prettyprint(info) - id = sxp.child_value(info, 'id') - if id is not None: - server.xend_domain_unpause(id) - -xm.prog(ProgRestore) - -class ProgMigrate(Prog): - group = 'domain' - name = "migrate" - info = """Migrate a domain to another machine.""" - - def help(self, args): - migrate.help([self.name] + args) - - def main(self, args): - migrate.main(args) - -xm.prog(ProgMigrate) - -class ProgList(Prog): - group = 'domain' - name = "list" - info = """List information about domains.""" - - short_options = 'lv' - long_options = ['long','vcpus'] - - def help(self, args): - if help: - print args[0], '[options] [DOM...]' - print """\nGet information about domains. - Either all domains or the domains given. - - -l, --long Get more detailed information. - -v, --vcpus Show VCPU to CPU mapping. 
- """ - return - - def main(self, args): - use_long = 0 - show_vcpus = 0 - (options, params) = getopt(args[1:], - self.short_options, - self.long_options) - n = len(params) - for (k, v) in options: - if k in ['-l', '--long']: - use_long = 1 - if k in ['-v', '--vcpus']: - show_vcpus = 1 - - if n == 0: - doms = server.xend_domains() - doms.sort() - else: - doms = params - - if use_long: - self.long_list(doms) - elif show_vcpus: - self.show_vcpus(doms) - else: - self.brief_list(doms) - - def brief_list(self, doms): - print 'Name Id Mem(MB) CPU VCPU(s) State Time(s) Console' - for dom in doms: - info = server.xend_domain(dom) - d = {} - d['dom'] = int(sxp.child_value(info, 'id', '-1')) - d['name'] = sxp.child_value(info, 'name', '??') - d['mem'] = int(sxp.child_value(info, 'memory', '0')) - d['cpu'] = str(sxp.child_value(info, 'cpu', '0')) - d['vcpus'] = int(sxp.child_value(info, 'vcpus', '0')) - d['state'] = sxp.child_value(info, 'state', '??') - d['cpu_time'] = float(sxp.child_value(info, 'cpu_time', '0')) - console = sxp.child(info, 'console') - if console: - d['port'] = sxp.child_value(console, 'console_port') - else: - d['port'] = '' - if d['vcpus'] > 1: - d['cpu'] = '-' - if ((int(sxp.child_value(info, 'ssidref', '0'))) != 0): - d['ssidref1'] = int(sxp.child_value(info, 'ssidref', '0')) & 0xffff - d['ssidref2'] = (int(sxp.child_value(info, 'ssidref', '0')) >> 16) & 0xffff - print ("%(name)-16s %(dom)3d %(mem)7d %(cpu)3s %(vcpus)5d %(state)5s %(cpu_time)7.1f %(port)4s s:%(ssidref2)02x/p:%(ssidref1)02x" % d) - else: - print ("%(name)-16s %(dom)3d %(mem)7d %(cpu)3s %(vcpus)5d %(state)5s %(cpu_time)7.1f %(port)4s" % d) - - def show_vcpus(self, doms): - print 'Name Id VCPU CPU CPUMAP' - for dom in doms: - info = server.xend_domain(dom) - vcpu_to_cpu = sxp.child_value(info, 'vcpu_to_cpu', '-1').split('|') - cpumap = sxp.child_value(info, 'cpumap', []) - mask = ((int(sxp.child_value(info, 'vcpus', '0')))**2) - 1 - count = 0 - for cpu in vcpu_to_cpu: - d = {} - d['name'] 
= sxp.child_value(info, 'name', '??') - d['dom'] = int(sxp.child_value(info, 'id', '-1')) - d['vcpu'] = int(count) - d['cpu'] = int(cpu) - d['cpumap'] = int(cpumap[count])&mask - count = count + 1 - print ("%(name)-16s %(dom)3d %(vcpu)4d %(cpu)3d 0x%(cpumap)x" % d) - - def long_list(self, doms): + else: + raise ex + + +######################################################################### +# +# Main xm functions +# +######################################################################### + +def xm_create(args): + from xen.xm import create + # ugly hack because the opt parser apparently wants + # the subcommand name just to throw it away! + args.insert(0,"bogus") + create.main(args) + +def xm_save(args): + arg_check(args,2,"save") + + dom = args[0] # TODO: should check if this exists + savefile = os.path.abspath(args[1]) + + from xen.xend.XendClient import server + server.xend_domain_save(dom, savefile) + +def xm_restore(args): + arg_check(args,1,"restore") + + savefile = os.path.abspath(args[0]) + + from xen.xend.XendClient import server + info = server.xend_domain_restore(savefile) + PrettyPrint.prettyprint(info) + id = sxp.child_value(info, 'id') + if id is not None: + server.xend_domain_unpause(id) + +def xm_migrate(args): + # TODO: arg_check + from xen.xm import migrate + # ugly hack because the opt parser apparently wants + # the subcommand name just to throw it away! 
+ args.insert(0,"bogus") + migrate.main(args) + +def xm_list(args): + use_long = 0 + show_vcpus = 0 + try: + (options, params) = getopt(args, 'lv', ['long','vcpus']) + except GetoptError, opterr: + err(opterr) + sys.exit(1) + + n = len(params) + for (k, v) in options: + if k in ['-l', '--long']: + use_long = 1 + if k in ['-v', '--vcpus']: + show_vcpus = 1 + + domsinfo = [] + from xen.xend.XendClient import server + if n == 0: + doms = server.xend_domains() + doms.sort() + else: + doms = params + for dom in doms: + info = server.xend_domain(dom) + domsinfo.append(parse_doms_info(info)) + + if use_long: for dom in doms: info = server.xend_domain(dom) PrettyPrint.prettyprint(info) - -xm.prog(ProgList) - -class ProgDestroy(Prog): - group = 'domain' - name = "destroy" - info = """Terminate a domain immediately.""" - - def help(self, args): - destroy.main([args[0], '-h']) - - def main(self, args): - destroy.main(args) - -xm.prog(ProgDestroy) - -class ProgShutdown(Prog): - group = 'domain' - name = "shutdown" - info = """Shutdown a domain.""" - - def help(self, args): - shutdown.main([args[0], '-h']) - - def main(self, args): - shutdown.main(args) - -xm.prog(ProgShutdown) - -class ProgSysrq(Prog): - group = 'domain' - name = "sysrq" - info = """Send a sysrq to a domain.""" - - def help(self, args): - sysrq.main([args[0], '-h']) - - def main(self, args): - sysrq.main(args) - -xm.prog(ProgSysrq) - -class ProgPause(Prog): - group = 'domain' - name = "pause" - info = """Pause execution of a domain.""" - - def help(self, args): - print args[0], 'DOM' - print '\nPause execution of domain DOM.' - - def main(self, args): - if len(args) < 2: self.err("%s: Missing domain" % args[0]) - dom = args[1] - server.xend_domain_pause(dom) - -xm.prog(ProgPause) - -class ProgUnpause(Prog): - group = 'domain' - name = "unpause" - info = """Unpause a paused domain.""" - - def help(self, args): - print args[0], 'DOM' - print '\nUnpause execution of domain DOM.' 
- - def main(self, args): - if len(args) < 2: self.err("%s: Missing domain" % args[0]) - dom = args[1] - server.xend_domain_unpause(dom) - -xm.prog(ProgUnpause) - -class ProgPincpu(Prog): - group = 'domain' - name = "pincpu" - info = """Set which cpus a VCPU can use. """ - - def help(self, args): - print args[0],'DOM VCPU CPUS' - print '\nSet which cpus VCPU in domain DOM can use.' - - # convert list of cpus to bitmap integer value - def make_map(self, cpulist): - cpus = [] - cpumap = 0 - for c in cpulist.split(','): - if c.find('-') != -1: - (x,y) = c.split('-') - for i in range(int(x),int(y)+1): - cpus.append(int(i)) - else: - cpus.append(int(c)) - cpus.sort() - for c in cpus: - cpumap = cpumap | 1<<c - - return cpumap - - def main(self, args): - if len(args) != 4: self.err("%s: Invalid argument(s)" % args[0]) - dom = args[1] - vcpu = int(args[2]) - cpumap = self.make_map(args[3]); - server.xend_domain_pincpu(dom, vcpu, cpumap) - -xm.prog(ProgPincpu) - -class ProgMaxmem(Prog): - group = 'domain' - name = 'maxmem' - info = """Set domain memory limit.""" - - def help(self, args): - print args[0], "DOM MEMORY" - print "\nSet the memory limit for domain DOM to MEMORY megabytes." 
- - def main(self, args): - if len(args) != 3: self.err("%s: Invalid argument(s)" % args[0]) - dom = args[1] - mem = int_unit(args[2], 'm') - server.xend_domain_maxmem_set(dom, mem) - -xm.prog(ProgMaxmem) - -class ProgSetMem(Prog): - group = 'domain' - name = 'set-mem' - info = """Set the domain's memory footprint using the balloon driver.""" - - def help(self, args): - print args[0], "DOM MEMORY_TARGET" - print """\nRequest domain DOM to adjust its memory footprint to -MEMORY_TARGET megabytes""" - - def main(self, args): - if len(args) != 3: self.err("%s: Invalid argument(s)" % args[0]) - dom = args[1] - mem_target = int_unit(args[2], 'm') - server.xend_domain_mem_target_set(dom, mem_target) - -xm.prog(ProgSetMem) - -class ProgVcpuhotplug(Prog): - group = 'domain' - name = 'vcpu-hotplug' - info = """Enable or disable a VCPU in a domain.""" - - def help(self, args): - print args[0], "DOM VCPU [0|1]" - print """\nRequest virtual processor VCPU to be disabled or enabled in -domain DOM""" - - def main(self, args): - if len(args) != 4: self.err("%s: Invalid arguments(s)" % args[0]) - name = args[1] - vcpu = int(args[2]) - state = int(args[3]) - dom = server.xend_domain(name) - id = sxp.child_value(dom, 'id') - server.xend_domain_vcpu_hotplug(id, vcpu, state) - -xm.prog(ProgVcpuhotplug) - -class ProgDomid(Prog): - group = 'domain' - name = 'domid' - info = 'Convert a domain name to a domain id.' - - def help(self, args): - print args[0], "DOM" - print '\nGet the domain id for the domain with name DOM.' 
+ elif show_vcpus: + xm_show_vcpus(domsinfo) + else: + xm_brief_list(domsinfo) + +def parse_doms_info(info): + dominfo = {} + dominfo['dom'] = int(sxp.child_value(info, 'id', '-1')) + dominfo['name'] = sxp.child_value(info, 'name', '??') + dominfo['mem'] = int(sxp.child_value(info, 'memory', '0')) + dominfo['cpu'] = str(sxp.child_value(info, 'cpu', '0')) + dominfo['vcpus'] = int(sxp.child_value(info, 'vcpus', '0')) + # if there is more than 1 cpu, the value doesn't mean much + if dominfo['vcpus'] > 1: + dominfo['cpu'] = '-' + dominfo['state'] = sxp.child_value(info, 'state', '??') + dominfo['cpu_time'] = float(sxp.child_value(info, 'cpu_time', '0')) + # security identifiers + if ((int(sxp.child_value(info, 'ssidref', '0'))) != 0): + dominfo['ssidref1'] = int(sxp.child_value(info, 'ssidref', '0')) & 0xffff + dominfo['ssidref2'] = (int(sxp.child_value(info, 'ssidref', '0')) >> 16) & 0xffff + # get out the vcpu information + dominfo['vcpulist'] = [] + vcpu_to_cpu = sxp.child_value(info, 'vcpu_to_cpu', '-1').split('|') + cpumap = sxp.child_value(info, 'cpumap', []) + mask = ((int(sxp.child_value(info, 'vcpus', '0')))**2) - 1 + count = 0 + for cpu in vcpu_to_cpu: + vcpuinfo = {} + vcpuinfo['name'] = sxp.child_value(info, 'name', '??') + vcpuinfo['dom'] = int(sxp.child_value(info, 'id', '-1')) + vcpuinfo['vcpu'] = int(count) + vcpuinfo['cpu'] = int(cpu) + vcpuinfo['cpumap'] = int(cpumap[count])&mask + count = count + 1 + dominfo['vcpulist'].append(vcpuinfo) + return dominfo - def main (self, args): - if len(args) != 2: self.err("%s: Invalid argument(s)" % args[0]) - name = args[1] - dom = server.xend_domain(name) - print sxp.child_value(dom, 'id') - -xm.prog(ProgDomid) - -class ProgDomname(Prog): - group = 'domain' - name = 'domname' - info = 'Convert a domain id to a domain name.' - - def help(self, args): - print args[0], "DOM" - print '\nGet the name for the domain with id DOM.' 
- - def main (self, args): - if len(args) != 2: self.err("%s: Invalid argument(s)" % args[0]) - name = args[1] - dom = server.xend_domain(name) - print sxp.child_value(dom, 'name') - -xm.prog(ProgDomname) - -class ProgBvt(Prog): - group = 'scheduler' - name = "bvt" - info = """Set BVT scheduler parameters.""" - - def help(self, args): - print args[0], "DOM MCUADV WARPBACK WARPVALUE WARPL WARPU" - print '\nSet Borrowed Virtual Time scheduler parameters.' - - def main(self, args): - if len(args) != 7: self.err("%s: Invalid argument(s)" % args[0]) - dom = args[1] - v = map(long, args[2:7]) - server.xend_domain_cpu_bvt_set(dom, *v) - -xm.prog(ProgBvt) - -class ProgBvtslice(Prog): - group = 'scheduler' - name = "bvt_ctxallow" - info = """Set the BVT scheduler context switch allowance.""" - - def help(self, args): - print args[0], 'CTX_ALLOW' - print '\nSet Borrowed Virtual Time scheduler context switch allowance.' - - def main(self, args): - if len(args) < 2: self.err('%s: Missing context switch allowance' - % args[0]) - slice = int(args[1]) - server.xend_node_cpu_bvt_slice_set(slice) - -xm.prog(ProgBvtslice) - -class ProgSedf(Prog): - group = 'scheduler' - name= "sedf" - info = """Set simple EDF parameters.""" - - def help(self, args): - print args[0], "DOM PERIOD SLICE LATENCY EXTRATIME WEIGHT" - print "\nSet simple EDF parameters." 
- - def main(self, args): - if len(args) != 7: self.err("%s: Invalid argument(s)" % args[0]) - dom = args[1] - v = map(int, args[2:7]) - server.xend_domain_cpu_sedf_set(dom, *v) - -xm.prog(ProgSedf) - -class ProgInfo(Prog): - group = 'host' - name = "info" - info = """Get information about the xen host.""" - - def main(self, args): - info = server.xend_node() - for x in info[1:]: - print "%-23s:" % x[0], x[1] - -xm.prog(ProgInfo) - -class ProgConsoles(Prog): - group = 'console' - name = "consoles" - info = """Get information about domain consoles.""" - - def main(self, args): - l = server.xend_consoles() - print "Dom Port Id Connection" - for x in l: - info = server.xend_console(x) - d = {} - d['dom'] = sxp.child(info, 'domain', '?')[1] - d['port'] = sxp.child_value(info, 'console_port', '?') - d['id'] = sxp.child_value(info, 'id', '?') - connected = sxp.child(info, 'connected') - if connected: - d['conn'] = '%s:%s' % (connected[1], connected[2]) - else: - d['conn'] = '' - print "%(dom)3s %(port)4s %(id)3s %(conn)s" % d - -xm.prog(ProgConsoles) - -class ProgConsole(Prog): - group = 'console' - name = "console" - info = """Open a console to a domain.""" - - def help(self, args): - print args[0], "DOM" - print "\nOpen a console to domain DOM." - - def main(self, args): - if len(args) < 2: self.err("%s: Missing domain" % args[0]) - dom = args[1] - info = server.xend_domain(dom) - console = sxp.child(info, "console") - if not console: - self.err("No console information") - port = sxp.child_value(console, "console_port") - from xen.util import console_client - path = "/var/lib/xend/console-%s" % port - console_client.connect("localhost", int(port), path=path) - -xm.prog(ProgConsole) - -class ProgCall(Prog): - name = "call" - info = "Call xend api functions." - - def help (self, args): - print args[0], "function args..." - print """ - Call a xend HTTP API function. The leading 'xend_' on the function -can be omitted. See xen.xend.XendClient for the API functions. 
-""" - - def main(self, args): - xend_client_main(args) - -xm.prog(ProgCall) - -class ProgDmesg(Prog): - group = 'host' - name = "dmesg" - info = """Read or clear Xen's message buffer.""" - +def xm_brief_list(domsinfo): + print 'Name Id Mem(MB) CPU VCPU(s) State Time(s)' + for dominfo in domsinfo: + if dominfo.has_key("ssidref1"): + print ("%(name)-16s %(dom)3d %(mem)7d %(cpu)3s %(vcpus)5d %(state)5s %(cpu_time)7.1f s:%(ssidref2)02x/p:%(ssidref1)02x" % dominfo) + else: + print ("%(name)-16s %(dom)3d %(mem)7d %(cpu)3s %(vcpus)5d %(state)5s %(cpu_time)7.1f" % dominfo) + +def xm_show_vcpus(domsinfo): + print 'Name Id VCPU CPU CPUMAP' + for dominfo in domsinfo: + for vcpuinfo in dominfo['vcpulist']: + print ("%(name)-16s %(dom)3d %(vcpu)4d %(cpu)3d 0x%(cpumap)x" % + vcpuinfo) + +def xm_vcpu_list(args): + args.insert(0,"-v") + xm_list(args) + +def xm_destroy(args): + arg_check(args,1,"destroy") + + from xen.xm import destroy + # ugly hack because the opt parser apparently wants + # the subcommand name just to throw it away! + args.insert(0,"bogus") + destroy.main(args) + +def xm_reboot(args): + arg_check(args,1,"reboot") + # ugly hack because the opt parser apparently wants + # the subcommand name just to throw it away! + args.insert(0,"bogus") + args.insert(2,"-R") + from xen.xm import shutdown + shutdown.main(args) + +def xm_shutdown(args): + arg_check(args,1,"shutdown") + + # ugly hack because the opt parser apparently wants + # the subcommand name just to throw it away! + args.insert(0,"bogus") + from xen.xm import shutdown + shutdown.main(args) + +def xm_sysrq(args): + from xen.xm import sysrq + # ugly hack because the opt parser apparently wants + # the subcommand name just to throw it away! 
+ args.insert(0,"bogus") + sysrq.main(args) + +def xm_pause(args): + arg_check(args, 1, "pause") + dom = args[0] + + from xen.xend.XendClient import server + server.xend_domain_pause(dom) + +def xm_unpause(args): + arg_check(args, 1, "unpause") + dom = args[0] + + from xen.xend.XendClient import server + server.xend_domain_unpause(dom) + +############################################################# + +def cpu_make_map(cpulist): + cpus = [] + cpumap = 0 + for c in cpulist.split(','): + if c.find('-') != -1: + (x,y) = c.split('-') + for i in range(int(x),int(y)+1): + cpus.append(int(i)) + else: + cpus.append(int(c)) + cpus.sort() + for c in cpus: + cpumap = cpumap | 1<<c + + return cpumap + +def xm_cpus_set(args): + arg_check(args, 3, "cpus-set") + + dom = args[0] + vcpu = int(args[1]) + cpumap = cpu_make_map(args[2]) + + from xen.xend.XendClient import server + server.xend_domain_pincpu(dom, vcpu, cpumap) + +def xm_mem_max(args): + arg_check(args, 2, "mem-max") + + dom = args[0] + mem = int_unit(args[1], 'm') + + from xen.xend.XendClient import server + server.xend_domain_maxmem_set(dom, mem) + +def xm_mem_set(args): + arg_check(args, 2, "mem-set") + + dom = args[0] + mem_target = int_unit(args[1], 'm') + + from xen.xend.XendClient import server + server.xend_domain_mem_target_set(dom, mem_target) + +# TODO: why does this lookup by name? and what if that fails!? 
+def xm_vcpu_enable(args): + arg_check(args, 2, "vcpu-enable") + + name = args[0] + vcpu = int(args[1]) + + from xen.xend.XendClient import server + dom = server.xend_domain(name) + id = sxp.child_value(dom, 'id') + server.xend_domain_vcpu_hotplug(id, vcpu, 1) + +def xm_vcpu_disable(args): + arg_check(args, 2, "vcpu-disable") + + name = args[0] + vcpu = int(args[1]) + + from xen.xend.XendClient import server + dom = server.xend_domain(name) + id = sxp.child_value(dom, 'id') + server.xend_domain_vcpu_hotplug(id, vcpu, 0) + +def xm_domid(args): + name = args[0] + + from xen.xend.XendClient import server + dom = server.xend_domain(name) + print sxp.child_value(dom, 'id') + +def xm_domname(args): + name = args[0] + + from xen.xend.XendClient import server + dom = server.xend_domain(name) + print sxp.child_value(dom, 'name') + +def xm_bvt(args): + arg_check(args, 6, "bvt") + dom = args[0] + v = map(long, args[1:6]) + from xen.xend.XendClient import server + server.xend_domain_cpu_bvt_set(dom, *v) + +def xm_bvt_ctxallow(args): + arg_check(args, 1, "bvt_ctxallow") + + slice = int(args[0]) + from xen.xend.XendClient import server + server.xend_node_cpu_bvt_slice_set(slice) + +def xm_sedf(args): + arg_check(args, 6, "sedf") + + dom = args[0] + v = map(int, args[1:6]) + from xen.xend.XendClient import server + server.xend_domain_cpu_sedf_set(dom, *v) + +def xm_info(args): + from xen.xend.XendClient import server + info = server.xend_node() + + for x in info[1:]: + print "%-23s:" % x[0], x[1] + +# TODO: remove as soon as console server shows up +def xm_console(args): + arg_check(args,1,"console") + + dom = args[0] + from xen.xend.XendClient import server + info = server.xend_domain(dom) + domid = int(sxp.child_value(info, 'id', '-1')) + cmd = "/usr/libexec/xen/xenconsole %d" % domid + os.execvp('/usr/libexec/xen/xenconsole', cmd.split()) + console = sxp.child(info, "console") + +def xm_top(args): + os.execv('/usr/sbin/xentop', ['/usr/sbin/xentop']) + +def xm_dmesg(args): + 
gopts = Opts(use="""[-c|--clear] Read Xen's message buffer (boot output, warning and error messages) or clear @@ -775,161 +473,220 @@ gopts.opt('clear', short='c', fn=set_true, default=0, use="Clear the contents of the Xen message buffer.") - - short_options = ['-c'] - long_options = ['--clear'] - - def help(self, args): - self.gopts.argv = args - self.gopts.usage() - - def main(self, args): - self.gopts.parse(args) - if not (1 <= len(args) <=2): - self.gopts.err('Invalid arguments: ' + str(args)) - - if not self.gopts.vals.clear: - print server.xend_node_get_dmesg() - else: - server.xend_node_clear_dmesg() - -xm.prog(ProgDmesg) - -class ProgLog(Prog): - group = 'host' - name = "log" - info = """Print the xend log.""" - - def main(self, args): - print server.xend_node_log() - -xm.prog(ProgLog) - -class ProgVifCreditLimit(Prog): - group = 'vif' - name= "vif-limit" - info = """Limit the transmission rate of a virtual network interface.""" - - def help(self, args): - print args[0], "DOMAIN VIF CREDIT_IN_BYTES PERIOD_IN_USECS" - print "\nSet the credit limit of a virtual network interface." 
- - def main(self, args): - if len(args) != 5: self.err("%s: Invalid argument(s)" % args[0]) - dom = args[1] - v = map(int, args[2:5]) - server.xend_domain_vif_limit(dom, *v) - -xm.prog(ProgVifCreditLimit) - -class ProgVifList(Prog): - group = 'vif' - name = 'vif-list' - info = """List virtual network interfaces for a domain.""" - - def help(self, args): - print args[0], "DOM" - print "\nList virtual network interfaces for domain DOM" - - def main(self, args): - if len(args) != 2: self.err("%s: Invalid argument(s)" % args[0]) - dom = args[1] - for x in server.xend_domain_devices(dom, 'vif'): - sxp.show(x) + # Work around for gopts + args.insert(0,"bogus") + gopts.parse(args) + if not (1 <= len(args) <= 2): + err('Invalid arguments: ' + str(args)) + + from xen.xend.XendClient import server + if not gopts.vals.clear: + print server.xend_node_get_dmesg() + else: + server.xend_node_clear_dmesg() + +def xm_log(args): + from xen.xend.XendClient import server + print server.xend_node_log() + +def xm_network_limit(args): + arg_check(args,4,"network-limit") + dom = args[0] + v = map(int, args[1:4]) + from xen.xend.XendClient import server + server.xend_domain_vif_limit(dom, *v) + +def xm_network_list(args): + arg_check(args,1,"network-list") + dom = args[0] + from xen.xend.XendClient import server + for x in server.xend_domain_devices(dom, 'vif'): + sxp.show(x) + print + +def xm_block_list(args): + arg_check(args,1,"block-list") + dom = args[0] + from xen.xend.XendClient import server + for x in server.xend_domain_devices(dom, 'vbd'): + sxp.show(x) + print + +def xm_block_create(args): + n = len(args) + if n < 4 or n > 5: + err("%s: Invalid argument(s)" % args[0]) + usage("block-create") + + dom = args[0] + vbd = ['vbd', + ['uname', args[1]], + ['dev', args[2]], + ['mode', args[3]]] + if n == 5: + vbd.append(['backend', args[4]]) + + from xen.xend.XendClient import server + server.xend_domain_device_create(dom, vbd) + +def xm_block_refresh(args): + 
arg_check(args,2,"block-refresh") + + dom = args[0] + dev = args[1] + + from xen.xend.XendClient import server + server.xend_domain_device_refresh(dom, 'vbd', dev) + +def xm_block_destroy(args): + arg_check(args,2,"block-destroy") + + dom = args[0] + dev = args[1] + + from xen.xend.XendClient import server + server.xend_domain_device_destroy(dom, 'vbd', dev) + +commands = { + # console commands + "console": xm_console, + # xenstat commands + "top": xm_top, + # domain commands + "domid": xm_domid, + "domname": xm_domname, + "create": xm_create, + "destroy": xm_destroy, + "restore": xm_restore, + "save": xm_save, + "shutdown": xm_shutdown, + "reboot": xm_reboot, + "list": xm_list, + # memory commands + "mem-max": xm_mem_max, + "mem-set": xm_mem_set, + # cpu commands + "cpus-set": xm_cpus_set, +# "cpus-list": xm_cpus_list, + "vcpu-enable": xm_vcpu_enable, + "vcpu-disable": xm_vcpu_disable, + "vcpu-list": xm_vcpu_list, + # migration + "migrate": xm_migrate, + # special + "sysrq": xm_sysrq, + "pause": xm_pause, + "unpause": xm_unpause, + # host commands + "dmesg": xm_dmesg, + "info": xm_info, + "log": xm_log, + # scheduler + "bvt": xm_bvt, + "bvt_ctxallow": xm_bvt_ctxallow, + "sedf": xm_sedf, + # block + "block-create": xm_block_create, + "block-destroy": xm_block_destroy, + "block-list": xm_block_list, + "block-refresh": xm_block_refresh, + # network + "network-limit": xm_network_limit, + "network-list": xm_network_list + } + +aliases = { + "balloon": "mem-set", + "vif-list": "network-list", + "vif-limit": "network-limit", + "vbd-create": "block-create", + "vbd-destroy": "block-destroy", + "vbd-list": "block-list", + "vbd-refresh": "block-refresh", + } + +help = { + "--long": longhelp + } + +def xm_lookup_cmd(cmd): + if commands.has_key(cmd): + return commands[cmd] + elif aliases.has_key(cmd): + deprecated(cmd,aliases[cmd]) + return commands[aliases[cmd]] + else: + if len( cmd ) > 1: + matched_commands = filter( lambda (command, func): command[ 0:len(cmd) ] == cmd, 
commands.iteritems() ) + if len( matched_commands ) == 1: + return matched_commands[0][1] + err('Sub Command %s not found!' % cmd) + usage() + +def deprecated(old,new): + err('Option %s is deprecated, and will be removed in future!!!' % old) + err('Option %s is the new replacement, see "xm help %s" for more info' % (new, new)) + +def usage(cmd=None): + if cmd == "full": + print fullhelp + elif help.has_key(cmd): + print help[cmd] + else: + print shorthelp + sys.exit(1) + +def main(argv=sys.argv): + if len(argv) < 2: + usage() + + if re.compile('-*help').match(argv[1]): + if len(argv) > 2 and help.has_key(argv[2]): + usage(argv[2]) + else: + usage() + sys.exit(0) + + cmd = xm_lookup_cmd(argv[1]) + + # strip off prog name and subcmd + args = argv[2:] + if cmd: + try: + from xen.xend.XendClient import XendError + rc = cmd(args) + if rc: + usage() + except socket.error, ex: + print >>sys.stderr, ex + err("Error connecting to xend, is xend running?") + sys.exit(1) + except IOError: + err("Most commands need root access. 
Please try again as root") + sys.exit(1) + except XendError, ex: + if args[0] == "bogus": + args.remove("bogus") + if len(args) > 0: + handle_xend_error(argv[1], args[0], ex) + else: + print "Unexpected error:", sys.exc_info()[0] + print + print "Please report to xen-devel@xxxxxxxxxxxxxxxxxxx" + raise + except SystemExit: + sys.exit(1) + except: + print "Unexpected error:", sys.exc_info()[0] print - -xm.prog(ProgVifList) - -class ProgVbdList(Prog): - group = 'vbd' - name = 'vbd-list' - info = """List virtual block devices for a domain.""" - - def help(self, args): - print args[0], "DOM" - print "\nList virtual block devices for domain DOM" - - def main(self, args): - if len(args) != 2: self.err("%s: Invalid argument(s)" % args[0]) - dom = args[1] - for x in server.xend_domain_devices(dom, 'vbd'): - sxp.show(x) - print - -xm.prog(ProgVbdList) - -class ProgVbdCreate(Prog): - group = 'vbd' - name = 'vbd-create' - info = """Create a new virtual block device for a domain""" - - def help(self, args): - print args[0], "DOM UNAME DEV MODE [BACKEND]" - print """ -Create a virtual block device for a domain. - - UNAME - device to export, e.g. phy:hda2 - DEV - device name in the domain, e.g. sda1 - MODE - access mode: r for read, w for read-write - BACKEND - backend driver domain -""" - - def main(self, args): - n = len(args) - if n < 5 or n > 6: self.err("%s: Invalid argument(s)" % args[0]) - dom = args[1] - vbd = ['vbd', - ['uname', args[2]], - ['dev', args[3]], - ['mode', args[4]]] - if n == 6: - vbd.append(['backend', args[5]]) - server.xend_domain_device_create(dom, vbd) - -xm.prog(ProgVbdCreate) - -class ProgVbdRefresh(Prog): - group = 'vbd' - name = 'vbd-refresh' - info = """Refresh a virtual block device for a domain""" - - def help(self, args): - print args[0], "DOM DEV" - print """ -Refresh a virtual block device for a domain. 
- - DEV - idx field in the device information -""" - - def main(self, args): - if len(args) != 3: self.err("%s: Invalid argument(s)" % args[0]) - dom = args[1] - dev = args[2] - server.xend_domain_device_refresh(dom, 'vbd', dev) - -xm.prog(ProgVbdRefresh) - - -class ProgVbdDestroy(Prog): - group = 'vbd' - name = 'vbd-destroy' - info = """Destroy a domain's virtual block device""" - - def help(self, args): - print args[0], "DOM DEV" - print """ -Destroy vbd DEV attached to domain DOM. Detaches the device -from the domain, but does not destroy the device contents. -The device indentifier DEV is the idx field in the device -information. This is visible in 'xm vbd-list'.""" - - def main(self, args): - if len(args) != 3: self.err("%s: Invalid argument(s)" % args[0]) - dom = args[1] - dev = args[2] - server.xend_domain_device_destroy(dom, 'vbd', dev) - -xm.prog(ProgVbdDestroy) - -def main(args): - xm.main(args) + print "Please report to xen-devel@xxxxxxxxxxxxxxxxxxx" + raise + + else: + usage() + +if __name__ == "__main__": + main() + + + diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xm/migrate.py --- a/tools/python/xen/xm/migrate.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xm/migrate.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,19 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ """Domain migration. """ diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xm/opts.py --- a/tools/python/xen/xm/opts.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xm/opts.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,20 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ + """Object-oriented command-line option support. 
""" from getopt import getopt, GetoptError diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xm/shutdown.py --- a/tools/python/xen/xm/shutdown.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xm/shutdown.py Thu Aug 25 22:53:20 2005 @@ -1,4 +1,19 @@ -# Copyright (C) 2004 Mike Wray <mike.wray@xxxxxx> +#============================================================================ +# This library is free software; you can redistribute it and/or +# modify it under the terms of version 2.1 of the GNU Lesser General Public +# License as published by the Free Software Foundation. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +#============================================================================ +# Copyright (C) 2004, 2005 Mike Wray <mike.wray@xxxxxx> +#============================================================================ """Domain shutdown. 
""" diff -r 5f1ed597f107 -r 8799d14bef77 tools/python/xen/xm/sysrq.py --- a/tools/python/xen/xm/sysrq.py Wed Aug 24 02:43:18 2005 +++ b/tools/python/xen/xm/sysrq.py Thu Aug 25 22:53:20 2005 @@ -21,9 +21,6 @@ fn=set_true, default=0, use="Print this help.") -def sysrq(dom, req): - server.xend_domain_shutdown(dom, 'sysrq', req) - def main(argv): opts = gopts args = opts.parse(argv) @@ -36,4 +33,4 @@ if len(args) < 2: opts.err('Missing sysrq character') dom = args[0] req = ord(args[1][0]) - sysrq(dom, req) + server.xend_domain_sysrq(dom, req) diff -r 5f1ed597f107 -r 8799d14bef77 tools/security/Makefile --- a/tools/security/Makefile Wed Aug 24 02:43:18 2005 +++ b/tools/security/Makefile Thu Aug 25 22:53:20 2005 @@ -2,27 +2,71 @@ include $(XEN_ROOT)/tools/Rules.mk SRCS = secpol_tool.c -CFLAGS += -static CFLAGS += -Wall CFLAGS += -Werror CFLAGS += -O3 CFLAGS += -fno-strict-aliasing -CFLAGS += -I. +CFLAGS += -I. -I/usr/include/libxml2 +CFLAGS_XML2BIN += $(shell xml2-config --cflags --libs ) +#if above does not work, try -L/usr/lib -lxml2 -lz -lpthread -lm +XML2VERSION = $(shell xml2-config --version ) +VALIDATE_SCHEMA=$(shell if [[ $(XML2VERSION) < 2.6.20 ]]; then echo ""; else echo "-DVALIDATE_SCHEMA"; fi; ) +ifeq ($(ACM_USE_SECURITY_POLICY),ACM_NULL_POLICY) +POLICY=null +endif +ifeq ($(ACM_USE_SECURITY_POLICY),ACM_CHINESE_WALL_POLICY) +POLICY=chwall +endif +ifeq ($(ACM_USE_SECURITY_POLICY),ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY) +POLICY=ste +endif +ifeq ($(ACM_USE_SECURITY_POLICY),ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY) +POLICY=chwall_ste +endif +POLICYFILE=./policies/$(POLICY)/$(POLICY).bin + +ifneq ($(ACM_USE_SECURITY_POLICY), ACM_NULL_POLICY) all: build + +install:all + +default:all +else +all: + +install: + +default: +endif + build: mk-symlinks $(MAKE) secpol_tool + $(MAKE) secpol_xml2bin + chmod 700 ./setlabel.sh + chmod 700 ./updategrub.sh -default: all - -install: all - -secpol_tool : secpol_tool.c +secpol_tool : secpol_tool.c secpol_compat.h $(CC) 
$(CPPFLAGS) $(CFLAGS) -o $@ $< +secpol_xml2bin : secpol_xml2bin.c secpol_xml2bin.h secpol_compat.h + $(CC) $(CPPFLAGS) $(CFLAGS) $(CFLAGS_XML2BIN) $(VALIDATE_SCHEMA) -o $@ $< + clean: - rm -rf secpol_tool xen + rm -rf secpol_tool secpol_xml2bin xen +policy_clean: + rm -rf policies/*/*.bin policies/*/*.map + +mrproper: clean policy_clean + + +$(POLICYFILE) : build + @./secpol_xml2bin $(POLICY) > /dev/null + +boot_install: $(POLICYFILE) + @cp $(POLICYFILE) /boot + @./updategrub.sh $(POLICY) $(PWD)/$(XEN_ROOT) LINUX_ROOT := $(XEN_ROOT)/linux-2.6-xen-sparse mk-symlinks: diff -r 5f1ed597f107 -r 8799d14bef77 tools/security/secpol_tool.c --- a/tools/security/secpol_tool.c Wed Aug 24 02:43:18 2005 +++ b/tools/security/secpol_tool.c Thu Aug 25 22:53:20 2005 @@ -31,18 +31,8 @@ #include <stdlib.h> #include <sys/ioctl.h> #include <string.h> -#include <stdint.h> #include <netinet/in.h> - -typedef uint8_t u8; -typedef uint16_t u16; -typedef uint32_t u32; -typedef uint64_t u64; -typedef int8_t s8; -typedef int16_t s16; -typedef int32_t s32; -typedef int64_t s64; - +#include "secpol_compat.h" #include <xen/acm.h> #include <xen/acm_ops.h> #include <xen/linux/privcmd.h> @@ -270,171 +260,6 @@ } } -/*************************** set policy ****************************/ - -int acm_domain_set_chwallpolicy(void *bufstart, int buflen) -{ -#define CWALL_MAX_SSIDREFS 6 -#define CWALL_MAX_TYPES 10 -#define CWALL_MAX_CONFLICTSETS 2 - - struct acm_chwall_policy_buffer *chwall_bin_pol = - (struct acm_chwall_policy_buffer *) bufstart; - domaintype_t *ssidrefs, *conflicts; - int ret = 0; - int j; - - chwall_bin_pol->chwall_max_types = htonl(CWALL_MAX_TYPES); - chwall_bin_pol->chwall_max_ssidrefs = htonl(CWALL_MAX_SSIDREFS); - chwall_bin_pol->policy_code = htonl(ACM_CHINESE_WALL_POLICY); - chwall_bin_pol->policy_version = htonl(ACM_CHWALL_VERSION); - chwall_bin_pol->chwall_ssid_offset = - htonl(sizeof(struct acm_chwall_policy_buffer)); - chwall_bin_pol->chwall_max_conflictsets = - 
htonl(CWALL_MAX_CONFLICTSETS); - chwall_bin_pol->chwall_conflict_sets_offset = - htonl(ntohl(chwall_bin_pol->chwall_ssid_offset) + - sizeof(domaintype_t) * CWALL_MAX_SSIDREFS * CWALL_MAX_TYPES); - chwall_bin_pol->chwall_running_types_offset = 0; /* not set */ - chwall_bin_pol->chwall_conflict_aggregate_offset = 0; /* not set */ - ret += sizeof(struct acm_chwall_policy_buffer); - /* now push example ssids into the buffer (max_ssidrefs x max_types entries) */ - /* check buffer size */ - if ((buflen - ret) < - (CWALL_MAX_TYPES * CWALL_MAX_SSIDREFS * sizeof(domaintype_t))) - return -1; /* not enough space */ - - ssidrefs = (domaintype_t *) (bufstart + - ntohl(chwall_bin_pol->chwall_ssid_offset)); - memset(ssidrefs, 0, - CWALL_MAX_TYPES * CWALL_MAX_SSIDREFS * sizeof(domaintype_t)); - - /* now set type j-1 for ssidref i+1 */ - for (j = 0; j <= CWALL_MAX_SSIDREFS; j++) - if ((0 < j) && (j <= CWALL_MAX_TYPES)) - ssidrefs[j * CWALL_MAX_TYPES + j - 1] = htons(1); - - ret += CWALL_MAX_TYPES * CWALL_MAX_SSIDREFS * sizeof(domaintype_t); - if ((buflen - ret) < - (CWALL_MAX_CONFLICTSETS * CWALL_MAX_TYPES * sizeof(domaintype_t))) - return -1; /* not enough space */ - - /* now the chinese wall policy conflict sets */ - conflicts = (domaintype_t *) (bufstart + - ntohl(chwall_bin_pol-> - chwall_conflict_sets_offset)); - memset((void *) conflicts, 0, - CWALL_MAX_CONFLICTSETS * CWALL_MAX_TYPES * - sizeof(domaintype_t)); - /* just 1 conflict set [0]={2,3}, [1]={1,5,6} */ - if (CWALL_MAX_TYPES > 3) - { - conflicts[2] = htons(1); - conflicts[3] = htons(1); /* {2,3} */ - conflicts[CWALL_MAX_TYPES + 1] = htons(1); - conflicts[CWALL_MAX_TYPES + 5] = htons(1); - conflicts[CWALL_MAX_TYPES + 6] = htons(1); /* {0,5,6} */ - } - ret += sizeof(domaintype_t) * CWALL_MAX_CONFLICTSETS * CWALL_MAX_TYPES; - return ret; -} - -int acm_domain_set_stepolicy(void *bufstart, int buflen) -{ -#define STE_MAX_SSIDREFS 6 -#define STE_MAX_TYPES 5 - - struct acm_ste_policy_buffer *ste_bin_pol = - (struct 
acm_ste_policy_buffer *) bufstart; - domaintype_t *ssidrefs; - int j, ret = 0; - - ste_bin_pol->ste_max_types = htonl(STE_MAX_TYPES); - ste_bin_pol->ste_max_ssidrefs = htonl(STE_MAX_SSIDREFS); - ste_bin_pol->policy_code = htonl(ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY); - ste_bin_pol->policy_version = htonl(ACM_STE_VERSION); - ste_bin_pol->ste_ssid_offset = - htonl(sizeof(struct acm_ste_policy_buffer)); - ret += sizeof(struct acm_ste_policy_buffer); - /* check buffer size */ - if ((buflen - ret) < - (STE_MAX_TYPES * STE_MAX_SSIDREFS * sizeof(domaintype_t))) - return -1; /* not enough space */ - - ssidrefs = - (domaintype_t *) (bufstart + ntohl(ste_bin_pol->ste_ssid_offset)); - memset(ssidrefs, 0, - STE_MAX_TYPES * STE_MAX_SSIDREFS * sizeof(domaintype_t)); - /* all types 1 for ssidref 1 */ - for (j = 0; j < STE_MAX_TYPES; j++) - ssidrefs[1 * STE_MAX_TYPES + j] = htons(1); - /* now set type j-1 for ssidref j */ - for (j = 0; j < STE_MAX_SSIDREFS; j++) - if ((0 < j) && (j <= STE_MAX_TYPES)) - ssidrefs[j * STE_MAX_TYPES + j - 1] = htons(1); - ret += STE_MAX_TYPES * STE_MAX_SSIDREFS * sizeof(domaintype_t); - return ret; -} - -#define MAX_PUSH_BUFFER 16384 -u8 push_buffer[MAX_PUSH_BUFFER]; - -int acm_domain_setpolicy(int xc_handle) -{ - int ret; - struct acm_policy_buffer *bin_pol; - acm_op_t op; - - /* future: read policy from file and set it */ - bin_pol = (struct acm_policy_buffer *) push_buffer; - bin_pol->policy_version = htonl(ACM_POLICY_VERSION); - bin_pol->magic = htonl(ACM_MAGIC); - bin_pol->primary_policy_code = htonl(ACM_CHINESE_WALL_POLICY); - bin_pol->secondary_policy_code = - htonl(ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY); - - bin_pol->len = htonl(sizeof(struct acm_policy_buffer)); - bin_pol->primary_buffer_offset = htonl(ntohl(bin_pol->len)); - ret = - acm_domain_set_chwallpolicy(push_buffer + - ntohl(bin_pol->primary_buffer_offset), - MAX_PUSH_BUFFER - - ntohl(bin_pol->primary_buffer_offset)); - if (ret < 0) - { - printf("ERROR creating chwallpolicy buffer.\n"); - 
return -1; - } - bin_pol->len = htonl(ntohl(bin_pol->len) + ret); - bin_pol->secondary_buffer_offset = htonl(ntohl(bin_pol->len)); - ret = acm_domain_set_stepolicy(push_buffer + - ntohl(bin_pol->secondary_buffer_offset), - MAX_PUSH_BUFFER - - ntohl(bin_pol->secondary_buffer_offset)); - if (ret < 0) - { - printf("ERROR creating chwallpolicy buffer.\n"); - return -1; - } - bin_pol->len = htonl(ntohl(bin_pol->len) + ret); - - /* dump it and then push it down into xen/acm */ - acm_dump_policy_buffer(push_buffer, ntohl(bin_pol->len)); - - op.cmd = ACM_SETPOLICY; - op.interface_version = ACM_INTERFACE_VERSION; - op.u.setpolicy.pushcache = (void *) push_buffer; - op.u.setpolicy.pushcache_size = ntohl(bin_pol->len); - ret = do_acm_op(xc_handle, &op); - - if (ret) - printf("ERROR setting policy. Use 'xm dmesg' to see details.\n"); - else - printf("Successfully changed policy.\n"); - - return ret; -} - /******************************* get policy ******************************/ #define PULL_CACHE_SIZE 8192 @@ -602,7 +427,6 @@ void usage(char *progname) { printf("Use: %s \n" - "\t setpolicy\n" "\t getpolicy\n" "\t dumpstats\n" "\t loadpolicy <binary policy file>\n", progname); @@ -612,7 +436,7 @@ int main(int argc, char **argv) { - int acm_cmd_fd, ret; + int acm_cmd_fd, ret = 0; if (argc < 2) usage(argv[0]); @@ -623,12 +447,7 @@ exit(-1); } - if (!strcmp(argv[1], "setpolicy")) - { - if (argc != 2) - usage(argv[0]); - ret = acm_domain_setpolicy(acm_cmd_fd); - } else if (!strcmp(argv[1], "getpolicy")) { + if (!strcmp(argv[1], "getpolicy")) { if (argc != 2) usage(argv[0]); ret = acm_domain_getpolicy(acm_cmd_fd); diff -r 5f1ed597f107 -r 8799d14bef77 tools/sv/inc/style.css --- a/tools/sv/inc/style.css Wed Aug 24 02:43:18 2005 +++ b/tools/sv/inc/style.css Thu Aug 25 22:53:20 2005 @@ -1,32 +1,95 @@ +.small { + font-size: 10px +} -P {font-family: verdana, arial; font-size: 12px; color: black} -.small {font-size: 10px} +TD.domainInfo { + font-size: 10px; + color: black +} 
-TD.domainInfo {font-family: verdana, arial; font-size: 10px; color: black} -TD.domainInfoHead {font-family: verdana, arial; font-size: 10px; color: white; font-face: bold} +TD.domainInfoHead { + font-size: 10px; + color: white; + font-face: bold +} TD.domainInfoHead {background-color: black} TR.domainInfoOdd {background-color: white} TR.domainInfoEven {background-color: lightgrey} body { - width: 670px; - margin: 0px; - padding: 0px; - background-color: #fff; - background-image: url(../images/orb_02.jpg); - background-repeat: repeat-y; - background-position: left top; - font-family: Arial, Helvetica, sans-serif; - font-weight: bold; - color: #333333; - letter-spacing: 0px; - scrollbar-base-color: #333333; - scrollbar-track-color: #666666; - scrollbar-face-color: #fff; - - - } - -.button (cursor:hand) - + margin: 0px; + padding: 0px; + font-family: Arial, Helvetica, sans-serif; + font-size: 12px; + color: #000000; +} + +div#menu { + position: absolute; + left: 10px; + top: 10px; + width: 160px; + padding: 10px; + border: 0px solid black; + text-align: center; +} + +div#main { + position: absolute; + left: 200px; + top: 10px; + right: 10px; + padding: 10px; + border: 0px solid black; +} + +div.button { + float: right; + margin: 10px 0px 0px 10px; + padding: 5px; + text-align: center; + border: 1px solid black; + background: gray; + cursor: hand; +} + +div.tabButton { + position: relative; + top: 0px; + float: left; + margin: 0px 10px -1px 0px; + padding: 5px; + text-align: center; + border: 1px solid black; + background: gray; + cursor: hand; +} + +div.tabButton#activeTab { + top: 0px; + background: white; + border-color: black black white black; +} + +div.button:hover, div.tabButton:hover { + background: white; +} + +div.button a, div.tabButton a { + font-size: 12px; + font-weight: bold; +} + +div.title { + float: right; + font-size: 14px; + font-weight: bold; +} + +div.tab { + overflow: auto; + clear: both; + border: 1px solid black; + padding: 10px; +} diff -r 
5f1ed597f107 -r 8799d14bef77 tools/sv/index.psp --- a/tools/sv/index.psp Wed Aug 24 02:43:18 2005 +++ b/tools/sv/index.psp Thu Aug 25 22:53:20 2005 @@ -7,158 +7,29 @@ for path in sys.path: if debug: req.write( path + "<br/>" ) -from xen.sv.HTMLBase import HTMLBase -from xen.sv.DomList import DomList -from xen.sv.NodeInfo import NodeInfo -from xen.sv.DomInfo import DomInfo -from xen.sv.CreateDomain import CreateDomain -from xen.sv.MigrateDomain import MigrateDomain -from xen.sv.SaveDomain import SaveDomain -from xen.sv.RestoreDomain import RestoreDomain - -from xen.xend.XendClient import server - -from xen.sv.util import getVar - -# adapter to make this all work with mod_python -# (c) Tom Wilkie 2005 - -class TwistedAdapter: - - def write( self, text ): - req.write( text ) - - class Args: - - from mod_python.util import FieldStorage - - fieldStorage = FieldStorage( req, True ) - - # return a list of values for the given key, - # or None if key not there - def get( self, var ): - retVar = self.fieldStorage.getlist( var ) - if len( retVar ) == 0: - return None - else: - return retVar - - # return a list of tuples, - # (key, value) where value is a list of values - def items( self ): - result = []; - for key in self.fieldStorage.keys(): - result.append( (key, self.fieldStorage.getlist( key ) ) ) - return result - - args = Args() - - uri = req.unparsed_uri - - -class Main( HTMLBase ): - - isLeaf = True - - def __init__( self, urlWriter = None ): - self.modules = { "node": NodeInfo, - "list": DomList, - "info": DomInfo, - "create": CreateDomain, - "migrate" : MigrateDomain, - "save" : SaveDomain, - "restore" : RestoreDomain } - - # ordered list of module menus to display - self.module_menus = [ "node", "create", "migrate", "save", - "restore", "list" ] - HTMLBase.__init__(self) - - def render_POST( self, request ): - - #decide what module post'd the action - - args = getVar( 'args', request ) - - mod = getVar( 'mod', request ) - - if mod in self.modules and args is None: 
- module = self.modules[ mod ] - #check module exists - if module: - module( self.mainUrlWriter ).perform( request ) - else: - self.perform( request ) - - return self.render_GET( request ) - - #TODO: need to make this get the request uri automatically - def mainUrlWriter( self, module ): - def fun( f ): - return "index.psp?mod=%s%s" % ( module, f ) - return fun - - def write_BODY( self, request ): - - request.write( "\n<table style='border:0px solid black; background: url(images/orb_01.jpg) no-repeat' cellspacing='0' cellpadding='0' border='0' width='780px' height='536px'>\n" ) - request.write( "<tr>\n" ) - request.write( " <td width='15px'> </td>" ) - request.write( " <td width='175px' align='center' valign'center'>" ) - request.write( " <table cellspacing='0' cellpadding='0' border='0' width='100%' height='100%'>" ) - request.write( " <tr><td height='140px' align='center' valign='bottom'><a href='http://www.cl.cam.ac.uk/Research/SRG/netos/xen/'>" ) - request.write( " <img src='images/xen.png' width='150' height='75' border='0'/></a><br/></td></tr>" ) - request.write( " <tr><td height='60px' align='center'><p class='small'>SV Web Interface<br/>(C) <a href='mailto:tw275@xxxxxxxxx'>Tom Wilkie</a> 2004</p></td></tr>") - request.write( " <tr><td align='center' valign='top'>" ) - - for modName in self.module_menus: - self.modules[modName]( self.mainUrlWriter( modName ) ).write_MENU( request ) - - request.write( " </td></tr>" ) - request.write( " </table>" ) - request.write( " " ) - request.write( " </td>\n" ) - request.write( " <td width='15px'> </td>" ) - request.write( " <td width='558px' align='left' valign='top'>" ) - request.write( " <table cellspacing='0' cellpadding='0' border='0' width='100%' height='100%'>" ) - request.write( " <tr><td height='20px'></td></tr>" ) - request.write( " <tr><td align='center' valign='top'>" ) - - modName = getVar('mod', request) - - if modName not in self.modules: - request.write( '<p>Please select a module</p>' ) - else: - module 
= self.modules[ modName ] - if module: - module( self.mainUrlWriter( modName ) ).write_BODY( request ) - else: - request.write( '<p>Invalid module. Please select another</p>' ) - - request.write( " </td></tr>" ) - request.write( " </table>" ) - request.write( " </td>\n" ) - request.write( " <td width='17px'> </td>" ) - request.write( "</tr>\n" ) - - request.write( "</table>\n" ) - - - def op_destroy( self, request ): - dom = getVar( 'dom', request ) - if not dom is None and dom != "0": - server.xend_domain_destroy( int( dom ), "halt" ) - - def op_pause( self, request ): - dom = getVar( 'dom', request ) - if not dom is None and dom != "0": - server.xend_domain_pause( int( dom ) ) - - def op_unpause( self, request ): - dom = getVar( 'dom', request ) - if not dom is None and dom != "0": - server.xend_domain_unpause( int( dom ) ) +from xen.sv.Main import Main, TwistedAdapter main = Main() - -main.render_POST( TwistedAdapter() ) +request = TwistedAdapter( req ) +main.do_POST( request ) %> +<html> +<head> + <title>XenSV</title> + <script src="inc/script.js"></script> + <link rel="StyleSheet" type="text/css" href="inc/style.css"> +</head> +<body> + <form method="post" action="<%=request.uri%>"> + <div id="menu"> + <img src="images/xen.png"> + <% main.render_menu( request ) %> + </div> + <div id="main"> + <% main.render_main( request ) %> + </div> + <input type="hidden" name="op" value=""> + <input type="hidden" name="args" value=""> + </form> +</body> +</html> diff -r 5f1ed597f107 -r 8799d14bef77 tools/xcs/Makefile --- a/tools/xcs/Makefile Wed Aug 24 02:43:18 2005 +++ b/tools/xcs/Makefile Thu Aug 25 22:53:20 2005 @@ -34,10 +34,10 @@ xcsdump: xcsdump.c dump.c $(CC) $(CFLAGS) -o xcsdump xcsdump.c -L$(XEN_LIBXC) \ - ctrl_interface.c evtchn.c dump.c -lxc + ctrl_interface.c evtchn.c dump.c -lxenctrl $(BIN): $(OBJS) - $(CC) $(CFLAGS) $^ -o $@ -L$(XEN_LIBXC) -lxc + $(CC) $(CFLAGS) $^ -o $@ -L$(XEN_LIBXC) -lxenctrl $(OBJS): $(HDRS) diff -r 5f1ed597f107 -r 8799d14bef77 
tools/xcs/dump.h --- a/tools/xcs/dump.h Wed Aug 24 02:43:18 2005 +++ b/tools/xcs/dump.h Thu Aug 25 22:53:20 2005 @@ -20,7 +20,7 @@ #define XENCTLD_ERROR_H #include <stdint.h> -#include <xc.h> +#include <xenctrl.h> #include <xen/io/domain_controller.h> void dump_msg(const control_msg_t *msg, uint64_t flags); diff -r 5f1ed597f107 -r 8799d14bef77 tools/xcs/xcs.h --- a/tools/xcs/xcs.h Wed Aug 24 02:43:18 2005 +++ b/tools/xcs/xcs.h Thu Aug 25 22:53:20 2005 @@ -11,7 +11,7 @@ #define __XCS_H__ #include <pthread.h> -#include <xc.h> +#include <xenctrl.h> #include <xen/xen.h> #include <xen/io/domain_controller.h> #include <xen/linux/privcmd.h> diff -r 5f1ed597f107 -r 8799d14bef77 tools/xcs/xcsdump.c --- a/tools/xcs/xcsdump.c Wed Aug 24 02:43:18 2005 +++ b/tools/xcs/xcsdump.c Thu Aug 25 22:53:20 2005 @@ -16,7 +16,7 @@ #include <sys/socket.h> #include <sys/un.h> #include <ctype.h> -#include <xc.h> +#include <xenctrl.h> #include <xen/xen.h> #include <xen/io/domain_controller.h> #include <getopt.h> diff -r 5f1ed597f107 -r 8799d14bef77 tools/xcutils/Makefile --- a/tools/xcutils/Makefile Wed Aug 24 02:43:18 2005 +++ b/tools/xcutils/Makefile Thu Aug 25 22:53:20 2005 @@ -30,7 +30,7 @@ PROGRAMS = xc_restore xc_save -LDLIBS = -L$(XEN_LIBXC) -lxc +LDLIBS = -L$(XEN_LIBXC) -lxenguest -lxenctrl .PHONY: all all: build diff -r 5f1ed597f107 -r 8799d14bef77 tools/xcutils/xc_restore.c --- a/tools/xcutils/xc_restore.c Wed Aug 24 02:43:18 2005 +++ b/tools/xcutils/xc_restore.c Thu Aug 25 22:53:20 2005 @@ -7,24 +7,33 @@ * */ +#include <err.h> #include <stdlib.h> +#include <stdint.h> #include <stdio.h> -#include <err.h> -#include <xc.h> +#include <xenguest.h> int main(int argc, char **argv) { - unsigned int xc_fd, io_fd, domid, nr_pfns; + unsigned int xc_fd, io_fd, domid, nr_pfns, evtchn; + int ret; + unsigned long mfn; - if (argc != 5) - errx(1, "usage: %s xcfd iofd domid nr_pfns", argv[0]); + if (argc != 6) + errx(1, "usage: %s xcfd iofd domid nr_pfns evtchn", argv[0]); xc_fd = atoi(argv[1]); 
io_fd = atoi(argv[2]); domid = atoi(argv[3]); nr_pfns = atoi(argv[4]); + evtchn = atoi(argv[5]); - return xc_linux_restore(xc_fd, io_fd, domid, nr_pfns); + ret = xc_linux_restore(xc_fd, io_fd, domid, nr_pfns, evtchn, &mfn); + if (ret == 0) { + printf("store-mfn %li\n", mfn); + fflush(stdout); + } + return ret; } diff -r 5f1ed597f107 -r 8799d14bef77 tools/xcutils/xc_save.c --- a/tools/xcutils/xc_save.c Wed Aug 24 02:43:18 2005 +++ b/tools/xcutils/xc_save.c Thu Aug 25 22:53:20 2005 @@ -7,11 +7,12 @@ * */ +#include <err.h> #include <stdlib.h> +#include <stdint.h> #include <stdio.h> -#include <err.h> -#include <xc.h> +#include <xenguest.h> int main(int argc, char **argv) diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/Makefile --- a/tools/xenstore/Makefile Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/Makefile Thu Aug 25 22:53:20 2005 @@ -1,6 +1,5 @@ XEN_ROOT=../.. include $(XEN_ROOT)/tools/Rules.mk -LIBDIR = lib XEN_LIBXC = $(XEN_ROOT)/tools/libxc INSTALL = install @@ -25,7 +24,7 @@ TESTFLAGS= -DTESTING TESTENV = XENSTORED_ROOTDIR=$(TESTDIR) XENSTORED_RUNDIR=$(TESTDIR) -all: xen xenstored libxenstore.a libxenstore-pic.a +all: xen xenstored libxenstore.so testcode: xen xs_test xenstored_test xs_random xs_dom0_test @@ -33,7 +32,7 @@ ln -sf $(XEN_ROOT)/xen/include/public $@ xenstored: xenstored_core.o xenstored_watch.o xenstored_domain.o xenstored_transaction.o xs_lib.o talloc.o utils.o - $(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -lxc -o $@ + $(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -lxenctrl -o $@ xenstored_test: xenstored_core_test.o xenstored_watch_test.o xenstored_domain_test.o xenstored_transaction_test.o xs_lib.o talloc_test.o fake_libxc.o utils.o $(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -o $@ @@ -41,9 +40,9 @@ xs_test: xs_test.o xs_lib.o utils.o xs_random: xs_random.o xs_test_lib.o xs_lib.o talloc.o utils.o xs_stress: xs_stress.o xs_test_lib.o xs_lib.o talloc.o utils.o -xs_watch_stress: xs_watch_stress.o xs_test_lib.o xs_lib.o talloc.o utils.o +xs_crashme: xs_crashme.o 
xs_lib.o talloc.o utils.o -xs_test.o xs_stress.o xs_watch_stress.o xenstored_core_test.o xenstored_watch_test.o xenstored_transaction_test.o xenstored_domain_test.o xs_random.o xs_test_lib.o talloc_test.o fake_libxc.o: CFLAGS=$(BASECFLAGS) $(TESTFLAGS) +xs_test.o xs_stress.o xenstored_core_test.o xenstored_watch_test.o xenstored_transaction_test.o xenstored_domain_test.o xs_random.o xs_test_lib.o talloc_test.o fake_libxc.o xs_crashme.o: CFLAGS=$(BASECFLAGS) $(TESTFLAGS) xenstored_%_test.o: xenstored_%.c $(COMPILE.c) -o $@ $< @@ -54,25 +53,30 @@ talloc_test.o: talloc.c $(COMPILE.c) -o $@ $< -LIB_OBJS := xs.o xs_lib.o - -LIB_OBJS_A := $(patsubst %.o,libxenstore.a(%.o),$(LIB_OBJS)) -LIB_OBJS_PIC := $(patsubst %.o,libxenstore-pic.a(%.opic),$(LIB_OBJS)) - -libxenstore.a: $(LIB_OBJS_A) - -libxenstore-pic.a: $(LIB_OBJS_PIC) +libxenstore.so: xs.opic xs_lib.opic + $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-soname -Wl,libxenstore.so -shared -o $@ $^ clean: testsuite-clean - rm -f *.o *.opic *.a - rm -f xen xenstored xs_random xs_stress xs_watch_stress + rm -f *.o *.opic *.so + rm -f xen xenstored xs_random xs_stress xs_crashme rm -f xs_test xenstored_test xs_dom0_test - -$(RM) $(PROG_DEP) + $(RM) $(PROG_DEP) -check: testsuite-run randomcheck stresstest +print-dir: + @echo -n tools/xenstore: + +print-end: + @echo + +check: print-dir testsuite-fast randomcheck-fast print-end + +fullcheck: testsuite-run randomcheck stresstest testsuite-run: xen xenstored_test xs_test - $(TESTENV) testsuite/test.sh + $(TESTENV) testsuite/test.sh && echo + +testsuite-fast: xen xenstored_test xs_test + @$(TESTENV) testsuite/test.sh --fast testsuite-clean: rm -rf $(TESTDIR) @@ -81,18 +85,25 @@ # fail. 
RANDSEED=$(shell date +%s) randomcheck: xs_random xenstored_test - $(TESTENV) ./xs_random --simple --fast /tmp/xs_random 200000 $(RANDSEED) - $(TESTENV) ./xs_random --fast /tmp/xs_random 100000 $(RANDSEED) + $(TESTENV) ./xs_random --simple --fast /tmp/xs_random 200000 $(RANDSEED) && echo + $(TESTENV) ./xs_random --fast /tmp/xs_random 100000 $(RANDSEED) && echo $(TESTENV) ./xs_random --fail /tmp/xs_random 10000 $(RANDSEED) -stresstest: xs_stress xs_watch_stress xenstored_test +crashme: xs_crashme xenstored_test + rm -rf $(TESTDIR)/store $(TESTDIR)/transactions /tmp/xs_crashme.vglog* /tmp/trace + export $(TESTENV); ./xs_crashme 5000 $(RANDSEED) 2>/dev/null + if [ -n "`cat /tmp/xs_crashme.vglog*`" ]; then echo Valgrind complained; cat /tmp/xs_crashme.vglog*; exit 1; fi + rm -rf $(TESTDIR)/store $(TESTDIR)/transactions /tmp/xs_crashme.vglog* /tmp/trace + +randomcheck-fast: xs_random xenstored_test + @$(TESTENV) ./xs_random --fast /tmp/xs_random 2000 $(RANDSEED) + +stresstest: xs_stress xenstored_test rm -rf $(TESTDIR)/store $(TESTDIR)/transactions export $(TESTENV); PID=`./xenstored_test --output-pid --trace-file=/tmp/trace`; ./xs_stress 5000; ret=$$?; kill $$PID; exit $$ret - rm -rf $(TESTDIR)/store $(TESTDIR)/transactions - export $(TESTENV); PID=`./xenstored_test --output-pid`; ./xs_watch_stress; ret=$$?; kill $$PID; exit $$ret xs_dom0_test: xs_dom0_test.o utils.o - $(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -lxc -o $@ + $(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -lxenctrl -o $@ TAGS: etags `find . -name '*.[ch]'` @@ -100,15 +111,14 @@ tarball: clean cd .. 
&& tar -c -j -v -h -f xenstore.tar.bz2 xenstore/ -install: xenstored libxenstore.a libxenstore-pic.a +install: xenstored libxenstore.so $(INSTALL_DIR) -p $(DESTDIR)/var/run/xenstored $(INSTALL_DIR) -p $(DESTDIR)/var/lib/xenstored $(INSTALL_DIR) -p $(DESTDIR)/usr/sbin $(INSTALL_DIR) -p $(DESTDIR)/usr/include $(INSTALL_PROG) xenstored $(DESTDIR)/usr/sbin $(INSTALL_DIR) -p $(DESTDIR)/usr/$(LIBDIR) - $(INSTALL_DATA) libxenstore.a $(DESTDIR)/usr/$(LIBDIR) - $(INSTALL_DATA) libxenstore-pic.a $(DESTDIR)/usr/$(LIBDIR) + $(INSTALL_DATA) libxenstore.so $(DESTDIR)/usr/$(LIBDIR) $(INSTALL_DATA) xs.h $(DESTDIR)/usr/include $(INSTALL_DATA) xs_lib.h $(DESTDIR)/usr/include diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/testsuite/test.sh --- a/tools/xenstore/testsuite/test.sh Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/testsuite/test.sh Thu Aug 25 22:53:20 2005 @@ -7,20 +7,20 @@ { rm -rf $XENSTORED_ROOTDIR mkdir $XENSTORED_ROOTDIR -# Weird failures with this. - if type valgrind >/dev/null 2>&1; then - valgrind -q --logfile-fd=3 ./xenstored_test --output-pid --trace-file=testsuite/tmp/trace --no-fork 3>testsuite/tmp/vgout > /tmp/pid 2> testsuite/tmp/xenstored_errors & + if [ $VALGRIND -eq 1 ]; then + valgrind --suppressions=testsuite/vg-suppressions -q ./xenstored_test --output-pid --trace-file=testsuite/tmp/trace --no-fork > /tmp/pid 2> testsuite/tmp/xenstored_errors & while [ ! -s /tmp/pid ]; do sleep 0; done PID=`cat /tmp/pid` rm /tmp/pid else - PID=`./xenstored_test --output-pid` + # We don't get error messages from this, though. 
+ PID=`./xenstored_test --output-pid --trace-file=testsuite/tmp/trace` fi - if sh -e $2 $1; then - if [ -s testsuite/tmp/vgout ]; then + if ./xs_test $2 $1; then + if [ -s testsuite/tmp/xenstored_errors ]; then kill $PID - echo VALGRIND errors: - cat testsuite/tmp/vgout + echo Errors: + cat testsuite/tmp/xenstored_errors return 1 fi echo shutdown | ./xs_test @@ -33,15 +33,29 @@ fi } +if [ x$1 = x--fast ]; then + VALGRIND=0 + SLOWTESTS="" + shift +else + if type valgrind >/dev/null 2>&1; then + VALGRIND=1 + else + echo "WARNING: valgrind not available" >&2 + VALGRIND=0 + fi + SLOWTESTS=testsuite/[0-9]*.slowtest +fi + MATCH=${1:-"*"} -for f in testsuite/[0-9]*.sh; do +for f in testsuite/[0-9]*.test $SLOWTESTS; do case `basename $f` in $MATCH) RUN=1;; esac [ -n "$RUN" ] || continue - if run_test $f; then - echo Test $f passed... + + if run_test $f -x >/tmp/out; then + echo -n . else - echo Test $f failed, running verbosely... - run_test $f -x || true + cat /tmp/out # That will have filled the screen, repeat message. echo Test $f failed exit 1 diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/utils.c --- a/tools/xenstore/utils.c Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/utils.c Thu Aug 25 22:53:20 2005 @@ -80,30 +80,6 @@ barf("malloc of %zu failed", size); } -/* Stevens. */ -void daemonize(void) -{ - pid_t pid; - - /* Separate from our parent via fork, so init inherits us. */ - if ((pid = fork()) < 0) - barf_perror("Failed to fork daemon"); - if (pid != 0) - exit(0); - - close(STDIN_FILENO); - close(STDOUT_FILENO); - close(STDERR_FILENO); - - /* Session leader so ^C doesn't whack us. */ - setsid(); - /* Move off any mount points we might be in. */ - chdir("/"); - /* Discard our parent's old-fashioned umask prejudices. 
*/ - umask(0); -} - - /* This version adds one byte (for nul term) */ void *grab_file(const char *filename, unsigned long *size) { diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/utils.h --- a/tools/xenstore/utils.h Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/utils.h Thu Aug 25 22:53:20 2005 @@ -40,9 +40,6 @@ void *grab_file(const char *filename, unsigned long *size); void release_file(void *data, unsigned long size); -/* For writing daemons, based on Stevens. */ -void daemonize(void); - /* Signal handling: returns fd to listen on. */ int signal_to_fd(int signal); void close_signal(int fd); diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/xenstored.h --- a/tools/xenstore/xenstored.h Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/xenstored.h Thu Aug 25 22:53:20 2005 @@ -1,21 +1,29 @@ -/* - Simple prototyle Xen Store Daemon providing simple tree-like database. - Copyright (C) 2005 Rusty Russell IBM Corporation +/* + * Simple prototyle Xen Store Daemon providing simple tree-like database. + * Copyright (C) 2005 Rusty Russell IBM Corporation + * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -*/ #ifndef _XENSTORED_H #define _XENSTORED_H diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/xenstored_core.c --- a/tools/xenstore/xenstored_core.c Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/xenstored_core.c Thu Aug 25 22:53:20 2005 @@ -252,6 +252,7 @@ int ret; struct buffered_data *out = conn->out; + assert(conn->state != BLOCKED); if (out->inhdr) { if (verbose) xprintf("Writing msg %s (%s) out to %p\n", @@ -289,6 +290,10 @@ talloc_free(out); queue_next_event(conn); + + /* No longer busy? 
*/ + if (!conn->out) + conn->state = OK; return true; } @@ -418,14 +423,24 @@ return node_dir_inside_transaction(trans, node); } +static char *datafile(const char *dir) +{ + return talloc_asprintf(dir, "%s/.data", dir); +} + static char *node_datafile(struct transaction *trans, const char *node) { - return talloc_asprintf(node, "%s/.data", node_dir(trans, node)); + return datafile(node_dir(trans, node)); +} + +static char *permfile(const char *dir) +{ + return talloc_asprintf(dir, "%s/.perms", dir); } static char *node_permfile(struct transaction *trans, const char *node) { - return talloc_asprintf(node, "%s/.perms", node_dir(trans, node)); + return permfile(node_dir(trans, node)); } struct buffered_data *new_buffer(void *ctx) @@ -492,6 +507,8 @@ conn->waiting_reply = bdata; } else conn->out = bdata; + assert(conn->state != BLOCKED); + conn->state = BUSY; } /* Some routines (write, mkdir, etc) just need a non-error return */ @@ -504,11 +521,13 @@ { unsigned int i; - for (i = 0; error != xsd_errors[i].errnum; i++) - if (i == ARRAY_SIZE(xsd_errors) - 1) - corrupt(conn, "Unknown error %i (%s)", error, - strerror(error)); - + for (i = 0; error != xsd_errors[i].errnum; i++) { + if (i == ARRAY_SIZE(xsd_errors) - 1) { + eprintf("xenstored: error %i untranslatable", error); + i = 0; /* EINVAL */ + break; + } + } send_reply(conn, XS_ERROR, xsd_errors[i].errstring, strlen(xsd_errors[i].errstring) + 1); } @@ -542,21 +561,20 @@ /* We expect one arg in the input: return NULL otherwise. */ static const char *onearg(struct buffered_data *in) { - if (get_string(in, 0) != in->used) + if (!in->used || get_string(in, 0) != in->used) return NULL; return in->buffer; } /* If it fails, returns NULL and sets errno. 
*/ -static struct xs_permissions *get_perms(struct transaction *transaction, - const char *node, unsigned int *num) +static struct xs_permissions *get_perms(const char *dir, unsigned int *num) { unsigned int size; char *strings; struct xs_permissions *ret; int *fd; - fd = talloc_open(node_permfile(transaction, node), O_RDONLY, 0); + fd = talloc_open(permfile(dir), O_RDONLY, 0); if (!fd) return NULL; strings = read_all(fd, &size); @@ -564,14 +582,14 @@ return NULL; *num = xs_count_strings(strings, size); - ret = talloc_array(node, struct xs_permissions, *num); + ret = talloc_array(dir, struct xs_permissions, *num); if (!xs_strings_to_perms(ret, *num, strings)) - corrupt(NULL, "Permissions corrupt for %s", node); + corrupt(NULL, "Permissions corrupt for %s", dir); return ret; } -static char *perms_to_strings(const char *node, +static char *perms_to_strings(const void *ctx, struct xs_permissions *perms, unsigned int num, unsigned int *len) { @@ -583,7 +601,7 @@ if (!xs_perm_to_string(&perms[i], buffer)) return NULL; - strings = talloc_realloc(node, strings, char, + strings = talloc_realloc(ctx, strings, char, *len + strlen(buffer) + 1); strcpy(strings + *len, buffer); *len += strlen(buffer) + 1; @@ -616,16 +634,23 @@ return 0; } +/* Create a self-destructing temporary path */ +static char *temppath(const char *path) +{ + char *tmppath = talloc_asprintf(path, "%s.tmp", path); + talloc_set_destructor(tmppath, destroy_path); + return tmppath; +} + /* Create a self-destructing temporary file */ static char *tempfile(const char *path, void *contents, unsigned int len) { int *fd; - char *tmppath = talloc_asprintf(path, "%s.tmp", path); + char *tmppath = temppath(path); fd = talloc_open(tmppath, O_WRONLY|O_CREAT|O_EXCL, 0640); if (!fd) return NULL; - talloc_set_destructor(tmppath, destroy_path); if (!xs_write_all(*fd, contents, len)) return NULL; @@ -705,44 +730,50 @@ /* Owners and tools get it all... 
*/ if (!id || perms[0].id == id) - return XS_PERM_READ|XS_PERM_WRITE|XS_PERM_CREATE|XS_PERM_OWNER; + return XS_PERM_READ|XS_PERM_WRITE|XS_PERM_OWNER; for (i = 1; i < num; i++) if (perms[i].id == id) return perms[i].perms; return perms[0].perms; +} + +/* What do parents say? */ +static enum xs_perm_type ask_parents(struct connection *conn, + const char *node) +{ + struct xs_permissions *perms; + unsigned int num; + + do { + node = get_parent(node); + perms = get_perms(node_dir(conn->transaction, node), &num); + if (perms) + break; + } while (!streq(node, "/")); + + /* No permission at root? We're in trouble. */ + if (!perms) + corrupt(conn, "No permissions file at root"); + + return perm_for_id(conn->id, perms, num); } /* We have a weird permissions system. You can allow someone into a * specific node without allowing it in the parents. If it's going to * fail, however, we don't want the errno to indicate any information * about the node. */ -static int check_with_parents(struct connection *conn, const char *node, +static int errno_from_parents(struct connection *conn, const char *node, int errnum) { - struct xs_permissions *perms; - unsigned int num; - /* We always tell them about memory failures. */ if (errnum == ENOMEM) return errnum; - do { - node = get_parent(node); - perms = get_perms(conn->transaction, node, &num); - if (perms) - break; - } while (!streq(node, "/")); - - /* No permission at root? We're in trouble. */ - if (!perms) - corrupt(conn, "No permissions file at root"); - - if (!(perm_for_id(conn->id, perms, num) & XS_PERM_READ)) - return EACCES; - - return errnum; + if (ask_parents(conn, node) & XS_PERM_READ) + return errnum; + return EACCES; } char *canonicalize(struct connection *conn, const char *node) @@ -773,31 +804,33 @@ return false; } - perms = get_perms(conn->transaction, node, &num); - /* No permissions. If we want to create it and - * it doesn't exist, check parent directory. 
*/ - if (!perms && errno == ENOENT && (perm & XS_PERM_CREATE)) { - char *parent = get_parent(node); - if (!parent) - return false; - - perms = get_perms(conn->transaction, parent, &num); - } - if (!perms) { - errno = check_with_parents(conn, node, errno); + perms = get_perms(node_dir(conn->transaction, node), &num); + + if (perms) { + if (perm_for_id(conn->id, perms, num) & perm) + return true; + errno = EACCES; return false; } - if (perm_for_id(conn->id, perms, num) & perm) - return true; - - errno = check_with_parents(conn, node, EACCES); + /* If it's OK not to exist, we consult parents. */ + if (errno == ENOENT && (perm & XS_PERM_ENOENT_OK)) { + if (ask_parents(conn, node) & perm) + return true; + /* Parents say they should not know. */ + errno = EACCES; + return false; + } + + /* They might not have permission to even *see* this node, in + * which case we return EACCES even if it's ENOENT or EIO. */ + errno = errno_from_parents(conn, node, errno); return false; } static void send_directory(struct connection *conn, const char *node) { - char *path, *reply = talloc_strdup(node, ""); + char *path, *reply; unsigned int reply_len = 0; DIR **dir; struct dirent *dirent; @@ -815,6 +848,7 @@ return; } + reply = talloc_strdup(node, ""); while ((dirent = readdir(*dir)) != NULL) { int len = strlen(dirent->d_name) + 1; @@ -857,44 +891,64 @@ send_reply(conn, XS_READ, value, size); } -/* Create a new directory. Optionally put data in it (if data != NULL) */ -static bool new_directory(struct connection *conn, - const char *node, void *data, unsigned int datalen) +/* Commit this directory, eg. comitting a/b.tmp/c causes a/b.tmp -> a.b */ +static bool commit_dir(char *dir) +{ + char *dot, *slash, *dest; + + dot = strrchr(dir, '.'); + slash = strchr(dot, '/'); + if (slash) + *slash = '\0'; + + dest = talloc_asprintf(dir, "%.*s", dot - dir, dir); + return rename(dir, dest) == 0; +} + +/* Create a temporary directory. 
Put data in it (if data != NULL) */ +static char *tempdir(struct connection *conn, + const char *node, void *data, unsigned int datalen) { struct xs_permissions *perms; char *permstr; unsigned int num, len; int *fd; - char *dir = node_dir(conn->transaction, node); - - if (mkdir(dir, 0750) != 0) - return false; - - /* Set destructor so we clean up if neccesary. */ - talloc_set_destructor(dir, destroy_path); - - perms = get_perms(conn->transaction, get_parent(node), &num); + char *dir; + + dir = temppath(node_dir(conn->transaction, node)); + if (mkdir(dir, 0750) != 0) { + if (errno != ENOENT) + return NULL; + + dir = tempdir(conn, get_parent(node), NULL, 0); + if (!dir) + return NULL; + + dir = talloc_asprintf(dir, "%s%s", dir, strrchr(node, '/')); + if (mkdir(dir, 0750) != 0) + return NULL; + talloc_set_destructor(dir, destroy_path); + } + + perms = get_perms(get_parent(dir), &num); + assert(perms); /* Domains own what they create. */ if (conn->id) perms->id = conn->id; permstr = perms_to_strings(dir, perms, num, &len); - fd = talloc_open(node_permfile(conn->transaction, node), - O_WRONLY|O_CREAT|O_EXCL, 0640); + fd = talloc_open(permfile(dir), O_WRONLY|O_CREAT|O_EXCL, 0640); if (!fd || !xs_write_all(*fd, permstr, len)) - return false; + return NULL; if (data) { - char *datapath = node_datafile(conn->transaction, node); + char *datapath = datafile(dir); fd = talloc_open(datapath, O_WRONLY|O_CREAT|O_EXCL, 0640); if (!fd || !xs_write_all(*fd, data, datalen)) - return false; - } - - /* Finished! */ - talloc_set_destructor(dir, NULL); - return true; + return NULL; + } + return dir; } /* path, flags, data... 
*/ @@ -913,8 +967,7 @@ } node = canonicalize(conn, vec[0]); - if (/*suppress error on write outside transaction*/ 0 && - !within_transaction(conn->transaction, node)) { + if (!within_transaction(conn->transaction, node)) { send_error(conn, EROFS); return; } @@ -928,9 +981,9 @@ if (streq(vec[1], XS_WRITE_NONE)) mode = XS_PERM_WRITE; else if (streq(vec[1], XS_WRITE_CREATE)) - mode = XS_PERM_WRITE|XS_PERM_CREATE; + mode = XS_PERM_WRITE|XS_PERM_ENOENT_OK; else if (streq(vec[1], XS_WRITE_CREATE_EXCL)) - mode = XS_PERM_WRITE|XS_PERM_CREATE; + mode = XS_PERM_WRITE|XS_PERM_ENOENT_OK; else { send_error(conn, EINVAL); return; @@ -942,6 +995,8 @@ } if (lstat(node_dir(conn->transaction, node), &st) != 0) { + char *dir; + /* Does not exist... */ if (errno != ENOENT) { send_error(conn, errno); @@ -949,15 +1004,17 @@ } /* Not going to create it? */ - if (!(mode & XS_PERM_CREATE)) { + if (streq(vec[1], XS_WRITE_NONE)) { send_error(conn, ENOENT); return; } - if (!new_directory(conn, node, in->buffer + offset, datalen)) { + dir = tempdir(conn, node, in->buffer + offset, datalen); + if (!dir || !commit_dir(dir)) { send_error(conn, errno); return; } + } else { /* Exists... */ if (streq(vec[1], XS_WRITE_CREATE_EXCL)) { @@ -982,8 +1039,11 @@ static void do_mkdir(struct connection *conn, const char *node) { + char *dir; + struct stat st; + node = canonicalize(conn, node); - if (!check_node_perms(conn, node, XS_PERM_WRITE|XS_PERM_CREATE)) { + if (!check_node_perms(conn, node, XS_PERM_WRITE|XS_PERM_ENOENT_OK)) { send_error(conn, errno); return; } @@ -996,7 +1056,14 @@ if (transaction_block(conn, node)) return; - if (!new_directory(conn, node, NULL, 0)) { + /* Must not already exist. 
*/ + if (lstat(node_dir(conn->transaction, node), &st) == 0) { + send_error(conn, EEXIST); + return; + } + + dir = tempdir(conn, node, NULL, 0); + if (!dir || !commit_dir(dir)) { send_error(conn, errno); return; } @@ -1056,7 +1123,7 @@ return; } - perms = get_perms(conn->transaction, node, &num); + perms = get_perms(node_dir(conn->transaction, node), &num); if (!perms) { send_error(conn, errno); return; @@ -1072,7 +1139,7 @@ static void do_set_perms(struct connection *conn, struct buffered_data *in) { unsigned int num; - char *node; + char *node, *permstr; struct xs_permissions *perms; num = xs_count_strings(in->buffer, in->used); @@ -1083,7 +1150,7 @@ /* First arg is node name. */ node = canonicalize(conn, in->buffer); - in->buffer += strlen(in->buffer) + 1; + permstr = in->buffer + strlen(in->buffer) + 1; num--; if (!within_transaction(conn->transaction, node)) { @@ -1101,7 +1168,7 @@ } perms = talloc_array(node, struct xs_permissions, num); - if (!xs_strings_to_perms(perms, num, in->buffer)) { + if (!xs_strings_to_perms(perms, num, permstr)) { send_error(conn, errno); return; } @@ -1270,8 +1337,10 @@ talloc_free(in); talloc_set_fail_handler(NULL, NULL); if (talloc_total_blocks(NULL) - != talloc_total_blocks(talloc_autofree_context()) + 1) + != talloc_total_blocks(talloc_autofree_context()) + 1) { talloc_report_full(NULL, stderr); + abort(); + } } /* Errors in reading or allocating here mean we get out of sync, so we @@ -1295,8 +1364,10 @@ return; if (in->hdr.msg.len > PATH_MAX) { +#ifndef TESTING syslog(LOG_DAEMON, "Client tried to feed us %i", in->hdr.msg.len); +#endif goto bad_client; } @@ -1347,6 +1418,7 @@ consider_message(i); } break; + case BUSY: case OK: break; } @@ -1372,6 +1444,7 @@ new->state = OK; new->blocked_by = NULL; new->out = new->waiting_reply = NULL; + new->waiting_for_ack = NULL; new->fd = -1; new->id = 0; new->domain = NULL; @@ -1451,6 +1524,7 @@ printf(" state = %s\n", i->state == OK ? "OK" : i->state == BLOCKED ? 
"BLOCKED" + : i->state == BUSY ? "BUSY" : "INVALID"); if (i->id) printf(" id = %i\n", i->id); @@ -1516,19 +1590,59 @@ xs_daemon_transactions()); } +static void write_pidfile(const char *pidfile) +{ + char buf[100]; + int len; + int fd; + + fd = open(pidfile, O_RDWR | O_CREAT, 0600); + if (fd == -1) + barf_perror("Opening pid file %s", pidfile); + + /* We exit silently if daemon already running. */ + if (lockf(fd, F_TLOCK, 0) == -1) + exit(0); + + len = sprintf(buf, "%d\n", getpid()); + write(fd, buf, len); +} + +/* Stevens. */ +static void daemonize(void) +{ + pid_t pid; + + /* Separate from our parent via fork, so init inherits us. */ + if ((pid = fork()) < 0) + barf_perror("Failed to fork daemon"); + if (pid != 0) + exit(0); + + /* Session leader so ^C doesn't whack us. */ + setsid(); + /* Move off any mount points we might be in. */ + chdir("/"); + /* Discard our parent's old-fashioned umask prejudices. */ + umask(0); +} + + static struct option options[] = { { "no-fork", 0, NULL, 'N' }, { "verbose", 0, NULL, 'V' }, { "output-pid", 0, NULL, 'P' }, { "trace-file", 1, NULL, 'T' }, + { "pid-file", 1, NULL, 'F' }, { NULL, 0, NULL, 0 } }; int main(int argc, char *argv[]) { - int opt, *sock, *ro_sock, event_fd, max, tmpout; + int opt, *sock, *ro_sock, event_fd, max; struct sockaddr_un addr; fd_set inset, outset; bool dofork = true; bool outputpid = false; + const char *pidfile = NULL; while ((opt = getopt_long(argc, argv, "DVT:", options, NULL)) != -1) { switch (opt) { @@ -1548,10 +1662,19 @@ optarg); write(tracefd, "\n***\n", strlen("\n***\n")); break; + case 'F': + pidfile = optarg; } } if (optind != argc) barf("%s: No arguments desired", argv[0]); + + if (dofork) { + openlog("xenstored", 0, LOG_DAEMON); + daemonize(); + } + if (pidfile) + write_pidfile(pidfile); talloc_enable_leak_report_full(); @@ -1599,19 +1722,17 @@ /* Restore existing connections. */ restore_existing_connections(); - /* Debugging: daemonize() closes standard fds, so dup here. 
*/ - tmpout = dup(STDOUT_FILENO); + if (outputpid) { + printf("%i\n", getpid()); + fflush(stdout); + } + + /* close stdin/stdout now we're ready to accept connections */ if (dofork) { - openlog("xenstored", 0, LOG_DAEMON); - daemonize(); - } - - if (outputpid) { - char buffer[20]; - sprintf(buffer, "%i\n", getpid()); - write(tmpout, buffer, strlen(buffer)); - } - close(tmpout); + close(STDIN_FILENO); + close(STDOUT_FILENO); + close(STDERR_FILENO); + } #ifdef TESTING signal(SIGUSR1, stop_failtest); @@ -1621,6 +1742,7 @@ max = initialize_set(&inset, &outset, *sock, *ro_sock, event_fd); /* Main loop. */ + /* FIXME: Rewrite so noone can starve. */ for (;;) { struct connection *i; struct timeval *tvp = NULL, tv; @@ -1665,10 +1787,22 @@ } } - /* Flush output for domain connections, */ - list_for_each_entry(i, &connections, list) - if (i->domain && i->out) + /* Handle all possible I/O for domain connections. */ + more: + list_for_each_entry(i, &connections, list) { + if (!i->domain) + continue; + + if (domain_can_read(i)) { + handle_input(i); + goto more; + } + + if (domain_can_write(i)) { handle_output(i); + goto more; + } + } if (tvp) { check_transaction_timeout(); diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/xenstored_core.h --- a/tools/xenstore/xenstored_core.h Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/xenstored_core.h Thu Aug 25 22:53:20 2005 @@ -51,6 +51,8 @@ { /* Blocked by transaction. */ BLOCKED, + /* Doing action, not listening */ + BUSY, /* Completed */ OK, }; @@ -65,7 +67,7 @@ /* Who am I? 0 for socket connections. */ domid_t id; - /* Blocked on transaction? */ + /* Blocked on transaction? Busy? 
*/ enum state state; /* Node we are waiting for (if state == BLOCKED) */ diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/xenstored_domain.c --- a/tools/xenstore/xenstored_domain.c Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/xenstored_domain.c Thu Aug 25 22:53:20 2005 @@ -227,32 +227,27 @@ return NULL; } +/* We scan all domains rather than use the information given here. */ void handle_event(int event_fd) { u16 port; - struct domain *domain; if (read(event_fd, &port, sizeof(port)) != sizeof(port)) barf_perror("Failed to read from event fd"); - - /* We have to handle *all* the data available before we ack: - * careful that handle_input/handle_output can destroy conn. - */ - while ((domain = find_domain(port)) != NULL) { - if (domain->conn->state == OK - && buffer_has_input(domain->input)) - handle_input(domain->conn); - else if (domain->conn->out - && buffer_has_output_room(domain->output)) - handle_output(domain->conn); - else - break; - } - #ifndef TESTING if (write(event_fd, &port, sizeof(port)) != sizeof(port)) barf_perror("Failed to write to event fd"); #endif +} + +bool domain_can_read(struct connection *conn) +{ + return conn->state == OK && buffer_has_input(conn->domain->input); +} + +bool domain_can_write(struct connection *conn) +{ + return conn->out && buffer_has_output_room(conn->domain->output); } static struct domain *new_domain(void *context, domid_t domid, diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/xenstored_domain.h --- a/tools/xenstore/xenstored_domain.h Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/xenstored_domain.h Thu Aug 25 22:53:20 2005 @@ -40,4 +40,8 @@ /* Read existing connection information from store. */ void restore_existing_connections(void); +/* Can connection attached to domain read/write. 
*/ +bool domain_can_read(struct connection *conn); +bool domain_can_write(struct connection *conn); + #endif /* _XENSTORED_DOMAIN_H */ diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/xenstored_watch.c --- a/tools/xenstore/xenstored_watch.c Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/xenstored_watch.c Thu Aug 25 22:53:20 2005 @@ -95,9 +95,18 @@ return 0; } -static void add_event(struct watch *watch, const char *node) +static void add_event(struct connection *conn, + struct watch *watch, const char *node) { struct watch_event *event; + + /* Check read permission: no permission, no watch event. + * If it doesn't exist, we need permission to read parent. + */ + if (!check_node_perms(conn, node, XS_PERM_READ|XS_PERM_ENOENT_OK)) { + fprintf(stderr, "No permission for %s\n", node); + return; + } if (watch->relative_path) { node += strlen(watch->relative_path); @@ -132,9 +141,9 @@ list_for_each_entry(watch, &i->watches, list) { if (is_child(node, watch->node)) - add_event(watch, node); + add_event(i, watch, node); else if (recurse && is_child(watch->node, node)) - add_event(watch, watch->node); + add_event(i, watch, watch->node); else continue; /* If connection not doing anything, queue this. */ @@ -206,7 +215,7 @@ relative = !strstarts(vec[0], "/"); vec[0] = canonicalize(conn, vec[0]); - if (!check_node_perms(conn, vec[0], XS_PERM_READ)) { + if (!is_valid_nodename(vec[0])) { send_error(conn, errno); return; } diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/xs.c --- a/tools/xenstore/xs.c Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/xs.c Thu Aug 25 22:53:20 2005 @@ -2,19 +2,19 @@ Xen Store Daemon interface providing simple tree-like database. Copyright (C) 2005 Rusty Russell IBM Corporation - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include <sys/types.h> @@ -204,13 +204,19 @@ return NULL; } - assert(msg.type == type); + if (msg.type != type) { + free(ret); + saved_errno = EBADF; + goto close_fd; + + } return ret; fail: /* We're in a bad state, so close fd. */ saved_errno = errno; sigaction(SIGPIPE, &oldact, NULL); +close_fd: close(h->fd); h->fd = -1; errno = saved_errno; diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/xs.h --- a/tools/xenstore/xs.h Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/xs.h Thu Aug 25 22:53:20 2005 @@ -2,19 +2,19 @@ Xen Store Daemon providing simple tree-like database. Copyright (C) 2005 Rusty Russell IBM Corporation - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. 
+ This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. - This program is distributed in the hope that it will be useful, + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef _XS_H diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/xs_dom0_test.c --- a/tools/xenstore/xs_dom0_test.c Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/xs_dom0_test.c Thu Aug 25 22:53:20 2005 @@ -3,7 +3,7 @@ #include <sys/ioctl.h> #include "xs.h" #include "utils.h" -#include <xc.h> +#include <xenctrl.h> #include <xen/linux/privcmd.h> #include <stdio.h> #include <unistd.h> diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/xs_lib.c --- a/tools/xenstore/xs_lib.c Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/xs_lib.c Thu Aug 25 22:53:20 2005 @@ -1,3 +1,22 @@ +/* + Common routines between Xen store user library and daemon. 
+ Copyright (C) 2005 Rusty Russell IBM Corporation + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + #include "xs_lib.h" #include <unistd.h> #include <stdio.h> @@ -133,8 +152,9 @@ unsigned int num; const char *p; - for (p = strings, num = 0; p < strings + len; p += strlen(p) + 1) - num++; + for (p = strings, num = 0; p < strings + len; p++) + if (*p == '\0') + num++; return num; } diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/xs_lib.h --- a/tools/xenstore/xs_lib.h Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/xs_lib.h Thu Aug 25 22:53:20 2005 @@ -2,19 +2,19 @@ Common routines between Xen store user library and daemon. Copyright (C) 2005 Rusty Russell IBM Corporation - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
- This program is distributed in the hope that it will be useful, + This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef _XS_LIB_H @@ -22,7 +22,7 @@ #include <stdbool.h> #include <limits.h> -#include <xc.h> +#include <xenctrl.h> /* Bitmask of permissions. */ enum xs_perm_type { @@ -30,7 +30,7 @@ XS_PERM_READ = 1, XS_PERM_WRITE = 2, /* Internal use. 
*/ - XS_PERM_CREATE = 4, + XS_PERM_ENOENT_OK = 4, XS_PERM_OWNER = 8, }; diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/xs_random.c --- a/tools/xenstore/xs_random.c Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/xs_random.c Thu Aug 25 22:53:20 2005 @@ -303,6 +303,34 @@ return true; } +static char *parent_filename(const char *name) +{ + char *slash = strrchr(name + 1, '/'); + if (!slash) + return talloc_strdup(name, "/"); + return talloc_asprintf(name, "%.*s", slash-name, name); +} + +static void make_dirs(const char *filename) +{ + struct stat st; + + if (lstat(filename, &st) == 0 && S_ISREG(st.st_mode)) + convert_to_dir(filename); + + if (mkdir(filename, 0700) == 0) { + init_perms(filename); + return; + } + if (errno == EEXIST) + return; + + make_dirs(parent_filename(filename)); + if (mkdir(filename, 0700) != 0) + barf_perror("Failed to mkdir %s", filename); + init_perms(filename); +} + static bool file_write(struct file_ops_info *info, const char *path, const void *data, unsigned int len, int createflags) @@ -329,6 +357,9 @@ } } + if (createflags & O_CREAT) + make_dirs(parent_filename(filename)); + fd = open(filename, createflags|O_TRUNC|O_WRONLY, 0600); if (fd < 0) { /* FIXME: Another hack. */ @@ -349,19 +380,13 @@ { char *dirname = path_to_name(info, path); - /* Same effective order as daemon, so error returns are right. 
*/ - if (mkdir(dirname, 0700) != 0) { - if (errno != ENOENT && errno != ENOTDIR) - write_ok(info, path); - return false; - } - - if (!write_ok(info, path)) { - int saved_errno = errno; - rmdir(dirname); - errno = saved_errno; - return false; - } + if (!write_ok(info, path)) + return false; + + make_dirs(parent_filename(dirname)); + if (mkdir(dirname, 0700) != 0) + return false; + init_perms(dirname); return true; } @@ -427,7 +452,7 @@ } if (abort) { - cmd = talloc_asprintf(NULL, "rm -r %s", info->transact_base); + cmd = talloc_asprintf(NULL, "rm -rf %s", info->transact_base); do_command(cmd); goto success; } @@ -984,13 +1009,15 @@ static void setup_file_ops(const char *dir) { - char *cmd = talloc_asprintf(NULL, "echo -n r0 > %s/.perms", dir); + struct xs_permissions perm = { .id = 0, .perms = XS_PERM_READ }; + struct file_ops_info *h = file_handle(dir); if (mkdir(dir, 0700) != 0) barf_perror("Creating directory %s", dir); - if (mkdir(talloc_asprintf(cmd, "%s/tool", dir), 0700) != 0) + if (mkdir(talloc_asprintf(h, "%s/tool", dir), 0700) != 0) barf_perror("Creating directory %s/tool", dir); - do_command(cmd); - talloc_free(cmd); + if (!file_set_perms(h, talloc_strdup(h, "/"), &perm, 1)) + barf_perror("Setting root perms in %s", dir); + file_close(h); } static void setup_xs_ops(void) @@ -1009,8 +1036,8 @@ } else { dup2(fds[1], STDOUT_FILENO); close(fds[0]); -#if 0 - execlp("valgrind", "valgrind", "xenstored_test", "--output-pid", +#if 1 + execlp("valgrind", "valgrind", "-q", "--suppressions=testsuite/vg-suppressions", "xenstored_test", "--output-pid", "--no-fork", NULL); #else execlp("./xenstored_test", "xenstored_test", "--output-pid", @@ -1112,9 +1139,6 @@ data->ops->close(pre); } } - if (data->print_progress) - printf("\n"); - out: data->ops->close(h); return i; @@ -1192,10 +1216,9 @@ try = try_simple(NULL, iters, verbose, &data); if (try == iters) { cleanup_xs_ops(); - printf("Succeeded\n"); exit(0); } - printf("Failed on iteration %u\n", try + 1); + 
printf("Failed on iteration %u of seed %u\n", try + 1, seed); data.print_progress = false; reduce_problem(try + 1, try_simple, &data); } @@ -1406,8 +1429,6 @@ talloc_free(fileh_pre); } } - if (data->print_progress) - printf("\n"); fail = NULL; if (data->fast) @@ -1435,10 +1456,9 @@ try = try_diff(NULL, iters, verbose, &data); if (try == iters) { cleanup_xs_ops(); - printf("Succeeded\n"); exit(0); } - printf("Failed on iteration %u\n", try + 1); + printf("Failed on iteration %u of seed %u\n", try + 1, seed); data.print_progress = false; reduce_problem(try + 1, try_diff, &data); } @@ -1593,8 +1613,6 @@ xs_close(tmpxsh); file_close(tmpfileh); } - - printf("Total %u of %u not aborted\n", tried - aborted, tried); out: if (xsh) xs_close(xsh); @@ -1615,10 +1633,9 @@ try = try_fail(NULL, iters, verbose, &data); if (try == iters) { cleanup_xs_ops(); - printf("Succeeded\n"); exit(0); } - printf("Failed on iteration %u\n", try + 1); + printf("Failed on iteration %u of seed %u\n", try + 1, seed); fflush(stdout); data.print_progress = false; reduce_problem(try + 1, try_fail, &data); diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/xs_test.c --- a/tools/xenstore/xs_test.c Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/xs_test.c Thu Aug 25 22:53:20 2005 @@ -17,6 +17,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#define _GNU_SOURCE #include <stdio.h> #include <stdlib.h> #include <sys/types.h> @@ -28,16 +29,25 @@ #include <stdbool.h> #include <stdlib.h> #include <sys/mman.h> +#include <fnmatch.h> +#include <stdarg.h> +#include <string.h> +#include <getopt.h> +#include <ctype.h> +#include <sys/time.h> #include "utils.h" #include "xs_lib.h" +#include "list.h" #define XSTEST static struct xs_handle *handles[10] = { NULL }; -static unsigned int children; - -static bool timeout = true; + +static unsigned int timeout_ms = 200; +static bool timeout_suppressed = true; static bool readonly = false; +static bool print_input = false; +static unsigned int 
linenum = 0; struct ringbuf_head { @@ -178,7 +188,7 @@ static void __attribute__((noreturn)) usage(void) { barf("Usage:\n" - " xs_test [--readonly] [--notimeout]\n" + " xs_test [--readonly] [--no-timeout] [-x]\n" "Reads commands from stdin, one per line:" " dir <path>\n" " read <path>\n" @@ -190,8 +200,6 @@ " setperm <path> <id> <flags> ...\n" " shutdown\n" " watch <path> <token>\n" - " async <command>...\n" - " asyncwait\n" " waitwatch\n" " ackwatch <token>\n" " unwatch <path> <token>\n" @@ -200,7 +208,13 @@ " abort\n" " introduce <domid> <mfn> <eventchn> <path>\n" " commit\n" - " sleep <seconds>\n" + " sleep <milliseconds>\n" + " expect <pattern>\n" + " notimeout\n" + " readonly\n" + " readwrite\n" + " noackwrite <path> <flags> <value>...\n" + " readack\n" " dump\n"); } @@ -218,7 +232,7 @@ return off; } -static char *arg(char *line, unsigned int num) +static char *arg(const char *line, unsigned int num) { static char *args[10]; unsigned int off, len; @@ -236,12 +250,64 @@ return args[num]; } +struct expect +{ + struct list_head list; + char *pattern; +}; +static LIST_HEAD(expects); + static char *command; -static void __attribute__((noreturn)) failed(int handle) + +/* Trim leading and trailing whitespace */ +static void trim(char *str) +{ + while (isspace(str[0])) + memmove(str, str+1, strlen(str)); + + while (strlen(str) && isspace(str[strlen(str)-1])) + str[strlen(str)-1] = '\0'; +} + +static void output(const char *fmt, ...) 
+{ + char *str; + struct expect *i; + va_list arglist; + + va_start(arglist, fmt); + vasprintf(&str, fmt, arglist); + va_end(arglist); + + printf("%s", str); + fflush(stdout); + trim(str); + list_for_each_entry(i, &expects, list) { + if (fnmatch(i->pattern, str, 0) == 0) { + list_del(&i->list); + free(i); + return; + } + } + barf("Unexpected output %s\n", str); +} + +static void failed(int handle) { if (handle) - barf_perror("%i: %s", handle, command); - barf_perror("%s", command); + output("%i: %s failed: %s\n", + handle, command, strerror(errno)); + else + output("%s failed: %s\n", command, strerror(errno)); +} + +static void expect(const char *line) +{ + struct expect *e = malloc(sizeof(*e)); + + e->pattern = strdup(line + argpos(line, 1)); + trim(e->pattern); + list_add(&e->list, &expects); } static void do_dir(unsigned int handle, char *path) @@ -250,14 +316,16 @@ unsigned int i, num; entries = xs_directory(handles[handle], path, &num); - if (!entries) - failed(handle); + if (!entries) { + failed(handle); + return; + } for (i = 0; i < num; i++) if (handle) - printf("%i:%s\n", handle, entries[i]); + output("%i:%s\n", handle, entries[i]); else - printf("%s\n", entries[i]); + output("%s\n", entries[i]); free(entries); } @@ -267,15 +335,17 @@ unsigned int len; value = xs_read(handles[handle], path, &len); - if (!value) - failed(handle); + if (!value) { + failed(handle); + return; + } /* It's supposed to nul terminate for us. */ assert(value[len] == '\0'); if (handle) - printf("%i:%.*s\n", handle, len, value); + output("%i:%.*s\n", handle, len, value); else - printf("%.*s\n", len, value); + output("%.*s\n", len, value); } static void do_write(unsigned int handle, char *path, char *flags, char *data) @@ -297,6 +367,45 @@ failed(handle); } +static void do_noackwrite(unsigned int handle, + char *path, const char *flags, char *data) +{ + struct xsd_sockmsg msg; + + /* Format: Flags (as string), path, data. 
*/ + if (streq(flags, "none")) + flags = XS_WRITE_NONE; + else if (streq(flags, "create")) + flags = XS_WRITE_CREATE; + else if (streq(flags, "excl")) + flags = XS_WRITE_CREATE_EXCL; + else + barf("noackwrite flags 'none', 'create' or 'excl' only"); + + msg.len = strlen(path) + 1 + strlen(flags) + 1 + strlen(data); + msg.type = XS_WRITE; + if (!write_all_choice(handles[handle]->fd, &msg, sizeof(msg))) + failed(handle); + if (!write_all_choice(handles[handle]->fd, path, strlen(path) + 1)) + failed(handle); + if (!write_all_choice(handles[handle]->fd, flags, strlen(flags) + 1)) + failed(handle); + if (!write_all_choice(handles[handle]->fd, data, strlen(data))) + failed(handle); + /* Do not wait for ack. */ +} + +static void do_readack(unsigned int handle) +{ + enum xsd_sockmsg_type type; + char *ret; + + ret = read_reply(handles[handle]->fd, &type, NULL); + if (!ret) + failed(handle); + free(ret); +} + static void do_setid(unsigned int handle, char *id) { if (!xs_bool(xs_debug_command(handles[handle], "setid", id, @@ -322,8 +431,10 @@ struct xs_permissions *perms; perms = xs_get_permissions(handles[handle], path, &num); - if (!perms) - failed(handle); + if (!perms) { + failed(handle); + return; + } for (i = 0; i < num; i++) { char *permstring; @@ -346,9 +457,9 @@ } if (handle) - printf("%i:%i %s\n", handle, perms[i].id, permstring); + output("%i:%i %s\n", handle, perms[i].id, permstring); else - printf("%i %s\n", perms[i].id, permstring); + output("%i %s\n", perms[i].id, permstring); } free(perms); } @@ -396,18 +507,56 @@ failed(handle); } +static void set_timeout(void) +{ + struct itimerval timeout; + + timeout.it_value.tv_sec = timeout_ms / 1000; + timeout.it_value.tv_usec = (timeout_ms * 1000) % 1000000; + timeout.it_interval.tv_sec = timeout.it_interval.tv_usec = 0; + setitimer(ITIMER_REAL, &timeout, NULL); +} + +static void disarm_timeout(void) +{ + struct itimerval timeout; + + timeout.it_value.tv_sec = 0; + timeout.it_value.tv_usec = 0; + 
setitimer(ITIMER_REAL, &timeout, NULL); +} + static void do_waitwatch(unsigned int handle) { char **vec; + struct timeval tv = {.tv_sec = timeout_ms/1000, + .tv_usec = (timeout_ms*1000)%1000000 }; + fd_set set; + + if (xs_fileno(handles[handle]) != -2) { + /* Manually select here so we can time out gracefully. */ + FD_ZERO(&set); + FD_SET(xs_fileno(handles[handle]), &set); + disarm_timeout(); + if (select(xs_fileno(handles[handle])+1, &set, + NULL, NULL, &tv) == 0) { + errno = ETIMEDOUT; + failed(handle); + return; + } + set_timeout(); + } vec = xs_read_watch(handles[handle]); - if (!vec) - failed(handle); + if (!vec) { + failed(handle); + return; + } if (handle) - printf("%i:%s:%s\n", handle, vec[0], vec[1]); + output("%i:%s:%s\n", handle, vec[0], vec[1]); else - printf("%s:%s\n", vec[0], vec[1]); + output("%s:%s\n", vec[0], vec[1]); free(vec); } @@ -415,82 +564,6 @@ { if (!xs_acknowledge_watch(handles[handle], token)) failed(handle); -} - -static bool wait_for_input(unsigned int handle) -{ - unsigned int i; - for (i = 0; i < ARRAY_SIZE(handles); i++) { - int fd; - - if (!handles[i] || i == handle) - continue; - - fd = xs_fileno(handles[i]); - if (fd == -2) { - unsigned int avail; - get_input_chunk(in, in->buf, &avail); - if (avail != 0) - return true; - } else { - struct timeval tv = {.tv_sec = 0, .tv_usec = 0 }; - fd_set set; - - FD_ZERO(&set); - FD_SET(fd, &set); - if (select(fd+1, &set, NULL, NULL,&tv)) - return true; - } - } - return false; -} - - -/* Async wait for watch on handle */ -static void do_command(unsigned int default_handle, char *line); -static void do_async(unsigned int handle, char *line) -{ - int child; - unsigned int i; - children++; - if ((child = fork()) != 0) { - /* Wait until *something* happens, which indicates - * child has created an event. V. sloppy, but we can't - * select on fake domain connections. - */ - while (!wait_for_input(handle)); - return; - } - - /* Don't keep other handles open in parent. 
*/ - for (i = 0; i < ARRAY_SIZE(handles); i++) { - if (handles[i] && i != handle) { - xs_daemon_close(handles[i]); - handles[i] = NULL; - } - } - - do_command(handle, line + argpos(line, 1)); - exit(0); -} - -static void do_asyncwait(unsigned int handle) -{ - int status; - - if (handle) - barf("handle has no meaning with asyncwait"); - - if (children == 0) - barf("No children to wait for!"); - - if (waitpid(0, &status, 0) > 0) { - if (!WIFEXITED(status)) - barf("async died"); - if (WEXITSTATUS(status)) - exit(WEXITSTATUS(status)); - } - children--; } static void do_unwatch(unsigned int handle, const char *node, const char *token) @@ -519,6 +592,9 @@ { unsigned int i; int fd; + + /* This mechanism is v. slow w. valgrind running. */ + timeout_ms = 5000; /* We poll, so ignore signal */ signal(SIGUSR2, SIG_IGN); @@ -538,14 +614,17 @@ *(int *)((void *)out + 32) = getpid(); *(u16 *)((void *)out + 36) = atoi(eventchn); + if (!xs_introduce_domain(handles[handle], atoi(domid), + atol(mfn), atoi(eventchn), path)) { + failed(handle); + munmap(out, getpagesize()); + return; + } + output("handle is %i\n", i); + /* Create new handle. */ handles[i] = new(struct xs_handle); handles[i]->fd = -2; - - if (!xs_introduce_domain(handles[handle], atoi(domid), - atol(mfn), atoi(eventchn), path)) - failed(handle); - printf("handle is %i\n", i); /* Read in daemon pid. */ daemon_pid = *(int *)((void *)out + 32); @@ -593,18 +672,20 @@ sprintf(subnode, "%s/%s", node, dir[i]); perms = xs_get_permissions(handles[handle], subnode,&numperms); - if (!perms) + if (!perms) { failed(handle); - - printf("%s%s: ", spacing, dir[i]); + return; + } + + output("%s%s: ", spacing, dir[i]); for (j = 0; j < numperms; j++) { char buffer[100]; if (!xs_perm_to_string(&perms[j], buffer)) barf("perm to string"); - printf("%s ", buffer); + output("%s ", buffer); } free(perms); - printf("\n"); + output("\n"); /* Even directories can have contents. 
*/ contents = xs_read(handles[handle], subnode, &len); @@ -612,14 +693,16 @@ if (errno != EISDIR) failed(handle); } else { - printf(" %s(%.*s)\n", spacing, len, contents); + output(" %s(%.*s)\n", spacing, len, contents); free(contents); } /* Every node is a directory. */ subdirs = xs_directory(handles[handle], subnode, &subnum); - if (!subdirs) + if (!subdirs) { failed(handle); + return; + } dump_dir(handle, subnode, subdirs, subnum, depth+1); free(subdirs); } @@ -631,8 +714,10 @@ unsigned int subnum; subdirs = xs_directory(handles[handle], "/", &subnum); - if (!subdirs) - failed(handle); + if (!subdirs) { + failed(handle); + return; + } dump_dir(handle, "", subdirs, subnum, 0); free(subdirs); @@ -655,6 +740,9 @@ static void do_command(unsigned int default_handle, char *line) { char *endp; + + if (print_input) + printf("%i> %s", ++linenum, line); if (strspn(line, " \n") == strlen(line)) return; @@ -667,6 +755,7 @@ else handle = default_handle; + command = arg(line, 0); if (!handles[handle]) { if (readonly) handles[handle] = xs_daemon_open_readonly(); @@ -675,10 +764,10 @@ if (!handles[handle]) barf_perror("Opening connection to daemon"); } - command = arg(line, 0); - - if (timeout) - alarm(1); + + if (!timeout_suppressed) + set_timeout(); + timeout_suppressed = false; if (streq(command, "dir")) do_dir(handle, arg(line, 1)); @@ -703,10 +792,6 @@ do_watch(handle, arg(line, 1), arg(line, 2)); else if (streq(command, "waitwatch")) do_waitwatch(handle); - else if (streq(command, "async")) - do_async(handle, line); - else if (streq(command, "asyncwait")) - do_asyncwait(handle); else if (streq(command, "ackwatch")) do_ackwatch(handle, arg(line, 1)); else if (streq(command, "unwatch")) @@ -727,32 +812,70 @@ do_release(handle, arg(line, 1)); else if (streq(command, "dump")) dump(handle); - else if (streq(command, "sleep")) - sleep(atoi(arg(line, 1))); + else if (streq(command, "sleep")) { + disarm_timeout(); + usleep(atoi(arg(line, 1)) * 1000); + } else if (streq(command, 
"expect")) + expect(line); + else if (streq(command, "notimeout")) + timeout_suppressed = true; + else if (streq(command, "readonly")) { + readonly = true; + xs_daemon_close(handles[handle]); + handles[handle] = NULL; + } else if (streq(command, "readwrite")) { + readonly = false; + xs_daemon_close(handles[handle]); + handles[handle] = NULL; + } else if (streq(command, "noackwrite")) + do_noackwrite(handle, arg(line,1), arg(line,2), arg(line,3)); + else if (streq(command, "readack")) + do_readack(handle); else barf("Unknown command %s", command); fflush(stdout); - alarm(0); -} + disarm_timeout(); + + /* Check expectations. */ + if (!streq(command, "expect")) { + struct expect *i = list_top(&expects, struct expect, list); + + if (i) + barf("Expected '%s', didn't happen\n", i->pattern); + } +} + +static struct option options[] = { { "readonly", 0, NULL, 'r' }, + { "no-timeout", 0, NULL, 't' }, + { NULL, 0, NULL, 0 } }; int main(int argc, char *argv[]) { + int opt; char line[1024]; - if (argc > 1 && streq(argv[1], "--readonly")) { - readonly = true; - argc--; - argv++; - } - - if (argc > 1 && streq(argv[1], "--no-timeout")) { - timeout = false; - argc--; - argv++; - } - - if (argc != 1) + while ((opt = getopt_long(argc, argv, "xrt", options, NULL)) != -1) { + switch (opt) { + case 'r': + readonly = true; + break; + case 't': + timeout_ms = 0; + break; + case 'x': + print_input = true; + break; + } + } + + if (optind + 1 == argc) { + int fd = open(argv[optind], O_RDONLY); + if (!fd) + barf_perror("Opening %s", argv[optind]); + dup2(fd, STDIN_FILENO); + } else if (optind != argc) usage(); + /* The size of the ringbuffer: half a page minus head structure. 
*/ ringbuf_datasize = getpagesize() / 2 - sizeof(struct ringbuf_head); @@ -761,7 +884,5 @@ while (fgets(line, sizeof(line), stdin)) do_command(0, line); - while (children) - do_asyncwait(0); return 0; } diff -r 5f1ed597f107 -r 8799d14bef77 tools/xentrace/Makefile --- a/tools/xentrace/Makefile Wed Aug 24 02:43:18 2005 +++ b/tools/xentrace/Makefile Thu Aug 25 22:53:20 2005 @@ -36,4 +36,4 @@ $(RM) *.a *.so *.o *.rpm $(BIN) %: %.c $(HDRS) Makefile - $(CC) $(CFLAGS) -o $@ $< -L$(XEN_LIBXC) -lxc + $(CC) $(CFLAGS) -o $@ $< -L$(XEN_LIBXC) -lxenctrl diff -r 5f1ed597f107 -r 8799d14bef77 tools/xentrace/xenctx.c --- a/tools/xentrace/xenctx.c Wed Aug 24 02:43:18 2005 +++ b/tools/xentrace/xenctx.c Thu Aug 25 22:53:20 2005 @@ -21,7 +21,7 @@ #include <argp.h> #include <signal.h> -#include "xc.h" +#include "xenctrl.h" #ifdef __i386__ void print_ctx(vcpu_guest_context_t *ctx1) diff -r 5f1ed597f107 -r 8799d14bef77 tools/xentrace/xentrace.c --- a/tools/xentrace/xentrace.c Wed Aug 24 02:43:18 2005 +++ b/tools/xentrace/xentrace.c Thu Aug 25 22:53:20 2005 @@ -45,6 +45,8 @@ char *outfile; struct timespec poll_sleep; unsigned long new_data_thresh; + u32 evt_mask; + u32 cpu_mask; } settings_t; settings_t opts; @@ -93,13 +95,13 @@ /** * get_tbufs - get pointer to and size of the trace buffers - * @mach_addr: location to store machine address if the trace buffers to - * @size: location to store the size of a trace buffer to + * @mfn: location to store mfn of the trace buffers to + * @size: location to store the size of a trace buffer to * * Gets the machine address of the trace pointer area and the size of the * per CPU buffers. 
*/ -void get_tbufs(unsigned long *mach_addr, unsigned long *size) +void get_tbufs(unsigned long *mfn, unsigned long *size) { int ret; dom0_op_t op; /* dom0 op we'll build */ @@ -119,19 +121,19 @@ exit(EXIT_FAILURE); } - *mach_addr = op.u.tbufcontrol.mach_addr; - *size = op.u.tbufcontrol.size; + *mfn = op.u.tbufcontrol.buffer_mfn; + *size = op.u.tbufcontrol.size; } /** * map_tbufs - memory map Xen trace buffers into user space - * @tbufs: machine address of the trace buffers + * @tbufs_mfn: mfn of the trace buffers * @num: number of trace buffers to map * @size: size of each trace buffer * * Maps the Xen trace buffers them into process address space. */ -struct t_buf *map_tbufs(unsigned long tbufs_mach, unsigned int num, +struct t_buf *map_tbufs(unsigned long tbufs_mfn, unsigned int num, unsigned long size) { int xc_handle; /* file descriptor for /proc/xen/privcmd */ @@ -147,7 +149,7 @@ tbufs_mapped = xc_map_foreign_range(xc_handle, 0 /* Dom 0 ID */, size * num, PROT_READ, - tbufs_mach >> PAGE_SHIFT); + tbufs_mfn); xc_interface_close(xc_handle); @@ -160,6 +162,41 @@ return tbufs_mapped; } +/** + * set_mask - set the cpu/event mask in HV + * @mask: the new mask + * @type: the new mask type,0-event mask, 1-cpu mask + * + */ +void set_mask(u32 mask, int type) +{ + int ret; + dom0_op_t op; /* dom0 op we'll build */ + int xc_handle = xc_interface_open(); /* for accessing control interface */ + + op.cmd = DOM0_TBUFCONTROL; + op.interface_version = DOM0_INTERFACE_VERSION; + if (type == 1) { /* cpu mask */ + op.u.tbufcontrol.op = DOM0_TBUF_SET_CPU_MASK; + op.u.tbufcontrol.cpu_mask = mask; + fprintf(stderr, "change cpumask to 0x%x\n", mask); + }else if (type == 0) { /* event mask */ + op.u.tbufcontrol.op = DOM0_TBUF_SET_EVT_MASK; + op.u.tbufcontrol.evt_mask = mask; + fprintf(stderr, "change evtmask to 0x%x\n", mask); + } + + ret = do_dom0_op(xc_handle, &op); + + xc_interface_close(xc_handle); + + if ( ret != 0 ) + { + PERROR("Failure to get trace buffer pointer from Xen and 
set the new mask"); + exit(EXIT_FAILURE); + } + +} /** * init_bufs_ptrs - initialises an array of pointers to the trace buffers @@ -194,7 +231,7 @@ /** * init_rec_ptrs - initialises data area pointers to locations in user space - * @tbufs_mach: machine base address of the trace buffer area + * @tbufs_mfn: base mfn of the trace buffer area * @tbufs_mapped: user virtual address of base of trace buffer area * @meta: array of user-space pointers to struct t_buf's of metadata * @num: number of trace buffers @@ -203,7 +240,7 @@ * mapped in user space. Note that the trace buffer metadata contains machine * pointers - the array returned allows more convenient access to them. */ -struct t_rec **init_rec_ptrs(unsigned long tbufs_mach, +struct t_rec **init_rec_ptrs(unsigned long tbufs_mfn, struct t_buf *tbufs_mapped, struct t_buf **meta, unsigned int num) @@ -219,7 +256,7 @@ } for ( i = 0; i < num; i++ ) - data[i] = (struct t_rec *)(meta[i]->rec_addr - tbufs_mach + data[i] = (struct t_rec *)(meta[i]->rec_addr - (tbufs_mfn<<XC_PAGE_SHIFT) /* XXX */ + (unsigned long)tbufs_mapped); return data; @@ -293,7 +330,7 @@ struct t_rec **data; /* pointers to the trace buffer data areas * where they are mapped into user space. 
*/ unsigned long *cons; /* store tail indexes for the trace buffers */ - unsigned long tbufs_mach; /* machine address of the tbufs */ + unsigned long tbufs_mfn; /* mfn of the tbufs */ unsigned int num; /* number of trace buffers / logical CPUS */ unsigned long size; /* size of a single trace buffer */ @@ -303,14 +340,14 @@ num = get_num_cpus(); /* setup access to trace buffers */ - get_tbufs(&tbufs_mach, &size); - tbufs_mapped = map_tbufs(tbufs_mach, num, size); + get_tbufs(&tbufs_mfn, &size); + tbufs_mapped = map_tbufs(tbufs_mfn, num, size); size_in_recs = (size - sizeof(struct t_buf)) / sizeof(struct t_rec); /* build arrays of convenience ptrs */ meta = init_bufs_ptrs (tbufs_mapped, num, size); - data = init_rec_ptrs (tbufs_mach, tbufs_mapped, meta, num); + data = init_rec_ptrs (tbufs_mfn, tbufs_mapped, meta, num); cons = init_tail_idxs (meta, num); /* now, scan buffers for events */ @@ -341,6 +378,31 @@ * Various declarations / definitions GNU argp needs to do its work *****************************************************************************/ +int parse_evtmask(char *arg, struct argp_state *state) +{ + settings_t *setup = (settings_t *)state->input; + char *inval; + + /* search filtering class */ + if (strcmp(arg, "gen") == 0){ + setup->evt_mask |= TRC_GEN; + } else if(strcmp(arg, "sched") == 0){ + setup->evt_mask |= TRC_SCHED; + } else if(strcmp(arg, "dom0op") == 0){ + setup->evt_mask |= TRC_DOM0OP; + } else if(strcmp(arg, "vmx") == 0){ + setup->evt_mask |= TRC_VMX; + } else if(strcmp(arg, "all") == 0){ + setup->evt_mask |= TRC_ALL; + } else { + setup->evt_mask = strtol(arg, &inval, 0); + if ( inval == arg ) + argp_usage(state); + } + + return 0; + +} /* command parser for GNU argp - see GNU docs for more info */ error_t cmd_parser(int key, char *arg, struct argp_state *state) @@ -366,6 +428,21 @@ argp_usage(state); } break; + + case 'c': /* set new cpu mask for filtering*/ + { + char *inval; + setup->cpu_mask = strtol(arg, &inval, 0); + if ( inval == arg ) 
+ argp_usage(state); + } + break; + + case 'e': /* set new event mask for filtering*/ + { + parse_evtmask(arg, state); + } + break; case ARGP_KEY_ARG: { @@ -397,6 +474,14 @@ .doc = "Set sleep time, p, in milliseconds between polling the trace buffer " "for new data (default " xstr(POLL_SLEEP_MILLIS) ")." }, + + { .name = "cpu-mask", .key='c', .arg="c", + .doc = + "set cpu-mask " }, + + { .name = "evt-mask", .key='e', .arg="e", + .doc = + "set evt-mask " }, {0} }; @@ -430,8 +515,18 @@ opts.outfile = 0; opts.poll_sleep = millis_to_timespec(POLL_SLEEP_MILLIS); opts.new_data_thresh = NEW_DATA_THRESH; + opts.evt_mask = 0; + opts.cpu_mask = 0; argp_parse(&parser_def, argc, argv, 0, 0, &opts); + + if (opts.evt_mask != 0) { + set_mask(opts.evt_mask, 0); + } + + if (opts.cpu_mask != 0) { + set_mask(opts.evt_mask, 1); + } if ( opts.outfile ) outfd = open(opts.outfile, O_WRONLY | O_CREAT); diff -r 5f1ed597f107 -r 8799d14bef77 xen/Rules.mk --- a/xen/Rules.mk Wed Aug 24 02:43:18 2005 +++ b/xen/Rules.mk Thu Aug 25 22:53:20 2005 @@ -2,7 +2,7 @@ # If you change any of these configuration options then you must # 'make clean' before rebuilding. 
# -verbose ?= n +verbose ?= y debug ?= n perfc ?= n perfc_arrays?= n @@ -10,14 +10,6 @@ optimize ?= y domu_debug ?= n crash_debug ?= n - -# ACM_USE_SECURITY_POLICY is set to security policy of Xen -# Supported models are: -# ACM_NULL_POLICY (ACM will not be built with this policy) -# ACM_CHINESE_WALL_POLICY -# ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY -# ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY -ACM_USE_SECURITY_POLICY ?= ACM_NULL_POLICY include $(BASEDIR)/../Config.mk diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/ia64/domain.c --- a/xen/arch/ia64/domain.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/ia64/domain.c Thu Aug 25 22:53:20 2005 @@ -1092,3 +1092,12 @@ { vcpu_pend_interrupt(dom0->vcpu[0],irq); } + +void vcpu_migrate_cpu(struct vcpu *v, int newcpu) +{ + if ( v->processor == newcpu ) + return; + + set_bit(_VCPUF_cpu_migrated, &v->vcpu_flags); + v->processor = newcpu; +} diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/ia64/grant_table.c --- a/xen/arch/ia64/grant_table.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/ia64/grant_table.c Thu Aug 25 22:53:20 2005 @@ -355,7 +355,7 @@ /* Bitwise-OR avoids short-circuiting which screws control flow. */ if ( unlikely(__get_user(dom, &uop->dom) | __get_user(ref, &uop->ref) | - __get_user(host_virt_addr, &uop->host_virt_addr) | + __get_user(host_virt_addr, &uop->host_addr) | __get_user(dev_hst_ro_flags, &uop->flags)) ) { DPRINTK("Fault while reading gnttab_map_grant_ref_t.\n"); @@ -500,7 +500,7 @@ ld = current->domain; /* Bitwise-OR avoids short-circuiting which screws control flow. 
*/ - if ( unlikely(__get_user(virt, &uop->host_virt_addr) | + if ( unlikely(__get_user(virt, &uop->host_addr) | __get_user(frame, &uop->dev_bus_addr) | __get_user(handle, &uop->handle)) ) { @@ -545,15 +545,6 @@ if ( frame == 0 ) { frame = act->frame; - } - else if ( frame == GNTUNMAP_DEV_FROM_VIRT ) - { - if ( !( flags & GNTMAP_device_map ) ) - PIN_FAIL(unmap_out, GNTST_bad_dev_addr, - "Bad frame number: frame not mapped for dev access.\n"); - frame = act->frame; - - /* Frame will be unmapped for device access below if virt addr okay. */ } else { @@ -615,15 +606,6 @@ act->pin -= (flags & GNTMAP_readonly) ? GNTPIN_hstr_inc : GNTPIN_hstw_inc; - - if ( frame == GNTUNMAP_DEV_FROM_VIRT ) - { - act->pin -= (flags & GNTMAP_readonly) ? GNTPIN_devr_inc - : GNTPIN_devw_inc; - - map->ref_and_flags &= ~GNTMAP_device_map; - (void)__put_user(0, &uop->dev_bus_addr); - } rc = 0; *va = virt; diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/ia64/vcpu.c --- a/xen/arch/ia64/vcpu.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/ia64/vcpu.c Thu Aug 25 22:53:20 2005 @@ -585,6 +585,14 @@ set_bit(vector,PSCBX(vcpu,irr)); PSCB(vcpu,pending_interruption) = 1; } + + /* Keir: I think you should unblock when an interrupt is pending. 
*/ + { + int running = test_bit(_VCPUF_running, &vcpu->vcpu_flags); + vcpu_unblock(vcpu); + if ( running ) + smp_send_event_check_cpu(vcpu->processor); + } } void early_tick(VCPU *vcpu) diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/ia64/xenmisc.c --- a/xen/arch/ia64/xenmisc.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/ia64/xenmisc.c Thu Aug 25 22:53:20 2005 @@ -280,7 +280,6 @@ unsigned long context_switch_count = 0; -// context_switch void context_switch(struct vcpu *prev, struct vcpu *next) { //printk("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n"); @@ -290,22 +289,14 @@ //if (prev->domain->domain_id == 0 && next->domain->domain_id == 1) cs01foo(); //printk("@@sw %d->%d\n",prev->domain->domain_id,next->domain->domain_id); #ifdef CONFIG_VTI - unsigned long psr; - /* Interrupt is enabled after next task is chosen. - * So we have to disable it for stack switch. - */ - local_irq_save(psr); vtm_domain_out(prev); - /* Housekeeping for prev domain */ -#endif // CONFIG_VTI - +#endif context_switch_count++; switch_to(prev,next,prev); #ifdef CONFIG_VTI - /* Post-setup for new domain */ vtm_domain_in(current); - local_irq_restore(psr); -#endif // CONFIG_VTI +#endif + // leave this debug for now: it acts as a heartbeat when more than // one domain is active { @@ -315,25 +306,27 @@ if (!cnt[id]--) { printk("%x",id); cnt[id] = 500000; } if (!i--) { printk("+",id); i = 1000000; } } - clear_bit(_VCPUF_running, &prev->vcpu_flags); - //if (!is_idle_task(next->domain) ) - //send_guest_virq(next, VIRQ_TIMER); + #ifdef CONFIG_VTI if (VMX_DOMAIN(current)) vmx_load_all_rr(current); - return; -#else // CONFIG_VTI +#else if (!is_idle_task(current->domain)) { load_region_regs(current); if (vcpu_timer_expired(current)) vcpu_pend_timer(current); } if (vcpu_timer_expired(current)) vcpu_pend_timer(current); -#endif // CONFIG_VTI +#endif +} + +void context_switch_finalise(struct vcpu *next) +{ + /* nothing to do */ } void continue_running(struct vcpu *same) { - /* nothing to do */ + /* 
nothing to do */ } void panic_domain(struct pt_regs *regs, const char *fmt, ...) diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/ia64/xensetup.c --- a/xen/arch/ia64/xensetup.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/ia64/xensetup.c Thu Aug 25 22:53:20 2005 @@ -131,12 +131,14 @@ } struct ns16550_defaults ns16550_com1 = { + .baud = BAUD_AUTO, .data_bits = 8, .parity = 'n', .stop_bits = 1 }; struct ns16550_defaults ns16550_com2 = { + .baud = BAUD_AUTO, .data_bits = 8, .parity = 'n', .stop_bits = 1 diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/ia64/xentime.c --- a/xen/arch/ia64/xentime.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/ia64/xentime.c Thu Aug 25 22:53:20 2005 @@ -48,7 +48,7 @@ static s_time_t stime_irq = 0x0; /* System time at last 'time update' */ unsigned long itc_scale; unsigned long itc_at_irq; -static unsigned long wc_sec, wc_usec; /* UTC time at last 'time update'. */ +static unsigned long wc_sec, wc_nsec; /* UTC time at last 'time update'. */ //static rwlock_t time_lock = RW_LOCK_UNLOCKED; static irqreturn_t vmx_timer_interrupt (int irq, void *dev_id, struct pt_regs *regs); @@ -103,25 +103,22 @@ } /* Set clock to <secs,usecs> after 00:00:00 UTC, 1 January, 1970. */ -void do_settime(unsigned long secs, unsigned long usecs, u64 system_time_base) +void do_settime(unsigned long secs, unsigned long nsecs, u64 system_time_base) { #ifdef CONFIG_VTI - s64 delta; - long _usecs = (long)usecs; + u64 _nsecs; write_lock_irq(&xtime_lock); - delta = (s64)(stime_irq - system_time_base); - - _usecs += (long)(delta/1000); - while ( _usecs >= 1000000 ) + _nsecs = (u64)nsecs + (s64)(stime_irq - system_time_base); + while ( _nsecs >= 1000000000 ) { - _usecs -= 1000000; + _nsecs -= 1000000000; secs++; } wc_sec = secs; - wc_usec = _usecs; + wc_nsec = (unsigned long)_nsecs; write_unlock_irq(&xtime_lock); @@ -290,13 +287,13 @@ /* Wallclock time starts as the initial RTC time. 
*/ efi_gettimeofday(&tm); wc_sec = tm.tv_sec; - wc_usec = tm.tv_nsec/1000; + wc_nsec = tm.tv_nsec; printk("Time init:\n"); printk(".... System Time: %ldns\n", NOW()); printk(".... scale: %16lX\n", itc_scale); - printk(".... Wall Clock: %lds %ldus\n", wc_sec, wc_usec); + printk(".... Wall Clock: %lds %ldus\n", wc_sec, wc_nsec/1000); return 0; } @@ -338,10 +335,10 @@ (*(unsigned long *)&jiffies_64)++; /* Update wall time. */ - wc_usec += 1000000/HZ; - if ( wc_usec >= 1000000 ) + wc_nsec += 1000000000/HZ; + if ( wc_nsec >= 1000000000 ) { - wc_usec -= 1000000; + wc_nsec -= 1000000000; wc_sec++; } diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/x86/audit.c --- a/xen/arch/x86/audit.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/x86/audit.c Thu Aug 25 22:53:20 2005 @@ -73,7 +73,7 @@ if ( tcount < 0 ) { APRINTK("Audit %d: type count went below zero " - "mfn=%lx t=%x ot=%x", + "mfn=%lx t=%" PRtype_info " ot=%x", d->domain_id, page_to_pfn(page), page->u.inuse.type_info, page->tlbflush_timestamp); @@ -82,7 +82,7 @@ else if ( (tcount & ~PGT_count_mask) != 0 ) { APRINTK("Audit %d: type count overflowed " - "mfn=%lx t=%x ot=%x", + "mfn=%lx t=%" PRtype_info " ot=%x", d->domain_id, page_to_pfn(page), page->u.inuse.type_info, page->tlbflush_timestamp); @@ -101,7 +101,7 @@ if ( count < 0 ) { APRINTK("Audit %d: general count went below zero " - "mfn=%lx t=%x ot=%x", + "mfn=%lx t=%" PRtype_info " ot=%x", d->domain_id, page_to_pfn(page), page->u.inuse.type_info, page->tlbflush_timestamp); @@ -110,7 +110,7 @@ else if ( (count & ~PGT_count_mask) != 0 ) { APRINTK("Audit %d: general count overflowed " - "mfn=%lx t=%x ot=%x", + "mfn=%lx t=%" PRtype_info " ot=%x", d->domain_id, page_to_pfn(page), page->u.inuse.type_info, page->tlbflush_timestamp); @@ -152,7 +152,8 @@ if ( page_type != PGT_l1_shadow ) { printk("Audit %d: [Shadow L2 mfn=%lx i=%x] " - "Expected Shadow L1 t=%x mfn=%lx\n", + "Expected Shadow L1 t=%" PRtype_info + " mfn=%lx\n", d->domain_id, mfn, i, l1page->u.inuse.type_info, l1mfn); 
errors++; @@ -178,14 +179,14 @@ if ( page_type == PGT_l2_page_table ) { printk("Audit %d: [%x] Found %s Linear PT " - "t=%x mfn=%lx\n", + "t=%" PRtype_info " mfn=%lx\n", d->domain_id, i, (l1mfn==mfn) ? "Self" : "Other", l1page->u.inuse.type_info, l1mfn); } else if ( page_type != PGT_l1_page_table ) { printk("Audit %d: [L2 mfn=%lx i=%x] " - "Expected L1 t=%x mfn=%lx\n", + "Expected L1 t=%" PRtype_info " mfn=%lx\n", d->domain_id, mfn, i, l1page->u.inuse.type_info, l1mfn); errors++; @@ -237,7 +238,8 @@ if ( page_get_owner(gpage) != d ) { printk("Audit %d: [hl2mfn=%lx,i=%x] Skip foreign page " - "dom=%p (id=%d) mfn=%lx c=%08x t=%08x\n", + "dom=%p (id=%d) mfn=%lx c=%08x t=%" + PRtype_info "\n", d->domain_id, hl2mfn, i, page_get_owner(gpage), page_get_owner(gpage)->domain_id, @@ -288,7 +290,7 @@ PGT_writable_page) ) { printk("Audit %d: [l1mfn=%lx, i=%x] Illegal RW " - "t=%x mfn=%lx\n", + "t=%" PRtype_info " mfn=%lx\n", d->domain_id, l1mfn, i, gpage->u.inuse.type_info, gmfn); errors++; @@ -308,7 +310,8 @@ if ( page_get_owner(gpage) != d ) { printk("Audit %d: [l1mfn=%lx,i=%x] Skip foreign page " - "dom=%p (id=%d) mfn=%lx c=%08x t=%08x\n", + "dom=%p (id=%d) mfn=%lx c=%08x t=%" + PRtype_info "\n", d->domain_id, l1mfn, i, page_get_owner(gpage), page_get_owner(gpage)->domain_id, @@ -454,7 +457,7 @@ if ( shadow_refcounts ) { printk("Audit %d: found an L2 guest page " - "mfn=%lx t=%08x c=%08x while in shadow mode\n", + "mfn=%lx t=%" PRtype_info " c=%08x while in shadow mode\n", d->domain_id, mfn, page->u.inuse.type_info, page->count_info); errors++; @@ -465,14 +468,16 @@ if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated ) { - printk("Audit %d: L2 mfn=%lx not validated %08x\n", + printk("Audit %d: L2 mfn=%lx not validated %" + PRtype_info "\n", d->domain_id, mfn, page->u.inuse.type_info); errors++; } if ( (page->u.inuse.type_info & PGT_pinned) != PGT_pinned ) { - printk("Audit %d: L2 mfn=%lx not pinned t=%08x\n", + printk("Audit %d: L2 mfn=%lx not pinned t=%" + 
PRtype_info "\n", d->domain_id, mfn, page->u.inuse.type_info); errors++; } @@ -494,7 +499,8 @@ { if ( shadow_refcounts ) { - printk("found an L1 guest page mfn=%lx t=%08x c=%08x " + printk("found an L1 guest page mfn=%lx t=%" + PRtype_info " c=%08x " "while in shadow mode\n", mfn, page->u.inuse.type_info, page->count_info); errors++; @@ -505,7 +511,8 @@ if ( (page->u.inuse.type_info & PGT_validated) != PGT_validated ) { - printk("Audit %d: L1 not validated mfn=%lx t=%08x\n", + printk("Audit %d: L1 not validated mfn=%lx t=%" + PRtype_info "\n", d->domain_id, mfn, page->u.inuse.type_info); errors++; } @@ -514,7 +521,8 @@ { if ( !VM_ASSIST(d, VMASST_TYPE_writable_pagetables) ) { - printk("Audit %d: L1 mfn=%lx not pinned t=%08x\n", + printk("Audit %d: L1 mfn=%lx not pinned t=%" + PRtype_info "\n", d->domain_id, mfn, page->u.inuse.type_info); } } @@ -621,7 +629,7 @@ for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) { if ( (pt[i] & _PAGE_PRESENT) && ((pt[i] >> PAGE_SHIFT) == xmfn) ) - printk(" found dom=%d mfn=%lx t=%08x c=%08x " + printk(" found dom=%d mfn=%lx t=%" PRtype_info " c=%08x " "pt[i=%x]=%lx\n", d->domain_id, mfn, page->u.inuse.type_info, page->count_info, i, pt[i]); @@ -754,7 +762,7 @@ if ( (page->u.inuse.type_info & PGT_count_mask) > (page->count_info & PGC_count_mask) ) { - printk("taf(%08x) > caf(%08x) mfn=%lx\n", + printk("taf(%" PRtype_info ") > caf(%08x) mfn=%lx\n", page->u.inuse.type_info, page->count_info, mfn); errors++; } @@ -763,8 +771,8 @@ (page_type == PGT_writable_page) && !(page->u.inuse.type_info & PGT_validated) ) { - printk("shadow mode writable page not validated mfn=%lx " - "t=%08x c=%08x\n", + printk("shadow mode writable page not validated mfn=%lx " + "t=%" PRtype_info " c=%08x\n", mfn, page->u.inuse.type_info, page->count_info); errors++; } @@ -774,7 +782,7 @@ (page->u.inuse.type_info & PGT_count_mask) > 1 ) { printk("writeable page with type count >1: " - "mfn=%lx t=%08x c=%08x\n", + "mfn=%lx t=%" PRtype_info " c=%08x\n", mfn, 
page->u.inuse.type_info, page->count_info ); @@ -786,7 +794,7 @@ if ( page_type == PGT_none && (page->u.inuse.type_info & PGT_count_mask) > 0 ) { - printk("normal page with type count >0: mfn=%lx t=%08x c=%08x\n", + printk("normal page with type count >0: mfn=%lx t=%" PRtype_info " c=%08x\n", mfn, page->u.inuse.type_info, page->count_info ); @@ -812,7 +820,7 @@ : !(page_type && (page_type <= PGT_l4_page_table)) ) { printk("out of sync page mfn=%lx has strange type " - "t=%08x c=%08x\n", + "t=%" PRtype_info " c=%08x\n", mfn, page->u.inuse.type_info, page->count_info); errors++; } @@ -850,7 +858,7 @@ case PGT_l4_page_table: if ( (page->u.inuse.type_info & PGT_count_mask) != 0 ) { - printk("Audit %d: type count!=0 t=%x ot=%x c=%x mfn=%lx\n", + printk("Audit %d: type count!=0 t=%" PRtype_info " ot=%x c=%x mfn=%lx\n", d->domain_id, page->u.inuse.type_info, page->tlbflush_timestamp, page->count_info, mfn); @@ -864,7 +872,7 @@ case PGT_ldt_page: if ( (page->u.inuse.type_info & PGT_count_mask) != 0 ) { - printk("Audit %d: type count!=0 t=%x ot=%x c=%x mfn=%lx\n", + printk("Audit %d: type count!=0 t=%" PRtype_info " ot=%x c=%x mfn=%lx\n", d->domain_id, page->u.inuse.type_info, page->tlbflush_timestamp, page->count_info, mfn); @@ -877,7 +885,7 @@ if ( (page->count_info & PGC_count_mask) != 1 ) { - printk("Audit %d: gen count!=1 (c=%x) t=%x ot=%x mfn=%lx\n", + printk("Audit %d: gen count!=1 (c=%x) t=%" PRtype_info " ot=%x mfn=%lx\n", d->domain_id, page->count_info, page->u.inuse.type_info, @@ -913,7 +921,7 @@ (page->count_info != 0) ) { printk("Audit %d: shadow page counts wrong " - "mfn=%lx t=%08x c=%08x\n", + "mfn=%lx t=%" PRtype_info " c=%08x\n", d->domain_id, page_to_pfn(page), page->u.inuse.type_info, page->count_info); diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/x86/cpu/amd.c --- a/xen/arch/x86/cpu/amd.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/x86/cpu/amd.c Thu Aug 25 22:53:20 2005 @@ -8,6 +8,20 @@ #include <asm/processor.h> #include "cpu.h" + +/* + * 
amd_flush_filter={on,off}. Forcibly Enable or disable the TLB flush + * filter on AMD 64-bit processors. + */ +static int flush_filter_force; +static void flush_filter(char *s) +{ + if (!strcmp(s, "off")) + flush_filter_force = -1; + if (!strcmp(s, "on")) + flush_filter_force = 1; +} +custom_param("amd_flush_filter", flush_filter); #define num_physpages 0 @@ -25,7 +39,7 @@ */ extern void vide(void); -__asm__(".align 4\nvide: ret"); +__asm__(".text\n.align 4\nvide: ret"); static void __init init_amd(struct cpuinfo_x86 *c) { @@ -190,6 +204,21 @@ case 6: set_bit(X86_FEATURE_K7, c->x86_capability); break; + } + + if (c->x86 == 15) { + rdmsr(MSR_K7_HWCR, l, h); + printk(KERN_INFO "CPU%d: AMD Flush Filter %sabled", + smp_processor_id(), (l & (1<<6)) ? "dis" : "en"); + if ((flush_filter_force > 0) && (l & (1<<6))) { + l &= ~(1<<6); + printk(" -> Forcibly enabled"); + } else if ((flush_filter_force < 0) && !(l & (1<<6))) { + l |= 1<<6; + printk(" -> Forcibly disabled"); + } + wrmsr(MSR_K7_HWCR, l, h); + printk("\n"); } display_cacheinfo(c); diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/x86/dom0_ops.c --- a/xen/arch/x86/dom0_ops.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/x86/dom0_ops.c Thu Aug 25 22:53:20 2005 @@ -404,15 +404,17 @@ memcpy(c, &v->arch.guest_context, sizeof(*c)); - /* IOPL privileges are virtualised -- merge back into returned eflags. */ - BUG_ON((c->user_regs.eflags & EF_IOPL) != 0); - c->user_regs.eflags |= v->arch.iopl << 12; - if ( VMX_DOMAIN(v) ) { save_vmx_cpu_user_regs(&c->user_regs); __vmread(CR0_READ_SHADOW, &c->ctrlreg[0]); __vmread(CR4_READ_SHADOW, &c->ctrlreg[4]); + } + else + { + /* IOPL privileges are virtualised: merge back into returned eflags. 
*/ + BUG_ON((c->user_regs.eflags & EF_IOPL) != 0); + c->user_regs.eflags |= v->arch.iopl << 12; } c->flags = 0; diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/x86/domain.c --- a/xen/arch/x86/domain.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/x86/domain.c Thu Aug 25 22:53:20 2005 @@ -48,6 +48,8 @@ struct percpu_ctxt { struct vcpu *curr_vcpu; + unsigned int context_not_finalised; + unsigned int dirty_segment_mask; } __cacheline_aligned; static struct percpu_ctxt percpu_ctxt[NR_CPUS]; @@ -190,7 +192,7 @@ { list_for_each_entry ( page, &d->page_list, list ) { - printk("Page %p: caf=%08x, taf=%08x\n", + printk("Page %p: caf=%08x, taf=%" PRtype_info "\n", _p(page_to_phys(page)), page->count_info, page->u.inuse.type_info); } @@ -198,14 +200,14 @@ list_for_each_entry ( page, &d->xenpage_list, list ) { - printk("XenPage %p: caf=%08x, taf=%08x\n", + printk("XenPage %p: caf=%08x, taf=%" PRtype_info "\n", _p(page_to_phys(page)), page->count_info, page->u.inuse.type_info); } page = virt_to_page(d->shared_info); - printk("Shared_info@%p: caf=%08x, taf=%08x\n", + printk("Shared_info@%p: caf=%08x, taf=%" PRtype_info "\n", _p(page_to_phys(page)), page->count_info, page->u.inuse.type_info); } @@ -215,8 +217,16 @@ return xmalloc(struct vcpu); } +/* We assume that vcpu 0 is always the last one to be freed in a + domain i.e. if v->vcpu_id == 0, the domain should be + single-processor. 
*/ void arch_free_vcpu_struct(struct vcpu *v) { + struct vcpu *p; + for_each_vcpu(v->domain, p) { + if (p->next_in_list == v) + p->next_in_list = v->next_in_list; + } xfree(v); } @@ -295,26 +305,23 @@ l1e_from_page(virt_to_page(gdt_table), PAGE_HYPERVISOR); } +void vcpu_migrate_cpu(struct vcpu *v, int newcpu) +{ + if ( v->processor == newcpu ) + return; + + set_bit(_VCPUF_cpu_migrated, &v->vcpu_flags); + v->processor = newcpu; + + if ( VMX_DOMAIN(v) ) + { + __vmpclear(virt_to_phys(v->arch.arch_vmx.vmcs)); + v->arch.schedule_tail = arch_vmx_do_relaunch; + } +} + #ifdef CONFIG_VMX static int vmx_switch_on; - -void arch_vmx_do_resume(struct vcpu *v) -{ - u64 vmcs_phys_ptr = (u64) virt_to_phys(v->arch.arch_vmx.vmcs); - - load_vmcs(&v->arch.arch_vmx, vmcs_phys_ptr); - vmx_do_resume(v); - reset_stack_and_jump(vmx_asm_do_resume); -} - -void arch_vmx_do_launch(struct vcpu *v) -{ - u64 vmcs_phys_ptr = (u64) virt_to_phys(v->arch.arch_vmx.vmcs); - - load_vmcs(&v->arch.arch_vmx, vmcs_phys_ptr); - vmx_do_launch(v); - reset_stack_and_jump(vmx_asm_do_launch); -} static int vmx_final_setup_guest( struct vcpu *v, struct vcpu_guest_context *ctxt) @@ -346,7 +353,7 @@ v->arch.schedule_tail = arch_vmx_do_launch; -#if defined (__i386) +#if defined (__i386__) v->domain->arch.vmx_platform.real_mode_data = (unsigned long *) regs->esi; #endif @@ -404,7 +411,7 @@ { if ( ((c->user_regs.cs & 3) == 0) || ((c->user_regs.ss & 3) == 0) ) - return -EINVAL; + return -EINVAL; } clear_bit(_VCPUF_fpu_initialised, &v->vcpu_flags); @@ -458,7 +465,7 @@ if ( !(c->flags & VGCF_VMX_GUEST) ) #endif if ( !get_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT], d, - PGT_base_page_table) ) + PGT_base_page_table) ) return -EINVAL; } @@ -479,7 +486,10 @@ } update_pagetables(v); - + + if ( v->vcpu_id == 0 ) + init_domain_time(d); + /* Don't redo final setup */ set_bit(_VCPUF_initialised, &v->vcpu_flags); @@ -541,51 +551,59 @@ __r; }) #if CONFIG_VMX -#define load_msrs(_p, _n) if (vmx_switch_on) vmx_load_msrs((_p), 
(_n)) +#define load_msrs(n) if (vmx_switch_on) vmx_load_msrs(n) #else -#define load_msrs(_p, _n) ((void)0) +#define load_msrs(n) ((void)0) #endif -static void load_segments(struct vcpu *p, struct vcpu *n) -{ - struct vcpu_guest_context *pctxt = &p->arch.guest_context; +/* + * save_segments() writes a mask of segments which are dirty (non-zero), + * allowing load_segments() to avoid some expensive segment loads and + * MSR writes. + */ +#define DIRTY_DS 0x01 +#define DIRTY_ES 0x02 +#define DIRTY_FS 0x04 +#define DIRTY_GS 0x08 +#define DIRTY_FS_BASE 0x10 +#define DIRTY_GS_BASE_USER 0x20 + +static void load_segments(struct vcpu *n) +{ struct vcpu_guest_context *nctxt = &n->arch.guest_context; int all_segs_okay = 1; + unsigned int dirty_segment_mask, cpu = smp_processor_id(); + + /* Load and clear the dirty segment mask. */ + dirty_segment_mask = percpu_ctxt[cpu].dirty_segment_mask; + percpu_ctxt[cpu].dirty_segment_mask = 0; /* Either selector != 0 ==> reload. */ - if ( unlikely(pctxt->user_regs.ds | nctxt->user_regs.ds) ) + if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) ) all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds); /* Either selector != 0 ==> reload. */ - if ( unlikely(pctxt->user_regs.es | nctxt->user_regs.es) ) + if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) ) all_segs_okay &= loadsegment(es, nctxt->user_regs.es); /* * Either selector != 0 ==> reload. * Also reload to reset FS_BASE if it was non-zero. */ - if ( unlikely(pctxt->user_regs.fs | - pctxt->fs_base | + if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) | nctxt->user_regs.fs) ) - { all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs); - if ( pctxt->user_regs.fs ) /* != 0 selector kills fs_base */ - pctxt->fs_base = 0; - } /* * Either selector != 0 ==> reload. * Also reload to reset GS_BASE if it was non-zero. 
*/ - if ( unlikely(pctxt->user_regs.gs | - pctxt->gs_base_user | + if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) | nctxt->user_regs.gs) ) { /* Reset GS_BASE with user %gs? */ - if ( pctxt->user_regs.gs || !nctxt->gs_base_user ) + if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user ) all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs); - if ( pctxt->user_regs.gs ) /* != 0 selector kills gs_base_user */ - pctxt->gs_base_user = 0; } /* This can only be non-zero if selector is NULL. */ @@ -650,7 +668,9 @@ static void save_segments(struct vcpu *v) { - struct cpu_user_regs *regs = &v->arch.guest_context.user_regs; + struct vcpu_guest_context *ctxt = &v->arch.guest_context; + struct cpu_user_regs *regs = &ctxt->user_regs; + unsigned int dirty_segment_mask = 0; if ( VMX_DOMAIN(v) ) rdmsrl(MSR_SHADOW_GS_BASE, v->arch.arch_vmx.msr_content.shadow_gs); @@ -659,18 +679,34 @@ __asm__ __volatile__ ( "movl %%es,%0" : "=m" (regs->es) ); __asm__ __volatile__ ( "movl %%fs,%0" : "=m" (regs->fs) ); __asm__ __volatile__ ( "movl %%gs,%0" : "=m" (regs->gs) ); -} - -static void clear_segments(void) -{ - __asm__ __volatile__ ( - " movl %0,%%ds; " - " movl %0,%%es; " - " movl %0,%%fs; " - " movl %0,%%gs; " - ""safe_swapgs" " - " movl %0,%%gs" - : : "r" (0) ); + + if ( regs->ds ) + dirty_segment_mask |= DIRTY_DS; + + if ( regs->es ) + dirty_segment_mask |= DIRTY_ES; + + if ( regs->fs ) + { + dirty_segment_mask |= DIRTY_FS; + ctxt->fs_base = 0; /* != 0 selector kills fs_base */ + } + else if ( ctxt->fs_base ) + { + dirty_segment_mask |= DIRTY_FS_BASE; + } + + if ( regs->gs ) + { + dirty_segment_mask |= DIRTY_GS; + ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */ + } + else if ( ctxt->gs_base_user ) + { + dirty_segment_mask |= DIRTY_GS_BASE_USER; + } + + percpu_ctxt[smp_processor_id()].dirty_segment_mask = dirty_segment_mask; } long do_switch_to_user(void) @@ -706,10 +742,9 @@ #elif defined(__i386__) -#define load_segments(_p, _n) ((void)0) 
-#define load_msrs(_p, _n) ((void)0) -#define save_segments(_p) ((void)0) -#define clear_segments() ((void)0) +#define load_segments(n) ((void)0) +#define load_msrs(n) ((void)0) +#define save_segments(p) ((void)0) static inline void switch_kernel_stack(struct vcpu *n, unsigned int cpu) { @@ -726,9 +761,9 @@ static void __context_switch(void) { struct cpu_user_regs *stack_regs = guest_cpu_user_regs(); - unsigned int cpu = smp_processor_id(); - struct vcpu *p = percpu_ctxt[cpu].curr_vcpu; - struct vcpu *n = current; + unsigned int cpu = smp_processor_id(); + struct vcpu *p = percpu_ctxt[cpu].curr_vcpu; + struct vcpu *n = current; if ( !is_idle_task(p->domain) ) { @@ -786,23 +821,31 @@ void context_switch(struct vcpu *prev, struct vcpu *next) { - struct vcpu *realprev; - - local_irq_disable(); + unsigned int cpu = smp_processor_id(); + + ASSERT(!local_irq_is_enabled()); set_current(next); - if ( ((realprev = percpu_ctxt[smp_processor_id()].curr_vcpu) == next) || - is_idle_task(next->domain) ) - { - local_irq_enable(); - } - else + if ( (percpu_ctxt[cpu].curr_vcpu != next) && !is_idle_task(next->domain) ) { __context_switch(); - - local_irq_enable(); - + percpu_ctxt[cpu].context_not_finalised = 1; + } +} + +void context_switch_finalise(struct vcpu *next) +{ + unsigned int cpu = smp_processor_id(); + + ASSERT(local_irq_is_enabled()); + + if ( percpu_ctxt[cpu].context_not_finalised ) + { + percpu_ctxt[cpu].context_not_finalised = 0; + + BUG_ON(percpu_ctxt[cpu].curr_vcpu != next); + if ( VMX_DOMAIN(next) ) { vmx_restore_msrs(next); @@ -810,18 +853,10 @@ else { load_LDT(next); - load_segments(realprev, next); - load_msrs(realprev, next); - } - } - - /* - * We do this late on because it doesn't need to be protected by the - * schedule_lock, and because we want this to be the very last use of - * 'prev' (after this point, a dying domain's info structure may be freed - * without warning). 
- */ - clear_bit(_VCPUF_running, &prev->vcpu_flags); + load_segments(next); + load_msrs(next); + } + } schedule_tail(next); BUG(); @@ -835,12 +870,19 @@ int __sync_lazy_execstate(void) { - if ( percpu_ctxt[smp_processor_id()].curr_vcpu == current ) - return 0; - __context_switch(); - load_LDT(current); - clear_segments(); - return 1; + unsigned long flags; + int switch_required; + + local_irq_save(flags); + + switch_required = (percpu_ctxt[smp_processor_id()].curr_vcpu != current); + + if ( switch_required ) + __context_switch(); + + local_irq_restore(flags); + + return switch_required; } void sync_lazy_execstate_cpu(unsigned int cpu) diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/x86/domain_build.c --- a/xen/arch/x86/domain_build.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/x86/domain_build.c Thu Aug 25 22:53:20 2005 @@ -22,16 +22,28 @@ #include <asm/i387.h> #include <asm/shadow.h> -/* opt_dom0_mem: memory allocated to domain 0. */ -static unsigned int opt_dom0_mem; +static long dom0_nrpages; + +/* + * dom0_mem: + * If +ve: + * * The specified amount of memory is allocated to domain 0. + * If -ve: + * * All of memory is allocated to domain 0, minus the specified amount. + * If not specified: + * * All of memory is allocated to domain 0, minus 1/16th which is reserved + * for uses such as DMA buffers (the reservation is clamped to 128MB). + */ static void parse_dom0_mem(char *s) { - unsigned long long bytes = parse_size_and_unit(s); - /* If no unit is specified we default to kB units, not bytes. 
*/ - if ( isdigit(s[strlen(s)-1]) ) - opt_dom0_mem = (unsigned int)bytes; - else - opt_dom0_mem = (unsigned int)(bytes >> 10); + unsigned long long bytes; + char *t = s; + if ( *s == '-' ) + t++; + bytes = parse_size_and_unit(t); + dom0_nrpages = bytes >> PAGE_SHIFT; + if ( *s == '-' ) + dom0_nrpages = -dom0_nrpages; } custom_param("dom0_mem", parse_dom0_mem); @@ -57,11 +69,21 @@ #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK) #define round_pgdown(_p) ((_p)&PAGE_MASK) -static struct pfn_info *alloc_largest(struct domain *d, unsigned long max) +static struct pfn_info *alloc_chunk(struct domain *d, unsigned long max_pages) { struct pfn_info *page; - unsigned int order = get_order(max * PAGE_SIZE); - if ( (max & (max-1)) != 0 ) + unsigned int order; + /* + * Allocate up to 2MB at a time: + * 1. This prevents overflow of get_order() when allocating more than + * 4GB to domain 0 on a PAE machine. + * 2. It prevents allocating very large chunks from DMA pools before + * the >4GB pool is fully depleted. + */ + if ( max_pages > (2UL << (20 - PAGE_SHIFT)) ) + max_pages = 2UL << (20 - PAGE_SHIFT); + order = get_order(max_pages << PAGE_SHIFT); + if ( (max_pages & (max_pages-1)) != 0 ) order--; while ( (page = alloc_domheap_pages(d, order, 0)) == NULL ) if ( order-- == 0 ) @@ -74,12 +96,12 @@ unsigned long _initrd_start, unsigned long initrd_len, char *cmdline) { - int i, rc, dom0_pae, xen_pae; + int i, rc, dom0_pae, xen_pae, order; unsigned long pfn, mfn; unsigned long nr_pages; unsigned long nr_pt_pages; - unsigned long alloc_start; - unsigned long alloc_end; + unsigned long alloc_spfn; + unsigned long alloc_epfn; unsigned long count; struct pfn_info *page = NULL; start_info_t *si; @@ -137,16 +159,30 @@ printk("*** LOADING DOMAIN 0 ***\n"); - /* By default DOM0 is allocated all available memory. 
*/ d->max_pages = ~0U; - if ( (nr_pages = opt_dom0_mem >> (PAGE_SHIFT - 10)) == 0 ) + + /* + * If domain 0 allocation isn't specified, reserve 1/16th of available + * memory for things like DMA buffers. This reservation is clamped to + * a maximum of 128MB. + */ + if ( dom0_nrpages == 0 ) + { + dom0_nrpages = avail_domheap_pages() + + ((initrd_len + PAGE_SIZE - 1) >> PAGE_SHIFT) + + ((image_len + PAGE_SIZE - 1) >> PAGE_SHIFT); + dom0_nrpages = min(dom0_nrpages / 16, 128L << (20 - PAGE_SHIFT)); + dom0_nrpages = -dom0_nrpages; + } + + /* Negative memory specification means "all memory - specified amount". */ + if ( dom0_nrpages < 0 ) nr_pages = avail_domheap_pages() + ((initrd_len + PAGE_SIZE - 1) >> PAGE_SHIFT) + - ((image_len + PAGE_SIZE - 1) >> PAGE_SHIFT); - if ( (page = alloc_largest(d, nr_pages)) == NULL ) - panic("Not enough RAM for DOM0 reservation.\n"); - alloc_start = page_to_phys(page); - alloc_end = alloc_start + (d->tot_pages << PAGE_SHIFT); + ((image_len + PAGE_SIZE - 1) >> PAGE_SHIFT) + + dom0_nrpages; + else + nr_pages = dom0_nrpages; if ( (rc = parseelfimage(&dsi)) != 0 ) return rc; @@ -166,7 +202,7 @@ return -EINVAL; } if (strstr(dsi.xen_section_string, "SHADOW=translate")) - opt_dom0_translate = 1; + opt_dom0_translate = 1; /* Align load address to 4MB boundary. */ dsi.v_start &= ~((1UL<<22)-1); @@ -215,12 +251,19 @@ #endif } - if ( (v_end - dsi.v_start) > (alloc_end - alloc_start) ) - panic("Insufficient contiguous RAM to build kernel image.\n"); + order = get_order(v_end - dsi.v_start); + if ( (1UL << order) > nr_pages ) + panic("Domain 0 allocation is too small for kernel image.\n"); + + /* Allocate from DMA pool: PAE L3 table must be below 4GB boundary. 
*/ + if ( (page = alloc_domheap_pages(d, order, ALLOC_DOM_DMA)) == NULL ) + panic("Not enough RAM for domain 0 allocation.\n"); + alloc_spfn = page_to_pfn(page); + alloc_epfn = alloc_spfn + d->tot_pages; printk("PHYSICAL MEMORY ARRANGEMENT:\n" - " Dom0 alloc.: %p->%p", - _p(alloc_start), _p(alloc_end)); + " Dom0 alloc.: %"PRIphysaddr"->%"PRIphysaddr, + pfn_to_phys(alloc_spfn), pfn_to_phys(alloc_epfn)); if ( d->tot_pages < nr_pages ) printk(" (%lu pages to be allocated)", nr_pages - d->tot_pages); @@ -249,7 +292,8 @@ return -ENOMEM; } - mpt_alloc = (vpt_start - dsi.v_start) + alloc_start; + mpt_alloc = (vpt_start - dsi.v_start) + + (unsigned long)pfn_to_phys(alloc_spfn); /* * We're basically forcing default RPLs to 1, so that our "what privilege @@ -306,7 +350,7 @@ #endif l2tab += l2_linear_offset(dsi.v_start); - mfn = alloc_start >> PAGE_SHIFT; + mfn = alloc_spfn; for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ ) { if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) ) @@ -428,7 +472,7 @@ v->arch.guest_table = mk_pagetable(__pa(l4start)); l4tab += l4_table_offset(dsi.v_start); - mfn = alloc_start >> PAGE_SHIFT; + mfn = alloc_spfn; for ( count = 0; count < ((v_end-dsi.v_start)>>PAGE_SHIFT); count++ ) { if ( !((unsigned long)l1tab & (PAGE_SIZE-1)) ) @@ -563,24 +607,24 @@ /* Write the phys->machine and machine->phys table entries. 
*/ for ( pfn = 0; pfn < d->tot_pages; pfn++ ) { - mfn = pfn + (alloc_start>>PAGE_SHIFT); + mfn = pfn + alloc_spfn; #ifndef NDEBUG #define REVERSE_START ((v_end - dsi.v_start) >> PAGE_SHIFT) if ( !opt_dom0_translate && (pfn > REVERSE_START) ) - mfn = (alloc_end>>PAGE_SHIFT) - (pfn - REVERSE_START); + mfn = alloc_epfn - (pfn - REVERSE_START); #endif ((u32 *)vphysmap_start)[pfn] = mfn; machine_to_phys_mapping[mfn] = pfn; } while ( pfn < nr_pages ) { - if ( (page = alloc_largest(d, nr_pages - d->tot_pages)) == NULL ) + if ( (page = alloc_chunk(d, nr_pages - d->tot_pages)) == NULL ) panic("Not enough RAM for DOM0 reservation.\n"); while ( pfn < d->tot_pages ) { mfn = page_to_pfn(page); #ifndef NDEBUG -#define pfn (nr_pages - 1 - (pfn - ((alloc_end - alloc_start) >> PAGE_SHIFT))) +#define pfn (nr_pages - 1 - (pfn - (alloc_epfn - alloc_spfn))) #endif ((u32 *)vphysmap_start)[pfn] = mfn; machine_to_phys_mapping[mfn] = pfn; @@ -614,19 +658,21 @@ /* DOM0 gets access to everything. */ physdev_init_dom0(d); + init_domain_time(d); + set_bit(_DOMF_constructed, &d->domain_flags); new_thread(v, dsi.v_kernentry, vstack_end, vstartinfo_start); if ( opt_dom0_shadow || opt_dom0_translate ) { - printk("dom0: shadow enable\n"); + printk("dom0: shadow enable\n"); shadow_mode_enable(d, (opt_dom0_translate ? 
SHM_enable | SHM_refcounts | SHM_translate : SHM_enable)); if ( opt_dom0_translate ) { - printk("dom0: shadow translate\n"); + printk("dom0: shadow translate\n"); #if defined(__i386__) && defined(CONFIG_X86_PAE) printk("FIXME: PAE code needed here: %s:%d (%s)\n", __FILE__, __LINE__, __FUNCTION__); @@ -659,7 +705,7 @@ } update_pagetables(v); /* XXX SMP */ - printk("dom0: shadow setup done\n"); + printk("dom0: shadow setup done\n"); } return 0; diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/x86/io_apic.c --- a/xen/arch/x86/io_apic.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/x86/io_apic.c Thu Aug 25 22:53:20 2005 @@ -1751,8 +1751,30 @@ pin = (address - 0x10) >> 1; + *(u32 *)&rte = val; rte.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); - *(int *)&rte = val; + + /* + * What about weird destination types? + * SMI: Ignore? Ought to be set up by the BIOS. + * NMI: Ignore? Watchdog functionality is Xen's concern. + * INIT: Definitely ignore: probably a guest OS bug. + * ExtINT: Ignore? Linux only asserts this at start of day. + * For now, print a message and return an error. We can fix up on demand. + */ + if ( rte.delivery_mode > dest_LowestPrio ) + { + printk("ERROR: Attempt to write weird IOAPIC destination mode!\n"); + printk(" APIC=%d/%d, lo-reg=%x\n", apicid, pin, val); + return -EINVAL; + } + + /* + * The guest does not know physical APIC arrangement (flat vs. cluster). + * Apply genapic conventions for this platform. 
+ */ + rte.delivery_mode = INT_DELIVERY_MODE; + rte.dest_mode = INT_DEST_MODE; if ( rte.vector >= FIRST_DEVICE_VECTOR ) { diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/x86/mm.c --- a/xen/arch/x86/mm.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/x86/mm.c Thu Aug 25 22:53:20 2005 @@ -95,6 +95,7 @@ #include <xen/irq.h> #include <xen/softirq.h> #include <xen/domain_page.h> +#include <xen/event.h> #include <asm/shadow.h> #include <asm/page.h> #include <asm/flushtlb.h> @@ -122,7 +123,7 @@ static void free_l1_table(struct pfn_info *page); static int mod_l2_entry(l2_pgentry_t *, l2_pgentry_t, unsigned long, - unsigned int type); + unsigned long type); static int mod_l1_entry(l1_pgentry_t *, l1_pgentry_t); /* Used to defer flushing of memory structures. */ @@ -138,7 +139,7 @@ * Returns the current foreign domain; defaults to the currently-executing * domain if a foreign override hasn't been specified. */ -#define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ? : current->domain) +#define FOREIGNDOM (percpu_info[smp_processor_id()].foreign ?: current->domain) /* Private domain structs for DOMID_XEN and DOMID_IO. 
*/ static struct domain *dom_xen, *dom_io; @@ -354,7 +355,7 @@ static int get_page_and_type_from_pagenr(unsigned long page_nr, - u32 type, + unsigned long type, struct domain *d) { struct pfn_info *page = &frame_table[page_nr]; @@ -365,7 +366,7 @@ if ( unlikely(!get_page_type(page, type)) ) { if ( (type & PGT_type_mask) != PGT_l1_page_table ) - MEM_LOG("Bad page type for pfn %lx (%08x)", + MEM_LOG("Bad page type for pfn %lx (%" PRtype_info ")", page_nr, page->u.inuse.type_info); put_page(page); return 0; @@ -390,7 +391,7 @@ get_linear_pagetable( root_pgentry_t re, unsigned long re_pfn, struct domain *d) { - u32 x, y; + unsigned long x, y; struct pfn_info *page; unsigned long pfn; @@ -443,7 +444,7 @@ if ( unlikely(l1e_get_flags(l1e) & L1_DISALLOW_MASK) ) { - MEM_LOG("Bad L1 flags %x\n", l1e_get_flags(l1e) & L1_DISALLOW_MASK); + MEM_LOG("Bad L1 flags %x", l1e_get_flags(l1e) & L1_DISALLOW_MASK); return 0; } @@ -489,7 +490,7 @@ if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) ) { - MEM_LOG("Bad L2 flags %x\n", l2e_get_flags(l2e) & L2_DISALLOW_MASK); + MEM_LOG("Bad L2 flags %x", l2e_get_flags(l2e) & L2_DISALLOW_MASK); return 0; } @@ -522,7 +523,7 @@ if ( unlikely((l3e_get_flags(l3e) & L3_DISALLOW_MASK)) ) { - MEM_LOG("Bad L3 flags %x\n", l3e_get_flags(l3e) & L3_DISALLOW_MASK); + MEM_LOG("Bad L3 flags %x", l3e_get_flags(l3e) & L3_DISALLOW_MASK); return 0; } @@ -544,7 +545,8 @@ static int get_page_from_l4e( - l4_pgentry_t l4e, unsigned long pfn, struct domain *d) + l4_pgentry_t l4e, unsigned long pfn, + struct domain *d, unsigned long vaddr) { int rc; @@ -555,12 +557,15 @@ if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) ) { - MEM_LOG("Bad L4 flags %x\n", l4e_get_flags(l4e) & L4_DISALLOW_MASK); + MEM_LOG("Bad L4 flags %x", l4e_get_flags(l4e) & L4_DISALLOW_MASK); return 0; } + vaddr >>= L4_PAGETABLE_SHIFT; + vaddr <<= PGT_va_shift; rc = get_page_and_type_from_pagenr( - l4e_get_pfn(l4e), PGT_l3_page_table, d); + l4e_get_pfn(l4e), + PGT_l3_page_table | vaddr, d); 
if ( unlikely(!rc) ) return get_linear_pagetable(l4e, pfn, d); @@ -731,7 +736,7 @@ pl2e[l2_table_offset(LINEAR_PT_VIRT_START) + i] = (l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ? l2e_from_pfn(l3e_get_pfn(pl3e[i]), __PAGE_HYPERVISOR) : - l2e_empty(); + l2e_empty(); unmap_domain_page(pl2e); return 1; @@ -750,13 +755,47 @@ return 1; } +#elif CONFIG_X86_64 +# define create_pae_xen_mappings(pl3e) (1) + +static inline int l1_backptr( + unsigned long *backptr, unsigned long offset_in_l2, unsigned long l2_type) +{ + unsigned long l2_backptr = l2_type & PGT_va_mask; + BUG_ON(l2_backptr == PGT_va_unknown); + + *backptr = ((l2_backptr >> PGT_va_shift) << L3_PAGETABLE_SHIFT) | + (offset_in_l2 << L2_PAGETABLE_SHIFT); + return 1; +} + +static inline int l2_backptr( + unsigned long *backptr, unsigned long offset_in_l3, unsigned long l3_type) +{ + unsigned long l3_backptr = l3_type & PGT_va_mask; + BUG_ON(l3_backptr == PGT_va_unknown); + + *backptr = ((l3_backptr >> PGT_va_shift) << L4_PAGETABLE_SHIFT) | + (offset_in_l3 << L3_PAGETABLE_SHIFT); + return 1; +} + +static inline int l3_backptr( + unsigned long *backptr, unsigned long offset_in_l4, unsigned long l4_type) +{ + unsigned long l4_backptr = l4_type & PGT_va_mask; + BUG_ON(l4_backptr == PGT_va_unknown); + + *backptr = (offset_in_l4 << L4_PAGETABLE_SHIFT); + return 1; +} #else # define create_pae_xen_mappings(pl3e) (1) # define l1_backptr(bp,l2o,l2t) \ ({ *(bp) = (unsigned long)(l2o) << L2_PAGETABLE_SHIFT; 1; }) #endif -static int alloc_l2_table(struct pfn_info *page, unsigned int type) +static int alloc_l2_table(struct pfn_info *page, unsigned long type) { struct domain *d = page_get_owner(page); unsigned long pfn = page_to_pfn(page); @@ -808,7 +847,7 @@ #if CONFIG_PAGING_LEVELS >= 3 -static int alloc_l3_table(struct pfn_info *page) +static int alloc_l3_table(struct pfn_info *page, unsigned long type) { struct domain *d = page_get_owner(page); unsigned long pfn = page_to_pfn(page); @@ -818,10 +857,23 @@ 
ASSERT(!shadow_mode_refcounts(d)); +#ifdef CONFIG_X86_PAE + if ( pfn >= 0x100000 ) + { + MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn); + return 0; + } +#endif + pl3e = map_domain_page(pfn); for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ ) { +#if CONFIG_PAGING_LEVELS >= 4 + if ( !l2_backptr(&vaddr, i, type) ) + goto fail; +#else vaddr = (unsigned long)i << L3_PAGETABLE_SHIFT; +#endif if ( is_guest_l3_slot(i) && unlikely(!get_page_from_l3e(pl3e[i], pfn, d, vaddr)) ) goto fail; @@ -842,15 +894,16 @@ return 0; } #else -#define alloc_l3_table(page) (0) +#define alloc_l3_table(page, type) (0) #endif #if CONFIG_PAGING_LEVELS >= 4 -static int alloc_l4_table(struct pfn_info *page) +static int alloc_l4_table(struct pfn_info *page, unsigned long type) { struct domain *d = page_get_owner(page); unsigned long pfn = page_to_pfn(page); l4_pgentry_t *pl4e = page_to_virt(page); + unsigned long vaddr; int i; /* See the code in shadow_promote() to understand why this is here. */ @@ -860,9 +913,14 @@ ASSERT(!shadow_mode_refcounts(d)); for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++ ) + { + if ( !l3_backptr(&vaddr, i, type) ) + goto fail; + if ( is_guest_l4_slot(i) && - unlikely(!get_page_from_l4e(pl4e[i], pfn, d)) ) + unlikely(!get_page_from_l4e(pl4e[i], pfn, d, vaddr)) ) goto fail; + } /* Xen private mappings. 
*/ memcpy(&pl4e[ROOT_PAGETABLE_FIRST_XEN_SLOT], @@ -885,7 +943,7 @@ return 0; } #else -#define alloc_l4_table(page) (0) +#define alloc_l4_table(page, type) (0) #endif @@ -967,7 +1025,7 @@ unlikely(o != l1e_get_intpte(ol1e)) ) { MEM_LOG("Failed to update %" PRIpte " -> %" PRIpte - ": saw %" PRIpte "\n", + ": saw %" PRIpte, l1e_get_intpte(ol1e), l1e_get_intpte(nl1e), o); @@ -993,7 +1051,7 @@ { if ( unlikely(l1e_get_flags(nl1e) & L1_DISALLOW_MASK) ) { - MEM_LOG("Bad L1 flags %x\n", + MEM_LOG("Bad L1 flags %x", l1e_get_flags(nl1e) & L1_DISALLOW_MASK); return 0; } @@ -1037,10 +1095,10 @@ static int mod_l2_entry(l2_pgentry_t *pl2e, l2_pgentry_t nl2e, unsigned long pfn, - unsigned int type) + unsigned long type) { l2_pgentry_t ol2e; - unsigned long vaddr; + unsigned long vaddr = 0; if ( unlikely(!is_guest_l2_slot(type,pgentry_ptr_to_slot(pl2e))) ) { @@ -1055,7 +1113,7 @@ { if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) ) { - MEM_LOG("Bad L2 flags %x\n", + MEM_LOG("Bad L2 flags %x", l2e_get_flags(nl2e) & L2_DISALLOW_MASK); return 0; } @@ -1074,10 +1132,9 @@ return 0; } } - else - { - if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) ) - return 0; + else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e)) ) + { + return 0; } put_page_from_l2e(ol2e, pfn); @@ -1090,7 +1147,8 @@ /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. 
*/ static int mod_l3_entry(l3_pgentry_t *pl3e, l3_pgentry_t nl3e, - unsigned long pfn) + unsigned long pfn, + unsigned long type) { l3_pgentry_t ol3e; unsigned long vaddr; @@ -1117,7 +1175,7 @@ { if ( unlikely(l3e_get_flags(nl3e) & L3_DISALLOW_MASK) ) { - MEM_LOG("Bad L3 flags %x\n", + MEM_LOG("Bad L3 flags %x", l3e_get_flags(nl3e) & L3_DISALLOW_MASK); return 0; } @@ -1126,28 +1184,29 @@ if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT)) return UPDATE_ENTRY(l3, pl3e, ol3e, nl3e); +#if CONFIG_PAGING_LEVELS >= 4 + if ( unlikely(!l2_backptr(&vaddr, pgentry_ptr_to_slot(pl3e), type)) || + unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) ) + return 0; +#else vaddr = (((unsigned long)pl3e & ~PAGE_MASK) / sizeof(l3_pgentry_t)) << L3_PAGETABLE_SHIFT; if ( unlikely(!get_page_from_l3e(nl3e, pfn, current->domain, vaddr)) ) return 0; +#endif if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) ) { - BUG_ON(!create_pae_xen_mappings(pl3e)); put_page_from_l3e(nl3e, pfn); return 0; } - - put_page_from_l3e(ol3e, pfn); - return 1; - } - - if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) ) - { - BUG_ON(!create_pae_xen_mappings(pl3e)); + } + else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e)) ) + { return 0; } + BUG_ON(!create_pae_xen_mappings(pl3e)); put_page_from_l3e(ol3e, pfn); return 1; } @@ -1159,9 +1218,11 @@ /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. 
*/ static int mod_l4_entry(l4_pgentry_t *pl4e, l4_pgentry_t nl4e, - unsigned long pfn) + unsigned long pfn, + unsigned long type) { l4_pgentry_t ol4e; + unsigned long vaddr; if ( unlikely(!is_guest_l4_slot(pgentry_ptr_to_slot(pl4e))) ) { @@ -1176,7 +1237,7 @@ { if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) ) { - MEM_LOG("Bad L4 flags %x\n", + MEM_LOG("Bad L4 flags %x", l4e_get_flags(nl4e) & L4_DISALLOW_MASK); return 0; } @@ -1185,7 +1246,8 @@ if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT)) return UPDATE_ENTRY(l4, pl4e, ol4e, nl4e); - if ( unlikely(!get_page_from_l4e(nl4e, pfn, current->domain)) ) + if ( unlikely(!l3_backptr(&vaddr, pgentry_ptr_to_slot(pl4e), type)) || + unlikely(!get_page_from_l4e(nl4e, pfn, current->domain, vaddr)) ) return 0; if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) ) @@ -1193,13 +1255,11 @@ put_page_from_l4e(nl4e, pfn); return 0; } - - put_page_from_l4e(ol4e, pfn); - return 1; - } - - if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) ) + } + else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e)) ) + { return 0; + } put_page_from_l4e(ol4e, pfn); return 1; @@ -1207,7 +1267,7 @@ #endif -int alloc_page_type(struct pfn_info *page, unsigned int type) +int alloc_page_type(struct pfn_info *page, unsigned long type) { switch ( type & PGT_type_mask ) { @@ -1216,14 +1276,14 @@ case PGT_l2_page_table: return alloc_l2_table(page, type); case PGT_l3_page_table: - return alloc_l3_table(page); + return alloc_l3_table(page, type); case PGT_l4_page_table: - return alloc_l4_table(page); + return alloc_l4_table(page, type); case PGT_gdt_page: case PGT_ldt_page: return alloc_segdesc_page(page); default: - printk("Bad type in alloc_page_type %x t=%x c=%x\n", + printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%x\n", type, page->u.inuse.type_info, page->count_info); BUG(); @@ -1233,7 +1293,7 @@ } -void free_page_type(struct pfn_info *page, unsigned int type) +void free_page_type(struct pfn_info *page, unsigned long type) { struct domain 
*owner = page_get_owner(page); unsigned long gpfn; @@ -1273,7 +1333,7 @@ #endif default: - printk("%s: type %x pfn %lx\n",__FUNCTION__, + printk("%s: type %lx pfn %lx\n",__FUNCTION__, type, page_to_pfn(page)); BUG(); } @@ -1282,7 +1342,7 @@ void put_page_type(struct pfn_info *page) { - u32 nx, x, y = page->u.inuse.type_info; + unsigned long nx, x, y = page->u.inuse.type_info; again: do { @@ -1335,9 +1395,9 @@ } -int get_page_type(struct pfn_info *page, u32 type) -{ - u32 nx, x, y = page->u.inuse.type_info; +int get_page_type(struct pfn_info *page, unsigned long type) +{ + unsigned long nx, x, y = page->u.inuse.type_info; again: do { @@ -1388,8 +1448,11 @@ { if ( ((x & PGT_type_mask) != PGT_l2_page_table) || ((type & PGT_type_mask) != PGT_l1_page_table) ) - MEM_LOG("Bad type (saw %08x != exp %08x) for pfn %lx", - x, type, page_to_pfn(page)); + MEM_LOG("Bad type (saw %" PRtype_info + "!= exp %" PRtype_info ") " + "for mfn %lx (pfn %x)", + x, type, page_to_pfn(page), + machine_to_phys_mapping[page_to_pfn(page)]); return 0; } else if ( (x & PGT_va_mask) == PGT_va_mutable ) @@ -1427,8 +1490,8 @@ /* Try to validate page type; drop the new reference on failure. */ if ( unlikely(!alloc_page_type(page, type)) ) { - MEM_LOG("Error while validating pfn %lx for type %08x." - " caf=%08x taf=%08x", + MEM_LOG("Error while validating pfn %lx for type %" PRtype_info "." 
+ " caf=%08x taf=%" PRtype_info, page_to_pfn(page), type, page->count_info, page->u.inuse.type_info); @@ -1537,7 +1600,7 @@ percpu_info[cpu].foreign = dom_io; break; default: - MEM_LOG("Dom %u cannot set foreign dom\n", d->domain_id); + MEM_LOG("Dom %u cannot set foreign dom", d->domain_id); okay = 0; break; } @@ -1596,7 +1659,7 @@ { struct mmuext_op op; int rc = 0, i = 0, okay, cpu = smp_processor_id(); - unsigned int type, done = 0; + unsigned long type, done = 0; struct pfn_info *page; struct vcpu *v = current; struct domain *d = v->domain, *e; @@ -1674,16 +1737,16 @@ #ifndef CONFIG_X86_PAE /* Unsafe on PAE because of Xen-private mappings. */ case MMUEXT_PIN_L2_TABLE: - type = PGT_l2_page_table; + type = PGT_l2_page_table | PGT_va_mutable; goto pin_page; #endif case MMUEXT_PIN_L3_TABLE: - type = PGT_l3_page_table; + type = PGT_l3_page_table | PGT_va_mutable; goto pin_page; case MMUEXT_PIN_L4_TABLE: - type = PGT_l4_page_table; + type = PGT_l4_page_table | PGT_va_mutable; goto pin_page; case MMUEXT_UNPIN_TABLE: @@ -1770,7 +1833,7 @@ case MMUEXT_FLUSH_CACHE: if ( unlikely(!IS_CAPABLE_PHYSDEV(d)) ) { - MEM_LOG("Non-physdev domain tried to FLUSH_CACHE.\n"); + MEM_LOG("Non-physdev domain tried to FLUSH_CACHE."); okay = 0; } else @@ -1784,7 +1847,7 @@ if ( shadow_mode_external(d) ) { MEM_LOG("ignoring SET_LDT hypercall from external " - "domain %u\n", d->domain_id); + "domain %u", d->domain_id); okay = 0; break; } @@ -1855,7 +1918,7 @@ unlikely(IS_XEN_HEAP_FRAME(page)) ) { MEM_LOG("Transferee has no reservation headroom (%d,%d), or " - "page is in Xen heap (%lx), or dom is dying (%ld).\n", + "page is in Xen heap (%lx), or dom is dying (%ld).", e->tot_pages, e->max_pages, op.mfn, e->domain_flags); okay = 0; goto reassign_fail; @@ -1876,9 +1939,9 @@ unlikely(_nd != _d) ) { MEM_LOG("Bad page values %lx: ed=%p(%u), sd=%p," - " caf=%08x, taf=%08x\n", page_to_pfn(page), - d, d->domain_id, unpickle_domptr(_nd), x, - page->u.inuse.type_info); + " caf=%08x, taf=%" PRtype_info, 
+ page_to_pfn(page), d, d->domain_id, + unpickle_domptr(_nd), x, page->u.inuse.type_info); okay = 0; goto reassign_fail; } @@ -1951,7 +2014,7 @@ unsigned int cmd, done = 0; struct vcpu *v = current; struct domain *d = v->domain; - u32 type_info; + unsigned long type_info; struct domain_mmap_cache mapcache, sh_mapcache; LOCK_BIGLOCK(d); @@ -2041,7 +2104,8 @@ l1e = l1e_from_intpte(req.val); okay = mod_l1_entry(va, l1e); if ( okay && unlikely(shadow_mode_enabled(d)) ) - shadow_l1_normal_pt_update(d, req.ptr, l1e, &sh_mapcache); + shadow_l1_normal_pt_update( + d, req.ptr, l1e, &sh_mapcache); put_page_type(page); } break; @@ -2054,24 +2118,28 @@ /* FIXME: doesn't work with PAE */ l2e = l2e_from_intpte(req.val); - okay = mod_l2_entry((l2_pgentry_t *)va, l2e, mfn, type_info); + okay = mod_l2_entry( + (l2_pgentry_t *)va, l2e, mfn, type_info); if ( okay && unlikely(shadow_mode_enabled(d)) ) - shadow_l2_normal_pt_update(d, req.ptr, l2e, &sh_mapcache); + shadow_l2_normal_pt_update( + d, req.ptr, l2e, &sh_mapcache); put_page_type(page); } break; #if CONFIG_PAGING_LEVELS >= 3 case PGT_l3_page_table: ASSERT( !shadow_mode_refcounts(d) ); - if ( likely(get_page_type(page, PGT_l3_page_table)) ) + if ( likely(get_page_type( + page, type_info & (PGT_type_mask|PGT_va_mask))) ) { l3_pgentry_t l3e; /* FIXME: doesn't work with PAE */ l3e = l3e_from_intpte(req.val); - okay = mod_l3_entry(va, l3e, mfn); + okay = mod_l3_entry(va, l3e, mfn, type_info); if ( okay && unlikely(shadow_mode_enabled(d)) ) - shadow_l3_normal_pt_update(d, req.ptr, l3e, &sh_mapcache); + shadow_l3_normal_pt_update( + d, req.ptr, l3e, &sh_mapcache); put_page_type(page); } break; @@ -2079,14 +2147,16 @@ #if CONFIG_PAGING_LEVELS >= 4 case PGT_l4_page_table: ASSERT( !shadow_mode_refcounts(d) ); - if ( likely(get_page_type(page, PGT_l4_page_table)) ) + if ( likely(get_page_type( + page, type_info & (PGT_type_mask|PGT_va_mask))) ) { l4_pgentry_t l4e; l4e = l4e_from_intpte(req.val); - okay = mod_l4_entry(va, l4e, mfn); + 
okay = mod_l4_entry(va, l4e, mfn, type_info); if ( okay && unlikely(shadow_mode_enabled(d)) ) - shadow_l4_normal_pt_update(d, req.ptr, l4e, &sh_mapcache); + shadow_l4_normal_pt_update( + d, req.ptr, l4e, &sh_mapcache); put_page_type(page); } break; @@ -2108,7 +2178,7 @@ } } - *(unsigned long *)va = req.val; + *(intpte_t *)va = req.val; okay = 1; if ( shadow_mode_enabled(d) ) @@ -2133,7 +2203,8 @@ if ( unlikely(shadow_mode_translate(FOREIGNDOM) && IS_PRIV(d)) ) { shadow_lock(FOREIGNDOM); - printk("privileged guest dom%d requests pfn=%lx to map mfn=%lx for dom%d\n", + printk("privileged guest dom%d requests pfn=%lx to " + "map mfn=%lx for dom%d\n", d->domain_id, gpfn, mfn, FOREIGNDOM->domain_id); set_machinetophys(mfn, gpfn); set_p2m_entry(FOREIGNDOM, gpfn, mfn, &sh_mapcache, &mapcache); @@ -2199,60 +2270,213 @@ return rc; } -/* This function assumes the caller is holding the domain's BIGLOCK - * and is running in a shadow mode - */ -int update_grant_va_mapping(unsigned long va, - l1_pgentry_t _nl1e, - struct domain *d, - struct vcpu *v) -{ - /* Caller must: - * . own d's BIGLOCK - * . already have 'get_page' correctly on the to-be-installed nl1e - * . be responsible for flushing the TLB - * . 
check PTE being installed isn't DISALLOWED + +int update_grant_pte_mapping( + unsigned long pte_addr, l1_pgentry_t _nl1e, + struct domain *d, struct vcpu *v) +{ + int rc = GNTST_okay; + void *va; + unsigned long gpfn, mfn; + struct pfn_info *page; + u32 type_info; + l1_pgentry_t ol1e; + + ASSERT(spin_is_locked(&d->big_lock)); + ASSERT(!shadow_mode_refcounts(d)); + ASSERT((l1e_get_flags(_nl1e) & L1_DISALLOW_MASK) == 0); + + gpfn = pte_addr >> PAGE_SHIFT; + mfn = __gpfn_to_mfn(d, gpfn); + + if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) ) + { + MEM_LOG("Could not get page for normal update"); + return GNTST_general_error; + } + + va = map_domain_page(mfn); + va = (void *)((unsigned long)va + (pte_addr & ~PAGE_MASK)); + page = pfn_to_page(mfn); + + type_info = page->u.inuse.type_info; + if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) || + !get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) ) + { + MEM_LOG("Grant map attempted to update a non-L1 page"); + rc = GNTST_general_error; + goto failed; + } + + if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) || + !update_l1e(va, ol1e, _nl1e) ) + { + put_page_type(page); + rc = GNTST_general_error; + goto failed; + } + + put_page_from_l1e(ol1e, d); + + rc = (l1e_get_flags(ol1e) & _PAGE_PRESENT) ? 
GNTST_flush_all : GNTST_okay; + + if ( unlikely(shadow_mode_enabled(d)) ) + { + struct domain_mmap_cache sh_mapcache; + domain_mmap_cache_init(&sh_mapcache); + shadow_l1_normal_pt_update(d, pte_addr, _nl1e, &sh_mapcache); + domain_mmap_cache_destroy(&sh_mapcache); + } + + put_page_type(page); + + failed: + unmap_domain_page(va); + put_page(page); + return rc; +} + +int clear_grant_pte_mapping( + unsigned long addr, unsigned long frame, struct domain *d) +{ + int rc = GNTST_okay; + void *va; + unsigned long gpfn, mfn; + struct pfn_info *page; + u32 type_info; + l1_pgentry_t ol1e; + + ASSERT(!shadow_mode_refcounts(d)); + + gpfn = addr >> PAGE_SHIFT; + mfn = __gpfn_to_mfn(d, gpfn); + + if ( unlikely(!get_page_from_pagenr(mfn, current->domain)) ) + { + MEM_LOG("Could not get page for normal update"); + return GNTST_general_error; + } + + va = map_domain_page(mfn); + va = (void *)((unsigned long)va + (addr & ~PAGE_MASK)); + page = pfn_to_page(mfn); + + type_info = page->u.inuse.type_info; + if ( ((type_info & PGT_type_mask) != PGT_l1_page_table) || + !get_page_type(page, type_info & (PGT_type_mask|PGT_va_mask)) ) + { + MEM_LOG("Grant map attempted to update a non-L1 page"); + rc = GNTST_general_error; + goto failed; + } + + if ( __copy_from_user(&ol1e, (l1_pgentry_t *)va, sizeof(ol1e)) ) + { + put_page_type(page); + rc = GNTST_general_error; + goto failed; + } + + /* Check that the virtual address supplied is actually mapped to frame. */ + if ( unlikely((l1e_get_intpte(ol1e) >> PAGE_SHIFT) != frame) ) + { + MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx", + (unsigned long)l1e_get_intpte(ol1e), addr, frame); + put_page_type(page); + rc = GNTST_general_error; + goto failed; + } + + /* Delete pagetable entry. 
*/ + if ( unlikely(__put_user(0, (intpte_t *)va))) + { + MEM_LOG("Cannot delete PTE entry at %p", va); + put_page_type(page); + rc = GNTST_general_error; + goto failed; + } + + if ( unlikely(shadow_mode_enabled(d)) ) + { + struct domain_mmap_cache sh_mapcache; + domain_mmap_cache_init(&sh_mapcache); + shadow_l1_normal_pt_update(d, addr, l1e_empty(), &sh_mapcache); + domain_mmap_cache_destroy(&sh_mapcache); + } + + put_page_type(page); + + failed: + unmap_domain_page(va); + put_page(page); + return rc; +} + + +int update_grant_va_mapping( + unsigned long va, l1_pgentry_t _nl1e, struct domain *d, struct vcpu *v) +{ + int rc = GNTST_okay; + l1_pgentry_t *pl1e, ol1e; + + ASSERT(spin_is_locked(&d->big_lock)); + ASSERT(!shadow_mode_refcounts(d)); + ASSERT((l1e_get_flags(_nl1e) & L1_DISALLOW_MASK) == 0); + + /* + * This is actually overkill - we don't need to sync the L1 itself, + * just everything involved in getting to this L1 (i.e. we need + * linear_pg_table[l1_linear_offset(va)] to be in sync)... */ - - int rc = 0; - l1_pgentry_t *pl1e; - l1_pgentry_t ol1e; - - cleanup_writable_pagetable(d); - - // This is actually overkill - we don't need to sync the L1 itself, - // just everything involved in getting to this L1 (i.e. we need - // linear_pg_table[l1_linear_offset(va)] to be in sync)... 
- // __shadow_sync_va(v, va); pl1e = &linear_pg_table[l1_linear_offset(va)]; - if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) ) - rc = -EINVAL; - else if ( !shadow_mode_refcounts(d) ) - { - if ( update_l1e(pl1e, ol1e, _nl1e) ) - { - put_page_from_l1e(ol1e, d); - if ( l1e_get_flags(ol1e) & _PAGE_PRESENT ) - rc = 0; /* Caller needs to invalidate TLB entry */ - else - rc = 1; /* Caller need not invalidate TLB entry */ - } - else - rc = -EINVAL; - } - else - { - printk("grant tables and shadow mode currently don't work together\n"); - BUG(); - } + if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) || + !update_l1e(pl1e, ol1e, _nl1e) ) + return GNTST_general_error; + + put_page_from_l1e(ol1e, d); + + rc = (l1e_get_flags(ol1e) & _PAGE_PRESENT) ? GNTST_flush_one : GNTST_okay; if ( unlikely(shadow_mode_enabled(d)) ) shadow_do_update_va_mapping(va, _nl1e, v); return rc; +} + +int clear_grant_va_mapping(unsigned long addr, unsigned long frame) +{ + l1_pgentry_t *pl1e, ol1e; + + pl1e = &linear_pg_table[l1_linear_offset(addr)]; + + if ( unlikely(__get_user(ol1e.l1, &pl1e->l1) != 0) ) + { + MEM_LOG("Could not find PTE entry for address %lx", addr); + return GNTST_general_error; + } + + /* + * Check that the virtual address supplied is actually mapped to + * frame. + */ + if ( unlikely(l1e_get_pfn(ol1e) != frame) ) + { + MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx", + l1e_get_pfn(ol1e), addr, frame); + return GNTST_general_error; + } + + /* Delete pagetable entry. */ + if ( unlikely(__put_user(0, &pl1e->l1)) ) + { + MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e); + return GNTST_general_error; + } + + return 0; } @@ -2289,10 +2513,11 @@ (shadow_mode_translate(d) || shadow_mode_translate(percpu_info[cpu].foreign))) ) { - // The foreign domain's pfn's are in a different namespace. - // There's not enough information in just a gpte to figure out - // how to (re-)shadow this entry. 
- // + /* + * The foreign domain's pfn's are in a different namespace. There's + * not enough information in just a gpte to figure out how to + * (re-)shadow this entry. + */ domain_crash(); } @@ -2409,14 +2634,16 @@ if ( entries > FIRST_RESERVED_GDT_ENTRY ) return -EINVAL; - + shadow_sync_all(d); /* Check the pages in the new GDT. */ - for ( i = 0; i < nr_pages; i++ ) - if ( ((pfn = frames[i]) >= max_page) || - !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) ) + for ( i = 0; i < nr_pages; i++ ) { + pfn = frames[i]; + if ((pfn >= max_page) || + !get_page_and_type(&frame_table[pfn], d, PGT_gdt_page) ) goto fail; + } /* Tear down the old GDT. */ destroy_gdt(v); @@ -2463,22 +2690,24 @@ } -long do_update_descriptor(unsigned long pa, u64 desc) +long do_update_descriptor(u64 pa, u64 desc) { struct domain *dom = current->domain; unsigned long gpfn = pa >> PAGE_SHIFT; unsigned long mfn; - unsigned int offset = (pa & ~PAGE_MASK) / sizeof(struct desc_struct); + unsigned int offset; struct desc_struct *gdt_pent, d; struct pfn_info *page; long ret = -EINVAL; + offset = ((unsigned int)pa & ~PAGE_MASK) / sizeof(struct desc_struct); + *(u64 *)&d = desc; LOCK_BIGLOCK(dom); if ( !VALID_MFN(mfn = __gpfn_to_mfn(dom, gpfn)) || - ((pa % sizeof(struct desc_struct)) != 0) || + (((unsigned int)pa % sizeof(struct desc_struct)) != 0) || (mfn >= max_page) || !check_descriptor(&d) ) { @@ -2547,7 +2776,7 @@ * Writable Pagetables */ -#ifdef VERBOSE +#ifdef VVERBOSE int ptwr_debug = 0x0; #define PTWR_PRINTK(_f, _a...) \ do { if ( unlikely(ptwr_debug) ) printk( _f , ## _a ); } while ( 0 ) @@ -2556,18 +2785,128 @@ #define PTWR_PRINTK(_f, _a...) 
((void)0) #endif + +#ifdef PERF_ARRAYS + +/**************** writeable pagetables profiling functions *****************/ + +#define ptwr_eip_buckets 256 + +int ptwr_eip_stat_threshold[] = {1, 10, 50, 100, L1_PAGETABLE_ENTRIES}; + +#define ptwr_eip_stat_thresholdN (sizeof(ptwr_eip_stat_threshold)/sizeof(int)) + +struct { + unsigned long eip; + domid_t id; + u32 val[ptwr_eip_stat_thresholdN]; +} typedef ptwr_eip_stat_t; + +ptwr_eip_stat_t ptwr_eip_stats[ptwr_eip_buckets]; + +static inline unsigned int ptwr_eip_stat_hash( unsigned long eip, domid_t id ) +{ + return (((unsigned long) id) ^ eip ^ (eip>>8) ^ (eip>>16) ^ (eip>24)) % + ptwr_eip_buckets; +} + +static void ptwr_eip_stat_inc(u32 *n) +{ + int i, j; + + if ( ++(*n) != 0 ) + return; + + *n = ~0; + + /* Re-scale all buckets. */ + for ( i = 0; i <ptwr_eip_buckets; i++ ) + for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ ) + ptwr_eip_stats[i].val[j] >>= 1; +} + +static void ptwr_eip_stat_update(unsigned long eip, domid_t id, int modified) +{ + int i, j, b; + + i = b = ptwr_eip_stat_hash(eip, id); + + do + { + if ( !ptwr_eip_stats[i].eip ) + { + /* doesn't exist */ + ptwr_eip_stats[i].eip = eip; + ptwr_eip_stats[i].id = id; + memset(ptwr_eip_stats[i].val,0, sizeof(ptwr_eip_stats[i].val)); + } + + if ( ptwr_eip_stats[i].eip == eip ) + { + for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ ) + if ( modified <= ptwr_eip_stat_threshold[j] ) + break; + BUG_ON(j >= ptwr_eip_stat_thresholdN); + ptwr_eip_stat_inc(&ptwr_eip_stats[i].val[j]); + return; + } + + i = (i+1) % ptwr_eip_buckets; + } + while ( i != b ); + + printk("ptwr_eip_stat: too many EIPs in use!\n"); + + ptwr_eip_stat_print(); + ptwr_eip_stat_reset(); +} + +void ptwr_eip_stat_reset(void) +{ + memset(ptwr_eip_stats, 0, sizeof(ptwr_eip_stats)); +} + +void ptwr_eip_stat_print(void) +{ + struct domain *e; + domid_t d; + int i, j; + + for_each_domain( e ) + { + d = e->domain_id; + + for ( i = 0; i < ptwr_eip_buckets; i++ ) + { + if ( ptwr_eip_stats[i].eip && 
ptwr_eip_stats[i].id != d ) + continue; + + printk("D %d eip %08lx ", + ptwr_eip_stats[i].id, ptwr_eip_stats[i].eip); + + for ( j = 0; j < ptwr_eip_stat_thresholdN; j++ ) + printk("<=%u %4u \t", + ptwr_eip_stat_threshold[j], + ptwr_eip_stats[i].val[j]); + printk("\n"); + } + } +} + +#else /* PERF_ARRAYS */ + +#define ptwr_eip_stat_update(eip, id, modified) ((void)0) + +#endif + +/*******************************************************************/ + /* Re-validate a given p.t. page, given its prior snapshot */ -int revalidate_l1(struct domain *d, l1_pgentry_t *l1page, l1_pgentry_t *snapshot) +int revalidate_l1( + struct domain *d, l1_pgentry_t *l1page, l1_pgentry_t *snapshot) { l1_pgentry_t ol1e, nl1e; int modified = 0, i; - -#if 0 - if ( d->domain_id ) - printk("%s: l1page mfn=%lx snapshot mfn=%lx\n", __func__, - l1e_get_pfn(linear_pg_table[l1_linear_offset((unsigned long)l1page)]), - l1e_get_pfn(linear_pg_table[l1_linear_offset((unsigned long)snapshot)])); -#endif for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) { @@ -2593,7 +2932,7 @@ if ( unlikely(!get_page_from_l1e(nl1e, d)) ) { - MEM_LOG("ptwr: Could not re-validate l1 page\n"); + MEM_LOG("ptwr: Could not re-validate l1 page"); /* * Make the remaining p.t's consistent before crashing, so the * reference counts are correct. @@ -2614,24 +2953,34 @@ /* Flush the given writable p.t. page and write-protect it again. */ void ptwr_flush(struct domain *d, const int which) { - unsigned long pte, *ptep, l1va; - l1_pgentry_t *pl1e; + unsigned long l1va; + l1_pgentry_t *pl1e, pte, *ptep; l2_pgentry_t *pl2e; unsigned int modified; +#ifdef CONFIG_X86_64 + struct vcpu *v = current; + extern void toggle_guest_mode(struct vcpu *); + int user_mode = !(v->arch.flags & TF_kernel_mode); +#endif + ASSERT(!shadow_mode_enabled(d)); if ( unlikely(d->arch.ptwr[which].vcpu != current) ) - write_ptbase(d->arch.ptwr[which].vcpu); + /* Don't use write_ptbase: it may switch to guest_user on x86/64! 
*/ + write_cr3(pagetable_get_paddr( + d->arch.ptwr[which].vcpu->arch.guest_table)); + else + TOGGLE_MODE(); l1va = d->arch.ptwr[which].l1va; - ptep = (unsigned long *)&linear_pg_table[l1_linear_offset(l1va)]; + ptep = (l1_pgentry_t *)&linear_pg_table[l1_linear_offset(l1va)]; /* * STEP 1. Write-protect the p.t. page so no more updates can occur. */ - if ( unlikely(__get_user(pte, ptep)) ) + if ( unlikely(__get_user(pte.l1, &ptep->l1)) ) { MEM_LOG("ptwr: Could not read pte at %p", ptep); /* @@ -2640,9 +2989,9 @@ */ BUG(); } - PTWR_PRINTK("[%c] disconnected_l1va at %p is %lx\n", - PTWR_PRINT_WHICH, ptep, pte); - pte &= ~_PAGE_RW; + PTWR_PRINTK("[%c] disconnected_l1va at %p is %"PRIpte"\n", + PTWR_PRINT_WHICH, ptep, pte.l1); + l1e_remove_flags(pte, _PAGE_RW); /* Write-protect the p.t. page in the guest page table. */ if ( unlikely(__put_user(pte, ptep)) ) @@ -2658,8 +3007,8 @@ /* Ensure that there are no stale writable mappings in any TLB. */ /* NB. INVLPG is a serialising instruction: flushes pending updates. */ flush_tlb_one_mask(d->cpumask, l1va); - PTWR_PRINTK("[%c] disconnected_l1va at %p now %lx\n", - PTWR_PRINT_WHICH, ptep, pte); + PTWR_PRINTK("[%c] disconnected_l1va at %p now %"PRIpte"\n", + PTWR_PRINT_WHICH, ptep, pte.l1); /* * STEP 2. Validate any modified PTEs. @@ -2669,6 +3018,7 @@ modified = revalidate_l1(d, pl1e, d->arch.ptwr[which].page); unmap_domain_page(pl1e); perfc_incr_histo(wpt_updates, modified, PT_UPDATES); + ptwr_eip_stat_update( d->arch.ptwr[which].eip, d->domain_id, modified); d->arch.ptwr[which].prev_nr_updates = modified; /* @@ -2689,6 +3039,8 @@ if ( unlikely(d->arch.ptwr[which].vcpu != current) ) write_ptbase(current); + else + TOGGLE_MODE(); } static int ptwr_emulated_update( @@ -2706,13 +3058,13 @@ /* Aligned access only, thank you. 
*/ if ( !access_ok(addr, bytes) || ((addr & (bytes-1)) != 0) ) { - MEM_LOG("ptwr_emulate: Unaligned or bad size ptwr access (%d, %lx)\n", + MEM_LOG("ptwr_emulate: Unaligned or bad size ptwr access (%d, %lx)", bytes, addr); return X86EMUL_UNHANDLEABLE; } /* Turn a sub-word access into a full-word access. */ - if (bytes != sizeof(physaddr_t)) + if ( bytes != sizeof(physaddr_t) ) { int rc; physaddr_t full; @@ -2721,7 +3073,7 @@ /* Align address; read full word. */ addr &= ~(sizeof(physaddr_t)-1); if ( (rc = x86_emulate_read_std(addr, (unsigned long *)&full, - sizeof(physaddr_t))) ) + sizeof(physaddr_t))) ) return rc; /* Mask out bits provided by caller. */ full &= ~((((physaddr_t)1 << (bytes*8)) - 1) << (offset*8)); @@ -2729,13 +3081,17 @@ val &= (((physaddr_t)1 << (bytes*8)) - 1); val <<= (offset)*8; val |= full; + /* Also fill in missing parts of the cmpxchg old value. */ + old &= (((physaddr_t)1 << (bytes*8)) - 1); + old <<= (offset)*8; + old |= full; } /* Read the PTE that maps the page being updated. */ if (__copy_from_user(&pte, &linear_pg_table[l1_linear_offset(addr)], sizeof(pte))) { - MEM_LOG("ptwr_emulate: Cannot read thru linear_pg_table\n"); + MEM_LOG("ptwr_emulate: Cannot read thru linear_pg_table"); return X86EMUL_UNHANDLEABLE; } @@ -2747,7 +3103,8 @@ ((page->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table) || (page_get_owner(page) != d) ) { - MEM_LOG("ptwr_emulate: Page is mistyped or bad pte (%lx, %08x)\n", + MEM_LOG("ptwr_emulate: Page is mistyped or bad pte " + "(%lx, %" PRtype_info ")", l1e_get_pfn(pte), page->u.inuse.type_info); return X86EMUL_UNHANDLEABLE; } @@ -2763,7 +3120,7 @@ if ( do_cmpxchg ) { ol1e = l1e_from_intpte(old); - if ( cmpxchg((unsigned long *)pl1e, old, val) != old ) + if ( cmpxchg((intpte_t *)pl1e, old, val) != old ) { unmap_domain_page(pl1e); put_page_from_l1e(nl1e, d); @@ -2821,14 +3178,15 @@ }; /* Write page fault handler: check if guest is trying to modify a PTE. 
*/ -int ptwr_do_page_fault(struct domain *d, unsigned long addr) +int ptwr_do_page_fault(struct domain *d, unsigned long addr, + struct cpu_user_regs *regs) { unsigned long pfn; struct pfn_info *page; l1_pgentry_t pte; - l2_pgentry_t *pl2e; + l2_pgentry_t *pl2e, l2e; int which; - u32 l2_idx; + unsigned long l2_idx; if ( unlikely(shadow_mode_enabled(d)) ) return 0; @@ -2837,7 +3195,7 @@ * Attempt to read the PTE that maps the VA being accessed. By checking for * PDE validity in the L2 we avoid many expensive fixups in __get_user(). */ - if ( !(l2e_get_flags(__linear_l2_table[addr>>L2_PAGETABLE_SHIFT]) & + if ( !(l2e_get_flags(__linear_l2_table[l2_linear_offset(addr)]) & _PAGE_PRESENT) || __copy_from_user(&pte,&linear_pg_table[l1_linear_offset(addr)], sizeof(pte)) ) @@ -2857,9 +3215,8 @@ return 0; } - /* x86/64: Writable pagetable code needs auditing. Use emulator for now. */ -#if defined(__x86_64__) - goto emulate; +#if 0 /* Leave this in as useful for debugging */ + goto emulate; #endif /* Get the L2 index at which this L1 p.t. is always mapped. */ @@ -2868,7 +3225,7 @@ goto emulate; /* Urk! This L1 is mapped in multiple L2 slots! */ l2_idx >>= PGT_va_shift; - if ( unlikely(l2_idx == (addr >> L2_PAGETABLE_SHIFT)) ) + if ( unlikely(l2_idx == l2_linear_offset(addr)) ) goto emulate; /* Urk! Pagetable maps itself! */ /* @@ -2877,7 +3234,8 @@ */ pl2e = &__linear_l2_table[l2_idx]; which = PTWR_PT_INACTIVE; - if ( (l2e_get_pfn(*pl2e)) == pfn ) + + if ( (__get_user(l2e.l2, &pl2e->l2) == 0) && (l2e_get_pfn(l2e) == pfn) ) { /* * Check the PRESENT bit to set ACTIVE mode. @@ -2885,7 +3243,7 @@ * ACTIVE p.t. (it may be the same p.t. mapped at another virt addr). * The ptwr_flush call below will restore the PRESENT bit. 
*/ - if ( likely(l2e_get_flags(*pl2e) & _PAGE_PRESENT) || + if ( likely(l2e_get_flags(l2e) & _PAGE_PRESENT) || (d->arch.ptwr[PTWR_PT_ACTIVE].l1va && (l2_idx == d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx)) ) which = PTWR_PT_ACTIVE; @@ -2905,7 +3263,7 @@ goto emulate; } - PTWR_PRINTK("[%c] page_fault on l1 pt at va %lx, pt for %08x, " + PTWR_PRINTK("[%c] page_fault on l1 pt at va %lx, pt for %08lx, " "pfn %lx\n", PTWR_PRINT_WHICH, addr, l2_idx << L2_PAGETABLE_SHIFT, pfn); @@ -2930,7 +3288,11 @@ d->arch.ptwr[which].l1va = addr | 1; d->arch.ptwr[which].l2_idx = l2_idx; d->arch.ptwr[which].vcpu = current; - + +#ifdef PERF_ARRAYS + d->arch.ptwr[which].eip = regs->eip; +#endif + /* For safety, disconnect the L1 p.t. page from current space. */ if ( which == PTWR_PT_ACTIVE ) { @@ -2946,11 +3308,11 @@ /* Finally, make the p.t. page writable by the guest OS. */ l1e_add_flags(pte, _PAGE_RW); - if ( unlikely(__copy_to_user(&linear_pg_table[addr>>PAGE_SHIFT], - &pte, sizeof(pte))) ) + if ( unlikely(__put_user(pte.l1, + &linear_pg_table[l1_linear_offset(addr)].l1)) ) { MEM_LOG("ptwr: Could not update pte at %p", (unsigned long *) - &linear_pg_table[addr>>PAGE_SHIFT]); + &linear_pg_table[l1_linear_offset(addr)]); /* Toss the writable pagetable state and crash. 
*/ unmap_domain_page(d->arch.ptwr[which].pl1e); d->arch.ptwr[which].l1va = 0; diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/x86/physdev.c --- a/xen/arch/x86/physdev.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/x86/physdev.c Thu Aug 25 22:53:20 2005 @@ -106,7 +106,7 @@ (op.u.set_iobitmap.nr_ports > 65536) ) break; ret = 0; - current->arch.iobmp = (u8 *)op.u.set_iobitmap.bitmap; + current->arch.iobmp = op.u.set_iobitmap.bitmap; current->arch.iobmp_limit = op.u.set_iobitmap.nr_ports; break; default: diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/x86/setup.c --- a/xen/arch/x86/setup.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/x86/setup.c Thu Aug 25 22:53:20 2005 @@ -244,15 +244,17 @@ #define EARLY_FAIL() for ( ; ; ) __asm__ __volatile__ ( "hlt" ) +static struct e820entry e820_raw[E820MAX]; + void __init __start_xen(multiboot_info_t *mbi) { char *cmdline; module_t *mod = (module_t *)__va(mbi->mods_addr); - unsigned long firsthole_start, nr_pages; + unsigned long nr_pages, modules_length; unsigned long initial_images_start, initial_images_end; unsigned long _initrd_start = 0, _initrd_len = 0; unsigned int initrdidx = 1; - struct e820entry e820_raw[E820MAX]; + physaddr_t s, e; int i, e820_raw_nr = 0, bytes = 0; struct ns16550_defaults ns16550 = { .data_bits = 8, @@ -330,22 +332,30 @@ max_page = init_e820(e820_raw, &e820_raw_nr); - /* Find the first high-memory RAM hole. */ - for ( i = 0; i < e820.nr_map; i++ ) + modules_length = mod[mbi->mods_count-1].mod_end - mod[0].mod_start; + + /* Find a large enough RAM extent to stash the DOM0 modules. 
*/ + for ( i = 0; ; i++ ) + { + if ( i == e820.nr_map ) + { + printk("Not enough memory to stash the DOM0 kernel image.\n"); + for ( ; ; ) ; + } + if ( (e820.map[i].type == E820_RAM) && - (e820.map[i].addr >= 0x100000) ) + (e820.map[i].size >= modules_length) && + ((e820.map[i].addr + e820.map[i].size) >= + (xenheap_phys_end + modules_length)) ) break; - firsthole_start = e820.map[i].addr + e820.map[i].size; - - /* Relocate the Multiboot modules. */ - initial_images_start = xenheap_phys_end; - initial_images_end = initial_images_start + - (mod[mbi->mods_count-1].mod_end - mod[0].mod_start); - if ( initial_images_end > firsthole_start ) - { - printk("Not enough memory to stash the DOM0 kernel image.\n"); - for ( ; ; ) ; - } + } + + /* Stash as near as possible to the beginning of the RAM extent. */ + initial_images_start = e820.map[i].addr; + if ( initial_images_start < xenheap_phys_end ) + initial_images_start = xenheap_phys_end; + initial_images_end = initial_images_start + modules_length; + #if defined(CONFIG_X86_32) memmove((void *)initial_images_start, /* use low mapping */ (void *)mod[0].mod_start, /* use low mapping */ @@ -358,16 +368,23 @@ /* Initialise boot-time allocator with all RAM situated after modules. */ xenheap_phys_start = init_boot_allocator(__pa(&_end)); - nr_pages = 0; + nr_pages = 0; for ( i = 0; i < e820.nr_map; i++ ) { if ( e820.map[i].type != E820_RAM ) continue; + nr_pages += e820.map[i].size >> PAGE_SHIFT; - if ( (e820.map[i].addr + e820.map[i].size) >= initial_images_end ) - init_boot_pages((e820.map[i].addr < initial_images_end) ? - initial_images_end : e820.map[i].addr, - e820.map[i].addr + e820.map[i].size); + + /* Initialise boot heap, skipping Xen heap and dom0 modules. 
*/ + s = e820.map[i].addr; + e = s + e820.map[i].size; + if ( s < xenheap_phys_end ) + s = xenheap_phys_end; + if ( (s < initial_images_end) && (e > initial_images_start) ) + s = initial_images_end; + init_boot_pages(s, e); + #if defined (CONFIG_X86_64) /* * x86/64 maps all registered RAM. Points to note: @@ -404,10 +421,30 @@ end_boot_allocator(); - init_xenheap_pages(xenheap_phys_start, xenheap_phys_end); - printk("Xen heap: %luMB (%lukB)\n", - (xenheap_phys_end-xenheap_phys_start) >> 20, - (xenheap_phys_end-xenheap_phys_start) >> 10); + /* Initialise the Xen heap, skipping RAM holes. */ + nr_pages = 0; + for ( i = 0; i < e820.nr_map; i++ ) + { + if ( e820.map[i].type != E820_RAM ) + continue; + + s = e820.map[i].addr; + e = s + e820.map[i].size; + if ( s < xenheap_phys_start ) + s = xenheap_phys_start; + if ( e > xenheap_phys_end ) + e = xenheap_phys_end; + + if ( s < e ) + { + nr_pages += (e - s) >> PAGE_SHIFT; + init_xenheap_pages(s, e); + } + } + + printk("Xen heap: %luMB (%lukB)\n", + nr_pages >> (20 - PAGE_SHIFT), + nr_pages << (PAGE_SHIFT - 10)); early_boot = 0; diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/x86/shadow.c --- a/xen/arch/x86/shadow.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/x86/shadow.c Thu Aug 25 22:53:20 2005 @@ -1578,7 +1578,7 @@ if ( unlikely(!VALID_MFN(gmfn)) ) { - SH_LOG("l1pte_write_fault: invalid gpfn=%lx", gpfn); + SH_VLOG("l1pte_write_fault: invalid gpfn=%lx", gpfn); *spte_p = l1e_empty(); return 0; } @@ -1612,7 +1612,7 @@ if ( unlikely(!VALID_MFN(mfn)) ) { - SH_LOG("l1pte_read_fault: invalid gpfn=%lx", pfn); + SH_VLOG("l1pte_read_fault: invalid gpfn=%lx", pfn); *spte_p = l1e_empty(); return 0; } diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/x86/shadow32.c --- a/xen/arch/x86/shadow32.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/x86/shadow32.c Thu Aug 25 22:53:20 2005 @@ -418,7 +418,7 @@ break; default: - printk("Free shadow weird page type mfn=%lx type=%08x\n", + printk("Free shadow weird page type mfn=%lx type=%" PRtype_info "\n", 
page_to_pfn(page), page->u.inuse.type_info); break; } @@ -665,7 +665,7 @@ shadow_audit(d, 0); - SH_LOG("Free shadow table."); + SH_VLOG("Free shadow table."); } void shadow_mode_init(void) @@ -1137,7 +1137,7 @@ d->arch.shadow_ht_free = NULL; ASSERT(d->arch.shadow_extras_count == 0); - SH_LOG("freed extras, now %d", d->arch.shadow_extras_count); + SH_VLOG("freed extras, now %d", d->arch.shadow_extras_count); if ( d->arch.shadow_dirty_bitmap != NULL ) { diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/x86/shadow_public.c --- a/xen/arch/x86/shadow_public.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/x86/shadow_public.c Thu Aug 25 22:53:20 2005 @@ -571,7 +571,7 @@ break; default: - printk("Free shadow weird page type mfn=%lx type=%08x\n", + printk("Free shadow weird page type mfn=%lx type=%" PRtype_info "\n", page_to_pfn(page), page->u.inuse.type_info); break; } @@ -1638,14 +1638,14 @@ /* XXX This needs more thought... */ printk("%s: needing to call __shadow_remove_all_access for mfn=%lx\n", __func__, page_to_pfn(page)); - printk("Before: mfn=%lx c=%08x t=%08x\n", page_to_pfn(page), + printk("Before: mfn=%lx c=%08x t=%" PRtype_info "\n", page_to_pfn(page), page->count_info, page->u.inuse.type_info); shadow_lock(d); __shadow_remove_all_access(d, page_to_pfn(page)); shadow_unlock(d); - printk("After: mfn=%lx c=%08x t=%08x\n", page_to_pfn(page), + printk("After: mfn=%lx c=%08x t=%" PRtype_info "\n", page_to_pfn(page), page->count_info, page->u.inuse.type_info); } diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/x86/smpboot.c --- a/xen/arch/x86/smpboot.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/x86/smpboot.c Thu Aug 25 22:53:20 2005 @@ -434,7 +434,6 @@ unsigned int cpu = cpucount; extern void percpu_traps_init(void); - extern void cpu_init(void); set_current(idle_task[cpu]); set_processor_id(cpu); diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/x86/time.c --- a/xen/arch/x86/time.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/x86/time.c Thu Aug 25 22:53:20 2005 @@ -43,7 +43,8 @@ spinlock_t 
rtc_lock = SPIN_LOCK_UNLOCKED; int timer_ack = 0; unsigned long volatile jiffies; -static unsigned long wc_sec, wc_usec; /* UTC time at last 'time update'. */ +static u32 wc_sec, wc_nsec; /* UTC time at last 'time update'. */ +static spinlock_t wc_lock = SPIN_LOCK_UNLOCKED; struct time_scale { int shift; @@ -67,13 +68,6 @@ static spinlock_t platform_timer_lock = SPIN_LOCK_UNLOCKED; static u64 (*read_platform_count)(void); -static inline u32 down_shift(u64 time, int shift) -{ - if ( shift < 0 ) - return (u32)(time >> -shift); - return (u32)((u32)time << shift); -} - /* * 32-bit division of integer dividend and integer divisor yielding * 32-bit fractional quotient. @@ -83,7 +77,7 @@ u32 quotient, remainder; ASSERT(dividend < divisor); __asm__ ( - "div %4" + "divl %4" : "=a" (quotient), "=d" (remainder) : "0" (0), "1" (dividend), "r" (divisor) ); return quotient; @@ -101,6 +95,42 @@ : "=a" (product_frac), "=d" (product_int) : "0" (multiplicand), "r" (multiplier) ); return product_int; +} + +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. 
+ */ +static inline u64 scale_delta(u64 delta, struct time_scale *scale) +{ + u64 product; +#ifdef CONFIG_X86_32 + u32 tmp1, tmp2; +#endif + + if ( scale->shift < 0 ) + delta >>= -scale->shift; + else + delta <<= scale->shift; + +#ifdef CONFIG_X86_32 + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "add %4,%%eax ; " + "xor %5,%5 ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (scale->mul_frac) ); +#else + __asm__ ( + "mul %%rdx ; shrd $32,%%rdx,%%rax" + : "=a" (product) : "0" (delta), "d" ((u64)scale->mul_frac) ); +#endif + + return product; } void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs) @@ -486,11 +516,9 @@ static s_time_t __read_platform_stime(u64 platform_time) { - u64 diff64 = platform_time - platform_timer_stamp; - u32 diff = down_shift(diff64, platform_timer_scale.shift); + u64 diff = platform_time - platform_timer_stamp; ASSERT(spin_is_locked(&platform_timer_lock)); - return (stime_platform_stamp + - (u64)mul_frac(diff, platform_timer_scale.mul_frac)); + return (stime_platform_stamp + scale_delta(diff, &platform_timer_scale)); } static s_time_t read_platform_stime(void) @@ -619,15 +647,27 @@ s_time_t get_s_time(void) { struct cpu_time *t = &cpu_time[smp_processor_id()]; - u64 tsc; - u32 delta; + u64 tsc, delta; s_time_t now; rdtscll(tsc); - delta = down_shift(tsc - t->local_tsc_stamp, t->tsc_scale.shift); - now = t->stime_local_stamp + (u64)mul_frac(delta, t->tsc_scale.mul_frac); + delta = tsc - t->local_tsc_stamp; + now = t->stime_local_stamp + scale_delta(delta, &t->tsc_scale); return now; +} + +static inline void version_update_begin(u32 *version) +{ + /* Explicitly OR with 1 just in case version number gets out of sync. 
*/ + *version = (*version + 1) | 1; + wmb(); +} + +static inline void version_update_end(u32 *version) +{ + wmb(); + (*version)++; } static inline void __update_dom_time(struct vcpu *v) @@ -635,20 +675,14 @@ struct cpu_time *t = &cpu_time[smp_processor_id()]; struct vcpu_time_info *u = &v->domain->shared_info->vcpu_time[v->vcpu_id]; - u->time_version1++; - wmb(); + version_update_begin(&u->version); u->tsc_timestamp = t->local_tsc_stamp; u->system_time = t->stime_local_stamp; u->tsc_to_system_mul = t->tsc_scale.mul_frac; u->tsc_shift = (s8)t->tsc_scale.shift; - wmb(); - u->time_version2++; - - /* Should only do this during do_settime(). */ - v->domain->shared_info->wc_sec = wc_sec; - v->domain->shared_info->wc_usec = wc_usec; + version_update_end(&u->version); } void update_dom_time(struct vcpu *v) @@ -659,21 +693,43 @@ } /* Set clock to <secs,usecs> after 00:00:00 UTC, 1 January, 1970. */ -void do_settime(unsigned long secs, unsigned long usecs, u64 system_time_base) -{ - u64 x, base_usecs; - u32 y; - - base_usecs = system_time_base; - do_div(base_usecs, 1000); - - x = (secs * 1000000ULL) + (u64)usecs + base_usecs; - y = do_div(x, 1000000); - - wc_sec = (unsigned long)x; - wc_usec = (unsigned long)y; - - __update_dom_time(current); +void do_settime(unsigned long secs, unsigned long nsecs, u64 system_time_base) +{ + u64 x; + u32 y, _wc_sec, _wc_nsec; + struct domain *d; + shared_info_t *s; + + x = (secs * 1000000000ULL) + (u64)nsecs - system_time_base; + y = do_div(x, 1000000000); + + wc_sec = _wc_sec = (u32)x; + wc_nsec = _wc_nsec = (u32)y; + + read_lock(&domlist_lock); + spin_lock(&wc_lock); + + for_each_domain ( d ) + { + s = d->shared_info; + version_update_begin(&s->wc_version); + s->wc_sec = _wc_sec; + s->wc_nsec = _wc_nsec; + version_update_end(&s->wc_version); + } + + spin_unlock(&wc_lock); + read_unlock(&domlist_lock); +} + +void init_domain_time(struct domain *d) +{ + spin_lock(&wc_lock); + version_update_begin(&d->shared_info->wc_version); + 
d->shared_info->wc_sec = wc_sec; + d->shared_info->wc_nsec = wc_nsec; + version_update_end(&d->shared_info->wc_version); + spin_unlock(&wc_lock); } static void local_time_calibration(void *unused) diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/x86/traps.c --- a/xen/arch/x86/traps.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/x86/traps.c Thu Aug 25 22:53:20 2005 @@ -159,10 +159,8 @@ addr = *stack++; if ( is_kernel_text(addr) ) { - if ( (i != 0) && ((i % 6) == 0) ) - printk("\n "); printk("[<%p>]", _p(addr)); - print_symbol(" %s\n", addr); + print_symbol(" %s\n ", addr); i++; } } @@ -422,7 +420,7 @@ { LOCK_BIGLOCK(d); if ( unlikely(d->arch.ptwr[PTWR_PT_ACTIVE].l1va) && - unlikely((addr >> L2_PAGETABLE_SHIFT) == + unlikely(l2_linear_offset(addr) == d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx) ) { ptwr_flush(d, PTWR_PT_ACTIVE); @@ -430,10 +428,15 @@ return EXCRET_fault_fixed; } - if ( (addr < HYPERVISOR_VIRT_START) && + if ( ((addr < HYPERVISOR_VIRT_START) +#if defined(__x86_64__) + || (addr >= HYPERVISOR_VIRT_END) +#endif + ) + && KERNEL_MODE(v, regs) && ((regs->error_code & 3) == 3) && /* write-protection fault */ - ptwr_do_page_fault(d, addr) ) + ptwr_do_page_fault(d, addr, regs) ) { UNLOCK_BIGLOCK(d); return EXCRET_fault_fixed; @@ -459,15 +462,13 @@ goto xen_fault; propagate_page_fault(addr, regs->error_code); - return 0; + return 0; xen_fault: if ( likely((fixup = search_exception_table(regs->eip)) != 0) ) { perfc_incrc(copy_user_faults); - if ( !shadow_mode_enabled(d) ) - DPRINTK("Page fault: %p -> %p\n", _p(regs->eip), _p(fixup)); regs->eip = fixup; return 0; } @@ -1155,7 +1156,6 @@ void __init trap_init(void) { extern void percpu_traps_init(void); - extern void cpu_init(void); /* * Note that interrupt gates are always used, rather than trap gates. 
We diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/x86/vmx.c --- a/xen/arch/x86/vmx.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/x86/vmx.c Thu Aug 25 22:53:20 2005 @@ -65,7 +65,7 @@ * are not modified once set for generic domains, we don't save them, * but simply reset them to the values set at percpu_traps_init(). */ -void vmx_load_msrs(struct vcpu *p, struct vcpu *n) +void vmx_load_msrs(struct vcpu *n) { struct msr_state *host_state; host_state = &percpu_msr[smp_processor_id()]; @@ -1712,9 +1712,6 @@ default: __vmx_bug(®s); /* should not happen */ } - - vmx_intr_assist(v); - return; } asmlinkage void load_cr2(void) diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/x86/vmx_intercept.c --- a/xen/arch/x86/vmx_intercept.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/x86/vmx_intercept.c Thu Aug 25 22:53:20 2005 @@ -74,10 +74,10 @@ static void pit_cal_count(struct vmx_virpit_t *vpit) { - unsigned int usec_delta = (unsigned int)((NOW() - vpit->inject_point) / 1000); - if (usec_delta > vpit->period * 1000) + u64 nsec_delta = (unsigned int)((NOW() - vpit->inject_point)); + if (nsec_delta > vpit->period) VMX_DBG_LOG(DBG_LEVEL_1, "VMX_PIT:long time has passed from last injection!"); - vpit->count = vpit->init_val - ((usec_delta * PIT_FREQ / 1000000) % vpit->init_val ); + vpit->count = vpit->init_val - ((nsec_delta * PIT_FREQ / 1000000000ULL) % vpit->init_val ); } static void pit_latch_io(struct vmx_virpit_t *vpit) @@ -197,9 +197,10 @@ static void pit_timer_fn(void *data) { struct vmx_virpit_t *vpit = data; - int missed_ticks; - - missed_ticks = (NOW() - vpit->scheduled) / MILLISECS(vpit->period); + s_time_t next; + int missed_ticks; + + missed_ticks = (NOW() - vpit->scheduled)/(s_time_t) vpit->period; /* Set the pending intr bit, and send evtchn notification to myself. 
*/ if (test_and_set_bit(vpit->vector, vpit->intr_bitmap)) @@ -208,12 +209,12 @@ /* pick up missed timer tick */ if ( missed_ticks > 0 ) { vpit->pending_intr_nr += missed_ticks; - vpit->scheduled += missed_ticks * MILLISECS(vpit->period); - } - vpit->scheduled += MILLISECS(vpit->period); - set_ac_timer(&vpit->pit_timer, vpit->scheduled); -} - + vpit->scheduled += missed_ticks * vpit->period; + } + next = vpit->scheduled + vpit->period; + set_ac_timer(&vpit->pit_timer, next); + vpit->scheduled = next; +} /* Only some PIT operations such as load init counter need a hypervisor hook. * leave all other operations in user space DM @@ -236,16 +237,17 @@ reinit = 1; } else - init_ac_timer(&vpit->pit_timer, pit_timer_fn, vpit, 0); + init_ac_timer(&vpit->pit_timer, pit_timer_fn, vpit, d->processor); /* init count for this channel */ vpit->init_val = (p->u.data & 0xFFFF) ; - /* frequency(ms) of pit */ - vpit->period = DIV_ROUND(((vpit->init_val) * 1000), PIT_FREQ); - if (vpit->period < 1) { + /* frequency(ns) of pit */ + vpit->period = DIV_ROUND(((vpit->init_val) * 1000000000ULL), PIT_FREQ); + VMX_DBG_LOG(DBG_LEVEL_1,"VMX_PIT: guest set init pit freq:%u ns, initval:0x%x\n", vpit->period, vpit->init_val); + if (vpit->period < 900000) { /* < 0.9 ms */ printk("VMX_PIT: guest programmed too small an init_val: %x\n", vpit->init_val); - vpit->period = 1; + vpit->period = 1000000; } vpit->vector = ((p->u.data >> 16) & 0xFF); vpit->channel = ((p->u.data >> 24) & 0x3); @@ -272,7 +274,7 @@ vpit->intr_bitmap = intr; - vpit->scheduled = NOW() + MILLISECS(vpit->period); + vpit->scheduled = NOW() + vpit->period; set_ac_timer(&vpit->pit_timer, vpit->scheduled); /*restore the state*/ diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/x86/vmx_io.c --- a/xen/arch/x86/vmx_io.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/x86/vmx_io.c Thu Aug 25 22:53:20 2005 @@ -631,12 +631,14 @@ return ((eflags & X86_EFLAGS_IF) == 0); } -void vmx_intr_assist(struct vcpu *v) +asmlinkage void vmx_intr_assist(void) { int 
intr_type = 0; - int highest_vector = find_highest_pending_irq(v, &intr_type); + int highest_vector; unsigned long intr_fields, eflags, interruptibility, cpu_exec_control; - + struct vcpu *v = current; + + highest_vector = find_highest_pending_irq(v, &intr_type); __vmread(CPU_BASED_VM_EXEC_CONTROL, &cpu_exec_control); if (highest_vector == -1) { @@ -712,9 +714,6 @@ /* We can't resume the guest if we're waiting on I/O */ ASSERT(!test_bit(ARCH_VMX_IO_WAIT, &d->arch.arch_vmx.flags)); - - /* We always check for interrupts before resuming guest */ - vmx_intr_assist(d); } #endif /* CONFIG_VMX */ diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/x86/vmx_vmcs.c --- a/xen/arch/x86/vmx_vmcs.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/x86/vmx_vmcs.c Thu Aug 25 22:53:20 2005 @@ -187,46 +187,52 @@ return 0; } -void vmx_do_launch(struct vcpu *v) -{ -/* Update CR3, GDT, LDT, TR */ +void vmx_set_host_env(struct vcpu *v) +{ unsigned int tr, cpu, error = 0; struct host_execution_env host_env; struct Xgt_desc_struct desc; - unsigned long pfn = 0; - struct pfn_info *page; - struct cpu_user_regs *regs = guest_cpu_user_regs(); - - vmx_stts(); cpu = smp_processor_id(); - - page = (struct pfn_info *) alloc_domheap_page(NULL); - pfn = (unsigned long) (page - frame_table); - - vmx_setup_platform(v, regs); - __asm__ __volatile__ ("sidt (%0) \n" :: "a"(&desc) : "memory"); host_env.idtr_limit = desc.size; host_env.idtr_base = desc.address; error |= __vmwrite(HOST_IDTR_BASE, host_env.idtr_base); - + __asm__ __volatile__ ("sgdt (%0) \n" :: "a"(&desc) : "memory"); host_env.gdtr_limit = desc.size; host_env.gdtr_base = desc.address; error |= __vmwrite(HOST_GDTR_BASE, host_env.gdtr_base); + __asm__ __volatile__ ("str (%0) \n" :: "a"(&tr) : "memory"); + host_env.tr_selector = tr; + host_env.tr_limit = sizeof(struct tss_struct); + host_env.tr_base = (unsigned long) &init_tss[cpu]; + error |= __vmwrite(HOST_TR_SELECTOR, host_env.tr_selector); + error |= __vmwrite(HOST_TR_BASE, host_env.tr_base); +} + +void 
vmx_do_launch(struct vcpu *v) +{ +/* Update CR3, GDT, LDT, TR */ + unsigned int error = 0; + unsigned long pfn = 0; + struct pfn_info *page; + struct cpu_user_regs *regs = guest_cpu_user_regs(); + + vmx_stts(); + + page = (struct pfn_info *) alloc_domheap_page(NULL); + pfn = (unsigned long) (page - frame_table); + + vmx_setup_platform(v, regs); + + vmx_set_host_env(v); + error |= __vmwrite(GUEST_LDTR_SELECTOR, 0); error |= __vmwrite(GUEST_LDTR_BASE, 0); error |= __vmwrite(GUEST_LDTR_LIMIT, 0); - __asm__ __volatile__ ("str (%0) \n" :: "a"(&tr) : "memory"); - host_env.tr_selector = tr; - host_env.tr_limit = sizeof(struct tss_struct); - host_env.tr_base = (unsigned long) &init_tss[cpu]; - - error |= __vmwrite(HOST_TR_SELECTOR, host_env.tr_selector); - error |= __vmwrite(HOST_TR_BASE, host_env.tr_base); error |= __vmwrite(GUEST_TR_BASE, 0); error |= __vmwrite(GUEST_TR_LIMIT, 0xff); @@ -523,12 +529,48 @@ void vm_launch_fail(unsigned long eflags) { + unsigned long error; + __vmread(VM_INSTRUCTION_ERROR, &error); + printk("<vm_launch_fail> error code %lx\n", error); __vmx_bug(guest_cpu_user_regs()); } void vm_resume_fail(unsigned long eflags) { + unsigned long error; + __vmread(VM_INSTRUCTION_ERROR, &error); + printk("<vm_resume_fail> error code %lx\n", error); __vmx_bug(guest_cpu_user_regs()); +} + +void arch_vmx_do_resume(struct vcpu *v) +{ + u64 vmcs_phys_ptr = (u64) virt_to_phys(v->arch.arch_vmx.vmcs); + + load_vmcs(&v->arch.arch_vmx, vmcs_phys_ptr); + vmx_do_resume(v); + reset_stack_and_jump(vmx_asm_do_resume); +} + +void arch_vmx_do_launch(struct vcpu *v) +{ + u64 vmcs_phys_ptr = (u64) virt_to_phys(v->arch.arch_vmx.vmcs); + + load_vmcs(&v->arch.arch_vmx, vmcs_phys_ptr); + vmx_do_launch(v); + reset_stack_and_jump(vmx_asm_do_launch); +} + +void arch_vmx_do_relaunch(struct vcpu *v) +{ + u64 vmcs_phys_ptr = (u64) virt_to_phys(v->arch.arch_vmx.vmcs); + + load_vmcs(&v->arch.arch_vmx, vmcs_phys_ptr); + vmx_do_resume(v); + vmx_set_host_env(v); + v->arch.schedule_tail = 
arch_vmx_do_resume; + + reset_stack_and_jump(vmx_asm_do_relaunch); } #endif /* CONFIG_VMX */ diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/x86/x86_32/entry.S --- a/xen/arch/x86/x86_32/entry.S Wed Aug 24 02:43:18 2005 +++ b/xen/arch/x86/x86_32/entry.S Thu Aug 25 22:53:20 2005 @@ -108,31 +108,26 @@ pushl %ecx; \ pushl %ebx; +#define VMX_RESTORE_ALL_NOSEGREGS \ + popl %ebx; \ + popl %ecx; \ + popl %edx; \ + popl %esi; \ + popl %edi; \ + popl %ebp; \ + popl %eax; \ + addl $(NR_SKIPPED_REGS*4), %esp + ENTRY(vmx_asm_vmexit_handler) /* selectors are restored/saved by VMX */ VMX_SAVE_ALL_NOSEGREGS call vmx_vmexit_handler jmp vmx_asm_do_resume -ENTRY(vmx_asm_do_launch) - popl %ebx - popl %ecx - popl %edx - popl %esi - popl %edi - popl %ebp - popl %eax - addl $(NR_SKIPPED_REGS*4), %esp - /* VMLUANCH */ - .byte 0x0f,0x01,0xc2 - pushf - call vm_launch_fail - hlt - - ALIGN - -ENTRY(vmx_asm_do_resume) -vmx_test_all_events: +.macro vmx_asm_common launch initialized +1: +/* vmx_test_all_events */ + .if \initialized GET_CURRENT(%ebx) /*test_all_events:*/ xorl %ecx,%ecx @@ -142,34 +137,51 @@ movl VCPU_processor(%ebx),%eax shl $IRQSTAT_shift,%eax test %ecx,irq_stat(%eax,1) - jnz vmx_process_softirqs - -vmx_restore_all_guest: + jnz 2f + +/* vmx_restore_all_guest */ + call vmx_intr_assist call load_cr2 + .endif + VMX_RESTORE_ALL_NOSEGREGS /* * Check if we are going back to VMX-based VM * By this time, all the setups in the VMCS must be complete. 
*/ - popl %ebx - popl %ecx - popl %edx - popl %esi - popl %edi - popl %ebp - popl %eax - addl $(NR_SKIPPED_REGS*4), %esp + .if \launch + /* VMLUANCH */ + .byte 0x0f,0x01,0xc2 + pushf + call vm_launch_fail + .else /* VMRESUME */ .byte 0x0f,0x01,0xc3 pushf call vm_resume_fail + .endif /* Should never reach here */ hlt ALIGN -vmx_process_softirqs: + .if \initialized +2: +/* vmx_process_softirqs */ sti call do_softirq - jmp vmx_test_all_events + jmp 1b + ALIGN + .endif +.endm + +ENTRY(vmx_asm_do_launch) + vmx_asm_common 1 0 + +ENTRY(vmx_asm_do_resume) + vmx_asm_common 0 1 + +ENTRY(vmx_asm_do_relaunch) + vmx_asm_common 1 1 + #endif ALIGN @@ -335,7 +347,8 @@ movl VCPU_vcpu_info(%ebx),%eax pushl VCPUINFO_upcall_mask(%eax) testb $TBF_INTERRUPT,%cl - setnz VCPUINFO_upcall_mask(%eax) # TBF_INTERRUPT -> clear upcall mask + setnz %ch # TBF_INTERRUPT -> set upcall mask + orb %ch,VCPUINFO_upcall_mask(%eax) popl %eax shll $16,%eax # Bits 16-23: saved_upcall_mask movw UREGS_cs+4(%esp),%ax # Bits 0-15: CS diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/x86/x86_32/mm.c --- a/xen/arch/x86/x86_32/mm.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/x86/x86_32/mm.c Thu Aug 25 22:53:20 2005 @@ -93,13 +93,10 @@ /* * Allocate and map the machine-to-phys table and create read-only mapping - * of MPT for guest-OS use. Without PAE we'll end up with one 4MB page, - * with PAE we'll allocate 2MB pages depending on the amount of memory - * installed, but at least 4MB to cover 4GB address space. This is needed - * to make PCI I/O memory address lookups work in guests. + * of MPT for guest-OS use. 
*/ - if ( (mpt_size = max_page * 4) < (4*1024*1024) ) - mpt_size = 4*1024*1024; + mpt_size = (max_page * 4) + (1UL << L2_PAGETABLE_SHIFT) - 1UL; + mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL); for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ ) { if ( (pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER, 0)) == NULL ) @@ -148,7 +145,7 @@ void subarch_init_memory(struct domain *dom_xen) { unsigned long m2p_start_mfn; - int i; + unsigned int i, j; /* * We are rather picky about the layout of 'struct pfn_info'. The @@ -172,12 +169,12 @@ { m2p_start_mfn = l2e_get_pfn( idle_pg_table_l2[l2_linear_offset(RDWR_MPT_VIRT_START) + i]); - for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) + for ( j = 0; j < L2_PAGETABLE_ENTRIES; j++ ) { - frame_table[m2p_start_mfn+i].count_info = PGC_allocated | 1; + frame_table[m2p_start_mfn+j].count_info = PGC_allocated | 1; /* Ensure it's only mapped read-only by domains. */ - frame_table[m2p_start_mfn+i].u.inuse.type_info = PGT_gdt_page | 1; - page_set_owner(&frame_table[m2p_start_mfn+i], dom_xen); + frame_table[m2p_start_mfn+j].u.inuse.type_info = PGT_gdt_page | 1; + page_set_owner(&frame_table[m2p_start_mfn+j], dom_xen); } } } diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/x86/x86_32/traps.c --- a/xen/arch/x86/x86_32/traps.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/x86/x86_32/traps.c Thu Aug 25 22:53:20 2005 @@ -1,5 +1,6 @@ #include <xen/config.h> +#include <xen/domain_page.h> #include <xen/init.h> #include <xen/sched.h> #include <xen/lib.h> @@ -66,8 +67,9 @@ printk("CPU: %d\nEIP: %04lx:[<%08lx>]", smp_processor_id(), (unsigned long)0xffff & regs->cs, eip); - print_symbol(" %s\n", eip); - printk("EFLAGS: %08lx CONTEXT: %s\n", eflags, context); + if ( !GUEST_MODE(regs) ) + print_symbol(" %s", eip); + printk("\nEFLAGS: %08lx CONTEXT: %s\n", eflags, context); printk("eax: %08x ebx: %08x ecx: %08x edx: %08x\n", regs->eax, regs->ebx, regs->ecx, regs->edx); printk("esi: %08x edi: %08x ebp: %08x esp: %08lx\n", @@ -85,24 +87,33 @@ void 
show_page_walk(unsigned long addr) { - l2_pgentry_t pmd; - l1_pgentry_t *pte; - - if ( addr < PAGE_OFFSET ) - return; + unsigned long pfn = read_cr3() >> PAGE_SHIFT; + intpte_t *ptab, ent; printk("Pagetable walk from %08lx:\n", addr); - - pmd = idle_pg_table_l2[l2_linear_offset(addr)]; - printk(" L2 = %"PRIpte" %s\n", l2e_get_intpte(pmd), - (l2e_get_flags(pmd) & _PAGE_PSE) ? "(2/4MB)" : ""); - if ( !(l2e_get_flags(pmd) & _PAGE_PRESENT) || - (l2e_get_flags(pmd) & _PAGE_PSE) ) - return; - - pte = __va(l2e_get_paddr(pmd)); - pte += l1_table_offset(addr); - printk(" L1 = %"PRIpte"\n", l1e_get_intpte(*pte)); + +#ifdef CONFIG_X86_PAE + ptab = map_domain_page(pfn); + ent = ptab[l3_table_offset(addr)]; + printk(" L3 = %"PRIpte"\n", ent); + unmap_domain_page(ptab); + if ( !(ent & _PAGE_PRESENT) ) + return; + pfn = ent >> PAGE_SHIFT; +#endif + + ptab = map_domain_page(pfn); + ent = ptab[l2_table_offset(addr)]; + printk(" L2 = %"PRIpte" %s\n", ent, (ent & _PAGE_PSE) ? "(PSE)" : ""); + unmap_domain_page(ptab); + if ( !(ent & _PAGE_PRESENT) || (ent & _PAGE_PSE) ) + return; + pfn = ent >> PAGE_SHIFT; + + ptab = map_domain_page(ent >> PAGE_SHIFT); + ent = ptab[l2_table_offset(addr)]; + printk(" L1 = %"PRIpte"\n", ent); + unmap_domain_page(ptab); } #define DOUBLEFAULT_STACK_SIZE 1024 diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/x86/x86_64/entry.S --- a/xen/arch/x86/x86_64/entry.S Wed Aug 24 02:43:18 2005 +++ b/xen/arch/x86/x86_64/entry.S Thu Aug 25 22:53:20 2005 @@ -194,39 +194,34 @@ pushq %r14; \ pushq %r15; \ +#define VMX_RESTORE_ALL_NOSEGREGS \ + popq %r15; \ + popq %r14; \ + popq %r13; \ + popq %r12; \ + popq %rbp; \ + popq %rbx; \ + popq %r11; \ + popq %r10; \ + popq %r9; \ + popq %r8; \ + popq %rax; \ + popq %rcx; \ + popq %rdx; \ + popq %rsi; \ + popq %rdi; \ + addq $(NR_SKIPPED_REGS*8), %rsp; \ + ENTRY(vmx_asm_vmexit_handler) /* selectors are restored/saved by VMX */ VMX_SAVE_ALL_NOSEGREGS call vmx_vmexit_handler jmp vmx_asm_do_resume -ENTRY(vmx_asm_do_launch) - popq %r15 
- popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx - popq %r11 - popq %r10 - popq %r9 - popq %r8 - popq %rax - popq %rcx - popq %rdx - popq %rsi - popq %rdi - addq $(NR_SKIPPED_REGS*8), %rsp - /* VMLUANCH */ - .byte 0x0f,0x01,0xc2 - pushfq - call vm_launch_fail - hlt - - ALIGN - -ENTRY(vmx_asm_do_resume) -vmx_test_all_events: +.macro vmx_asm_common launch initialized +1: + .if \initialized +/* vmx_test_all_events */ GET_CURRENT(%rbx) /* test_all_events: */ cli # tests must not race interrupts @@ -235,42 +230,52 @@ shl $IRQSTAT_shift,%rax leaq irq_stat(%rip), %rdx testl $~0,(%rdx,%rax,1) - jnz vmx_process_softirqs - -vmx_restore_all_guest: + jnz 2f + +/* vmx_restore_all_guest */ + call vmx_intr_assist call load_cr2 + .endif /* * Check if we are going back to VMX-based VM * By this time, all the setups in the VMCS must be complete. */ - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx - popq %r11 - popq %r10 - popq %r9 - popq %r8 - popq %rax - popq %rcx - popq %rdx - popq %rsi - popq %rdi - addq $(NR_SKIPPED_REGS*8), %rsp + VMX_RESTORE_ALL_NOSEGREGS + .if \launch + /* VMLUANCH */ + .byte 0x0f,0x01,0xc2 + pushfq + call vm_launch_fail + .else /* VMRESUME */ .byte 0x0f,0x01,0xc3 pushfq call vm_resume_fail + .endif /* Should never reach here */ hlt ALIGN -vmx_process_softirqs: + + .if \initialized +2: +/* vmx_process_softirqs */ sti call do_softirq - jmp vmx_test_all_events + jmp 1b + ALIGN + .endif +.endm + +ENTRY(vmx_asm_do_launch) + vmx_asm_common 1 0 + +ENTRY(vmx_asm_do_resume) + vmx_asm_common 0 1 + +ENTRY(vmx_asm_do_relaunch) + vmx_asm_common 1 1 + #endif ALIGN @@ -314,7 +319,8 @@ movq VCPU_vcpu_info(%rbx),%rax pushq VCPUINFO_upcall_mask(%rax) testb $TBF_INTERRUPT,%cl - setnz VCPUINFO_upcall_mask(%rax)# TBF_INTERRUPT -> clear upcall mask + setnz %ch # TBF_INTERRUPT -> set upcall mask + orb %ch,VCPUINFO_upcall_mask(%rax) popq %rax shlq $32,%rax # Bits 32-39: saved_upcall_mask movw UREGS_cs+8(%rsp),%ax # Bits 0-15: CS diff -r 
5f1ed597f107 -r 8799d14bef77 xen/arch/x86/x86_64/mm.c --- a/xen/arch/x86/x86_64/mm.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/x86/x86_64/mm.c Thu Aug 25 22:53:20 2005 @@ -74,7 +74,7 @@ void __init paging_init(void) { - unsigned long i; + unsigned long i, mpt_size; l3_pgentry_t *l3_ro_mpt; l2_pgentry_t *l2_ro_mpt; struct pfn_info *pg; @@ -98,16 +98,17 @@ * Allocate and map the machine-to-phys table. * This also ensures L3 is present for fixmaps. */ - for ( i = 0; i < max_page; i += ((1UL << L2_PAGETABLE_SHIFT) / 8) ) - { - pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER, 0); - if ( pg == NULL ) + mpt_size = (max_page * 4) + (1UL << L2_PAGETABLE_SHIFT) - 1UL; + mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL); + for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ ) + { + if ( (pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER, 0)) == NULL ) panic("Not enough memory for m2p table\n"); map_pages_to_xen( - RDWR_MPT_VIRT_START + i*8, page_to_pfn(pg), + RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT), page_to_pfn(pg), 1UL << PAGETABLE_ORDER, PAGE_HYPERVISOR); - memset((void *)(RDWR_MPT_VIRT_START + i*8), 0x55, + memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)), 0x55, 1UL << L2_PAGETABLE_SHIFT); *l2_ro_mpt++ = l2e_from_page( pg, _PAGE_GLOBAL|_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT); diff -r 5f1ed597f107 -r 8799d14bef77 xen/arch/x86/x86_64/traps.c --- a/xen/arch/x86/x86_64/traps.c Wed Aug 24 02:43:18 2005 +++ b/xen/arch/x86/x86_64/traps.c Thu Aug 25 22:53:20 2005 @@ -17,8 +17,9 @@ { printk("CPU: %d\nEIP: %04x:[<%016lx>]", smp_processor_id(), 0xffff & regs->cs, regs->rip); - print_symbol(" %s\n", regs->rip); - printk("EFLAGS: %016lx\n", regs->eflags); + if ( !GUEST_MODE(regs) ) + print_symbol(" %s", regs->rip); + printk("\nEFLAGS: %016lx\n", regs->eflags); printk("rax: %016lx rbx: %016lx rcx: %016lx rdx: %016lx\n", regs->rax, regs->rbx, regs->rcx, regs->rdx); printk("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n", diff -r 5f1ed597f107 -r 8799d14bef77 
xen/common/dom0_ops.c --- a/xen/common/dom0_ops.c Wed Aug 24 02:43:18 2005 +++ b/xen/common/dom0_ops.c Thu Aug 25 22:53:20 2005 @@ -70,8 +70,7 @@ flags &= ~DOMFLAGS_BLOCKED; if ( v->vcpu_flags & VCPUF_running ) flags |= DOMFLAGS_RUNNING; - if ( v->cpu_time > cpu_time ) - cpu_time += v->cpu_time; + cpu_time += v->cpu_time; vcpu_count++; } @@ -294,17 +293,17 @@ v->cpumap = cpumap; if ( cpumap == CPUMAP_RUNANYWHERE ) + { clear_bit(_VCPUF_cpu_pinned, &v->vcpu_flags); + } else { /* pick a new cpu from the usable map */ int new_cpu = (int)find_first_set_bit(cpumap) % num_online_cpus(); vcpu_pause(v); - if ( v->processor != new_cpu ) - set_bit(_VCPUF_cpu_migrated, &v->vcpu_flags); + vcpu_migrate_cpu(v, new_cpu); set_bit(_VCPUF_cpu_pinned, &v->vcpu_flags); - v->processor = new_cpu; vcpu_unpause(v); } @@ -475,7 +474,7 @@ case DOM0_SETTIME: { do_settime(op->u.settime.secs, - op->u.settime.usecs, + op->u.settime.nsecs, op->u.settime.system_time); ret = 0; } diff -r 5f1ed597f107 -r 8799d14bef77 xen/common/event_channel.c --- a/xen/common/event_channel.c Wed Aug 24 02:43:18 2005 +++ b/xen/common/event_channel.c Thu Aug 25 22:53:20 2005 @@ -588,7 +588,6 @@ long rc = 0; if ( (vcpu >= MAX_VIRT_CPUS) || (d->vcpu[vcpu] == NULL) ) { - printf("vcpu %d bad.\n", vcpu); return -EINVAL; } @@ -596,7 +595,6 @@ if ( !port_is_valid(d, port) ) { - printf("port %d bad.\n", port); rc = -EINVAL; goto out; } @@ -610,7 +608,6 @@ chn->notify_vcpu_id = vcpu; break; default: - printf("evtchn type %d can't be rebound.\n", chn->state); rc = -EINVAL; break; } diff -r 5f1ed597f107 -r 8799d14bef77 xen/common/grant_table.c --- a/xen/common/grant_table.c Wed Aug 24 02:43:18 2005 +++ b/xen/common/grant_table.c Thu Aug 25 22:53:20 2005 @@ -6,6 +6,8 @@ * * Copyright (c) 2005 Christopher Clark * Copyright (c) 2004 K A Fraser + * Copyright (c) 2005 Andrew Warfield + * Modifications by Geoffrey Lefebvre are (c) Intel Research Cambridge * * This program is free software; you can redistribute it and/or modify * it 
under the terms of the GNU General Public License as published by @@ -50,7 +52,7 @@ grant_table_t *t) { unsigned int h; - if ( unlikely((h = t->maptrack_head) == t->maptrack_limit) ) + if ( unlikely((h = t->maptrack_head) == (t->maptrack_limit - 1)) ) return -1; t->maptrack_head = t->maptrack[h].ref_and_flags >> MAPTRACK_REF_SHIFT; t->map_count++; @@ -68,13 +70,13 @@ static int __gnttab_activate_grant_ref( - struct domain *mapping_d, /* IN */ + struct domain *mapping_d, /* IN */ struct vcpu *mapping_ed, - struct domain *granting_d, - grant_ref_t ref, - u16 dev_hst_ro_flags, - unsigned long host_virt_addr, - unsigned long *pframe ) /* OUT */ + struct domain *granting_d, + grant_ref_t ref, + u16 dev_hst_ro_flags, + u64 addr, + unsigned long *pframe ) /* OUT */ { domid_t sdom; u16 sflags; @@ -95,7 +97,7 @@ * Returns: * . -ve: error * . 1: ok - * . 0: ok and TLB invalidate of host_virt_addr needed. + * . 0: ok and TLB invalidate of host_addr needed. * * On success, *pframe contains mfn. */ @@ -121,6 +123,10 @@ sflags = sha->flags; sdom = sha->domid; + /* This loop attempts to set the access (reading/writing) flags + * in the grant table entry. It tries a cmpxchg on the field + * up to five times, and then fails under the assumption that + * the guest is misbehaving. */ for ( ; ; ) { u32 scombo, prev_scombo, new_scombo; @@ -253,28 +259,32 @@ /* * At this point: - * act->pin updated to reflect mapping. + * act->pin updated to reference count mappings. * sha->flags updated to indicate to granting domain mapping done. * frame contains the mfn. */ spin_unlock(&granting_d->grant_table->lock); - if ( (host_virt_addr != 0) && (dev_hst_ro_flags & GNTMAP_host_map) ) + if ( (addr != 0) && (dev_hst_ro_flags & GNTMAP_host_map) ) { /* Write update into the pagetable. 
*/ l1_pgentry_t pte; pte = l1e_from_pfn(frame, GRANT_PTE_FLAGS); + + if ( (dev_hst_ro_flags & GNTMAP_application_map) ) + l1e_add_flags(pte,_PAGE_USER); if ( !(dev_hst_ro_flags & GNTMAP_readonly) ) l1e_add_flags(pte,_PAGE_RW); - rc = update_grant_va_mapping( host_virt_addr, pte, - mapping_d, mapping_ed ); - - /* - * IMPORTANT: (rc == 0) => must flush / invalidate entry in TLB. - * This is done in the outer gnttab_map_grant_ref. - */ - + + if ( dev_hst_ro_flags & GNTMAP_contains_pte ) + rc = update_grant_pte_mapping(addr, pte, mapping_d, mapping_ed); + else + rc = update_grant_va_mapping(addr, pte, mapping_d, mapping_ed); + + /* IMPORTANT: rc indicates the degree of TLB flush that is required. + * GNTST_flush_one (1) or GNTST_flush_all (2). This is done in the + * outer gnttab_map_grant_ref. */ if ( rc < 0 ) { /* Failure: undo and abort. */ @@ -317,20 +327,24 @@ /* * Returns 0 if TLB flush / invalidate required by caller. * va will indicate the address to be invalidated. + * + * addr is _either_ a host virtual address, or the address of the pte to + * update, as indicated by the GNTMAP_contains_pte flag. */ static int __gnttab_map_grant_ref( gnttab_map_grant_ref_t *uop, unsigned long *va) { - domid_t dom; - grant_ref_t ref; - struct domain *ld, *rd; + domid_t dom; + grant_ref_t ref; + struct domain *ld, *rd; struct vcpu *led; - u16 dev_hst_ro_flags; - int handle; - unsigned long frame = 0, host_virt_addr; - int rc; + u16 dev_hst_ro_flags; + int handle; + u64 addr; + unsigned long frame = 0; + int rc; led = current; ld = led->domain; @@ -338,19 +352,20 @@ /* Bitwise-OR avoids short-circuiting which screws control flow. 
*/ if ( unlikely(__get_user(dom, &uop->dom) | __get_user(ref, &uop->ref) | - __get_user(host_virt_addr, &uop->host_virt_addr) | + __get_user(addr, &uop->host_addr) | __get_user(dev_hst_ro_flags, &uop->flags)) ) { DPRINTK("Fault while reading gnttab_map_grant_ref_t.\n"); return -EFAULT; /* don't set status */ } - - if ( ((host_virt_addr != 0) || (dev_hst_ro_flags & GNTMAP_host_map)) && - unlikely(!__addr_ok(host_virt_addr))) - { - DPRINTK("Bad virtual address (%lx) or flags (%x).\n", - host_virt_addr, dev_hst_ro_flags); + if ( (dev_hst_ro_flags & GNTMAP_host_map) && + ( (addr == 0) || + (!(dev_hst_ro_flags & GNTMAP_contains_pte) && + unlikely(!__addr_ok(addr))) ) ) + { + DPRINTK("Bad virtual address (%"PRIx64") or flags (%"PRIx16").\n", + addr, dev_hst_ro_flags); (void)__put_user(GNTST_bad_virt_addr, &uop->handle); return GNTST_bad_gntref; } @@ -386,12 +401,20 @@ grant_mapping_t *new_mt; grant_table_t *lgt = ld->grant_table; + if ( (lgt->maptrack_limit << 1) > MAPTRACK_MAX_ENTRIES ) + { + put_domain(rd); + DPRINTK("Maptrack table is at maximum size.\n"); + (void)__put_user(GNTST_no_device_space, &uop->handle); + return GNTST_no_device_space; + } + /* Grow the maptrack table. 
*/ new_mt = alloc_xenheap_pages(lgt->maptrack_order + 1); if ( new_mt == NULL ) { put_domain(rd); - DPRINTK("No more map handles available\n"); + DPRINTK("No more map handles available.\n"); (void)__put_user(GNTST_no_device_space, &uop->handle); return GNTST_no_device_space; } @@ -405,7 +428,7 @@ lgt->maptrack_order += 1; lgt->maptrack_limit <<= 1; - printk("Doubled maptrack size\n"); + DPRINTK("Doubled maptrack size\n"); handle = get_maptrack_handle(ld->grant_table); } @@ -416,7 +439,7 @@ if ( 0 <= ( rc = __gnttab_activate_grant_ref( ld, led, rd, ref, dev_hst_ro_flags, - host_virt_addr, &frame))) + addr, &frame))) { /* * Only make the maptrack live _after_ writing the pte, in case we @@ -428,10 +451,11 @@ = (ref << MAPTRACK_REF_SHIFT) | (dev_hst_ro_flags & MAPTRACK_GNTMAP_MASK); - (void)__put_user(frame, &uop->dev_bus_addr); - - if ( dev_hst_ro_flags & GNTMAP_host_map ) - *va = host_virt_addr; + (void)__put_user((u64)frame << PAGE_SHIFT, &uop->dev_bus_addr); + + if ( ( dev_hst_ro_flags & GNTMAP_host_map ) && + !( dev_hst_ro_flags & GNTMAP_contains_pte) ) + *va = addr; (void)__put_user(handle, &uop->handle); } @@ -449,12 +473,12 @@ gnttab_map_grant_ref( gnttab_map_grant_ref_t *uop, unsigned int count) { - int i, flush = 0; + int i, rc, flush = 0; unsigned long va = 0; for ( i = 0; i < count; i++ ) - if ( __gnttab_map_grant_ref(&uop[i], &va) == 0 ) - flush++; + if ( (rc =__gnttab_map_grant_ref(&uop[i], &va)) >= 0 ) + flush += rc; if ( flush == 1 ) flush_tlb_one_mask(current->domain->cpumask, va); @@ -469,28 +493,30 @@ gnttab_unmap_grant_ref_t *uop, unsigned long *va) { - domid_t dom; - grant_ref_t ref; - u16 handle; - struct domain *ld, *rd; - + domid_t dom; + grant_ref_t ref; + u16 handle; + struct domain *ld, *rd; active_grant_entry_t *act; - grant_entry_t *sha; + grant_entry_t *sha; grant_mapping_t *map; - u16 flags; - s16 rc = 1; - unsigned long frame, virt; + u16 flags; + s16 rc = 1; + u64 addr, dev_bus_addr; + unsigned long frame; ld = current->domain; /* 
Bitwise-OR avoids short-circuiting which screws control flow. */ - if ( unlikely(__get_user(virt, &uop->host_virt_addr) | - __get_user(frame, &uop->dev_bus_addr) | + if ( unlikely(__get_user(addr, &uop->host_addr) | + __get_user(dev_bus_addr, &uop->dev_bus_addr) | __get_user(handle, &uop->handle)) ) { DPRINTK("Fault while reading gnttab_unmap_grant_ref_t.\n"); return -EFAULT; /* don't set status */ } + + frame = (unsigned long)(dev_bus_addr >> PAGE_SHIFT); map = &ld->grant_table->maptrack[handle]; @@ -529,15 +555,6 @@ if ( frame == 0 ) { frame = act->frame; - } - else if ( frame == GNTUNMAP_DEV_FROM_VIRT ) - { - if ( !( flags & GNTMAP_device_map ) ) - PIN_FAIL(unmap_out, GNTST_bad_dev_addr, - "Bad frame number: frame not mapped for dev access.\n"); - frame = act->frame; - - /* Frame will be unmapped for device access below if virt addr okay. */ } else { @@ -554,41 +571,19 @@ /* Frame is now unmapped for device access. */ } - if ( (virt != 0) && + if ( (addr != 0) && (flags & GNTMAP_host_map) && ((act->pin & (GNTPIN_hstw_mask | GNTPIN_hstr_mask)) > 0)) { - l1_pgentry_t *pl1e; - unsigned long _ol1e; - - pl1e = &linear_pg_table[l1_linear_offset(virt)]; - - if ( unlikely(__get_user(_ol1e, (unsigned long *)pl1e) != 0) ) - { - DPRINTK("Could not find PTE entry for address %lx\n", virt); - rc = -EINVAL; - goto unmap_out; - } - - /* - * Check that the virtual address supplied is actually mapped to - * act->frame. - */ - if ( unlikely((_ol1e >> PAGE_SHIFT) != frame )) - { - DPRINTK("PTE entry %lx for address %lx doesn't match frame %lx\n", - _ol1e, virt, frame); - rc = -EINVAL; - goto unmap_out; - } - - /* Delete pagetable entry. 
*/ - if ( unlikely(__put_user(0, (unsigned long *)pl1e))) - { - DPRINTK("Cannot delete PTE entry at %p for virtual address %lx\n", - pl1e, virt); - rc = -EINVAL; - goto unmap_out; + if ( flags & GNTMAP_contains_pte ) + { + if ( (rc = clear_grant_pte_mapping(addr, frame, ld)) < 0 ) + goto unmap_out; + } + else + { + if ( (rc = clear_grant_va_mapping(addr, frame)) < 0 ) + goto unmap_out; } map->ref_and_flags &= ~GNTMAP_host_map; @@ -596,17 +591,9 @@ act->pin -= (flags & GNTMAP_readonly) ? GNTPIN_hstr_inc : GNTPIN_hstw_inc; - if ( frame == GNTUNMAP_DEV_FROM_VIRT ) - { - act->pin -= (flags & GNTMAP_readonly) ? GNTPIN_devr_inc - : GNTPIN_devw_inc; - - map->ref_and_flags &= ~GNTMAP_device_map; - (void)__put_user(0, &uop->dev_bus_addr); - } - rc = 0; - *va = virt; + if ( !( flags & GNTMAP_contains_pte) ) + *va = addr; } if ( (map->ref_and_flags & (GNTMAP_device_map|GNTMAP_host_map)) == 0) @@ -630,6 +617,7 @@ if ( act->pin == 0 ) { + act->frame = 0xdeadbeef; clear_bit(_GTF_reading, &sha->flags); put_page(&frame_table[frame]); } @@ -768,7 +756,7 @@ if ( sha_copy.flags ) { DPRINTK("Grant: dom (%hu) SHARED (%d) flags:(%hx) " - "dom:(%hu) frame:(%lx)\n", + "dom:(%hu) frame:(%x)\n", op.dom, i, sha_copy.flags, sha_copy.domid, sha_copy.frame); } } @@ -822,18 +810,20 @@ for (i = 0; i < count; i++) { gnttab_donate_t *gop = &uop[i]; #if GRANT_DEBUG - printk("gnttab_donate: i=%d mfn=%08x domid=%d gref=%08x\n", + printk("gnttab_donate: i=%d mfn=%lx domid=%d gref=%08x\n", i, gop->mfn, gop->domid, gop->handle); #endif page = &frame_table[gop->mfn]; - + if (unlikely(IS_XEN_HEAP_FRAME(page))) { - printk("gnttab_donate: xen heap frame mfn=%lx\n", (unsigned long) gop->mfn); + printk("gnttab_donate: xen heap frame mfn=%lx\n", + (unsigned long) gop->mfn); gop->status = GNTST_bad_virt_addr; continue; } if (unlikely(!pfn_valid(page_to_pfn(page)))) { - printk("gnttab_donate: invalid pfn for mfn=%lx\n", (unsigned long) gop->mfn); + printk("gnttab_donate: invalid pfn for mfn=%lx\n", + (unsigned 
long) gop->mfn); gop->status = GNTST_bad_virt_addr; continue; } @@ -859,7 +849,8 @@ if (unlikely((x & (PGC_count_mask|PGC_allocated)) != (1 | PGC_allocated)) || unlikely(_nd != _d)) { printk("gnttab_donate: Bad page values %p: ed=%p(%u), sd=%p," - " caf=%08x, taf=%08x\n", (void *) page_to_pfn(page), + " caf=%08x, taf=%" PRtype_info "\n", + (void *) page_to_pfn(page), d, d->domain_id, unpickle_domptr(_nd), x, page->u.inuse.type_info); spin_unlock(&d->page_alloc_lock); @@ -918,9 +909,9 @@ if (unlikely(test_bit(DOMFLAGS_DYING, &e->domain_flags)) || unlikely(e->tot_pages == e->max_pages) || unlikely(!gnttab_prepare_for_transfer(e, d, gop->handle))) { - printk("gnttab_donate: Transferee has no reservation headroom (%d,%d), or " - "provided a bad grant ref (%08x), or is dying (%p).\n", - e->tot_pages, e->max_pages, gop->handle, e->d_flags); + printk("gnttab_donate: Transferee has no reservation headroom (%d," + "%d) or provided a bad grant ref (%08x) or is dying (%p)\n", + e->tot_pages, e->max_pages, gop->handle, e->d_flags); spin_unlock(&e->page_alloc_lock); put_domain(e); result = GNTST_general_error; @@ -933,9 +924,9 @@ } list_add_tail(&page->list, &e->page_list); page_set_owner(page, e); - + spin_unlock(&e->page_alloc_lock); - + /* * Transfer is all done: tell the guest about its new page * frame. 
@@ -943,7 +934,7 @@ gnttab_notify_transfer(e, d, gop->handle, gop->mfn); put_domain(e); - + gop->status = GNTST_okay; } return result; @@ -954,48 +945,53 @@ unsigned int cmd, void *uop, unsigned int count) { long rc; - + struct domain *d = current->domain; + if ( count > 512 ) return -EINVAL; - - LOCK_BIGLOCK(current->domain); - + + LOCK_BIGLOCK(d); + + sync_pagetable_state(d); + rc = -EFAULT; switch ( cmd ) - { - case GNTTABOP_map_grant_ref: - if ( unlikely(!array_access_ok( - uop, count, sizeof(gnttab_map_grant_ref_t))) ) - goto out; - rc = gnttab_map_grant_ref((gnttab_map_grant_ref_t *)uop, count); - break; - case GNTTABOP_unmap_grant_ref: - if ( unlikely(!array_access_ok( - uop, count, sizeof(gnttab_unmap_grant_ref_t))) ) - goto out; - rc = gnttab_unmap_grant_ref((gnttab_unmap_grant_ref_t *)uop, count); - break; - case GNTTABOP_setup_table: - rc = gnttab_setup_table((gnttab_setup_table_t *)uop, count); - break; + { + case GNTTABOP_map_grant_ref: + if ( unlikely(!array_access_ok( + uop, count, sizeof(gnttab_map_grant_ref_t))) ) + goto out; + rc = gnttab_map_grant_ref((gnttab_map_grant_ref_t *)uop, count); + break; + case GNTTABOP_unmap_grant_ref: + if ( unlikely(!array_access_ok( + uop, count, sizeof(gnttab_unmap_grant_ref_t))) ) + goto out; + rc = gnttab_unmap_grant_ref((gnttab_unmap_grant_ref_t *)uop, + count); + break; + case GNTTABOP_setup_table: + rc = gnttab_setup_table((gnttab_setup_table_t *)uop, count); + break; #if GRANT_DEBUG - case GNTTABOP_dump_table: - rc = gnttab_dump_table((gnttab_dump_table_t *)uop); - break; + case GNTTABOP_dump_table: + rc = gnttab_dump_table((gnttab_dump_table_t *)uop); + break; #endif - case GNTTABOP_donate: - if (unlikely(!array_access_ok(uop, count, sizeof(gnttab_donate_t)))) - goto out; - rc = gnttab_donate(uop, count); - break; - default: - rc = -ENOSYS; - break; - } - -out: - UNLOCK_BIGLOCK(current->domain); - + case GNTTABOP_donate: + if (unlikely(!array_access_ok(uop, count, + sizeof(gnttab_donate_t)))) + goto out; + 
rc = gnttab_donate(uop, count); + break; + default: + rc = -ENOSYS; + break; + } + + out: + UNLOCK_BIGLOCK(d); + return rc; } @@ -1009,106 +1005,101 @@ * Called a _lot_ at domain creation because pages mapped by priv domains * also traverse this. */ - + /* Note: If the same frame is mapped multiple times, and then one of * the ptes is overwritten, which maptrack handle gets invalidated? * Advice: Don't do it. Explicitly unmap. */ - + unsigned int handle, ref, refcount; grant_table_t *lgt, *rgt; active_grant_entry_t *act; grant_mapping_t *map; int found = 0; - + lgt = ld->grant_table; - + #if GRANT_DEBUG_VERBOSE - if ( ld->domain_id != 0 ) - { - DPRINTK("Foreign unref rd(%d) ld(%d) frm(%x) flgs(%x).\n", - rd->domain_id, ld->domain_id, frame, readonly); - } + if ( ld->domain_ id != 0 ) { + DPRINTK("Foreign unref rd(%d) ld(%d) frm(%lx) flgs(%x).\n", + rd->domain_id, ld->domain_id, frame, readonly); + } #endif - + /* Fast exit if we're not mapping anything using grant tables */ if ( lgt->map_count == 0 ) return 0; - - if ( get_domain(rd) == 0 ) - { + + if ( get_domain(rd) == 0 ) { DPRINTK("gnttab_check_unmap: couldn't get_domain rd(%d)\n", rd->domain_id); return 0; } - + rgt = rd->grant_table; - - for ( handle = 0; handle < lgt->maptrack_limit; handle++ ) - { + + for ( handle = 0; handle < lgt->maptrack_limit; handle++ ) { + map = &lgt->maptrack[handle]; - + if ( map->domid != rd->domain_id ) continue; - + if ( ( map->ref_and_flags & MAPTRACK_GNTMAP_MASK ) && - ( readonly ? 1 : (!(map->ref_and_flags & GNTMAP_readonly)))) - { + ( readonly ? 1 : (!(map->ref_and_flags & GNTMAP_readonly)))) { + ref = (map->ref_and_flags >> MAPTRACK_REF_SHIFT); act = &rgt->active[ref]; - + spin_lock(&rgt->lock); - - if ( act->frame != frame ) - { + + if ( act->frame != frame ) { spin_unlock(&rgt->lock); continue; } - + refcount = act->pin & ( readonly ? 
GNTPIN_hstr_mask - : GNTPIN_hstw_mask ); - if ( refcount == 0 ) - { + : GNTPIN_hstw_mask ); + + if ( refcount == 0 ) { spin_unlock(&rgt->lock); continue; } - + /* gotcha */ DPRINTK("Grant unref rd(%d) ld(%d) frm(%lx) flgs(%x).\n", rd->domain_id, ld->domain_id, frame, readonly); - + if ( readonly ) act->pin -= GNTPIN_hstr_inc; - else - { + else { act->pin -= GNTPIN_hstw_inc; - + /* any more granted writable mappings? */ - if ( (act->pin & (GNTPIN_hstw_mask|GNTPIN_devw_mask)) == 0 ) - { + if ( (act->pin & (GNTPIN_hstw_mask|GNTPIN_devw_mask)) == 0 ) { clear_bit(_GTF_writing, &rgt->shared[ref].flags); put_page_type(&frame_table[frame]); } } - - if ( act->pin == 0 ) - { + + if ( act->pin == 0 ) { clear_bit(_GTF_reading, &rgt->shared[ref].flags); put_page(&frame_table[frame]); } + spin_unlock(&rgt->lock); - + clear_bit(GNTMAP_host_map, &map->ref_and_flags); - + if ( !(map->ref_and_flags & GNTMAP_device_map) ) put_maptrack_handle(lgt, handle); - + found = 1; break; } } put_domain(rd); - + return found; } @@ -1124,8 +1115,10 @@ int retries = 0; unsigned long target_pfn; +#if GRANT_DEBUG_VERBOSE DPRINTK("gnttab_prepare_for_transfer rd(%hu) ld(%hu) ref(%hu).\n", rd->domain_id, ld->domain_id, ref); +#endif if ( unlikely((rgt = rd->grant_table) == NULL) || unlikely(ref >= NR_GRANT_ENTRIES) ) @@ -1203,8 +1196,10 @@ grant_entry_t *sha; unsigned long pfn; +#if GRANT_DEBUG_VERBOSE DPRINTK("gnttab_notify_transfer rd(%hu) ld(%hu) ref(%hu).\n", rd->domain_id, ld->domain_id, ref); +#endif sha = &rd->grant_table->shared[ref]; diff -r 5f1ed597f107 -r 8799d14bef77 xen/common/lib.c --- a/xen/common/lib.c Wed Aug 24 02:43:18 2005 +++ b/xen/common/lib.c Thu Aug 25 22:53:20 2005 @@ -450,8 +450,10 @@ ret <<= 10; case 'M': case 'm': ret <<= 10; - case 'K': case 'k': + case 'K': case 'k': default: ret <<= 10; + case 'B': case 'b': + break; } return ret; diff -r 5f1ed597f107 -r 8799d14bef77 xen/common/page_alloc.c --- a/xen/common/page_alloc.c Wed Aug 24 02:43:18 2005 +++ 
b/xen/common/page_alloc.c Thu Aug 25 22:53:20 2005 @@ -52,7 +52,6 @@ * One bit per page of memory. Bit set => page is allocated. */ -static unsigned long bitmap_size; /* in bytes */ static unsigned long *alloc_bitmap; #define PAGES_PER_MAPWORD (sizeof(unsigned long) * 8) @@ -135,10 +134,16 @@ /* Initialise allocator to handle up to @max_page pages. */ physaddr_t init_boot_allocator(physaddr_t bitmap_start) { + unsigned long bitmap_size; + bitmap_start = round_pgup(bitmap_start); - /* Allocate space for the allocation bitmap. */ + /* + * Allocate space for the allocation bitmap. Include an extra longword + * of padding for possible overrun in map_alloc and map_free. + */ bitmap_size = max_page / 8; + bitmap_size += sizeof(unsigned long); bitmap_size = round_pgup(bitmap_size); alloc_bitmap = (unsigned long *)phys_to_virt(bitmap_start); @@ -171,7 +176,7 @@ else if ( *p != '\0' ) break; - if ( (bad_pfn < (bitmap_size*8)) && !allocated_in_map(bad_pfn) ) + if ( (bad_pfn < max_page) && !allocated_in_map(bad_pfn) ) { printk("Marking page %lx as bad\n", bad_pfn); map_alloc(bad_pfn, 1); @@ -183,7 +188,7 @@ { unsigned long pg, i; - for ( pg = 0; (pg + nr_pfns) < (bitmap_size*8); pg += pfn_align ) + for ( pg = 0; (pg + nr_pfns) < max_page; pg += pfn_align ) { for ( i = 0; i < nr_pfns; i++ ) if ( allocated_in_map(pg + i) ) @@ -362,7 +367,7 @@ printk("Scrubbing Free RAM: "); - for ( pfn = 0; pfn < (bitmap_size * 8); pfn++ ) + for ( pfn = 0; pfn < max_page; pfn++ ) { /* Every 100MB, print a progress dot. 
*/ if ( (pfn % ((100*1024*1024)/PAGE_SIZE)) == 0 ) @@ -413,6 +418,8 @@ ps = round_pgup(ps); pe = round_pgdown(pe); + if ( pe <= ps ) + return; memguard_guard_range(phys_to_virt(ps), pe - ps); @@ -482,19 +489,25 @@ ps = round_pgup(ps) >> PAGE_SHIFT; pe = round_pgdown(pe) >> PAGE_SHIFT; - - if (ps < MAX_DMADOM_PFN && pe > MAX_DMADOM_PFN) { - init_heap_pages(MEMZONE_DMADOM, pfn_to_page(ps), MAX_DMADOM_PFN - ps); - init_heap_pages(MEMZONE_DOM, pfn_to_page(MAX_DMADOM_PFN), - pe - MAX_DMADOM_PFN); + if ( pe <= ps ) + return; + + if ( (ps < MAX_DMADOM_PFN) && (pe > MAX_DMADOM_PFN) ) + { + init_heap_pages( + MEMZONE_DMADOM, pfn_to_page(ps), MAX_DMADOM_PFN - ps); + init_heap_pages( + MEMZONE_DOM, pfn_to_page(MAX_DMADOM_PFN), pe - MAX_DMADOM_PFN); } else + { init_heap_pages(pfn_dom_zone_type(ps), pfn_to_page(ps), pe - ps); -} - - -struct pfn_info *alloc_domheap_pages(struct domain *d, unsigned int order, - unsigned int flags) + } +} + + +struct pfn_info *alloc_domheap_pages( + struct domain *d, unsigned int order, unsigned int flags) { struct pfn_info *pg; cpumask_t mask; diff -r 5f1ed597f107 -r 8799d14bef77 xen/common/perfc.c --- a/xen/common/perfc.c Wed Aug 24 02:43:18 2005 +++ b/xen/common/perfc.c Thu Aug 25 22:53:20 2005 @@ -7,6 +7,7 @@ #include <xen/spinlock.h> #include <public/dom0_ops.h> #include <asm/uaccess.h> +#include <xen/mm.h> #undef PERFCOUNTER #undef PERFCOUNTER_CPU @@ -81,6 +82,10 @@ } printk("\n"); } + +#ifdef PERF_ARRAYS + ptwr_eip_stat_print(); +#endif } void perfc_reset(unsigned char key) @@ -118,6 +123,10 @@ break; } } + +#ifdef PERF_ARRAYS + ptwr_eip_stat_reset(); +#endif } static dom0_perfc_desc_t perfc_d[NR_PERFCTRS]; diff -r 5f1ed597f107 -r 8799d14bef77 xen/common/schedule.c --- a/xen/common/schedule.c Wed Aug 24 02:43:18 2005 +++ b/xen/common/schedule.c Thu Aug 25 22:53:20 2005 @@ -38,6 +38,8 @@ #include <xen/mm.h> #include <public/sched_ctl.h> +extern void arch_getdomaininfo_ctxt(struct vcpu *, + struct vcpu_guest_context *); /* opt_sched: 
scheduler - default to SEDF */ static char opt_sched[10] = "sedf"; string_param("sched", opt_sched); @@ -82,7 +84,8 @@ int i; SCHED_OP(free_task, d); - for (i = 0; i < MAX_VIRT_CPUS; i++) + /* vcpu 0 has to be the last one destructed. */ + for (i = MAX_VIRT_CPUS-1; i >= 0; i--) if ( d->vcpu[i] ) arch_free_vcpu_struct(d->vcpu[i]); @@ -295,10 +298,36 @@ return 0; } +static long do_vcpu_pickle(int vcpu, unsigned long arg) +{ + struct vcpu *v; + vcpu_guest_context_t *c; + int ret = 0; + + if (vcpu >= MAX_VIRT_CPUS) + return -EINVAL; + v = current->domain->vcpu[vcpu]; + if (!v) + return -ESRCH; + /* Don't pickle vcpus which are currently running */ + if (!test_bit(_VCPUF_down, &v->vcpu_flags)) { + return -EBUSY; + } + c = xmalloc(vcpu_guest_context_t); + if (!c) + return -ENOMEM; + arch_getdomaininfo_ctxt(v, c); + if (copy_to_user((vcpu_guest_context_t *)arg, + (const vcpu_guest_context_t *)c, sizeof(*c))) + ret = -EFAULT; + xfree(c); + return ret; +} + /* * Demultiplex scheduler-related hypercalls. */ -long do_sched_op(unsigned long op) +long do_sched_op(unsigned long op, unsigned long arg) { long ret = 0; @@ -332,6 +361,11 @@ case SCHEDOP_vcpu_up: { ret = do_vcpu_up((int)(op >> SCHEDOP_vcpushift)); + break; + } + case SCHEDOP_vcpu_pickle: + { + ret = do_vcpu_pickle((int)(op >> SCHEDOP_vcpushift), arg); break; } @@ -474,13 +508,14 @@ set_ac_timer(&schedule_data[cpu].s_timer, now + r_time); - /* Must be protected by the schedule_lock! 
*/ + if ( unlikely(prev == next) ) + { + spin_unlock_irq(&schedule_data[cpu].schedule_lock); + return continue_running(prev); + } + + clear_bit(_VCPUF_running, &prev->vcpu_flags); set_bit(_VCPUF_running, &next->vcpu_flags); - - spin_unlock_irq(&schedule_data[cpu].schedule_lock); - - if ( unlikely(prev == next) ) - return continue_running(prev); perfc_incrc(sched_ctx); @@ -517,6 +552,10 @@ next->domain->domain_id, next->vcpu_id); context_switch(prev, next); + + spin_unlock_irq(&schedule_data[cpu].schedule_lock); + + context_switch_finalise(next); } /* No locking needed -- pointer comparison is safe :-) */ diff -r 5f1ed597f107 -r 8799d14bef77 xen/common/trace.c --- a/xen/common/trace.c Wed Aug 24 02:43:18 2005 +++ b/xen/common/trace.c Thu Aug 25 22:53:20 2005 @@ -113,10 +113,10 @@ switch ( tbc->op) { case DOM0_TBUF_GET_INFO: - tbc->cpu_mask = tb_cpu_mask; - tbc->evt_mask = tb_event_mask; - tbc->mach_addr = __pa(t_bufs[0]); - tbc->size = opt_tbuf_size * PAGE_SIZE; + tbc->cpu_mask = tb_cpu_mask; + tbc->evt_mask = tb_event_mask; + tbc->buffer_mfn = __pa(t_bufs[0]) >> PAGE_SHIFT; + tbc->size = opt_tbuf_size * PAGE_SIZE; break; case DOM0_TBUF_SET_CPU_MASK: tb_cpu_mask = tbc->cpu_mask; diff -r 5f1ed597f107 -r 8799d14bef77 xen/drivers/char/console.c --- a/xen/drivers/char/console.c Wed Aug 24 02:43:18 2005 +++ b/xen/drivers/char/console.c Thu Aug 25 22:53:20 2005 @@ -652,8 +652,9 @@ void panic(const char *fmt, ...) { va_list args; - char buf[128], cpustr[10]; + char buf[128]; unsigned long flags; + static spinlock_t lock = SPIN_LOCK_UNLOCKED; extern void machine_restart(char *); debugtrace_dump(); @@ -665,16 +666,13 @@ debugger_trap_immediate(); /* Spit out multiline message in one go. 
*/ - spin_lock_irqsave(&console_lock, flags); - __putstr("\n****************************************\n"); - __putstr("Panic on CPU"); - sprintf(cpustr, "%d", smp_processor_id()); - __putstr(cpustr); - __putstr(":\n"); - __putstr(buf); - __putstr("****************************************\n\n"); - __putstr("Reboot in five seconds...\n"); - spin_unlock_irqrestore(&console_lock, flags); + spin_lock_irqsave(&lock, flags); + printk("\n****************************************\n"); + printk("Panic on CPU %d:\n", smp_processor_id()); + printk(buf); + printk("****************************************\n\n"); + printk("Reboot in five seconds...\n"); + spin_unlock_irqrestore(&lock, flags); watchdog_disable(); mdelay(5000); diff -r 5f1ed597f107 -r 8799d14bef77 xen/drivers/char/ns16550.c --- a/xen/drivers/char/ns16550.c Wed Aug 24 02:43:18 2005 +++ b/xen/drivers/char/ns16550.c Thu Aug 25 22:53:20 2005 @@ -15,7 +15,12 @@ #include <xen/serial.h> #include <asm/io.h> -/* Config serial port with a string <baud>,DPS,<io-base>,<irq>. */ +/* + * Configure serial port with a string <baud>,DPS,<io-base>,<irq>. + * The tail of the string can be omitted if platform defaults are sufficient. + * If the baud rate is pre-configured, perhaps by a bootloader, then 'auto' + * can be specified in place of a numeric baud rate. + */ static char opt_com1[30] = "", opt_com2[30] = ""; string_param("com1", opt_com1); string_param("com2", opt_com2); @@ -154,7 +159,7 @@ ns_write_reg(uart, IER, 0); /* Line control and baud-rate generator. */ - if ( uart->baud != 0 ) + if ( uart->baud != BAUD_AUTO ) { ns_write_reg(uart, LCR, lcr | LCR_DLAB); ns_write_reg(uart, DLL, 115200/uart->baud); /* baud lo */ @@ -244,38 +249,50 @@ { int baud; + /* No user-specified configuration? */ if ( (conf == NULL) || (*conf == '\0') ) - goto config_parsed; - - if ( (baud = simple_strtol(conf, &conf, 10)) != 0 ) + { + /* Some platforms may automatically probe the UART configuartion. 
*/ + if ( uart->baud != 0 ) + goto config_parsed; + return; + } + + if ( strncmp(conf, "auto", 4) == 0 ) + { + uart->baud = BAUD_AUTO; + conf += 4; + } + else if ( (baud = simple_strtoul(conf, &conf, 10)) != 0 ) uart->baud = baud; if ( *conf != ',' ) goto config_parsed; conf++; - uart->data_bits = simple_strtol(conf, &conf, 10); + uart->data_bits = simple_strtoul(conf, &conf, 10); uart->parity = parse_parity_char(*conf); conf++; - uart->stop_bits = simple_strtol(conf, &conf, 10); + uart->stop_bits = simple_strtoul(conf, &conf, 10); if ( *conf == ',' ) { conf++; - uart->io_base = simple_strtol(conf, &conf, 0); + uart->io_base = simple_strtoul(conf, &conf, 0); if ( *conf == ',' ) { conf++; - uart->irq = simple_strtol(conf, &conf, 10); + uart->irq = simple_strtoul(conf, &conf, 10); } } config_parsed: /* Sanity checks. */ - if ( (uart->baud != 0) && ((uart->baud < 1200) || (uart->baud > 115200)) ) + if ( (uart->baud != BAUD_AUTO) && + ((uart->baud < 1200) || (uart->baud > 115200)) ) PARSE_ERR("Baud rate %d outside supported range.", uart->baud); if ( (uart->data_bits < 5) || (uart->data_bits > 8) ) PARSE_ERR("%d data bits are unsupported.", uart->data_bits); diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/asm-x86/e820.h --- a/xen/include/asm-x86/e820.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/asm-x86/e820.h Thu Aug 25 22:53:20 2005 @@ -3,7 +3,7 @@ #include <asm/page.h> -#define E820MAX 32 +#define E820MAX 128 #define E820_RAM 1 #define E820_RESERVED 2 diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/asm-x86/event.h --- a/xen/include/asm-x86/event.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/asm-x86/event.h Thu Aug 25 22:53:20 2005 @@ -11,6 +11,19 @@ static inline void evtchn_notify(struct vcpu *v) { + /* + * NB1. 'vcpu_flags' and 'processor' must be checked /after/ update of + * pending flag. These values may fluctuate (after all, we hold no + * locks) but the key insight is that each change will cause + * evtchn_upcall_pending to be polled. + * + * NB2. 
We save VCPUF_running across the unblock to avoid a needless + * IPI for domains that we IPI'd to unblock. + */ + int running = test_bit(_VCPUF_running, &v->vcpu_flags); + vcpu_unblock(v); + if ( running ) + smp_send_event_check_cpu(v->processor); } #endif diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/asm-x86/io.h --- a/xen/include/asm-x86/io.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/asm-x86/io.h Thu Aug 25 22:53:20 2005 @@ -2,6 +2,7 @@ #define _ASM_IO_H #include <xen/config.h> +#include <xen/types.h> #include <asm/page.h> #define IO_SPACE_LIMIT 0xffff @@ -45,11 +46,7 @@ /* * Change "struct pfn_info" to physical address. */ -#ifdef CONFIG_HIGHMEM64G -#define page_to_phys(page) ((u64)(page - frame_table) << PAGE_SHIFT) -#else -#define page_to_phys(page) ((page - frame_table) << PAGE_SHIFT) -#endif +#define page_to_phys(page) ((physaddr_t)(page - frame_table) << PAGE_SHIFT) #define page_to_pfn(_page) ((unsigned long)((_page) - frame_table)) #define page_to_virt(_page) phys_to_virt(page_to_phys(_page)) diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/asm-x86/mm.h --- a/xen/include/asm-x86/mm.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/asm-x86/mm.h Thu Aug 25 22:53:20 2005 @@ -36,7 +36,7 @@ /* Owner of this page (NULL if page is anonymous). */ u32 _domain; /* pickled format */ /* Type reference count and various PGT_xxx flags and fields. */ - u32 type_info; + unsigned long type_info; } inuse; /* Page is on a free list: ((count_info & PGC_count_mask) == 0). */ @@ -77,6 +77,7 @@ /* Owning guest has pinned this page to its current type? */ #define _PGT_pinned 27 #define PGT_pinned (1U<<_PGT_pinned) +#if defined(__i386__) /* The 11 most significant bits of virt address if this is a page table. */ #define PGT_va_shift 16 #define PGT_va_mask (((1U<<11)-1)<<PGT_va_shift) @@ -84,6 +85,16 @@ #define PGT_va_mutable (((1U<<11)-1)<<PGT_va_shift) /* Is the back pointer unknown (e.g., p.t. is mapped at multiple VAs)? 
*/ #define PGT_va_unknown (((1U<<11)-2)<<PGT_va_shift) +#elif defined(__x86_64__) + /* The 27 most significant bits of virt address if this is a page table. */ +#define PGT_va_shift 32 +#define PGT_va_mask ((unsigned long)((1U<<28)-1)<<PGT_va_shift) + /* Is the back pointer still mutable (i.e. not fixed yet)? */ +#define PGT_va_mutable ((unsigned long)((1U<<28)-1)<<PGT_va_shift) + /* Is the back pointer unknown (e.g., p.t. is mapped at multiple VAs)? */ +#define PGT_va_unknown ((unsigned long)((1U<<28)-2)<<PGT_va_shift) +#endif + /* 16-bit count of uses of this frame as its current type. */ #define PGT_count_mask ((1U<<16)-1) @@ -114,11 +125,13 @@ #if defined(__i386__) #define pickle_domptr(_d) ((u32)(unsigned long)(_d)) #define unpickle_domptr(_d) ((struct domain *)(unsigned long)(_d)) +#define PRtype_info "08lx" /* should only be used for printk's */ #elif defined(__x86_64__) static inline struct domain *unpickle_domptr(u32 _domain) { return (_domain == 0) ? NULL : __va(_domain); } static inline u32 pickle_domptr(struct domain *domain) { return (domain == NULL) ? 0 : (u32)__pa(domain); } +#define PRtype_info "016lx"/* should only be used for printk's */ #endif #define page_get_owner(_p) (unpickle_domptr((_p)->u.inuse._domain)) @@ -144,8 +157,8 @@ extern unsigned long max_page; void init_frametable(void); -int alloc_page_type(struct pfn_info *page, unsigned int type); -void free_page_type(struct pfn_info *page, unsigned int type); +int alloc_page_type(struct pfn_info *page, unsigned long type); +void free_page_type(struct pfn_info *page, unsigned long type); extern void invalidate_shadow_ldt(struct vcpu *d); extern int shadow_remove_all_write_access( struct domain *d, unsigned long gpfn, unsigned long gmfn); @@ -183,7 +196,7 @@ unlikely(d != _domain) ) /* Wrong owner? 
*/ { if ( !_shadow_mode_refcounts(domain) ) - DPRINTK("Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%08x\n", + DPRINTK("Error pfn %lx: rd=%p, od=%p, caf=%08x, taf=%" PRtype_info "\n", page_to_pfn(page), domain, unpickle_domptr(d), x, page->u.inuse.type_info); return 0; @@ -200,7 +213,7 @@ } void put_page_type(struct pfn_info *page); -int get_page_type(struct pfn_info *page, u32 type); +int get_page_type(struct pfn_info *page, unsigned long type); int get_page_from_l1e(l1_pgentry_t l1e, struct domain *d); void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d); @@ -213,7 +226,7 @@ static inline int get_page_and_type(struct pfn_info *page, struct domain *domain, - u32 type) + unsigned long type) { int rc = get_page(page, domain); @@ -300,6 +313,9 @@ unsigned int prev_nr_updates; /* Exec domain which created writable mapping. */ struct vcpu *vcpu; + /* EIP of the address which took the original write fault + used for stats collection only */ + unsigned long eip; }; #define PTWR_PT_ACTIVE 0 @@ -311,7 +327,8 @@ int ptwr_init(struct domain *); void ptwr_destroy(struct domain *); void ptwr_flush(struct domain *, const int); -int ptwr_do_page_fault(struct domain *, unsigned long); +int ptwr_do_page_fault(struct domain *, unsigned long, + struct cpu_user_regs *); int revalidate_l1(struct domain *, l1_pgentry_t *, l1_pgentry_t *); void cleanup_writable_pagetable(struct domain *d); @@ -334,6 +351,18 @@ #define _audit_domain(_d, _f) ((void)0) #define audit_domain(_d) ((void)0) #define audit_domains() ((void)0) + +#endif + +#ifdef PERF_ARRAYS + +void ptwr_eip_stat_reset(); +void ptwr_eip_stat_print(); + +#else + +#define ptwr_eip_stat_reset() ((void)0) +#define ptwr_eip_stat_print() ((void)0) #endif @@ -345,8 +374,14 @@ * Caller must own d's BIGLOCK, is responsible for flushing the TLB, and must * hold a reference to the page. 
*/ -int update_grant_va_mapping(unsigned long va, - l1_pgentry_t _nl1e, - struct domain *d, - struct vcpu *v); +int update_grant_va_mapping( + unsigned long va, l1_pgentry_t _nl1e, + struct domain *d, struct vcpu *v); +int update_grant_pte_mapping( + unsigned long pte_addr, l1_pgentry_t _nl1e, + struct domain *d, struct vcpu *v); +int clear_grant_va_mapping(unsigned long addr, unsigned long frame); +int clear_grant_pte_mapping( + unsigned long addr, unsigned long frame, struct domain *d); + #endif /* __ASM_X86_MM_H__ */ diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/asm-x86/page.h --- a/xen/include/asm-x86/page.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/asm-x86/page.h Thu Aug 25 22:53:20 2005 @@ -189,6 +189,9 @@ #define virt_to_page(kaddr) (frame_table + (__pa(kaddr) >> PAGE_SHIFT)) #define pfn_valid(_pfn) ((_pfn) < max_page) +#define pfn_to_phys(pfn) ((physaddr_t)(pfn) << PAGE_SHIFT) +#define phys_to_pfn(pa) ((unsigned long)((pa) >> PAGE_SHIFT)) + /* High table entries are reserved by the hypervisor. 
*/ #if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE) #define DOMAIN_ENTRIES_PER_L2_PAGETABLE \ @@ -208,20 +211,21 @@ + DOMAIN_ENTRIES_PER_L4_PAGETABLE) #endif -#define linear_l1_table \ +#define LINEAR_PT_OFFSET (LINEAR_PT_VIRT_START & VADDR_MASK) +#define linear_l1_table \ ((l1_pgentry_t *)(LINEAR_PT_VIRT_START)) -#define __linear_l2_table \ - ((l2_pgentry_t *)(LINEAR_PT_VIRT_START + \ - (LINEAR_PT_VIRT_START >> (PAGETABLE_ORDER<<0)))) -#define __linear_l3_table \ - ((l3_pgentry_t *)(LINEAR_PT_VIRT_START + \ - (LINEAR_PT_VIRT_START >> (PAGETABLE_ORDER<<0)) + \ - (LINEAR_PT_VIRT_START >> (PAGETABLE_ORDER<<1)))) -#define __linear_l4_table \ - ((l4_pgentry_t *)(LINEAR_PT_VIRT_START + \ - (LINEAR_PT_VIRT_START >> (PAGETABLE_ORDER<<0)) + \ - (LINEAR_PT_VIRT_START >> (PAGETABLE_ORDER<<1)) + \ - (LINEAR_PT_VIRT_START >> (PAGETABLE_ORDER<<2)))) +#define __linear_l2_table \ + ((l2_pgentry_t *)(LINEAR_PT_VIRT_START + \ + (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<0)))) +#define __linear_l3_table \ + ((l3_pgentry_t *)(LINEAR_PT_VIRT_START + \ + (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<0)) + \ + (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<1)))) +#define __linear_l4_table \ + ((l4_pgentry_t *)(LINEAR_PT_VIRT_START + \ + (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<0)) + \ + (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<1)) + \ + (LINEAR_PT_OFFSET >> (PAGETABLE_ORDER<<2)))) #define linear_pg_table linear_l1_table #define linear_l2_table(_ed) ((_ed)->arch.guest_vtable) @@ -279,13 +283,9 @@ static __inline__ int get_order(unsigned long size) { int order; - - size = (size-1) >> (PAGE_SHIFT-1); - order = -1; - do { + size = (size-1) >> PAGE_SHIFT; + for ( order = 0; size; order++ ) size >>= 1; - order++; - } while (size); return order; } diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/asm-x86/shadow.h --- a/xen/include/asm-x86/shadow.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/asm-x86/shadow.h Thu Aug 25 22:53:20 2005 @@ -483,9 +483,9 @@ #ifndef NDEBUG else if ( mfn < max_page ) { - 
SH_LOG("mark_dirty OOR! mfn=%x pfn=%lx max=%x (dom %p)", + SH_VLOG("mark_dirty OOR! mfn=%x pfn=%lx max=%x (dom %p)", mfn, pfn, d->arch.shadow_dirty_bitmap_size, d); - SH_LOG("dom=%p caf=%08x taf=%08x", + SH_VLOG("dom=%p caf=%08x taf=%" PRtype_info, page_get_owner(&frame_table[mfn]), frame_table[mfn].count_info, frame_table[mfn].u.inuse.type_info ); @@ -602,14 +602,14 @@ /* XXX This needs more thought... */ printk("%s: needing to call shadow_remove_all_access for mfn=%lx\n", __func__, page_to_pfn(page)); - printk("Before: mfn=%lx c=%08x t=%08x\n", page_to_pfn(page), + printk("Before: mfn=%lx c=%08x t=%" PRtype_info "\n", page_to_pfn(page), page->count_info, page->u.inuse.type_info); shadow_lock(d); shadow_remove_all_access(d, page_to_pfn(page)); shadow_unlock(d); - printk("After: mfn=%lx c=%08x t=%08x\n", page_to_pfn(page), + printk("After: mfn=%lx c=%08x t=%" PRtype_info "\n", page_to_pfn(page), page->count_info, page->u.inuse.type_info); } @@ -648,7 +648,7 @@ if ( unlikely(nx == 0) ) { - printk("get_shadow_ref overflow, gmfn=%x smfn=%lx\n", + printk("get_shadow_ref overflow, gmfn=%" PRtype_info " smfn=%lx\n", frame_table[smfn].u.inuse.type_info & PGT_mfn_mask, smfn); BUG(); @@ -678,7 +678,8 @@ if ( unlikely(x == 0) ) { - printk("put_shadow_ref underflow, smfn=%lx oc=%08x t=%08x\n", + printk("put_shadow_ref underflow, smfn=%lx oc=%08x t=%" + PRtype_info "\n", smfn, frame_table[smfn].count_info, frame_table[smfn].u.inuse.type_info); @@ -735,7 +736,7 @@ if ( unlikely(!VALID_MFN(gmfn)) ) { - SH_LOG("l1pte_write_fault: invalid gpfn=%lx", gpfn); + SH_VLOG("l1pte_write_fault: invalid gpfn=%lx", gpfn); *spte_p = l1e_empty(); return 0; } @@ -769,7 +770,7 @@ if ( unlikely(!VALID_MFN(mfn)) ) { - SH_LOG("l1pte_read_fault: invalid gpfn=%lx", pfn); + SH_VLOG("l1pte_read_fault: invalid gpfn=%lx", pfn); *spte_p = l1e_empty(); return 0; } @@ -1200,7 +1201,7 @@ #ifndef NDEBUG if ( ___shadow_status(d, gpfn, stype) != 0 ) { - printk("d->id=%d gpfn=%lx gmfn=%lx stype=%lx c=%x t=%x " + 
printk("d->id=%d gpfn=%lx gmfn=%lx stype=%lx c=%x t=%" PRtype_info " " "mfn_out_of_sync(gmfn)=%d mfn_is_page_table(gmfn)=%d\n", d->domain_id, gpfn, gmfn, stype, frame_table[gmfn].count_info, @@ -1471,7 +1472,7 @@ /* We need to allocate a new node. Ensure the quicklist is non-empty. */ if ( unlikely(d->arch.shadow_ht_free == NULL) ) { - SH_LOG("Allocate more shadow hashtable blocks."); + SH_VLOG("Allocate more shadow hashtable blocks."); extra = xmalloc_bytes( sizeof(void *) + (shadow_ht_extra_size * sizeof(*x))); diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/asm-x86/time.h --- a/xen/include/asm-x86/time.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/asm-x86/time.h Thu Aug 25 22:53:20 2005 @@ -7,4 +7,7 @@ extern void calibrate_tsc_bp(void); extern void calibrate_tsc_ap(void); +struct domain; +extern void init_domain_time(struct domain *d); + #endif /* __X86_TIME_H__ */ diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/asm-x86/types.h --- a/xen/include/asm-x86/types.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/asm-x86/types.h Thu Aug 25 22:53:20 2005 @@ -38,13 +38,16 @@ typedef unsigned long long u64; #if defined(CONFIG_X86_PAE) typedef u64 physaddr_t; +#define PRIphysaddr "016llx" #else -typedef u32 physaddr_t; +typedef unsigned long physaddr_t; +#define PRIphysaddr "08lx" #endif #elif defined(__x86_64__) typedef signed long s64; typedef unsigned long u64; -typedef u64 physaddr_t; +typedef unsigned long physaddr_t; +#define PRIphysaddr "016lx" #endif typedef unsigned long size_t; diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/asm-x86/uaccess.h --- a/xen/include/asm-x86/uaccess.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/asm-x86/uaccess.h Thu Aug 25 22:53:20 2005 @@ -125,22 +125,20 @@ __pu_err; \ }) -#define __get_user_nocheck(x,ptr,size) \ -({ \ - long __gu_err, __gu_val; \ - __get_user_size(__gu_val,(ptr),(size),__gu_err,-EFAULT);\ - (x) = (__typeof__(*(ptr)))__gu_val; \ - __gu_err; \ +#define __get_user_nocheck(x,ptr,size) \ +({ \ + long __gu_err; \ + 
__get_user_size((x),(ptr),(size),__gu_err,-EFAULT); \ + __gu_err; \ }) -#define __get_user_check(x,ptr,size) \ -({ \ - long __gu_err, __gu_val; \ - __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ - __get_user_size(__gu_val,__gu_addr,(size),__gu_err,-EFAULT); \ - (x) = (__typeof__(*(ptr)))__gu_val; \ - if (!__addr_ok(__gu_addr)) __gu_err = -EFAULT; \ - __gu_err; \ +#define __get_user_check(x,ptr,size) \ +({ \ + long __gu_err; \ + __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ + __get_user_size((x),__gu_addr,(size),__gu_err,-EFAULT); \ + if (!__addr_ok(__gu_addr)) __gu_err = -EFAULT; \ + __gu_err; \ }) struct __large_struct { unsigned long buf[100]; }; diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/asm-x86/vmx.h --- a/xen/include/asm-x86/vmx.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/asm-x86/vmx.h Thu Aug 25 22:53:20 2005 @@ -31,10 +31,11 @@ extern void vmx_asm_vmexit_handler(struct cpu_user_regs); extern void vmx_asm_do_resume(void); extern void vmx_asm_do_launch(void); -extern void vmx_intr_assist(struct vcpu *d); +extern void vmx_intr_assist(void); extern void arch_vmx_do_launch(struct vcpu *); extern void arch_vmx_do_resume(struct vcpu *); +extern void arch_vmx_do_relaunch(struct vcpu *); extern int vmcs_size; extern unsigned int cpu_rev; @@ -354,7 +355,7 @@ } /* Make sure that xen intercepts any FP accesses from current */ -static inline void vmx_stts() +static inline void vmx_stts(void) { unsigned long cr0; diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/asm-x86/vmx_vmcs.h --- a/xen/include/asm-x86/vmx_vmcs.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/asm-x86/vmx_vmcs.h Thu Aug 25 22:53:20 2005 @@ -28,10 +28,10 @@ extern void stop_vmx(void); #if defined (__x86_64__) -extern void vmx_load_msrs(struct vcpu *p, struct vcpu *n); +extern void vmx_load_msrs(struct vcpu *n); void vmx_restore_msrs(struct vcpu *d); #else -#define vmx_load_msrs(_p, _n) ((void)0) +#define vmx_load_msrs(_n) ((void)0) #define vmx_restore_msrs(_v) ((void)0) #endif @@ -93,6 +93,7 @@ 
void vmx_do_launch(struct vcpu *); void vmx_do_resume(struct vcpu *); +void vmx_set_host_env(struct vcpu *); struct vmcs_struct *alloc_vmcs(void); void free_vmcs(struct vmcs_struct *); diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/asm-x86/x86_32/page-3level.h --- a/xen/include/asm-x86/x86_32/page-3level.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/asm-x86/x86_32/page-3level.h Thu Aug 25 22:53:20 2005 @@ -63,7 +63,7 @@ /* Extract flags into 32-bit integer, or turn 32-bit flags into a pte mask. */ #define get_pte_flags(x) (((int)((x) >> 32) & ~0xFFF) | ((int)(x) & 0xFFF)) -#define put_pte_flags(x) (((intpte_t)((x) & ~0xFFF) << 40) | ((x) & 0xFFF)) +#define put_pte_flags(x) (((intpte_t)((x) & ~0xFFF) << 32) | ((x) & 0xFFF)) #define L1_DISALLOW_MASK (0xFFFFF180U & ~_PAGE_NX) /* PAT/GLOBAL */ #define L2_DISALLOW_MASK (0xFFFFF180U & ~_PAGE_NX) /* PSE/GLOBAL */ diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/asm-x86/x86_32/uaccess.h --- a/xen/include/asm-x86/x86_32/uaccess.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/asm-x86/x86_32/uaccess.h Thu Aug 25 22:53:20 2005 @@ -22,7 +22,11 @@ #define array_access_ok(addr,count,size) \ (likely(count < (~0UL/size)) && access_ok(addr,count*size)) +/* Undefined function to catch size mismatches on 64-bit get_user/put_user. 
*/ +extern void __uaccess_var_not_u64(void); + #define __put_user_u64(x, addr, retval, errret) \ + if (sizeof(x) != 8) __uaccess_var_not_u64(); \ __asm__ __volatile__( \ "1: movl %%eax,0(%2)\n" \ "2: movl %%edx,4(%2)\n" \ @@ -52,6 +56,7 @@ } while (0) #define __get_user_u64(x, addr, retval, errret) \ + if (sizeof(x) != 8) __uaccess_var_not_u64(); \ __asm__ __volatile__( \ "1: movl 0(%2),%%eax\n" \ "2: movl 4(%2),%%edx\n" \ @@ -67,7 +72,7 @@ " .long 1b,4b\n" \ " .long 2b,4b\n" \ ".previous" \ - : "=r" (retval), "=A" (x) \ + : "=r" (retval), "=&A" (x) \ : "r" (addr), "i"(errret), "0"(retval)) #define __get_user_size(x,ptr,size,retval,errret) \ diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/asm-x86/x86_64/page.h --- a/xen/include/asm-x86/x86_64/page.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/asm-x86/x86_64/page.h Thu Aug 25 22:53:20 2005 @@ -42,7 +42,8 @@ #endif /* !__ASSEMBLY__ */ /* Given a virtual address, get an entry offset into a linear page table. */ -#define l1_linear_offset(_a) (((_a) & VADDR_MASK) >> PAGE_SHIFT) +#define l1_linear_offset(_a) (((_a) & VADDR_MASK) >> L1_PAGETABLE_SHIFT) +#define l2_linear_offset(_a) (((_a) & VADDR_MASK) >> L2_PAGETABLE_SHIFT) #define is_guest_l1_slot(_s) (1) #define is_guest_l2_slot(_t, _s) (1) diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/public/arch-ia64.h --- a/xen/include/public/arch-ia64.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/public/arch-ia64.h Thu Aug 25 22:53:20 2005 @@ -12,9 +12,6 @@ #define MAX_VIRT_CPUS 1 #ifndef __ASSEMBLY__ - -/* NB. Both the following are 64 bits each. */ -typedef unsigned long memory_t; /* Full-sized pointer/address/memory-size. */ #define MAX_NR_SECTION 32 // at most 32 memory holes typedef struct { diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/public/arch-x86_32.h --- a/xen/include/public/arch-x86_32.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/public/arch-x86_32.h Thu Aug 25 22:53:20 2005 @@ -63,9 +63,6 @@ #ifndef __ASSEMBLY__ -/* NB. Both the following are 32 bits each. 
*/ -typedef unsigned long memory_t; /* Full-sized pointer/address/memory-size. */ - /* * Send an array of these to HYPERVISOR_set_trap_table() */ @@ -74,10 +71,10 @@ #define TI_SET_DPL(_ti,_dpl) ((_ti)->flags |= (_dpl)) #define TI_SET_IF(_ti,_if) ((_ti)->flags |= ((!!(_if))<<2)) typedef struct trap_info { - u8 vector; /* exception vector */ - u8 flags; /* 0-3: privilege level; 4: clear event enable? */ - u16 cs; /* code selector */ - memory_t address; /* code address */ + u8 vector; /* exception vector */ + u8 flags; /* 0-3: privilege level; 4: clear event enable? */ + u16 cs; /* code selector */ + unsigned long address; /* code offset */ } trap_info_t; typedef struct cpu_user_regs { diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/public/arch-x86_64.h --- a/xen/include/public/arch-x86_64.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/public/arch-x86_64.h Thu Aug 25 22:53:20 2005 @@ -103,9 +103,6 @@ /* Bottom of switch_to_user stack frame. */ }; -/* NB. Both the following are 64 bits each. */ -typedef unsigned long memory_t; /* Full-sized pointer/address/memory-size. */ - /* * Send an array of these to HYPERVISOR_set_trap_table(). * N.B. As in x86/32 mode, the privilege level specifies which modes may enter @@ -121,10 +118,10 @@ #define TI_SET_DPL(_ti,_dpl) ((_ti)->flags |= (_dpl)) #define TI_SET_IF(_ti,_if) ((_ti)->flags |= ((!!(_if))<<2)) typedef struct trap_info { - u8 vector; /* exception vector */ - u8 flags; /* 0-3: privilege level; 4: clear event enable? */ - u16 cs; /* code selector */ - memory_t address; /* code address */ + u8 vector; /* exception vector */ + u8 flags; /* 0-3: privilege level; 4: clear event enable? 
*/ + u16 cs; /* code selector */ + unsigned long address; /* code offset */ } trap_info_t; typedef struct cpu_user_regs { diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/public/dom0_ops.h --- a/xen/include/public/dom0_ops.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/public/dom0_ops.h Thu Aug 25 22:53:20 2005 @@ -19,7 +19,7 @@ * This makes sure that old versions of dom0 tools will stop working in a * well-defined way (rather than crashing the machine, for instance). */ -#define DOM0_INTERFACE_VERSION 0xAAAA100E +#define DOM0_INTERFACE_VERSION 0xAAAA1010 /************************************************************************/ @@ -27,10 +27,10 @@ typedef struct { /* IN variables. */ domid_t domain; - memory_t max_pfns; + unsigned long max_pfns; void *buffer; /* OUT variables. */ - memory_t num_pfns; + unsigned long num_pfns; } dom0_getmemlist_t; #define DOM0_SCHEDCTL 6 @@ -83,9 +83,9 @@ #define DOMFLAGS_SHUTDOWNMASK 255 /* DOMFLAGS_SHUTDOWN guest-supplied code. */ #define DOMFLAGS_SHUTDOWNSHIFT 16 u32 flags; - memory_t tot_pages; - memory_t max_pages; - memory_t shared_info_frame; /* MFN of shared_info struct */ + unsigned long tot_pages; + unsigned long max_pages; + unsigned long shared_info_frame; /* MFN of shared_info struct */ u64 cpu_time; u32 n_vcpu; s32 vcpu_to_cpu[MAX_VIRT_CPUS]; /* current mapping */ @@ -131,14 +131,14 @@ } dom0_debug_t; /* - * Set clock such that it would read <secs,usecs> after 00:00:00 UTC, + * Set clock such that it would read <secs,nsecs> after 00:00:00 UTC, * 1 January, 1970 if the current system time was <system_time>. */ #define DOM0_SETTIME 17 typedef struct { /* IN variables. */ u32 secs; - u32 usecs; + u32 nsecs; u64 system_time; } dom0_settime_t; @@ -155,7 +155,7 @@ typedef struct { /* IN variables. */ - memory_t pfn; /* Machine page frame number to query. */ + unsigned long pfn; /* Machine page frame number to query. */ domid_t domain; /* To which domain does the frame belong? */ /* OUT variables. 
*/ /* Is the page PINNED to a type? */ @@ -197,7 +197,7 @@ unsigned long cpu_mask; u32 evt_mask; /* OUT variables */ - memory_t mach_addr; + unsigned long buffer_mfn; u32 size; } dom0_tbufcontrol_t; @@ -211,8 +211,8 @@ u32 sockets_per_node; u32 nr_nodes; u32 cpu_khz; - memory_t total_pages; - memory_t free_pages; + unsigned long total_pages; + unsigned long free_pages; } dom0_physinfo_t; /* @@ -252,7 +252,7 @@ u32 op; unsigned long *dirty_bitmap; /* pointer to locked buffer */ /* IN/OUT variables. */ - memory_t pages; /* size of buffer, updated with actual size */ + unsigned long pages; /* size of buffer, updated with actual size */ /* OUT variables. */ dom0_shadow_control_stats_t stats; } dom0_shadow_control_t; @@ -260,15 +260,15 @@ #define DOM0_SETDOMAINMAXMEM 28 typedef struct { /* IN variables. */ - domid_t domain; - memory_t max_memkb; + domid_t domain; + unsigned long max_memkb; } dom0_setdomainmaxmem_t; #define DOM0_GETPAGEFRAMEINFO2 29 /* batched interface */ typedef struct { /* IN variables. */ - domid_t domain; - memory_t num; + domid_t domain; + unsigned long num; /* IN/OUT variables. */ unsigned long *array; } dom0_getpageframeinfo2_t; @@ -283,12 +283,12 @@ #define DOM0_ADD_MEMTYPE 31 typedef struct { /* IN variables. */ - memory_t pfn; - memory_t nr_pfns; - u32 type; - /* OUT variables. */ - u32 handle; - u32 reg; + unsigned long pfn; + unsigned long nr_pfns; + u32 type; + /* OUT variables. */ + u32 handle; + u32 reg; } dom0_add_memtype_t; /* @@ -311,8 +311,8 @@ /* IN variables. */ u32 reg; /* OUT variables. */ - memory_t pfn; - memory_t nr_pfns; + unsigned long pfn; + unsigned long nr_pfns; u32 type; } dom0_read_memtype_t; @@ -361,10 +361,10 @@ typedef struct { /* IN variables. */ domid_t first_domain; - memory_t max_domains; + unsigned int max_domains; dom0_getdomaininfo_t *buffer; /* OUT variables. 
*/ - memory_t num_domains; + unsigned int num_domains; } dom0_getdomaininfolist_t; #define DOM0_PLATFORM_QUIRK 39 diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/public/grant_table.h --- a/xen/include/public/grant_table.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/public/grant_table.h Thu Aug 25 22:53:20 2005 @@ -142,7 +142,10 @@ * 1. If GNTPIN_map_for_dev is specified then <dev_bus_addr> is the address * via which I/O devices may access the granted frame. * 2. If GNTPIN_map_for_host is specified then a mapping will be added at - * virtual address <host_virt_addr> in the current address space. + * either a host virtual address in the current address space, or at + * a PTE at the specified machine address. The type of mapping to + * perform is selected through the GNTMAP_contains_pte flag, and the + * address is specified in <host_addr>. * 3. Mappings should only be destroyed via GNTTABOP_unmap_grant_ref. If a * host mapping is destroyed by other means then it is *NOT* guaranteed * to be accounted to the correct grant reference! @@ -150,18 +153,18 @@ #define GNTTABOP_map_grant_ref 0 typedef struct gnttab_map_grant_ref { /* IN parameters. */ - memory_t host_virt_addr; + u64 host_addr; domid_t dom; grant_ref_t ref; u16 flags; /* GNTMAP_* */ /* OUT parameters. */ s16 handle; /* +ve: handle; -ve: GNTST_* */ - memory_t dev_bus_addr; + u64 dev_bus_addr; } gnttab_map_grant_ref_t; /* * GNTTABOP_unmap_grant_ref: Destroy one or more grant-reference mappings - * tracked by <handle>. If <host_virt_addr> or <dev_bus_addr> is zero, that + * tracked by <handle>. If <host_addr> or <dev_bus_addr> is zero, that * field is ignored. If non-zero, they must refer to a device/host mapping * that is tracked by <handle> * NOTES: @@ -173,14 +176,12 @@ #define GNTTABOP_unmap_grant_ref 1 typedef struct gnttab_unmap_grant_ref { /* IN parameters. */ - memory_t host_virt_addr; - memory_t dev_bus_addr; + u64 host_addr; + u64 dev_bus_addr; u16 handle; /* OUT parameters. 
*/ s16 status; /* GNTST_* */ } gnttab_unmap_grant_ref_t; - -#define GNTUNMAP_DEV_FROM_VIRT (~0U) /* * GNTTABOP_setup_table: Set up a grant table for <dom> comprising at least @@ -220,7 +221,7 @@ */ #define GNTTABOP_donate 4 typedef struct { - memory_t mfn; /* 0 */ + unsigned long mfn; /* 0 */ domid_t domid; /* 4 */ u16 handle; /* 8 */ s16 status; /* 10: GNTST_* */ @@ -247,10 +248,18 @@ #define _GNTMAP_application_map (3) #define GNTMAP_application_map (1<<_GNTMAP_application_map) + /* + * GNTMAP_contains_pte subflag: + * 0 => This map request contains a host virtual address. + * 1 => This map request contains the machine addess of the PTE to update. + */ +#define _GNTMAP_contains_pte (4) +#define GNTMAP_contains_pte (1<<_GNTMAP_contains_pte) + /* * Values for error status returns. All errors are -ve. */ -#define GNTST_okay (0) +#define GNTST_okay (0) /* Normal return. */ #define GNTST_general_error (-1) /* General undefined error. */ #define GNTST_bad_domain (-2) /* Unrecognsed domain id. */ #define GNTST_bad_gntref (-3) /* Unrecognised or inappropriate gntref. */ diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/public/io/blkif.h --- a/xen/include/public/io/blkif.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/public/io/blkif.h Thu Aug 25 22:53:20 2005 @@ -18,7 +18,6 @@ #define BLKIF_OP_READ 0 #define BLKIF_OP_WRITE 1 -#define BLKIF_OP_PROBE 2 /* NB. Ring size must be small enough for sizeof(blkif_ring_t) <= PAGE_SIZE. */ #define BLKIF_RING_SIZE 64 @@ -33,28 +32,22 @@ typedef struct blkif_request { u8 operation; /* BLKIF_OP_??? 
*/ u8 nr_segments; /* number of segments */ - blkif_vdev_t device; /* only for read/write requests */ + blkif_vdev_t handle; /* only for read/write requests */ unsigned long id; /* private guest value, echoed in resp */ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ /* @f_a_s[4:0]=last_sect ; @f_a_s[9:5]=first_sect */ -#ifdef CONFIG_XEN_BLKDEV_GRANT /* @f_a_s[:16]= grant reference (16 bits) */ -#else - /* @f_a_s[:12]=@frame: machine page frame number. */ -#endif /* @first_sect: first sector in frame to transfer (inclusive). */ /* @last_sect: last sector in frame to transfer (inclusive). */ unsigned long frame_and_sects[BLKIF_MAX_SEGMENTS_PER_REQUEST]; } blkif_request_t; -#define blkif_fas(_addr, _fs, _ls) ((addr)|((_fs)<<5)|(_ls)) +#define blkif_fas(_addr, _fs, _ls) ((_addr)|((_fs)<<5)|(_ls)) #define blkif_first_sect(_fas) (((_fas)>>5)&31) #define blkif_last_sect(_fas) ((_fas)&31) -#ifdef CONFIG_XEN_BLKDEV_GRANT #define blkif_fas_from_gref(_gref, _fs, _ls) (((_gref)<<16)|((_fs)<<5)|(_ls)) #define blkif_gref_from_fas(_fas) ((_fas)>>16) -#endif typedef struct blkif_response { unsigned long id; /* copied from request */ @@ -65,37 +58,17 @@ #define BLKIF_RSP_ERROR -1 /* non-specific 'error' */ #define BLKIF_RSP_OKAY 0 /* non-specific 'okay' */ +#define BLKIF_MAJOR(dev) ((dev)>>8) +#define BLKIF_MINOR(dev) ((dev) & 0xff) + /* * Generate blkif ring structures and types. */ DEFINE_RING_TYPES(blkif, blkif_request_t, blkif_response_t); -/* - * BLKIF_OP_PROBE: - * The request format for a probe request is constrained as follows: - * @operation == BLKIF_OP_PROBE - * @nr_segments == size of probe buffer in pages - * @device == unused (zero) - * @id == any value (echoed in response message) - * @sector_num == unused (zero) - * @frame_and_sects == list of page-sized buffers. - * (i.e., @first_sect == 0, @last_sect == 7). - * - * The response is a list of vdisk_t elements copied into the out-of-band - * probe buffer. 
On success the response status field contains the number - * of vdisk_t elements. - */ - #define VDISK_CDROM 0x1 #define VDISK_REMOVABLE 0x2 #define VDISK_READONLY 0x4 -typedef struct vdisk { - blkif_sector_t capacity; /* Size in terms of 512-byte sectors. */ - blkif_vdev_t device; /* Device number (opaque 16 bit value). */ - u16 info; /* Device type and flags (VDISK_*). */ - u16 sector_size; /* Minimum alignment for requests. */ -} vdisk_t; /* 16 bytes */ - #endif /* __XEN_PUBLIC_IO_BLKIF_H__ */ diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/public/io/domain_controller.h --- a/xen/include/public/io/domain_controller.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/public/io/domain_controller.h Thu Aug 25 22:53:20 2005 @@ -139,7 +139,7 @@ */ typedef struct blkif_fe_interface_connect { u32 handle; - memory_t shmem_frame; + unsigned long shmem_frame; int shmem_ref; } blkif_fe_interface_connect_t; @@ -249,7 +249,7 @@ /* IN */ domid_t domid; /* Domain attached to new interface. */ u32 blkif_handle; /* Domain-specific interface handle. */ - memory_t shmem_frame; /* Page cont. shared comms window. */ + unsigned long shmem_frame;/* Page cont. shared comms window. */ int shmem_ref; /* Grant table reference. */ u32 evtchn; /* Event channel for notifications. */ /* OUT */ @@ -364,9 +364,11 @@ * STATUS_CONNECTED message. */ typedef struct netif_fe_interface_connect { - u32 handle; - memory_t tx_shmem_frame; - memory_t rx_shmem_frame; + u32 handle; + unsigned long tx_shmem_frame; + int tx_shmem_ref; + unsigned long rx_shmem_frame; + int rx_shmem_ref; } netif_fe_interface_connect_t; /* @@ -484,11 +486,13 @@ */ typedef struct netif_be_connect { /* IN */ - domid_t domid; /* Domain attached to new interface. */ - u32 netif_handle; /* Domain-specific interface handle. */ - memory_t tx_shmem_frame; /* Page cont. tx shared comms window. */ - memory_t rx_shmem_frame; /* Page cont. rx shared comms window. */ - u16 evtchn; /* Event channel for notifications. 
*/ + domid_t domid; /* Domain attached to new interface. */ + u32 netif_handle; /* Domain-specific interface handle. */ + unsigned long tx_shmem_frame;/* Page cont. tx shared comms window. */ + int tx_shmem_ref; /* Grant reference for above */ + unsigned long rx_shmem_frame;/* Page cont. rx shared comms window. */ + int rx_shmem_ref; /* Grant reference for above */ + u16 evtchn; /* Event channel for notifications. */ /* OUT */ u32 status; } netif_be_connect_t; @@ -573,7 +577,7 @@ * STATUS_CONNECTED message. */ typedef struct usbif_fe_interface_connect { - memory_t shmem_frame; + unsigned long shmem_frame; } usbif_fe_interface_connect_t; /* @@ -656,7 +660,7 @@ typedef struct usbif_be_connect { /* IN */ domid_t domid; /* Domain attached to new interface. */ - memory_t shmem_frame; /* Page cont. shared comms window. */ + unsigned long shmem_frame;/* Page cont. shared comms window. */ u32 evtchn; /* Event channel for notifications. */ u32 bandwidth; /* Bandwidth allocated for isoch / int - us * per 1ms frame (ie between 0 and 900 or 800 @@ -776,7 +780,7 @@ #define PDB_CONNECTION_STATUS_UP 1 #define PDB_CONNECTION_STATUS_DOWN 2 u32 status; - memory_t ring; /* status: UP */ + unsigned long ring; /* status: UP */ u32 evtchn; /* status: UP */ } pdb_connection_t, *pdb_connection_p; diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/public/io/netif.h --- a/xen/include/public/io/netif.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/public/io/netif.h Thu Aug 25 22:53:20 2005 @@ -10,7 +10,7 @@ #define __XEN_PUBLIC_IO_NETIF_H__ typedef struct netif_tx_request { - memory_t addr; /* Machine address of packet. */ + unsigned long addr; /* Machine address of packet. */ u16 csum_blank:1; /* Proto csum field blank? */ u16 id:15; /* Echoed in response message. */ u16 size; /* Packet size in bytes. */ @@ -32,7 +32,7 @@ #ifdef CONFIG_XEN_NETDEV_GRANT_TX u32 addr; /* 0: Offset in page of start of received packet */ #else - memory_t addr; /* Machine address of packet. 
*/ + unsigned long addr; /* Machine address of packet. */ #endif u16 csum_valid:1; /* Protocol checksum is validated? */ u16 id:15; diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/public/physdev.h --- a/xen/include/public/physdev.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/public/physdev.h Thu Aug 25 22:53:20 2005 @@ -27,8 +27,8 @@ typedef struct physdevop_set_iobitmap { /* IN */ - memory_t bitmap; - u32 nr_ports; + u8 *bitmap; + u32 nr_ports; } physdevop_set_iobitmap_t; typedef struct physdevop_apic { diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/public/trace.h --- a/xen/include/public/trace.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/public/trace.h Thu Aug 25 22:53:20 2005 @@ -9,11 +9,21 @@ #define __XEN_PUBLIC_TRACE_H__ /* Trace classes */ -#define TRC_GEN 0x00010000 /* General trace */ -#define TRC_SCHED 0x00020000 /* Xen Scheduler trace */ -#define TRC_DOM0OP 0x00040000 /* Xen DOM0 operation trace */ -#define TRC_VMX 0x00080000 /* Xen VMX trace */ -#define TRC_ALL 0xffff0000 +#define TRC_CLS_SHIFT 16 +#define TRC_GEN 0x0001f000 /* General trace */ +#define TRC_SCHED 0x0002f000 /* Xen Scheduler trace */ +#define TRC_DOM0OP 0x0004f000 /* Xen DOM0 operation trace */ +#define TRC_VMX 0x0008f000 /* Xen VMX trace */ +#define TRC_ALL 0xfffff000 + +/* Trace subclasses */ +#define TRC_SUBCLS_SHIFT 12 +/* trace subclasses for VMX */ +#define TRC_VMXEXIT 0x00081000 /* VMX exit trace */ +#define TRC_VMXTIMER 0x00082000 /* VMX timer trace */ +#define TRC_VMXINT 0x00084000 /* VMX interrupt trace */ +#define TRC_VMXIO 0x00088000 /* VMX io emulation trace */ + /* Trace events per class */ @@ -31,9 +41,13 @@ #define TRC_SCHED_T_TIMER_FN (TRC_SCHED + 12) #define TRC_SCHED_DOM_TIMER_FN (TRC_SCHED + 13) -#define TRC_VMX_VMEXIT (TRC_VMX + 1) -#define TRC_VMX_VECTOR (TRC_VMX + 2) -#define TRC_VMX_INT (TRC_VMX + 3) +/* trace events per subclass */ +#define TRC_VMX_VMEXIT (TRC_VMXEXIT + 1) +#define TRC_VMX_VECTOR (TRC_VMXEXIT + 2) + +#define TRC_VMX_TIMER_INTR (TRC_VMXTIMER + 
1) + +#define TRC_VMX_INT (TRC_VMXINT + 1) /* This structure represents a single trace buffer record. */ struct t_rec { diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/public/xen.h --- a/xen/include/public/xen.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/public/xen.h Thu Aug 25 22:53:20 2005 @@ -171,9 +171,9 @@ unsigned int cmd; union { /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR, REASSIGN_PAGE */ - memory_t mfn; + unsigned long mfn; /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */ - memory_t linear_addr; + unsigned long linear_addr; }; union { /* SET_LDT */ @@ -203,6 +203,7 @@ #define SCHEDOP_shutdown 2 /* Stop executing this domain. */ #define SCHEDOP_vcpu_down 3 /* make target VCPU not-runnable. */ #define SCHEDOP_vcpu_up 4 /* make target VCPU runnable. */ +#define SCHEDOP_vcpu_pickle 5 /* save a vcpu's context to memory. */ #define SCHEDOP_cmdmask 255 /* 8-bit command. */ #define SCHEDOP_reasonshift 8 /* 8-bit reason code. (SCHEDOP_shutdown) */ #define SCHEDOP_vcpushift 8 /* 8-bit VCPU target. (SCHEDOP_up|down) */ @@ -331,14 +332,15 @@ typedef struct vcpu_time_info { /* - * The following values are updated periodically (and not necessarily - * atomically!). The guest OS detects this because 'time_version1' is - * incremented just before updating these values, and 'time_version2' is - * incremented immediately after. See the Xen-specific Linux code for an - * example of how to read these values safely (arch/xen/kernel/time.c). + * Updates to the following values are preceded and followed by an + * increment of 'version'. The guest can therefore detect updates by + * looking for changes to 'version'. If the least-significant bit of + * the version number is set then an update is in progress and the guest + * must wait to read a consistent set of values. + * The correct way to interact with the version number is similar to + * Linux's seqlock: see the implementations of read_seqbegin/read_seqretry. 
*/ - u32 time_version1; - u32 time_version2; + u32 version; u64 tsc_timestamp; /* TSC at last update of time vals. */ u64 system_time; /* Time, in nanosecs, since boot. */ /* @@ -400,8 +402,9 @@ * Wallclock time: updated only by control software. Guests should base * their gettimeofday() syscall on this wallclock-base value. */ - u32 wc_sec; /* Secs 00:00:00 UTC, Jan 1, 1970. */ - u32 wc_usec; /* Usecs 00:00:00 UTC, Jan 1, 1970. */ + u32 wc_version; /* Version counter: see vcpu_time_info_t. */ + u32 wc_sec; /* Secs 00:00:00 UTC, Jan 1, 1970. */ + u32 wc_nsec; /* Nsecs 00:00:00 UTC, Jan 1, 1970. */ arch_shared_info_t arch; @@ -435,18 +438,18 @@ #define MAX_GUEST_CMDLINE 1024 typedef struct start_info { /* THE FOLLOWING ARE FILLED IN BOTH ON INITIAL BOOT AND ON RESUME. */ - memory_t nr_pages; /* Total pages allocated to this domain. */ - memory_t shared_info; /* MACHINE address of shared info struct. */ + unsigned long nr_pages; /* Total pages allocated to this domain. */ + unsigned long shared_info;/* MACHINE address of shared info struct. */ u32 flags; /* SIF_xxx flags. */ u16 domain_controller_evtchn; /* THE FOLLOWING ARE ONLY FILLED IN ON INITIAL BOOT (NOT RESUME). */ - memory_t pt_base; /* VIRTUAL address of page directory. */ - memory_t nr_pt_frames; /* Number of bootstrap p.t. frames. */ - memory_t mfn_list; /* VIRTUAL address of page-frame list. */ - memory_t mod_start; /* VIRTUAL address of pre-loaded module. */ - memory_t mod_len; /* Size (bytes) of pre-loaded module. */ + unsigned long pt_base; /* VIRTUAL address of page directory. */ + unsigned long nr_pt_frames;/* Number of bootstrap p.t. frames. */ + unsigned long mfn_list; /* VIRTUAL address of page-frame list. */ + unsigned long mod_start; /* VIRTUAL address of pre-loaded module. */ + unsigned long mod_len; /* Size (bytes) of pre-loaded module. */ s8 cmd_line[MAX_GUEST_CMDLINE]; - memory_t store_mfn; /* MACHINE page number of shared page. 
*/ + unsigned long store_mfn; /* MACHINE page number of shared page. */ u16 store_evtchn; /* Event channel for store communication. */ } start_info_t; diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/xen/ac_timer.h --- a/xen/include/xen/ac_timer.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/xen/ac_timer.h Thu Aug 25 22:53:20 2005 @@ -10,6 +10,7 @@ #include <xen/spinlock.h> #include <xen/time.h> +#include <xen/string.h> struct ac_timer { /* System time expiry value (nanoseconds since boot). */ diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/xen/domain.h --- a/xen/include/xen/domain.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/xen/domain.h Thu Aug 25 22:53:20 2005 @@ -15,7 +15,9 @@ extern void arch_do_boot_vcpu(struct vcpu *v); extern int arch_set_info_guest( - struct vcpu *d, struct vcpu_guest_context *c); + struct vcpu *v, struct vcpu_guest_context *c); + +extern void vcpu_migrate_cpu(struct vcpu *v, int newcpu); extern void free_perdomain_pt(struct domain *d); diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/xen/event.h --- a/xen/include/xen/event.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/xen/event.h Thu Aug 25 22:53:20 2005 @@ -26,30 +26,14 @@ { struct domain *d = v->domain; shared_info_t *s = d->shared_info; - int running; - /* These three operations must happen in strict order. */ + /* These four operations must happen in strict order. */ if ( !test_and_set_bit(port, &s->evtchn_pending[0]) && !test_bit (port, &s->evtchn_mask[0]) && - !test_and_set_bit(port>>5, &v->vcpu_info->evtchn_pending_sel) ) + !test_and_set_bit(port>>5, &v->vcpu_info->evtchn_pending_sel) && + !test_and_set_bit(0, &v->vcpu_info->evtchn_upcall_pending) ) { - /* The VCPU pending flag must be set /after/ update to evtchn-pend. */ - set_bit(0, &v->vcpu_info->evtchn_upcall_pending); evtchn_notify(v); - - /* - * NB1. 'vcpu_flags' and 'processor' must be checked /after/ update of - * pending flag. 
These values may fluctuate (after all, we hold no - * locks) but the key insight is that each change will cause - * evtchn_upcall_pending to be polled. - * - * NB2. We save VCPUF_running across the unblock to avoid a needless - * IPI for domains that we IPI'd to unblock. - */ - running = test_bit(_VCPUF_running, &v->vcpu_flags); - vcpu_unblock(v); - if ( running ) - smp_send_event_check_cpu(v->processor); } } @@ -73,8 +57,9 @@ */ extern void send_guest_pirq(struct domain *d, int pirq); -#define event_pending(_d) \ - ((_d)->vcpu_info->evtchn_upcall_pending && \ - !(_d)->vcpu_info->evtchn_upcall_mask) +/* Note: Bitwise operations result in fast code with no branches. */ +#define event_pending(v) \ + ((v)->vcpu_info->evtchn_upcall_pending & \ + ~(v)->vcpu_info->evtchn_upcall_mask) #endif /* __XEN_EVENT_H__ */ diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/xen/grant_table.h --- a/xen/include/xen/grant_table.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/xen/grant_table.h Thu Aug 25 22:53:20 2005 @@ -53,19 +53,20 @@ #define ORDER_GRANT_FRAMES 2 #define NR_GRANT_FRAMES (1U << ORDER_GRANT_FRAMES) -#define NR_GRANT_ENTRIES (NR_GRANT_FRAMES * PAGE_SIZE / sizeof(grant_entry_t)) - +#define NR_GRANT_ENTRIES \ + ((NR_GRANT_FRAMES << PAGE_SHIFT) / sizeof(grant_entry_t)) /* * Tracks a mapping of another domain's grant reference. Each domain has a * table of these, indexes into which are returned as a 'mapping handle'. */ typedef struct { - u16 ref_and_flags; /* 0-2: GNTMAP_* ; 3-15: grant ref */ + u16 ref_and_flags; /* 0-4: GNTMAP_* ; 5-15: grant ref */ domid_t domid; /* granting domain */ } grant_mapping_t; -#define MAPTRACK_GNTMAP_MASK 7 -#define MAPTRACK_REF_SHIFT 3 +#define MAPTRACK_GNTMAP_MASK 0x1f +#define MAPTRACK_REF_SHIFT 5 +#define MAPTRACK_MAX_ENTRIES (1 << (16 - MAPTRACK_REF_SHIFT)) /* Per-domain grant information. */ typedef struct { @@ -108,10 +109,15 @@ /* Notify 'rd' of a completed transfer via an already-locked grant entry. 
*/ void gnttab_notify_transfer( - struct domain *rd, struct domain *ld, grant_ref_t ref, unsigned long frame); + struct domain *rd, struct domain *ld, + grant_ref_t ref, unsigned long frame); -/* Pre-domain destruction release of granted device mappings of other domains.*/ +/* Domain death release of granted device mappings of other domains.*/ void gnttab_release_dev_mappings(grant_table_t *gt); +/* Extra GNTST_ values, for internal use only. */ +#define GNTST_flush_all (2) /* Success, need to flush entire TLB. */ +#define GNTST_flush_one (1) /* Success, need to flush a vaddr. */ + #endif /* __XEN_GRANT_H__ */ diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/xen/perfc_defn.h --- a/xen/include/xen/perfc_defn.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/xen/perfc_defn.h Thu Aug 25 22:53:20 2005 @@ -1,5 +1,6 @@ -#ifndef __XEN_PERFC_DEFN_H__ -#define __XEN_PERFC_DEFN_H__ +/* This file is legitimately included multiple times. */ +/*#ifndef __XEN_PERFC_DEFN_H__*/ +/*#define __XEN_PERFC_DEFN_H__*/ #define PERFC_MAX_PT_UPDATES 64 #define PERFC_PT_UPDATES_BUCKET_SIZE 3 @@ -124,4 +125,4 @@ PERFCOUNTER_CPU(remove_write_bad_prediction, "remove_write bad prediction") PERFCOUNTER_CPU(update_hl2e_invlpg, "update_hl2e calls invlpg") -#endif /* __XEN_PERFC_DEFN_H__ */ +/*#endif*/ /* __XEN_PERFC_DEFN_H__ */ diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/xen/sched.h --- a/xen/include/xen/sched.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/xen/sched.h Thu Aug 25 22:53:20 2005 @@ -258,12 +258,32 @@ extern void sync_lazy_execstate_all(void); extern int __sync_lazy_execstate(void); -/* Called by the scheduler to switch to another vcpu. */ +/* + * Called by the scheduler to switch to another VCPU. On entry, although + * VCPUF_running is no longer asserted for @prev, its context is still running + * on the local CPU and is not committed to memory. The local scheduler lock + * is therefore still held, and interrupts are disabled, because the local CPU + * is in an inconsistent state. 
+ * + * The callee must ensure that the local CPU is no longer running in @prev's + * context, and that the context is saved to memory, before returning. + * Alternatively, if implementing lazy context switching, it suffices to ensure + * that invoking __sync_lazy_execstate() will switch and commit @prev's state. + */ extern void context_switch( struct vcpu *prev, struct vcpu *next); -/* Called by the scheduler to continue running the current vcpu. */ +/* + * On some architectures (notably x86) it is not possible to entirely load + * @next's context with interrupts disabled. These may implement a function to + * finalise loading the new context after interrupts are re-enabled. This + * function is not given @prev and is not permitted to access it. + */ +extern void context_switch_finalise( + struct vcpu *next); + +/* Called by the scheduler to continue running the current VCPU. */ extern void continue_running( struct vcpu *same); @@ -297,10 +317,9 @@ (unsigned long)(_a1), (unsigned long)(_a2), (unsigned long)(_a3), \ (unsigned long)(_a4), (unsigned long)(_a5), (unsigned long)(_a6)) -#define hypercall_preempt_check() (unlikely( \ - softirq_pending(smp_processor_id()) | \ - (!!current->vcpu_info->evtchn_upcall_pending & \ - !current->vcpu_info->evtchn_upcall_mask) \ +#define hypercall_preempt_check() (unlikely( \ + softirq_pending(smp_processor_id()) | \ + event_pending(current) \ )) /* This domain_hash and domain_list are protected by the domlist_lock. */ @@ -386,6 +405,7 @@ void domain_unpause(struct domain *d); void domain_pause_by_systemcontroller(struct domain *d); void domain_unpause_by_systemcontroller(struct domain *d); +void cpu_init(void); static inline void vcpu_unblock(struct vcpu *v) { diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/xen/serial.h --- a/xen/include/xen/serial.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/xen/serial.h Thu Aug 25 22:53:20 2005 @@ -113,8 +113,9 @@ /* * Initialisers for individual uart drivers. */ +/* NB. 
Any default value can be 0 if it is unknown and must be specified. */ struct ns16550_defaults { - int baud; /* default baud rate; 0 == pre-configured */ + int baud; /* default baud rate; BAUD_AUTO == pre-configured */ int data_bits; /* default data bits (5, 6, 7 or 8) */ int parity; /* default parity (n, o, e, m or s) */ int stop_bits; /* default stop bits (1 or 2) */ @@ -122,6 +123,9 @@ unsigned long io_base; /* default io_base address */ }; void ns16550_init(int index, struct ns16550_defaults *defaults); + +/* Baud rate was pre-configured before invoking the UART driver. */ +#define BAUD_AUTO (-1) #endif /* __XEN_SERIAL_H__ */ diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/xen/time.h --- a/xen/include/xen/time.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/xen/time.h Thu Aug 25 22:53:20 2005 @@ -57,7 +57,7 @@ extern void update_dom_time(struct vcpu *v); extern void do_settime( - unsigned long secs, unsigned long usecs, u64 system_time_base); + unsigned long secs, unsigned long nsecs, u64 system_time_base); #endif /* __XEN_TIME_H__ */ diff -r 5f1ed597f107 -r 8799d14bef77 xen/include/xen/trace.h --- a/xen/include/xen/trace.h Wed Aug 24 02:43:18 2005 +++ b/xen/include/xen/trace.h Thu Aug 25 22:53:20 2005 @@ -67,6 +67,15 @@ if ( (tb_event_mask & event) == 0 ) return 0; + /* match class */ + if ( ((tb_event_mask >> TRC_CLS_SHIFT) & (event >> TRC_CLS_SHIFT)) == 0 ) + return 0; + + /* then match subclass */ + if ( (((tb_event_mask >> TRC_SUBCLS_SHIFT) & 0xf ) + & ((event >> TRC_SUBCLS_SHIFT) & 0xf )) == 0 ) + return 0; + if ( (tb_cpu_mask & (1UL << smp_processor_id())) == 0 ) return 0; diff -r 5f1ed597f107 -r 8799d14bef77 xen/tools/Makefile --- a/xen/tools/Makefile Wed Aug 24 02:43:18 2005 +++ b/xen/tools/Makefile Thu Aug 25 22:53:20 2005 @@ -10,4 +10,4 @@ rm -f *.o symbols symbols: symbols.c - $(HOSTCC) -o $@ $< + $(HOSTCC) $(HOSTCFLAGS) -o $@ $< diff -r 5f1ed597f107 -r 8799d14bef77 xen/tools/symbols.c --- a/xen/tools/symbols.c Wed Aug 24 02:43:18 2005 +++ 
b/xen/tools/symbols.c Thu Aug 25 22:53:20 2005 @@ -152,8 +152,8 @@ /* include the type field in the symbol name, so that it gets * compressed together */ s->len = strlen(str) + 1; - s->sym = (char *) malloc(s->len + 1); - strcpy(s->sym + 1, str); + s->sym = (unsigned char *) malloc(s->len + 1); + strcpy((char *)s->sym + 1, str); s->sym[0] = s->type; return 0; @@ -197,16 +197,16 @@ * move then they may get dropped in pass 2, which breaks the * symbols rules. */ - if (s->addr == _etext && strcmp(s->sym + offset, "_etext")) + if (s->addr == _etext && strcmp((char *)s->sym + offset, "_etext")) return 0; } /* Exclude symbols which vary between passes. */ - if (strstr(s->sym + offset, "_compiled.")) + if (strstr((char *)s->sym + offset, "_compiled.")) return 0; for (i = 0; special_symbols[i]; i++) - if( strcmp(s->sym + offset, special_symbols[i]) == 0 ) + if( strcmp((char *)s->sym + offset, special_symbols[i]) == 0 ) return 0; return 1; @@ -311,7 +311,7 @@ off = 0; for (i = 0; i < cnt; i++) { - if (!table[i].flags & SYM_FLAG_VALID) + if (!(table[i].flags & SYM_FLAG_VALID)) continue; if ((valid & 0xFF) == 0) diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/i386/kernel/init_task.c --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/init_task.c Thu Aug 25 22:53:20 2005 @@ -0,0 +1,49 @@ +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/init.h> +#include <linux/init_task.h> +#include <linux/fs.h> +#include <linux/mqueue.h> + +#include <asm/uaccess.h> +#include <asm/pgtable.h> +#include <asm/desc.h> + +static struct fs_struct init_fs = INIT_FS; +static struct files_struct init_files = INIT_FILES; +static struct signal_struct init_signals = INIT_SIGNALS(init_signals); +static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); + +#define swapper_pg_dir ((pgd_t *)NULL) +struct mm_struct init_mm = INIT_MM(init_mm); +#undef swapper_pg_dir + +EXPORT_SYMBOL(init_mm); + +/* + 
* Initial thread structure. + * + * We need to make sure that this is THREAD_SIZE aligned due to the + * way process stacks are handled. This is done by having a special + * "init_task" linker map entry.. + */ +union thread_union init_thread_union + __attribute__((__section__(".data.init_task"))) = + { INIT_THREAD_INFO(init_task) }; + +/* + * Initial task structure. + * + * All other task structs will be allocated on slabs in fork.c + */ +struct task_struct init_task = INIT_TASK(init_task); + +EXPORT_SYMBOL(init_task); + +/* + * per-CPU TSS segments. Threads are completely 'soft' on Linux, + * no more per-task TSS's. + */ +DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_maxaligned_in_smp = INIT_TSS; + diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/i386/kernel/swiotlb.c --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/swiotlb.c Thu Aug 25 22:53:20 2005 @@ -0,0 +1,660 @@ +/* + * Dynamic DMA mapping support. + * + * This implementation is a fallback for platforms that do not support + * I/O TLBs (aka DMA address translation hardware). + * Copyright (C) 2000 Asit Mallick <Asit.K.Mallick@xxxxxxxxx> + * Copyright (C) 2000 Goutham Rao <goutham.rao@xxxxxxxxx> + * Copyright (C) 2000, 2003 Hewlett-Packard Co + * David Mosberger-Tang <davidm@xxxxxxxxxx> + * Copyright (C) 2005 Keir Fraser <keir@xxxxxxxxxxxxx> + */ + +#include <linux/cache.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/highmem.h> +#include <asm/io.h> +#include <asm/pci.h> +#include <asm/dma.h> + +#define OFFSET(val,align) ((unsigned long)((val) & ( (align) - 1))) + +#define SG_ENT_PHYS_ADDRESS(sg) (page_to_phys((sg)->page) + (sg)->offset) + +/* + * Maximum allowable number of contiguous slabs to map, + * must be a power of 2. 
What is the appropriate value ? + * The complexity of {map,unmap}_single is linearly dependent on this value. + */ +#define IO_TLB_SEGSIZE 128 + +/* + * log of the size of each IO TLB slab. The number of slabs is command line + * controllable. + */ +#define IO_TLB_SHIFT 11 + +int swiotlb_force; +static char *iotlb_virt_start; +static unsigned long iotlb_nslabs; + +/* + * Used to do a quick range check in swiotlb_unmap_single and + * swiotlb_sync_single_*, to see if the memory was in fact allocated by this + * API. + */ +static dma_addr_t iotlb_bus_start, iotlb_bus_mask; + +/* Does the given dma address reside within the swiotlb aperture? */ +#define in_swiotlb_aperture(a) (!(((a) ^ iotlb_bus_start) & iotlb_bus_mask)) + +/* + * When the IOMMU overflows we return a fallback buffer. This sets the size. + */ +static unsigned long io_tlb_overflow = 32*1024; + +void *io_tlb_overflow_buffer; + +/* + * This is a free list describing the number of free entries available from + * each index + */ +static unsigned int *io_tlb_list; +static unsigned int io_tlb_index; + +/* + * We need to save away the original address corresponding to a mapped entry + * for the sync operations. + */ +static struct phys_addr { + struct page *page; + unsigned int offset; +} *io_tlb_orig_addr; + +/* + * Protect the above data structures in the map and unmap calls + */ +static DEFINE_SPINLOCK(io_tlb_lock); + +static int __init +setup_io_tlb_npages(char *str) +{ + /* Unlike ia64, the size is aperture in megabytes, not 'slabs'! */ + if (isdigit(*str)) { + iotlb_nslabs = simple_strtoul(str, &str, 0) << + (20 - IO_TLB_SHIFT); + iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE); + /* Round up to power of two (xen_create_contiguous_region). */ + while (iotlb_nslabs & (iotlb_nslabs-1)) + iotlb_nslabs += iotlb_nslabs & ~(iotlb_nslabs-1); + } + if (*str == ',') + ++str; + /* + * NB. 'force' enables the swiotlb, but doesn't force its use for + * every DMA like it does on native Linux. 
+ */ + if (!strcmp(str, "force")) + swiotlb_force = 1; + return 1; +} +__setup("swiotlb=", setup_io_tlb_npages); +/* make io_tlb_overflow tunable too? */ + +/* + * Statically reserve bounce buffer space and initialize bounce buffer data + * structures for the software IO TLB used to implement the PCI DMA API. + */ +void +swiotlb_init_with_default_size (size_t default_size) +{ + unsigned long i, bytes; + + if (!iotlb_nslabs) { + iotlb_nslabs = (default_size >> IO_TLB_SHIFT); + iotlb_nslabs = ALIGN(iotlb_nslabs, IO_TLB_SEGSIZE); + /* Round up to power of two (xen_create_contiguous_region). */ + while (iotlb_nslabs & (iotlb_nslabs-1)) + iotlb_nslabs += iotlb_nslabs & ~(iotlb_nslabs-1); + } + + bytes = iotlb_nslabs * (1UL << IO_TLB_SHIFT); + + /* + * Get IO TLB memory from the low pages + */ + iotlb_virt_start = alloc_bootmem_low_pages(bytes); + if (!iotlb_virt_start) + panic("Cannot allocate SWIOTLB buffer!\n" + "Use dom0_mem Xen boot parameter to reserve\n" + "some DMA memory (e.g., dom0_mem=-128M).\n"); + + xen_create_contiguous_region( + (unsigned long)iotlb_virt_start, get_order(bytes)); + + /* + * Allocate and initialize the free list array. This array is used + * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE. 
+ */ + io_tlb_list = alloc_bootmem(iotlb_nslabs * sizeof(int)); + for (i = 0; i < iotlb_nslabs; i++) + io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); + io_tlb_index = 0; + io_tlb_orig_addr = alloc_bootmem( + iotlb_nslabs * sizeof(*io_tlb_orig_addr)); + + /* + * Get the overflow emergency buffer + */ + io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow); + + iotlb_bus_start = virt_to_bus(iotlb_virt_start); + iotlb_bus_mask = ~(dma_addr_t)(bytes - 1); + + printk(KERN_INFO "Software IO TLB enabled: \n" + " Aperture: %lu megabytes\n" + " Bus range: 0x%016lx - 0x%016lx\n" + " Kernel range: 0x%016lx - 0x%016lx\n", + bytes >> 20, + (unsigned long)iotlb_bus_start, + (unsigned long)iotlb_bus_start + bytes, + (unsigned long)iotlb_virt_start, + (unsigned long)iotlb_virt_start + bytes); +} + +void +swiotlb_init(void) +{ + /* The user can forcibly enable swiotlb. */ + if (swiotlb_force) + swiotlb = 1; + + /* + * Otherwise, enable for domain 0 if the machine has 'lots of memory', + * which we take to mean more than 2GB. + */ + if (xen_start_info.flags & SIF_INITDOMAIN) { + dom0_op_t op; + op.cmd = DOM0_PHYSINFO; + if ((HYPERVISOR_dom0_op(&op) == 0) && + (op.u.physinfo.total_pages > 0x7ffff)) + swiotlb = 1; + } + + if (swiotlb) + swiotlb_init_with_default_size(64 * (1<<20)); +} + +static void +__sync_single(struct phys_addr buffer, char *dma_addr, size_t size, int dir) +{ + if (PageHighMem(buffer.page)) { + size_t len, bytes; + char *dev, *host, *kmp; + len = size; + while (len != 0) { + if (((bytes = len) + buffer.offset) > PAGE_SIZE) + bytes = PAGE_SIZE - buffer.offset; + kmp = kmap_atomic(buffer.page, KM_SWIOTLB); + dev = dma_addr + size - len; + host = kmp + buffer.offset; + memcpy((dir == DMA_FROM_DEVICE) ? host : dev, + (dir == DMA_FROM_DEVICE) ? 
dev : host, + bytes); + kunmap_atomic(kmp, KM_SWIOTLB); + len -= bytes; + buffer.page++; + buffer.offset = 0; + } + } else { + char *host = (char *)phys_to_virt( + page_to_pseudophys(buffer.page)) + buffer.offset; + if (dir == DMA_FROM_DEVICE) + memcpy(host, dma_addr, size); + else if (dir == DMA_TO_DEVICE) + memcpy(dma_addr, host, size); + } +} + +/* + * Allocates bounce buffer and returns its kernel virtual address. + */ +static void * +map_single(struct device *hwdev, struct phys_addr buffer, size_t size, int dir) +{ + unsigned long flags; + char *dma_addr; + unsigned int nslots, stride, index, wrap; + int i; + + /* + * For mappings greater than a page, we limit the stride (and + * hence alignment) to a page size. + */ + nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + if (size > PAGE_SIZE) + stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); + else + stride = 1; + + BUG_ON(!nslots); + + /* + * Find suitable number of IO TLB entries size that will fit this + * request and allocate a buffer from that IO TLB pool. + */ + spin_lock_irqsave(&io_tlb_lock, flags); + { + wrap = index = ALIGN(io_tlb_index, stride); + + if (index >= iotlb_nslabs) + wrap = index = 0; + + do { + /* + * If we find a slot that indicates we have 'nslots' + * number of contiguous buffers, we allocate the + * buffers from that slot and mark the entries as '0' + * indicating unavailable. + */ + if (io_tlb_list[index] >= nslots) { + int count = 0; + + for (i = index; i < (int)(index + nslots); i++) + io_tlb_list[i] = 0; + for (i = index - 1; + (OFFSET(i, IO_TLB_SEGSIZE) != + IO_TLB_SEGSIZE -1) && io_tlb_list[i]; + i--) + io_tlb_list[i] = ++count; + dma_addr = iotlb_virt_start + + (index << IO_TLB_SHIFT); + + /* + * Update the indices to avoid searching in + * the next round. + */ + io_tlb_index = + ((index + nslots) < iotlb_nslabs + ? 
(index + nslots) : 0); + + goto found; + } + index += stride; + if (index >= iotlb_nslabs) + index = 0; + } while (index != wrap); + + spin_unlock_irqrestore(&io_tlb_lock, flags); + return NULL; + } + found: + spin_unlock_irqrestore(&io_tlb_lock, flags); + + /* + * Save away the mapping from the original address to the DMA address. + * This is needed when we sync the memory. Then we sync the buffer if + * needed. + */ + io_tlb_orig_addr[index] = buffer; + if ((dir == DMA_TO_DEVICE) || (dir == DMA_BIDIRECTIONAL)) + __sync_single(buffer, dma_addr, size, DMA_TO_DEVICE); + + return dma_addr; +} + +/* + * dma_addr is the kernel virtual address of the bounce buffer to unmap. + */ +static void +unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir) +{ + unsigned long flags; + int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT; + struct phys_addr buffer = io_tlb_orig_addr[index]; + + /* + * First, sync the memory before unmapping the entry + */ + if ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)) + __sync_single(buffer, dma_addr, size, DMA_FROM_DEVICE); + + /* + * Return the buffer to the free list by setting the corresponding + * entries to indicate the number of contigous entries available. + * While returning the entries to the free list, we merge the entries + * with slots below and above the pool being returned. + */ + spin_lock_irqsave(&io_tlb_lock, flags); + { + count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ? 
+ io_tlb_list[index + nslots] : 0); + /* + * Step 1: return the slots to the free list, merging the + * slots with superceeding slots + */ + for (i = index + nslots - 1; i >= index; i--) + io_tlb_list[i] = ++count; + /* + * Step 2: merge the returned slots with the preceding slots, + * if available (non zero) + */ + for (i = index - 1; + (OFFSET(i, IO_TLB_SEGSIZE) != + IO_TLB_SEGSIZE -1) && io_tlb_list[i]; + i--) + io_tlb_list[i] = ++count; + } + spin_unlock_irqrestore(&io_tlb_lock, flags); +} + +static void +sync_single(struct device *hwdev, char *dma_addr, size_t size, int dir) +{ + int index = (dma_addr - iotlb_virt_start) >> IO_TLB_SHIFT; + struct phys_addr buffer = io_tlb_orig_addr[index]; + BUG_ON((dir != DMA_FROM_DEVICE) && (dir != DMA_TO_DEVICE)); + __sync_single(buffer, dma_addr, size, dir); +} + +static void +swiotlb_full(struct device *dev, size_t size, int dir, int do_panic) +{ + /* + * Ran out of IOMMU space for this operation. This is very bad. + * Unfortunately the drivers cannot handle this operation properly. + * unless they check for pci_dma_mapping_error (most don't) + * When the mapping is small enough return a static buffer to limit + * the damage, or panic when the transfer is too big. + */ + printk(KERN_ERR "PCI-DMA: Out of SW-IOMMU space for %lu bytes at " + "device %s\n", (unsigned long)size, dev ? dev->bus_id : "?"); + + if (size > io_tlb_overflow && do_panic) { + if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) + panic("PCI-DMA: Memory would be corrupted\n"); + if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) + panic("PCI-DMA: Random memory would be DMAed\n"); + } +} + +/* + * Map a single buffer of the indicated size for DMA in streaming mode. The + * PCI address to use is returned. + * + * Once the device is given the dma address, the device owns this memory until + * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed. 
+ */ +dma_addr_t +swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, int dir) +{ + dma_addr_t dev_addr = virt_to_bus(ptr); + void *map; + struct phys_addr buffer; + + BUG_ON(dir == DMA_NONE); + + /* + * If the pointer passed in happens to be in the device's DMA window, + * we can safely return the device addr and not worry about bounce + * buffering it. + */ + if (!range_straddles_page_boundary(ptr, size) && + !address_needs_mapping(hwdev, dev_addr)) + return dev_addr; + + /* + * Oh well, have to allocate and map a bounce buffer. + */ + buffer.page = virt_to_page(ptr); + buffer.offset = (unsigned long)ptr & ~PAGE_MASK; + map = map_single(hwdev, buffer, size, dir); + if (!map) { + swiotlb_full(hwdev, size, dir, 1); + map = io_tlb_overflow_buffer; + } + + dev_addr = virt_to_bus(map); + + /* + * Ensure that the address returned is DMA'ble + */ + if (address_needs_mapping(hwdev, dev_addr)) + panic("map_single: bounce buffer is not DMA'ble"); + + return dev_addr; +} + +/* + * Unmap a single streaming mode DMA translation. The dma_addr and size must + * match what was provided for in a previous swiotlb_map_single call. All + * other usages are undefined. + * + * After this call, reads by the cpu to the buffer are guaranteed to see + * whatever the device wrote there. + */ +void +swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size, + int dir) +{ + BUG_ON(dir == DMA_NONE); + if (in_swiotlb_aperture(dev_addr)) + unmap_single(hwdev, bus_to_virt(dev_addr), size, dir); +} + +/* + * Make physical memory consistent for a single streaming mode DMA translation + * after a transfer. + * + * If you perform a swiotlb_map_single() but wish to interrogate the buffer + * using the cpu, yet do not wish to teardown the PCI dma mapping, you must + * call this function before doing so. 
At the next point you give the PCI dma + * address back to the card, you must first perform a + * swiotlb_dma_sync_for_device, and then the device again owns the buffer + */ +void +swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, + size_t size, int dir) +{ + BUG_ON(dir == DMA_NONE); + if (in_swiotlb_aperture(dev_addr)) + sync_single(hwdev, bus_to_virt(dev_addr), size, dir); +} + +void +swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, + size_t size, int dir) +{ + BUG_ON(dir == DMA_NONE); + if (in_swiotlb_aperture(dev_addr)) + sync_single(hwdev, bus_to_virt(dev_addr), size, dir); +} + +/* + * Map a set of buffers described by scatterlist in streaming mode for DMA. + * This is the scatter-gather version of the above swiotlb_map_single + * interface. Here the scatter gather list elements are each tagged with the + * appropriate dma address and length. They are obtained via + * sg_dma_{address,length}(SG). + * + * NOTE: An implementation may be able to use a smaller number of + * DMA address/length pairs than there are SG table elements. + * (for example via virtual mapping capabilities) + * The routine returns the number of addr/length pairs actually + * used, at most nents. + * + * Device ownership issues as mentioned above for swiotlb_map_single are the + * same here. + */ +int +swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, int nelems, + int dir) +{ + struct phys_addr buffer; + dma_addr_t dev_addr; + char *map; + int i; + + BUG_ON(dir == DMA_NONE); + + for (i = 0; i < nelems; i++, sg++) { + dev_addr = SG_ENT_PHYS_ADDRESS(sg); + if (address_needs_mapping(hwdev, dev_addr)) { + buffer.page = sg->page; + buffer.offset = sg->offset; + map = map_single(hwdev, buffer, sg->length, dir); + if (!map) { + /* Don't panic here, we expect map_sg users + to do proper error handling. 
*/ + swiotlb_full(hwdev, sg->length, dir, 0); + swiotlb_unmap_sg(hwdev, sg - i, i, dir); + sg[0].dma_length = 0; + return 0; + } + sg->dma_address = (dma_addr_t)virt_to_bus(map); + } else + sg->dma_address = dev_addr; + sg->dma_length = sg->length; + } + return nelems; +} + +/* + * Unmap a set of streaming mode DMA translations. Again, cpu read rules + * concerning calls here are the same as for swiotlb_unmap_single() above. + */ +void +swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nelems, + int dir) +{ + int i; + + BUG_ON(dir == DMA_NONE); + + for (i = 0; i < nelems; i++, sg++) + if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg)) + unmap_single(hwdev, + (void *)bus_to_virt(sg->dma_address), + sg->dma_length, dir); +} + +/* + * Make physical memory consistent for a set of streaming mode DMA translations + * after a transfer. + * + * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules + * and usage. + */ +void +swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, + int nelems, int dir) +{ + int i; + + BUG_ON(dir == DMA_NONE); + + for (i = 0; i < nelems; i++, sg++) + if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg)) + sync_single(hwdev, + (void *)bus_to_virt(sg->dma_address), + sg->dma_length, dir); +} + +void +swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, + int nelems, int dir) +{ + int i; + + BUG_ON(dir == DMA_NONE); + + for (i = 0; i < nelems; i++, sg++) + if (sg->dma_address != SG_ENT_PHYS_ADDRESS(sg)) + sync_single(hwdev, + (void *)bus_to_virt(sg->dma_address), + sg->dma_length, dir); +} + +dma_addr_t +swiotlb_map_page(struct device *hwdev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction) +{ + struct phys_addr buffer; + dma_addr_t dev_addr; + char *map; + + dev_addr = page_to_phys(page) + offset; + if (address_needs_mapping(hwdev, dev_addr)) { + buffer.page = page; + buffer.offset = offset; + map = map_single(hwdev, buffer, size, 
direction); + if (!map) { + swiotlb_full(hwdev, size, direction, 1); + map = io_tlb_overflow_buffer; + } + dev_addr = (dma_addr_t)virt_to_bus(map); + } + + return dev_addr; +} + +void +swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address, + size_t size, enum dma_data_direction direction) +{ + BUG_ON(direction == DMA_NONE); + if (in_swiotlb_aperture(dma_address)) + unmap_single(hwdev, bus_to_virt(dma_address), size, direction); +} + +int +swiotlb_dma_mapping_error(dma_addr_t dma_addr) +{ + return (dma_addr == virt_to_bus(io_tlb_overflow_buffer)); +} + +/* + * Return whether the given PCI device DMA address mask can be supported + * properly. For example, if your device can only drive the low 24-bits + * during PCI bus mastering, then you would pass 0x00ffffff as the mask to + * this function. + */ +int +swiotlb_dma_supported (struct device *hwdev, u64 mask) +{ + return (mask >= 0xffffffffUL); +} + +EXPORT_SYMBOL(swiotlb_init); +EXPORT_SYMBOL(swiotlb_map_single); +EXPORT_SYMBOL(swiotlb_unmap_single); +EXPORT_SYMBOL(swiotlb_map_sg); +EXPORT_SYMBOL(swiotlb_unmap_sg); +EXPORT_SYMBOL(swiotlb_sync_single_for_cpu); +EXPORT_SYMBOL(swiotlb_sync_single_for_device); +EXPORT_SYMBOL(swiotlb_sync_sg_for_cpu); +EXPORT_SYMBOL(swiotlb_sync_sg_for_device); +EXPORT_SYMBOL(swiotlb_map_page); +EXPORT_SYMBOL(swiotlb_unmap_page); +EXPORT_SYMBOL(swiotlb_dma_mapping_error); +EXPORT_SYMBOL(swiotlb_dma_supported); + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/genapic.c --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/genapic.c Thu Aug 25 22:53:20 2005 @@ -0,0 +1,123 @@ +/* + * Copyright 2004 James Cleverdon, IBM. + * Subject to the GNU Public License, v.2 + * + * Generic APIC sub-arch probe layer. 
+ * + * Hacked for x86-64 by James Cleverdon from i386 architecture code by + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and + * James Cleverdon. + */ +#include <linux/config.h> +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/module.h> + +#include <asm/smp.h> +#include <asm/ipi.h> + +#if defined(CONFIG_ACPI_BUS) +#include <acpi/acpi_bus.h> +#endif + +/* which logical CPU number maps to which CPU (physical APIC ID) */ +u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; +EXPORT_SYMBOL(x86_cpu_to_apicid); +u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; + +extern struct genapic apic_cluster; +extern struct genapic apic_flat; + +#ifndef CONFIG_XEN +struct genapic *genapic = &apic_flat; +#else +extern struct genapic apic_xen; +struct genapic *genapic = &apic_xen; +#endif + + +/* + * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. + */ +void __init clustered_apic_check(void) +{ +#ifndef CONFIG_XEN + long i; + u8 clusters, max_cluster; + u8 id; + u8 cluster_cnt[NUM_APIC_CLUSTERS]; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + /* AMD always uses flat mode right now */ + genapic = &apic_flat; + goto print; + } + +#if defined(CONFIG_ACPI_BUS) + /* + * Some x86_64 machines use physical APIC mode regardless of how many + * procs/clusters are present (x86_64 ES7000 is an example). 
+ */ + if (acpi_fadt.revision > FADT2_REVISION_ID) + if (acpi_fadt.force_apic_physical_destination_mode) { + genapic = &apic_cluster; + goto print; + } +#endif + + memset(cluster_cnt, 0, sizeof(cluster_cnt)); + + for (i = 0; i < NR_CPUS; i++) { + id = bios_cpu_apicid[i]; + if (id != BAD_APICID) + cluster_cnt[APIC_CLUSTERID(id)]++; + } + + clusters = 0; + max_cluster = 0; + for (i = 0; i < NUM_APIC_CLUSTERS; i++) { + if (cluster_cnt[i] > 0) { + ++clusters; + if (cluster_cnt[i] > max_cluster) + max_cluster = cluster_cnt[i]; + } + } + + /* + * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode, + * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical + * else physical mode. + * (We don't use lowest priority delivery + HW APIC IRQ steering, so + * can ignore the clustered logical case and go straight to physical.) + */ + if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) + genapic = &apic_flat; + else + genapic = &apic_cluster; + +print: +#else + /* hardcode to xen apic functions */ + genapic = &apic_xen; +#endif + printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); +} + +/* Same for both flat and clustered. */ + +#ifdef CONFIG_XEN +extern void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest); +#endif + +void send_IPI_self(int vector) +{ +#ifndef CONFIG_XEN + __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); +#else + xen_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); +#endif +} diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/genapic_xen.c --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/genapic_xen.c Thu Aug 25 22:53:20 2005 @@ -0,0 +1,167 @@ +/* + * Copyright 2004 James Cleverdon, IBM. + * Subject to the GNU Public License, v.2 + * + * Xen APIC subarch code. Maximum 8 CPUs, logical delivery. 
+ * + * Hacked for x86-64 by James Cleverdon from i386 architecture code by + * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and + * James Cleverdon. + * + * Hacked to pieces for Xen by Chris Wright. + */ +#include <linux/config.h> +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/ctype.h> +#include <linux/init.h> +#ifdef CONFIG_XEN_PRIVILEGED_GUEST +#include <asm/smp.h> +#include <asm/ipi.h> +#else +#include <asm/apic.h> +#include <asm/apicdef.h> +#include <asm/genapic.h> +#endif +#include <asm-xen/evtchn.h> + +DECLARE_PER_CPU(int, ipi_to_evtchn[NR_IPIS]); + +static inline void __send_IPI_one(unsigned int cpu, int vector) +{ + unsigned int evtchn; + Dprintk("%s\n", __FUNCTION__); + + evtchn = per_cpu(ipi_to_evtchn, cpu)[vector]; + if (evtchn) + notify_via_evtchn(evtchn); + else + printk("send_IPI to unbound port %d/%d", cpu, vector); +} + +void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest) +{ + int cpu; + + switch (shortcut) { + case APIC_DEST_SELF: + __send_IPI_one(smp_processor_id(), vector); + break; + case APIC_DEST_ALLBUT: + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (cpu == smp_processor_id()) + continue; + if (cpu_isset(cpu, cpu_online_map)) { + __send_IPI_one(cpu, vector); + } + } + break; + case APIC_DEST_ALLINC: + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (cpu_isset(cpu, cpu_online_map)) { + __send_IPI_one(cpu, vector); + } + } + break; + default: + printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut, + vector); + break; + } +} + +static cpumask_t xen_target_cpus(void) +{ + return cpu_online_map; +} + +/* + * Set up the logical destination ID. + * Do nothing, not called now. + */ +static void xen_init_apic_ldr(void) +{ + Dprintk("%s\n", __FUNCTION__); + return; +} + +static void xen_send_IPI_allbutself(int vector) +{ + /* + * if there are no other CPUs in the system then + * we get an APIC send error if we try to broadcast. 
+ * thus we have to avoid sending IPIs in this case. + */ + Dprintk("%s\n", __FUNCTION__); + if (num_online_cpus() > 1) + xen_send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL); +} + +static void xen_send_IPI_all(int vector) +{ + Dprintk("%s\n", __FUNCTION__); + xen_send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); +} + +static void xen_send_IPI_mask(cpumask_t cpumask, int vector) +{ + unsigned long mask = cpus_addr(cpumask)[0]; + unsigned int cpu; + unsigned long flags; + + Dprintk("%s\n", __FUNCTION__); + local_irq_save(flags); + WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]); + + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + if (cpu_isset(cpu, cpumask)) { + __send_IPI_one(cpu, vector); + } + } + local_irq_restore(flags); +} + +#ifdef CONFIG_XEN_PRIVILEGED_GUEST +static int xen_apic_id_registered(void) +{ + /* better be set */ + Dprintk("%s\n", __FUNCTION__); + return physid_isset(smp_processor_id(), phys_cpu_present_map); +} +#endif + +static unsigned int xen_cpu_mask_to_apicid(cpumask_t cpumask) +{ + Dprintk("%s\n", __FUNCTION__); + return cpus_addr(cpumask)[0] & APIC_ALL_CPUS; +} + +static unsigned int phys_pkg_id(int index_msb) +{ + u32 ebx; + + Dprintk("%s\n", __FUNCTION__); + ebx = cpuid_ebx(1); + return ((ebx >> 24) & 0xFF) >> index_msb; +} + +struct genapic apic_xen = { + .name = "xen", +#ifdef CONFIG_XEN_PRIVILEGED_GUEST + .int_delivery_mode = dest_LowestPrio, +#endif + .int_dest_mode = (APIC_DEST_LOGICAL != 0), + .int_delivery_dest = APIC_DEST_LOGICAL | APIC_DM_LOWEST, + .target_cpus = xen_target_cpus, +#ifdef CONFIG_XEN_PRIVILEGED_GUEST + .apic_id_registered = xen_apic_id_registered, +#endif + .init_apic_ldr = xen_init_apic_ldr, + .send_IPI_all = xen_send_IPI_all, + .send_IPI_allbutself = xen_send_IPI_allbutself, + .send_IPI_mask = xen_send_IPI_mask, + .cpu_mask_to_apicid = xen_cpu_mask_to_apicid, + .phys_pkg_id = phys_pkg_id, +}; diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c --- /dev/null Wed 
Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c Thu Aug 25 22:53:20 2005 @@ -0,0 +1,280 @@ +/* Xenbus code for blkif backend + Copyright (C) 2005 Rusty Russell <rusty@xxxxxxxxxxxxxxx> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +#include <stdarg.h> +#include <linux/module.h> +#include <asm-xen/xenbus.h> +#include "common.h" + +struct backend_info +{ + struct xenbus_device *dev; + + /* our communications channel */ + blkif_t *blkif; + + long int frontend_id; + long int pdev; + long int readonly; + + /* watch back end for changes */ + struct xenbus_watch backend_watch; + + /* watch front end for changes */ + struct xenbus_watch watch; + char *frontpath; +}; + +static int blkback_remove(struct xenbus_device *dev) +{ + struct backend_info *be = dev->data; + + if (be->watch.node) + unregister_xenbus_watch(&be->watch); + unregister_xenbus_watch(&be->backend_watch); + if (be->blkif) + blkif_put(be->blkif); + if (be->frontpath) + kfree(be->frontpath); + kfree(be); + return 0; +} + +/* Front end tells us frame. */ +static void frontend_changed(struct xenbus_watch *watch, const char *node) +{ + unsigned long ring_ref; + unsigned int evtchn; + int err; + struct backend_info *be + = container_of(watch, struct backend_info, watch); + + /* If other end is gone, delete ourself. 
*/ + if (node && !xenbus_exists(be->frontpath, "")) { + xenbus_rm(be->dev->nodename, ""); + device_unregister(&be->dev->dev); + return; + } + if (be->blkif == NULL || be->blkif->status == CONNECTED) + return; + + err = xenbus_gather(be->frontpath, "ring-ref", "%lu", &ring_ref, + "event-channel", "%u", &evtchn, NULL); + if (err) { + xenbus_dev_error(be->dev, err, + "reading %s/ring-ref and event-channel", + be->frontpath); + return; + } + + /* Supply the information about the device the frontend needs */ + err = xenbus_transaction_start(be->dev->nodename); + if (err) { + xenbus_dev_error(be->dev, err, "starting transaction"); + return; + } + + err = xenbus_printf(be->dev->nodename, "sectors", "%lu", + vbd_size(&be->blkif->vbd)); + if (err) { + xenbus_dev_error(be->dev, err, "writing %s/sectors", + be->dev->nodename); + goto abort; + } + + /* FIXME: use a typename instead */ + err = xenbus_printf(be->dev->nodename, "info", "%u", + vbd_info(&be->blkif->vbd)); + if (err) { + xenbus_dev_error(be->dev, err, "writing %s/info", + be->dev->nodename); + goto abort; + } + err = xenbus_printf(be->dev->nodename, "sector-size", "%lu", + vbd_secsize(&be->blkif->vbd)); + if (err) { + xenbus_dev_error(be->dev, err, "writing %s/sector-size", + be->dev->nodename); + goto abort; + } + + /* Map the shared frame, irq etc. */ + err = blkif_map(be->blkif, ring_ref, evtchn); + if (err) { + xenbus_dev_error(be->dev, err, "mapping ring-ref %lu port %u", + ring_ref, evtchn); + goto abort; + } + + xenbus_transaction_end(0); + xenbus_dev_ok(be->dev); + + return; + +abort: + xenbus_transaction_end(1); +} + +/* + Setup supplies physical device. + We provide event channel and device details to front end. + Frontend supplies shared frame and event channel. 
+ */ +static void backend_changed(struct xenbus_watch *watch, const char *node) +{ + int err; + char *p; + long int handle, pdev; + struct backend_info *be + = container_of(watch, struct backend_info, backend_watch); + struct xenbus_device *dev = be->dev; + + err = xenbus_scanf(dev->nodename, "physical-device", "%li", &pdev); + if (XENBUS_EXIST_ERR(err)) + return; + if (err < 0) { + xenbus_dev_error(dev, err, "reading physical-device"); + return; + } + if (be->pdev && be->pdev != pdev) { + printk(KERN_WARNING + "changing physical-device not supported\n"); + return; + } + be->pdev = pdev; + + /* If there's a read-only node, we're read only. */ + p = xenbus_read(dev->nodename, "read-only", NULL); + if (!IS_ERR(p)) { + be->readonly = 1; + kfree(p); + } + + if (be->blkif == NULL) { + /* Front end dir is a number, which is used as the handle. */ + p = strrchr(be->frontpath, '/') + 1; + handle = simple_strtoul(p, NULL, 0); + + be->blkif = alloc_blkif(be->frontend_id); + if (IS_ERR(be->blkif)) { + err = PTR_ERR(be->blkif); + be->blkif = NULL; + xenbus_dev_error(dev, err, "creating block interface"); + return; + } + + err = vbd_create(be->blkif, handle, be->pdev, be->readonly); + if (err) { + xenbus_dev_error(dev, err, "creating vbd structure"); + return; + } + + /* Pass in NULL node to skip exist test. 
*/ + frontend_changed(&be->watch, NULL); + } +} + +static int blkback_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + struct backend_info *be; + char *frontend; + int err; + + be = kmalloc(sizeof(*be), GFP_KERNEL); + if (!be) { + xenbus_dev_error(dev, -ENOMEM, "allocating backend structure"); + return -ENOMEM; + } + memset(be, 0, sizeof(*be)); + + frontend = NULL; + err = xenbus_gather(dev->nodename, + "frontend-id", "%li", &be->frontend_id, + "frontend", NULL, &frontend, + NULL); + if (XENBUS_EXIST_ERR(err)) + goto free_be; + if (err < 0) { + xenbus_dev_error(dev, err, + "reading %s/frontend or frontend-id", + dev->nodename); + goto free_be; + } + if (strlen(frontend) == 0 || !xenbus_exists(frontend, "")) { + /* If we can't get a frontend path and a frontend-id, + * then our bus-id is no longer valid and we need to + * destroy the backend device. + */ + err = -ENOENT; + goto free_be; + } + + be->dev = dev; + be->backend_watch.node = dev->nodename; + be->backend_watch.callback = backend_changed; + err = register_xenbus_watch(&be->backend_watch); + if (err) { + be->backend_watch.node = NULL; + xenbus_dev_error(dev, err, "adding backend watch on %s", + dev->nodename); + goto free_be; + } + + be->frontpath = frontend; + be->watch.node = be->frontpath; + be->watch.callback = frontend_changed; + err = register_xenbus_watch(&be->watch); + if (err) { + be->watch.node = NULL; + xenbus_dev_error(dev, err, + "adding frontend watch on %s", + be->frontpath); + goto free_be; + } + + dev->data = be; + + backend_changed(&be->backend_watch, dev->nodename); + return 0; + + free_be: + if (be->backend_watch.node) + unregister_xenbus_watch(&be->backend_watch); + if (frontend) + kfree(frontend); + kfree(be); + return err; +} + +static struct xenbus_device_id blkback_ids[] = { + { "vbd" }, + { "" } +}; + +static struct xenbus_driver blkback = { + .name = "vbd", + .owner = THIS_MODULE, + .ids = blkback_ids, + .probe = blkback_probe, + .remove = blkback_remove, 
+}; + +void blkif_xenbus_init(void) +{ + xenbus_register_backend(&blkback); +} diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/netback/xenbus.c Thu Aug 25 22:53:20 2005 @@ -0,0 +1,257 @@ +/* Xenbus code for netif backend + Copyright (C) 2005 Rusty Russell <rusty@xxxxxxxxxxxxxxx> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +#include <stdarg.h> +#include <linux/module.h> +#include <asm-xen/xenbus.h> +#include "common.h" + +struct backend_info +{ + struct xenbus_device *dev; + + /* our communications channel */ + netif_t *netif; + + long int frontend_id; +#if 0 + long int pdev; + long int readonly; +#endif + + /* watch back end for changes */ + struct xenbus_watch backend_watch; + + /* watch front end for changes */ + struct xenbus_watch watch; + char *frontpath; +}; + +static int netback_remove(struct xenbus_device *dev) +{ + struct backend_info *be = dev->data; + + if (be->watch.node) + unregister_xenbus_watch(&be->watch); + unregister_xenbus_watch(&be->backend_watch); + if (be->netif) + netif_disconnect(be->netif); + if (be->frontpath) + kfree(be->frontpath); + kfree(be); + return 0; +} + +/* Front end tells us frame. 
*/ +static void frontend_changed(struct xenbus_watch *watch, const char *node) +{ + unsigned long tx_ring_ref, rx_ring_ref; + unsigned int evtchn; + int err; + struct backend_info *be + = container_of(watch, struct backend_info, watch); + char *mac, *e, *s; + int i; + + /* If other end is gone, delete ourself. */ + if (node && !xenbus_exists(be->frontpath, "")) { + xenbus_rm(be->dev->nodename, ""); + device_unregister(&be->dev->dev); + return; + } + if (be->netif == NULL || be->netif->status == CONNECTED) + return; + + mac = xenbus_read(be->frontpath, "mac", NULL); + if (IS_ERR(mac)) { + err = PTR_ERR(mac); + xenbus_dev_error(be->dev, err, "reading %s/mac", + be->dev->nodename); + return; + } + s = mac; + for (i = 0; i < ETH_ALEN; i++) { + be->netif->fe_dev_addr[i] = simple_strtoul(s, &e, 16); + if (s == e || (e[0] != ':' && e[0] != 0)) { + kfree(mac); + err = -ENOENT; + xenbus_dev_error(be->dev, err, "parsing %s/mac", + be->dev->nodename); + return; + } + s = &e[1]; + } + kfree(mac); + + err = xenbus_gather(be->frontpath, "tx-ring-ref", "%lu", &tx_ring_ref, + "rx-ring-ref", "%lu", &rx_ring_ref, + "event-channel", "%u", &evtchn, NULL); + if (err) { + xenbus_dev_error(be->dev, err, + "reading %s/ring-ref and event-channel", + be->frontpath); + return; + } + + /* Map the shared frame, irq etc. */ + err = netif_map(be->netif, tx_ring_ref, rx_ring_ref, evtchn); + if (err) { + xenbus_dev_error(be->dev, err, + "mapping shared-frames %lu/%lu port %u", + tx_ring_ref, rx_ring_ref, evtchn); + return; + } + + xenbus_dev_ok(be->dev); + + return; +} + +/* + Setup supplies physical device. + We provide event channel and device details to front end. + Frontend supplies shared frame and event channel. 
+ */ +static void backend_changed(struct xenbus_watch *watch, const char *node) +{ + int err; + long int handle; + struct backend_info *be + = container_of(watch, struct backend_info, backend_watch); + struct xenbus_device *dev = be->dev; + u8 be_mac[ETH_ALEN] = { 0, 0, 0, 0, 0, 0 }; + + err = xenbus_scanf(dev->nodename, "handle", "%li", &handle); + if (XENBUS_EXIST_ERR(err)) + return; + if (err < 0) { + xenbus_dev_error(dev, err, "reading handle"); + return; + } + + if (be->netif == NULL) { + be->netif = alloc_netif(be->frontend_id, handle, be_mac); + if (IS_ERR(be->netif)) { + err = PTR_ERR(be->netif); + be->netif = NULL; + xenbus_dev_error(dev, err, "creating interface"); + return; + } + +#if 0 + err = vbd_create(be->netif, handle, be->pdev, be->readonly); + if (err) { + xenbus_dev_error(dev, err, "creating vbd structure"); + return; + } +#endif + + /* Pass in NULL node to skip exist test. */ + frontend_changed(&be->watch, NULL); + } +} + +static int netback_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + struct backend_info *be; + char *frontend; + int err; + + be = kmalloc(sizeof(*be), GFP_KERNEL); + if (!be) { + xenbus_dev_error(dev, -ENOMEM, "allocating backend structure"); + return -ENOMEM; + } + memset(be, 0, sizeof(*be)); + + frontend = NULL; + err = xenbus_gather(dev->nodename, + "frontend-id", "%li", &be->frontend_id, + "frontend", NULL, &frontend, + NULL); + if (XENBUS_EXIST_ERR(err)) + goto free_be; + if (err < 0) { + xenbus_dev_error(dev, err, + "reading %s/frontend or frontend-id", + dev->nodename); + goto free_be; + } + if (strlen(frontend) == 0 || !xenbus_exists(frontend, "")) { + /* If we can't get a frontend path and a frontend-id, + * then our bus-id is no longer valid and we need to + * destroy the backend device. 
+ */ + err = -ENOENT; + goto free_be; + } + + be->dev = dev; + be->backend_watch.node = dev->nodename; + be->backend_watch.callback = backend_changed; + err = register_xenbus_watch(&be->backend_watch); + if (err) { + be->backend_watch.node = NULL; + xenbus_dev_error(dev, err, "adding backend watch on %s", + dev->nodename); + goto free_be; + } + + be->frontpath = frontend; + be->watch.node = be->frontpath; + be->watch.callback = frontend_changed; + err = register_xenbus_watch(&be->watch); + if (err) { + be->watch.node = NULL; + xenbus_dev_error(dev, err, + "adding frontend watch on %s", + be->frontpath); + goto free_be; + } + + dev->data = be; + + backend_changed(&be->backend_watch, dev->nodename); + return 0; + + free_be: + if (be->backend_watch.node) + unregister_xenbus_watch(&be->backend_watch); + if (frontend) + kfree(frontend); + kfree(be); + return err; +} + +static struct xenbus_device_id netback_ids[] = { + { "vif" }, + { "" } +}; + +static struct xenbus_driver netback = { + .name = "vif", + .owner = THIS_MODULE, + .ids = netback_ids, + .probe = netback_probe, + .remove = netback_remove, +}; + +void netif_xenbus_init(void) +{ + xenbus_register_backend(&netback); +} diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/asm-i386/hw_irq.h --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/hw_irq.h Thu Aug 25 22:53:20 2005 @@ -0,0 +1,71 @@ +#ifndef _ASM_HW_IRQ_H +#define _ASM_HW_IRQ_H + +/* + * linux/include/asm/hw_irq.h + * + * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar + * + * moved some of the old arch/i386/kernel/irq.h to here. 
VY + * + * IRQ/IPI changes taken from work by Thomas Radke + * <tomsoft@xxxxxxxxxxxxxxxxxxxxxxxxx> + */ + +#include <linux/config.h> +#include <linux/profile.h> +#include <asm/atomic.h> +#include <asm/irq.h> +#include <asm/sections.h> + +/* + * Various low-level irq details needed by irq.c, process.c, + * time.c, io_apic.c and smp.c + * + * Interrupt entry/exit code at both C and assembly level + */ + +extern u8 irq_vector[NR_IRQ_VECTORS]; +#define IO_APIC_VECTOR(irq) (irq_vector[irq]) +#define AUTO_ASSIGN -1 + +extern void (*interrupt[NR_IRQS])(void); + +#ifdef CONFIG_SMP +fastcall void reschedule_interrupt(void); +fastcall void invalidate_interrupt(void); +fastcall void call_function_interrupt(void); +#endif + +#ifdef CONFIG_X86_LOCAL_APIC +fastcall void apic_timer_interrupt(void); +fastcall void error_interrupt(void); +fastcall void spurious_interrupt(void); +fastcall void thermal_interrupt(struct pt_regs *); +#define platform_legacy_irq(irq) ((irq) < 16) +#endif + +void disable_8259A_irq(unsigned int irq); +void enable_8259A_irq(unsigned int irq); +int i8259A_irq_pending(unsigned int irq); +void make_8259A_irq(unsigned int irq); +void init_8259A(int aeoi); +void FASTCALL(send_IPI_self(int vector)); +void init_VISWS_APIC_irqs(void); +void setup_IO_APIC(void); +void disable_IO_APIC(void); +void print_IO_APIC(void); +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn); +void send_IPI(int dest, int vector); +void setup_ioapic_dest(void); + +extern unsigned long io_apic_irqs; + +extern atomic_t irq_err_count; +extern atomic_t irq_mis_count; + +#define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs)) + +extern void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i); + +#endif /* _ASM_HW_IRQ_H */ diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/asm-i386/kmap_types.h --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/kmap_types.h Thu Aug 25 22:53:20 2005 @@ -0,0 +1,32 @@ +#ifndef 
_ASM_KMAP_TYPES_H +#define _ASM_KMAP_TYPES_H + +#include <linux/config.h> + +#ifdef CONFIG_DEBUG_HIGHMEM +# define D(n) __KM_FENCE_##n , +#else +# define D(n) +#endif + +enum km_type { +D(0) KM_BOUNCE_READ, +D(1) KM_SKB_SUNRPC_DATA, +D(2) KM_SKB_DATA_SOFTIRQ, +D(3) KM_USER0, +D(4) KM_USER1, +D(5) KM_BIO_SRC_IRQ, +D(6) KM_BIO_DST_IRQ, +D(7) KM_PTE0, +D(8) KM_PTE1, +D(9) KM_IRQ0, +D(10) KM_IRQ1, +D(11) KM_SOFTIRQ0, +D(12) KM_SOFTIRQ1, +D(13) KM_SWIOTLB, +D(14) KM_TYPE_NR +}; + +#undef D + +#endif diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/asm-i386/scatterlist.h --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/scatterlist.h Thu Aug 25 22:53:20 2005 @@ -0,0 +1,22 @@ +#ifndef _I386_SCATTERLIST_H +#define _I386_SCATTERLIST_H + +struct scatterlist { + struct page *page; + unsigned int offset; + unsigned int length; + dma_addr_t dma_address; + unsigned int dma_length; +}; + +/* These macros should be used after a pci_map_sg call has been done + * to get bus addresses of each of the SG entries and their lengths. + * You should only work with the number of sg entries pci_map_sg + * returns. 
+ */ +#define sg_dma_address(sg) ((sg)->dma_address) +#define sg_dma_len(sg) ((sg)->dma_length) + +#define ISA_DMA_THRESHOLD (0x00ffffff) + +#endif /* !(_I386_SCATTERLIST_H) */ diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/asm-i386/swiotlb.h --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/swiotlb.h Thu Aug 25 22:53:20 2005 @@ -0,0 +1,42 @@ +#ifndef _ASM_SWIOTLB_H +#define _ASM_SWIOTLB_H 1 + +#include <linux/config.h> + +/* SWIOTLB interface */ + +extern dma_addr_t swiotlb_map_single(struct device *hwdev, void *ptr, size_t size, + int dir); +extern void swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr, + size_t size, int dir); +extern void swiotlb_sync_single_for_cpu(struct device *hwdev, + dma_addr_t dev_addr, + size_t size, int dir); +extern void swiotlb_sync_single_for_device(struct device *hwdev, + dma_addr_t dev_addr, + size_t size, int dir); +extern void swiotlb_sync_sg_for_cpu(struct device *hwdev, + struct scatterlist *sg, int nelems, + int dir); +extern void swiotlb_sync_sg_for_device(struct device *hwdev, + struct scatterlist *sg, int nelems, + int dir); +extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg, + int nents, int direction); +extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg, + int nents, int direction); +extern int swiotlb_dma_mapping_error(dma_addr_t dma_addr); +extern dma_addr_t swiotlb_map_page(struct device *hwdev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction); +extern void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dma_address, + size_t size, enum dma_data_direction direction); +extern int swiotlb_dma_supported(struct device *hwdev, u64 mask); + +#ifdef CONFIG_SWIOTLB +extern int swiotlb; +#else +#define swiotlb 0 +#endif + +#endif diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/hw_irq.h --- /dev/null Wed Aug 24 02:43:18 
2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/hw_irq.h Thu Aug 25 22:53:20 2005 @@ -0,0 +1,138 @@ +#ifndef _ASM_HW_IRQ_H +#define _ASM_HW_IRQ_H + +/* + * linux/include/asm/hw_irq.h + * + * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar + * + * moved some of the old arch/i386/kernel/irq.h to here. VY + * + * IRQ/IPI changes taken from work by Thomas Radke + * <tomsoft@xxxxxxxxxxxxxxxxxxxxxxxxx> + * + * hacked by Andi Kleen for x86-64. + * + * $Id: hw_irq.h,v 1.24 2001/09/14 20:55:03 vojtech Exp $ + */ + +#ifndef __ASSEMBLY__ +#include <linux/config.h> +#include <asm/atomic.h> +#include <asm/irq.h> +#include <linux/profile.h> +#include <linux/smp.h> + +struct hw_interrupt_type; +#endif + +/* + * IDT vectors usable for external interrupt sources start + * at 0x20: + */ +#define FIRST_EXTERNAL_VECTOR 0x20 + +#define IA32_SYSCALL_VECTOR 0x80 + + +/* + * Vectors 0x20-0x2f are used for ISA interrupts. + */ + +/* + * Special IRQ vectors used by the SMP architecture, 0xf0-0xff + * + * some of the following vectors are 'rare', they are merged + * into a single vector (CALL_FUNCTION_VECTOR) to save vector space. + * TLB, reschedule and local APIC vectors are performance-critical. + * + * Vectors 0xf0-0xf9 are free (reserved for future Linux use). + */ +#ifndef CONFIG_XEN +#define SPURIOUS_APIC_VECTOR 0xff +#define ERROR_APIC_VECTOR 0xfe +#define INVALIDATE_TLB_VECTOR 0xfd +#define RESCHEDULE_VECTOR 0xfc +#define TASK_MIGRATION_VECTOR 0xfb +#define CALL_FUNCTION_VECTOR 0xfa +#define KDB_VECTOR 0xf9 + +#define THERMAL_APIC_VECTOR 0xf0 +#endif + +/* + * Local APIC timer IRQ vector is on a different priority level, + * to work around the 'lost local interrupt if more than 2 IRQ + * sources per level' errata. + */ +#define LOCAL_TIMER_VECTOR 0xef + +/* + * First APIC vector available to drivers: (vectors 0x30-0xee) + * we start at 0x31 to spread out vectors evenly between priority + * levels. 
(0x80 is the syscall vector) + */ +#define FIRST_DEVICE_VECTOR 0x31 +#define FIRST_SYSTEM_VECTOR 0xef /* duplicated in irq.h */ + + +#ifndef __ASSEMBLY__ +extern u8 irq_vector[NR_IRQ_VECTORS]; +#define IO_APIC_VECTOR(irq) (irq_vector[irq]) +#define AUTO_ASSIGN -1 + +/* + * Various low-level irq details needed by irq.c, process.c, + * time.c, io_apic.c and smp.c + * + * Interrupt entry/exit code at both C and assembly level + */ + +extern void disable_8259A_irq(unsigned int irq); +extern void enable_8259A_irq(unsigned int irq); +extern int i8259A_irq_pending(unsigned int irq); +extern void make_8259A_irq(unsigned int irq); +extern void init_8259A(int aeoi); +extern void FASTCALL(send_IPI_self(int vector)); +extern void init_VISWS_APIC_irqs(void); +extern void setup_IO_APIC(void); +extern void disable_IO_APIC(void); +extern void print_IO_APIC(void); +extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn); +extern void send_IPI(int dest, int vector); +extern void setup_ioapic_dest(void); + +extern unsigned long io_apic_irqs; + +extern atomic_t irq_err_count; +extern atomic_t irq_mis_count; + +#define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs)) + +#define __STR(x) #x +#define STR(x) __STR(x) + +#include <asm/ptrace.h> + +#define IRQ_NAME2(nr) nr##_interrupt(void) +#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr) + +/* + * SMP has a few special interrupts for IPI messages + */ + +#define BUILD_IRQ(nr) \ +asmlinkage void IRQ_NAME(nr); \ +__asm__( \ +"\n.p2align\n" \ +"IRQ" #nr "_interrupt:\n\t" \ + "push $" #nr "-256 ; " \ + "jmp common_interrupt"); + +extern void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i); + +#define platform_legacy_irq(irq) ((irq) < 16) + +#endif + +#endif /* _ASM_HW_IRQ_H */ diff -r 5f1ed597f107 -r 8799d14bef77 patches/linux-2.6.12/patch-2.6.12.5 --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/patches/linux-2.6.12/patch-2.6.12.5 Thu Aug 25 22:53:20 2005 @@ -0,0 +1,1614 @@ +diff --git a/Makefile b/Makefile +--- a/Makefile 
++++ b/Makefile +@@ -1,7 +1,7 @@ + VERSION = 2 + PATCHLEVEL = 6 + SUBLEVEL = 12 +-EXTRAVERSION = ++EXTRAVERSION = .5 + NAME=Woozy Numbat + + # *DOCUMENTATION* +@@ -1149,7 +1149,7 @@ endif # KBUILD_EXTMOD + #(which is the most common case IMHO) to avoid unneeded clutter in the big tags file. + #Adding $(srctree) adds about 20M on i386 to the size of the output file! + +-ifeq ($(KBUILD_OUTPUT),) ++ifeq ($(src),$(obj)) + __srctree = + else + __srctree = $(srctree)/ +diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c +--- a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c ++++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c +@@ -44,7 +44,7 @@ + + #define PFX "powernow-k8: " + #define BFX PFX "BIOS error: " +-#define VERSION "version 1.40.2" ++#define VERSION "version 1.40.4" + #include "powernow-k8.h" + + /* serialize freq changes */ +@@ -978,7 +978,7 @@ static int __init powernowk8_cpu_init(st + { + struct powernow_k8_data *data; + cpumask_t oldmask = CPU_MASK_ALL; +- int rc; ++ int rc, i; + + if (!check_supported_cpu(pol->cpu)) + return -ENODEV; +@@ -1064,7 +1064,9 @@ static int __init powernowk8_cpu_init(st + printk("cpu_init done, current fid 0x%x, vid 0x%x\n", + data->currfid, data->currvid); + +- powernow_data[pol->cpu] = data; ++ for_each_cpu_mask(i, cpu_core_map[pol->cpu]) { ++ powernow_data[i] = data; ++ } + + return 0; + +diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c +--- a/arch/i386/kernel/process.c ++++ b/arch/i386/kernel/process.c +@@ -827,6 +827,8 @@ asmlinkage int sys_get_thread_area(struc + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + ++ memset(&info, 0, sizeof(info)); ++ + desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; + + info.entry_number = idx; +diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c +--- a/arch/ia64/kernel/ptrace.c ++++ b/arch/ia64/kernel/ptrace.c +@@ -945,6 +945,13 @@ access_uarea (struct task_struct *child, + 
*data = (pt->cr_ipsr & IPSR_MASK); + return 0; + ++ case PT_AR_RSC: ++ if (write_access) ++ pt->ar_rsc = *data | (3 << 2); /* force PL3 */ ++ else ++ *data = pt->ar_rsc; ++ return 0; ++ + case PT_AR_RNAT: + urbs_end = ia64_get_user_rbs_end(child, pt, NULL); + rnat_addr = (long) ia64_rse_rnat_addr((long *) +@@ -996,9 +1003,6 @@ access_uarea (struct task_struct *child, + case PT_AR_BSPSTORE: + ptr = pt_reg_addr(pt, ar_bspstore); + break; +- case PT_AR_RSC: +- ptr = pt_reg_addr(pt, ar_rsc); +- break; + case PT_AR_UNAT: + ptr = pt_reg_addr(pt, ar_unat); + break; +@@ -1234,7 +1238,7 @@ ptrace_getregs (struct task_struct *chil + static long + ptrace_setregs (struct task_struct *child, struct pt_all_user_regs __user *ppr) + { +- unsigned long psr, ec, lc, rnat, bsp, cfm, nat_bits, val = 0; ++ unsigned long psr, rsc, ec, lc, rnat, bsp, cfm, nat_bits, val = 0; + struct unw_frame_info info; + struct switch_stack *sw; + struct ia64_fpreg fpval; +@@ -1267,7 +1271,7 @@ ptrace_setregs (struct task_struct *chil + /* app regs */ + + retval |= __get_user(pt->ar_pfs, &ppr->ar[PT_AUR_PFS]); +- retval |= __get_user(pt->ar_rsc, &ppr->ar[PT_AUR_RSC]); ++ retval |= __get_user(rsc, &ppr->ar[PT_AUR_RSC]); + retval |= __get_user(pt->ar_bspstore, &ppr->ar[PT_AUR_BSPSTORE]); + retval |= __get_user(pt->ar_unat, &ppr->ar[PT_AUR_UNAT]); + retval |= __get_user(pt->ar_ccv, &ppr->ar[PT_AUR_CCV]); +@@ -1365,6 +1369,7 @@ ptrace_setregs (struct task_struct *chil + retval |= __get_user(nat_bits, &ppr->nat); + + retval |= access_uarea(child, PT_CR_IPSR, &psr, 1); ++ retval |= access_uarea(child, PT_AR_RSC, &rsc, 1); + retval |= access_uarea(child, PT_AR_EC, &ec, 1); + retval |= access_uarea(child, PT_AR_LC, &lc, 1); + retval |= access_uarea(child, PT_AR_RNAT, &rnat, 1); +diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c +--- a/arch/ia64/kernel/signal.c ++++ b/arch/ia64/kernel/signal.c +@@ -94,7 +94,7 @@ sys_sigaltstack (const stack_t __user *u + static long + restore_sigcontext (struct 
sigcontext __user *sc, struct sigscratch *scr) + { +- unsigned long ip, flags, nat, um, cfm; ++ unsigned long ip, flags, nat, um, cfm, rsc; + long err; + + /* Always make any pending restarted system calls return -EINTR */ +@@ -106,7 +106,7 @@ restore_sigcontext (struct sigcontext __ + err |= __get_user(ip, &sc->sc_ip); /* instruction pointer */ + err |= __get_user(cfm, &sc->sc_cfm); + err |= __get_user(um, &sc->sc_um); /* user mask */ +- err |= __get_user(scr->pt.ar_rsc, &sc->sc_ar_rsc); ++ err |= __get_user(rsc, &sc->sc_ar_rsc); + err |= __get_user(scr->pt.ar_unat, &sc->sc_ar_unat); + err |= __get_user(scr->pt.ar_fpsr, &sc->sc_ar_fpsr); + err |= __get_user(scr->pt.ar_pfs, &sc->sc_ar_pfs); +@@ -119,6 +119,7 @@ restore_sigcontext (struct sigcontext __ + err |= __copy_from_user(&scr->pt.r15, &sc->sc_gr[15], 8); /* r15 */ + + scr->pt.cr_ifs = cfm | (1UL << 63); ++ scr->pt.ar_rsc = rsc | (3 << 2); /* force PL3 */ + + /* establish new instruction pointer: */ + scr->pt.cr_iip = ip & ~0x3UL; +diff --git a/arch/ppc/kernel/time.c b/arch/ppc/kernel/time.c +--- a/arch/ppc/kernel/time.c ++++ b/arch/ppc/kernel/time.c +@@ -89,6 +89,9 @@ unsigned long tb_to_ns_scale; + + extern unsigned long wall_jiffies; + ++/* used for timezone offset */ ++static long timezone_offset; ++ + DEFINE_SPINLOCK(rtc_lock); + + EXPORT_SYMBOL(rtc_lock); +@@ -170,7 +173,7 @@ void timer_interrupt(struct pt_regs * re + xtime.tv_sec - last_rtc_update >= 659 && + abs((xtime.tv_nsec / 1000) - (1000000-1000000/HZ)) < 500000/HZ && + jiffies - wall_jiffies == 1) { +- if (ppc_md.set_rtc_time(xtime.tv_sec+1 + time_offset) == 0) ++ if (ppc_md.set_rtc_time(xtime.tv_sec+1 + timezone_offset) == 0) + last_rtc_update = xtime.tv_sec+1; + else + /* Try again one minute later */ +@@ -286,7 +289,7 @@ void __init time_init(void) + unsigned old_stamp, stamp, elapsed; + + if (ppc_md.time_init != NULL) +- time_offset = ppc_md.time_init(); ++ timezone_offset = ppc_md.time_init(); + + if (__USE_RTC()) { + /* 601 processor: dec 
counts down by 128 every 128ns */ +@@ -331,10 +334,10 @@ void __init time_init(void) + set_dec(tb_ticks_per_jiffy); + + /* If platform provided a timezone (pmac), we correct the time */ +- if (time_offset) { +- sys_tz.tz_minuteswest = -time_offset / 60; ++ if (timezone_offset) { ++ sys_tz.tz_minuteswest = -timezone_offset / 60; + sys_tz.tz_dsttime = 0; +- xtime.tv_sec -= time_offset; ++ xtime.tv_sec -= timezone_offset; + } + set_normalized_timespec(&wall_to_monotonic, + -xtime.tv_sec, -xtime.tv_nsec); +diff --git a/arch/ppc64/boot/zlib.c b/arch/ppc64/boot/zlib.c +--- a/arch/ppc64/boot/zlib.c ++++ b/arch/ppc64/boot/zlib.c +@@ -1307,7 +1307,7 @@ local int huft_build( + { + *t = (inflate_huft *)Z_NULL; + *m = 0; +- return Z_OK; ++ return Z_DATA_ERROR; + } + + +@@ -1351,6 +1351,7 @@ local int huft_build( + if ((j = *p++) != 0) + v[x[j]++] = i; + } while (++i < n); ++ n = x[g]; /* set n to length of v */ + + + /* Generate the Huffman codes and for each, make the table entries */ +diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c +--- a/arch/um/kernel/process.c ++++ b/arch/um/kernel/process.c +@@ -130,7 +130,7 @@ int start_fork_tramp(void *thread_arg, u + return(arg.pid); + } + +-static int ptrace_child(void) ++static int ptrace_child(void *arg) + { + int ret; + int pid = os_getpid(), ppid = getppid(); +@@ -159,16 +159,20 @@ static int ptrace_child(void) + _exit(ret); + } + +-static int start_ptraced_child(void) ++static int start_ptraced_child(void **stack_out) + { ++ void *stack; ++ unsigned long sp; + int pid, n, status; + +- pid = fork(); +- if(pid == 0) +- ptrace_child(); +- ++ stack = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, ++ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); ++ if(stack == MAP_FAILED) ++ panic("check_ptrace : mmap failed, errno = %d", errno); ++ sp = (unsigned long) stack + PAGE_SIZE - sizeof(void *); ++ pid = clone(ptrace_child, (void *) sp, SIGCHLD, NULL); + if(pid < 0) +- panic("check_ptrace : fork failed, errno = %d", 
errno); ++ panic("check_ptrace : clone failed, errno = %d", errno); + CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED)); + if(n < 0) + panic("check_ptrace : wait failed, errno = %d", errno); +@@ -176,6 +180,7 @@ static int start_ptraced_child(void) + panic("check_ptrace : expected SIGSTOP, got status = %d", + status); + ++ *stack_out = stack; + return(pid); + } + +@@ -183,12 +188,12 @@ static int start_ptraced_child(void) + * just avoid using sysemu, not panic, but only if SYSEMU features are broken. + * So only for SYSEMU features we test mustpanic, while normal host features + * must work anyway!*/ +-static int stop_ptraced_child(int pid, int exitcode, int mustexit) ++static int stop_ptraced_child(int pid, void *stack, int exitcode, int mustpanic) + { + int status, n, ret = 0; + + if(ptrace(PTRACE_CONT, pid, 0, 0) < 0) +- panic("stop_ptraced_child : ptrace failed, errno = %d", errno); ++ panic("check_ptrace : ptrace failed, errno = %d", errno); + CATCH_EINTR(n = waitpid(pid, &status, 0)); + if(!WIFEXITED(status) || (WEXITSTATUS(status) != exitcode)) { + int exit_with = WEXITSTATUS(status); +@@ -199,13 +204,15 @@ static int stop_ptraced_child(int pid, i + printk("check_ptrace : child exited with exitcode %d, while " + "expecting %d; status 0x%x", exit_with, + exitcode, status); +- if (mustexit) ++ if (mustpanic) + panic("\n"); + else + printk("\n"); + ret = -1; + } + ++ if(munmap(stack, PAGE_SIZE) < 0) ++ panic("check_ptrace : munmap failed, errno = %d", errno); + return ret; + } + +@@ -227,11 +234,12 @@ __uml_setup("nosysemu", nosysemu_cmd_par + + static void __init check_sysemu(void) + { ++ void *stack; + int pid, syscall, n, status, count=0; + + printk("Checking syscall emulation patch for ptrace..."); + sysemu_supported = 0; +- pid = start_ptraced_child(); ++ pid = start_ptraced_child(&stack); + + if(ptrace(PTRACE_SYSEMU, pid, 0, 0) < 0) + goto fail; +@@ -249,7 +257,7 @@ static void __init check_sysemu(void) + panic("check_sysemu : failed to modify system " + 
"call return, errno = %d", errno); + +- if (stop_ptraced_child(pid, 0, 0) < 0) ++ if (stop_ptraced_child(pid, stack, 0, 0) < 0) + goto fail_stopped; + + sysemu_supported = 1; +@@ -257,7 +265,7 @@ static void __init check_sysemu(void) + set_using_sysemu(!force_sysemu_disabled); + + printk("Checking advanced syscall emulation patch for ptrace..."); +- pid = start_ptraced_child(); ++ pid = start_ptraced_child(&stack); + while(1){ + count++; + if(ptrace(PTRACE_SYSEMU_SINGLESTEP, pid, 0, 0) < 0) +@@ -282,7 +290,7 @@ static void __init check_sysemu(void) + break; + } + } +- if (stop_ptraced_child(pid, 0, 0) < 0) ++ if (stop_ptraced_child(pid, stack, 0, 0) < 0) + goto fail_stopped; + + sysemu_supported = 2; +@@ -293,17 +301,18 @@ static void __init check_sysemu(void) + return; + + fail: +- stop_ptraced_child(pid, 1, 0); ++ stop_ptraced_child(pid, stack, 1, 0); + fail_stopped: + printk("missing\n"); + } + + void __init check_ptrace(void) + { ++ void *stack; + int pid, syscall, n, status; + + printk("Checking that ptrace can change system call numbers..."); +- pid = start_ptraced_child(); ++ pid = start_ptraced_child(&stack); + + if (ptrace(PTRACE_OLDSETOPTIONS, pid, 0, (void *)PTRACE_O_TRACESYSGOOD) < 0) + panic("check_ptrace: PTRACE_SETOPTIONS failed, errno = %d", errno); +@@ -330,7 +339,7 @@ void __init check_ptrace(void) + break; + } + } +- stop_ptraced_child(pid, 0, 1); ++ stop_ptraced_child(pid, stack, 0, 1); + printk("OK\n"); + check_sysemu(); + } +@@ -362,10 +371,11 @@ void forward_pending_sigio(int target) + static inline int check_skas3_ptrace_support(void) + { + struct ptrace_faultinfo fi; ++ void *stack; + int pid, n, ret = 1; + + printf("Checking for the skas3 patch in the host..."); +- pid = start_ptraced_child(); ++ pid = start_ptraced_child(&stack); + + n = ptrace(PTRACE_FAULTINFO, pid, 0, &fi); + if (n < 0) { +@@ -380,7 +390,7 @@ static inline int check_skas3_ptrace_sup + } + + init_registers(pid); +- stop_ptraced_child(pid, 1, 1); ++ 
stop_ptraced_child(pid, stack, 1, 1); + + return(ret); + } +diff --git a/arch/x86_64/ia32/syscall32.c b/arch/x86_64/ia32/syscall32.c +--- a/arch/x86_64/ia32/syscall32.c ++++ b/arch/x86_64/ia32/syscall32.c +@@ -57,6 +57,7 @@ int syscall32_setup_pages(struct linux_b + int npages = (VSYSCALL32_END - VSYSCALL32_BASE) >> PAGE_SHIFT; + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; ++ int ret; + + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!vma) +@@ -78,7 +79,11 @@ int syscall32_setup_pages(struct linux_b + vma->vm_mm = mm; + + down_write(&mm->mmap_sem); +- insert_vm_struct(mm, vma); ++ if ((ret = insert_vm_struct(mm, vma))) { ++ up_write(&mm->mmap_sem); ++ kmem_cache_free(vm_area_cachep, vma); ++ return ret; ++ } + mm->total_vm += npages; + up_write(&mm->mmap_sem); + return 0; +diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c +--- a/arch/x86_64/kernel/setup.c ++++ b/arch/x86_64/kernel/setup.c +@@ -729,8 +729,6 @@ static void __init amd_detect_cmp(struct + int cpu = smp_processor_id(); + int node = 0; + unsigned bits; +- if (c->x86_num_cores == 1) +- return; + + bits = 0; + while ((1 << bits) < c->x86_num_cores) +diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c +--- a/arch/x86_64/kernel/smp.c ++++ b/arch/x86_64/kernel/smp.c +@@ -284,6 +284,71 @@ struct call_data_struct { + static struct call_data_struct * call_data; + + /* ++ * this function sends a 'generic call function' IPI to one other CPU ++ * in the system. 
++ */ ++static void __smp_call_function_single (int cpu, void (*func) (void *info), void *info, ++ int nonatomic, int wait) ++{ ++ struct call_data_struct data; ++ int cpus = 1; ++ ++ data.func = func; ++ data.info = info; ++ atomic_set(&data.started, 0); ++ data.wait = wait; ++ if (wait) ++ atomic_set(&data.finished, 0); ++ ++ call_data = &data; ++ wmb(); ++ /* Send a message to all other CPUs and wait for them to respond */ ++ send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR); ++ ++ /* Wait for response */ ++ while (atomic_read(&data.started) != cpus) ++ cpu_relax(); ++ ++ if (!wait) ++ return; ++ ++ while (atomic_read(&data.finished) != cpus) ++ cpu_relax(); ++} ++ ++/* ++ * Run a function on another CPU ++ * <func> The function to run. This must be fast and non-blocking. ++ * <info> An arbitrary pointer to pass to the function. ++ * <nonatomic> Currently unused. ++ * <wait> If true, wait until function has completed on other CPUs. ++ * [RETURNS] 0 on success, else a negative status code. ++ * ++ * Does not return until the remote CPU is nearly ready to execute <func> ++ * or is or has executed. ++ */ ++ ++int smp_call_function_single (int cpu, void (*func) (void *info), void *info, ++ int nonatomic, int wait) ++{ ++ ++ int me = get_cpu(); /* prevent preemption and reschedule on another processor */ ++ ++ if (cpu == me) { ++ printk("%s: trying to call self\n", __func__); ++ put_cpu(); ++ return -EBUSY; ++ } ++ spin_lock_bh(&call_lock); ++ ++ __smp_call_function_single(cpu, func,info,nonatomic,wait); ++ ++ spin_unlock_bh(&call_lock); ++ put_cpu(); ++ return 0; ++} ++ ++/* + * this function sends a 'generic call function' IPI to all other CPUs + * in the system. 
+ */ +diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c +--- a/arch/x86_64/kernel/smpboot.c ++++ b/arch/x86_64/kernel/smpboot.c +@@ -202,9 +202,6 @@ static __cpuinit void sync_master(void * + { + unsigned long flags, i; + +- if (smp_processor_id() != boot_cpu_id) +- return; +- + go[MASTER] = 0; + + local_irq_save(flags); +@@ -253,7 +250,7 @@ get_delta(long *rt, long *master) + return tcenter - best_tm; + } + +-static __cpuinit void sync_tsc(void) ++static __cpuinit void sync_tsc(unsigned int master) + { + int i, done = 0; + long delta, adj, adjust_latency = 0; +@@ -267,9 +264,17 @@ static __cpuinit void sync_tsc(void) + } t[NUM_ROUNDS] __cpuinitdata; + #endif + ++ printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", ++ smp_processor_id(), master); ++ + go[MASTER] = 1; + +- smp_call_function(sync_master, NULL, 1, 0); ++ /* It is dangerous to broadcast IPI as cpus are coming up, ++ * as they may not be ready to accept them. So since ++ * we only need to send the ipi to the boot cpu direct ++ * the message, and avoid the race. 
++ */ ++ smp_call_function_single(master, sync_master, NULL, 1, 0); + + while (go[MASTER]) /* wait for master to be ready */ + no_cpu_relax(); +@@ -313,16 +318,14 @@ static __cpuinit void sync_tsc(void) + printk(KERN_INFO + "CPU %d: synchronized TSC with CPU %u (last diff %ld cycles, " + "maxerr %lu cycles)\n", +- smp_processor_id(), boot_cpu_id, delta, rt); ++ smp_processor_id(), master, delta, rt); + } + + static void __cpuinit tsc_sync_wait(void) + { + if (notscsync || !cpu_has_tsc) + return; +- printk(KERN_INFO "CPU %d: Syncing TSC to CPU %u.\n", smp_processor_id(), +- boot_cpu_id); +- sync_tsc(); ++ sync_tsc(0); + } + + static __init int notscsync_setup(char *s) +diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c +--- a/drivers/acpi/pci_irq.c ++++ b/drivers/acpi/pci_irq.c +@@ -433,8 +433,9 @@ acpi_pci_irq_enable ( + printk(KERN_WARNING PREFIX "PCI Interrupt %s[%c]: no GSI", + pci_name(dev), ('A' + pin)); + /* Interrupt Line values above 0xF are forbidden */ +- if (dev->irq >= 0 && (dev->irq <= 0xF)) { ++ if (dev->irq > 0 && (dev->irq <= 0xF)) { + printk(" - using IRQ %d\n", dev->irq); ++ acpi_register_gsi(dev->irq, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW); + return_VALUE(0); + } + else { +diff --git a/drivers/char/rocket.c b/drivers/char/rocket.c +--- a/drivers/char/rocket.c ++++ b/drivers/char/rocket.c +@@ -277,7 +277,7 @@ static void rp_do_receive(struct r_port + ToRecv = space; + + if (ToRecv <= 0) +- return; ++ goto done; + + /* + * if status indicates there are errored characters in the +@@ -359,6 +359,7 @@ static void rp_do_receive(struct r_port + } + /* Push the data up to the tty layer */ + ld->receive_buf(tty, tty->flip.char_buf, tty->flip.flag_buf, count); ++done: + tty_ldisc_deref(ld); + } + +diff --git a/drivers/char/tpm/tpm.c b/drivers/char/tpm/tpm.c +--- a/drivers/char/tpm/tpm.c ++++ b/drivers/char/tpm/tpm.c +@@ -32,12 +32,6 @@ + + #define TPM_BUFSIZE 2048 + +-/* PCI configuration addresses */ +-#define PCI_GEN_PMCON_1 0xA0 +-#define 
PCI_GEN1_DEC 0xE4 +-#define PCI_LPC_EN 0xE6 +-#define PCI_GEN2_DEC 0xEC +- + static LIST_HEAD(tpm_chip_list); + static DEFINE_SPINLOCK(driver_lock); + static int dev_mask[32]; +@@ -61,72 +55,6 @@ void tpm_time_expired(unsigned long ptr) + EXPORT_SYMBOL_GPL(tpm_time_expired); + + /* +- * Initialize the LPC bus and enable the TPM ports +- */ +-int tpm_lpc_bus_init(struct pci_dev *pci_dev, u16 base) +-{ +- u32 lpcenable, tmp; +- int is_lpcm = 0; +- +- switch (pci_dev->vendor) { +- case PCI_VENDOR_ID_INTEL: +- switch (pci_dev->device) { +- case PCI_DEVICE_ID_INTEL_82801CA_12: +- case PCI_DEVICE_ID_INTEL_82801DB_12: +- is_lpcm = 1; +- break; +- } +- /* init ICH (enable LPC) */ +- pci_read_config_dword(pci_dev, PCI_GEN1_DEC, &lpcenable); +- lpcenable |= 0x20000000; +- pci_write_config_dword(pci_dev, PCI_GEN1_DEC, lpcenable); +- +- if (is_lpcm) { +- pci_read_config_dword(pci_dev, PCI_GEN1_DEC, +- &lpcenable); +- if ((lpcenable & 0x20000000) == 0) { +- dev_err(&pci_dev->dev, +- "cannot enable LPC\n"); +- return -ENODEV; +- } +- } +- +- /* initialize TPM registers */ +- pci_read_config_dword(pci_dev, PCI_GEN2_DEC, &tmp); +- +- if (!is_lpcm) +- tmp = (tmp & 0xFFFF0000) | (base & 0xFFF0); +- else +- tmp = +- (tmp & 0xFFFF0000) | (base & 0xFFF0) | +- 0x00000001; +- +- pci_write_config_dword(pci_dev, PCI_GEN2_DEC, tmp); +- +- if (is_lpcm) { +- pci_read_config_dword(pci_dev, PCI_GEN_PMCON_1, +- &tmp); +- tmp |= 0x00000004; /* enable CLKRUN */ +- pci_write_config_dword(pci_dev, PCI_GEN_PMCON_1, +- tmp); +- } +- tpm_write_index(0x0D, 0x55); /* unlock 4F */ +- tpm_write_index(0x0A, 0x00); /* int disable */ +- tpm_write_index(0x08, base); /* base addr lo */ +- tpm_write_index(0x09, (base & 0xFF00) >> 8); /* base addr hi */ +- tpm_write_index(0x0D, 0xAA); /* lock 4F */ +- break; +- case PCI_VENDOR_ID_AMD: +- /* nothing yet */ +- break; +- } +- +- return 0; +-} +- +-EXPORT_SYMBOL_GPL(tpm_lpc_bus_init); +- +-/* + * Internal kernel interface to transmit TPM commands + */ + static 
ssize_t tpm_transmit(struct tpm_chip *chip, const char *buf, +@@ -590,10 +518,6 @@ int tpm_pm_resume(struct pci_dev *pci_de + if (chip == NULL) + return -ENODEV; + +- spin_lock(&driver_lock); +- tpm_lpc_bus_init(pci_dev, chip->vendor->base); +- spin_unlock(&driver_lock); +- + return 0; + } + +diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h +--- a/drivers/char/tpm/tpm.h ++++ b/drivers/char/tpm/tpm.h +@@ -79,8 +79,6 @@ static inline void tpm_write_index(int i + } + + extern void tpm_time_expired(unsigned long); +-extern int tpm_lpc_bus_init(struct pci_dev *, u16); +- + extern int tpm_register_hardware(struct pci_dev *, + struct tpm_vendor_specific *); + extern int tpm_open(struct inode *, struct file *); +diff --git a/drivers/char/tpm/tpm_atmel.c b/drivers/char/tpm/tpm_atmel.c +--- a/drivers/char/tpm/tpm_atmel.c ++++ b/drivers/char/tpm/tpm_atmel.c +@@ -22,7 +22,10 @@ + #include "tpm.h" + + /* Atmel definitions */ +-#define TPM_ATML_BASE 0x400 ++enum tpm_atmel_addr { ++ TPM_ATMEL_BASE_ADDR_LO = 0x08, ++ TPM_ATMEL_BASE_ADDR_HI = 0x09 ++}; + + /* write status bits */ + #define ATML_STATUS_ABORT 0x01 +@@ -127,7 +130,6 @@ static struct tpm_vendor_specific tpm_at + .cancel = tpm_atml_cancel, + .req_complete_mask = ATML_STATUS_BUSY | ATML_STATUS_DATA_AVAIL, + .req_complete_val = ATML_STATUS_DATA_AVAIL, +- .base = TPM_ATML_BASE, + .miscdev = { .fops = &atmel_ops, }, + }; + +@@ -136,14 +138,16 @@ static int __devinit tpm_atml_init(struc + { + u8 version[4]; + int rc = 0; ++ int lo, hi; + + if (pci_enable_device(pci_dev)) + return -EIO; + +- if (tpm_lpc_bus_init(pci_dev, TPM_ATML_BASE)) { +- rc = -ENODEV; +- goto out_err; +- } ++ lo = tpm_read_index( TPM_ATMEL_BASE_ADDR_LO ); ++ hi = tpm_read_index( TPM_ATMEL_BASE_ADDR_HI ); ++ ++ tpm_atmel.base = (hi<<8)|lo; ++ dev_dbg( &pci_dev->dev, "Operating with base: 0x%x\n", tpm_atmel.base); + + /* verify that it is an Atmel part */ + if (tpm_read_index(4) != 'A' || tpm_read_index(5) != 'T' +diff --git 
a/drivers/char/tpm/tpm_nsc.c b/drivers/char/tpm/tpm_nsc.c +--- a/drivers/char/tpm/tpm_nsc.c ++++ b/drivers/char/tpm/tpm_nsc.c +@@ -24,6 +24,10 @@ + /* National definitions */ + #define TPM_NSC_BASE 0x360 + #define TPM_NSC_IRQ 0x07 ++#define TPM_NSC_BASE0_HI 0x60 ++#define TPM_NSC_BASE0_LO 0x61 ++#define TPM_NSC_BASE1_HI 0x62 ++#define TPM_NSC_BASE1_LO 0x63 + + #define NSC_LDN_INDEX 0x07 + #define NSC_SID_INDEX 0x20 +@@ -234,7 +238,6 @@ static struct tpm_vendor_specific tpm_ns + .cancel = tpm_nsc_cancel, + .req_complete_mask = NSC_STATUS_OBF, + .req_complete_val = NSC_STATUS_OBF, +- .base = TPM_NSC_BASE, + .miscdev = { .fops = &nsc_ops, }, + + }; +@@ -243,15 +246,16 @@ static int __devinit tpm_nsc_init(struct + const struct pci_device_id *pci_id) + { + int rc = 0; ++ int lo, hi; ++ ++ hi = tpm_read_index(TPM_NSC_BASE0_HI); ++ lo = tpm_read_index(TPM_NSC_BASE0_LO); ++ ++ tpm_nsc.base = (hi<<8) | lo; + + if (pci_enable_device(pci_dev)) + return -EIO; + +- if (tpm_lpc_bus_init(pci_dev, TPM_NSC_BASE)) { +- rc = -ENODEV; +- goto out_err; +- } +- + /* verify that it is a National part (SID) */ + if (tpm_read_index(NSC_SID_INDEX) != 0xEF) { + rc = -ENODEV; +diff --git a/drivers/char/tty_ioctl.c b/drivers/char/tty_ioctl.c +--- a/drivers/char/tty_ioctl.c ++++ b/drivers/char/tty_ioctl.c +@@ -476,11 +476,11 @@ int n_tty_ioctl(struct tty_struct * tty, + ld = tty_ldisc_ref(tty); + switch (arg) { + case TCIFLUSH: +- if (ld->flush_buffer) ++ if (ld && ld->flush_buffer) + ld->flush_buffer(tty); + break; + case TCIOFLUSH: +- if (ld->flush_buffer) ++ if (ld && ld->flush_buffer) + ld->flush_buffer(tty); + /* fall through */ + case TCOFLUSH: +diff --git a/drivers/media/video/cx88/cx88-video.c b/drivers/media/video/cx88/cx88-video.c +--- a/drivers/media/video/cx88/cx88-video.c ++++ b/drivers/media/video/cx88/cx88-video.c +@@ -261,7 +261,7 @@ static struct cx88_ctrl cx8800_ctls[] = + .default_value = 0, + .type = V4L2_CTRL_TYPE_INTEGER, + }, +- .off = 0, ++ .off = 128, + .reg = MO_HUE, + 
.mask = 0x00ff, + .shift = 0, +diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c +--- a/drivers/net/e1000/e1000_main.c ++++ b/drivers/net/e1000/e1000_main.c +@@ -2307,6 +2307,7 @@ e1000_xmit_frame(struct sk_buff *skb, st + tso = e1000_tso(adapter, skb); + if (tso < 0) { + dev_kfree_skb_any(skb); ++ spin_unlock_irqrestore(&adapter->tx_lock, flags); + return NETDEV_TX_OK; + } + +diff --git a/drivers/net/hamradio/Kconfig b/drivers/net/hamradio/Kconfig +--- a/drivers/net/hamradio/Kconfig ++++ b/drivers/net/hamradio/Kconfig +@@ -17,7 +17,7 @@ config MKISS + + config 6PACK + tristate "Serial port 6PACK driver" +- depends on AX25 && BROKEN_ON_SMP ++ depends on AX25 + ---help--- + 6pack is a transmission protocol for the data exchange between your + PC and your TNC (the Terminal Node Controller acts as a kind of +diff --git a/drivers/net/shaper.c b/drivers/net/shaper.c +--- a/drivers/net/shaper.c ++++ b/drivers/net/shaper.c +@@ -135,10 +135,8 @@ static int shaper_start_xmit(struct sk_b + { + struct shaper *shaper = dev->priv; + struct sk_buff *ptr; +- +- if (down_trylock(&shaper->sem)) +- return -1; + ++ spin_lock(&shaper->lock); + ptr=shaper->sendq.prev; + + /* +@@ -232,7 +230,7 @@ static int shaper_start_xmit(struct sk_b + shaper->stats.collisions++; + } + shaper_kick(shaper); +- up(&shaper->sem); ++ spin_unlock(&shaper->lock); + return 0; + } + +@@ -271,11 +269,9 @@ static void shaper_timer(unsigned long d + { + struct shaper *shaper = (struct shaper *)data; + +- if (!down_trylock(&shaper->sem)) { +- shaper_kick(shaper); +- up(&shaper->sem); +- } else +- mod_timer(&shaper->timer, jiffies); ++ spin_lock(&shaper->lock); ++ shaper_kick(shaper); ++ spin_unlock(&shaper->lock); + } + + /* +@@ -332,21 +328,6 @@ static void shaper_kick(struct shaper *s + + + /* +- * Flush the shaper queues on a closedown +- */ +- +-static void shaper_flush(struct shaper *shaper) +-{ +- struct sk_buff *skb; +- +- down(&shaper->sem); +- 
while((skb=skb_dequeue(&shaper->sendq))!=NULL) +- dev_kfree_skb(skb); +- shaper_kick(shaper); +- up(&shaper->sem); +-} +- +-/* + * Bring the interface up. We just disallow this until a + * bind. + */ +@@ -375,7 +356,15 @@ static int shaper_open(struct net_device + static int shaper_close(struct net_device *dev) + { + struct shaper *shaper=dev->priv; +- shaper_flush(shaper); ++ struct sk_buff *skb; ++ ++ while ((skb = skb_dequeue(&shaper->sendq)) != NULL) ++ dev_kfree_skb(skb); ++ ++ spin_lock_bh(&shaper->lock); ++ shaper_kick(shaper); ++ spin_unlock_bh(&shaper->lock); ++ + del_timer_sync(&shaper->timer); + return 0; + } +@@ -576,6 +565,7 @@ static void shaper_init_priv(struct net_ + init_timer(&sh->timer); + sh->timer.function=shaper_timer; + sh->timer.data=(unsigned long)sh; ++ spin_lock_init(&sh->lock); + } + + /* +diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c +--- a/drivers/pci/pci-driver.c ++++ b/drivers/pci/pci-driver.c +@@ -396,7 +396,7 @@ int pci_register_driver(struct pci_drive + /* FIXME, once all of the existing PCI drivers have been fixed to set + * the pci shutdown function, this test can go away. 
*/ + if (!drv->driver.shutdown) +- drv->driver.shutdown = pci_device_shutdown, ++ drv->driver.shutdown = pci_device_shutdown; + drv->driver.owner = drv->owner; + drv->driver.kobj.ktype = &pci_driver_kobj_type; + pci_init_dynids(&drv->dynids); +diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c +--- a/drivers/scsi/qla2xxx/qla_init.c ++++ b/drivers/scsi/qla2xxx/qla_init.c +@@ -1914,9 +1914,11 @@ qla2x00_reg_remote_port(scsi_qla_host_t + rport_ids.roles |= FC_RPORT_ROLE_FCP_TARGET; + + fcport->rport = rport = fc_remote_port_add(ha->host, 0, &rport_ids); +- if (!rport) ++ if (!rport) { + qla_printk(KERN_WARNING, ha, + "Unable to allocate fc remote port!\n"); ++ return; ++ } + + if (rport->scsi_target_id != -1 && rport->scsi_target_id < MAX_TARGETS) + fcport->os_target_id = rport->scsi_target_id; +diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c +--- a/drivers/scsi/qla2xxx/qla_os.c ++++ b/drivers/scsi/qla2xxx/qla_os.c +@@ -1150,7 +1150,7 @@ iospace_error_exit: + */ + int qla2x00_probe_one(struct pci_dev *pdev, struct qla_board_info *brd_info) + { +- int ret; ++ int ret = -ENODEV; + device_reg_t __iomem *reg; + struct Scsi_Host *host; + scsi_qla_host_t *ha; +@@ -1161,7 +1161,7 @@ int qla2x00_probe_one(struct pci_dev *pd + fc_port_t *fcport; + + if (pci_enable_device(pdev)) +- return -1; ++ goto probe_out; + + host = scsi_host_alloc(&qla2x00_driver_template, + sizeof(scsi_qla_host_t)); +@@ -1183,9 +1183,8 @@ int qla2x00_probe_one(struct pci_dev *pd + + /* Configure PCI I/O space */ + ret = qla2x00_iospace_config(ha); +- if (ret != 0) { +- goto probe_alloc_failed; +- } ++ if (ret) ++ goto probe_failed; + + /* Sanitize the information from PCI BIOS. 
*/ + host->irq = pdev->irq; +@@ -1258,23 +1257,10 @@ int qla2x00_probe_one(struct pci_dev *pd + qla_printk(KERN_WARNING, ha, + "[ERROR] Failed to allocate memory for adapter\n"); + +- goto probe_alloc_failed; ++ ret = -ENOMEM; ++ goto probe_failed; + } + +- pci_set_drvdata(pdev, ha); +- host->this_id = 255; +- host->cmd_per_lun = 3; +- host->unique_id = ha->instance; +- host->max_cmd_len = MAX_CMDSZ; +- host->max_channel = ha->ports - 1; +- host->max_id = ha->max_targets; +- host->max_lun = ha->max_luns; +- host->transportt = qla2xxx_transport_template; +- if (scsi_add_host(host, &pdev->dev)) +- goto probe_alloc_failed; +- +- qla2x00_alloc_sysfs_attr(ha); +- + if (qla2x00_initialize_adapter(ha) && + !(ha->device_flags & DFLG_NO_CABLE)) { + +@@ -1285,11 +1271,10 @@ int qla2x00_probe_one(struct pci_dev *pd + "Adapter flags %x.\n", + ha->host_no, ha->device_flags)); + ++ ret = -ENODEV; + goto probe_failed; + } + +- qla2x00_init_host_attr(ha); +- + /* + * Startup the kernel thread for this host adapter + */ +@@ -1299,17 +1284,26 @@ int qla2x00_probe_one(struct pci_dev *pd + qla_printk(KERN_WARNING, ha, + "Unable to start DPC thread!\n"); + ++ ret = -ENODEV; + goto probe_failed; + } + wait_for_completion(&ha->dpc_inited); + ++ host->this_id = 255; ++ host->cmd_per_lun = 3; ++ host->unique_id = ha->instance; ++ host->max_cmd_len = MAX_CMDSZ; ++ host->max_channel = ha->ports - 1; ++ host->max_lun = MAX_LUNS; ++ host->transportt = qla2xxx_transport_template; ++ + if (IS_QLA2100(ha) || IS_QLA2200(ha)) + ret = request_irq(host->irq, qla2100_intr_handler, + SA_INTERRUPT|SA_SHIRQ, ha->brd_info->drv_name, ha); + else + ret = request_irq(host->irq, qla2300_intr_handler, + SA_INTERRUPT|SA_SHIRQ, ha->brd_info->drv_name, ha); +- if (ret != 0) { ++ if (ret) { + qla_printk(KERN_WARNING, ha, + "Failed to reserve interrupt %d already in use.\n", + host->irq); +@@ -1363,9 +1357,18 @@ int qla2x00_probe_one(struct pci_dev *pd + msleep(10); + } + ++ pci_set_drvdata(pdev, ha); + 
ha->flags.init_done = 1; + num_hosts++; + ++ ret = scsi_add_host(host, &pdev->dev); ++ if (ret) ++ goto probe_failed; ++ ++ qla2x00_alloc_sysfs_attr(ha); ++ ++ qla2x00_init_host_attr(ha); ++ + qla_printk(KERN_INFO, ha, "\n" + " QLogic Fibre Channel HBA Driver: %s\n" + " QLogic %s - %s\n" +@@ -1384,9 +1387,6 @@ int qla2x00_probe_one(struct pci_dev *pd + probe_failed: + fc_remove_host(ha->host); + +- scsi_remove_host(host); +- +-probe_alloc_failed: + qla2x00_free_device(ha); + + scsi_host_put(host); +@@ -1394,7 +1394,8 @@ probe_alloc_failed: + probe_disable_device: + pci_disable_device(pdev); + +- return -1; ++probe_out: ++ return ret; + } + EXPORT_SYMBOL_GPL(qla2x00_probe_one); + +diff --git a/fs/bio.c b/fs/bio.c +--- a/fs/bio.c ++++ b/fs/bio.c +@@ -261,6 +261,7 @@ inline void __bio_clone(struct bio *bio, + */ + bio->bi_vcnt = bio_src->bi_vcnt; + bio->bi_size = bio_src->bi_size; ++ bio->bi_idx = bio_src->bi_idx; + bio_phys_segments(q, bio); + bio_hw_segments(q, bio); + } +diff --git a/fs/char_dev.c b/fs/char_dev.c +--- a/fs/char_dev.c ++++ b/fs/char_dev.c +@@ -139,7 +139,7 @@ __unregister_chrdev_region(unsigned majo + struct char_device_struct *cd = NULL, **cp; + int i = major_to_index(major); + +- up(&chrdevs_lock); ++ down(&chrdevs_lock); + for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next) + if ((*cp)->major == major && + (*cp)->baseminor == baseminor && +diff --git a/fs/exec.c b/fs/exec.c +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -649,6 +649,7 @@ static inline int de_thread(struct task_ + } + sig->group_exit_task = NULL; + sig->notify_count = 0; ++ sig->real_timer.data = (unsigned long)current; + spin_unlock_irq(lock); + + /* +diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c +--- a/fs/isofs/compress.c ++++ b/fs/isofs/compress.c +@@ -129,8 +129,14 @@ static int zisofs_readpage(struct file * + cend = le32_to_cpu(*(__le32 *)(bh->b_data + (blockendptr & bufmask))); + brelse(bh); + ++ if (cstart > cend) ++ goto eio; ++ + csize = cend-cstart; + ++ if (csize > 
deflateBound(1UL << zisofs_block_shift)) ++ goto eio; ++ + /* Now page[] contains an array of pages, any of which can be NULL, + and the locks on which we hold. We should now read the data and + release the pages. If the pages are NULL the decompressed data +diff --git a/include/asm-i386/string.h b/include/asm-i386/string.h +--- a/include/asm-i386/string.h ++++ b/include/asm-i386/string.h +@@ -116,7 +116,8 @@ __asm__ __volatile__( + "orb $1,%%al\n" + "3:" + :"=a" (__res), "=&S" (d0), "=&D" (d1) +- :"1" (cs),"2" (ct)); ++ :"1" (cs),"2" (ct) ++ :"memory"); + return __res; + } + +@@ -138,8 +139,9 @@ __asm__ __volatile__( + "3:\tsbbl %%eax,%%eax\n\t" + "orb $1,%%al\n" + "4:" +- :"=a" (__res), "=&S" (d0), "=&D" (d1), "=&c" (d2) +- :"1" (cs),"2" (ct),"3" (count)); ++ :"=a" (__res), "=&S" (d0), "=&D" (d1), "=&c" (d2) ++ :"1" (cs),"2" (ct),"3" (count) ++ :"memory"); + return __res; + } + +@@ -158,7 +160,9 @@ __asm__ __volatile__( + "movl $1,%1\n" + "2:\tmovl %1,%0\n\t" + "decl %0" +- :"=a" (__res), "=&S" (d0) : "1" (s),"0" (c)); ++ :"=a" (__res), "=&S" (d0) ++ :"1" (s),"0" (c) ++ :"memory"); + return __res; + } + +@@ -175,7 +179,9 @@ __asm__ __volatile__( + "leal -1(%%esi),%0\n" + "2:\ttestb %%al,%%al\n\t" + "jne 1b" +- :"=g" (__res), "=&S" (d0), "=&a" (d1) :"0" (0),"1" (s),"2" (c)); ++ :"=g" (__res), "=&S" (d0), "=&a" (d1) ++ :"0" (0),"1" (s),"2" (c) ++ :"memory"); + return __res; + } + +@@ -189,7 +195,9 @@ __asm__ __volatile__( + "scasb\n\t" + "notl %0\n\t" + "decl %0" +- :"=c" (__res), "=&D" (d0) :"1" (s),"a" (0), "0" (0xffffffffu)); ++ :"=c" (__res), "=&D" (d0) ++ :"1" (s),"a" (0), "0" (0xffffffffu) ++ :"memory"); + return __res; + } + +@@ -333,7 +341,9 @@ __asm__ __volatile__( + "je 1f\n\t" + "movl $1,%0\n" + "1:\tdecl %0" +- :"=D" (__res), "=&c" (d0) : "a" (c),"0" (cs),"1" (count)); ++ :"=D" (__res), "=&c" (d0) ++ :"a" (c),"0" (cs),"1" (count) ++ :"memory"); + return __res; + } + +@@ -369,7 +379,7 @@ __asm__ __volatile__( + "je 2f\n\t" + "stosb\n" + "2:" +- : "=&c" 
(d0), "=&D" (d1) ++ :"=&c" (d0), "=&D" (d1) + :"a" (c), "q" (count), "0" (count/4), "1" ((long) s) + :"memory"); + return (s); +@@ -392,7 +402,8 @@ __asm__ __volatile__( + "jne 1b\n" + "3:\tsubl %2,%0" + :"=a" (__res), "=&d" (d0) +- :"c" (s),"1" (count)); ++ :"c" (s),"1" (count) ++ :"memory"); + return __res; + } + /* end of additional stuff */ +@@ -473,7 +484,8 @@ static inline void * memscan(void * addr + "dec %%edi\n" + "1:" + : "=D" (addr), "=c" (size) +- : "0" (addr), "1" (size), "a" (c)); ++ : "0" (addr), "1" (size), "a" (c) ++ : "memory"); + return addr; + } + +diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h +--- a/include/asm-x86_64/smp.h ++++ b/include/asm-x86_64/smp.h +@@ -46,6 +46,8 @@ extern int pic_mode; + extern int smp_num_siblings; + extern void smp_flush_tlb(void); + extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); ++extern int smp_call_function_single (int cpuid, void (*func) (void *info), void *info, ++ int retry, int wait); + extern void smp_send_reschedule(int cpu); + extern void smp_invalidate_rcv(void); /* Process an NMI */ + extern void zap_low_mappings(void); +diff --git a/include/linux/if_shaper.h b/include/linux/if_shaper.h +--- a/include/linux/if_shaper.h ++++ b/include/linux/if_shaper.h +@@ -23,7 +23,7 @@ struct shaper + __u32 shapeclock; + unsigned long recovery; /* Time we can next clock a packet out on + an empty queue */ +- struct semaphore sem; ++ spinlock_t lock; + struct net_device_stats stats; + struct net_device *dev; + int (*hard_start_xmit) (struct sk_buff *skb, +diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h +--- a/include/linux/skbuff.h ++++ b/include/linux/skbuff.h +@@ -1192,7 +1192,7 @@ static inline void *skb_header_pointer(c + { + int hlen = skb_headlen(skb); + +- if (offset + len <= hlen) ++ if (hlen - offset >= len) + return skb->data + offset; + + if (skb_copy_bits(skb, offset, buffer, len) < 0) +diff --git a/include/linux/zlib.h b/include/linux/zlib.h +--- 
a/include/linux/zlib.h ++++ b/include/linux/zlib.h +@@ -506,6 +506,11 @@ extern int zlib_deflateReset (z_streamp + stream state was inconsistent (such as zalloc or state being NULL). + */ + ++static inline unsigned long deflateBound(unsigned long s) ++{ ++ return s + ((s + 7) >> 3) + ((s + 63) >> 6) + 11; ++} ++ + extern int zlib_deflateParams (z_streamp strm, int level, int strategy); + /* + Dynamically update the compression level and compression strategy. The +diff --git a/kernel/module.c b/kernel/module.c +--- a/kernel/module.c ++++ b/kernel/module.c +@@ -249,13 +249,18 @@ static inline unsigned int block_size(in + /* Created by linker magic */ + extern char __per_cpu_start[], __per_cpu_end[]; + +-static void *percpu_modalloc(unsigned long size, unsigned long align) ++static void *percpu_modalloc(unsigned long size, unsigned long align, ++ const char *name) + { + unsigned long extra; + unsigned int i; + void *ptr; + +- BUG_ON(align > SMP_CACHE_BYTES); ++ if (align > SMP_CACHE_BYTES) { ++ printk(KERN_WARNING "%s: per-cpu alignment %li > %i\n", ++ name, align, SMP_CACHE_BYTES); ++ align = SMP_CACHE_BYTES; ++ } + + ptr = __per_cpu_start; + for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { +@@ -347,7 +352,8 @@ static int percpu_modinit(void) + } + __initcall(percpu_modinit); + #else /* ... !CONFIG_SMP */ +-static inline void *percpu_modalloc(unsigned long size, unsigned long align) ++static inline void *percpu_modalloc(unsigned long size, unsigned long align, ++ const char *name) + { + return NULL; + } +@@ -1554,7 +1560,8 @@ static struct module *load_module(void _ + if (pcpuindex) { + /* We have a special allocation for this section. 
*/ + percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, +- sechdrs[pcpuindex].sh_addralign); ++ sechdrs[pcpuindex].sh_addralign, ++ mod->name); + if (!percpu) { + err = -ENOMEM; + goto free_mod; +diff --git a/lib/inflate.c b/lib/inflate.c +--- a/lib/inflate.c ++++ b/lib/inflate.c +@@ -326,7 +326,7 @@ DEBG("huft1 "); + { + *t = (struct huft *)NULL; + *m = 0; +- return 0; ++ return 2; + } + + DEBG("huft2 "); +@@ -374,6 +374,7 @@ DEBG("huft5 "); + if ((j = *p++) != 0) + v[x[j]++] = i; + } while (++i < n); ++ n = x[g]; /* set n to length of v */ + + DEBG("h6 "); + +@@ -410,12 +411,13 @@ DEBG1("1 "); + DEBG1("2 "); + f -= a + 1; /* deduct codes from patterns left */ + xp = c + k; +- while (++j < z) /* try smaller tables up to z bits */ +- { +- if ((f <<= 1) <= *++xp) +- break; /* enough codes to use up j bits */ +- f -= *xp; /* else deduct codes from patterns */ +- } ++ if (j < z) ++ while (++j < z) /* try smaller tables up to z bits */ ++ { ++ if ((f <<= 1) <= *++xp) ++ break; /* enough codes to use up j bits */ ++ f -= *xp; /* else deduct codes from patterns */ ++ } + } + DEBG1("3 "); + z = 1 << j; /* table entries for j-bit table */ +diff --git a/lib/zlib_inflate/inftrees.c b/lib/zlib_inflate/inftrees.c +--- a/lib/zlib_inflate/inftrees.c ++++ b/lib/zlib_inflate/inftrees.c +@@ -141,7 +141,7 @@ static int huft_build( + { + *t = NULL; + *m = 0; +- return Z_OK; ++ return Z_DATA_ERROR; + } + + +diff --git a/mm/memory.c b/mm/memory.c +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -1164,7 +1164,7 @@ int remap_pfn_range(struct vm_area_struc + { + pgd_t *pgd; + unsigned long next; +- unsigned long end = addr + size; ++ unsigned long end = addr + PAGE_ALIGN(size); + struct mm_struct *mm = vma->vm_mm; + int err; + +diff --git a/mm/mempolicy.c b/mm/mempolicy.c +--- a/mm/mempolicy.c ++++ b/mm/mempolicy.c +@@ -409,7 +409,7 @@ asmlinkage long sys_set_mempolicy(int mo + struct mempolicy *new; + DECLARE_BITMAP(nodes, MAX_NUMNODES); + +- if (mode > MPOL_MAX) ++ if (mode < 0 || mode > 
MPOL_MAX) + return -EINVAL; + err = get_nodes(nodes, nmask, maxnode, mode); + if (err) +diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c +--- a/net/8021q/vlan.c ++++ b/net/8021q/vlan.c +@@ -578,6 +578,14 @@ static int vlan_device_event(struct noti + if (!vlandev) + continue; + ++ if (netif_carrier_ok(dev)) { ++ if (!netif_carrier_ok(vlandev)) ++ netif_carrier_on(vlandev); ++ } else { ++ if (netif_carrier_ok(vlandev)) ++ netif_carrier_off(vlandev); ++ } ++ + if ((vlandev->state & VLAN_LINK_STATE_MASK) != flgs) { + vlandev->state = (vlandev->state &~ VLAN_LINK_STATE_MASK) + | flgs; +diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c +--- a/net/ipv4/ip_output.c ++++ b/net/ipv4/ip_output.c +@@ -111,7 +111,6 @@ static int ip_dev_loopback_xmit(struct s + #ifdef CONFIG_NETFILTER_DEBUG + nf_debug_ip_loopback_xmit(newskb); + #endif +- nf_reset(newskb); + netif_rx(newskb); + return 0; + } +@@ -196,8 +195,6 @@ static inline int ip_finish_output2(stru + nf_debug_ip_finish_output2(skb); + #endif /*CONFIG_NETFILTER_DEBUG*/ + +- nf_reset(skb); +- + if (hh) { + int hh_alen; + +diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c +--- a/net/ipv4/netfilter/ip_conntrack_core.c ++++ b/net/ipv4/netfilter/ip_conntrack_core.c +@@ -1124,6 +1124,9 @@ void ip_conntrack_cleanup(void) + schedule(); + goto i_see_dead_people; + } ++ /* wait until all references to ip_conntrack_untracked are dropped */ ++ while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1) ++ schedule(); + + kmem_cache_destroy(ip_conntrack_cachep); + kmem_cache_destroy(ip_conntrack_expect_cachep); +diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c +--- a/net/ipv4/netfilter/ip_conntrack_standalone.c ++++ b/net/ipv4/netfilter/ip_conntrack_standalone.c +@@ -432,6 +432,13 @@ static unsigned int ip_conntrack_defrag( + const struct net_device *out, + int (*okfn)(struct sk_buff *)) + { ++#if !defined(CONFIG_IP_NF_NAT) && 
!defined(CONFIG_IP_NF_NAT_MODULE) ++ /* Previously seen (loopback)? Ignore. Do this before ++ fragment check. */ ++ if ((*pskb)->nfct) ++ return NF_ACCEPT; ++#endif ++ + /* Gather fragments. */ + if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { + *pskb = ip_ct_gather_frags(*pskb, +diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c +--- a/net/ipv4/netfilter/ip_nat_proto_tcp.c ++++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c +@@ -40,7 +40,8 @@ tcp_unique_tuple(struct ip_conntrack_tup + enum ip_nat_manip_type maniptype, + const struct ip_conntrack *conntrack) + { +- static u_int16_t port, *portptr; ++ static u_int16_t port; ++ u_int16_t *portptr; + unsigned int range_size, min, i; + + if (maniptype == IP_NAT_MANIP_SRC) +diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c +--- a/net/ipv4/netfilter/ip_nat_proto_udp.c ++++ b/net/ipv4/netfilter/ip_nat_proto_udp.c +@@ -41,7 +41,8 @@ udp_unique_tuple(struct ip_conntrack_tup + enum ip_nat_manip_type maniptype, + const struct ip_conntrack *conntrack) + { +- static u_int16_t port, *portptr; ++ static u_int16_t port; ++ u_int16_t *portptr; + unsigned int range_size, min, i; + + if (maniptype == IP_NAT_MANIP_SRC) +diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c +--- a/net/ipv6/netfilter/ip6_queue.c ++++ b/net/ipv6/netfilter/ip6_queue.c +@@ -76,7 +76,9 @@ static DECLARE_MUTEX(ipqnl_sem); + static void + ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict) + { ++ local_bh_disable(); + nf_reinject(entry->skb, entry->info, verdict); ++ local_bh_enable(); + kfree(entry); + } + +diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c +--- a/net/netlink/af_netlink.c ++++ b/net/netlink/af_netlink.c +@@ -315,8 +315,8 @@ err: + static void netlink_remove(struct sock *sk) + { + netlink_table_grab(); +- nl_table[sk->sk_protocol].hash.entries--; +- sk_del_node_init(sk); ++ if (sk_del_node_init(sk)) ++ 
nl_table[sk->sk_protocol].hash.entries--; + if (nlk_sk(sk)->groups) + __sk_del_bind_node(sk); + netlink_table_ungrab(); +@@ -429,7 +429,12 @@ retry: + err = netlink_insert(sk, pid); + if (err == -EADDRINUSE) + goto retry; +- return 0; ++ ++ /* If 2 threads race to autobind, that is fine. */ ++ if (err == -EBUSY) ++ err = 0; ++ ++ return err; + } + + static inline int netlink_capable(struct socket *sock, unsigned int flag) +diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c +--- a/net/packet/af_packet.c ++++ b/net/packet/af_packet.c +@@ -274,6 +274,9 @@ static int packet_rcv_spkt(struct sk_buf + dst_release(skb->dst); + skb->dst = NULL; + ++ /* drop conntrack reference */ ++ nf_reset(skb); ++ + spkt = (struct sockaddr_pkt*)skb->cb; + + skb_push(skb, skb->data-skb->mac.raw); +@@ -517,6 +520,9 @@ static int packet_rcv(struct sk_buff *sk + dst_release(skb->dst); + skb->dst = NULL; + ++ /* drop conntrack reference */ ++ nf_reset(skb); ++ + spin_lock(&sk->sk_receive_queue.lock); + po->stats.tp_packets++; + __skb_queue_tail(&sk->sk_receive_queue, skb); +diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c +--- a/net/xfrm/xfrm_user.c ++++ b/net/xfrm/xfrm_user.c +@@ -1180,6 +1180,9 @@ static struct xfrm_policy *xfrm_compile_ + if (nr > XFRM_MAX_DEPTH) + return NULL; + ++ if (p->dir > XFRM_POLICY_OUT) ++ return NULL; ++ + xp = xfrm_policy_alloc(GFP_KERNEL); + if (xp == NULL) { + *dir = -ENOBUFS; +diff --git a/security/keys/keyring.c b/security/keys/keyring.c +--- a/security/keys/keyring.c ++++ b/security/keys/keyring.c +@@ -188,7 +188,11 @@ static void keyring_destroy(struct key * + + if (keyring->description) { + write_lock(&keyring_name_lock); +- list_del(&keyring->type_data.link); ++ ++ if (keyring->type_data.link.next != NULL && ++ !list_empty(&keyring->type_data.link)) ++ list_del(&keyring->type_data.link); ++ + write_unlock(&keyring_name_lock); + } + +diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c +--- 
a/security/keys/process_keys.c ++++ b/security/keys/process_keys.c +@@ -641,7 +641,7 @@ long join_session_keyring(const char *na + keyring = keyring_alloc(name, tsk->uid, tsk->gid, 0, NULL); + if (IS_ERR(keyring)) { + ret = PTR_ERR(keyring); +- goto error; ++ goto error2; + } + } + else if (IS_ERR(keyring)) {
diff -r 5f1ed597f107 -r 8799d14bef77 tools/blktap/parallax/Makefile
--- /dev/null Wed Aug 24 02:43:18 2005
+++ b/tools/blktap/parallax/Makefile Thu Aug 25 22:53:20 2005
@@ -0,0 +1,68 @@
+XEN_ROOT = ../../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+PARALLAX_INSTALL_DIR = /usr/sbin
+
+INSTALL = install
+INSTALL_PROG = $(INSTALL) -m0755
+INSTALL_DIR = $(INSTALL) -d -m0755
+
+INCLUDES += -I.. -I/usr/include -I $(XEN_LIBXC)
+
+# Library search path and libraries. Placed at the END of every link command
+# below so the libraries follow the sources that reference them.
+LDFLAGS = -L.. -lpthread -lz -lblktap
+
+# Sources shared by parallax and by every vdi_* tool (see VDI_SRCS).
+PLX_SRCS := vdi.c
+PLX_SRCS += radix.c
+PLX_SRCS += snaplog.c
+PLX_SRCS += blockstore.c
+PLX_SRCS += block-async.c
+PLX_SRCS += requests-async.c
+VDI_SRCS := $(PLX_SRCS)
+PLX_SRCS += parallax.c
+
+# Stand-alone tools; each one is built from <tool>.c plus $(VDI_SRCS).
+VDI_TOOLS := vdi_create
+VDI_TOOLS += vdi_list
+VDI_TOOLS += vdi_snap
+VDI_TOOLS += vdi_snap_list
+VDI_TOOLS += vdi_snap_delete
+VDI_TOOLS += vdi_fill
+VDI_TOOLS += vdi_tree
+VDI_TOOLS += vdi_validate
+
+CFLAGS += -Wall
+CFLAGS += -Werror
+CFLAGS += -Wno-unused
+#CFLAGS += -O3
+CFLAGS += -g3
+CFLAGS += -fno-strict-aliasing
+CFLAGS += $(INCLUDES)
+CFLAGS += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE
+# Get gcc to generate the dependencies for us.
+CFLAGS += -Wp,-MD,.$(@F).d
+DEPS = .*.d
+
+OBJS = $(patsubst %.c,%.o,$(SRCS))
+IBINS = parallax $(VDI_TOOLS)
+
+# NOTE(review): blockstored has no rule in this file; presumably it is built
+# from blockstored.c by make's built-in "%: %.c" rule - confirm.
+all: $(VDI_TOOLS) parallax blockstored
+
+# Create the destination first so a staged (DESTDIR) install into an empty
+# tree does not fail.
+install: all
+	$(INSTALL_DIR) -p $(DESTDIR)$(PARALLAX_INSTALL_DIR)
+	$(INSTALL_PROG) $(IBINS) $(DESTDIR)$(PARALLAX_INSTALL_DIR)
+
+clean:
+	rm -rf *.o *~ $(DEPS) xen TAGS $(VDI_TOOLS) parallax blockstored vdi_unittest
+
+parallax: $(PLX_SRCS)
+	$(CC) $(CFLAGS) -o $@ $(PLX_SRCS) $(LDFLAGS)
+
+$(VDI_TOOLS): %: %.c $(VDI_SRCS)
+	$(CC) $(CFLAGS) -o $@ $< $(VDI_SRCS) $(LDFLAGS)
+
+.PHONY: all TAGS clean install rpm
+-include $(DEPS)
\ No newline at end of file
diff -r 5f1ed597f107 -r 8799d14bef77 tools/console/Makefile
--- /dev/null Wed Aug 24 02:43:18 2005
+++ b/tools/console/Makefile Thu Aug 25 22:53:20 2005
@@ -0,0 +1,42 @@
+
+XEN_ROOT=../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+DAEMON_INSTALL_DIR = /usr/sbin
+CLIENT_INSTALL_DIR = /usr/libexec/xen
+
+INSTALL = install
+INSTALL_PROG = $(INSTALL) -m0755
+INSTALL_DIR = $(INSTALL) -d -m0755
+
+# Extend (do not replace) the flags set up by Rules.mk. CC is no longer forced
+# to gcc here, so the compiler chosen by the including makefiles/user is used.
+CFLAGS += -Wall -Werror -g3
+
+CFLAGS += -I $(XEN_XCS)
+CFLAGS += -I $(XEN_LIBXC)
+CFLAGS += -I $(XEN_XENSTORE)
+
+BIN = xenconsoled xenconsole
+
+.PHONY: all clean install
+
+all: $(BIN)
+
+clean:
+	$(RM) *.a *.so *.o *.rpm $(BIN)
+	$(RM) client/*.o daemon/*.o
+
+xenconsoled: $(patsubst %.c,%.o,$(wildcard daemon/*.c))
+	$(CC) $(CFLAGS) $^ -o $@ -L$(XEN_LIBXC) -L$(XEN_XENSTORE) \
+	      -lxenctrl -lxenstore
+
+xenconsole: $(patsubst %.c,%.o,$(wildcard client/*.c))
+	$(CC) $(CFLAGS) $^ -o $@ -L$(XEN_LIBXC) -L$(XEN_XENSTORE) \
+	      -lxenctrl -lxenstore
+
+install: $(BIN)
+	$(INSTALL_DIR) -p $(DESTDIR)/$(DAEMON_INSTALL_DIR)
+	$(INSTALL_PROG) xenconsoled $(DESTDIR)/$(DAEMON_INSTALL_DIR)
+	$(INSTALL_DIR) -p $(DESTDIR)/$(CLIENT_INSTALL_DIR)
+	$(INSTALL_PROG) xenconsole $(DESTDIR)/$(CLIENT_INSTALL_DIR)
diff -r 5f1ed597f107 -r 8799d14bef77 tools/console/client/main.c
--- /dev/null Wed Aug 24 02:43:18 2005
+++ b/tools/console/client/main.c Thu Aug 25 22:53:20 2005
@@ -0,0 +1,236 @@
+/*\
+ * Copyright (C) International Business Machines Corp., 2005
+ * Author(s): Anthony Liguori <aliguori@xxxxxxxxxx>
+ *
+ * Xen Console Daemon
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+\*/
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <time.h>
+#include <fcntl.h>
+#include <sys/wait.h>
+#include <termios.h>
+#include <signal.h>
+#include <getopt.h>
+#include <sys/select.h>
+#include <err.h>
+#include <errno.h>
+#include <pty.h>
+
+#include "xenctrl.h"
+#include "xs.h"
+
+/* Typing this single byte (Ctrl-]) on stdin leaves the console. */
+#define ESCAPE_CHARACTER 0x1d
+
+/* Set by sighandler(); polled by console_loop() to terminate. */
+static volatile sig_atomic_t received_signal = 0;
+
+static void sighandler(int signum)
+{
+	received_signal = 1;
+}
+
+/*
+ * Write all `size' bytes of `data' to `fd', looping over short writes.
+ * Returns false as soon as write() reports an error or writes nothing.
+ */
+static bool write_sync(int fd, const void *data, size_t size)
+{
+	/* Byte pointer: arithmetic on void * is a GNU extension. */
+	const char *bytes = data;
+	size_t offset = 0;
+	ssize_t len;
+
+	while (offset < size) {
+		len = write(fd, bytes + offset, size - offset);
+		if (len < 1) {
+			return false;
+		}
+		offset += len;
+	}
+
+	return true;
+}
+
+/* Print the command line synopsis to stdout. */
+static void usage(const char *program) {
+	printf("Usage: %s [OPTION] DOMID\n"
+	       "Attaches to a virtual domain console\n"
+	       "\n"
+	       " -h, --help display this help and exit\n"
+	       , program);
+}
+
+/*
+ * Save the current terminal settings of `fd' in `old' and switch it to
+ * raw mode.
+ */
+/* don't worry too much if setting terminal attributes fail */
+static void init_term(int fd, struct termios *old)
+{
+	struct termios new_term;
+
+	if (tcgetattr(fd, old) == -1) {
+		return;
+	}
+
+	new_term = *old;
+	cfmakeraw(&new_term);
+
+	tcsetattr(fd, TCSAFLUSH, &new_term);
+}
+
+/* Put back the settings saved by init_term(). */
+static void restore_term(int fd, struct termios *old)
+{
+	tcsetattr(fd, TCSAFLUSH, old);
+}
+
+/*
+ * Shuttle bytes between stdin/stdout and the console tty `fd' until the
+ * escape character is typed alone, a signal is received, or an I/O error
+ * or end-of-file occurs.  Returns 0 on escape/signal and -1 otherwise.
+ * `xc_handle' and `domid' are currently unused.
+ */
+static int console_loop(int xc_handle, domid_t domid, int fd)
+{
+	int ret;
+
+	do {
+		fd_set fds;
+
+		FD_ZERO(&fds);
+		FD_SET(STDIN_FILENO, &fds);
+		FD_SET(fd, &fds);
+
+		ret = select(fd + 1, &fds, NULL, NULL, NULL);
+		if (ret == -1) {
+			if (errno == EINTR || errno == EAGAIN) {
+				continue;
+			}
+			return -1;
+		}
+
+		if (FD_ISSET(STDIN_FILENO, &fds)) {
+			ssize_t len;
+			char msg[60];
+
+			len = read(STDIN_FILENO, msg, sizeof(msg));
+			if (len == 1 && msg[0] == ESCAPE_CHARACTER) {
+				return 0;
+			}
+
+			if (len == 0 || len == -1) {
+				if (len == -1 &&
+				    (errno == EINTR || errno == EAGAIN)) {
+					continue;
+				}
+				return -1;
+			}
+
+			if (!write_sync(fd, msg, len)) {
+				perror("write() failed");
+				return -1;
+			}
+		}
+
+		if (FD_ISSET(fd, &fds)) {
+			ssize_t len;
+			char msg[512];
+
+			len = read(fd, msg, sizeof(msg));
+			if (len == 0 || len == -1) {
+				if (len == -1 &&
+				    (errno == EINTR || errno == EAGAIN)) {
+					continue;
+				}
+				return -1;
+			}
+
+			if (!write_sync(STDOUT_FILENO, msg, len)) {
+				perror("write() failed");
+				return -1;
+			}
+		}
+	} while (received_signal == 0);
+
+	return 0;
+}
+
+/*
+ * xenconsole DOMID: look up the domain's console tty in xenstore
+ * (/console/DOMID/tty), open it and relay it to the controlling terminal.
+ */
+int main(int argc, char **argv)
+{
+	struct termios attr;
+	int domid;
+	int xc_handle;
+	char *sopt = "h";
+	int ch;
+	int opt_ind=0;
+	struct option lopt[] = {
+		{ "help", 0, 0, 'h' },
+		{ 0 },
+
+	};
+	char *str_pty;
+	char path[1024];
+	int spty;
+	unsigned int len = 0;
+	struct xs_handle *xs;
+	char *end;
+
+	while((ch = getopt_long(argc, argv, sopt, lopt, &opt_ind)) != -1) {
+		switch(ch) {
+		case 'h':
+			usage(argv[0]);
+			exit(0);
+			break;
+		}
+	}
+
+	if ((argc - optind) != 1) {
+		fprintf(stderr, "Invalid number of arguments\n");
+		fprintf(stderr, "Try `%s --help' for more information.\n",
+			argv[0]);
+		exit(EINVAL);
+	}
+
+	domid = strtol(argv[optind], &end, 10);
+	if (end && *end) {
+		fprintf(stderr, "Invalid DOMID `%s'\n", argv[optind]);
+		fprintf(stderr, "Try `%s --help' for more information.\n",
+			argv[0]);
+		exit(EINVAL);
+	}
+
+	xs = xs_daemon_open();
+	if (xs == NULL) {
+		err(errno, "Could not contact XenStore");
+	}
+
+	xc_handle = xc_interface_open();
+	if (xc_handle == -1) {
+		err(errno, "xc_interface_open()");
+	}
+
+	signal(SIGTERM, sighandler);
+
+	snprintf(path, sizeof(path), "/console/%d/tty", domid);
+	str_pty = xs_read(xs, path, &len);
+	/* FIXME consoled currently does not assume domain-0 doesn't have a
+	   console which is good when we break domain-0 up. To keep us
+	   user friendly, we'll bail out here since no data will ever show
+	   up on domain-0. */
+	if (domid == 0) {
+		/* Not an errno failure: err(errno, ...) could print
+		   ": Success" and exit with status 0 here. */
+		free(str_pty);
+		errx(ENOENT, "Could not read tty from store");
+	}
+	if (str_pty == NULL) {
+		/* Never report success (exit status 0) on this path. */
+		err(errno ? errno : ENOENT, "Could not read tty from store");
+	}
+	spty = open(str_pty, O_RDWR | O_NOCTTY);
+	if (spty == -1) {
+		err(errno, "Could not open tty `%s'", str_pty);
+	}
+	free(str_pty);
+
+	init_term(STDIN_FILENO, &attr);
+	console_loop(xc_handle, domid, spty);
+	restore_term(STDIN_FILENO, &attr);
+
+	return 0;
+ }
diff -r 5f1ed597f107 -r 8799d14bef77 tools/console/daemon/io.c
--- /dev/null Wed Aug 24 02:43:18 2005
+++ b/tools/console/daemon/io.c Thu Aug 25 22:53:20 2005
@@ -0,0 +1,362 @@
+/*\
+ * Copyright (C) International Business Machines Corp., 2005
+ * Author(s): Anthony Liguori <aliguori@xxxxxxxxxx>
+ *
+ * Xen Console Daemon
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+\*/
+
+#define _GNU_SOURCE
+
+#include "utils.h"
+#include "io.h"
+
+#include "xenctrl.h"
+#include "xs.h"
+#include "xen/io/domain_controller.h"
+#include "xcs_proto.h"
+
+#include <malloc.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/select.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <termios.h>
+
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+
+/* Growable byte queue holding console output not yet written to the tty. */
+struct buffer
+{
+	char *data;		/* heap storage, NULL while empty */
+	size_t size;		/* bytes currently queued */
+	size_t capacity;	/* bytes allocated for data */
+	size_t max_capacity;	/* 0 = unlimited, else cap on queued bytes */
+};
+
+/*
+ * Append `size' bytes from `data' to the buffer, growing it as needed.
+ * When max_capacity is non-zero only the newest max_capacity bytes are
+ * kept.  Exits the process if the buffer cannot be grown.
+ */
+static void buffer_append(struct buffer *buffer, const void *data, size_t size)
+{
+	if ((buffer->capacity - buffer->size) < size) {
+		buffer->capacity += (size + 1024);
+		buffer->data = realloc(buffer->data, buffer->capacity);
+		if (buffer->data == NULL) {
+			dolog(LOG_ERR, "Memory allocation failed");
+			exit(ENOMEM);
+		}
+	}
+
+	memcpy(buffer->data + buffer->size, data, size);
+	buffer->size += size;
+
+	if (buffer->max_capacity &&
+	    buffer->size > buffer->max_capacity) {
+		size_t excess = buffer->size - buffer->max_capacity;
+		char *shrunk;
+
+		/* Slide the newest bytes to the front (the old code copied
+		   in the opposite direction) ... */
+		memmove(buffer->data, buffer->data + excess,
+			buffer->max_capacity);
+		/* ... and clamp size, otherwise the next append would write
+		   past the shrunken allocation. */
+		buffer->size = buffer->max_capacity;
+
+		/* A failed shrink keeps the old, larger block usable. */
+		shrunk = realloc(buffer->data, buffer->max_capacity);
+		if (shrunk != NULL) {
+			buffer->data = shrunk;
+			buffer->capacity = buffer->max_capacity;
+		}
+	}
+}
+
+/* True when nothing is queued. */
+static bool buffer_empty(struct buffer *buffer)
+{
+	return buffer->size == 0;
+}
+
+/* Discard up to `size' bytes from the front of the buffer. */
+static void buffer_advance(struct buffer *buffer, size_t size)
+{
+	size = MIN(size, buffer->size);
+	/* Source is the payload, not the struct: `buffer + size' indexed
+	   struct buffer objects and copied unrelated memory. */
+	memmove(buffer->data, buffer->data + size, buffer->size - size);
+	buffer->size -= size;
+}
+
+/* Per-domain console state; kept in a list sorted by domid (dom_head). */
+struct domain
+{
+	int domid;
+	int tty_fd;		/* pty master, or -1 if it could not be created */
+	bool is_dead;		/* set when the domain is gone; reaped later */
+	struct buffer buffer;	/* output waiting to be written to tty_fd */
+	struct domain *next;
+};
+
+static struct domain *dom_head;
+
+/* True when xc_domain_getinfo() reports a domain with exactly this id. */
+static bool domain_is_valid(int domid)
+{
+	bool ret;
+	xc_dominfo_t info;
+
+	ret = (xc_domain_getinfo(xc, domid, 1, &info) == 1 &&
+	       info.domid == domid);
+
+	return ret;
+}
+
+/*
+ * Allocate a pty for `dom', switch it to raw mode and publish the slave
+ * name in xenstore under /console/<domid>/tty.  If /console/<domid>/limit
+ * exists its value becomes the buffer's max_capacity.
+ * Returns the pty master fd, or -1 on failure.
+ */
+static int domain_create_tty(struct domain *dom)
+{
+	char path[1024];
+	const char *slave = NULL;
+	struct termios term;
+	char *data;
+	unsigned int len;
+	int master;
+
+	master = getpt();
+	if (master != -1 &&
+	    (grantpt(master) == -1 || unlockpt(master) == -1 ||
+	     (slave = ptsname(master)) == NULL)) {
+		/* Do not leak the half-initialised master. */
+		close(master);
+		master = -1;
+	}
+
+	if (master == -1) {
+		dolog(LOG_ERR, "Failed to create tty for domain-%d",
+		      dom->domid);
+		return -1;
+	}
+
+	if (tcgetattr(master, &term) != -1) {
+		cfmakeraw(&term);
+		tcsetattr(master, TCSAFLUSH, &term);
+	}
+
+	xs_mkdir(xs, "/console");
+	snprintf(path, sizeof(path), "/console/%d", dom->domid);
+	xs_mkdir(xs, path);
+	strcat(path, "/tty");
+
+	xs_write(xs, path, slave, strlen(slave), O_CREAT);
+
+	snprintf(path, sizeof(path), "/console/%d/limit", dom->domid);
+	data = xs_read(xs, path, &len);
+	if (data) {
+		dom->buffer.max_capacity = strtoul(data, 0, 0);
+		free(data);
+	}
+
+	return master;
+}
+
+/*
+ * Allocate and initialise the state for `domid', including its tty.
+ * Exits the process when out of memory.
+ */
+static struct domain *create_domain(int domid)
+{
+	struct domain *dom;
+
+	dom = (struct domain *)malloc(sizeof(struct domain));
+	if (dom == NULL) {
+		dolog(LOG_ERR, "Out of memory %s:%s():L%d",
+		      __FILE__, __FUNCTION__, __LINE__);
+		exit(ENOMEM);
+	}
+
+	dom->domid = domid;
+	dom->is_dead = false;
+	dom->buffer.data = 0;
+	dom->buffer.size = 0;
+	dom->buffer.capacity = 0;
+	dom->buffer.max_capacity = 0;
+	dom->next = 0;
+	/* Must come after the defaults above: domain_create_tty() may set
+	   buffer.max_capacity from xenstore and used to be overwritten. */
+	dom->tty_fd = domain_create_tty(dom);
+
+	dolog(LOG_DEBUG, "New domain %d", domid);
+
+	return dom;
+}
+
+/*
+ * Return the entry for `domid', creating it and inserting it at its
+ * sorted position in the list when it does not exist yet.
+ */
+static struct domain *lookup_domain(int domid)
+{
+	struct domain **pp;
+
+	for (pp = &dom_head; *pp; pp = &(*pp)->next) {
+		struct domain *dom = *pp;
+
+		if (dom->domid == domid) {
+			return dom;
+		} else if (dom->domid > domid) {
+			*pp = create_domain(domid);
+			(*pp)->next = dom;
+			return *pp;
+		}
+	}
+
+	*pp = create_domain(domid);
+	return *pp;
+}
+
+/* Unlink the entry with dom's domid from the list and free it. */
+static void remove_domain(struct domain *dom)
+{
+	struct domain **pp;
+
+	dolog(LOG_DEBUG, "Removing domain-%d", dom->domid);
+
+	for (pp = &dom_head; *pp; pp = &(*pp)->next) {
+		struct domain *d = *pp;
+
+		if (dom->domid == d->domid) {
+			*pp = d->next;
+			if (d->buffer.data) {
+				free(d->buffer.data);
+			}
+			free(d);
+			break;
+		}
+	}
+}
+
+/*
+ * Reap every entry flagged is_dead.  Recurses to the tail first so an
+ * entry is only freed after its successors have been visited.
+ */
+static void remove_dead_domains(struct domain *dom)
+{
+	if (dom == NULL) return;
+	remove_dead_domains(dom->next);
+
+	if (dom->is_dead) {
+		remove_domain(dom);
+	}
+}
+
+/*
+ * The tty is readable: forward what the user typed to the domain as a
+ * CMSG_CONSOLE_DATA control message over xcs.  On read failure the tty
+ * is re-created if the domain still exists, otherwise the domain is
+ * marked dead.
+ */
+static void handle_tty_read(struct domain *dom)
+{
+	ssize_t len;
+	xcs_msg_t msg;
+
+	msg.type = XCS_REQUEST;
+	msg.u.control.remote_dom = dom->domid;
+	msg.u.control.msg.type = CMSG_CONSOLE;
+	msg.u.control.msg.subtype = CMSG_CONSOLE_DATA;
+	msg.u.control.msg.id = 1;
+
+	/* NOTE(review): 60 is presumably the payload size of a control
+	   message - confirm against the msg.msg declaration. */
+	len = read(dom->tty_fd, msg.u.control.msg.msg, 60);
+	if (len < 1) {
+		close(dom->tty_fd);
+
+		if (domain_is_valid(dom->domid)) {
+			dom->tty_fd = domain_create_tty(dom);
+		} else {
+			dom->is_dead = true;
+		}
+	} else if (domain_is_valid(dom->domid)) {
+		msg.u.control.msg.length = len;
+
+		if (!write_sync(xcs_data_fd, &msg, sizeof(msg))) {
+			dolog(LOG_ERR, "Write to xcs failed: %m");
+			exit(1);
+		}
+	} else {
+		close(dom->tty_fd);
+		dom->is_dead = true;
+	}
+}
+
+/*
+ * The tty is writable: flush as much queued output as it accepts.  On
+ * write failure the tty is re-created if the domain still exists,
+ * otherwise the domain is marked dead.
+ */
+static void handle_tty_write(struct domain *dom)
+{
+	ssize_t len;
+
+	len = write(dom->tty_fd, dom->buffer.data, dom->buffer.size);
+	if (len < 1) {
+		close(dom->tty_fd);
+
+		if (domain_is_valid(dom->domid)) {
+			dom->tty_fd = domain_create_tty(dom);
+		} else {
+			dom->is_dead = true;
+		}
+	} else {
+		buffer_advance(&dom->buffer, len);
+	}
+}
+
+static void handle_xcs_msg(int fd)
+{
+	xcs_msg_t msg;
+
+	if (!read_sync(fd, &msg, sizeof(msg))) {
+		dolog(LOG_ERR, "read from xcs failed! 
%m");
+		exit(1);
+	} else if (msg.type == XCS_REQUEST) {
+		struct domain *dom;
+
+		dom = lookup_domain(msg.u.control.remote_dom);
+		buffer_append(&dom->buffer,
+			      msg.u.control.msg.msg,
+			      msg.u.control.msg.length);
+	}
+}
+
+/* Make sure every domain reported by xc_domain_getinfo() has an entry. */
+static void enum_domains(void)
+{
+	int domid = 0;
+	xc_dominfo_t dominfo;
+
+	while (xc_domain_getinfo(xc, domid, 1, &dominfo) == 1) {
+		lookup_domain(dominfo.domid);
+		domid = dominfo.domid + 1;
+	}
+}
+
+/*
+ * Main loop of the daemon: wait (at most one second per round) for data
+ * from xcs or from any domain tty, dispatch it, pick up new domains and
+ * reap dead ones.  Returns when select() fails.
+ */
+void handle_io(void)
+{
+	fd_set readfds, writefds;
+	int ret;
+	int num_of_writes = 0;
+
+	do {
+		struct domain *d;
+		struct timeval tv = { 1, 0 };
+		/* Recomputed every round so fds of removed domains no longer
+		   inflate the range select() has to scan. */
+		int max_fd = xcs_data_fd;
+
+		FD_ZERO(&readfds);
+		FD_ZERO(&writefds);
+
+		FD_SET(xcs_data_fd, &readfds);
+
+		for (d = dom_head; d; d = d->next) {
+			if (d->tty_fd == -1) {
+				continue;
+			}
+
+			FD_SET(d->tty_fd, &readfds);
+
+			if (!buffer_empty(&d->buffer)) {
+				FD_SET(d->tty_fd, &writefds);
+			}
+
+			max_fd = MAX(d->tty_fd, max_fd);
+		}
+
+		ret = select(max_fd + 1, &readfds, &writefds, 0, &tv);
+		if (ret == -1) {
+			/* The fd sets are unspecified after a failed
+			   select(); leave without looking at them.  As
+			   before, any error (EINTR included) ends the loop.
+			   NOTE(review): confirm exiting on EINTR is wanted. */
+			break;
+		}
+		if (tv.tv_sec == 1 && (++num_of_writes % 100) == 0) {
+#if 0
+			/* FIXME */
+			/* This is a nasty hack. xcs does not handle the
+			   control channels filling up well at all. We'll
+			   throttle ourselves here since we do proper
+			   queueing to give the domains a shot at pulling out
+			   the data. Fixing xcs is not worth it as it's
+			   going away */
+			tv.tv_usec = 1000;
+			select(0, 0, 0, 0, &tv);
+#endif
+		}
+		enum_domains();
+
+		if (FD_ISSET(xcs_data_fd, &readfds)) {
+			handle_xcs_msg(xcs_data_fd);
+		}
+
+		for (d = dom_head; d; d = d->next) {
+			/* tty_fd may be -1 (tty creation failed, possibly
+			   in the handler just above); FD_ISSET() on a
+			   negative descriptor is undefined. */
+			if (!d->is_dead && d->tty_fd != -1 &&
+			    FD_ISSET(d->tty_fd, &readfds)) {
+				handle_tty_read(d);
+			}
+
+			if (!d->is_dead && d->tty_fd != -1 &&
+			    FD_ISSET(d->tty_fd, &writefds)) {
+				handle_tty_write(d);
+			}
+		}
+
+		remove_dead_domains(dom_head);
+	} while (ret > -1);
+}
diff -r 5f1ed597f107 -r 8799d14bef77 tools/console/daemon/io.h
--- /dev/null Wed Aug 24 02:43:18 2005
+++ b/tools/console/daemon/io.h Thu Aug 25 22:53:20 2005
@@ -0,0 +1,26 @@
+/*\
+ * Copyright (C) International Business Machines Corp., 2005
+ * Author(s): Anthony Liguori <aliguori@xxxxxxxxxx>
+ *
+ * Xen Console Daemon
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +\*/ + +#ifndef CONSOLED_IO_H +#define CONSOLED_IO_H + +void handle_io(void); + +#endif diff -r 5f1ed597f107 -r 8799d14bef77 tools/console/daemon/main.c --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/console/daemon/main.c Thu Aug 25 22:53:20 2005 @@ -0,0 +1,93 @@ +/*\ + * Copyright (C) International Business Machines Corp., 2005 + * Author(s): Anthony Liguori <aliguori@xxxxxxxxxx> + * + * Xen Console Daemon + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; under version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+\*/
+
+#include <getopt.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/types.h>
+
+#include "xenctrl.h"
+#include "xen/io/domain_controller.h"
+#include "xcs_proto.h"
+
+#include "utils.h"
+#include "io.h"
+
+/* Print the command line synopsis to stdout. */
+static void usage(const char *program)
+{
+	printf("Usage: %s [OPTION]...\n"
+	       "Xen console daemon\n"
+	       "\n"
+	       "  -h, --help         display this help and exit\n"
+	       "  -V, --version      exit immediately\n"
+	       "  -v, --verbose      log debug messages, also to stderr\n"
+	       "  -i, --interactive  stay in the foreground\n",
+	       program);
+}
+
+/*
+ * xenconsoled entry point: parse options, require root, set up syslog,
+ * daemonize unless --interactive was given, then run the I/O loop until
+ * it returns.
+ */
+int main(int argc, char **argv)
+{
+	const char *sopts = "hVvi";
+	struct option lopts[] = {
+		{ "help", 0, 0, 'h' },
+		{ "version", 0, 0, 'V' },
+		{ "verbose", 0, 0, 'v' },
+		{ "interactive", 0, 0, 'i' },
+		{ 0 },
+	};
+	bool is_interactive = false;
+	int ch;
+	int syslog_option = LOG_CONS;
+	int syslog_mask = LOG_WARNING;	/* highest priority level logged */
+	int opt_ind = 0;
+
+	while ((ch = getopt_long(argc, argv, sopts, lopts, &opt_ind)) != -1) {
+		switch (ch) {
+		case 'h':
+			usage(argv[0]);
+			exit(0);
+		case 'V':
+			/* NOTE(review): no version string is available in
+			   this file, so nothing is printed yet. */
+			//version(argv[0]);
+			exit(0);
+		case 'v':
+			syslog_option |= LOG_PERROR;
+			syslog_mask = LOG_DEBUG;
+			break;
+		case 'i':
+			is_interactive = true;
+			break;
+		case '?':
+			fprintf(stderr,
+				"Try `%s --help' for more information\n",
+				argv[0]);
+			exit(EINVAL);
+		}
+	}
+
+	if (geteuid() != 0) {
+		fprintf(stderr, "%s requires root to run.\n", argv[0]);
+		exit(EPERM);
+	}
+
+	openlog("xenconsoled", syslog_option, LOG_DAEMON);
+	/* setlogmask() takes a bit mask, not a priority: passing LOG_WARNING
+	   directly enabled only LOG_CRIT and hid LOG_ERR messages. */
+	setlogmask(LOG_UPTO(syslog_mask));
+
+	if (!is_interactive) {
+		daemonize("/var/run/xenconsoled.pid");
+	}
+
+	xen_setup();
+
+	handle_io();
+
+	closelog();
+
+	return 0;
+}
diff -r 5f1ed597f107 -r 8799d14bef77 tools/console/daemon/utils.c
--- /dev/null Wed Aug 24 02:43:18 2005
+++ b/tools/console/daemon/utils.c Thu Aug 25 22:53:20 2005
@@ -0,0 +1,253 @@
+/*\
+ * Copyright (C) International Business Machines Corp., 2005
+ * Author(s): Anthony Liguori <aliguori@xxxxxxxxxx>
+ *
+ * Xen Console Daemon
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the 
GNU General Public License as published by + * the Free Software Foundation; under version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +\*/ + +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/wait.h> +#include <unistd.h> +#include <stdlib.h> +#include <fcntl.h> +#include <err.h> +#include <errno.h> +#include <stdio.h> +#include <getopt.h> +#include <stdbool.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <string.h> + +#include "xenctrl.h" +#include "xen/io/domain_controller.h" +#include "xcs_proto.h" + +#include "utils.h" + +struct xs_handle *xs; +int xc; + +int xcs_ctrl_fd = -1; +int xcs_data_fd = -1; + +bool _read_write_sync(int fd, void *data, size_t size, bool do_read) +{ + size_t offset = 0; + ssize_t len; + + while (offset < size) { + if (do_read) { + len = read(fd, data + offset, size - offset); + } else { + len = write(fd, data + offset, size - offset); + } + + if (len < 1) { + if (len == -1 && (errno == EAGAIN || errno == EINTR)) { + continue; + } else { + return false; + } + } else { + offset += len; + } + } + + return true; +} + +static int open_domain_socket(const char *path) +{ + struct sockaddr_un addr; + int sock; + size_t addr_len; + + if ((sock = socket(PF_UNIX, SOCK_STREAM, 0)) == -1) { + goto out; + } + + addr.sun_family = AF_UNIX; + strcpy(addr.sun_path, path); + addr_len = sizeof(addr.sun_family) + strlen(XCS_SUN_PATH) + 1; + + if (connect(sock, (struct sockaddr *)&addr, addr_len) == -1) { + goto out_close_sock; + } + + return sock; + + out_close_sock: + close(sock); + out: + return -1; +} 
+ +static void child_exit(int sig) /* SIGCHLD handler: reap every exited child without blocking */ +{ + while (waitpid(-1, NULL, WNOHANG) > 0); +} + +void daemonize(const char *pidfile) /* detach from the terminal, point stdio at /dev/null, then create, lock and fill pidfile */ +{ + pid_t pid; + int fd; + int len; + int i; + char buf[100]; + + if (getppid() == 1) { + return; + } + + if ((pid = fork()) > 0) { + exit(0); + } else if (pid == -1) { + err(errno, "fork() failed"); + } + + setsid(); + + /* redirect fd 0,1,2 to /dev/null */ + if ((fd = open("/dev/null",O_RDWR)) == -1) { + exit(1); + } + + for (i = 0; i <= 2; i++) { + close(i); + dup2(fd, i); + } + + close(fd); + + umask(027); + chdir("/"); + + fd = open(pidfile, O_RDWR | O_CREAT, 0640); /* O_CREAT requires an explicit mode argument */ + if (fd == -1) { + exit(1); + } + + if (lockf(fd, F_TLOCK, 0) == -1) { + exit(1); + } + + len = sprintf(buf, "%d\n", getpid()); + write(fd, buf, len); + + signal(SIGCHLD, child_exit); + signal(SIGTSTP, SIG_IGN); + signal(SIGTTOU, SIG_IGN); + signal(SIGTTIN, SIG_IGN); +} + +/* synchronized send/recv strictly for setting up xcs */ +/* always use asynchronous callbacks any other time */ +static bool xcs_send_recv(int fd, xcs_msg_t *msg) +{ + bool ret = false; + + if (!write_sync(fd, msg, sizeof(*msg))) { + dolog(LOG_ERR, "Write failed at %s:%s():L%d? Possible bug.", + __FILE__, __FUNCTION__, __LINE__); + goto out; + } + + if (!read_sync(fd, msg, sizeof(*msg))) { + dolog(LOG_ERR, "Read failed at %s:%s():L%d? Possible bug.", + __FILE__, __FUNCTION__, __LINE__); + goto out; + } + + ret = true; + + out: + return ret; +} + +bool xen_setup(void) /* open xenstore, the hypervisor interface and both xcs connections, then bind wildcard; true on success, false after logging the failure */ +{ + int sock; + xcs_msg_t msg; + + xs = xs_daemon_open(); + if (xs == NULL) { + dolog(LOG_ERR, + "Failed to contact xenstore (%m). Is it running?"); + goto out; + } + + xc = xc_interface_open(); + if (xc == -1) { + dolog(LOG_ERR, "Failed to contact hypervisor (%m)"); + goto out_close_store; /* xenstore is already open at this point; release it */ + } + + sock = open_domain_socket(XCS_SUN_PATH); + if (sock == -1) { + dolog(LOG_ERR, "Failed to contact xcs (%m). 
Is it running?"); + goto out_close_store; + } + + xcs_ctrl_fd = sock; + + sock = open_domain_socket(XCS_SUN_PATH); + if (sock == -1) { + dolog(LOG_ERR, "Failed to contact xcs (%m). Is it running?"); + goto out_close_ctrl; + } + + xcs_data_fd = sock; + + memset(&msg, 0, sizeof(msg)); + msg.type = XCS_CONNECT_CTRL; + if (!xcs_send_recv(xcs_ctrl_fd, &msg) || msg.result != XCS_RSLT_OK) { + dolog(LOG_ERR, "xcs control connect failed. Possible bug."); + goto out_close_data; + } + + msg.type = XCS_CONNECT_DATA; + if (!xcs_send_recv(xcs_data_fd, &msg) || msg.result != XCS_RSLT_OK) { + dolog(LOG_ERR, "xcs data connect failed. Possible bug."); + goto out_close_data; + } + + /* Since the vast majority of control messages are console messages + it's just easier to ignore other messages that try to bind to + a specific type. */ + msg.type = XCS_MSG_BIND; + msg.u.bind.port = PORT_WILDCARD; + msg.u.bind.type = TYPE_WILDCARD; + if (!xcs_send_recv(xcs_ctrl_fd, &msg) || msg.result != XCS_RSLT_OK) { + dolog(LOG_ERR, "xcs bind failed. Possible bug."); + goto out_close_data; + } + + return true; + + out_close_data: + close(xcs_data_fd); /* close the data connection here; the control fd is closed just below */ + xcs_data_fd = -1; + out_close_ctrl: + close(xcs_ctrl_fd); + xcs_ctrl_fd = -1; + out_close_store: + xs_daemon_close(xs); /* NOTE(review): xc stays open on the xcs failure paths */ + out: + return false; +} + diff -r 5f1ed597f107 -r 8799d14bef77 tools/console/daemon/utils.h --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/console/daemon/utils.h Thu Aug 25 22:53:20 2005 @@ -0,0 +1,47 @@ +/*\ + * Copyright (C) International Business Machines Corp., 2005 + * Author(s): Anthony Liguori <aliguori@xxxxxxxxxx> + * + * Xen Console Daemon + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; under version 2 of the License. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +\*/ + +#ifndef CONSOLED_UTILS_H +#define CONSOLED_UTILS_H + +#include <stdbool.h> +#include <syslog.h> +#include <stdio.h> + +#include "xs.h" + +void daemonize(const char *pidfile); +bool xen_setup(void); +#define read_sync(fd, buffer, size) _read_write_sync(fd, buffer, size, true) +#define write_sync(fd, buffer, size) _read_write_sync(fd, buffer, size, false) +bool _read_write_sync(int fd, void *data, size_t size, bool do_read); + +extern int xcs_ctrl_fd; +extern int xcs_data_fd; +extern struct xs_handle *xs; +extern int xc; + +#if 1 +#define dolog(val, fmt, ...) syslog(val, fmt, ## __VA_ARGS__) +#else +#define dolog(val, fmt, ...) fprintf(stderr, fmt "\n", ## __VA_ARGS__) +#endif + +#endif diff -r 5f1ed597f107 -r 8799d14bef77 tools/console/testsuite/Makefile --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/console/testsuite/Makefile Thu Aug 25 22:53:20 2005 @@ -0,0 +1,11 @@ +CFLAGS=-g -Wall +CC=gcc +LDFLAGS=-static + +all: console-dom0 console-domU procpipe + +console-dom0: console-dom0.o +console-domU: console-domU.o +procpipe: procpipe.o + +clean:; $(RM) *.o console-domU console-dom0 procpipe diff -r 5f1ed597f107 -r 8799d14bef77 tools/console/testsuite/README --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/console/testsuite/README Thu Aug 25 22:53:20 2005 @@ -0,0 +1,29 @@ +ABOUT + +This tool uses two programs, one that lives in dom0 and one that lives in domU +to verify that no data is lost. dom0 and domU share a handshake with each +other that they use to exchange a random seed. 
+ +Both programs then generate a series of random numbers and then write and +read the numbers via the console. Because each side starts with the same seed +they know what data the other side is generating and therefore what should be +expected. + +RUNNING + +console-domU should be installed within the guest image. It must be launched +from the client automatically. I use a custom initrd image and put it in the +/linuxrc. + +console-dom0 and console-domU will communicate with each other and stress the +console code. You can verify it at various levels by invoking it in different +ways. procpipe is used to connect the two. I use the following command for +testing: + +./procpipe ./console-dom0 'xm create -c /etc/xen/xmexample1' + +xmexample1 has no devices and no root set (this is what triggers /linuxrc). + +If it freezes, it probably means that console-domU is expecting more data from +console-dom0 (which means that some data got dropped). I'd like to add +timeouts in the future to handle this more gracefully. diff -r 5f1ed597f107 -r 8799d14bef77 tools/console/testsuite/console-dom0.c --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/console/testsuite/console-dom0.c Thu Aug 25 22:53:20 2005 @@ -0,0 +1,117 @@ +/* Written by Anthony Liguori <aliguori@xxxxxxxxxx> */ + +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#include <string.h> +#include <unistd.h> +#include <termios.h> + +#define MIN(a, b) (((a) < (b)) ? 
(a) : (b)) + +static void generate_random_buffer(char *buffer, size_t size) +{ + int i; + + for (i = 0; i < size; i++) { + buffer[i] = random() & 0xFF; + } +} + +static void canonicalize(char *buffer) +{ + char *reader, *writer; + + reader = writer = buffer; + + while (*reader) { + *writer = *reader; + if (*reader != '\r') writer++; + reader++; + } + *writer = *reader; +} + +int main(int argc, char **argv) +{ + char buffer[4096]; + char *line; + unsigned int seed; + size_t size; + int runs; + unsigned long long total_bytes = 0; + struct termios term; + + tcgetattr(STDIN_FILENO, &term); + cfmakeraw(&term); + tcsetattr(STDIN_FILENO, TCSAFLUSH, &term); + + tcgetattr(STDOUT_FILENO, &term); + cfmakeraw(&term); + tcsetattr(STDOUT_FILENO, TCSAFLUSH, &term); + + while ((line = fgets(buffer, sizeof(buffer), stdin))) { + canonicalize(line); + + if (strcmp(line, "!!!XEN Test Begin!!!\n") == 0) { + break; + } else { + fprintf(stderr, "%s", line); + } + } + + if (line == NULL) { + fprintf(stderr, "Client never sent start string.\n"); + return 1; + } + + seed = time(0); + + printf("%u\n", seed); fflush(stdout); + + fprintf(stderr, "Waiting for seed acknowledgement\n"); + line = fgets(buffer, sizeof(buffer), stdin); + if (line == NULL) { + fprintf(stderr, "Client never acknowledge seed.\n"); + return 1; + } + + canonicalize(line); + if (strcmp(line, "Seed Okay.\n") != 0) { + fprintf(stderr, "Incorrect seed acknowledgement.\n"); + fprintf(stderr, "[%s]", line); + return 1; + } else { + fprintf(stderr, "Processed seed.\n"); + } + + srandom(seed); + + for (runs = (random() % 100000) + 4096; runs > 0; runs--) { + + size = random() % 4096; + + fprintf(stderr, "Writing %d bytes.\n", size); + + generate_random_buffer(buffer, size); + fwrite(buffer, size, 1, stdout); + fflush(stdout); + + do { + line = fgets(buffer, sizeof(buffer), stdin); + if (line == NULL) { + fprintf(stderr, "Premature EOF from client.\n"); + return 1; + } + + canonicalize(line); + fprintf(stderr, "%s", line); + } 
while (strcmp(line, "Okay.\n") != 0); + + total_bytes += size; + } + + fprintf(stderr, "PASS: processed %llu byte(s).\n", total_bytes); + + return 0; +} diff -r 5f1ed597f107 -r 8799d14bef77 tools/console/testsuite/console-domU.c --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/console/testsuite/console-domU.c Thu Aug 25 22:53:20 2005 @@ -0,0 +1,76 @@ +/* Written by Anthony Liguori <aliguori@xxxxxxxxxx> */ + +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#include <termios.h> +#include <unistd.h> + +static void canonicalize(char *buffer) /* strip every '\r' from the NUL-terminated string, in place */ +{ + char *reader, *writer; + + reader = writer = buffer; + + while (*reader) { + *writer = *reader; + if (*reader != '\r') writer++; + reader++; + } + *writer = *reader; +} + +int main(int argc, char **argv) /* domU side of the console stress test: announce start, read the seed, then check every received byte against the shared random sequence */ +{ + char buffer[4096]; + char *line; + unsigned int seed; + size_t size; + size_t i; /* same type as size: avoids a signed/unsigned comparison */ + int runs; + struct termios term; + + tcgetattr(STDIN_FILENO, &term); + cfmakeraw(&term); + tcsetattr(STDIN_FILENO, TCSAFLUSH, &term); + + tcgetattr(STDOUT_FILENO, &term); + cfmakeraw(&term); + tcsetattr(STDOUT_FILENO, TCSAFLUSH, &term); + + printf("!!!XEN Test Begin!!!\n"); fflush(stdout); + line = fgets(buffer, sizeof(buffer), stdin); + if (line == NULL) { + printf("Failure\n"); fflush(stdout); + return 1; + } + + canonicalize(line); + seed = strtoul(line, 0, 0); + + printf("Seed Okay.\n"); fflush(stdout); + + srandom(seed); + + for (runs = (random() % 100000) + 4096; runs > 0; runs--) { + size = random() % 4096; + + for (i = 0; i < size; i++) { + int ch; + int exp; + + ch = fgetc(stdin); + exp = random() & 0xFF; + if (ch != exp) { + printf("Expected %d got %d\n", + exp, ch); + fflush(stdout); + } + printf("Got %zu/%zu good bytes\n", i, size); /* both operands are size_t */ + } + + printf("Okay.\n"); fflush(stdout); + } + + return 0; +} diff -r 5f1ed597f107 -r 8799d14bef77 tools/console/testsuite/procpipe.c --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/console/testsuite/procpipe.c Thu Aug 25 22:53:20 2005 @@ -0,0 +1,133 @@ +/* Written by Anthony Liguori 
<aliguori@xxxxxxxxxx> */ + +#include <stdio.h> +#include <getopt.h> +#include <errno.h> +#include <stdlib.h> +#include <err.h> + +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> + +#define PACKAGE_NAME "procpipe" +#define PACKAGE_VERSION "0.0.1" + +#define GPL_SHORT \ +"This is free software; see the source for copying conditions. There is NO\n"\ +"warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." + +#define PACKAGE_BUGS "aliguori@xxxxxxxxxx" +#define PACKAGE_AUTHOR "Anthony Liguori" +#define PACKAGE_OWNER "IBM, Corp." +#define PACKAGE_LICENSE GPL_SHORT + +static void usage(const char *name) +{ + printf("Usage: %s [OPTIONS]\n" + "\n" + " -h, --help display this help and exit\n" + " -V, --version output version information and exit\n" + "\n" + "Report bugs to <%s>.\n" + , name, PACKAGE_BUGS); +} + +static void version(const char *name) +{ + printf("%s (%s) %s\n" + "Written by %s.\n" + "\n" + "Copyright (C) 2005 %s.\n" + "%s\n" + , name, PACKAGE_NAME, PACKAGE_VERSION, + PACKAGE_AUTHOR, PACKAGE_OWNER, PACKAGE_LICENSE); +} + +static pid_t exec(int stdout, int stdin, const char *cmd) /* fork and run cmd via /bin/sh with the given descriptors as its stdout/stdin; returns the child pid, or -1 if fork() failed */ +{ + pid_t pid; + + pid = fork(); + if (pid == 0) { + close(STDOUT_FILENO); + dup2(stdout, STDOUT_FILENO); + close(STDIN_FILENO); + dup2(stdin, STDIN_FILENO); + + execlp("/bin/sh", "sh", "-c", cmd, NULL); _exit(127); /* only reached when exec failed: the child must not return into the parent's code path */ + } + + return pid; +} + +int main(int argc, char **argv) +{ + int ch, opt_ind = 0; + const char *sopt = "hV"; + struct option lopt[] = { + { "help", 0, 0, 'h' }, + { "version", 0, 0, 'V' }, + { 0 } + }; + int host_stdout[2]; + int host_stdin[2]; + int res; + pid_t pid1, pid2; + int status; + + while ((ch = getopt_long(argc, argv, sopt, lopt, &opt_ind)) != -1) { + switch (ch) { + case 'h': + usage(argv[0]); + exit(0); + case 'V': + version(argv[0]); + exit(0); + case '?': + errx(EINVAL, "Try `%s --help' for more information.", + argv[0]); + } + } + + if ((argc - optind) != 2) { + errx(EINVAL, "Two commands are required.\n" + "Try `%s --help' 
for more information.", argv[0]); + } + + res = pipe(host_stdout); + if (res == -1) { + err(errno, "pipe() failed"); + } + + res = pipe(host_stdin); + if (res == -1) { + err(errno, "pipe() failed"); + } + + pid1 = exec(host_stdout[1], host_stdin[0], argv[optind]); + if (pid1 == -1) { + err(errno, "exec(%s)", argv[optind]); + } + + pid2 = exec(host_stdin[1], host_stdout[0], argv[optind + 1]); + if (pid2 == -1) { + err(errno, "exec(%s)", argv[optind + 1]); + } + + waitpid(pid1, &status, 0); + if (WIFEXITED(status)) status = WEXITSTATUS(status); + + if (status != 0) { + printf("Child exited with status %d\n", status); + } + + waitpid(pid2, &status, 0); + if (WIFEXITED(status)) status = WEXITSTATUS(status); + + if (status != 0) { + printf("Child2 exited with status %d\n", status); + } + + return 0; +} diff -r 5f1ed597f107 -r 8799d14bef77 tools/examples/backend.hotplug --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/examples/backend.hotplug Thu Aug 25 22:53:20 2005 @@ -0,0 +1,21 @@ +#! /bin/sh + +#DEVPATH=/devices/xen-backend/vif-1-0 +#ACTION=add + +PATH=/etc/xen/scripts:$PATH + +DEV=$(basename "$DEVPATH") +case "$ACTION" in + add) + case "$DEV" in + vif-*) + vif=$(echo "$DEV" | sed 's/-\([0-9]*\)-\([0-9]*\)/\1.\2/') + vif-bridge up domain=unknown vif="$vif" mac=fe:ff:ff:ff:ff:ff bridge=xen-br0 >/dev/null 2>&1 + ;; + esac + ;; + remove) + ;; +esac + diff -r 5f1ed597f107 -r 8799d14bef77 tools/examples/network-bridge --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/examples/network-bridge Thu Aug 25 22:53:20 2005 @@ -0,0 +1,261 @@ +#!/bin/sh -x +#============================================================================ +# Default Xen network start/stop script. +# Xend calls a network script when it starts. +# The script name to use is defined in /etc/xen/xend-config.sxp +# in the network-script field. 
+# +# This script creates a bridge (default xen-br0), adds a device +# (default eth0) to it, copies the IP addresses from the device +# to the bridge and adjusts the routes accordingly. +# +# If all goes well, this should ensure that networking stays up. +# However, some configurations are upset by this, especially +# NFS roots. If the bridged setup does not meet your needs, +# configure a different script, for example using routing instead. +# +# Usage: +# +# network (start|stop|status) {VAR=VAL}* +# +# Vars: +# +# bridge The bridge to use (default xen-br0). +# netdev The interface to add to the bridge (default eth0). +# antispoof Whether to use iptables to prevent spoofing (default yes). +# +# start: +# Creates the bridge and enslaves netdev to it. +# Copies the IP addresses from netdev to the bridge. +# Deletes the routes to netdev and adds them on bridge. +# +# stop: +# Removes netdev from the bridge. +# Deletes the routes to bridge and adds them to netdev. +# +# status: +# Print ifconfig for netdev and bridge. +# Print routes. +# +#============================================================================ + +# Exit if anything goes wrong. +set -e + +# First arg is the operation. +OP=$1 +shift + +# Pull variables in args in to environment. +for arg ; do export "${arg}" ; done + +bridge=${bridge:-xen-br0} +netdev=${netdev:-eth0} +antispoof=${antispoof:-no} + +echo "*network $OP bridge=$bridge netdev=$netdev antispoof=$antispoof" >&2 + +# Usage: transfer_addrs src dst +# Copy all IP addresses (including aliases) from device $src to device $dst. +transfer_addrs () { + local src=$1 + local dst=$2 + # Don't bother if $dst already has IP addresses. + if ip addr show dev ${dst} | egrep -q '^ *inet ' ; then + return + fi + # Address lines start with 'inet' and have the device in them. + # Replace 'inet' with 'ip addr add' and change the device name $src + # to 'dev $src'. 
+ ip addr show dev ${src} | egrep '^ *inet ' | sed -e " +s/inet/ip addr add/ +s@\([0-9]\+\.[0-9]\+\.[0-9]\+\.[0-9]\+/[0-9]\+\)@\1@ +s/${src}/dev ${dst}/ +" | sh -e + # Remove automatic routes on destionation device + ip route list | sed -ne " +/dev ${dst}\( \|$\)/ { + s/^/ip route del / + p +}" | sh -e +} + +# Usage: del_addrs src +del_addrs () { + local src=$1 + ip addr show dev ${src} | egrep '^ *inet ' | sed -e " +s/inet/ip addr del/ +s@\([0-9]\+\.[0-9]\+\.[0-9]\+\.[0-9]\+\)/[0-9]\+@\1@ +s/${src}/dev ${src}/ +" | sh -e +} + +# Usage: transfer_routes src dst +# Get all IP routes to device $src, delete them, and +# add the same routes to device $dst. +# The original routes have to be deleted, otherwise adding them +# for $dst fails (duplicate routes). +transfer_routes () { + local src=$1 + local dst=$2 + # List all routes and grep the ones with $src in. + # Stick 'ip route del' on the front to delete. + # Change $src to $dst and use 'ip route add' to add. + ip route list | sed -ne " +/dev ${src}\( \|$\)/ { + h + s/^/ip route del / + P + g + s/${src}/${dst}/ + s/^/ip route add / + P + d +}" | sh -e +} + +# Usage: create_bridge bridge +create_bridge () { + local bridge=$1 + + # Don't create the bridge if it already exists. + if ! brctl show | grep -q ${bridge} ; then + brctl addbr ${bridge} + brctl stp ${bridge} off + brctl setfd ${bridge} 0 + fi + ifconfig ${bridge} up +} + +# Usage: add_to_bridge bridge dev +add_to_bridge () { + local bridge=$1 + local dev=$2 + # Don't add $dev to $bridge if it's already on a bridge. + if ! brctl show | grep -q ${dev} ; then + brctl addif ${bridge} ${dev} + fi +} + +# Usage: antispoofing dev bridge +# Set the default forwarding policy for $dev to drop. +# Allow forwarding to the bridge. +antispoofing () { + local dev=$1 + local bridge=$2 + + iptables -P FORWARD DROP + iptables -A FORWARD -m physdev --physdev-in ${dev} -j ACCEPT +} + +# Usage: show_status dev bridge +# Print ifconfig and routes. 
+show_status () { + local dev=$1 + local bridge=$2 + + echo '============================================================' + ifconfig ${dev} + ifconfig ${bridge} + echo ' ' + ip route list + echo ' ' + route -n + echo '============================================================' +} + +op_start () { # Create the bridge and move netdev (or its addresses and routes) onto it. + if [ "${bridge}" = "null" ] ; then # POSIX test compares strings with a single '=' + return + fi + + create_bridge ${bridge} + + if ifconfig 2>/dev/null | grep -q veth0 ; then + return + fi + + if ifconfig veth0 2>/dev/null | grep -q veth0 ; then + mac=`ifconfig ${netdev} | grep HWadd | sed -e 's/.*\(..:..:..:..:..:..\).*/\1/'` + if ! ifdown ${netdev} ; then + # if ifdown didn't work, see if we have an ip= on cmd line + if egrep 'ip=[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+:' /proc/cmdline ; + then + kip=`sed -e 's!.*ip=\([0-9]\+\.[0-9]\+\.[0-9]\+\.[0-9]\+\):.*!\1!' /proc/cmdline` + kmask=`sed -e 's!.*ip=[^:]*:[^:]*:[^:]*:\([^:]*\):.*!\1!' /proc/cmdline` + kgate=`sed -e 's!.*ip=[^:]*:[^:]*:\([^:]*\):.*!\1!' /proc/cmdline` + ifconfig ${netdev} 0.0.0.0 down + fi + fi + ip link set ${netdev} name p${netdev} + ip link set veth0 name ${netdev} + ifconfig p${netdev} 0.0.0.0 -arp down + ifconfig p${netdev} hw ether fe:ff:ff:ff:ff:ff + ifconfig ${netdev} hw ether ${mac} + add_to_bridge ${bridge} vif0.0 + add_to_bridge ${bridge} p${netdev} + ip link set ${bridge} up + ip link set vif0.0 up + ip link set p${netdev} up + if ! 
ifup ${netdev} ; then + if [ ${kip} ] ; then + # use the addresses we grocked from /proc/cmdline + ifconfig ${netdev} ${kip} + [ ${kmask} ] && ifconfig ${netdev} netmask ${kmask} + ifconfig ${netdev} up + [ ${kgate} ] && ip route add default via ${kgate} + fi + fi + else + # old style without veth0 + transfer_addrs ${netdev} ${bridge} + transfer_routes ${netdev} ${bridge} + fi + + if [ "${antispoof}" = 'yes' ] ; then # quoted so an empty value cannot break the test + antispoofing ${netdev} ${bridge} + fi +} + +op_stop () { # Detach netdev from the bridge and move addresses and routes back to it. + if [ "${bridge}" = "null" ] ; then # POSIX test compares strings with a single '=' + return + fi + + brctl delif ${bridge} ${netdev} + + if ifconfig veth0 2>/dev/null | grep -q veth0 ; then + brctl delif ${bridge} vif0.0 + ifconfig vif0.0 down + mac=`ifconfig veth0 | grep HWadd | sed -e 's/.*\(..:..:..:..:..:..\).*/\1/'` + ifconfig ${netdev} down + ifconfig ${netdev} hw ether ${mac} + ifconfig ${netdev} arp up + transfer_addrs veth0 ${netdev} + transfer_routes veth0 ${netdev} + del_addrs veth0 + ifconfig veth0 -arp down + ifconfig veth0 hw ether 00:00:00:00:00:00 + else + transfer_routes ${bridge} ${netdev} + fi +} + +case ${OP} in + start) + op_start + ;; + + stop) + op_stop + ;; + + status) + show_status ${netdev} ${bridge} + ;; + + *) + echo 'Unknown command: ' ${OP} >&2 + echo 'Valid commands are: start, stop, status' >&2 + exit 1 +esac diff -r 5f1ed597f107 -r 8799d14bef77 tools/libxc/xenctrl.h --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/libxc/xenctrl.h Thu Aug 25 22:53:20 2005 @@ -0,0 +1,526 @@ +/****************************************************************************** + * xenctrl.h + * + * A library for low-level access to the Xen control interfaces. + * + * Copyright (c) 2003-2004, K A Fraser. 
+ */ + +#ifndef XENCTRL_H +#define XENCTRL_H + +#include <stdint.h> + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +typedef int8_t s8; +typedef int16_t s16; +typedef int32_t s32; +typedef int64_t s64; + +#include <sys/ptrace.h> +#include <xen/xen.h> +#include <xen/dom0_ops.h> +#include <xen/event_channel.h> +#include <xen/sched_ctl.h> +#include <xen/acm.h> + +#ifdef __ia64__ +#define XC_PAGE_SHIFT 14 +#else +#define XC_PAGE_SHIFT 12 +#endif +#define XC_PAGE_SIZE (1UL << XC_PAGE_SHIFT) +#define XC_PAGE_MASK (~(XC_PAGE_SIZE-1)) + +/* + * DEFINITIONS FOR CPU BARRIERS + */ + +#if defined(__i386__) +#define mb() __asm__ __volatile__ ( "lock; addl $0,0(%%esp)" : : : "memory" ) +#define rmb() __asm__ __volatile__ ( "lock; addl $0,0(%%esp)" : : : "memory" ) +#define wmb() __asm__ __volatile__ ( "" : : : "memory") +#elif defined(__x86_64__) +#define mb() __asm__ __volatile__ ( "mfence" : : : "memory") +#define rmb() __asm__ __volatile__ ( "lfence" : : : "memory") +#define wmb() __asm__ __volatile__ ( "" : : : "memory") +#elif defined(__ia64__) +/* FIXME */ +#define mb() +#define rmb() +#define wmb() +#else +#error "Define barriers" +#endif + +/* + * INITIALIZATION FUNCTIONS + */ + +/** + * This function opens a handle to the hypervisor interface. This function can + * be called multiple times within a single process. Multiple processes can + * have an open hypervisor interface at the same time. + * + * Each call to this function should have a corresponding call to + * xc_interface_close(). + * + * This function can fail if the caller does not have superuser permission or + * if a Xen-enabled kernel is not currently running. + * + * @return a handle to the hypervisor interface or -1 on failure + */ +int xc_interface_open(void); + +/** + * This function closes an open hypervisor interface. + * + * This function can fail if the handle does not represent an open interface or + * if there were problems closing the interface. 
+ * + * @parm xc_handle a handle to an open hypervisor interface + * @return 0 on success, -1 otherwise. + */ +int xc_interface_close(int xc_handle); + +/* + * DOMAIN DEBUGGING FUNCTIONS + */ + +typedef struct xc_core_header { + unsigned int xch_magic; + unsigned int xch_nr_vcpus; + unsigned int xch_nr_pages; + unsigned int xch_ctxt_offset; + unsigned int xch_index_offset; + unsigned int xch_pages_offset; +} xc_core_header_t; + + +long xc_ptrace(enum __ptrace_request request, + u32 domid, + long addr, + long data); + +long xc_ptrace_core(enum __ptrace_request request, + u32 domid, + long addr, + long data); + +int xc_waitdomain(int domain, + int *status, + int options); + +int xc_waitdomain_core(int domain, + int *status, + int options); + +/* + * DOMAIN MANAGEMENT FUNCTIONS + */ + +typedef struct { + u32 domid; + u32 ssidref; + unsigned int dying:1, crashed:1, shutdown:1, + paused:1, blocked:1, running:1; + unsigned int shutdown_reason; /* only meaningful if shutdown==1 */ + unsigned long nr_pages; + unsigned long shared_info_frame; + u64 cpu_time; + unsigned long max_memkb; + unsigned int vcpus; + s32 vcpu_to_cpu[MAX_VIRT_CPUS]; + cpumap_t cpumap[MAX_VIRT_CPUS]; +} xc_dominfo_t; + +typedef dom0_getdomaininfo_t xc_domaininfo_t; +int xc_domain_create(int xc_handle, + u32 ssidref, + u32 *pdomid); + + +int xc_domain_dumpcore(int xc_handle, + u32 domid, + const char *corename); + + +/** + * This function pauses a domain. A paused domain still exists in memory + * however it does not receive any timeslices from the hypervisor. + * + * @parm xc_handle a handle to an open hypervisor interface + * @parm domid the domain id to pause + * @return 0 on success, -1 on failure. + */ +int xc_domain_pause(int xc_handle, + u32 domid); +/** + * This function unpauses a domain. The domain should have been previously + * paused. 
+ * + * @parm xc_handle a handle to an open hypervisor interface + * @parm domid the domain id to unpause + * return 0 on success, -1 on failure + */ +int xc_domain_unpause(int xc_handle, + u32 domid); + +/** + * This function will destroy a domain. Destroying a domain removes the domain + * completely from memory. This function should be called after sending the + * domain a SHUTDOWN control message to free up the domain resources. + * + * @parm xc_handle a handle to an open hypervisor interface + * @parm domid the domain id to destroy + * @return 0 on success, -1 on failure + */ +int xc_domain_destroy(int xc_handle, + u32 domid); +int xc_domain_pincpu(int xc_handle, + u32 domid, + int vcpu, + cpumap_t *cpumap); +/** + * This function will return information about one or more domains. It is + * designed to iterate over the list of domains. If a single domain is + * requested, this function will return the next domain in the list - if + * one exists. It is, therefore, important in this case to make sure the + * domain requested was the one returned. + * + * @parm xc_handle a handle to an open hypervisor interface + * @parm first_domid the first domain to enumerate information from. Domains + * are currently enumerate in order of creation. + * @parm max_doms the number of elements in info + * @parm info an array of max_doms size that will contain the information for + * the enumerated domains. + * @return the number of domains enumerated or -1 on error + */ +int xc_domain_getinfo(int xc_handle, + u32 first_domid, + unsigned int max_doms, + xc_dominfo_t *info); + +/** + * This function will return information about one or more domains, using a + * single hypercall. The domain information will be stored into the supplied + * array of xc_domaininfo_t structures. + * + * @parm xc_handle a handle to an open hypervisor interface + * @parm first_domain the first domain to enumerate information from. + * Domains are currently enumerate in order of creation. 
+ * @parm max_domains the number of elements in info + * @parm info an array of max_doms size that will contain the information for + * the enumerated domains. + * @return the number of domains enumerated or -1 on error + */ +int xc_domain_getinfolist(int xc_handle, + u32 first_domain, + unsigned int max_domains, + xc_domaininfo_t *info); + +/** + * This function returns information about one domain. This information is + * more detailed than the information from xc_domain_getinfo(). + * + * @parm xc_handle a handle to an open hypervisor interface + * @parm domid the domain to get information from + * @parm info a pointer to an xc_domaininfo_t to store the domain information + * @parm ctxt a pointer to a structure to store the execution context of the + * domain + * @return 0 on success, -1 on failure + */ +int xc_domain_get_vcpu_context(int xc_handle, + u32 domid, + u32 vcpu, + vcpu_guest_context_t *ctxt); + +int xc_domain_setcpuweight(int xc_handle, + u32 domid, + float weight); +long long xc_domain_get_cpu_usage(int xc_handle, + domid_t domid, + int vcpu); + + +typedef dom0_shadow_control_stats_t xc_shadow_control_stats_t; +int xc_shadow_control(int xc_handle, + u32 domid, + unsigned int sop, + unsigned long *dirty_bitmap, + unsigned long pages, + xc_shadow_control_stats_t *stats); + +int xc_bvtsched_global_set(int xc_handle, + unsigned long ctx_allow); + +int xc_bvtsched_domain_set(int xc_handle, + u32 domid, + u32 mcuadv, + int warpback, + s32 warpvalue, + long long warpl, + long long warpu); + +int xc_bvtsched_global_get(int xc_handle, + unsigned long *ctx_allow); + +int xc_bvtsched_domain_get(int xc_handle, + u32 domid, + u32 *mcuadv, + int *warpback, + s32 *warpvalue, + long long *warpl, + long long *warpu); + +int xc_sedf_domain_set(int xc_handle, + u32 domid, + u64 period, u64 slice, u64 latency, u16 extratime, u16 weight); + +int xc_sedf_domain_get(int xc_handle, + u32 domid, + u64* period, u64 *slice, u64 *latency, u16 *extratime, u16* weight); + 
+typedef evtchn_status_t xc_evtchn_status_t; + +/* + * EVENT CHANNEL FUNCTIONS + */ + +/** + * This function allocates an unbound port. Ports are named endpoints used for + * interdomain communication. This function is most useful in opening a + * well-known port within a domain to receive events on. + * + * @parm xc_handle a handle to an open hypervisor interface + * @parm dom the ID of the domain. This maybe DOMID_SELF + * @parm port a pointer to a port. This is an in/out parameter. If *port is + * 0, then a new port will be assigned, if port is > 0 then that + * port is allocated if the port is unallocated. + * @return 0 on success, -1 on failure + */ +int xc_evtchn_alloc_unbound(int xc_handle, + u32 dom, + int *port); + +/** + * This function creates a pair of ports between two domains. A port can only + * be bound once within a domain. + * + * @parm xc_handle a handle to an open hypervisor interface + * @parm dom1 one of the two domains to connect. Can be DOMID_SELF. + * @parm dom2 the other domain to connect. Can be DOMID_SELF. + * @parm port1 an in/out parameter. If > 0, then try to connect *port. If + * 0, then allocate a new port and store the port in *port. + * @parm port2 the port connected on port2. This parameter behaves the same + * way as port1. + * @return 0 on success, -1 on error. + */ +int xc_evtchn_bind_interdomain(int xc_handle, + u32 dom1, + u32 dom2, + int *port1, + int *port2); +int xc_evtchn_bind_virq(int xc_handle, + int virq, + int *port); + +/** + * This function will close a single port on an event channel. + * + * @parm xc_handle a handle to an open hypervisor interface + * @parm dom the domain that the port exists on. May be DOMID_SELF. + * @parm port the port to close + * @return 0 on success, -1 on error + */ +int xc_evtchn_close(int xc_handle, + u32 dom, /* may be DOMID_SELF */ + int port); + +/** + * This function generates a notify event on a bound port. 
+ * + * Notifies can be read within Linux by opening /dev/xen/evtchn and reading + * a 16 bit value. The result will be the port the event occurred on. When + * events occur, the port is masked until the 16 bit port value is written back + * to the file. When /dev/xen/evtchn is opened, it has to be bound via an + * ioctl to each port to listen on. The ioctl for binding is _IO('E', 2). The + * parameter is the port to listen on. + * + * @parm xc_handle a handle to an open hypervisor interface + * @parm local_port the port to generate the notify on + * @return 0 on success, -1 on error + */ +int xc_evtchn_send(int xc_handle, + int local_port); +int xc_evtchn_status(int xc_handle, + u32 dom, /* may be DOMID_SELF */ + int port, + xc_evtchn_status_t *status); + +int xc_physdev_pci_access_modify(int xc_handle, + u32 domid, + int bus, + int dev, + int func, + int enable); + +int xc_readconsolering(int xc_handle, + char **pbuffer, + unsigned int *pnr_chars, + int clear); + +typedef dom0_physinfo_t xc_physinfo_t; +int xc_physinfo(int xc_handle, + xc_physinfo_t *info); + +int xc_sched_id(int xc_handle, + int *sched_id); + +int xc_domain_setmaxmem(int xc_handle, + u32 domid, + unsigned int max_memkb); + +int xc_domain_memory_increase_reservation(int xc_handle, + u32 domid, + unsigned int mem_kb); + +typedef dom0_perfc_desc_t xc_perfc_desc_t; +/* IMPORTANT: The caller is responsible for mlock()'ing the @desc array. */ +int xc_perfc_control(int xc_handle, + u32 op, + xc_perfc_desc_t *desc); + +/* read/write msr */ +long long xc_msr_read(int xc_handle, int cpu_mask, int msr); +int xc_msr_write(int xc_handle, int cpu_mask, int msr, unsigned int low, + unsigned int high); + +/** + * Memory maps a range within one domain to a local address range. Mappings + * should be unmapped with munmap and should follow the same rules as mmap + * regarding page alignment. Returns NULL on failure. 
+ * + * In Linux, the ring queue for the control channel is accessible by mapping + * the shared_info_frame (from xc_domain_getinfo()) + 2048. The structure + * stored there is of type control_if_t. + * + * @parm xc_handle a handle on an open hypervisor interface + * @parm dom the domain to map memory from + * @parm size the amount of memory to map (in multiples of page size) + * @parm prot same flag as in mmap(). + * @parm mfn the frame address to map. + */ +void *xc_map_foreign_range(int xc_handle, u32 dom, + int size, int prot, + unsigned long mfn ); + +void *xc_map_foreign_batch(int xc_handle, u32 dom, int prot, + unsigned long *arr, int num ); + +int xc_get_pfn_list(int xc_handle, u32 domid, unsigned long *pfn_buf, + unsigned long max_pfns); + +int xc_ia64_get_pfn_list(int xc_handle, u32 domid, unsigned long *pfn_buf, + unsigned int start_page, unsigned int nr_pages); + +int xc_mmuext_op(int xc_handle, struct mmuext_op *op, unsigned int nr_ops, + domid_t dom); + +int xc_dom_mem_op(int xc_handle, unsigned int memop, unsigned int *extent_list, + unsigned int nr_extents, unsigned int extent_order, + domid_t domid); + +int xc_get_pfn_type_batch(int xc_handle, u32 dom, int num, unsigned long *arr); + + +/*\ + * GRANT TABLE FUNCTIONS +\*/ + +/** + * This function opens a handle to the more restricted grant table hypervisor + * interface. This may be used where the standard interface is not + * available because the domain is not privileged. + * This function can be called multiple times within a single process. + * Multiple processes can have an open hypervisor interface at the same time. + * + * Each call to this function should have a corresponding call to + * xc_grant_interface_close(). + * + * This function can fail if a Xen-enabled kernel is not currently running. + * + * @return a handle to the hypervisor grant table interface or -1 on failure + */ +int xc_grant_interface_open(void); + +/** + * This function closes an open grant table hypervisor interface. 
+ * + * This function can fail if the handle does not represent an open interface or + * if there were problems closing the interface. + * + * @parm xc_handle a handle to an open grant table hypervisor interface + * @return 0 on success, -1 otherwise. + */ +int xc_grant_interface_close(int xc_handle); + +int xc_gnttab_map_grant_ref(int xc_handle, + u64 host_virt_addr, + u32 dom, + u16 ref, + u16 flags, + s16 *handle, + u64 *dev_bus_addr); + +int xc_gnttab_unmap_grant_ref(int xc_handle, + u64 host_virt_addr, + u64 dev_bus_addr, + u16 handle, + s16 *status); + +int xc_gnttab_setup_table(int xc_handle, + u32 dom, + u16 nr_frames, + s16 *status, + unsigned long **frame_list); + +/* Grant debug builds only: */ +int xc_gnttab_dump_table(int xc_handle, + u32 dom, + s16 *status); + +/* Get current total pages allocated to a domain. */ +long xc_get_tot_pages(int xc_handle, u32 domid); + +/* Execute a privileged dom0 operation. */ +int xc_dom0_op(int xc_handle, dom0_op_t *op); + +/* Initializes the store (for dom0) + remote_port should be the remote end of a bound interdomain channel between + the store and dom0. + + This function returns a shared frame that should be passed to + xs_introduce_domain + */ +long xc_init_store(int xc_handle, int remote_port); + +/* + * MMU updates. + */ +#define MAX_MMU_UPDATES 1024 +struct xc_mmu { + mmu_update_t updates[MAX_MMU_UPDATES]; + int idx; + domid_t subject; +}; +typedef struct xc_mmu xc_mmu_t; +xc_mmu_t *xc_init_mmu_updates(int xc_handle, domid_t dom); +int xc_add_mmu_update(int xc_handle, xc_mmu_t *mmu, + unsigned long ptr, unsigned long val); +int xc_finish_mmu_updates(int xc_handle, xc_mmu_t *mmu); + +#endif diff -r 5f1ed597f107 -r 8799d14bef77 tools/libxc/xenguest.h --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/libxc/xenguest.h Thu Aug 25 22:53:20 2005 @@ -0,0 +1,66 @@ +/****************************************************************************** + * xenguest.h + * + * A library for guest domain management in Xen. 
+ * + * Copyright (c) 2003-2004, K A Fraser. + */ + +#ifndef XENBUILD_H +#define XENBUILD_H + +#define XCFLAGS_VERBOSE 1 +#define XCFLAGS_LIVE 2 +#define XCFLAGS_DEBUG 4 +#define XCFLAGS_CONFIGURE 8 + +/** + * This function will save a domain running Linux. + * + * @parm xc_handle a handle to an open hypervisor interface + * @parm fd the file descriptor to save a domain to + * @parm dom the id of the domain + * @return 0 on success, -1 on failure + */ +int xc_linux_save(int xc_handle, int fd, uint32_t dom); + +/** + * This function will restore a saved domain running Linux. + * + * @parm xc_handle a handle to an open hypervisor interface + * @parm fd the file descriptor to restore a domain from + * @parm dom the id of the domain + * @parm nr_pfns the number of pages + * @parm store_evtchn the store event channel for this domain to use + * @parm store_mfn returned with the mfn of the store page + * @return 0 on success, -1 on failure + */ +int xc_linux_restore(int xc_handle, int io_fd, uint32_t dom, unsigned long nr_pfns, + unsigned int store_evtchn, unsigned long *store_mfn); + +int xc_linux_build(int xc_handle, + uint32_t domid, + const char *image_name, + const char *ramdisk_name, + const char *cmdline, + unsigned int control_evtchn, + unsigned long flags, + unsigned int vcpus, + unsigned int store_evtchn, + unsigned long *store_mfn); + +struct mem_map; +int xc_vmx_build(int xc_handle, + uint32_t domid, + int memsize, + const char *image_name, + struct mem_map *memmap, + const char *ramdisk_name, + const char *cmdline, + unsigned int control_evtchn, + unsigned long flags, + unsigned int vcpus, + unsigned int store_evtchn, + unsigned long *store_mfn); + +#endif diff -r 5f1ed597f107 -r 8799d14bef77 tools/libxc/xg_private.c --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/libxc/xg_private.c Thu Aug 25 22:53:20 2005 @@ -0,0 +1,86 @@ +/****************************************************************************** + * xg_private.c + * + * Helper functions for the 
rest of the library. + */ + +#include <stdlib.h> +#include <zlib.h> + +#include "xg_private.h" + +char *xc_read_kernel_image(const char *filename, unsigned long *size) +{ + int kernel_fd = -1; + gzFile kernel_gfd = NULL; + char *image = NULL; + unsigned int bytes; + + if ( (kernel_fd = open(filename, O_RDONLY)) < 0 ) + { + PERROR("Could not open kernel image"); + goto out; + } + + if ( (*size = xc_get_filesz(kernel_fd)) == 0 ) + { + PERROR("Could not read kernel image"); + goto out; + } + + if ( (kernel_gfd = gzdopen(kernel_fd, "rb")) == NULL ) + { + PERROR("Could not allocate decompression state for state file"); + goto out; + } + + if ( (image = malloc(*size)) == NULL ) + { + PERROR("Could not allocate memory for kernel image"); + goto out; + } + + if ( (bytes = gzread(kernel_gfd, image, *size)) != *size ) + { + PERROR("Error reading kernel image, could not" + " read the whole image (%d != %ld).", bytes, *size); + free(image); + image = NULL; + } + + out: + if ( kernel_gfd != NULL ) + gzclose(kernel_gfd); + else if ( kernel_fd >= 0 ) + close(kernel_fd); + return image; +} + +/*******************/ + +int pin_table( + int xc_handle, unsigned int type, unsigned long mfn, domid_t dom) +{ + struct mmuext_op op; + + op.cmd = type; + op.mfn = mfn; + + if ( xc_mmuext_op(xc_handle, &op, 1, dom) < 0 ) + return 1; + + return 0; +} + +/* This is shared between save and restore, and may generally be useful. 
*/ +unsigned long csum_page (void * page) +{ + int i; + unsigned long *p = page; + unsigned long long sum=0; + + for ( i = 0; i < (PAGE_SIZE/sizeof(unsigned long)); i++ ) + sum += p[i]; + + return sum ^ (sum>>32); +} diff -r 5f1ed597f107 -r 8799d14bef77 tools/libxc/xg_private.h --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/libxc/xg_private.h Thu Aug 25 22:53:20 2005 @@ -0,0 +1,170 @@ +#ifndef XG_PRIVATE_H +#define XG_PRIVATE_H + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> + +#include "xenctrl.h" + +#include <xen/linux/privcmd.h> + +char *xc_read_kernel_image(const char *filename, unsigned long *size); +unsigned long csum_page (void * page); + +#define _PAGE_PRESENT 0x001 +#define _PAGE_RW 0x002 +#define _PAGE_USER 0x004 +#define _PAGE_PWT 0x008 +#define _PAGE_PCD 0x010 +#define _PAGE_ACCESSED 0x020 +#define _PAGE_DIRTY 0x040 +#define _PAGE_PAT 0x080 +#define _PAGE_PSE 0x080 +#define _PAGE_GLOBAL 0x100 + +#if defined(__i386__) +#define L1_PAGETABLE_SHIFT 12 +#define L2_PAGETABLE_SHIFT 22 +#define L1_PAGETABLE_SHIFT_PAE 12 +#define L2_PAGETABLE_SHIFT_PAE 21 +#define L3_PAGETABLE_SHIFT_PAE 30 +#elif defined(__x86_64__) +#define L1_PAGETABLE_SHIFT 12 +#define L2_PAGETABLE_SHIFT 21 +#define L3_PAGETABLE_SHIFT 30 +#define L4_PAGETABLE_SHIFT 39 +#endif + +#if defined(__i386__) +#define ENTRIES_PER_L1_PAGETABLE 1024 +#define ENTRIES_PER_L2_PAGETABLE 1024 +#define L1_PAGETABLE_ENTRIES_PAE 512 +#define L2_PAGETABLE_ENTRIES_PAE 512 +#define L3_PAGETABLE_ENTRIES_PAE 4 +#elif defined(__x86_64__) +#define L1_PAGETABLE_ENTRIES 512 +#define L2_PAGETABLE_ENTRIES 512 +#define L3_PAGETABLE_ENTRIES 512 +#define L4_PAGETABLE_ENTRIES 512 +#endif + +#define PAGE_SHIFT XC_PAGE_SHIFT +#define PAGE_SIZE (1UL << PAGE_SHIFT) +#define PAGE_MASK (~(PAGE_SIZE-1)) + +typedef u32 l1_pgentry_32_t; +typedef u32 l2_pgentry_32_t; +typedef u64 l1_pgentry_64_t; +typedef u64 l2_pgentry_64_t; 
+typedef u64 l3_pgentry_64_t; +typedef unsigned long l1_pgentry_t; +typedef unsigned long l2_pgentry_t; +#if defined(__x86_64__) +typedef unsigned long l3_pgentry_t; +typedef unsigned long l4_pgentry_t; +#endif + +#if defined(__i386__) +#define l1_table_offset(_a) \ + (((_a) >> L1_PAGETABLE_SHIFT) & (ENTRIES_PER_L1_PAGETABLE - 1)) +#define l2_table_offset(_a) \ + ((_a) >> L2_PAGETABLE_SHIFT) +#define l1_table_offset_pae(_a) \ + (((_a) >> L1_PAGETABLE_SHIFT_PAE) & (L1_PAGETABLE_ENTRIES_PAE - 1)) +#define l2_table_offset_pae(_a) \ + (((_a) >> L2_PAGETABLE_SHIFT_PAE) & (L2_PAGETABLE_ENTRIES_PAE - 1)) +#define l3_table_offset_pae(_a) \ + (((_a) >> L3_PAGETABLE_SHIFT_PAE) & (L3_PAGETABLE_ENTRIES_PAE - 1)) +#elif defined(__x86_64__) +#define l1_table_offset(_a) \ + (((_a) >> L1_PAGETABLE_SHIFT) & (L1_PAGETABLE_ENTRIES - 1)) +#define l2_table_offset(_a) \ + (((_a) >> L2_PAGETABLE_SHIFT) & (L2_PAGETABLE_ENTRIES - 1)) +#define l3_table_offset(_a) \ + (((_a) >> L3_PAGETABLE_SHIFT) & (L3_PAGETABLE_ENTRIES - 1)) +#define l4_table_offset(_a) \ + (((_a) >> L4_PAGETABLE_SHIFT) & (L4_PAGETABLE_ENTRIES - 1)) +#endif + +#define ERROR(_m, _a...) \ +do { \ + int __saved_errno = errno; \ + fprintf(stderr, "ERROR: " _m "\n" , ## _a ); \ + errno = __saved_errno; \ +} while (0) + + +#define PERROR(_m, _a...) 
\ +do { \ + int __saved_errno = errno; \ + fprintf(stderr, "ERROR: " _m " (%d = %s)\n" , ## _a , \ + __saved_errno, strerror(__saved_errno)); \ + errno = __saved_errno; \ +} while (0) + + +struct domain_setup_info +{ + unsigned long v_start; + unsigned long v_end; + unsigned long v_kernstart; + unsigned long v_kernend; + unsigned long v_kernentry; + + unsigned int load_symtab; + unsigned int pae_kernel; + unsigned long symtab_addr; + unsigned long symtab_len; +}; + +typedef int (*parseimagefunc)(char *image, unsigned long image_size, + struct domain_setup_info *dsi); +typedef int (*loadimagefunc)(char *image, unsigned long image_size, int xch, + u32 dom, unsigned long *parray, + struct domain_setup_info *dsi); + +struct load_funcs +{ + parseimagefunc parseimage; + loadimagefunc loadimage; +}; + +#define mfn_mapper_queue_size 128 + +typedef struct mfn_mapper { + int xc_handle; + int size; + int prot; + int error; + int max_queue_size; + void * addr; + privcmd_mmap_t ioctl; + +} mfn_mapper_t; + +unsigned long xc_get_m2p_start_mfn (int xc_handle); + +int xc_copy_to_domain_page(int xc_handle, u32 domid, + unsigned long dst_pfn, void *src_page); + +unsigned long xc_get_filesz(int fd); + +void xc_map_memcpy(unsigned long dst, char *src, unsigned long size, + int xch, u32 dom, unsigned long *parray, + unsigned long vstart); + +int pin_table(int xc_handle, unsigned int type, unsigned long mfn, + domid_t dom); + +/* image loading */ +int probe_elf(char *image, unsigned long image_size, struct load_funcs *funcs); +int probe_bin(char *image, unsigned long image_size, struct load_funcs *funcs); +int probe_aout9(char *image, unsigned long image_size, struct load_funcs *funcs); + +#endif + diff -r 5f1ed597f107 -r 8799d14bef77 tools/security/example.txt --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/security/example.txt Thu Aug 25 22:53:20 2005 @@ -0,0 +1,269 @@ +## +# example.txt <description to the xen access control architecture> +# +# Author: +# Reiner Sailer 08/15/2005 
<sailer@xxxxxxxxxxxxxx> +# +# +# This file introduces into the tools to manage policies +# and to label domains and resources. +## + +We will show how to install and use the chwall_ste policy. +Other policies work similarly. Feedback welcome! + + + +1. Using secpol_xml2bin to translate the chwall_ste policy: +=========================================================== + +#tools/security/secpol_xml2bin chwall_ste + +Successful execution should print: + + [root@laptopxn security]# ./secpol_xml2bin chwall_ste + Validating label file policies/chwall_ste/chwall_ste-security_label_template.xml... + XML Schema policies/security_policy.xsd valid. + Validating policy file policies/chwall_ste/chwall_ste-security_policy.xml... + XML Schema policies/security_policy.xsd valid. + Creating ssid mappings ... + Creating label mappings ... + Max chwall labels: 7 + Max chwall-types: 4 + Max chwall-ssids: 5 + Max ste labels: 14 + Max ste-types: 6 + Max ste-ssids: 10 + +The tool looks in directory policies/chwall_ste for +the label and policy files. + +The default policy directory structure under tools/security looks like: + +policies +|-- security_policy.xsd +|-- chwall +| |-- chwall-security_label_template.xml +| `-- chwall-security_policy.xml +|-- chwall_ste +| |-- chwall_ste-security_label_template.xml +| `-- chwall_ste-security_policy.xml +|-- null +| |-- null-security_label_template.xml +| `-- null-security_policy.xml +`-- ste + |-- ste-security_label_template.xml + `-- ste-security_policy.xml + +policies/security_policy.xsd contains the schema against which both the +label-template and the policy files must validate during translation. + +policies/chwall_ste/chwall_ste-security_policy.xml defines the +policies and the types known to the policies. 
+ +policies/chwall_ste/chwall_ste-security_label_template.xml contains +label definitions that group chwall and ste types together and make +them easier to use for users. + +After executing the above secpol_xml2bin command, you will find 2 new +files in the policies/chwall_ste sub-directory: + +policies/chwall_ste/chwall_ste.map ... this file includes the mapping +of names from the xml files into their binary code representation. + +policies/chwall_ste/chwall_ste.bin ... this is the binary policy file, +the result of parsing the xml files and using the mapping to extract a +binary version that can be loaded into the hypervisor. + + + +2. Loading and activating the policy: +===================================== + +We assume that xen is already configured to use the chwall_ste policy; +please refer to install.txt for instructions. + +To activate the policy from the command line (assuming that the +currently established policy is the minimal boot-policy that is +hard-coded into the hypervisor): + +# ./secpol_tool loadpolicy policies/chwall_ste/chwall_ste.bin + +To activate the policy at next reboot: + +# cp policies/chwall_ste/chwall_ste.bin /boot + +Add a module line to your /boot/grub/grub.conf Xen entry. +My boot entry with chwall_ste enabled looks like this: + + title Xen (2.6.12) + root (hd0,5) + kernel /boot/xen.gz dom0_mem=1200000 console=vga + module /boot/vmlinuz-2.6.12-xen0 ro root=/dev/hda6 rhgb + module /boot/initrd-2.6.12-xen0.img + module /boot/chwall_ste.bin + +This tells the grub boot-loader to load the binary policy, which +the hypervisor will recognize. The hypervisor will then establish +this binary policy during boot instead of the minimal policy that +is hardcoded as default. + +If you have any trouble here, make sure you have the access control +framework enabled (see: install.txt). + + + +3.
Labeling domains: +==================== + +a) Labeling Domain0: + +The chwall_ste-security_label_template.xml file includes an attribute +"bootstrap", which is set to the label name that will be assigned to +Dom0 (this label will be mapped to ssidref 1/1, the default for Dom0). + +b) Labeling User Domains: + +Use the script tools/security/setlabel.sh to choose a label and to +assign labels to user domains. + +To show available labels for the chwall_ste policy: + +#tools/security/setlabel.sh -l + +lists all available labels. For the default chwall_ste it should print +the following: + + [root@laptopxn security]# ./setlabel.sh -l chwall_ste + The following labels are available: + dom_SystemManagement + dom_HomeBanking + dom_Fun + dom_BoincClient + dom_StorageDomain + dom_NetworkDomain + +You need to have compiled the policy beforehand so that a .map file +exists. Setlabel.sh uses the mapping file created throughout the +policy translation to translate a user-friendly label string into a +ssidref-number that is eventually used by the Xen hypervisor. + +We distinguish two kinds of labels: a) VM labels (for domains) and RES +Labels (for resources). We are currently working on support for +resource labeling but will focus here on VM labels. + +Setlabel.sh only prints VM labels (which we have prefixed with "dom_") +since only those are used at this time. + +If you would like to assign the dom_HomeBanking label to one of your +user domains (which you hopefully keep clean), look at an example +domain configuration homebanking.xm: + + #------HOMEBANKING--------- + kernel = "/boot/vmlinuz-2.6.12-xenU" + ramdisk="/boot/U1_ramdisk.img" + memory = 65 + name = "test34" + cpu = -1 # leave to Xen to pick + # Number of network interfaces. Default is 1. + nics=1 + dhcp="dhcp" + #------------------------- + +Now we label this domain + +[root@laptopxn security]# ./setlabel.sh homebanking.xm dom_HomeBanking chwall_ste +Mapped label 'dom_HomeBanking' to ssidref '0x00020002'. 
+ +The domain configuration may now look like: + + [root@laptopxn security]# cat homebanking.xm + #------HOMEBANKING--------- + kernel = "/boot/vmlinuz-2.6.12-xenU" + ramdisk="/boot/U1_ramdisk.img" + memory = 65 + name = "test34" + cpu = -1 # leave to Xen to pick + # Number of network interfaces. Default is 1. + nics=1 + dhcp="dhcp" + #------------------------- + #ACM_POLICY=chwall_ste-security_policy.xml + #ACM_LABEL=dom_HomeBanking + ssidref = 0x00020002 + +You can see 3 new entries, two of which are comments. The only value +that the hypervisor cares about is the ssidref that will reference +those types assigned to this label. You can look them up in the +xml label-template file for the chwall_ste policy. + +This script will eventually move into the domain management and will +be called when the domain is instantiated. For now, the setlabel +script must be run on domains whenever the policy files change since +the mapping between label names and ssidrefs can change in this case. + + +4. Starting a labeled domain +============================ + +Now, start the domain: + #xm create -c homebanking.xm + + +If you label another domain configuration as dom_Fun and try to start +it afterwards, its start will fail. Why? + +Because the running homebanking domain has the chinese wall type +"cw_Sensitive". The new domain dom_Fun has the chinese wall type +"cw_Distrusted". This domain is not allowed to run simultaneously +because of the defined conflict set + + <conflictset name="Protection1"> + <type>cw_Sensitive</type> + <type>cw_Distrusted</type> + </conflictset> + +(in policies/chwall_ste/chwall_ste-security_policy.xml), which says +that only one of the types cw_Sensitive and cw_Distrusted can run at a +time. + +If you save or shut down the HomeBanking domain, you will be able to +start the "Fun" domain. You can look into the Xen log to see if a +domain was denied to start because of the access control framework +with the command 'xm dmesg'.
+ +It is important (and usually non-trivial) to define the labels in a +way that the semantics of the labels are enforced and supported by the +types and the conflict sets. + +Note: While the chinese wall policy enforcement is complete, the type +enforcement is currently enforced in the Xen hypervisor +only. Therefore, only point-to-point sharing with regard to the type +enforcement is currently controlled. We are working on enhancements to +Dom0 that enforce types also for network traffic that is routed +through Dom0 and on the enforcement of resource labeling when binding +resources to domains (e.g., enforcing types between domains and +hardware resources, such as disk partitions). + + +5. Adding your own policies +=========================== + +Writing your own policy (e.g. "mypolicy") requires the following: + +a) the policy definition (types etc.) file +b) the label template definition (labels etc.) file + +If your policy name is "mypolicy", you need to create a +subdirectory mypolicy in tools/security/policies. + +Then you create +tools/security/policies/mypolicy/mypolicy-security_policy.xml and +tools/security/policies/mypolicy/mypolicy-security_label_template.xml. + +You need to keep to the schema as defined in +tools/security/policies/security_policy.xsd since the translation tool +secpol_xml2bin is written against this schema. + +If you keep to the security policy schema, then you can use all the +tools described above. Refer to install.txt to install it. diff -r 5f1ed597f107 -r 8799d14bef77 tools/security/install.txt --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/security/install.txt Thu Aug 25 22:53:20 2005 @@ -0,0 +1,67 @@ +## +# install.txt <description to the xen access control architecture> +# +# Author: +# Reiner Sailer 08/15/2005 <sailer@xxxxxxxxxxxxxx> +# +# +# This file shows how to activate and install the access control +# framework.
+## + + +INSTALLING A SECURITY POLICY IN XEN +=================================== + +By default, the access control architecture is disabled in Xen. To +enable the access control architecture in Xen, follow the steps below. +This description assumes that you want to install the Chinese Wall and +Simple Type Enforcement policy. Some file names need to be replaced +below to activate the Chinese Wall OR the Type Enforcement policy +exclusively (chwall_ste --> {chwall, ste}). + +1. enable access control in Xen + # cd "xen_root" + # edit/xemacs/vi Config.mk + + change the line: + ACM_USE_SECURITY_POLICY ?= ACM_NULL_POLICY + + to: + ACM_USE_SECURITY_POLICY ?= ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY + + # make all + # ./install.sh + +2. compile the policy from xml to a binary format that can be loaded + into the hypervisor for enforcement + # cd tools/security + # make + + manual steps (alternative to make boot_install): + #./secpol_xml2bin chwall_ste + #cp policies/chwall_ste/chwall_ste.bin /boot + #edit /boot/grub/grub.conf + add the following line to your xen boot entry: + "module chwall_ste.bin" + + alternatively, you can try our automatic translation and + installation of the policy: + # make boot_install + + [we try hard to do the right thing to the right boot entry but + please verify boot entry in /boot/grub/grub.conf afterwards; + your xen boot entry should have an additional module line + specifying a chwall_ste.bin file with the correct directory + (e.g. "/" or "/boot").] + + +3.
reboot into the newly compiled hypervisor + + after boot + #xm dmesg should show an entry about the policy being loaded + during the boot process + + #tools/security/secpol_tool getpolicy + should print the new chwall_ste binary policy representation + diff -r 5f1ed597f107 -r 8799d14bef77 tools/security/policies/chwall/chwall-security_label_template.xml --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/security/policies/chwall/chwall-security_label_template.xml Thu Aug 25 22:53:20 2005 @@ -0,0 +1,76 @@ +<?xml version="1.0"?> +<!-- Author: Reiner Sailer, Ray Valdez {sailer,rvaldez}@us.ibm.com --> +<!-- This file defines the security labels, which can --> +<!-- be attached to Domains and resources. Based on --> +<!-- these labels, the access control module decides --> +<!-- about sharing between Domains and about access --> +<!-- of Domains to real resources. --> + +<SecurityLabelTemplate + xmlns="http://www.ibm.com" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.ibm.com security_policy.xsd"> + <LabelHeader> + <Name>chwall-security_label_template</Name> + <Date>2005-08-10</Date> + <PolicyName> + <Url>chwall-security_policy.xml</Url> + <Reference>abcdef123456abcdef</Reference> + </PolicyName> + </LabelHeader> + + <SubjectLabels bootstrap="dom_SystemManagement"> + <!-- single ste typed domains --> + <!-- ACM enforces that only domains with --> + <!-- the same type can share information --> + <!-- --> + <!-- Bootstrap label is assigned to Dom0 --> + <VirtualMachineLabel> + <Name>dom_HomeBanking</Name> + <ChineseWallTypes> + <Type>cw_Sensitive</Type> + </ChineseWallTypes> + </VirtualMachineLabel> + + <VirtualMachineLabel> + <Name>dom_Fun</Name> + <ChineseWallTypes> + <Type>cw_Distrusted</Type> + </ChineseWallTypes> + </VirtualMachineLabel> + + <VirtualMachineLabel> + <!-- donating some cycles to seti@home --> + <Name>dom_BoincClient</Name> + <ChineseWallTypes> + <Type>cw_Isolated</Type> + </ChineseWallTypes> + 
</VirtualMachineLabel> + + <!-- Domains with multiple ste types services; such domains --> + <!-- must keep the types inside their domain safely confined. --> + <VirtualMachineLabel> + <Name>dom_SystemManagement</Name> + <ChineseWallTypes> + <Type>cw_SystemManagement</Type> + </ChineseWallTypes> + </VirtualMachineLabel> + + <VirtualMachineLabel> + <!-- serves persistent storage to other domains --> + <Name>dom_StorageDomain</Name> + <ChineseWallTypes> + <Type>cw_SystemManagement</Type> + </ChineseWallTypes> + </VirtualMachineLabel> + + <VirtualMachineLabel> + <!-- serves network access to other domains --> + <Name>dom_NetworkDomain</Name> + <ChineseWallTypes> + <Type>cw_SystemManagement</Type> + </ChineseWallTypes> + </VirtualMachineLabel> + </SubjectLabels> +</SecurityLabelTemplate> + diff -r 5f1ed597f107 -r 8799d14bef77 tools/security/policies/chwall/chwall-security_policy.xml --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/security/policies/chwall/chwall-security_policy.xml Thu Aug 25 22:53:20 2005 @@ -0,0 +1,36 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- Author: Reiner Sailer, Ray Valdez {sailer,rvaldez}@us.ibm.com --> +<!-- This file defines the security policies, which --> +<!-- can be enforced by the Xen Access Control Module. --> +<!-- Currently: Chinese Wall and Simple Type Enforcement--> +<SecurityPolicyDefinition xmlns="http://www.ibm.com" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.ibm.com security_policy.xsd"> +<PolicyHeader> + <Name>chwall-security_policy</Name> + <Date>2005-08-10</Date> +</PolicyHeader> +<!-- --> +<!-- example of a chinese wall type definition --> +<!-- along with its conflict sets --> +<!-- (typse in a confict set are exclusive, i.e. --> +<!-- once a Domain with one type of a set is --> +<!-- running, no other Domain with another type --> +<!-- of the same conflict set can start.) 
--> + <ChineseWall priority="PrimaryPolicyComponent"> + <ChineseWallTypes> + <Type>cw_SystemManagement</Type> + <Type>cw_Sensitive</Type> + <Type>cw_Isolated</Type> + <Type>cw_Distrusted</Type> + </ChineseWallTypes> + + <ConflictSets> + <Conflict name="Protection1"> + <Type>cw_Sensitive</Type> + <Type>cw_Distrusted</Type> + </Conflict> + </ConflictSets> + </ChineseWall> +</SecurityPolicyDefinition> + diff -r 5f1ed597f107 -r 8799d14bef77 tools/security/policies/chwall_ste/chwall_ste-security_label_template.xml --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/security/policies/chwall_ste/chwall_ste-security_label_template.xml Thu Aug 25 22:53:20 2005 @@ -0,0 +1,167 @@ +<?xml version="1.0"?> +<!-- Author: Reiner Sailer, Ray Valdez {sailer,rvaldez}@us.ibm.com --> +<!-- This file defines the security labels, which can --> +<!-- be attached to Domains and resources. Based on --> +<!-- these labels, the access control module decides --> +<!-- about sharing between Domains and about access --> +<!-- of Domains to real resources. 
--> + +<SecurityLabelTemplate + xmlns="http://www.ibm.com" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.ibm.com security_policy.xsd"> + <LabelHeader> + <Name>chwall_ste-security_label_template</Name> + <Date>2005-08-10</Date> + <PolicyName> + <Url>chwall_ste-security_policy.xml</Url> + <Reference>abcdef123456abcdef</Reference> + </PolicyName> + </LabelHeader> + + <SubjectLabels bootstrap="dom_SystemManagement"> + <!-- single ste typed domains --> + <!-- ACM enforces that only domains with --> + <!-- the same type can share information --> + <!-- --> + <!-- Bootstrap label is assigned to Dom0 --> + <VirtualMachineLabel> + <Name>dom_HomeBanking</Name> + <SimpleTypeEnforcementTypes> + <Type>ste_PersonalFinances</Type> + </SimpleTypeEnforcementTypes> + + <ChineseWallTypes> + <Type>cw_Sensitive</Type> + </ChineseWallTypes> + </VirtualMachineLabel> + + <VirtualMachineLabel> + <Name>dom_Fun</Name> + <SimpleTypeEnforcementTypes> + <Type>ste_InternetInsecure</Type> + </SimpleTypeEnforcementTypes> + + <ChineseWallTypes> + <Type>cw_Distrusted</Type> + </ChineseWallTypes> + </VirtualMachineLabel> + + <VirtualMachineLabel> + <!-- donating some cycles to seti@home --> + <Name>dom_BoincClient</Name> + <SimpleTypeEnforcementTypes> + <Type>ste_DonatedCycles</Type> + </SimpleTypeEnforcementTypes> + + <ChineseWallTypes> + <Type>cw_Isolated</Type> + </ChineseWallTypes> + </VirtualMachineLabel> + + <!-- Domains with multiple ste types services; such domains --> + <!-- must keep the types inside their domain safely confined. --> + <VirtualMachineLabel> + <Name>dom_SystemManagement</Name> + <SimpleTypeEnforcementTypes> + <!-- since dom0 needs access to every domain and --> + <!-- resource right now ... 
--> + <Type>ste_SystemManagement</Type> + <Type>ste_PersonalFinances</Type> + <Type>ste_InternetInsecure</Type> + <Type>ste_DonatedCycles</Type> + <Type>ste_PersistentStorageA</Type> + <Type>ste_NetworkAdapter0</Type> + </SimpleTypeEnforcementTypes> + + <ChineseWallTypes> + <Type>cw_SystemManagement</Type> + </ChineseWallTypes> + </VirtualMachineLabel> + + <VirtualMachineLabel> + <!-- serves persistent storage to other domains --> + <Name>dom_StorageDomain</Name> + <SimpleTypeEnforcementTypes> + <!-- access right to the resource (hard drive a) --> + <Type>ste_PersistentStorageA</Type> + <!-- can serve following types --> + <Type>ste_PersonalFinances</Type> + <Type>ste_InternetInsecure</Type> + </SimpleTypeEnforcementTypes> + + <ChineseWallTypes> + <Type>cw_SystemManagement</Type> + </ChineseWallTypes> + </VirtualMachineLabel> + + <VirtualMachineLabel> + <!-- serves network access to other domains --> + <Name>dom_NetworkDomain</Name> + <SimpleTypeEnforcementTypes> + <!-- access right to the resource (ethernet card) --> + <Type>ste_NetworkAdapter0</Type> + <!-- can serve following types --> + <Type>ste_PersonalFinances</Type> + <Type>ste_InternetInsecure</Type> + <Type>ste_DonatedCycles</Type> + </SimpleTypeEnforcementTypes> + + <ChineseWallTypes> + <Type>cw_SystemManagement</Type> + </ChineseWallTypes> + </VirtualMachineLabel> + </SubjectLabels> + + <ObjectLabels> + <ResourceLabel> + <Name>res_ManagementResource</Name> + <SimpleTypeEnforcementTypes> + <Type>ste_SystemManagement</Type> + </SimpleTypeEnforcementTypes> + </ResourceLabel> + + <ResourceLabel> + <Name>res_HardDrive (hda)</Name> + <SimpleTypeEnforcementTypes> + <Type>ste_PersistentStorageA</Type> + </SimpleTypeEnforcementTypes> + </ResourceLabel> + + <ResourceLabel> + <Name>res_LogicalDiskPartition1 (hda1)</Name> + <SimpleTypeEnforcementTypes> + <Type>ste_PersonalFinances</Type> + </SimpleTypeEnforcementTypes> + </ResourceLabel> + + <ResourceLabel> + <Name>res_LogicalDiskPartition2 (hda2)</Name> + 
<SimpleTypeEnforcementTypes> + <Type>ste_InternetInsecure</Type> + </SimpleTypeEnforcementTypes> + </ResourceLabel> + + <ResourceLabel> + <Name>res_EthernetCard</Name> + <SimpleTypeEnforcementTypes> + <Type>ste_NetworkAdapter0</Type> + </SimpleTypeEnforcementTypes> + </ResourceLabel> + + <ResourceLabel> + <Name>res_SecurityToken</Name> + <SimpleTypeEnforcementTypes> + <Type>ste_PersonalFinances</Type> + </SimpleTypeEnforcementTypes> + </ResourceLabel> + + <ResourceLabel> + <Name>res_GraphicsAdapter</Name> + <SimpleTypeEnforcementTypes> + <Type>ste_SystemManagement</Type> + </SimpleTypeEnforcementTypes> + </ResourceLabel> + </ObjectLabels> +</SecurityLabelTemplate> + diff -r 5f1ed597f107 -r 8799d14bef77 tools/security/policies/chwall_ste/chwall_ste-security_policy.xml --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/security/policies/chwall_ste/chwall_ste-security_policy.xml Thu Aug 25 22:53:20 2005 @@ -0,0 +1,49 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- Author: Reiner Sailer, Ray Valdez {sailer,rvaldez}@us.ibm.com --> +<!-- This file defines the security policies, which --> +<!-- can be enforced by the Xen Access Control Module. --> +<!-- Currently: Chinese Wall and Simple Type Enforcement--> +<SecurityPolicyDefinition xmlns="http://www.ibm.com" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.ibm.com security_policy.xsd"> +<PolicyHeader> + <Name>chwall_ste-security_policy</Name> + <Date>2005-08-10</Date> +</PolicyHeader> +<!-- --> +<!-- example of a simple type enforcement policy definition --> +<!-- --> + <SimpleTypeEnforcement> + <SimpleTypeEnforcementTypes> + <Type>ste_SystemManagement</Type> <!-- machine/security management --> + <Type>ste_PersonalFinances</Type> <!-- personal finances --> + <Type>ste_InternetInsecure</Type> <!-- games, active X, etc. 
--> + <Type>ste_DonatedCycles</Type> <!-- donation to BOINC/seti@home --> + <Type>ste_PersistentStorageA</Type> <!-- domain managing the harddrive A--> + <Type>ste_NetworkAdapter0</Type> <!-- type of the domain managing ethernet adapter 0--> + </SimpleTypeEnforcementTypes> + </SimpleTypeEnforcement> +<!-- --> +<!-- example of a chinese wall type definition --> +<!-- along with its conflict sets --> +<!-- (types in a conflict set are exclusive, i.e. --> +<!-- once a Domain with one type of a set is --> +<!-- running, no other Domain with another type --> +<!-- of the same conflict set can start.) --> + <ChineseWall priority="PrimaryPolicyComponent"> + <ChineseWallTypes> + <Type>cw_SystemManagement</Type> + <Type>cw_Sensitive</Type> + <Type>cw_Isolated</Type> + <Type>cw_Distrusted</Type> + </ChineseWallTypes> + + <ConflictSets> + <Conflict name="Protection1"> + <Type>cw_Sensitive</Type> + <Type>cw_Distrusted</Type> + </Conflict> + </ConflictSets> + </ChineseWall> +</SecurityPolicyDefinition> + diff -r 5f1ed597f107 -r 8799d14bef77 tools/security/policies/null/null-security_label_template.xml --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/security/policies/null/null-security_label_template.xml Thu Aug 25 22:53:20 2005 @@ -0,0 +1,24 @@ +<?xml version="1.0"?> +<!-- Author: Reiner Sailer, Ray Valdez {sailer,rvaldez}@us.ibm.com --> +<!-- This file defines the security labels, which can --> +<!-- be attached to Domains and resources. Based on --> +<!-- these labels, the access control module decides --> +<!-- about sharing between Domains and about access --> +<!-- of Domains to real resources.
--> + +<SecurityLabelTemplate + xmlns="http://www.ibm.com" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.ibm.com security_policy.xsd"> + <LabelHeader> + <Name>null-security_label_template</Name> + + <Date>2005-08-10</Date> + <PolicyName> + <Url>null-security_policy.xml</Url> + + <Reference>abcdef123456abcdef</Reference> + </PolicyName> + </LabelHeader> +</SecurityLabelTemplate> + diff -r 5f1ed597f107 -r 8799d14bef77 tools/security/policies/null/null-security_policy.xml --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/security/policies/null/null-security_policy.xml Thu Aug 25 22:53:20 2005 @@ -0,0 +1,14 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- Author: Reiner Sailer, Ray Valdez {sailer,rvaldez}@us.ibm.com --> +<!-- This file defines the security policies, which --> +<!-- can be enforced by the Xen Access Control Module. --> +<!-- Currently: Chinese Wall and Simple Type Enforcement--> +<SecurityPolicyDefinition xmlns="http://www.ibm.com" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.ibm.com security_policy.xsd"> +<PolicyHeader> + <Name>null-security_policy</Name> + <Date>2005-08-10</Date> +</PolicyHeader> +</SecurityPolicyDefinition> + diff -r 5f1ed597f107 -r 8799d14bef77 tools/security/policies/security_policy.xsd --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/security/policies/security_policy.xsd Thu Aug 25 22:53:20 2005 @@ -0,0 +1,138 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- Author: Ray Valdez, Reiner Sailer {rvaldez,sailer}@us.ibm.com --> +<!-- This file defines the schema, which is used to define --> +<!-- the security policy and the security labels in Xen.
--> + +<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema" targetNamespace="http://www.ibm.com" xmlns="http://www.ibm.com" elementFormDefault="qualified"> + <xsd:element name="SecurityPolicyDefinition"> + <xsd:complexType> + <xsd:sequence> + <xsd:element ref="PolicyHeader" minOccurs="0" maxOccurs="1"></xsd:element> + <xsd:element ref="SimpleTypeEnforcement" minOccurs="0" maxOccurs="1"></xsd:element> + <xsd:element ref="ChineseWall" minOccurs="0" maxOccurs="1"></xsd:element> + </xsd:sequence> + </xsd:complexType> + </xsd:element> + <xsd:element name="SecurityLabelTemplate"> + <xsd:complexType> + <xsd:sequence> + <xsd:element ref="LabelHeader" minOccurs="1" maxOccurs="1"></xsd:element> + <xsd:element name="SubjectLabels" minOccurs="0" maxOccurs="1"> + <xsd:complexType> + <xsd:sequence> + <xsd:element ref="VirtualMachineLabel" minOccurs="1" maxOccurs="unbounded"></xsd:element> + </xsd:sequence> + <xsd:attribute name="bootstrap" type="xsd:string" use="required"></xsd:attribute> + </xsd:complexType> + </xsd:element> + <xsd:element name="ObjectLabels" minOccurs="0" maxOccurs="1"> + <xsd:complexType> + <xsd:sequence> + <xsd:element ref="ResourceLabel" minOccurs="1" maxOccurs="unbounded"></xsd:element> + </xsd:sequence> + </xsd:complexType> + </xsd:element> + </xsd:sequence> + </xsd:complexType> + </xsd:element> + <xsd:element name="PolicyHeader"> + <xsd:complexType> + <xsd:sequence> + <xsd:element ref="Name" minOccurs="1" maxOccurs="1" /> + <xsd:element ref="Date" minOccurs="1" maxOccurs="1" /> + </xsd:sequence> + </xsd:complexType> + </xsd:element> + <xsd:element name="LabelHeader"> + <xsd:complexType> + <xsd:sequence> + <xsd:element ref="Name"></xsd:element> + <xsd:element ref="Date" minOccurs="1" maxOccurs="1"></xsd:element> + <xsd:element ref="PolicyName" minOccurs="1" maxOccurs="1"></xsd:element> + </xsd:sequence> + </xsd:complexType> + </xsd:element> + <xsd:element name="SimpleTypeEnforcement"> + <xsd:complexType> + <xsd:sequence> + <xsd:element 
ref="SimpleTypeEnforcementTypes" /> + </xsd:sequence> + <xsd:attribute name="priority" type="PolicyOrder" use="optional"></xsd:attribute> + </xsd:complexType> + </xsd:element> + <xsd:element name="ChineseWall"> + <xsd:complexType> + <xsd:sequence> + <xsd:element ref="ChineseWallTypes" /> + <xsd:element ref="ConflictSets" /> + </xsd:sequence> + <xsd:attribute name="priority" type="PolicyOrder" use="optional"></xsd:attribute> + </xsd:complexType> + </xsd:element> + <xsd:element name="ChineseWallTypes"> + <xsd:complexType> + <xsd:sequence> + <xsd:element maxOccurs="unbounded" minOccurs="1" ref="Type" /> + </xsd:sequence> + </xsd:complexType> + </xsd:element> + <xsd:element name="ConflictSets"> + <xsd:complexType> + <xsd:sequence> + <xsd:element maxOccurs="unbounded" minOccurs="1" ref="Conflict" /> + </xsd:sequence> + </xsd:complexType> + </xsd:element> + <xsd:element name="SimpleTypeEnforcementTypes"> + <xsd:complexType> + <xsd:sequence> + <xsd:element maxOccurs="unbounded" minOccurs="1" ref="Type" /> + </xsd:sequence> + </xsd:complexType> + </xsd:element> + <xsd:element name="Conflict"> + <xsd:complexType> + <xsd:sequence> + <xsd:element maxOccurs="unbounded" minOccurs="1" ref="Type" /> + </xsd:sequence> + <xsd:attribute name="name" type="xsd:string" use="optional"></xsd:attribute> + </xsd:complexType> + </xsd:element> + <xsd:element name="VirtualMachineLabel"> + <xsd:complexType> + <xsd:sequence> + <xsd:element ref="Name"></xsd:element> + <xsd:element ref="SimpleTypeEnforcementTypes" minOccurs="0" maxOccurs="unbounded" /> + <xsd:element ref="ChineseWallTypes" minOccurs="0" maxOccurs="unbounded" /> + </xsd:sequence> + </xsd:complexType> + </xsd:element> + <xsd:element name="ResourceLabel"> + <xsd:complexType> + <xsd:sequence> + <xsd:element ref="Name"></xsd:element> + <xsd:element ref="SimpleTypeEnforcementTypes" minOccurs="0" maxOccurs="unbounded" /> + </xsd:sequence> + </xsd:complexType> + </xsd:element> + <xsd:element name="PolicyName"> + <xsd:complexType> + 
<xsd:sequence> + <xsd:element ref="Url" /> + <xsd:element ref="Reference" /> + </xsd:sequence> + </xsd:complexType> + </xsd:element> + <xsd:element name="Date" type="xsd:string" /> + <xsd:element name="Name" type="xsd:string" /> + <xsd:element name="Type" type="xsd:string" /> + <xsd:element name="Reference" type="xsd:string" /> + <xsd:element name="Url"></xsd:element> + + <xsd:simpleType name="PolicyOrder"> + <xsd:restriction base="xsd:string"> + <xsd:enumeration value="PrimaryPolicyComponent"></xsd:enumeration> + </xsd:restriction> + </xsd:simpleType> + +</xsd:schema> diff -r 5f1ed597f107 -r 8799d14bef77 tools/security/policies/ste/ste-security_label_template.xml --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/security/policies/ste/ste-security_label_template.xml Thu Aug 25 22:53:20 2005 @@ -0,0 +1,143 @@ +<?xml version="1.0"?> +<!-- Author: Reiner Sailer, Ray Valdez {sailer,rvaldez}@us.ibm.com --> +<!-- This file defines the security labels, which can --> +<!-- be attached to Domains and resources. Based on --> +<!-- these labels, the access control module decides --> +<!-- about sharing between Domains and about access --> +<!-- of Domains to real resources. 
--> + +<SecurityLabelTemplate + xmlns="http://www.ibm.com" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.ibm.com security_policy.xsd"> + <LabelHeader> + <Name>ste-security_label_template</Name> + <Date>2005-08-10</Date> + <PolicyName> + <Url>ste-security_policy.xml</Url> + <Reference>abcdef123456abcdef</Reference> + </PolicyName> + </LabelHeader> + + <SubjectLabels bootstrap="dom_SystemManagement"> + <!-- single ste typed domains --> + <!-- ACM enforces that only domains with --> + <!-- the same type can share information --> + <!-- --> + <!-- Bootstrap label is assigned to Dom0 --> + <VirtualMachineLabel> + <Name>dom_HomeBanking</Name> + <SimpleTypeEnforcementTypes> + <Type>ste_PersonalFinances</Type> + </SimpleTypeEnforcementTypes> + </VirtualMachineLabel> + + <VirtualMachineLabel> + <Name>dom_Fun</Name> + <SimpleTypeEnforcementTypes> + <Type>ste_InternetInsecure</Type> + </SimpleTypeEnforcementTypes> + </VirtualMachineLabel> + + <VirtualMachineLabel> + <!-- donating some cycles to seti@home --> + <Name>dom_BoincClient</Name> + <SimpleTypeEnforcementTypes> + <Type>ste_DonatedCycles</Type> + </SimpleTypeEnforcementTypes> + </VirtualMachineLabel> + + <!-- Domains with multiple ste types services; such domains --> + <!-- must keep the types inside their domain safely confined. --> + <VirtualMachineLabel> + <Name>dom_SystemManagement</Name> + <SimpleTypeEnforcementTypes> + <!-- since dom0 needs access to every domain and --> + <!-- resource right now ... 
--> + <Type>ste_SystemManagement</Type> + <Type>ste_PersonalFinances</Type> + <Type>ste_InternetInsecure</Type> + <Type>ste_DonatedCycles</Type> + <Type>ste_PersistentStorageA</Type> + <Type>ste_NetworkAdapter0</Type> + </SimpleTypeEnforcementTypes> + </VirtualMachineLabel> + + <VirtualMachineLabel> + <!-- serves persistent storage to other domains --> + <Name>dom_StorageDomain</Name> + <SimpleTypeEnforcementTypes> + <!-- access right to the resource (hard drive a) --> + <Type>ste_PersistentStorageA</Type> + <!-- can serve following types --> + <Type>ste_PersonalFinances</Type> + <Type>ste_InternetInsecure</Type> + </SimpleTypeEnforcementTypes> + </VirtualMachineLabel> + + <VirtualMachineLabel> + <!-- serves network access to other domains --> + <Name>dom_NetworkDomain</Name> + <SimpleTypeEnforcementTypes> + <!-- access right to the resource (ethernet card) --> + <Type>ste_NetworkAdapter0</Type> + <!-- can serve following types --> + <Type>ste_PersonalFinances</Type> + <Type>ste_InternetInsecure</Type> + <Type>ste_DonatedCycles</Type> + </SimpleTypeEnforcementTypes> + </VirtualMachineLabel> + </SubjectLabels> + + <ObjectLabels> + <ResourceLabel> + <Name>res_ManagementResource</Name> + <SimpleTypeEnforcementTypes> + <Type>ste_SystemManagement</Type> + </SimpleTypeEnforcementTypes> + </ResourceLabel> + + <ResourceLabel> + <Name>res_HardDrive (hda)</Name> + <SimpleTypeEnforcementTypes> + <Type>ste_PersistentStorageA</Type> + </SimpleTypeEnforcementTypes> + </ResourceLabel> + + <ResourceLabel> + <Name>res_LogicalDiskPartition1 (hda1)</Name> + <SimpleTypeEnforcementTypes> + <Type>ste_PersonalFinances</Type> + </SimpleTypeEnforcementTypes> + </ResourceLabel> + + <ResourceLabel> + <Name>res_LogicalDiskPartition2 (hda2)</Name> + <SimpleTypeEnforcementTypes> + <Type>ste_InternetInsecure</Type> + </SimpleTypeEnforcementTypes> + </ResourceLabel> + + <ResourceLabel> + <Name>res_EthernetCard</Name> + <SimpleTypeEnforcementTypes> + <Type>ste_NetworkAdapter0</Type> + 
</SimpleTypeEnforcementTypes> + </ResourceLabel> + + <ResourceLabel> + <Name>res_SecurityToken</Name> + <SimpleTypeEnforcementTypes> + <Type>ste_PersonalFinances</Type> + </SimpleTypeEnforcementTypes> + </ResourceLabel> + + <ResourceLabel> + <Name>res_GraphicsAdapter</Name> + <SimpleTypeEnforcementTypes> + <Type>ste_SystemManagement</Type> + </SimpleTypeEnforcementTypes> + </ResourceLabel> + </ObjectLabels> +</SecurityLabelTemplate> + diff -r 5f1ed597f107 -r 8799d14bef77 tools/security/policies/ste/ste-security_policy.xml --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/security/policies/ste/ste-security_policy.xml Thu Aug 25 22:53:20 2005 @@ -0,0 +1,27 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- Author: Reiner Sailer, Ray Valdez {sailer,rvaldez}@us.ibm.com --> +<!-- This file defines the security policies, which --> +<!-- can be enforced by the Xen Access Control Module. --> +<!-- Currently: Chinese Wall and Simple Type Enforcement--> +<SecurityPolicyDefinition xmlns="http://www.ibm.com" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://www.ibm.com security_policy.xsd"> +<PolicyHeader> + <Name>ste-security_policy</Name> + <Date>2005-08-10</Date> +</PolicyHeader> +<!-- --> +<!-- example of a simple type enforcement policy definition --> +<!-- --> + <SimpleTypeEnforcement> + <SimpleTypeEnforcementTypes> + <Type>ste_SystemManagement</Type> <!-- machine/security management --> + <Type>ste_PersonalFinances</Type> <!-- personal finances --> + <Type>ste_InternetInsecure</Type> <!-- games, active X, etc. 
--> + <Type>ste_DonatedCycles</Type> <!-- donation to BOINC/seti@home --> + <Type>ste_PersistentStorageA</Type> <!-- domain managing the harddrive A--> + <Type>ste_NetworkAdapter0</Type> <!-- type of the domain managing ethernet adapter 0--> + </SimpleTypeEnforcementTypes> + </SimpleTypeEnforcement> +</SecurityPolicyDefinition> + diff -r 5f1ed597f107 -r 8799d14bef77 tools/security/policy.txt --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/security/policy.txt Thu Aug 25 22:53:20 2005 @@ -0,0 +1,405 @@ +## +# policy.txt <description to the Xen access control architecture> +# +# Author: +# Reiner Sailer 08/15/2005 <sailer@xxxxxxxxxxxxxx> +# +# +# This file gives an overview of the security policies currently +# provided and also gives some reasoning about how to assign +# labels to domains. +## + +Xen access control policies + + +General explanation of supported security policies: +===================================================== + +We have implemented the mandatory access control architecture of our +hypervisor security architecture (sHype) for the Xen hypervisor. It +controls communication (in Xen: event channels, grant tables) between +Virtual Machines (from here on called domains) and through this the +virtual block devices, networking, and shared memory are implemented +on top of these communication means. While we have implemented the +described policies and access control architecture for other +hypervisor systems, we will describe below specifically its +implementation and use in the Xen hypervisor. The policy enforcement +is called mandatory regarding user domains since the policy is +given by the security administration and enforced independently of the +user domains by the Xen hypervisor in cooperation with the domain +management.
+ +The access control architecture consists of three parts: + +i) The access control policy determines the "command set" of the ACM +and the hooks with which they can be configured to constrain the +sharing of virtual resources. The current access control architecture +implemented for Xen supports two policies: Chinese Wall and Simple +Type Enforcement, which we describe in turn below. + + +ii) The actually enforced policy instantiation uses the policy +language (i) to configure the Xen access control in a way that suits +the specific application (home desktop environment, company desktop, +Web server system, etc.). We have defined an exemplary policy +instantiation for Chinese Wall (chwall policy) and Simple Type +Enforcement (ste policy) for a desktop system. We offer these policies +in combination since they are controlling orthogonal events. + + +iii) The access control module (ACM) and related hooks are part of the +core hypervisor and their controls cannot be bypassed by domains. The +ACM and hooks are the active security components. We refer to +publications that describe how access control is enforced in the Xen +hypervisor using the ACM (access decision) and the hooks (decision +enforcement) inserted into the setup of event channels and grant +tables, and into domain operations (create, destroy, save, restore, +migrate). These controls decide based on the active policy +configuration (see i. and ii.) if the operation proceeds or if the +operation is aborted (denied).
+ + +In general, security policy instantiations in the Xen access control +framework are defined by two files: + +a) a single "policy-name"-security_policy.xml file that defines the +types known to the ACM and policy rules based on these types + +b) a single "policy-name"-security_label_template.xml file that +defines labels based on known types + +Every security policy has its own sub-directory under +"Xen-root"/tools/security/policies in order to simplify their +management and the security policy tools. We will describe those files +for our example policy (Chinese Wall and Simple Type Enforcement) in +more detail as we go along. Eventually, we will move towards a system +installation where the policies will reside under /etc. + + +CHINESE WALL +============ + +The Chinese Wall policy enables the user to define "which workloads +(domain payloads) cannot run on a single physical system at the same +time". Why would we want to prevent workloads from running at the same +time on the same system? This supports requirements that can (but +don't have to) be rooted in the measure of trust into the isolation of +different domains that share the same hardware. Since the access +control architecture aims at high performance and non-intrusive +implementation, it currently does not address covert (timing) channels +and aims at medium assurance. Users can apply the Chinese Wall policy +to guarantee an air-gap between very sensitive payloads both regarding +covert information channels and regarding resource starvation. + +To enable the CW control, each domain is labeled with a set of Chinese +Wall types and CW Conflict Sets are defined which include those CW +types that cannot run simultaneously on the same hardware. This +interpretation of conflict sets is the only policy rule for the Chinese +Wall policy. + +This is enforced by controlling the start of domains according to +their assigned CW workload types.
Domains with Chinese Wall types that +appear in a common conflict set are running mutually exclusive on a +platform, i.e., once a domain with one of the cw-types of a conflict +set is running, no domain with another cw-type of the same conflict +set can start until the first domain is destroyed, paused, or migrated +away from the physical system (this assumes that such a partition can +no longer be observed). The idea is to assign cw-types according to +the type of payload that a domain runs and to use the Chinese Wall +policy to ensure that payload types can be differentiated by the +hypervisor and can be prevented from being executed on the same system +at the same time. Using the flexible CW policy maintains system +consolidation and workload-balancing while introducing guaranteed +constraints where necessary. + + +Example of a Chinese Wall Policy Instantiation +---------------------------------------------- + +The file chwall-security_policy.xml defines the Chinese Wall types as +well as the conflict sets for our example policy (you find it in the +directory "xen_root"/tools/security/policies/chwall). + +It defines four Chinese Wall types (prefixed with cw_) with the +following meaning: + +* cw_SystemManagement is a type identifying workloads for systems +management, e.g., domain management, device management, or hypervisor +management. + +* cw_Sensitive is identifying workloads that are critical to the user +for one reason or another. + +* cw_Distrusted is identifying workloads a user does not have much +confidence in. E.g. a domain used for surfing in the internet without +protection (i.e., active-X, java, java-script, executing web content) +or for (Internet) Games should be typed this way. + +* cw_Isolated is identifying workloads that are supposedly isolated by +use of the type enforcement policy (described below).
For example, if +a user wants to donate cycles to seti@home, she can setup a separate +domain for a Boinc (http://boinc.ssl.berkeley.edu/) client, disable +this domain from accessing the hard drive and from communicating to +other local domains, and type it as cw_Isolated. We will look at a +specific example later. + +The example policy uses the defined types to define one conflict set: +Protection1 = {cw_Sensitive, cw_Distrusted}. This conflict set tells +the hypervisor that once a domain typed as cw_Sensitive is running, a +domain typed as cw_Distrusted cannot run concurrently (and the other +way round). With this policy, a domain typed as cw_Isolated is allowed +to run simultaneously with domains tagged as cw_Sensitive. + +Consequently, the access control module in the Xen hypervisor +distinguishes 4 different workload types in +this example policy. It is the user's responsibility to type the +domains in a way that reflects the workloads of these domains and, in +the case of cw_Isolated, its properties, e.g. by configuring the +sharing capabilities of the domain accordingly by using the simple +type enforcement policy. + +Users can define their own or change the existing example policy +according to their working environment and security requirements. To +do so, replace the file chwall-security_policy.xml with the new +policy. + + +SIMPLE TYPE ENFORCEMENT +======================= + +The file ste-security_policy.xml defines the simple type enforcement +types for our example policy (you find it in the directory +"xen_root"/tools/security/policies/ste). The Simple Type Enforcement +policy defines which domains can share information with which other +domains. To this end, it controls + +i) inter-domain communication channels (e.g., network traffic, events, +and shared memory). + +ii) access of domains to physical resources (e.g., hard drive, network +cards, graphics adapter, keyboard).
+ +In order to enable the hypervisor to distinguish different domains and +the user to express access rules, the simple type enforcement defines +a set of types (ste_types). + +The policy defines that communication between domains is allowed if +the domains share a common STE type. As with the chwall types, STE +types should enable the differentiation of workloads. The simple type +enforcement access control implementation in the hypervisor enforces +that domains can only communicate (setup event channels, grant tables) +if they share a common type, i.e., both domains have assigned at least +one type in common. A domain can access a resource, if the domain and +the resource share a common type. Hence, assigning STE types to +domains and resources allows users to define constraints on sharing +between domains and to keep sensitive data confined from distrusted +domains. + +Domain <--> Domain Sharing +'''''''''''''''''''''''''' +(implemented but its effective use requires factorization of Dom0) + +a) Domains with a single STE type (general user domains): Sharing +between such domains is enforced entirely by the hypervisor access +control. It is independent of the domains and does not require their +co-operation. + +b) Domains with multiple STE types: One example is a domain that +virtualizes a physical resource (e.g., hard drive) and serves it as +multiple virtual resources (virtual block drives) to other domains of +different types. The idea is that only a specific device domain has +assigned the type required to access the physical hard-drive. Logical +drives are then assigned the types of domains that have access to this +logical drive. Since the Xen hypervisor cannot distinguish between the +logical drives, the access control (type enforcement) is delegated to +the device domain, which has access to the types of domains requesting +to mount a logical drive as well as the types assigned to the +different available logical drives.
+ +Currently in Xen, Dom0 controls all hardware, needs to communicate +with all domains during their setup, and intercepts all communication +between domains. Consequently, Dom0 needs to be assigned all types +used and must be completely trusted to maintain the separation of +information coming from domains with different STE types. Thus a +refactoring of Dom0 is recommended for stronger confinement +guarantees. + +Domain --> RESOURCES Access +''''''''''''''''''''''''''' +(current work) + +We define for each resource that we want to distinguish a separate STE +type. Each STE type is assigned to the respective resource and to +those domains that are allowed to access this resource. Type +enforcement will guarantee that other domains cannot access this +resource since they don't share the resource's STE type. + +Since in the current implementation of Xen, Dom0 controls access to +all hardware (e.g., disk drives, network), Domain-->Resource access +control enforcement must be implemented in Dom0. This is possible +since Dom0 has access to both the domain configuration (including the +domain STE types) and the resource configuration (including the +resource STE types). + +For purposes of gaining higher assurance in the resulting system, it +may be desirable to reduce the size of dom0 by adding one or more +"device domains" (DDs). These DDs, e.g. providing storage or network +access, can support one or more physical devices, and manage +enforcement of MAC policy relevant for said devices. Security benefits +come from the smaller size of these DDs, as they can be more easily +audited than monolithic device driver domains. DDs can help to obtain +maximum security benefit from sHype.
+ + +Example of a Simple Type Enforcement Policy Instantiation +--------------------------------------------------------- + +We define the following types: + +* ste_SystemManagement identifies workloads (and domains that run +them) that must share information to accomplish the management of the +system + +* ste_PersonalFinances identifies workloads that are related to +sensitive programs such as HomeBanking applications or safely +configured web browsers for InternetBanking + +* ste_InternetInsecure identifies workloads that are very +function-rich and unrestricted to offer for example an environment +where internet games can run efficiently + +* ste_DonatedCycles identifies workloads that run on behalf of others, +e.g. a Boinc client + +* ste_PersistentStorageA identifies workloads that have direct access +to persistent storage (e.g., hard drive) + +* ste_NetworkAdapter0 identifies workloads that have direct access to +network cards and related networks + + + +SECURITY LABEL TEMPLATES +======================== + +We introduce security label templates because it is difficult for +users to ensure tagging of domains consistently and since there are +--as we have seen in the case of isolation-- useful dependencies +between the policies. Security Label Templates define type sets that +can be addressed by more user-friendly label names, +e.g. dom_HomeBanking describes a typical typeset tagged to domains +used for sensitive Homebanking work-loads. Labels are defined in the +file "policy-name"-security_label_template.xml. + +Using Security Label Templates has multiple advantages: +a) easy reference of typical sets of type assignments +b) consistent interpretation of type combinations +c) meaningful application-level label names + +The definition of label templates depends on the combination of +policies that are used. We will describe some of the labels defined +for the Chinese Wall and Simple Type Enforcement combination.
+ +In the BoincClient example, the label_template file specifies that +this Label is assigned the Chinese Wall type cw_Isolated. We do this +assuming that this BoincClient is isolated against the rest of the +system infrastructure (no persistent memory, no sharing with local +domains). Since cw_Isolated is not included in any conflict set, it +can run at any time concurrently with any other domain. The +ste_DonatedCycles type assigned to the BoincClient reflects the +isolation assumption: it is only assigned to the dom_NetworkDomain +giving the BoincClient domain access to the network to communicate +with its BoincServer. + +The strategy for combining types into Labels is the following: First +we define a label for each type of general user domain +(workload-oriented). Then we define a new label for each physical +resource that shall be shared using a DD domain (e.g., disk) and for +each logical resource offered through this physical resource (logical +disk partition). We define then device domain labels (here: +dom_SystemManagement, dom_StorageDomain, dom_NetworkDomain) which +include the types of the physical resources (e.g. hda) their domains +need to connect to. Such physical resources can only be accessed +directly by device domains typed with the respective device's STE +type. Additionally we assign to such a device domain Label the STE +types of those user domains that are allowed to access one of the +logical resources (e.g., hda1, hda2) built on top of this physical +resource through the device domain. + + +Label Construction Example: +--------------------------- + +We define here a storage domain label for a domain that owns a real +disk drive and creates the logical disk partitions hda1 and hda2 which +it serves to domains labeled dom_HomeBanking and dom_Fun +respectively. The labels we refer to are defined in the label template +file policies/chwall_ste/chwall_ste-security_label_template.xml.
+ +step1: To distinguish different shared disk drives, we create a +separate Label and STE type for each of them. Here: we create a type +ste_PersistentStorageA for disk drive hda. If you have another disk +drive, you may define another persistent storage type +ste_PersistentStorageB in the chwall_ste-security_policy.xml. + +step2: To distinguish different domains, we create multiple domain +labels including different types. Here: label dom_HomeBanking includes +STE type ste_PersonalFinances, label dom_Fun includes STE type +ste_InternetInsecure. + +step3: The storage domain in charge of the hard drive A needs access +to this hard drive. Therefore the storage domain label +dom_StorageDomain must include the type assigned to the hard drive +(ste_PersistentStorageA). + +step4: In order to serve hda1 to domains labeled dom_HomeBanking +and hda2 to domains labeled dom_Fun, the storage domain label must +include the types of those domains as well (ste_PersonalFinances, +ste_InternetInsecure). + +step5: In order to keep the data for different types safely apart, the +different logical disk partitions must be assigned unique labels and +types, which are used inside the storage domain to extend the ACM +access enforcement to logical resources served from inside the storage +domain. We define labels "res_LogicalDiskPartition1 (hda1)" and assign +it to hda1 and "res_LogicalDiskPartition2 (hda2)" and assign it to +hda2. These labels must include the STE types of those domains that +are allowed to use them (e.g., ste_PersonalFinances for hda1). + +The overall mandatory access control is then enforced in 3 different +Xen components and these components use a single consistent policy to +co-operatively enforce the policy. In the storage domain example, we +have three components that co-operate: + +1.
The ACM module inside the hypervisor enforces: communication between +user domains and the storage domain (only domains including types +ste_PersonalFinances or ste_InternetInsecure can communicate with the +storage domain and request access to logical resources). This confines +the sharing to the types assigned to the storage domain. + +2. The domain management will enforce (work in progress): assignment of +real resources (hda) to domains (storage domain) that share a +type with the resource. + +3. If the storage domain serves multiple STE types (as in our example), +it enforces (work in progress): that domains can access (mount) +logical resources only if they share an STE type with the respective +resource. In our example, domains with the STE type +ste_PersonalFinances can request access (mount) to logical resource +hda1 from the storage domain. + +If you look at the virtual machine label dom_StorageDomain, you will +see the minimal set of types assigned to our domain managing disk +drive hda for serving logical disk partitions exclusively to +dom_HomeBanking and dom_Fun. + +Similarly, network domains can confine access to the network or +network communication between user domains. + +As a result, device domains (e.g., storage domain, network domain) +must be simple and small to ensure their correct co-operation in the +type enforcement model. If such trust is not possible, then hardware +should be assigned exclusively to a single type (or to a single +partition) in which case the hypervisor ACM enforcement enforces the +types independently. diff -r 5f1ed597f107 -r 8799d14bef77 tools/security/readme.txt --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/security/readme.txt Thu Aug 25 22:53:20 2005 @@ -0,0 +1,29 @@ + +## +# readme.txt <description to the xen access control architecture> +# +# Author: +# Reiner Sailer 08/15/2005 <sailer@xxxxxxxxxxxxxx> +# +# +# This file is a toc for information regarding +# the access control policy and tools in Xen. +## + +1.
policy.txt: + + describes the general reasoning and examples for access + control policies in Xen + + +2. install.txt + + describes the activation of the access control framework + in Xen + +3. example.txt + + describes the available tools for managing security policies + in Xen and the tools to label domains + + diff -r 5f1ed597f107 -r 8799d14bef77 tools/security/secpol_compat.h --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/security/secpol_compat.h Thu Aug 25 22:53:20 2005 @@ -0,0 +1,14 @@ +/* secpol_compat.h + * 'translates' data types necessary to + * include <xen/acm.h> + */ +#include <stdint.h> + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; +typedef int8_t s8; +typedef int16_t s16; +typedef int32_t s32; +typedef int64_t s64; diff -r 5f1ed597f107 -r 8799d14bef77 tools/security/secpol_xml2bin.c --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/security/secpol_xml2bin.c Thu Aug 25 22:53:20 2005 @@ -0,0 +1,1396 @@ +/**************************************************************** + * secpol_xml2bin.c + * + * Copyright (C) 2005 IBM Corporation + * + * Author: Reiner Sailer <sailer@xxxxxxxxxx> + * + * Maintained: + * Reiner Sailer <sailer@xxxxxxxxxx> + * Ray Valdez <rvaldez@xxxxxxxxxx> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + * + * sHype policy translation tool. 
This tool takes an XML + * policy specification as input and produces a binary + * policy file that can be loaded into Xen through the + * ACM operations (secpol_tool loadpolicy) interface or at + * boot time (grub module parameter) + * + * indent -i4 -kr -nut + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <libgen.h> +#include <fcntl.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/queue.h> +#include <netinet/in.h> +#include <libxml/xmlschemas.h> +#include <libxml/parser.h> +#include <libxml/tree.h> +#include <libxml/xmlreader.h> +#include "secpol_compat.h" +#include <xen/acm.h> + +#include "secpol_xml2bin.h" + +#define DEBUG 0 + +/* primary / secondary policy component setting */ +enum policycomponent { CHWALL, STE, NULLPOLICY } + primary = NULLPOLICY, secondary = NULLPOLICY; + +/* general list element for ste and chwall type queues */ +struct type_entry { + TAILQ_ENTRY(type_entry) entries; + char *name; /* name of type from xml file */ + type_t mapping; /* type mapping into 16bit */ +}; + +TAILQ_HEAD(tailhead, type_entry) ste_head, chwall_head; + +/* general list element for all label queues */ +enum label_type { VM, RES, ANY }; +struct ssid_entry { + TAILQ_ENTRY(ssid_entry) entries; + char *name; /* label name */ + enum label_type type; /* type: VM / RESOURCE LABEL */ + u_int32_t num; /* ssid or referenced ssid */ + int is_ref; /* if this entry references earlier ssid number */ + unsigned char *row; /* index of types (if not a reference) */ +}; + +TAILQ_HEAD(tailhead_ssid, ssid_entry) ste_ssid_head, chwall_ssid_head, + conflictsets_head; +struct ssid_entry *current_chwall_ssid_p = NULL; +struct ssid_entry *current_ste_ssid_p = NULL; +struct ssid_entry *current_conflictset_p = NULL; + +/* which label to assign to dom0 during boot */ +char *bootstrap_label; + +u_int32_t max_ste_ssids = 0; +u_int32_t max_chwall_ssids = 0; +u_int32_t max_chwall_labels = 0; +u_int32_t 
max_ste_labels = 0; +u_int32_t max_conflictsets = 0; + +char *current_ssid_name; /* store name until structure is allocated */ +char *current_conflictset_name; /* store name until structure is allocated */ + +/* dynamic list of type mappings for STE */ +u_int32_t max_ste_types = 0; + +/* dynamic list of type mappings for CHWALL */ +u_int32_t max_chwall_types = 0; + +/* dynamic list of conflict sets */ +int max_conflict_set = 0; + +/* which policies are defined */ +int have_ste = 0; +int have_chwall = 0; + +/* input/output file names */ +char *policy_filename = NULL, + *label_filename = NULL, + *binary_filename = NULL, *mapping_filename = NULL; + +void usage(char *prg) +{ + printf("usage:\n%s policyname[-policy.xml/-security_label_template.xml]\n", + prg); + exit(EXIT_FAILURE); +} + + +/***************** policy-related parsing *********************/ + +char *type_by_mapping(struct tailhead *head, u_int32_t mapping) +{ + struct type_entry *np; + for (np = head->tqh_first; np != NULL; np = np->entries.tqe_next) + if (np->mapping == mapping) + return np->name; + return NULL; +} + + +struct type_entry *lookup(struct tailhead *head, char *name) +{ + struct type_entry *np; + for (np = head->tqh_first; np != NULL; np = np->entries.tqe_next) + if (!(strcmp(np->name, name))) + return np; + return NULL; +} + +/* enforces single-entry lists */ +int add_entry(struct tailhead *head, char *name, type_t mapping) +{ + struct type_entry *e; + if (lookup(head, name)) + { + printf("Error: Type >%s< defined more than once.\n", name); + return -EFAULT; /* already in the list */ + } + if (!(e = malloc(sizeof(struct type_entry)))) + return -ENOMEM; + + e->name = name; + e->mapping = mapping; + TAILQ_INSERT_TAIL(head, e, entries); + return 0; +} + +int totoken(char *tok) +{ + int i; + for (i = 0; token[i] != NULL; i++) + if (!strcmp(token[i], tok)) + return i; + return -EFAULT; +} + +/* conflictsets use the same data structure as ssids; since + * they are similar in structure (set of 
types) + */ +int init_next_conflictset(void) +{ + struct ssid_entry *conflictset = malloc(sizeof(struct ssid_entry)); + + if (!conflictset) + return -ENOMEM; + + conflictset->name = current_conflictset_name; + conflictset->num = max_conflictsets++; + conflictset->is_ref = 0; /* n/a for conflictsets */ + /** + * row: allocate one byte per type; + * [i] != 0 --> mapped type >i< is part of the conflictset + */ + conflictset->row = malloc(max_chwall_types); + if (!conflictset->row) + return -ENOMEM; + + memset(conflictset->row, 0, max_chwall_types); + TAILQ_INSERT_TAIL(&conflictsets_head, conflictset, entries); + current_conflictset_p = conflictset; + return 0; +} + +int register_type(xmlNode * cur_node, xmlDocPtr doc, unsigned long state) +{ + xmlChar *text; + struct type_entry *e; + + + text = xmlNodeListGetString(doc, cur_node->xmlChildrenNode, 1); + if (!text) + { + printf("Error reading type name!\n"); + return -EFAULT; + } + + switch (state) { + case XML2BIN_stetype_S: + if (add_entry(&ste_head, (char *) text, max_ste_types)) + { + xmlFree(text); + return -EFAULT; + } + max_ste_types++; + break; + + case XML2BIN_chwalltype_S: + if (add_entry(&chwall_head, (char *) text, max_chwall_types)) + { + xmlFree(text); + return -EFAULT; + } + max_chwall_types++; + break; + + case XML2BIN_conflictsettype_S: + /* a) search the type in the chwall_type list */ + e = lookup(&chwall_head, (char *) text); + if (e == NULL) + { + printf("CS type >%s< not a CHWALL type.\n", text); + xmlFree(text); + return -EFAULT; + } + /* b) add type entry to the current cs set */ + if (current_conflictset_p->row[e->mapping]) + { + printf("ERROR: Double entry of type >%s< in conflict set %d.\n", + text, current_conflictset_p->num); + xmlFree(text); + return -EFAULT; + } + current_conflictset_p->row[e->mapping] = 1; + break; + + default: + printf("Incorrect type environment (state = %lx, text = %s).\n", + state, text); + xmlFree(text); + return -EFAULT; + } + return 0; +} + +void 
set_component_type(xmlNode * cur_node, enum policycomponent pc) +{ + xmlChar *order; + + if ((order = xmlGetProp(cur_node, (xmlChar *) PRIMARY_COMPONENT_ATTR_NAME))) { + if (strcmp((char *) order, PRIMARY_COMPONENT)) + { + printf("ERROR: Illegal attribut value >order=%s<.\n", + (char *) order); + xmlFree(order); + exit(EXIT_FAILURE); + } + if (primary != NULLPOLICY) + { + printf("ERROR: Primary Policy Component set twice!\n"); + exit(EXIT_FAILURE); + } + primary = pc; + xmlFree(order); + } +} + +void walk_policy(xmlNode * start, xmlDocPtr doc, unsigned long state) +{ + xmlNode *cur_node = NULL; + int code; + + for (cur_node = start; cur_node; cur_node = cur_node->next) + { + if ((code = totoken((char *) cur_node->name)) < 0) + { + printf("Unknown token: >%s<. Aborting.\n", cur_node->name); + exit(EXIT_FAILURE); + } + switch (code) { /* adjust state to new state */ + case XML2BIN_SECPOL: + case XML2BIN_STETYPES: + case XML2BIN_CHWALLTYPES: + case XML2BIN_CONFLICTSETS: + walk_policy(cur_node->children, doc, state | (1 << code)); + break; + + case XML2BIN_STE: + if (WRITTEN_AGAINST_ACM_STE_VERSION != ACM_STE_VERSION) + { + printf("ERROR: This program was written against another STE version.\n"); + exit(EXIT_FAILURE); + } + have_ste = 1; + set_component_type(cur_node, STE); + walk_policy(cur_node->children, doc, state | (1 << code)); + break; + + case XML2BIN_CHWALL: + if (WRITTEN_AGAINST_ACM_CHWALL_VERSION != ACM_CHWALL_VERSION) + { + printf("ERROR: This program was written against another CHWALL version.\n"); + exit(EXIT_FAILURE); + } + have_chwall = 1; + set_component_type(cur_node, CHWALL); + walk_policy(cur_node->children, doc, state | (1 << code)); + break; + + case XML2BIN_CSTYPE: + current_conflictset_name = + (char *) xmlGetProp(cur_node, (xmlChar *) "name"); + if (!current_conflictset_name) + current_conflictset_name = ""; + + if (init_next_conflictset()) + { + printf + ("ERROR: creating new conflictset structure failed.\n"); + exit(EXIT_FAILURE); + } + 
walk_policy(cur_node->children, doc, state | (1 << code)); + break; + + case XML2BIN_TYPE: + if (register_type(cur_node, doc, state)) + exit(EXIT_FAILURE); + /* type leaf */ + break; + + case XML2BIN_TEXT: + case XML2BIN_COMMENT: + case XML2BIN_POLICYHEADER: + /* leaf - nothing to do */ + break; + + default: + printf("Unkonwn token Error (%d)\n", code); + exit(EXIT_FAILURE); + } + + } + return; +} + +int create_type_mapping(xmlDocPtr doc) +{ + xmlNode *root_element = xmlDocGetRootElement(doc); + struct type_entry *te; + struct ssid_entry *se; + int i; + + printf("Creating ssid mappings ...\n"); + + /* initialize the ste and chwall type lists */ + TAILQ_INIT(&ste_head); + TAILQ_INIT(&chwall_head); + TAILQ_INIT(&conflictsets_head); + + walk_policy(root_element, doc, XML2BIN_NULL); + + /* determine primary/secondary policy component orders */ + if ((primary == NULLPOLICY) && have_chwall) + primary = CHWALL; /* default if not set */ + else if ((primary == NULLPOLICY) && have_ste) + primary = STE; + + switch (primary) { + + case CHWALL: + if (have_ste) + secondary = STE; + /* else default = NULLPOLICY */ + break; + + case STE: + if (have_chwall) + secondary = CHWALL; + /* else default = NULLPOLICY */ + break; + + default: + /* NULL/NULL policy */ + break; + } + + if (!DEBUG) + return 0; + + /* print queues */ + if (have_ste) + { + printf("STE-Type queue (%s):\n", + (primary == STE) ? "PRIMARY" : "SECONDARY"); + for (te = ste_head.tqh_first; te != NULL; + te = te->entries.tqe_next) + printf("name=%22s, map=%x\n", te->name, te->mapping); + } + if (have_chwall) + { + printf("CHWALL-Type queue (%s):\n", + (primary == CHWALL) ? "PRIMARY" : "SECONDARY"); + for (te = chwall_head.tqh_first; te != NULL; + te = te->entries.tqe_next) + printf("name=%s, map=%x\n", te->name, te->mapping); + + printf("Conflictset queue (max=%d):\n", max_conflictsets); + for (se = conflictsets_head.tqh_first; se != NULL; + se = se->entries.tqe_next) + { + printf("conflictset name >%s<\n", + se->name ? 
se->name : "NONAME"); + for (i = 0; i < max_chwall_types; i++) + if (se->row[i]) + printf("#%x ", i); + printf("\n"); + } + } + return 0; +} + + +/***************** template-related parsing *********************/ + +/* add default ssid at head of ssid queues */ +int init_ssid_queues(void) +{ + struct ssid_entry *default_ssid_chwall, *default_ssid_ste; + + default_ssid_chwall = malloc(sizeof(struct ssid_entry)); + default_ssid_ste = malloc(sizeof(struct ssid_entry)); + + if ((!default_ssid_chwall) || (!default_ssid_ste)) + return -ENOMEM; + + /* default chwall ssid */ + default_ssid_chwall->name = "DEFAULT"; + default_ssid_chwall->num = max_chwall_ssids++; + default_ssid_chwall->is_ref = 0; + default_ssid_chwall->type = ANY; + + default_ssid_chwall->row = malloc(max_chwall_types); + + if (!default_ssid_chwall->row) + return -ENOMEM; + + memset(default_ssid_chwall->row, 0, max_chwall_types); + + TAILQ_INSERT_TAIL(&chwall_ssid_head, default_ssid_chwall, entries); + current_chwall_ssid_p = default_ssid_chwall; + max_chwall_labels++; + + /* default ste ssid */ + default_ssid_ste->name = "DEFAULT"; + default_ssid_ste->num = max_ste_ssids++; + default_ssid_ste->is_ref = 0; + default_ssid_ste->type = ANY; + + default_ssid_ste->row = malloc(max_ste_types); + + if (!default_ssid_ste->row) + return -ENOMEM; + + memset(default_ssid_ste->row, 0, max_ste_types); + + TAILQ_INSERT_TAIL(&ste_ssid_head, default_ssid_ste, entries); + current_ste_ssid_p = default_ssid_ste; + max_ste_labels++; + return 0; +} + +int init_next_chwall_ssid(unsigned long state) +{ + struct ssid_entry *ssid = malloc(sizeof(struct ssid_entry)); + + if (!ssid) + return -ENOMEM; + + ssid->name = current_ssid_name; + ssid->num = max_chwall_ssids++; + ssid->is_ref = 0; + + if (state & (1 << XML2BIN_VM)) + ssid->type = VM; + else + ssid->type = RES; + /** + * row: allocate one byte per type; + * [i] != 0 --> mapped type >i< is part of the ssid + */ + ssid->row = malloc(max_chwall_types); + if (!ssid->row) + 
return -ENOMEM; + + memset(ssid->row, 0, max_chwall_types); + TAILQ_INSERT_TAIL(&chwall_ssid_head, ssid, entries); + current_chwall_ssid_p = ssid; + max_chwall_labels++; + return 0; +} + +int init_next_ste_ssid(unsigned long state) +{ + struct ssid_entry *ssid = malloc(sizeof(struct ssid_entry)); + + if (!ssid) + return -ENOMEM; + + ssid->name = current_ssid_name; + ssid->num = max_ste_ssids++; + ssid->is_ref = 0; + + if (state & (1 << XML2BIN_VM)) + ssid->type = VM; + else + ssid->type = RES; + + /** + * row: allocate one byte per type; + * [i] != 0 --> mapped type >i< is part of the ssid + */ + ssid->row = malloc(max_ste_types); + if (!ssid->row) + return -ENOMEM; + + memset(ssid->row, 0, max_ste_types); + TAILQ_INSERT_TAIL(&ste_ssid_head, ssid, entries); + current_ste_ssid_p = ssid; + max_ste_labels++; + + return 0; +} + + +/* adds a type to the current ssid */ +int add_type(xmlNode * cur_node, xmlDocPtr doc, unsigned long state) +{ + xmlChar *text; + struct type_entry *e; + + text = xmlNodeListGetString(doc, cur_node->xmlChildrenNode, 1); + if (!text) + { + printf("Error reading type name!\n"); + return -EFAULT; + } + /* same for all: 1. lookup type mapping, 2. 
mark type in ssid */ + switch (state) { + case XML2BIN_VM_STE_S: + case XML2BIN_RES_STE_S: + /* lookup the type mapping and include the type mapping into the array */ + if (!(e = lookup(&ste_head, (char *) text))) + { + printf("ERROR: unknown VM STE type >%s<.\n", text); + exit(EXIT_FAILURE); + } + if (current_ste_ssid_p->row[e->mapping]) + printf("Warning: double entry of VM STE type >%s<.\n", text); + + current_ste_ssid_p->row[e->mapping] = 1; + break; + + case XML2BIN_VM_CHWALL_S: + /* lookup the type mapping and include the type mapping into the array */ + if (!(e = lookup(&chwall_head, (char *) text))) + { + printf("ERROR: unknown VM CHWALL type >%s<.\n", text); + exit(EXIT_FAILURE); + } + if (current_chwall_ssid_p->row[e->mapping]) + printf("Warning: double entry of VM CHWALL type >%s<.\n", + text); + + current_chwall_ssid_p->row[e->mapping] = 1; + break; + + default: + printf("Incorrect type environment (state = %lx, text = %s).\n", + state, text); + xmlFree(text); + return -EFAULT; + } + return 0; +} + +void set_bootstrap_label(xmlNode * cur_node) +{ + xmlChar *order; + + if ((order = xmlGetProp(cur_node, (xmlChar *) BOOTSTRAP_LABEL_ATTR_NAME))) + bootstrap_label = (char *)order; + else { + printf("ERROR: No bootstrap label defined!\n"); + exit(EXIT_FAILURE); + } +} + +void walk_labels(xmlNode * start, xmlDocPtr doc, unsigned long state) +{ + xmlNode *cur_node = NULL; + int code; + + for (cur_node = start; cur_node; cur_node = cur_node->next) + { + if ((code = totoken((char *) cur_node->name)) < 0) + { + printf("Unkonwn token: >%s<. 
Aborting.\n", cur_node->name); + exit(EXIT_FAILURE); + } + switch (code) { /* adjust state to new state */ + + case XML2BIN_SUBJECTS: + set_bootstrap_label(cur_node); + /* fall through */ + case XML2BIN_VM: + case XML2BIN_RES: + case XML2BIN_SECTEMPLATE: + case XML2BIN_OBJECTS: + walk_labels(cur_node->children, doc, state | (1 << code)); + break; + + case XML2BIN_STETYPES: + /* create new ssid entry to use and point current to it */ + if (init_next_ste_ssid(state)) + { + printf("ERROR: creating new ste ssid structure failed.\n"); + exit(EXIT_FAILURE); + } + walk_labels(cur_node->children, doc, state | (1 << code)); + + break; + + case XML2BIN_CHWALLTYPES: + /* create new ssid entry to use and point current to it */ + if (init_next_chwall_ssid(state)) + { + printf("ERROR: creating new chwall ssid structure failed.\n"); + exit(EXIT_FAILURE); + } + walk_labels(cur_node->children, doc, state | (1 << code)); + + break; + + case XML2BIN_TYPE: + /* add type to current ssid */ + if (add_type(cur_node, doc, state)) + exit(EXIT_FAILURE); + break; + + case XML2BIN_NAME: + if ((state != XML2BIN_VM_S) && (state != XML2BIN_RES_S)) + { + printf("ERROR: >name< out of VM/RES context.\n"); + exit(EXIT_FAILURE); + } + current_ssid_name = (char *) + xmlNodeListGetString(doc, cur_node->xmlChildrenNode, 1); + + if (!current_ssid_name) + { + printf("ERROR: empty >name<!\n"); + exit(EXIT_FAILURE); + } + break; + + case XML2BIN_TEXT: + case XML2BIN_COMMENT: + case XML2BIN_LABELHEADER: + break; + + default: + printf("Unkonwn token Error (%d)\n", code); + exit(EXIT_FAILURE); + } + + } + return; +} + +/* this function walks through a ssid queue + * and transforms double entries into references + * of the first definition (we need to keep the + * entry to map labels but we don't want double + * ssids in the binary policy + */ +void +remove_doubles(struct tailhead_ssid *head, + u_int32_t max_types, u_int32_t * max_ssids) +{ + struct ssid_entry *np, *ni; + + /* walk once through the list */ + 
for (np = head->tqh_first; np != NULL; np = np->entries.tqe_next) + { + /* now search from the start until np for the same entry */ + for (ni = head->tqh_first; ni != np; ni = ni->entries.tqe_next) + { + if (ni->is_ref) + continue; + if (memcmp(np->row, ni->row, max_types)) + continue; + /* found one, set np reference to ni */ + np->is_ref = 1; + np->num = ni->num; + (*max_ssids)--; + } + } + + /* now minimize the ssid numbers used (doubles introduce holes) */ + (*max_ssids) = 0; /* reset */ + + for (np = head->tqh_first; np != NULL; np = np->entries.tqe_next) + { + if (np->is_ref) + continue; + + if (np->num != (*max_ssids)) { + /* first reset all later references to the new max_ssid */ + for (ni = np->entries.tqe_next; ni != NULL; ni = ni->entries.tqe_next) + { + if (ni->num == np->num) + ni->num = (*max_ssids); + } + /* now reset num */ + np->num = (*max_ssids)++; + } + else + (*max_ssids)++; + } +} + +/* + * will go away as soon as we have non-static bootstrap ssidref for dom0 + */ +void fixup_bootstrap_label(struct tailhead_ssid *head, + u_int32_t max_types, u_int32_t * max_ssids) +{ + struct ssid_entry *np; + int i; + + /* should not happen if xml / xsd checks work */ + if (!bootstrap_label) + { + printf("ERROR: No bootstrap label defined.\n"); + exit(EXIT_FAILURE); + } + + /* search bootstrap_label */ + for (np = head->tqh_first; np != NULL; np = np->entries.tqe_next) + { + if (!strcmp(np->name, bootstrap_label)) + { + break; + } + } + + if (!np) { + /* bootstrap label not found */ + printf("ERROR: Bootstrap label >%s< not found.\n", bootstrap_label); + exit(EXIT_FAILURE); + } + + /* move this entry ahead in the list right after the default entry so it + * receives ssidref 1/1 */ + TAILQ_REMOVE(head, np, entries); + TAILQ_INSERT_AFTER(head, head->tqh_first, np, entries); + + /* renumber the ssids (we could also just switch places with 1st element) */ + for (np = head->tqh_first, i=0; np != NULL; np = np->entries.tqe_next, i++) + np->num = i; + +} + +int 
create_ssid_mapping(xmlDocPtr doc) +{ + xmlNode *root_element = xmlDocGetRootElement(doc); + struct ssid_entry *np; + int i; + + printf("Creating label mappings ...\n"); + /* initialize the ste and chwall type lists */ + TAILQ_INIT(&chwall_ssid_head); + TAILQ_INIT(&ste_ssid_head); + + /* init with default ssids */ + if (init_ssid_queues()) + { + printf("ERROR adding default ssids.\n"); + exit(EXIT_FAILURE); + } + + /* now walk the template DOM tree and fill in ssids */ + walk_labels(root_element, doc, XML2BIN_NULL); + + /* + * now sort bootstrap label to the head of the list + * (for now), dom0 assumes its label in the first + * defined ssidref (1/1). 0/0 is the default non-Label + */ + if (have_chwall) + fixup_bootstrap_label(&chwall_ssid_head, max_chwall_types, + &max_chwall_ssids); + if (have_ste) + fixup_bootstrap_label(&ste_ssid_head, max_ste_types, + &max_ste_ssids); + + /* remove any double entries (insert reference instead) */ + if (have_chwall) + remove_doubles(&chwall_ssid_head, max_chwall_types, + &max_chwall_ssids); + if (have_ste) + remove_doubles(&ste_ssid_head, max_ste_types, + &max_ste_ssids); + + if (!DEBUG) + return 0; + + /* print queues */ + if (have_chwall) + { + printf("CHWALL SSID queue (max ssidrefs=%d):\n", max_chwall_ssids); + np = NULL; + for (np = chwall_ssid_head.tqh_first; np != NULL; + np = np->entries.tqe_next) + { + printf("SSID #%02u (Label=%s)\n", np->num, np->name); + if (np->is_ref) + printf("REFERENCE"); + else + for (i = 0; i < max_chwall_types; i++) + if (np->row[i]) + printf("#%02d ", i); + printf("\n\n"); + } + } + if (have_ste) + { + printf("STE SSID queue (max ssidrefs=%d):\n", max_ste_ssids); + np = NULL; + for (np = ste_ssid_head.tqh_first; np != NULL; + np = np->entries.tqe_next) + { + printf("SSID #%02u (Label=%s)\n", np->num, np->name); + if (np->is_ref) + printf("REFERENCE"); + else + for (i = 0; i < max_ste_types; i++) + if (np->row[i]) + printf("#%02d ", i); + printf("\n\n"); + } + } + return 0; +} + 
+/***************** writing the binary policy *********************/ + +/* + * the mapping file is ascii-based since it will likely be used from + * within scripts (using awk, grep, etc.); + * + * We print from high-level to low-level information so that with one + * pass, any symbol can be resolved (e.g. Label -> types) + */ +int write_mapping(char *filename) +{ + + struct ssid_entry *e; + struct type_entry *t; + int i; + FILE *file; + + if ((file = fopen(filename, "w")) == NULL) + return -EIO; + + fprintf(file, "MAGIC %08x\n", ACM_MAGIC); + fprintf(file, "POLICY %s\n", + basename(policy_filename)); + fprintf(file, "BINARY %s\n", + basename(binary_filename)); + if (have_chwall) + { + fprintf(file, "MAX-CHWALL-TYPES %08x\n", max_chwall_types); + fprintf(file, "MAX-CHWALL-SSIDS %08x\n", max_chwall_ssids); + fprintf(file, "MAX-CHWALL-LABELS %08x\n", max_chwall_labels); + } + if (have_ste) + { + fprintf(file, "MAX-STE-TYPES %08x\n", max_ste_types); + fprintf(file, "MAX-STE-SSIDS %08x\n", max_ste_ssids); + fprintf(file, "MAX-STE-LABELS %08x\n", max_ste_labels); + } + fprintf(file, "\n"); + + /* primary / secondary order for combined ssid synthesis/analysis + * if no primary is named, then chwall is primary */ + switch (primary) { + case CHWALL: + fprintf(file, "PRIMARY CHWALL\n"); + break; + + case STE: + fprintf(file, "PRIMARY STE\n"); + break; + + default: + fprintf(file, "PRIMARY NULL\n"); + break; + } + + switch (secondary) { + case CHWALL: + fprintf(file, "SECONDARY CHWALL\n"); + break; + + case STE: + fprintf(file, "SECONDARY STE\n"); + break; + + default: + fprintf(file, "SECONDARY NULL\n"); + break; + } + fprintf(file, "\n"); + + /* first labels to ssid mappings */ + if (have_chwall) + { + for (e = chwall_ssid_head.tqh_first; e != NULL; + e = e->entries.tqe_next) + { + fprintf(file, "LABEL->SSID %s CHWALL %-25s %8x\n", + (e->type == + VM) ? "VM " : ((e->type == RES) ? 
"RES" : "ANY"), + e->name, e->num); + } + fprintf(file, "\n"); + } + if (have_ste) + { + for (e = ste_ssid_head.tqh_first; e != NULL; + e = e->entries.tqe_next) + { + fprintf(file, "LABEL->SSID %s STE %-25s %8x\n", + (e->type == + VM) ? "VM " : ((e->type == RES) ? "RES" : "ANY"), + e->name, e->num); + } + fprintf(file, "\n"); + } + + /* second ssid to type mappings */ + if (have_chwall) + { + for (e = chwall_ssid_head.tqh_first; e != NULL; + e = e->entries.tqe_next) + { + if (e->is_ref) + continue; + + fprintf(file, "SSID->TYPE CHWALL %08x", e->num); + + for (i = 0; i < max_chwall_types; i++) + if (e->row[i]) + fprintf(file, " %s", type_by_mapping(&chwall_head, i)); + + fprintf(file, "\n"); + } + fprintf(file, "\n"); + } + if (have_ste) { + for (e = ste_ssid_head.tqh_first; e != NULL; + e = e->entries.tqe_next) + { + if (e->is_ref) + continue; + + fprintf(file, "SSID->TYPE STE %08x", e->num); + + for (i = 0; i < max_ste_types; i++) + if (e->row[i]) + fprintf(file, " %s", type_by_mapping(&ste_head, i)); + + fprintf(file, "\n"); + } + fprintf(file, "\n"); + } + /* third type mappings */ + if (have_chwall) + { + for (t = chwall_head.tqh_first; t != NULL; t = t->entries.tqe_next) + { + fprintf(file, "TYPE CHWALL %-25s %8x\n", + t->name, t->mapping); + } + fprintf(file, "\n"); + } + if (have_ste) { + for (t = ste_head.tqh_first; t != NULL; t = t->entries.tqe_next) + { + fprintf(file, "TYPE STE %-25s %8x\n", + t->name, t->mapping); + } + fprintf(file, "\n"); + } + fclose(file); + return 0; +} + +unsigned char *write_chwall_binary(u_int32_t * len_chwall) +{ + unsigned char *buf, *ptr; + struct acm_chwall_policy_buffer *chwall_header; + u_int32_t len; + struct ssid_entry *e; + int i; + + if (!have_chwall) + return NULL; + + len = sizeof(struct acm_chwall_policy_buffer) + + sizeof(type_t) * max_chwall_types * max_chwall_ssids + + sizeof(type_t) * max_chwall_types * max_conflictsets; + + buf = malloc(len); + ptr = buf; + + if (!buf) + { + printf("ERROR: out of memory 
allocating chwall buffer.\n"); + exit(EXIT_FAILURE); + } + /* chwall has 3 parts : header, types, conflictsets */ + + chwall_header = (struct acm_chwall_policy_buffer *) buf; + chwall_header->chwall_max_types = htonl(max_chwall_types); + chwall_header->chwall_max_ssidrefs = htonl(max_chwall_ssids); + chwall_header->policy_code = htonl(ACM_CHINESE_WALL_POLICY); + chwall_header->policy_version = htonl(ACM_CHWALL_VERSION); + chwall_header->chwall_ssid_offset = + htonl(sizeof(struct acm_chwall_policy_buffer)); + chwall_header->chwall_max_conflictsets = htonl(max_conflictsets); + chwall_header->chwall_conflict_sets_offset = + htonl(ntohl(chwall_header->chwall_ssid_offset) + + sizeof(domaintype_t) * max_chwall_ssids * max_chwall_types); + chwall_header->chwall_running_types_offset = 0; /* not set, only retrieved */ + chwall_header->chwall_conflict_aggregate_offset = 0; /* not set, only retrieved */ + ptr += sizeof(struct acm_chwall_policy_buffer); + + /* types */ + for (e = chwall_ssid_head.tqh_first; e != NULL; + e = e->entries.tqe_next) + { + if (e->is_ref) + continue; + + for (i = 0; i < max_chwall_types; i++) + ((type_t *) ptr)[i] = htons((type_t) e->row[i]); + + ptr += sizeof(type_t) * max_chwall_types; + } + + /* conflictsets */ + for (e = conflictsets_head.tqh_first; e != NULL; + e = e->entries.tqe_next) + { + for (i = 0; i < max_chwall_types; i++) + ((type_t *) ptr)[i] = htons((type_t) e->row[i]); + + ptr += sizeof(type_t) * max_chwall_types; + } + + if ((ptr - buf) != len) + { + printf("ERROR: wrong lengths in %s.\n", __func__); + exit(EXIT_FAILURE); + } + + (*len_chwall) = len; + return buf; +} + +unsigned char *write_ste_binary(u_int32_t * len_ste) +{ + unsigned char *buf, *ptr; + struct acm_ste_policy_buffer *ste_header; + struct ssid_entry *e; + u_int32_t len; + int i; + + if (!have_ste) + return NULL; + + len = sizeof(struct acm_ste_policy_buffer) + + sizeof(type_t) * max_ste_types * max_ste_ssids; + + buf = malloc(len); + ptr = buf; + + if (!buf) + { + 
printf("ERROR: out of memory allocating chwall buffer.\n"); + exit(EXIT_FAILURE); + } + + /* fill buffer */ + ste_header = (struct acm_ste_policy_buffer *) buf; + ste_header->policy_version = htonl(ACM_STE_VERSION); + ste_header->policy_code = htonl(ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY); + ste_header->ste_max_types = htonl(max_ste_types); + ste_header->ste_max_ssidrefs = htonl(max_ste_ssids); + ste_header->ste_ssid_offset = + htonl(sizeof(struct acm_ste_policy_buffer)); + + ptr += sizeof(struct acm_ste_policy_buffer); + + /* types */ + for (e = ste_ssid_head.tqh_first; e != NULL; e = e->entries.tqe_next) + { + if (e->is_ref) + continue; + + for (i = 0; i < max_ste_types; i++) + ((type_t *) ptr)[i] = htons((type_t) e->row[i]); + + ptr += sizeof(type_t) * max_ste_types; + } + + if ((ptr - buf) != len) + { + printf("ERROR: wrong lengths in %s.\n", __func__); + exit(EXIT_FAILURE); + } + (*len_ste) = len; + return buf; /* for now */ +} + +int write_binary(char *filename) +{ + struct acm_policy_buffer header; + unsigned char *ste_buffer = NULL, *chwall_buffer = NULL; + u_int32_t len; + int fd; + + u_int32_t len_ste = 0, len_chwall = 0; /* length of policy components */ + + /* open binary file */ + if ((fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR)) <= 0) + return -EIO; + + ste_buffer = write_ste_binary(&len_ste); + chwall_buffer = write_chwall_binary(&len_chwall); + + /* determine primary component (default chwall) */ + header.policy_version = htonl(ACM_POLICY_VERSION); + header.magic = htonl(ACM_MAGIC); + + len = sizeof(struct acm_policy_buffer); + if (have_chwall) + len += len_chwall; + if (have_ste) + len += len_ste; + header.len = htonl(len); + + header.primary_buffer_offset = htonl(sizeof(struct acm_policy_buffer)); + if (primary == CHWALL) + { + header.primary_policy_code = htonl(ACM_CHINESE_WALL_POLICY); + header.secondary_buffer_offset = + htonl((sizeof(struct acm_policy_buffer)) + len_chwall); + } + else if (primary == STE) + { + 
header.primary_policy_code = + htonl(ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY); + header.secondary_buffer_offset = + htonl((sizeof(struct acm_policy_buffer)) + len_ste); + } + else + { + /* null policy */ + header.primary_policy_code = htonl(ACM_NULL_POLICY); + header.secondary_buffer_offset = + htonl(header.primary_buffer_offset); + } + + if (secondary == CHWALL) + header.secondary_policy_code = htonl(ACM_CHINESE_WALL_POLICY); + else if (secondary == STE) + header.secondary_policy_code = + htonl(ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY); + else + header.secondary_policy_code = htonl(ACM_NULL_POLICY); + + if (write(fd, (void *) &header, sizeof(struct acm_policy_buffer)) + != sizeof(struct acm_policy_buffer)) + return -EIO; + + /* write primary policy component */ + if (primary == CHWALL) + { + if (write(fd, chwall_buffer, len_chwall) != len_chwall) + return -EIO; + } + else if (primary == STE) + { + if (write(fd, ste_buffer, len_ste) != len_ste) + return -EIO; + } else + ; /* NULL POLICY has no policy data */ + + /* write secondary policy component */ + if (secondary == CHWALL) + { + if (write(fd, chwall_buffer, len_chwall) != len_chwall) + return -EIO; + } + else if (secondary == STE) + { + if (write(fd, ste_buffer, len_ste) != len_ste) + return -EIO; + } else; /* NULL POLICY has no policy data */ + + close(fd); + return 0; +} + +int is_valid(xmlDocPtr doc) +{ + int err = 0; + xmlSchemaPtr schema_ctxt = NULL; + xmlSchemaParserCtxtPtr schemaparser_ctxt = NULL; + xmlSchemaValidCtxtPtr schemavalid_ctxt = NULL; + + schemaparser_ctxt = xmlSchemaNewParserCtxt(SCHEMA_FILENAME); + schema_ctxt = xmlSchemaParse(schemaparser_ctxt); + schemavalid_ctxt = xmlSchemaNewValidCtxt(schema_ctxt); + +#ifdef VALIDATE_SCHEMA + /* only tested to be available from libxml2-2.6.20 upwards */ + if ((err = xmlSchemaIsValid(schemavalid_ctxt)) != 1) + { + printf("ERROR: Invalid schema file %s (err=%d)\n", + SCHEMA_FILENAME, err); + err = -EIO; + goto out; + } + else + printf("XML Schema %s valid.\n", 
SCHEMA_FILENAME); +#endif + if ((err = xmlSchemaValidateDoc(schemavalid_ctxt, doc))) + { + err = -EIO; + goto out; + } + out: + xmlSchemaFreeValidCtxt(schemavalid_ctxt); + xmlSchemaFreeParserCtxt(schemaparser_ctxt); + xmlSchemaFree(schema_ctxt); + return (err != 0) ? 0 : 1; +} + +int main(int argc, char **argv) +{ + xmlDocPtr labeldoc = NULL; + xmlDocPtr policydoc = NULL; + + int err = EXIT_SUCCESS; + + char *file_prefix; + int prefix_len; + + if (ACM_POLICY_VERSION != WRITTEN_AGAINST_ACM_POLICY_VERSION) + { + printf("ERROR: This program was written against an older ACM version.\n"); + exit(EXIT_FAILURE); + } + + if (argc != 2) + usage(basename(argv[0])); + + prefix_len = strlen(POLICY_SUBDIR) + + strlen(argv[1]) + 1 /* "/" */ + + strlen(argv[1]) + 1 /* "/" */ ; + + file_prefix = malloc(prefix_len); + policy_filename = malloc(prefix_len + strlen(POLICY_EXTENSION)); + label_filename = malloc(prefix_len + strlen(LABEL_EXTENSION)); + binary_filename = malloc(prefix_len + strlen(BINARY_EXTENSION)); + mapping_filename = malloc(prefix_len + strlen(MAPPING_EXTENSION)); + + if (!file_prefix || !policy_filename || !label_filename || + !binary_filename || !mapping_filename) + { + printf("ERROR allocating file name memory.\n"); + goto out2; + } + + /* create input/output filenames out of prefix */ + strcat(file_prefix, POLICY_SUBDIR); + strcat(file_prefix, argv[1]); + strcat(file_prefix, "/"); + strcat(file_prefix, argv[1]); + + strcpy(policy_filename, file_prefix); + strcpy(label_filename, file_prefix); + strcpy(binary_filename, file_prefix); + strcpy(mapping_filename, file_prefix); + + strcat(policy_filename, POLICY_EXTENSION); + strcat(label_filename, LABEL_EXTENSION); + strcat(binary_filename, BINARY_EXTENSION); + strcat(mapping_filename, MAPPING_EXTENSION); + + labeldoc = xmlParseFile(label_filename); + + if (labeldoc == NULL) + { + printf("Error: could not parse file %s.\n", argv[1]); + goto out2; + } + + printf("Validating label file %s...\n", label_filename); + if 
(!is_valid(labeldoc)) + { + printf("ERROR: Failed schema-validation for file %s (err=%d)\n", + label_filename, err); + goto out1; + } + + policydoc = xmlParseFile(policy_filename); + + if (policydoc == NULL) + { + printf("Error: could not parse file %s.\n", argv[1]); + goto out1; + } + + printf("Validating policy file %s...\n", policy_filename); + + if (!is_valid(policydoc)) + { + printf("ERROR: Failed schema-validation for file %s (err=%d)\n", + policy_filename, err); + goto out; + } + + /* Init queues and parse policy */ + create_type_mapping(policydoc); + + /* create ssids */ + create_ssid_mapping(labeldoc); + + /* write label mapping file */ + if (write_mapping(mapping_filename)) + { + printf("ERROR: writing mapping file %s.\n", mapping_filename); + goto out; + } + + /* write binary file */ + if (write_binary(binary_filename)) + { + printf("ERROR: writing binary file %s.\n", binary_filename); + goto out; + } + + /* write stats */ + if (have_chwall) + { + printf("Max chwall labels: %u\n", max_chwall_labels); + printf("Max chwall-types: %u\n", max_chwall_types); + printf("Max chwall-ssids: %u\n", max_chwall_ssids); + } + + if (have_ste) + { + printf("Max ste labels: %u\n", max_ste_labels); + printf("Max ste-types: %u\n", max_ste_types); + printf("Max ste-ssids: %u\n", max_ste_ssids); + } + /* cleanup */ + out: + xmlFreeDoc(policydoc); + out1: + xmlFreeDoc(labeldoc); + out2: + xmlCleanupParser(); + return err; +} + diff -r 5f1ed597f107 -r 8799d14bef77 tools/security/secpol_xml2bin.h --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/security/secpol_xml2bin.h Thu Aug 25 22:53:20 2005 @@ -0,0 +1,139 @@ +/**************************************************************** + * secpol_xml2bin.h + * + * Copyright (C) 2005 IBM Corporation + * + * Authors: + * Reiner Sailer <sailer@xxxxxxxxxxxxxx> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software 
Foundation, version 2 of the + * License. + * + */ +#define POLICY_SUBDIR "policies/" +#define POLICY_EXTENSION "-security_policy.xml" +#define LABEL_EXTENSION "-security_label_template.xml" +#define BINARY_EXTENSION ".bin" +#define MAPPING_EXTENSION ".map" +#define PRIMARY_COMPONENT_ATTR_NAME "order" +#define BOOTSTRAP_LABEL_ATTR_NAME "bootstrap" +#define PRIMARY_COMPONENT "PrimaryPolicyComponent" +#define SCHEMA_FILENAME "policies/security_policy.xsd" + +/* basic states (used as 1 << X) */ +#define XML2BIN_SECPOL 0 /* policy tokens */ +#define XML2BIN_STE 1 +#define XML2BIN_CHWALL 2 +#define XML2BIN_CONFLICTSETS 3 +#define XML2BIN_CSTYPE 4 + +#define XML2BIN_SECTEMPLATE 5 /* label tokens */ +#define XML2BIN_POLICYHEADER 6 +#define XML2BIN_LABELHEADER 7 +#define XML2BIN_SUBJECTS 8 +#define XML2BIN_OBJECTS 9 +#define XML2BIN_VM 10 +#define XML2BIN_RES 11 + +#define XML2BIN_STETYPES 12 /* shared tokens */ +#define XML2BIN_CHWALLTYPES 13 +#define XML2BIN_TYPE 14 +#define XML2BIN_NAME 15 +#define XML2BIN_TEXT 16 +#define XML2BIN_COMMENT 17 + +/* type "data type" (currently 16bit) */ +typedef u_int16_t type_t; + +/* list of known elements and token equivalent * + * state constants and token positions must be * + * in sync for correct state recognition */ + +char *token[20] = /* parser triggers */ +{ + [0] = "SecurityPolicyDefinition", /* policy xml */ + [1] = "SimpleTypeEnforcement", + [2] = "ChineseWall", + [3] = "ConflictSets", + [4] = "Conflict", /* label-template xml */ + [5] = "SecurityLabelTemplate", + [6] = "PolicyHeader", + [7] = "LabelHeader", + [8] = "SubjectLabels", + [9] = "ObjectLabels", + [10] = "VirtualMachineLabel", + [11] = "ResourceLabel", + [12] = "SimpleTypeEnforcementTypes", /* common tags */ + [13] = "ChineseWallTypes", + [14] = "Type", + [15] = "Name", + [16] = "text", + [17] = "comment", + [18] = NULL, +}; + +/* important combined states */ +#define XML2BIN_NULL 0 + +/* policy xml parsing states _S */ + +/* e.g., here we are in a 
<secpol,ste,stetypes> environment, * + * so when finding a type element, we know where to put it */ +#define XML2BIN_stetype_S ((1 << XML2BIN_SECPOL) | \ + (1 << XML2BIN_STE) | \ + (1 << XML2BIN_STETYPES)) + +#define XML2BIN_chwalltype_S ((1 << XML2BIN_SECPOL) | \ + (1 << XML2BIN_CHWALL) | \ + (1 << XML2BIN_CHWALLTYPES)) + +#define XML2BIN_conflictset_S ((1 << XML2BIN_SECPOL) | \ + (1 << XML2BIN_CHWALL) | \ + (1 << XML2BIN_CONFLICTSETS)) + +#define XML2BIN_conflictsettype_S ((1 << XML2BIN_SECPOL) | \ + (1 << XML2BIN_CHWALL) | \ + (1 << XML2BIN_CONFLICTSETS) | \ + (1 << XML2BIN_CSTYPE)) + + +/* label xml states */ +#define XML2BIN_VM_S ((1 << XML2BIN_SECTEMPLATE) | \ + (1 << XML2BIN_SUBJECTS) | \ + (1 << XML2BIN_VM)) + +#define XML2BIN_RES_S ((1 << XML2BIN_SECTEMPLATE) | \ + (1 << XML2BIN_OBJECTS) | \ + (1 << XML2BIN_RES)) + +#define XML2BIN_VM_STE_S ((1 << XML2BIN_SECTEMPLATE) | \ + (1 << XML2BIN_SUBJECTS) | \ + (1 << XML2BIN_VM) | \ + (1 << XML2BIN_STETYPES)) + +#define XML2BIN_VM_CHWALL_S ((1 << XML2BIN_SECTEMPLATE) | \ + (1 << XML2BIN_SUBJECTS) | \ + (1 << XML2BIN_VM) | \ + (1 << XML2BIN_CHWALLTYPES)) + +#define XML2BIN_RES_STE_S ((1 << XML2BIN_SECTEMPLATE) | \ + (1 << XML2BIN_OBJECTS) | \ + (1 << XML2BIN_RES) | \ + (1 << XML2BIN_STETYPES)) + + + +/* check versions of headers against which the + * xml2bin translation tool was written + */ + +/* protects from unnoticed changes in struct acm_policy_buffer */ +#define WRITTEN_AGAINST_ACM_POLICY_VERSION 1 + +/* protects from unnoticed changes in struct acm_chwall_policy_buffer */ +#define WRITTEN_AGAINST_ACM_CHWALL_VERSION 1 + +/* protects from unnoticed changes in struct acm_ste_policy_buffer */ +#define WRITTEN_AGAINST_ACM_STE_VERSION 1 diff -r 5f1ed597f107 -r 8799d14bef77 tools/security/setlabel.sh --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/security/setlabel.sh Thu Aug 25 22:53:20 2005 @@ -0,0 +1,345 @@ +#!/bin/sh +# * +# * setlabel +# * +# * Copyright (C) 2005 IBM Corporation +# * +# * Authors: +# * 
Stefan Berger <stefanb@xxxxxxxxxx> +# * +# * This program is free software; you can redistribute it and/or +# * modify it under the terms of the GNU General Public License as +# * published by the Free Software Foundation, version 2 of the +# * License. +# * +# * 'setlabel' labels virtual machine (domain) configuration files with +# * security identifiers that can be enforced in Xen. +# * +# * 'setlabel -?' shows the usage of the program +# * +# * 'setlabel -l vmconfig-file' lists all available labels (only VM +# * labels are used right now) +# * +# * 'setlabel vmconfig-file security-label map-file' inserts the 'ssidref' +# * that corresponds to the security-label under the +# * current policy (if policy changes, 'label' +# * must be re-run over the configuration files; +# * map-file is created during policy translation and +# * is found in the policy's directory +# + +if [ -z "$runbash" ]; then + runbash="1" + export runbash + exec sh -c "bash $0 $*" +fi + + +usage () +{ + echo "Usage: $0 [Option] <vmfile> <label> <policy name> " + echo " or $0 -l <policy name>" + echo "" + echo "Valid Options are:" + echo "-r : to relabel a file without being prompted" + echo "" + echo "vmfile : XEN vm configuration file" + echo "label : the label to map" + echo "policy name : the name of the policy, i.e. 'chwall'" + echo "" + echo "-l <policy name> is used to show valid labels in the map file" + echo "" +} + + +findMapFile () +{ + mapfile="./$1.map" + if [ -r "$mapfile" ]; then + return 1 + fi + + mapfile="./policies/$1/$1.map" + if [ -r "$mapfile" ]; then + return 1 + fi + + return 0 +} + +showLabels () +{ + mapfile=$1 + if [ ! -r "$mapfile" -o "$mapfile" == "" ]; then + echo "Cannot read from vm configuration file $vmfile." 
+ return -1 + fi + + getPrimaryPolicy $mapfile + getSecondaryPolicy $mapfile + + echo "The following labels are available:" + let line=1 + while [ 1 ]; do + ITEM=`cat $mapfile | \ + awk -vline=$line \ + -vprimary=$primary \ + '{ \ + if ($1 == "LABEL->SSID" && \ + $2 == "VM" && \ + $3 == primary ) { \ + ctr++; \ + if (ctr == line) { \ + print $4; \ + } \ + } \ + } END { \ + }'` + + if [ "$ITEM" == "" ]; then + break + fi + if [ "$secondary" != "NULL" ]; then + LABEL=`cat $mapfile | \ + awk -vitem=$ITEM \ + '{ + if ($1 == "LABEL->SSID" && \ + $2 == "VM" && \ + $3 == "CHWALL" && \ + $4 == item ) { \ + result = item; \ + } \ + } END { \ + print result \ + }'` + else + LABEL=$ITEM + fi + + if [ "$LABEL" != "" ]; then + echo "$LABEL" + found=1 + fi + let line=line+1 + done + if [ "$found" != "1" ]; then + echo "No labels found." + fi +} + +getPrimaryPolicy () +{ + mapfile=$1 + primary=`cat $mapfile | \ + awk ' \ + { \ + if ( $1 == "PRIMARY" ) { \ + res=$2; \ + } \ + } END { \ + print res; \ + } '` +} + +getSecondaryPolicy () +{ + mapfile=$1 + secondary=`cat $mapfile | \ + awk ' \ + { \ + if ( $1 == "SECONDARY" ) { \ + res=$2; \ + } \ + } END { \ + print res; \ + } '` +} + + +getDefaultSsid () +{ + mapfile=$1 + pol=$2 + RES=`cat $mapfile \ + awk -vpol=$pol \ + { \ + if ($1 == "LABEL->SSID" && \ + $2 == "ANY" && \ + $3 == pol && \ + $4 == "DEFAULT" ) {\ + res=$5; \ + } \ + } END { \ + printf "%04x", strtonum(res) \ + }'` + echo "default NULL mapping is $RES" + defaultssid=$RES +} + +relabel () +{ + vmfile=$1 + label=$2 + mapfile=$3 + mode=$4 + + if [ ! -r "$vmfile" ]; then + echo "Cannot read from vm configuration file $vmfile." + return -1 + fi + + if [ ! -w "$vmfile" ]; then + echo "Cannot write to vm configuration file $vmfile." + return -1 + fi + + if [ ! -r "$mapfile" ] ; then + echo "Cannot read mapping file $mapfile." + return -1 + fi + + # Determine which policy is primary, which sec. 
+ getPrimaryPolicy $mapfile + getSecondaryPolicy $mapfile + + # Calculate the primary policy's SSIDREF + if [ "$primary" == "NULL" ]; then + SSIDLO="0000" + else + SSIDLO=`cat $mapfile | \ + awk -vlabel=$label \ + -vprimary=$primary \ + '{ \ + if ( $1 == "LABEL->SSID" && \ + $2 == "VM" && \ + $3 == primary && \ + $4 == label ) { \ + result=$5 \ + } \ + } END { \ + if (result != "" ) \ + {printf "%04x", strtonum(result)}\ + }'` + fi + + # Calculate the secondary policy's SSIDREF + if [ "$secondary" == "NULL" ]; then + SSIDHI="0000" + else + SSIDHI=`cat $mapfile | \ + awk -vlabel=$label \ + -vsecondary=$secondary \ + '{ \ + if ( $1 == "LABEL->SSID" && \ + $2 == "VM" && \ + $3 == secondary && \ + $4 == label ) { \ + result=$5 \ + } \ + } END { \ + if (result != "" ) \ + {printf "%04x", strtonum(result)}\ + }'` + fi + + if [ "$SSIDLO" == "" -o \ + "$SSIDHI" == "" ]; then + echo "Could not map the given label '$label'." + return -1 + fi + + ACM_POLICY=`cat $mapfile | \ + awk ' { if ( $1 == "POLICY" ) { \ + result=$2 \ + } \ + } \ + END { \ + if (result != "") { \ + printf result \ + } \ + }'` + + if [ "$ACM_POLICY" == "" ]; then + echo "Could not find 'POLICY' entry in map file." + return -1 + fi + + SSIDREF="0x$SSIDHI$SSIDLO" + + if [ "$mode" != "relabel" ]; then + RES=`cat $vmfile | \ + awk '{ \ + if ( substr($1,0,7) == "ssidref" ) {\ + print $0; \ + } \ + }'` + if [ "$RES" != "" ]; then + echo "Do you want to overwrite the existing mapping ($RES)? (y/N)" + read user + if [ "$user" != "y" -a "$user" != "Y" ]; then + echo "Aborted." + return 0 + fi + fi + fi + + #Write the output + vmtmp1="/tmp/__setlabel.tmp1" + vmtmp2="/tmp/__setlabel.tmp2" + touch $vmtmp1 + touch $vmtmp2 + if [ ! -w "$vmtmp1" -o ! -w "$vmtmp2" ]; then + echo "Cannot create temporary files. Aborting." 
+ return -1 + fi + RES=`sed -e '/^#ACM_POLICY/d' $vmfile > $vmtmp1` + RES=`sed -e '/^#ACM_LABEL/d' $vmtmp1 > $vmtmp2` + RES=`sed -e '/^ssidref/d' $vmtmp2 > $vmtmp1` + echo "#ACM_POLICY=$ACM_POLICY" >> $vmtmp1 + echo "#ACM_LABEL=$label" >> $vmtmp1 + echo "ssidref = $SSIDREF" >> $vmtmp1 + mv -f $vmtmp1 $vmfile + rm -rf $vmtmp1 $vmtmp2 + echo "Mapped label '$label' to ssidref '$SSIDREF'." +} + + + +if [ "$1" == "-r" ]; then + mode="relabel" + shift +elif [ "$1" == "-l" ]; then + mode="show" + shift +elif [ "$1" == "-?" ]; then + mode="usage" +fi + +if [ "$mode" == "show" ]; then + if [ "$1" == "" ]; then + usage + exit -1; + fi + findMapFile $1 + res=$? + if [ "$res" != "0" ]; then + showLabels $mapfile + else + echo "Could not find map file for policy '$1'." + fi +elif [ "$mode" == "usage" ]; then + usage +else + if [ "$3" == "" ]; then + usage + exit -1; + fi + findMapFile $3 + res=$? + if [ "$res" != "0" ]; then + relabel $1 $2 $mapfile $mode + else + echo "Could not find map file for policy '$3'." + fi + +fi diff -r 5f1ed597f107 -r 8799d14bef77 tools/security/updategrub.sh --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/security/updategrub.sh Thu Aug 25 22:53:20 2005 @@ -0,0 +1,171 @@ +#!/bin/sh +# * +# * updategrub +# * +# * Copyright (C) 2005 IBM Corporation +# * +# * Authors: +# * Stefan Berger <stefanb@xxxxxxxxxx> +# * +# * This program is free software; you can redistribute it and/or +# * modify it under the terms of the GNU General Public License as +# * published by the Free Software Foundation, version 2 of the +# * License. +# * +# * +# + +if [ -z "$runbash" ]; then + runbash="1" + export runbash + exec sh -c "bash $0 $*" + exit +fi + + +# Show usage of this program +usage () +{ + echo "Usage: $0 <policy name> <root of xen repository>" + echo "" + echo "<policy name> : The name of the policy, i.e. xen_null" + echo "<root of xen repository> : The root of the XEN repositrory." 
+ echo "" +} + +# This function sets the global variable 'linux' +# to the name of the linux kernel that was compiled +# For now a pattern should do the trick +getLinuxVersion () +{ + path=$1 + linux="" + for f in $path/linux-*-xen0 ; do + versionfile=$f/include/linux/version.h + if [ -r $versionfile ]; then + lnx=`cat $versionfile | \ + grep UTS_RELEASE | \ + awk '{ \ + len=length($3); \ + print substr($3,2,len-2) }'` + fi + if [ "$lnx" != "" ]; then + linux="[./0-9a-zA-z]*$lnx" + return; + fi + done + + #Last resort. + linux="vmlinuz-2.[45678].[0-9]*[.0-9]*-xen0$" +} + +#Return where the grub.conf file is. +#I only know of one place it can be. +findGrubConf() +{ + grubconf="/boot/grub/grub.conf" + if [ -w $grubconf ]; then + return 1 + fi + return 0 +} + + +#Update the grub configuration file. +#Search for existing entries and replace the current +#policy entry with the policy passed to this script +# +#Arguments passed to this function +# 1st : the grub configuration file +# 2nd : the binary policy file name +# 3rd : the name or pattern of the linux kernel name to match +# +# The algorithm here is based on pattern matching +# and is working correctly if +# - under a title a line beginning with 'kernel' is found +# whose following item ends with "xen.gz" +# Example: kernel /xen.gz dom0_mem=.... 
+# - a module line matching the 3rd parameter is found +# +updateGrub () +{ + grubconf=$1 + policyfile=$2 + linux=$3 + + tmpfile="/tmp/new_grub.conf" + + cat $grubconf | \ + awk -vpolicy=$policyfile \ + -vlinux=$linux '{ \ + if ( $1 == "title" ) { \ + kernelfound = 0; \ + if ( policymaycome == 1 ){ \ + printf ("\tmodule %s%s\n", path, policy); \ + } \ + policymaycome = 0; \ + } \ + else if ( $1 == "kernel" ) { \ + if ( match($2,"xen.gz$") ) { \ + path=substr($2,1,RSTART-1); \ + kernelfound = 1; \ + } \ + } \ + else if ( $1 == "module" && \ + kernelfound == 1 && \ + match($2,linux) ) { \ + policymaycome = 1; \ + } \ + else if ( $1 == "module" && \ + kernelfound == 1 && \ + policymaycome == 1 && \ + match($2,"[0-9a-zA-Z]*.bin$") ) { \ + printf ("\tmodule %s%s\n", path, policy); \ + policymaycome = 0; \ + kernelfound = 0; \ + dontprint = 1; \ + } \ + else if ( $1 == "" && \ + kernelfound == 1 && \ + policymaycome == 1) { \ + dontprint = 1; \ + } \ + if (dontprint == 0) { \ + printf ("%s\n", $0); \ + } \ + dontprint = 0; \ + } END { \ + if ( policymaycome == 1 ) { \ + printf ("\tmodule %s%s\n", path, policy); \ + } \ + }' > $tmpfile + if [ ! -r $tmpfile ]; then + echo "Could not create temporary file! Aborting." + exit -1 + fi + mv -f $tmpfile $grubconf +} + +if [ "$1" == "" -o "$2" == "" ]; then + usage + exit -1 +fi + +if [ "$1" == "-?" ]; then + usage + exit 0 +fi + +policy=$1 +policyfile=$policy.bin + +getLinuxVersion $2 + +findGrubConf +ERR=$? +if [ $ERR -eq 0 ]; then + echo "Could not find grub.conf. Aborting." + exit -1 +fi + +updateGrub $grubconf $policyfile $linux diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstat/Makefile --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/xenstat/Makefile Thu Aug 25 22:53:20 2005 @@ -0,0 +1,13 @@ +XEN_ROOT = ../.. 
+include $(XEN_ROOT)/tools/Rules.mk + +SUBDIRS := +SUBDIRS += libxenstat +SUBDIRS += xentop + +.PHONY: all install clean + +all install clean: + @set -e; for subdir in $(SUBDIRS); do \ + $(MAKE) -C $$subdir $@; \ + done diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstat/libxenstat/COPYING --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/xenstat/libxenstat/COPYING Thu Aug 25 22:53:20 2005 @@ -0,0 +1,510 @@ + + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations +below. + + When we speak of free software, we are referring to freedom of use, +not price. 
Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. 
Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it +becomes a de-facto standard. To achieve this, non-free programs must +be allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. 
+ + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. 
(Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control +compilation and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. 
+ + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. 
+ +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". 
Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. + + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. 
+ + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. (It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at least + three years, to give the same user the materials specified in + Subsection 6a, above, for a charge no more than the cost of + performing this distribution. 
+ + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. 
You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. 
For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply, and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License +may add an explicit geographical distribution limitation excluding those +countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. 
+Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms +of the ordinary General Public License). + + To apply these terms, attach the following notices to the library. +It is safest to attach them to the start of each source file to most +effectively convey the exclusion of warranty; and each file should +have at least the "copyright" line and a pointer to where the full +notice is found. + + + <one line to give the library's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or +your school, if any, to sign a "copyright disclaimer" for the library, +if necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James + Random Hacker. + + <signature of Ty Coon>, 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! + + diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstat/libxenstat/Makefile --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/xenstat/libxenstat/Makefile Thu Aug 25 22:53:20 2005 @@ -0,0 +1,142 @@ +# libxenstat: statistics-collection library for Xen +# Copyright (C) International Business Machines Corp., 2005 +# Author: Josh Triplett <josht@xxxxxxxxxx> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. + +XEN_ROOT=../../.. 
+include $(XEN_ROOT)/tools/Rules.mk
+LINUX_ROOT := $(XEN_ROOT)/linux-2.6-xen-sparse
+
+# install(1) wrappers: programs get mode 0755, data files 0644.
+INSTALL = install
+INSTALL_PROG = $(INSTALL) -m0755 -D
+INSTALL_DATA = $(INSTALL) -m0644 -D
+
+prefix=/usr
+includedir=$(prefix)/include
+libdir=$(prefix)/lib
+
+LDCONFIG=ldconfig
+MAKE_LINK=ln -sf
+
+# Shared-library version: MAJOR is used in the soname, MAJOR.MINOR in the
+# file name of the real shared object.
+MAJOR=0
+MINOR=0
+
+LIB=src/libxenstat.a
+SHLIB=src/libxenstat.so.$(MAJOR).$(MINOR)
+SHLIB_LINKS=src/libxenstat.so.$(MAJOR) src/libxenstat.so
+OBJECTS=src/xenstat.o src/xen-interface.o
+SONAME_FLAGS=-Wl,-soname -Wl,libxenstat.so.$(MAJOR)
+
+WARN_FLAGS=-Wall -Werror
+
+CFLAGS+=-Isrc
+CFLAGS+=-I$(XEN_ROOT)/xen/include/public
+CFLAGS+=-I$(LINUX_ROOT)/include/asm-xen/linux-public/
+LDFLAGS+=-Lsrc
+
+# These targets are commands, not files; declaring them phony keeps a stray
+# file called e.g. "clean" or "install" from masking them.
+.PHONY: all install clean
+.PHONY: all-bindings install-bindings
+.PHONY: python-bindings install-python-bindings
+.PHONY: perl-bindings install-perl-bindings
+
+# The default build produces only the static library; the shared library is
+# built on demand as a prerequisite of the language bindings.
+all: $(LIB)
+
+$(LIB): $(OBJECTS)
+	$(AR) rc $@ $^
+	$(RANLIB) $@
+
+$(SHLIB): $(OBJECTS)
+	$(CC) $(LDFLAGS) $(SONAME_FLAGS) -shared -o $@ $(OBJECTS)
+
+src/xenstat.o: src/xenstat.c src/xenstat.h src/xen-interface.h
+	$(CC) $(CFLAGS) $(WARN_FLAGS) -c -o $@ $<
+
+src/xen-interface.o: src/xen-interface.c src/xen-interface.h
+	$(CC) $(CFLAGS) $(WARN_FLAGS) -c -o $@ $<
+
+# Symlink chain libxenstat.so -> libxenstat.so.MAJOR -> $(SHLIB).
+# $(<F) is the file-name part of the first prerequisite, so the links are
+# relative to src/.  The soname link must depend on (and point at) the
+# shared object; with $(LIB) as prerequisite it pointed at libxenstat.a.
+src/libxenstat.so.$(MAJOR): $(SHLIB)
+	$(MAKE_LINK) $(<F) $@
+
+src/libxenstat.so: src/libxenstat.so.$(MAJOR)
+	$(MAKE_LINK) $(<F) $@
+
+# NOTE(review): the library install recipe is commented out in this
+# changeset, so "install" currently only builds; the reason is not stated
+# here.
+install: all
+#install: all
+#	$(INSTALL_DATA) src/xenstat.h $(DESTDIR)$(includedir)/xenstat.h
+#	$(INSTALL_PROG) $(LIB) $(DESTDIR)$(libdir)/libxenstat.a
+#	$(INSTALL_PROG) $(SHLIB) \
+#	                $(DESTDIR)$(libdir)/libxenstat.so.$(MAJOR).$(MINOR)
+#	$(MAKE_LINK) libxenstat.so.$(MAJOR).$(MINOR) \
+#	             $(DESTDIR)$(libdir)/libxenstat.so.$(MAJOR)
+#	$(MAKE_LINK) libxenstat.so.$(MAJOR) \
+#	             $(DESTDIR)$(libdir)/libxenstat.so
+#	-$(LDCONFIG)
+
+PYLIB=bindings/swig/python/_xenstat.so
+PYMOD=bindings/swig/python/xenstat.py
+PYSRC=bindings/swig/python/_xenstat.c
+PERLLIB=bindings/swig/perl/xenstat.so
+PERLMOD=bindings/swig/perl/xenstat.pm
+PERLSRC=bindings/swig/perl/xenstat.c
+BINDINGS=$(PYLIB) $(PYMOD) $(PERLLIB) $(PERLMOD)
+BINDINGSRC=$(PYSRC) $(PERLSRC)
+
+# The all-bindings target builds all the language bindings
+all-bindings: perl-bindings python-bindings
+
+# The install-bindings target installs all the language bindings
+install-bindings: install-perl-bindings install-python-bindings
+
+# Every binding links against the shared library (found through -Lsrc and
+# the libxenstat.so link) and wraps src/xenstat.h.
+$(BINDINGS): $(SHLIB) $(SHLIB_LINKS) src/xenstat.h
+
+SWIG_FLAGS=-module xenstat -Isrc
+
+# Python bindings
+PYTHON_VERSION=2.3
+PYTHON_FLAGS=-I/usr/include/python$(PYTHON_VERSION) -lpython$(PYTHON_VERSION)
+$(PYSRC) $(PYMOD): bindings/swig/xenstat.i
+	swig -python $(SWIG_FLAGS) -outdir $(@D) -o $(PYSRC) $<
+
+# -lxenstat goes after the source file: the linker resolves symbols left to
+# right, so a library named before its user may be skipped.
+$(PYLIB): $(PYSRC)
+	$(CC) $(CFLAGS) $(LDFLAGS) $(PYTHON_FLAGS) -shared -o $@ $< -lxenstat
+
+python-bindings: $(PYLIB) $(PYMOD)
+
+pythonlibdir=$(prefix)/lib/python$(PYTHON_VERSION)/site-packages
+# The .py module is data, not an executable, hence INSTALL_DATA.
+install-python-bindings: $(PYLIB) $(PYMOD)
+	$(INSTALL_PROG) $(PYLIB) $(DESTDIR)$(pythonlibdir)/_xenstat.so
+	$(INSTALL_DATA) $(PYMOD) $(DESTDIR)$(pythonlibdir)/xenstat.py
+
+ifeq ($(XENSTAT_PYTHON_BINDINGS),y)
+all: python-bindings
+install: install-python-bindings
+endif
+
+# Perl bindings
+# The backquoted perl command runs in the recipe's shell; $$ passes a
+# literal $ through make.
+PERL_FLAGS=`perl -MConfig -e 'print "$$Config{ccflags} -I$$Config{archlib}/CORE";'`
+$(PERLSRC) $(PERLMOD): bindings/swig/xenstat.i
+	swig -perl $(SWIG_FLAGS) -outdir $(@D) -o $(PERLSRC) $<
+
+$(PERLLIB): $(PERLSRC)
+	$(CC) $(CFLAGS) $(LDFLAGS) $(PERL_FLAGS) -shared -o $@ $< -lxenstat
+
+perl-bindings: $(PERLLIB) $(PERLMOD)
+
+perllibdir=$(prefix)/lib/perl5
+perlmoddir=$(prefix)/share/perl5
+# The .pm module is data, not an executable, hence INSTALL_DATA.
+install-perl-bindings: $(PERLLIB) $(PERLMOD)
+	$(INSTALL_PROG) $(PERLLIB) $(DESTDIR)$(perllibdir)/xenstat.so
+	$(INSTALL_DATA) $(PERLMOD) $(DESTDIR)$(perlmoddir)/xenstat.pm
+
+ifeq ($(XENSTAT_PERL_BINDINGS),y)
+all: perl-bindings
+install: install-perl-bindings
+endif
+
+# Remove everything this Makefile can generate.
+clean:
+	rm -f $(LIB) $(SHLIB) $(SHLIB_LINKS) $(OBJECTS) \
+	      $(BINDINGS) $(BINDINGSRC)
diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstat/libxenstat/bindings/swig/perl/.empty
--- /dev/null Wed Aug 24 02:43:18 2005
+++ b/tools/xenstat/libxenstat/bindings/swig/perl/.empty Thu Aug 25 22:53:20 2005
@@ -0,0 +1,1 @@
+This directory is empty;
this file is included to prevent version control systems from removing the directory. diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstat/libxenstat/bindings/swig/python/.empty --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/xenstat/libxenstat/bindings/swig/python/.empty Thu Aug 25 22:53:20 2005 @@ -0,0 +1,1 @@ +This directory is empty; this file is included to prevent version control systems from removing the directory. diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstat/libxenstat/bindings/swig/xenstat.i --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/xenstat/libxenstat/bindings/swig/xenstat.i Thu Aug 25 22:53:20 2005 @@ -0,0 +1,8 @@ +%module xenstat_swig +%{ +/* Includes the header in the wrapper code */ +#include "xenstat.h" +%} + +/* Parse the header file to generate wrappers */ +%include "xenstat.h" diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstat/libxenstat/src/xen-interface.c --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/xenstat/libxenstat/src/xen-interface.c Thu Aug 25 22:53:20 2005 @@ -0,0 +1,204 @@ +/* xen-interface.c + * + * Copyright (C) International Business Machines Corp., 2005 + * Authors: Josh Triplett <josht@xxxxxxxxxx> + * Judy Fischbach <jfisch@xxxxxxxxxx> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ */
+
+#include "xen-interface.h"
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "version.h"
+#include "privcmd.h"
+#include "xen.h"
+
+/* Handle returned by xi_init(): wraps the open privcmd file descriptor. */
+struct xi_handle {
+	int fd;
+};
+
+/* Initialize for xen-interface.  Returns a handle to be used with subsequent
+ * calls to the xen-interface functions or NULL if an error occurs. */
+xi_handle *xi_init(void)
+{
+	xi_handle *handle;
+
+	handle = (xi_handle *)calloc(1, sizeof(xi_handle));
+	if (handle == NULL)
+		return NULL;
+
+	handle->fd = open("/proc/xen/privcmd", O_RDWR);
+	if (handle->fd < 0) {
+		perror("Couldn't open /proc/xen/privcmd");
+		free(handle);
+		return NULL;
+	}
+
+	return handle;
+}
+
+/* Release a handle obtained from xi_init(): closes the privcmd descriptor
+ * and frees the handle.  A NULL handle is ignored. */
+void xi_uninit(xi_handle *handle)
+{
+	if (handle == NULL)
+		return;
+
+	close (handle->fd);
+	free (handle);
+}
+
+/* Issue one multicall holding two xen_version hypercalls: XENVER_version,
+ * whose result is stored in *vnum, and XENVER_extraversion, which is passed
+ * ver as its buffer argument.
+ *
+ * Returns 0 on success and -1 on failure; *vnum is written only on success.
+ *
+ * NOTE(review): ver itself is not mlock()ed here, unlike the other buffers
+ * handed to the hypervisor -- confirm whether it should be. */
+static int xi_make_xen_version_hypercall(xi_handle *handle, long *vnum, xen_extraversion_t *ver)
+{
+	privcmd_hypercall_t privcmd;
+	multicall_entry_t multicall[2];
+	int ret = 0;
+
+	/* set up for doing hypercall */
+	privcmd.op = __HYPERVISOR_multicall;
+	privcmd.arg[0] = (unsigned long)multicall;
+	privcmd.arg[1] = 2;
+
+	/* first one to get xen version number */
+	multicall[0].op = __HYPERVISOR_xen_version;
+	multicall[0].args[0] = (unsigned long)XENVER_version;
+
+	/* second to get xen version flag */
+	multicall[1].op = __HYPERVISOR_xen_version;
+	multicall[1].args[0] = (unsigned long)XENVER_extraversion;
+	multicall[1].args[1] = (unsigned long)ver;
+
+	if (mlock( &privcmd, sizeof(privcmd)) < 0) {
+		perror("Failed to mlock privcmd structure");
+		return -1;
+	}
+
+	/* Lock the whole two-entry array, not just its first element. */
+	if (mlock( multicall, sizeof(multicall)) < 0) {
+		perror("Failed to mlock multicall_entry structure");
+		/* Only privcmd is locked at this point. */
+		munlock( &privcmd, sizeof(privcmd));
+		return -1;
+	}
+
+	if (ioctl( handle->fd, IOCTL_PRIVCMD_HYPERCALL, &privcmd) < 0) {
+		perror("Hypercall failed");
+		ret = -1;
+	} else {
+		/* The result field is meaningful only if the call went through. */
+		*vnum = multicall[0].result;
+	}
+
+	munlock( &privcmd, sizeof(privcmd));
+	munlock( multicall, sizeof(multicall));
+
+	return ret;
+}
+
+/* Make Xen Dom0 op hypervisor call.
+ *
+ * Fills in dom_op->cmd (from dom_opcode) and dom_op->interface_version; the
+ * caller sets up the command-specific part of dom_op->u beforehand and reads
+ * the results from it afterwards.  Returns 0 on success, -1 on failure. */
+static int xi_make_dom0_op(xi_handle *handle, dom0_op_t *dom_op, int dom_opcode)
+{
+	privcmd_hypercall_t privcmd;
+	int ret = 0;
+
+	/* set up for doing hypercall */
+	privcmd.op = __HYPERVISOR_dom0_op;
+	privcmd.arg[0] = (unsigned long)dom_op;
+	dom_op->cmd = dom_opcode;
+	dom_op->interface_version = DOM0_INTERFACE_VERSION;
+
+	if (mlock( &privcmd, sizeof(privcmd_hypercall_t)) < 0) {
+		perror("Failed to mlock privcmd structure");
+		return -1;
+	}
+
+	if (mlock( dom_op, sizeof(dom0_op_t)) < 0) {
+		perror("Failed to mlock dom0_op structure");
+		munlock( &privcmd, sizeof(privcmd_hypercall_t));
+		return -1;
+	}
+
+	if (ioctl( handle->fd, IOCTL_PRIVCMD_HYPERCALL, &privcmd) < 0) {
+		perror("Hypercall failed");
+		ret = -1;
+	}
+
+	munlock( &privcmd, sizeof(privcmd_hypercall_t));
+	munlock( dom_op, sizeof(dom0_op_t));
+
+	return ret;
+}
+
+/* Obtain physinfo data from dom0.  On success copies the DOM0_PHYSINFO
+ * result into *physinfo and returns 0; returns -1 on failure. */
+int xi_get_physinfo(xi_handle *handle, dom0_physinfo_t *physinfo)
+{
+	dom0_op_t op;
+
+	if (xi_make_dom0_op(handle, &op, DOM0_PHYSINFO) < 0) {
+		perror("DOM0_PHYSINFO Hypercall failed");
+		return -1;
+	}
+
+	*physinfo = op.u.physinfo;
+	return 0;
+}
+
+/* Obtain domain data from dom0.
+ *
+ * info must have room for max_domains entries; first_domain and max_domains
+ * are passed through to DOM0_GETDOMAININFOLIST.  Returns the num_domains
+ * value reported by the call, or -1 on failure. */
+int xi_get_domaininfolist(xi_handle *handle, dom0_getdomaininfo_t *info,
+                          unsigned int first_domain, unsigned int max_domains)
+{
+	dom0_op_t op;
+	size_t info_len = max_domains * sizeof(dom0_getdomaininfo_t);
+	int ret;
+
+	op.u.getdomaininfolist.first_domain = first_domain;
+	op.u.getdomaininfolist.max_domains = max_domains;
+	op.u.getdomaininfolist.buffer = info;
+
+	if (mlock( info, info_len) < 0) {
+		perror("Failed to mlock domaininfo array");
+		return -1;
+	}
+
+	if (xi_make_dom0_op(handle, &op, DOM0_GETDOMAININFOLIST) < 0) {
+		perror("DOM0_GETDOMAININFOLIST Hypercall failed");
+		ret = -1;
+	} else {
+		ret = op.u.getdomaininfolist.num_domains;
+	}
+
+	/* Undo the mlock() above on both the success and the failure path. */
+	munlock( info, info_len);
+
+	return ret;
+}
+
+/* Returns cpu usage data from dom0: the cpu_time reported by
+ * DOM0_GETVCPUCONTEXT for the given domain and vcpu, or -1 on failure.
+ * No context buffer is supplied (ctxt is NULL). */
+long long xi_get_vcpu_usage(xi_handle *handle, unsigned int domain,
+                            unsigned int vcpu)
+{
+	dom0_op_t op;
+	op.u.getvcpucontext.domain = domain;
+	op.u.getvcpucontext.vcpu = vcpu;
+	op.u.getvcpucontext.ctxt = NULL;
+
+	if (xi_make_dom0_op(handle, &op, DOM0_GETVCPUCONTEXT) < 0) {
+		perror("DOM0_GETVCPUCONTEXT Hypercall failed");
+		return -1;
+	}
+
+	return op.u.getvcpucontext.cpu_time;
+}
+
+/* gets xen version information from hypervisor: the version number in *vnum
+ * and the extraversion string in *ver.  Returns 0 on success, -1 on
+ * failure. */
+int xi_get_xen_version(xi_handle *handle, long *vnum, xen_extraversion_t *ver)
+{
+
+	/* gets the XENVER_version and XENVER_extraversion */
+	if (xi_make_xen_version_hypercall( handle, vnum, ver) < 0) {
+		perror("XEN VERSION Hypercall failed");
+		return -1;
+	}
+
+	return 0;
+}
diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstat/libxenstat/src/xen-interface.h
--- /dev/null Wed Aug 24 02:43:18 2005
+++ b/tools/xenstat/libxenstat/src/xen-interface.h Thu Aug 25 22:53:20 2005
@@ -0,0 +1,53 @@
+/* xen-interface.h
+ *
+ * Copyright (C) International Business Machines Corp., 2005
+ * Authors: Josh Triplett <josht@xxxxxxxxxx>
+ *          Judy Fischbach <jfisch@xxxxxxxxxx>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ */
+
+/* Include guard: this header defines typedefs, which must not be seen
+ * twice in one translation unit. */
+#ifndef XEN_INTERFACE_H
+#define XEN_INTERFACE_H
+
+#include <stdint.h>
+
+/* Short fixed-width integer aliases, defined before the Xen headers below
+ * are included (presumably those headers use them -- TODO confirm). */
+typedef int8_t   s8;
+typedef int16_t  s16;
+typedef int32_t  s32;
+typedef int64_t  s64;
+typedef uint8_t  u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint64_t u64;
+
+#include "dom0_ops.h"
+#include "version.h"
+
+/* Opaque handles */
+typedef struct xi_handle xi_handle;
+
+/* Initialize for xen-interface.  Returns a handle to be used with subsequent
+ * calls to the xen-interface functions or NULL if an error occurs. */
+xi_handle *xi_init(void);
+
+/* Release a handle obtained from xi_init(), free resources, etc. */
+void xi_uninit(xi_handle *handle);
+
+/* Obtain xen version information from hypervisor */
+int xi_get_xen_version(xi_handle *, long *vnum, xen_extraversion_t *ver);
+
+/* Obtain physinfo data from dom0 */
+int xi_get_physinfo(xi_handle *, dom0_physinfo_t *);
+
+/* Obtain domain data from dom0 */
+int xi_get_domaininfolist(xi_handle *, dom0_getdomaininfo_t *, unsigned int,
+                          unsigned int);
+
+/* Returns cpu usage data from dom0 */
+long long xi_get_vcpu_usage(xi_handle *, unsigned int, unsigned int);
+
+#endif /* XEN_INTERFACE_H */
diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstat/libxenstat/src/xenstat.c
--- /dev/null Wed Aug 24 02:43:18 2005
+++ b/tools/xenstat/libxenstat/src/xenstat.c Thu Aug 25 22:53:20 2005
@@ -0,0 +1,640 @@
+/* libxenstat: statistics-collection library for Xen
+ * Copyright (C) International Business Machines Corp., 2005
+ * Authors: Josh Triplett <josht@xxxxxxxxxx>
+ *          Judy Fischbach <jfisch@xxxxxxxxxx>
+ *          David Hendricks <dhendrix@xxxxxxxxxx>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU + * Lesser General Public License for more details. + */ + +#include <limits.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <xen-interface.h> +#include "xenstat.h" +#include "version.h" + +/* + * Types + */ +struct xenstat_handle { + xi_handle *xihandle; + int page_size; + FILE *procnetdev; +}; + +#define SHORT_ASC_LEN 5 /* length of 65535 */ +#define VERSION_SIZE (2 * SHORT_ASC_LEN + 1 + sizeof(xen_extraversion_t) + 1) + +struct xenstat_node { + unsigned int flags; + unsigned long long cpu_hz; + unsigned int num_cpus; + unsigned long long tot_mem; + unsigned long long free_mem; + unsigned int num_domains; + char xen_version[VERSION_SIZE]; /* xen version running on this node */ + xenstat_domain *domains; /* Array of length num_domains */ +}; + +struct xenstat_domain { + unsigned int id; + unsigned int state; + unsigned long long cpu_ns; + unsigned int num_vcpus; + xenstat_vcpu *vcpus; /* Array of length num_vcpus */ + unsigned long long cur_mem; /* Current memory reservation */ + unsigned long long max_mem; /* Total memory allowed */ + unsigned int ssid; + unsigned int num_networks; + xenstat_network *networks; /* Array of length num_networks */ +}; + +struct xenstat_vcpu { + unsigned long long ns; +}; + +struct xenstat_network { + unsigned int id; + /* Received */ + unsigned long long rbytes; + unsigned long long rpackets; + unsigned long long rerrs; + unsigned long long rdrop; + /* Transmitted */ + unsigned long long tbytes; + unsigned long long tpackets; + unsigned long long terrs; + unsigned long long tdrop; +}; + +/* + * Data-collection types + */ +/* Called to collect the information for the node and all the domains on + * it. When called, the domain information has already been collected. */ +typedef int (*xenstat_collect_func)(xenstat_handle * handle, + xenstat_node * node); +/* Called to free the information collected by the collect function. 
The free + * function will only be called on a xenstat_node if that node includes + * information collected by the corresponding collector. */ +typedef void (*xenstat_free_func)(xenstat_node * node); +/* Called to free any information stored in the handle. Note the lack of a + * matching init function; the collect functions should initialize on first + * use. Also, the uninit function must handle the case that the collector has + * never been initialized. */ +typedef void (*xenstat_uninit_func)(xenstat_handle * handle); +typedef struct xenstat_collector { + unsigned int flag; + xenstat_collect_func collect; + xenstat_free_func free; + xenstat_uninit_func uninit; +} xenstat_collector; + +static int xenstat_collect_vcpus(xenstat_handle * handle, + xenstat_node * node); +static int xenstat_collect_networks(xenstat_handle * handle, + xenstat_node * node); +static void xenstat_free_vcpus(xenstat_node * node); +static void xenstat_free_networks(xenstat_node * node); +static void xenstat_uninit_vcpus(xenstat_handle * handle); +static void xenstat_uninit_networks(xenstat_handle * handle); + +static xenstat_collector collectors[] = { + { XENSTAT_VCPU, xenstat_collect_vcpus, + xenstat_free_vcpus, xenstat_uninit_vcpus }, + { XENSTAT_NETWORK, xenstat_collect_networks, + xenstat_free_networks, xenstat_uninit_networks } +}; + +#define NUM_COLLECTORS (sizeof(collectors)/sizeof(xenstat_collector)) + +/* + * libxenstat API + */ +xenstat_handle *xenstat_init() +{ + xenstat_handle *handle; + + handle = (xenstat_handle *) calloc(1, sizeof(xenstat_handle)); + if (handle == NULL) + return NULL; + +#if defined(PAGESIZE) + handle->page_size = PAGESIZE; +#elif defined(PAGE_SIZE) + handle->page_size = PAGE_SIZE; +#else + handle->page_size = sysconf(_SC_PAGE_SIZE); + if (handle->page_size < 0) { + perror("Failed to retrieve page size."); + free(handle); + return NULL; + } +#endif + + handle->xihandle = xi_init(); + if (handle->xihandle == NULL) { + perror("xi_init"); + free(handle); + return 
NULL;
+	}
+
+	return handle;
+}
+
+/* Release everything owned by the handle: let every collector drop its
+ * per-handle state, close the xi handle, then free the handle itself.
+ * A NULL handle is ignored. */
+void xenstat_uninit(xenstat_handle * handle)
+{
+	unsigned int i;
+	if (handle) {
+		for (i = 0; i < NUM_COLLECTORS; i++)
+			collectors[i].uninit(handle);
+		xi_uninit(handle->xihandle);
+		free(handle);
+	}
+}
+
+/* Take a snapshot of the node: physical information, the Xen version string
+ * and one xenstat_domain per entry returned by xi_get_domaininfolist(), then
+ * run every optional collector selected in flags.
+ *
+ * Returns a node that must be released with xenstat_free_node(), or NULL if
+ * an allocation, an xi_* query or a requested collector fails. */
+xenstat_node *xenstat_get_node(xenstat_handle * handle, unsigned int flags)
+{
+#define DOMAIN_CHUNK_SIZE 256
+	xenstat_node *node;
+	dom0_physinfo_t physinfo;
+	xen_extraversion_t version;
+	long vnum = 0;
+	dom0_getdomaininfo_t domaininfo[DOMAIN_CHUNK_SIZE];
+	unsigned int num_domains, new_domains;
+	unsigned int i;
+
+	/* Create the node */
+	node = (xenstat_node *) calloc(1, sizeof(xenstat_node));
+	if (node == NULL)
+		return NULL;
+
+	/* Get information about the physical system */
+	if (xi_get_physinfo(handle->xihandle, &physinfo) < 0) {
+		free(node);
+		return NULL;
+	}
+
+	/* Get the xen version number and xen version tag */
+	if (xi_get_xen_version(handle->xihandle, &vnum, &version) < 0) {
+		free(node);
+		return NULL;
+	}
+	snprintf(node->xen_version, VERSION_SIZE,
+		 "%ld.%ld%s\n", ((vnum >> 16) & 0xFFFF), vnum & 0xFFFF, (char *)version);
+
+	node->cpu_hz = ((unsigned long long)physinfo.cpu_khz) * 1000ULL;
+	node->num_cpus =
+	    (physinfo.threads_per_core * physinfo.cores_per_socket *
+	     physinfo.sockets_per_node * physinfo.nr_nodes);
+	node->tot_mem = ((unsigned long long)physinfo.total_pages)
+	    * handle->page_size;
+	node->free_mem = ((unsigned long long)physinfo.free_pages)
+	    * handle->page_size;
+
+	/* malloc(0) is not portable, so allocate a single domain.  This will
+	 * be resized below. */
+	node->domains = malloc(sizeof(xenstat_domain));
+	if (node->domains == NULL) {
+		free(node);
+		return NULL;
+	}
+
+	num_domains = 0;
+	do {
+		xenstat_domain *domain;
+		xenstat_domain *resized;
+		unsigned int total;
+
+		/* NOTE(review): new_domains is unsigned; if
+		 * xi_get_domaininfolist() can report an error as a negative
+		 * value it would wrap here -- confirm against its
+		 * implementation. */
+		new_domains = xi_get_domaininfolist(handle->xihandle,
+						    domaininfo, num_domains,
+						    DOMAIN_CHUNK_SIZE);
+
+		/* Never ask realloc() for zero bytes, and keep the old buffer
+		 * reachable so that it can be freed if the resize fails. */
+		total = num_domains + new_domains;
+		resized = realloc(node->domains,
+				  (total ? total : 1)
+				  * sizeof(xenstat_domain));
+		if (resized == NULL) {
+			free(node->domains);
+			free(node);
+			return NULL;
+		}
+		node->domains = resized;
+
+		domain = node->domains + num_domains;
+
+		for (i = 0; i < new_domains; i++) {
+			/* Fill in domain using domaininfo[i] */
+			domain->id = domaininfo[i].domain;
+			domain->state = domaininfo[i].flags;
+			domain->cpu_ns = domaininfo[i].cpu_time;
+			domain->num_vcpus = domaininfo[i].n_vcpu;
+			domain->vcpus = NULL;
+			domain->cur_mem =
+			    ((unsigned long long)domaininfo[i].tot_pages)
+			    * handle->page_size;
+			/* Widen before multiplying, as for cur_mem, so the
+			 * product is not computed in a narrower type. */
+			domain->max_mem =
+			    domaininfo[i].max_pages == UINT_MAX
+			    ? (unsigned long long)-1
+			    : ((unsigned long long)domaininfo[i].max_pages)
+			      * handle->page_size;
+			domain->ssid = domaininfo[i].ssidref;
+			domain->num_networks = 0;
+			domain->networks = NULL;
+
+			domain++;
+		}
+		num_domains += new_domains;
+	} while (new_domains == DOMAIN_CHUNK_SIZE);
+	node->num_domains = num_domains;
+
+	/* Run all the extra data collectors requested */
+	node->flags = 0;
+	for (i = 0; i < NUM_COLLECTORS; i++) {
+		if ((flags & collectors[i].flag) == collectors[i].flag) {
+			/* Record the flag first so that xenstat_free_node()
+			 * also releases a partially collected result. */
+			node->flags |= collectors[i].flag;
+			if(collectors[i].collect(handle, node) == 0) {
+				xenstat_free_node(node);
+				return NULL;
+			}
+		}
+	}
+
+	return node;
+}
+
+/* Free a node returned by xenstat_get_node(), including the data of every
+ * collector recorded in node->flags.  A NULL node is ignored. */
+void xenstat_free_node(xenstat_node * node)
+{
+	unsigned int i;
+
+	if (node) {
+		if (node->domains) {
+			for (i = 0; i < NUM_COLLECTORS; i++)
+				if((node->flags & collectors[i].flag)
+				   == collectors[i].flag)
+					collectors[i].free(node);
+			free(node->domains);
+		}
+		free(node);
+	}
+}
+
+xenstat_domain *xenstat_node_domain(xenstat_node * node, unsigned int domid)
+{
+	unsigned int i;
+
+	/* FIXME: binary search */
+	/* Find the appropriate 
domain entry in the node struct. */
+	for (i = 0; i < node->num_domains; i++) {
+		if (node->domains[i].id == domid)
+			return &(node->domains[i]);
+	}
+	return NULL;
+}
+
+/* Get the domain at the given position in the node's domain array, or NULL
+ * if the index is out of range. */
+xenstat_domain *xenstat_node_domain_by_index(xenstat_node * node,
+					     unsigned int index)
+{
+	/* index is unsigned, so only the upper bound needs checking */
+	if (index < node->num_domains)
+		return &(node->domains[index]);
+	return NULL;
+}
+
+/* Get the xen version string of the node */
+const char *xenstat_node_xen_ver(xenstat_node * node)
+{
+	return node->xen_version;
+}
+
+/* Get the amount of total memory on the node */
+unsigned long long xenstat_node_tot_mem(xenstat_node * node)
+{
+	return node->tot_mem;
+}
+
+/* Get the amount of free memory on the node */
+unsigned long long xenstat_node_free_mem(xenstat_node * node)
+{
+	return node->free_mem;
+}
+
+/* Get the number of domains recorded in the node */
+unsigned int xenstat_node_num_domains(xenstat_node * node)
+{
+	return node->num_domains;
+}
+
+/* Get the number of CPUs recorded in the node */
+unsigned int xenstat_node_num_cpus(xenstat_node * node)
+{
+	return node->num_cpus;
+}
+
+/* Get information about the CPU speed */
+unsigned long long xenstat_node_cpu_hz(xenstat_node * node)
+{
+	return node->cpu_hz;
+}
+
+/* Get the domain ID for this domain */
+unsigned xenstat_domain_id(xenstat_domain * domain)
+{
+	return domain->id;
+}
+
+/* Get information about how much CPU time has been used */
+unsigned long long xenstat_domain_cpu_ns(xenstat_domain * domain)
+{
+	return domain->cpu_ns;
+}
+
+/* Find the number of VCPUs allocated to a domain */
+unsigned int xenstat_domain_num_vcpus(xenstat_domain * domain)
+{
+	return domain->num_vcpus;
+}
+
+/* Get the VCPU handle to obtain VCPU stats, or NULL if the index is out of
+ * range or no per-VCPU data was collected for this node. */
+xenstat_vcpu *xenstat_domain_vcpu(xenstat_domain * domain, unsigned int vcpu)
+{
+	/* num_vcpus is always filled in, but the vcpus array stays NULL
+	 * unless the VCPU collector ran; mirror the NULL check done in
+	 * xenstat_domain_network(). */
+	if (domain->vcpus && vcpu < domain->num_vcpus)
+		return &(domain->vcpus[vcpu]);
+	return NULL;
+}
+
+/* Find the current memory reservation for this domain */
+unsigned long long xenstat_domain_cur_mem(xenstat_domain * domain)
+{
+	return domain->cur_mem;
+}
+
+/* Find the maximum memory reservation for this domain */
+unsigned long long xenstat_domain_max_mem(xenstat_domain * domain)
+{
+	return domain->max_mem;
+}
+
+/* Find the domain's SSID */
+unsigned int xenstat_domain_ssid(xenstat_domain * domain)
+{
+	
return domain->ssid; +} + +/* Get domain states */ +unsigned int xenstat_domain_dying(xenstat_domain * domain) +{ + return (domain->state & DOMFLAGS_DYING) == DOMFLAGS_DYING; +} + +unsigned int xenstat_domain_crashed(xenstat_domain * domain) +{ + return ((domain->state & DOMFLAGS_SHUTDOWN) == DOMFLAGS_SHUTDOWN) + && (((domain->state >> DOMFLAGS_SHUTDOWNSHIFT) + & DOMFLAGS_SHUTDOWNMASK) == SHUTDOWN_crash); +} + +unsigned int xenstat_domain_shutdown(xenstat_domain * domain) +{ + return ((domain->state & DOMFLAGS_SHUTDOWN) == DOMFLAGS_SHUTDOWN) + && (((domain->state >> DOMFLAGS_SHUTDOWNSHIFT) + & DOMFLAGS_SHUTDOWNMASK) != SHUTDOWN_crash); +} + +unsigned int xenstat_domain_paused(xenstat_domain * domain) +{ + return (domain->state & DOMFLAGS_PAUSED) == DOMFLAGS_PAUSED; +} + +unsigned int xenstat_domain_blocked(xenstat_domain * domain) +{ + return (domain->state & DOMFLAGS_BLOCKED) == DOMFLAGS_BLOCKED; +} + +unsigned int xenstat_domain_running(xenstat_domain * domain) +{ + return (domain->state & DOMFLAGS_RUNNING) == DOMFLAGS_RUNNING; +} + +/* Get the number of networks for a given domain */ +unsigned int xenstat_domain_num_networks(xenstat_domain * domain) +{ + return domain->num_networks; +} + +/* Get the network handle to obtain network stats */ +xenstat_network *xenstat_domain_network(xenstat_domain * domain, + unsigned int network) +{ + if (domain->networks && 0 <= network && network < domain->num_networks) + return &(domain->networks[network]); + return NULL; +} + +/* + * VCPU functions + */ +/* Collect information about VCPUs */ +static int xenstat_collect_vcpus(xenstat_handle * handle, xenstat_node * node) +{ + unsigned int i, vcpu; + /* Fill in VCPU information */ + for (i = 0; i < node->num_domains; i++) { + node->domains[i].vcpus = malloc(node->domains[i].num_vcpus + * sizeof(xenstat_vcpu)); + if (node->domains[i].vcpus == NULL) + return 0; + + for (vcpu = 0; vcpu < node->domains[i].num_vcpus; vcpu++) { + /* FIXME: need to be using a more efficient 
mechanism*/
+			long long vcpu_time;
+			vcpu_time =
+			    xi_get_vcpu_usage(handle->xihandle,
+					      node->domains[i].id,
+					      vcpu);
+			if (vcpu_time < 0)
+				return 0;
+			node->domains[i].vcpus[vcpu].ns = vcpu_time;
+		}
+	}
+	return 1;
+}
+
+/* Free VCPU information */
+static void xenstat_free_vcpus(xenstat_node * node)
+{
+	unsigned int i;
+	for (i = 0; i < node->num_domains; i++)
+		free(node->domains[i].vcpus);
+}
+
+/* Free VCPU information in handle - nothing to do */
+static void xenstat_uninit_vcpus(xenstat_handle * handle)
+{
+}
+
+/* Get VCPU usage */
+unsigned long long xenstat_vcpu_ns(xenstat_vcpu * vcpu)
+{
+	return vcpu->ns;
+}
+
+/*
+ * Network functions
+ */
+
+/* Expected format of /proc/net/dev */
+static const char PROCNETDEV_HEADER[] =
+    "Inter-| Receive |"
+    " Transmit\n"
+    " face |bytes packets errs drop fifo frame compressed multicast|"
+    "bytes packets errs drop fifo colls carrier compressed\n";
+
+/* Collect information about networks by parsing the vif<domid>.<id> lines of
+ * /proc/net/dev and appending one xenstat_network per line to the matching
+ * domain.
+ *
+ * Returns 1 on success and 0 on failure, like xenstat_collect_vcpus():
+ * xenstat_get_node() treats 0 as an error and frees the node, which also
+ * releases any networks collected so far. */
+static int xenstat_collect_networks(xenstat_handle * handle,
+				    xenstat_node * node)
+{
+	/* Open and validate /proc/net/dev if we haven't already */
+	if (handle->procnetdev == NULL) {
+		char header[sizeof(PROCNETDEV_HEADER)];
+		int header_ok = 0;
+
+		handle->procnetdev = fopen("/proc/net/dev", "r");
+		if (handle->procnetdev == NULL) {
+			perror("Error opening /proc/net/dev");
+			return 0;
+		}
+
+		/* Validate the format of /proc/net/dev */
+		if (fread(header, sizeof(PROCNETDEV_HEADER) - 1, 1,
+			  handle->procnetdev) != 1) {
+			perror("Error reading /proc/net/dev header");
+		} else {
+			header[sizeof(PROCNETDEV_HEADER) - 1] = '\0';
+			if (strcmp(header, PROCNETDEV_HEADER) != 0)
+				fprintf(stderr,
+					"Unexpected /proc/net/dev format\n");
+			else
+				header_ok = 1;
+		}
+
+		/* Do not keep an unvalidated stream around: a non-NULL
+		 * procnetdev makes every later call skip the check above. */
+		if (!header_ok) {
+			fclose(handle->procnetdev);
+			handle->procnetdev = NULL;
+			return 0;
+		}
+	}
+
+	/* Fill in networks */
+	/* FIXME: optimize this */
+	fseek(handle->procnetdev, sizeof(PROCNETDEV_HEADER) - 1, SEEK_SET);
+	while (1) {
+		xenstat_domain *domain;
+		xenstat_network net;
+		xenstat_network *grown;
+		unsigned int domid;
+		int ret = fscanf(handle->procnetdev,
+				 "vif%u.%u:%llu%llu%llu%llu%*u%*u%*u%*u"
+				 "%llu%llu%llu%llu%*u%*u%*u%*u\n",
+				 &domid, &net.id,
+				 &net.tbytes, &net.tpackets, &net.terrs,
+				 &net.tdrop,
+				 &net.rbytes, &net.rpackets, &net.rerrs,
+				 &net.rdrop);
+		if (ret == EOF)
+			break;
+		if (ret != 10) {
+			/* Not a complete vif line: skip the rest of it.
+			 * fgetc() returns an int so that EOF stays distinct
+			 * from every character value. */
+			int c;
+			do {
+				c = fgetc(handle->procnetdev);
+			} while (c != '\n' && c != EOF);
+			if (c == EOF)
+				break;
+			continue;
+		}
+
+		/* FIXME: this does a search for the domid */
+		domain = xenstat_node_domain(node, domid);
+		if (domain == NULL) {
+			fprintf(stderr,
+				"Found interface vif%u.%u but domain %u"
+				" does not exist.\n", domid, net.id,
+				domid);
+			continue;
+		}
+
+		/* Grow the array by one entry.  realloc(NULL, n) behaves like
+		 * malloc(n), and going through a temporary keeps the existing
+		 * array (and num_networks) intact if the allocation fails. */
+		grown = realloc(domain->networks,
+				(domain->num_networks + 1)
+				* sizeof(xenstat_network));
+		if (grown == NULL)
+			return 0;
+		domain->networks = grown;
+		domain->networks[domain->num_networks++] = net;
+	}
+
+	return 1;
+}
+
+/* Free network information */
+static void xenstat_free_networks(xenstat_node * node)
+{
+	unsigned int i;
+	for (i = 0; i < node->num_domains; i++)
+		free(node->domains[i].networks);
+}
+
+/* Free network information in handle */
+static void xenstat_uninit_networks(xenstat_handle * handle)
+{
+	if(handle->procnetdev)
+		fclose(handle->procnetdev);
+}
+
+/* Get the network ID */
+unsigned int xenstat_network_id(xenstat_network * network)
+{
+	return network->id;
+}
+
+/* Get the number of receive bytes */
+unsigned long long xenstat_network_rbytes(xenstat_network * network)
+{
+	return network->rbytes;
+}
+
+/* Get the number of receive packets */
+unsigned long long xenstat_network_rpackets(xenstat_network * network)
+{
+	return network->rpackets;
+}
+
+/* Get the number of receive errors */
+unsigned long long xenstat_network_rerrs(xenstat_network * network)
+{
+	return network->rerrs;
+}
+
+/* Get the number of receive drops */
+unsigned long long xenstat_network_rdrop(xenstat_network * network)
+{
+	return 
network->rdrop; +} + +/* Get the number of transmit bytes */ +unsigned long long xenstat_network_tbytes(xenstat_network * network) +{ + return network->tbytes; +} + +/* Get the number of transmit packets */ +unsigned long long xenstat_network_tpackets(xenstat_network * network) +{ + return network->tpackets; +} + +/* Get the number of transmit errors */ +unsigned long long xenstat_network_terrs(xenstat_network * network) +{ + return network->terrs; +} + +/* Get the number of transmit dropped packets */ +unsigned long long xenstat_network_tdrop(xenstat_network * network) +{ + return network->tdrop; +} diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstat/libxenstat/src/xenstat.h --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/xenstat/libxenstat/src/xenstat.h Thu Aug 25 22:53:20 2005 @@ -0,0 +1,150 @@ +/* libxenstat: statistics-collection library for Xen + * Copyright (C) International Business Machines Corp., 2005 + * Authors: Josh Triplett <josht@xxxxxxxxxx> + * Judy Fischbach <jfisch@xxxxxxxxxx> + * David Hendricks <dhendrix@xxxxxxxxxx> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + */ + +/* libxenstat API */ + +/* Opaque handles */ +typedef struct xenstat_handle xenstat_handle; +typedef struct xenstat_domain xenstat_domain; +typedef struct xenstat_node xenstat_node; +typedef struct xenstat_vcpu xenstat_vcpu; +typedef struct xenstat_network xenstat_network; + +/* Initialize the xenstat library. 
Returns a handle to be used with + * subsequent calls to the xenstat library, or NULL if an error occurs. */ +xenstat_handle *xenstat_init(); + +/* Release the handle to libxc, free resources, etc. */ +void xenstat_uninit(xenstat_handle * handle); + +/* Get all available information about a node */ +#define XENSTAT_VCPU 0x1 +#define XENSTAT_NETWORK 0x2 +#define XENSTAT_ALL (XENSTAT_VCPU|XENSTAT_NETWORK) +xenstat_node *xenstat_get_node(xenstat_handle * handle, unsigned int flags); + +/* Free the information */ +void xenstat_free_node(xenstat_node * node); + +/* + * Node functions - extract information from a xenstat_node + */ + +/* Get information about the domain with the given domain ID */ +xenstat_domain *xenstat_node_domain(xenstat_node * node, + unsigned int domid); + +/* Get the domain with the given index; used to loop over all domains. */ +xenstat_domain *xenstat_node_domain_by_index(xenstat_node * node, + unsigned index); +/* Get xen version of the node */ +const char *xenstat_node_xen_ver(xenstat_node * node); + +/* Get amount of total memory on a node */ +unsigned long long xenstat_node_tot_mem(xenstat_node * node); + +/* Get amount of free memory on a node */ +unsigned long long xenstat_node_free_mem(xenstat_node * node); + +/* Find the number of domains existing on a node */ +unsigned int xenstat_node_num_domains(xenstat_node * node); + +/* Find the number of CPUs existing on a node */ +unsigned int xenstat_node_num_cpus(xenstat_node * node); + +/* Get information about the CPU speed */ +unsigned long long xenstat_node_cpu_hz(xenstat_node * node); + +/* + * Domain functions - extract information from a xenstat_domain + */ + +/* Get the domain ID for this domain */ +unsigned xenstat_domain_id(xenstat_domain * domain); + +/* Get information about how much CPU time has been used */ +unsigned long long xenstat_domain_cpu_ns(xenstat_domain * domain); + +/* Find the number of VCPUs allocated to a domain */ +unsigned int xenstat_domain_num_vcpus(xenstat_domain 
* domain); + +/* Get the VCPU handle to obtain VCPU stats */ +xenstat_vcpu *xenstat_domain_vcpu(xenstat_domain * domain, + unsigned int vcpu); + +/* Find the current memory reservation for this domain */ +unsigned long long xenstat_domain_cur_mem(xenstat_domain * domain); + +/* Find the maximum memory reservation for this domain */ +unsigned long long xenstat_domain_max_mem(xenstat_domain * domain); + +/* Find the domain's SSID */ +unsigned int xenstat_domain_ssid(xenstat_domain * domain); + +/* Get domain states */ +unsigned int xenstat_domain_dying(xenstat_domain * domain); +unsigned int xenstat_domain_crashed(xenstat_domain * domain); +unsigned int xenstat_domain_shutdown(xenstat_domain * domain); +unsigned int xenstat_domain_paused(xenstat_domain * domain); +unsigned int xenstat_domain_blocked(xenstat_domain * domain); +unsigned int xenstat_domain_running(xenstat_domain * domain); + +/* Get the number of networks for a given domain */ +unsigned int xenstat_domain_num_networks(xenstat_domain *); + +/* Get the network handle to obtain network stats */ +xenstat_network *xenstat_domain_network(xenstat_domain * domain, + unsigned int network); + +/* + * VCPU functions - extract information from a xenstat_vcpu + */ + +/* Get VCPU usage */ +unsigned long long xenstat_vcpu_ns(xenstat_vcpu * vcpu); + + +/* + * Network functions - extract information from a xenstat_network + */ + +/* Get the ID for this network */ +unsigned int xenstat_network_id(xenstat_network * network); + +/* Get the number of receive bytes for this network */ +unsigned long long xenstat_network_rbytes(xenstat_network * network); + +/* Get the number of receive packets for this network */ +unsigned long long xenstat_network_rpackets(xenstat_network * network); + +/* Get the number of receive errors for this network */ +unsigned long long xenstat_network_rerrs(xenstat_network * network); + +/* Get the number of receive drops for this network */ +unsigned long long xenstat_network_rdrop(xenstat_network 
* network); + +/* Get the number of transmit bytes for this network */ +unsigned long long xenstat_network_tbytes(xenstat_network * network); + +/* Get the number of transmit packets for this network */ +unsigned long long xenstat_network_tpackets(xenstat_network * network); + +/* Get the number of transmit errors for this network */ +unsigned long long xenstat_network_terrs(xenstat_network * network); + +/* Get the number of transmit drops for this network */ +unsigned long long xenstat_network_tdrop(xenstat_network * network); diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstat/xentop/Makefile --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/xenstat/xentop/Makefile Thu Aug 25 22:53:20 2005 @@ -0,0 +1,44 @@ +# Copyright (C) International Business Machines Corp., 2005 +# Author: Josh Triplett <josht@xxxxxxxxxx> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; under version 2 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +XEN_ROOT=../../.. 
+include $(XEN_ROOT)/tools/Rules.mk
+
+# These targets are commands, not files: without this a stray file named
+# "all", "install" or "clean" would make the target look up to date.
+.PHONY: all install clean
+
+# When xentop is not enabled, all/install/xentop are deliberately empty rules.
+ifneq ($(XENSTAT_XENTOP),y)
+all install xentop:
+else
+
+INSTALL = install
+INSTALL_PROG = $(INSTALL) -m0755 -D
+INSTALL_DATA = $(INSTALL) -m0644 -D
+
+prefix=/usr
+mandir=$(prefix)/share/man
+man1dir=$(mandir)/man1
+sbindir=$(prefix)/sbin
+
+CFLAGS += -DGCC_PRINTF -Wall -Werror -I$(XEN_LIBXENSTAT)
+LDFLAGS += -L$(XEN_LIBXENSTAT)
+LDLIBS += -lxenstat -lncurses
+
+all: xentop
+
+xentop: xentop.o
+
+install: xentop xentop.1
+	$(INSTALL_PROG) xentop $(DESTDIR)$(sbindir)/xentop
+	$(INSTALL_DATA) xentop.1 $(DESTDIR)$(man1dir)/xentop.1
+
+endif
+
+clean:
+	rm -f xentop xentop.o
diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstat/xentop/TODO
--- /dev/null Wed Aug 24 02:43:18 2005
+++ b/tools/xenstat/xentop/TODO Thu Aug 25 22:53:20 2005
@@ -0,0 +1,34 @@
+Display error messages on the help line after bad input at a prompt.
+Fractional delay times
+Use prompting to search for domains
+Better line editing?
+
+* Make CPU in % more accurate
+* Domain total network TX % and RX %
+
+Like Top, f feature, field select of domain columns, toggle the display of
+field by typing the letter associated with field, if displayed it shows in
+bold and the letter is Capitalized along with a leading asterisk for the
+field, if not selected for display letter is lowercase, no leading asterisk
+and field is not bolded.
+
+Like Top, ordering of domain columns, o feature Capital letter shifts left,
+lowercase letter shifts right?
+
+Color
+Full management: pause, destroy, create domains
+
+Add support for Virtual Block Devices (vbd)
+
+To think about:
+Support for more than one node display (distributed monitoring
+from any node of all other nodes in a cluster)
+Bottom line option (Switch node, Search node [tab completion?])
+
+Capture/Logging of resource information generated during a time interval. 
+-b batch mode dump snapshots to standard output (used with -n) +-n number of iterations to dump to standard output (unlimited if not specified) +-d monitor DomIDs as -dD1,-dD2 or -dD1,D2... + Monitor only domains with specified domain IDs +-m monitor nodeIDs as -mN1,-mN2 or -mN1,N2... + Monitor only domains with specified node IDs diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstat/xentop/xentop.1 --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/xenstat/xentop/xentop.1 Thu Aug 25 22:53:20 2005 @@ -0,0 +1,88 @@ +.\" Copyright (C) International Business Machines Corp., 2005 +.\" Author: Josh Triplett <josht@xxxxxxxxxx> +.\" +.\" This program is free software; you can redistribute it and/or modify +.\" it under the terms of the GNU General Public License as published by +.\" the Free Software Foundation; under version 2 of the License. +.\" +.\" This program is distributed in the hope that it will be useful, +.\" but WITHOUT ANY WARRANTY; without even the implied warranty of +.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +.\" GNU General Public License for more details. +.\" +.\" You should have received a copy of the GNU General Public License +.\" along with this program; if not, write to the Free Software +.\" Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +.TH xentop 1 "August 2005" +.SH NAME +\fBxentop\fR \- displays real-time information about a Xen system and domains + +.SH SYNOPSIS +.B xentop +[\fB\-h\fR] +[\fB\-V\fR] +[\fB\-d\fRSECONDS] +[\fB\-n\fR] +[\fB\-r\fR] +[\fB\-v\fR] + +.SH DESCRIPTION +\fBxentop\fR displays information about the Xen system and domains, in a +continually-updating manner. Command-line options and interactive commands +can change the detail and format of the information displayed by \fBxentop\fR. 
+ +.SH OPTIONS +.TP +\fB\-h\fR, \fB\-\-help\fR +display help and exit +.TP +\fB\-V\fR, \fB\-\-version\fR +output version information and exit +.TP +\fB\-d\fR, \fB\-\-delay\fR=\fISECONDS\fR +seconds between updates (default 3) +.TP +\fB\-n\fR, \fB\-\-networks\fR +output network information +.TP +\fB\-r\fR, \fB\-\-repeat\-header\fR +repeat table header before each domain +.TP +\fB\-v\fR, \fB\-\-vcpus\fR +output VCPU data + +.SH "INTERACTIVE COMMANDS" +All interactive commands are case-insensitive. +.TP +.B D +set delay between updates +.TP +.B N +toggle display of network information +.TP +.B Q, Esc +quit +.TP +.B R +toggle table header before each domain +.TP +.B S +cycle sort order +.TP +.B V +toggle display of VCPU information +.TP +.B Arrows +scroll domain display + +.SH AUTHORS +Written by Judy Fischbach, David Hendricks, and Josh Triplett + +.SH "REPORTING BUGS" +Report bugs to <dsteklof@xxxxxxxxxx>. + +.SH COPYRIGHT +Copyright \(co 2005 International Business Machines Corp +.br +This is free software; see the source for copying conditions. There is NO +warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstat/xentop/xentop.c --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/xenstat/xentop/xentop.c Thu Aug 25 22:53:20 2005 @@ -0,0 +1,876 @@ +/* + * Copyright (C) International Business Machines Corp., 2005 + * Author(s): Judy Fischbach <jfisch@xxxxxxxxxx> + * David Hendricks <dhendrix@xxxxxxxxxx> + * Josh Triplett <josht@xxxxxxxxxx> + * based on code from Anthony Liguori <aliguori@xxxxxxxxxx> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; under version 2 of the License. 
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/* Feature-test macros only take effect when they are defined before the
+ * first system header is included.  This one used to sit directly in front
+ * of <getopt.h>, after all the other headers. */
+#define _GNU_SOURCE
+
+#include <curses.h>
+#include <ctype.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <xenstat.h>
+
+#define XENTOP_VERSION "1.0"
+
+#define XENTOP_DISCLAIMER \
+"Copyright (C) 2005 International Business Machines Corp\n"\
+"This is free software; see the source for copying conditions. There is NO\n"\
+"warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n"
+#define XENTOP_BUGSTO "Report bugs to <dsteklof@xxxxxxxxxx>.\n"
+
+#include <getopt.h>
+
+/* Compilers other than GCC do not understand __attribute__; drop it there. */
+#if !defined(__GNUC__) && !defined(__GNUG__)
+#define __attribute__(arg) /* empty */
+#endif
+
+#define KEY_ESCAPE '\x1B'
+
+/*
+ * Function prototypes
+ */
+/* Utility functions */
+static void usage(const char *);
+static void version(void);
+static void cleanup(void);
+static void fail(const char *);
+static int current_row(void);
+static int lines(void);
+static void print(const char *, ...) 
__attribute__((format(printf,1,2))); +static void attr_addstr(int attr, const char *str); +static void set_delay(char *value); +static void set_prompt(char *new_prompt, void (*func)(char *)); +static int handle_key(int); +static int compare(unsigned long long, unsigned long long); +static int compare_domains(xenstat_domain **, xenstat_domain **); +static unsigned long long tot_net_bytes( xenstat_domain *, int); + +/* Field functions */ +static int compare_domid(xenstat_domain *domain1, xenstat_domain *domain2); +static void print_domid(xenstat_domain *domain); +static int compare_state(xenstat_domain *domain1, xenstat_domain *domain2); +static void print_state(xenstat_domain *domain); +static int compare_cpu(xenstat_domain *domain1, xenstat_domain *domain2); +static void print_cpu(xenstat_domain *domain); +static int compare_cpu_pct(xenstat_domain *domain1, xenstat_domain *domain2); +static void print_cpu_pct(xenstat_domain *domain); +static int compare_mem(xenstat_domain *domain1, xenstat_domain *domain2); +static void print_mem(xenstat_domain *domain); +static void print_mem_pct(xenstat_domain *domain); +static int compare_maxmem(xenstat_domain *domain1, xenstat_domain *domain2); +static void print_maxmem(xenstat_domain *domain); +static void print_max_pct(xenstat_domain *domain); +static int compare_vcpus(xenstat_domain *domain1, xenstat_domain *domain2); +static void print_vcpus(xenstat_domain *domain); +static int compare_nets(xenstat_domain *domain1, xenstat_domain *domain2); +static void print_nets(xenstat_domain *domain); +static int compare_net_tx(xenstat_domain *domain1, xenstat_domain *domain2); +static void print_net_tx(xenstat_domain *domain); +static int compare_net_rx(xenstat_domain *domain1, xenstat_domain *domain2); +static void print_net_rx(xenstat_domain *domain); +static int compare_ssid(xenstat_domain *domain1, xenstat_domain *domain2); +static void print_ssid(xenstat_domain *domain); + +/* Section printing functions */ +static void 
do_summary(void); +static void do_header(void); +static void do_bottom_line(void); +static void do_domain(xenstat_domain *); +static void do_vcpu(xenstat_domain *); +static void do_network(xenstat_domain *); +static void top(void); + +/* Field types */ +typedef enum field_id { + FIELD_DOMID, + FIELD_STATE, + FIELD_CPU, + FIELD_CPU_PCT, + FIELD_MEM, + FIELD_MEM_PCT, + FIELD_MAXMEM, + FIELD_MAX_PCT, + FIELD_VCPUS, + FIELD_NETS, + FIELD_NET_TX, + FIELD_NET_RX, + FIELD_SSID +} field_id; + +typedef struct field { + field_id num; + const char *header; + unsigned int default_width; + int (*compare)(xenstat_domain *domain1, xenstat_domain *domain2); + void (*print)(xenstat_domain *domain); +} field; + +field fields[] = { + { FIELD_DOMID, "DOMID", 5, compare_domid, print_domid }, + { FIELD_STATE, "STATE", 6, compare_state, print_state }, + { FIELD_CPU, "CPU(sec)", 10, compare_cpu, print_cpu }, + { FIELD_CPU_PCT, "CPU(%)", 6, compare_cpu_pct, print_cpu_pct }, + { FIELD_MEM, "MEM(k)", 10, compare_mem, print_mem }, + { FIELD_MEM_PCT, "MEM(%)", 6, compare_mem, print_mem_pct }, + { FIELD_MAXMEM, "MAXMEM(k)", 10, compare_maxmem, print_maxmem }, + { FIELD_MAX_PCT, "MAXMEM(%)", 9, compare_maxmem, print_max_pct }, + { FIELD_VCPUS, "VCPUS", 5, compare_vcpus, print_vcpus }, + { FIELD_NETS, "NETS", 4, compare_nets, print_nets }, + { FIELD_NET_TX, "NETTX(k)", 8, compare_net_tx, print_net_tx }, + { FIELD_NET_RX, "NETRX(k)", 8, compare_net_rx, print_net_rx }, + { FIELD_SSID, "SSID", 4, compare_ssid, print_ssid } +}; + +const unsigned int NUM_FIELDS = sizeof(fields)/sizeof(field); + +/* Globals */ +struct timeval curtime, oldtime; +xenstat_handle *xhandle = NULL; +xenstat_node *prev_node = NULL; +xenstat_node *cur_node = NULL; +field_id sort_field = FIELD_DOMID; +unsigned int first_domain_index = 0; +unsigned int delay = 3; +int show_vcpus = 0; +int show_networks = 0; +int repeat_header = 0; +#define PROMPT_VAL_LEN 80 +char *prompt = NULL; +char prompt_val[PROMPT_VAL_LEN]; +int 
prompt_val_len = 0;
+void (*prompt_complete_func)(char *);
+
+/*
+ * Function definitions
+ */
+
+/* Utility functions */
+
+/* Print usage message, using given program name */
+static void usage(const char *program)
+{
+	printf("Usage: %s [OPTION]\n"
+	       "Displays ongoing information about xen vm resources \n\n"
+	       "-h, --help display this help and exit\n"
+	       "-V, --version output version information and exit\n"
+	       "-d, --delay=SECONDS seconds between updates (default 3)\n"
+	       "-n, --networks output vif network data\n"
+	       "-r, --repeat-header repeat table header before each domain\n"
+	       "-v, --vcpus output vcpu data\n"
+	       "\n" XENTOP_BUGSTO,
+	       program);
+	return;
+}
+
+/* Print program version information */
+static void version(void)
+{
+	printf("xentop " XENTOP_VERSION "\n"
+	       "Written by Judy Fischbach, David Hendricks, Josh Triplett\n"
+	       "\n" XENTOP_DISCLAIMER);
+}
+
+/* Clean up any open resources */
+static void cleanup(void)
+{
+	if(!isendwin())
+		endwin();
+	if(prev_node != NULL)
+		xenstat_free_node(prev_node);
+	if(cur_node != NULL)
+		xenstat_free_node(cur_node);
+	if(xhandle != NULL)
+		xenstat_uninit(xhandle);
+}
+
+/* Display the given message and gracefully exit */
+static void fail(const char *str)
+{
+	if(!isendwin())
+		endwin();
+	/* str is data, not a format string: a '%' in it must be printed
+	 * verbatim instead of being interpreted by fprintf(). */
+	fputs(str, stderr);
+	exit(1);
+}
+
+/* Return the row containing the cursor. */
+static int current_row(void)
+{
+	int y, x;
+	getyx(stdscr, y, x);
+	(void)x;	/* getyx() always fills in both; only the row is used */
+	return y;
+}
+
+/* Return the number of lines on the screen. */
+static int lines(void)
+{
+	int y, x;
+	getmaxyx(stdscr, y, x);
+	(void)x;	/* getmaxyx() always fills in both; only the row count is used */
+	return y;
+}
+
+/* printf-style print function which calls printw, but only if the cursor is
+ * not on the last line. */
+static void print(const char *fmt, ...)
+{
+	va_list args;
+
+	if(current_row() < lines()-1) {
+		va_start(args, fmt);
+		vw_printw(stdscr, fmt, args);
+		va_end(args);
+	}
+}
+
+/* Print a string with the given attributes set. 
*/
+static void attr_addstr(int attr, const char *str)
+{
+	attron(attr);
+	addstr(str);
+	attroff(attr);
+}
+
+/* Handle setting the delay from the user-supplied value in prompt_val.
+ * Values that do not parse to a positive number leave the delay unchanged. */
+static void set_delay(char *value)
+{
+	int new_delay;
+	new_delay = atoi(value);
+	if(new_delay > 0)
+		delay = new_delay;
+}
+
+/* Enable prompting mode with the given prompt string; call the given function
+ * when a value is available.  A NULL prompt switches prompting mode off. */
+static void set_prompt(char *new_prompt, void (*func)(char *))
+{
+	prompt = new_prompt;
+	prompt_val[0] = '\0';
+	prompt_val_len = 0;
+	prompt_complete_func = func;
+}
+
+/* Handle user input, return 0 if the program should quit, or 1 if not */
+static int handle_key(int ch)
+{
+	if(prompt == NULL) {
+		/* Not prompting for input; handle interactive commands */
+		switch(ch) {
+		case 'n': case 'N':
+			show_networks ^= 1;
+			break;
+		case 'r': case 'R':
+			repeat_header ^= 1;
+			break;
+		case 's': case 'S':
+			sort_field = (sort_field + 1) % NUM_FIELDS;
+			break;
+		case 'v': case 'V':
+			show_vcpus ^= 1;
+			break;
+		case KEY_DOWN:
+			first_domain_index++;
+			break;
+		case KEY_UP:
+			if(first_domain_index > 0)
+				first_domain_index--;
+			break;
+		case 'd': case 'D':
+			set_prompt("Delay(sec)", set_delay);
+			break;
+		case 'q': case 'Q': case KEY_ESCAPE:
+			return 0;
+		}
+	} else {
+		/* Prompting for input; handle line editing */
+		switch(ch) {
+		case '\r':
+			prompt_complete_func(prompt_val);
+			set_prompt(NULL, NULL);
+			break;
+		case KEY_ESCAPE:
+			set_prompt(NULL, NULL);
+			break;
+		case KEY_BACKSPACE:
+			if(prompt_val_len > 0)
+				prompt_val[--prompt_val_len] = '\0';
+			/* Do not fall through: the key code must not be
+			 * offered to the append logic below. */
+			break;
+		default:
+			/* isprint() is only defined for values that fit in an
+			 * unsigned char (or EOF); curses key codes such as
+			 * KEY_UP are larger, so range-check first. */
+			if((prompt_val_len+1) < PROMPT_VAL_LEN
+			   && ch >= 0 && ch <= 0xFF && isprint(ch)) {
+				prompt_val[prompt_val_len++] = (char)ch;
+				prompt_val[prompt_val_len] = '\0';
+			}
+			break;
+		}
+	}
+
+	return 1;
+}
+
+/* Compares two integers, returning -1,0,1 for <,=,> */
+static int compare(unsigned long long i1, unsigned long long i2)
+{
+	if(i1 < i2)
+		return -1;
+	if(i1 > i2)
+		return 1;
+	return 0;
+}
+
+/* Comparison function for 
use with qsort. Compares two domains using the
+ * current sort field. */
+static int compare_domains(xenstat_domain **domain1, xenstat_domain **domain2)
+{
+	/* qsort hands us pointers to the array elements, which are themselves
+	 * pointers to domains; dereference once for the field comparator */
+	return fields[sort_field].compare(*domain1, *domain2);
+}
+
+/* Field functions */
+
+/* Compares domain ids of two domains, returning -1,0,1 for <,=,> */
+int compare_domid(xenstat_domain *domain1, xenstat_domain *domain2)
+{
+	return compare(xenstat_domain_id(domain1), xenstat_domain_id(domain2));
+}
+
+/* Prints domain identification number */
+void print_domid(xenstat_domain *domain)
+{
+	print("%5u", xenstat_domain_id(domain));
+}
+
+/* Table of domain state predicates and the letter shown for each state.
+ * The order of the entries defines both the column order used by
+ * print_state() and the precedence used by compare_state(). */
+struct {
+	unsigned int (*get)(xenstat_domain *);
+	char ch;
+} state_funcs[] = {
+	{ xenstat_domain_dying,    'd' },
+	{ xenstat_domain_shutdown, 's' },
+	{ xenstat_domain_blocked,  'b' },
+	{ xenstat_domain_crashed,  'c' },
+	{ xenstat_domain_paused,   'p' },
+	{ xenstat_domain_running,  'r' }
+};
+const unsigned int NUM_STATES = sizeof(state_funcs)/sizeof(*state_funcs);
+
+/* Compare states of two domains, returning -1,0,1 for <,=,>.
+ * States are checked in state_funcs order; the first state that only one of
+ * the two domains is in decides the result, with that domain sorting first. */
+static int compare_state(xenstat_domain *domain1, xenstat_domain *domain2)
+{
+	unsigned int i, d1s, d2s;
+	for(i = 0; i < NUM_STATES; i++) {
+		d1s = state_funcs[i].get(domain1);
+		d2s = state_funcs[i].get(domain2);
+		if(d1s && !d2s)
+			return -1;
+		if(d2s && !d1s)
+			return 1;
+	}
+	return 0;
+}
+
+/* Prints domain state in abbreviated letter format: one character per entry
+ * of state_funcs, the state's letter if the domain is in it, '-' otherwise */
+static void print_state(xenstat_domain *domain)
+{
+	unsigned int i;
+	for(i = 0; i < NUM_STATES; i++)
+		print("%c", state_funcs[i].get(domain) ? state_funcs[i].ch
+		                                       : '-');
+}
+
+/* Compares cpu usage of two domains, returning -1,0,1 for <,=,>.
+ * The result is negated so that the busiest domain sorts first. */
+static int compare_cpu(xenstat_domain *domain1, xenstat_domain *domain2)
+{
+	return -compare(xenstat_domain_cpu_ns(domain1),
+			xenstat_domain_cpu_ns(domain2));
+}
+
+/* Prints domain cpu usage in seconds */
+static void print_cpu(xenstat_domain *domain)
+{
+	print("%10llu", xenstat_domain_cpu_ns(domain)/1000000000);
+}
+
+/* Computes the CPU percentage used for a specified domain from the CPU time
+ * it consumed between the previous sample (prev_node) and the current one,
+ * relative to the wall-clock time between oldtime and curtime.
+ * Returns 0.0 when there is no previous sample, when the domain was not
+ * present in the previous sample, or when no time has elapsed. */
+static double get_cpu_pct(xenstat_domain *domain)
+{
+	xenstat_domain *old_domain;
+	double us_elapsed;
+
+	/* Can't calculate CPU percentage without a previous sample. */
+	if(prev_node == NULL)
+		return 0.0;
+
+	old_domain = xenstat_node_domain(prev_node, xenstat_domain_id(domain));
+	if(old_domain == NULL)
+		return 0.0;
+
+	/* Calculate the time elapsed in microseconds */
+	us_elapsed = ((curtime.tv_sec-oldtime.tv_sec)*1000000.0
+		      +(curtime.tv_usec - oldtime.tv_usec));
+
+	/* Avoid dividing by zero (or a negative interval if the clock was
+	 * stepped backwards) when two samples share a timestamp */
+	if(us_elapsed <= 0.0)
+		return 0.0;
+
+	/* In the following, nanoseconds must be divided by 1000.0 to
+	 * convert to microseconds, then multiplied by 100.0 to get a
+	 * percentage, resulting in a division by 10.0 */
+	return ((xenstat_domain_cpu_ns(domain)
+		 -xenstat_domain_cpu_ns(old_domain))/10.0)/us_elapsed;
+}
+
+/* Compares cpu percentage of two domains, returning -1,0,1 for <,=,> with
+ * the sense inverted so that the busiest domain sorts first.
+ * The values are compared as doubles: passing them through compare() would
+ * truncate them to integers and make fractional differences invisible. */
+static int compare_cpu_pct(xenstat_domain *domain1, xenstat_domain *domain2)
+{
+	double pct1 = get_cpu_pct(domain1);
+	double pct2 = get_cpu_pct(domain2);
+
+	if(pct1 > pct2)
+		return -1;
+	if(pct1 < pct2)
+		return 1;
+	return 0;
+}
+
+/* Prints cpu percentage statistic */
+static void print_cpu_pct(xenstat_domain *domain)
+{
+	print("%6.1f", get_cpu_pct(domain));
+}
+
+/* Compares current memory of two domains, returning -1,0,1 for <,=,>.
+ * The result is negated so that the largest domain sorts first. */
+static int compare_mem(xenstat_domain *domain1, xenstat_domain *domain2)
+{
+	return -compare(xenstat_domain_cur_mem(domain1),
+	                xenstat_domain_cur_mem(domain2));
+}
+
+/* Prints current memory statistic (value divided by 1024 for display) */
+static void print_mem(xenstat_domain *domain)
+{
+	print("%10llu", xenstat_domain_cur_mem(domain)/1024);
+}
+
+/* Prints memory percentage statistic, ratio of current domain
memory to total
+ * node memory */
+static void print_mem_pct(xenstat_domain *domain)
+{
+	double cur_mem = (double)xenstat_domain_cur_mem(domain);
+	double tot_mem = (double)xenstat_node_tot_mem(cur_node);
+
+	print("%6.1f", cur_mem / tot_mem * 100);
+}
+
+/* Orders two domains by their maximum memory.  The result of compare() is
+ * negated, so the domain with the larger maximum sorts first. */
+static int compare_maxmem(xenstat_domain *domain1, xenstat_domain *domain2)
+{
+	unsigned long long max1 = xenstat_domain_max_mem(domain1);
+	unsigned long long max2 = xenstat_domain_max_mem(domain2);
+
+	return -compare(max1, max2);
+}
+
+/* Prints the maximum memory of a domain divided by 1024, or the text
+ * "no limit" when the maximum is the all-ones sentinel value */
+static void print_maxmem(xenstat_domain *domain)
+{
+	const unsigned long long limit = xenstat_domain_max_mem(domain);
+
+	if(limit != (unsigned long long)-1) {
+		print("%10llu", limit/1024);
+		return;
+	}
+	print("%10s", "no limit");
+}
+
+/* Prints maximum memory percentage statistic, ratio of maximum domain memory
+ * to total node memory, or "n/a" when the domain has no memory limit */
+static void print_max_pct(xenstat_domain *domain)
+{
+	const unsigned long long limit = xenstat_domain_max_mem(domain);
+
+	if(limit == (unsigned long long)-1) {
+		print("%9s", "n/a");
+		return;
+	}
+	print("%9.1f", (double)limit /
+	               (double)xenstat_node_tot_mem(cur_node) * 100);
+}
+
+/* Orders two domains by their number of virtual CPUs.  The result of
+ * compare() is negated, so the domain with more VCPUs sorts first. */
+static int compare_vcpus(xenstat_domain *domain1, xenstat_domain *domain2)
+{
+	unsigned long long vcpus1 = xenstat_domain_num_vcpus(domain1);
+	unsigned long long vcpus2 = xenstat_domain_num_vcpus(domain2);
+
+	return -compare(vcpus1, vcpus2);
+}
+
+/* Prints the number of virtual CPUs of a domain */
+static void print_vcpus(xenstat_domain *domain)
+{
+	print("%5u", xenstat_domain_num_vcpus(domain));
+}
+
+/* Orders two domains by their number of virtual networks.  The result of
+ * compare() is negated, so the domain with more networks sorts first. */
+static int compare_nets(xenstat_domain *domain1, xenstat_domain *domain2)
+{
+	unsigned long long nets1 = xenstat_domain_num_networks(domain1);
+	unsigned long long nets2 = xenstat_domain_num_networks(domain2);
+
+	return -compare(nets1, nets2);
+}
+
+/* Prints the number of virtual networks of a domain */
+static void print_nets(xenstat_domain *domain)
+{
+	print("%4u", xenstat_domain_num_networks(domain));
+}
+
+/* Compares number of total network tx bytes of two domains,
returning -1,0,1 for
+ * <,=,> */
+static int compare_net_tx(xenstat_domain *domain1, xenstat_domain *domain2)
+{
+	return -compare(tot_net_bytes(domain1, FALSE),
+	                tot_net_bytes(domain2, FALSE));
+}
+
+/* Prints number of total network tx bytes statistic (divided by 1024) */
+static void print_net_tx(xenstat_domain *domain)
+{
+	print("%8llu", tot_net_bytes(domain, FALSE)/1024);
+}
+
+/* Compares number of total network rx bytes of two domains, returning -1,0,1 for
+ * <,=,> */
+static int compare_net_rx(xenstat_domain *domain1, xenstat_domain *domain2)
+{
+	return -compare(tot_net_bytes(domain1, TRUE),
+	                tot_net_bytes(domain2, TRUE));
+}
+
+/* Prints number of total network rx bytes statistic (divided by 1024) */
+static void print_net_rx(xenstat_domain *domain)
+{
+	print("%8llu", tot_net_bytes(domain, TRUE)/1024);
+}
+
+/* Gets number of total network bytes statistic, if rx true, then rx bytes
+ * otherwise tx bytes.  The total is summed over every network of the domain.
+ */
+static unsigned long long tot_net_bytes(xenstat_domain *domain, int rx_flag)
+{
+	/* Unsigned index to match the unsigned network count */
+	unsigned i = 0;
+	xenstat_network *network;
+	unsigned num_networks = 0;
+	unsigned long long total = 0;
+
+	/* How many networks? */
+	num_networks = xenstat_domain_num_networks(domain);
+
+	/* Dump information for each network */
+	for (i=0; i < num_networks; i++) {
+		/* Next get the network information */
+		network = xenstat_domain_network(domain,i);
+		if (rx_flag)
+			total += xenstat_network_rbytes(network);
+		else
+			total += xenstat_network_tbytes(network);
+	}
+	return (total);
+}
+
+/* Compares security id (ssid) of two domains, returning -1,0,1 for <,=,> */
+static int compare_ssid(xenstat_domain *domain1, xenstat_domain *domain2)
+{
+	return compare(xenstat_domain_ssid(domain1),
+		       xenstat_domain_ssid(domain2));
+}
+
+/* Prints ssid statistic */
+static void print_ssid(xenstat_domain *domain)
+{
+	print("%4u", xenstat_domain_ssid(domain));
+}
+
+/* Section printing functions */
+/* Prints the top summary, above the domain table: the current time, a count
+ * of domains per state, and the node's memory and CPU figures */
+void do_summary(void)
+{
+#define TIME_STR_LEN 9
+	const char *TIME_STR_FORMAT = "%H:%M:%S";
+	char time_str[TIME_STR_LEN];
+	unsigned run = 0, block = 0, pause = 0,
+	         crash = 0, dying = 0, shutdown = 0;
+	unsigned i, num_domains = 0;
+	unsigned long long used = 0;
+	xenstat_domain *domain;
+
+	/* Print program name, current time, and number of domains */
+	strftime(time_str, TIME_STR_LEN, TIME_STR_FORMAT,
+	         localtime(&curtime.tv_sec));
+	num_domains = xenstat_node_num_domains(cur_node);
+	print("xentop - %s\n", time_str);
+
+	/* Tabulate what states domains are in for summary.  Each domain is
+	 * counted at most once, under the first state in this chain that
+	 * applies to it. */
+	for (i=0; i < num_domains; i++) {
+		domain = xenstat_node_domain_by_index(cur_node,i);
+		if (xenstat_domain_running(domain)) run++;
+		else if (xenstat_domain_blocked(domain)) block++;
+		else if (xenstat_domain_paused(domain)) pause++;
+		else if (xenstat_domain_shutdown(domain)) shutdown++;
+		else if (xenstat_domain_crashed(domain)) crash++;
+		else if (xenstat_domain_dying(domain)) dying++;
+	}
+
+	print("%u domains: %u running, %u blocked, %u paused, "
+	      "%u crashed, %u dying, %u shutdown \n",
+	      num_domains, run, block, pause, crash, dying, shutdown);
+
+	used = xenstat_node_tot_mem(cur_node)-xenstat_node_free_mem(cur_node);
+
+	/* Dump node memory and cpu information */
+	print("Mem: %lluk total, %lluk used, %lluk free "
+	      "CPUs: %u @ %lluMHz\n",
+	      xenstat_node_tot_mem(cur_node)/1024, used/1024,
+	      xenstat_node_free_mem(cur_node)/1024,
+	      xenstat_node_num_cpus(cur_node),
+	      xenstat_node_cpu_hz(cur_node)/1000000);
+}
+
+/* Display the top header for the domain table: one heading per field, in
+ * reverse video, with the current sort column additionally in bold */
+void do_header(void)
+{
+	field_id i;
+
+	/* Turn on REVERSE highlight attribute for headings */
+	attron(A_REVERSE);
+	for(i = 0; i < NUM_FIELDS; i++) {
+		if(i != 0)
+			print(" ");
+		/* The BOLD attribute is turned on for the sort column */
+		if(i == sort_field)
+			attron(A_BOLD);
+		print("%*s", fields[i].default_width, fields[i].header);
+		if(i == sort_field)
+			attroff(A_BOLD);
+	}
+	attroff(A_REVERSE);
+	print("\n");
+}
+
+/* Displays bottom status line or current prompt.  In the status line the
+ * hot-key letter of each command is shown in reverse video, and toggles that
+ * are currently enabled are drawn with colour pair 1. */
+void do_bottom_line(void)
+{
+	move(lines()-1, 2);
+
+	if (prompt != NULL) {
+		printw("%s: %s", prompt, prompt_val);
+	} else {
+		addch(A_REVERSE | 'D'); addstr("elay ");
+
+		/* network */
+		addch(A_REVERSE | 'N');
+		attr_addstr(show_networks ? COLOR_PAIR(1) : 0, "etworks");
+		addstr(" ");
+
+		/* vcpus */
+		addch(A_REVERSE | 'V');
+		attr_addstr(show_vcpus ? COLOR_PAIR(1) : 0, "CPUs");
+		addstr(" ");
+
+		/* repeat */
+		addch(A_REVERSE | 'R');
+		attr_addstr(repeat_header ? COLOR_PAIR(1) : 0, "epeat header");
+		addstr(" ");
+
+		/* sort order */
+		addch(A_REVERSE | 'S'); addstr("ort order ");
+
+		addch(A_REVERSE | 'Q'); addstr("uit ");
+	}
+}
+
+/* Prints Domain information: one row with every field of the domain, the
+ * current sort column in bold */
+void do_domain(xenstat_domain *domain)
+{
+	unsigned int i;
+	for(i = 0; i < NUM_FIELDS; i++) {
+		if(i != 0)
+			print(" ");
+		if(i == sort_field)
+			attron(A_BOLD);
+		fields[i].print(domain);
+		if(i == sort_field)
+			attroff(A_BOLD);
+	}
+	print("\n");
+}
+
+/* Output all vcpu information: the CPU time in seconds of every VCPU of the
+ * domain, five VCPUs per output line */
+void do_vcpu(xenstat_domain *domain)
+{
+	/* Unsigned to match both the VCPU count and the %2u conversion */
+	unsigned i = 0;
+	unsigned num_vcpus = 0;
+	xenstat_vcpu *vcpu;
+
+	print("VCPUs(sec): ");
+
+	num_vcpus = xenstat_domain_num_vcpus(domain);
+
+	/* for all vcpus dump out values */
+	for (i=0; i< num_vcpus; i++) {
+		vcpu = xenstat_domain_vcpu(domain,i);
+
+		if (i != 0 && (i%5)==0)
+			print("\n ");
+		print(" %2u: %10llus", i, xenstat_vcpu_ns(vcpu)/1000000000);
+	}
+	print("\n");
+}
+
+/* Output all network information: one line per network of the domain with
+ * its receive and transmit byte, packet, error and drop counters */
+void do_network(xenstat_domain *domain)
+{
+	int i = 0;
+	xenstat_network *network;
+	unsigned num_networks = 0;
+
+	/* How many networks? */
+	num_networks = xenstat_domain_num_networks(domain);
+
+	/* Dump information for each network */
+	for (i=0; i < num_networks; i++) {
+		/* Next get the network information */
+		network = xenstat_domain_network(domain,i);
+
+		print("Net%d RX: %8llubytes %8llupkts %8lluerr %8lludrop ",
+		      i,
+		      xenstat_network_rbytes(network),
+		      xenstat_network_rpackets(network),
+		      xenstat_network_rerrs(network),
+		      xenstat_network_rdrop(network));
+
+		print("TX: %8llubytes %8llupkts %8lluerr %8lludrop\n",
+		      xenstat_network_tbytes(network),
+		      xenstat_network_tpackets(network),
+		      xenstat_network_terrs(network),
+		      xenstat_network_tdrop(network));
+	}
+}
+
+/* Takes a new statistics sample and draws one full screen: summary, the
+ * sorted domain table starting at first_domain_index, and the bottom line.
+ * The previous sample is kept in prev_node so that CPU percentages can be
+ * computed; the sample before that is released. */
+static void top(void)
+{
+	xenstat_domain **domains;
+	unsigned int i, num_domains = 0;
+
+	/* Now get the node information */
+	if (prev_node != NULL)
+		xenstat_free_node(prev_node);
+	prev_node = cur_node;
+	cur_node = xenstat_get_node(xhandle, XENSTAT_ALL);
+	if (cur_node == NULL)
+		fail("Failed to retrieve statistics from libxenstat\n");
+
+	/* dump summary top information */
+	do_summary();
+
+	/* Count the number of domains for which to report data */
+	num_domains = xenstat_node_num_domains(cur_node);
+
+	/* Always request at least one element: malloc(0) is allowed to
+	 * return NULL, which would otherwise be reported as a failure */
+	domains = malloc((num_domains ? num_domains : 1)
+			 * sizeof(xenstat_domain *));
+	if(domains == NULL)
+		fail("Failed to allocate memory\n");
+
+	for (i=0; i < num_domains; i++)
+		domains[i] = xenstat_node_domain_by_index(cur_node, i);
+
+	/* Sort */
+	qsort(domains, num_domains, sizeof(xenstat_domain *),
+	      (int(*)(const void *, const void *))compare_domains);
+
+	/* Clamp the scroll position to the last domain, without letting the
+	 * unsigned subtraction wrap around when there are no domains */
+	if(first_domain_index >= num_domains)
+		first_domain_index = (num_domains > 0) ? num_domains-1 : 0;
+
+	for (i = first_domain_index; i < num_domains; i++) {
+		/* Stop before overwriting the row used by the bottom line */
+		if(current_row() == lines()-1)
+			break;
+		if (i == first_domain_index || repeat_header)
+			do_header();
+		do_domain(domains[i]);
+		if (show_vcpus)
+			do_vcpu(domains[i]);
+		if (show_networks)
+			do_network(domains[i]);
+	}
+
+	do_bottom_line();
+
+	/* The array only holds pointers into cur_node; release the array
+	 * itself so that it is not leaked on every refresh */
+	free(domains);
+}
+
+/* Program entry point: parses the command line, initialises libxenstat and
+ * curses, then redraws the display whenever a key was pressed or at least
+ * `delay` seconds have passed, until handle_key() asks to quit.
+ * Returns 0 on normal exit. */
+int main(int argc, char **argv)
+{
+	/* lopt_index receives the index of a matched long option; it is kept
+	 * distinct from getopt's own global optind */
+	int opt, lopt_index = 0;
+	int ch = ERR;
+
+	struct option lopts[] = {
+		{ "help",          no_argument,       NULL, 'h' },
+		{ "version",       no_argument,       NULL, 'V' },
+		{ "networks",      no_argument,       NULL, 'n' },
+		{ "repeat-header", no_argument,       NULL, 'r' },
+		{ "vcpus",         no_argument,       NULL, 'v' },
+		{ "delay",         required_argument, NULL, 'd' },
+		{ 0, 0, 0, 0 },
+	};
+	/* One short option per long option above, so that -r works too */
+	const char *sopts = "hVnrvd:";
+
+	if (atexit(cleanup) != 0)
+		fail("Failed to install cleanup handler.\n");
+
+	while ((opt = getopt_long(argc, argv, sopts, lopts, &lopt_index)) != -1) {
+		switch (opt) {
+		case 'h':
+		case '?':
+		default:
+			usage(argv[0]);
+			exit(0);
+		case 'V':
+			version();
+			exit(0);
+		case 'n':
+			show_networks = 1;
+			break;
+		case 'r':
+			repeat_header = 1;
+			break;
+		case 'v':
+			show_vcpus = 1;
+			break;
+		case 'd':
+			delay = atoi(optarg);
+			break;
+		}
+	}
+
+	/* Get xenstat handle */
+	xhandle = xenstat_init();
+	if (xhandle == NULL)
+		fail("Failed to initialize xenstat library\n");
+
+	/* Begin curses stuff */
+	initscr();
+	start_color();
+	cbreak();
+	noecho();
+	nonl();
+	keypad(stdscr, TRUE);
+	/* getch() gives up and returns ERR after 5 tenths of a second */
+	halfdelay(5);
+	use_default_colors();
+	init_pair(1, -1, COLOR_YELLOW);
+
+	do {
+		gettimeofday(&curtime, NULL);
+		/* Redraw right after a key press, or once the delay expired */
+		if(ch != ERR || (curtime.tv_sec - oldtime.tv_sec) >= delay) {
+			clear();
+			top();
+			oldtime = curtime;
+			refresh();
+		}
+		ch = getch();
+	} while (handle_key(ch));
+
+	/* Cleanup occurs in cleanup(), so no work to do here. */
+
+	return 0;
+}
diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/COPYING --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/COPYING Thu Aug 25 22:53:20 2005 @@ -0,0 +1,515 @@ +This license (LGPL) applies to the xenstore library which interfaces +with the xenstore daemon (as stated in xs.c, xs.h, xs_lib.c and +xs_lib.h). The remaining files in the directory are licensed as +stated in the comments (as of this writing, GPL, see ../../COPYING). + + + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. 
+ 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations +below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. 
If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. 
The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it +becomes a de-facto standard. To achieve this, non-free programs must +be allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. 
+ + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control +compilation and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. 
You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) 
+ +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. 
You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. 
+ + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. 
(It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at least + three years, to give the same user the materials specified in + Subsection 6a, above, for a charge no more than the cost of + performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. 
You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. 
Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply, and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. 
Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License +may add an explicit geographical distribution limitation excluding those +countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. 
Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms +of the ordinary General Public License). + + To apply these terms, attach the following notices to the library. 
+It is safest to attach them to the start of each source file to most +effectively convey the exclusion of warranty; and each file should +have at least the "copyright" line and a pointer to where the full +notice is found. + + + <one line to give the library's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or +your school, if any, to sign a "copyright disclaimer" for the library, +if necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James + Random Hacker. + + <signature of Ty Coon>, 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! + + diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/testsuite/01simple.test --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/testsuite/01simple.test Thu Aug 25 22:53:20 2005 @@ -0,0 +1,4 @@ +# Create an entry, read it. 
+write /test create contents +expect contents +read /test diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/testsuite/02directory.test --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/testsuite/02directory.test Thu Aug 25 22:53:20 2005 @@ -0,0 +1,47 @@ +# Root directory has only tool dir in it. +expect tool +dir / + +# Create a file. +write /test create contents + +# Directory shows it. +expect test +expect tool +dir / + +# Make a new directory, check it's there +mkdir /dir +expect dir +expect test +expect tool +dir / + +# Check it's empty. +dir /dir + +# Create a file, check it exists. +write /dir/test2 create contents2 +expect test2 +dir /dir +expect contents2 +read /dir/test2 + +# Creating dir over the top should fail. +expect mkdir failed: File exists +mkdir /dir +expect mkdir failed: File exists +mkdir /dir/test2 + +# Mkdir implicitly creates directories. +mkdir /dir/1/2/3/4 +expect test2 +expect 1 +dir /dir +expect 2 +dir /dir/1 +expect 3 +dir /dir/1/2 +expect 4 +dir /dir/1/2/3 +dir /dir/1/2/3/4 diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/testsuite/03write.test --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/testsuite/03write.test Thu Aug 25 22:53:20 2005 @@ -0,0 +1,39 @@ +# Write without create fails. +expect write failed: No such file or directory +write /test none contents + +# Exclusive write succeeds +write /test excl contents +expect contents +read /test + +# Exclusive write fails to overwrite. +expect write failed: File exists +write /test excl contents + +# Non-exclusive overwrite succeeds. 
+write /test none contents2 +expect contents2 +read /test +write /test create contents3 +expect contents3 +read /test + +# Write should implicitly create directories +write /dir/test create contents +expect test +dir /dir +expect contents +read /dir/test +write /dir/1/2/3/4 excl contents4 +expect test +expect 1 +dir /dir +expect 2 +dir /dir/1 +expect 3 +dir /dir/1/2 +expect 4 +dir /dir/1/2/3 +expect contents4 +read /dir/1/2/3/4 diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/testsuite/04rm.test --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/testsuite/04rm.test Thu Aug 25 22:53:20 2005 @@ -0,0 +1,18 @@ +# Removing a non-existent entry fails. +expect rm failed: No such file or directory +rm /test +expect rm failed: No such file or directory +rm /dir/test + +# Create file and remove it +write /test excl contents +rm /test + +# Create directory and remove it. +mkdir /dir +rm /dir + +# Create directory, create file, remove all. +mkdir /dir +write /dir/test excl contents +rm /dir diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/testsuite/05filepermissions.test --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/testsuite/05filepermissions.test Thu Aug 25 22:53:20 2005 @@ -0,0 +1,81 @@ +# Fail to get perms on non-existent file. +expect getperm failed: No such file or directory +getperm /test +expect getperm failed: No such file or directory +getperm /dir/test + +# Create file: inherits from root (0 READ) +write /test excl contents +expect 0 READ +getperm /test +setid 1 +expect 0 READ +getperm /test +expect contents +read /test +expect write failed: Permission denied +write /test none contents + +# Take away read access to file. +setid 0 +setperm /test 0 NONE +setid 1 +expect getperm failed: Permission denied +getperm /test +expect read failed: Permission denied +read /test +expect write failed: Permission denied +write /test none contents + +# Grant everyone write access to file.
+setid 0 +setperm /test 0 WRITE +setid 1 +expect getperm failed: Permission denied +getperm /test +expect read failed: Permission denied +read /test +write /test none contents2 +setid 0 +expect contents2 +read /test + +# Grant everyone both read and write access. +setperm /test 0 READ/WRITE +setid 1 +expect 0 READ/WRITE +getperm /test +expect contents2 +read /test +write /test none contents3 +expect contents3 +read /test + +# Change so that user 1 owns it, noone else can do anything. +setid 0 +setperm /test 1 NONE +setid 1 +expect 1 NONE +getperm /test +expect contents3 +read /test +write /test none contents4 + +# User 2 can do nothing. +setid 2 +expect setperm failed: Permission denied +setperm /test 2 NONE +expect getperm failed: Permission denied +getperm /test +expect read failed: Permission denied +read /test +expect write failed: Permission denied +write /test none contents4 + +# Tools can always access things. +setid 0 +expect 1 NONE +getperm /test +expect contents4 +read /test +write /test none contents5 diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/testsuite/06dirpermissions.test --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/testsuite/06dirpermissions.test Thu Aug 25 22:53:20 2005 @@ -0,0 +1,127 @@ +# Root directory: owned by tool, everyone has read access. +expect 0 READ +getperm / + +# Create directory: inherits from root. +mkdir /dir +expect 0 READ +getperm /dir +setid 1 +expect 0 READ +getperm /dir +dir /dir +expect write failed: Permission denied +write /dir/test create contents2 + +# Remove everyone's read access to directoy. +setid 0 +setperm /dir 0 NONE +setid 1 +expect dir failed: Permission denied +dir /dir +expect read failed: Permission denied +read /dir/test create contents2 +expect write failed: Permission denied +write /dir/test create contents2 + +# Grant everyone write access to directory. 
+setid 0 +setperm /dir 0 WRITE +setid 1 +expect getperm failed: Permission denied +getperm /dir +expect dir failed: Permission denied +dir /dir +write /dir/test create contents +setid 0 +expect 1 WRITE +getperm /dir/test +setperm /dir/test 0 NONE +expect contents +read /dir/test + +# Grant everyone both read and write access. +setperm /dir 0 READ/WRITE +setid 1 +expect 0 READ/WRITE +getperm /dir +expect test +dir /dir +write /dir/test2 create contents +expect contents +read /dir/test2 +setperm /dir/test2 1 NONE + +# Change so that user 1 owns it, noone else can do anything. +setid 0 +setperm /dir 1 NONE +expect 1 NONE +getperm /dir +expect test +expect test2 +dir /dir +write /dir/test3 create contents + +# User 2 can do nothing. Can't even tell if file exists. +setid 2 +expect setperm failed: Permission denied +setperm /dir 2 NONE +expect getperm failed: Permission denied +getperm /dir +expect dir failed: Permission denied +dir /dir +expect read failed: Permission denied +read /dir/test +expect read failed: Permission denied +read /dir/test2 +expect read failed: Permission denied +read /dir/test3 +expect read failed: Permission denied +read /dir/test4 +expect write failed: Permission denied +write /dir/test none contents +expect write failed: Permission denied +write /dir/test create contents +expect write failed: Permission denied +write /dir/test excl contents +expect write failed: Permission denied +write /dir/test4 none contents +expect write failed: Permission denied +write /dir/test4 create contents +expect write failed: Permission denied +write /dir/test4 excl contents + +# Tools can always access things. +setid 0 +expect 1 NONE +getperm /dir +expect test +expect test2 +expect test3 +dir /dir +write /dir/test4 create contents + +# Inherited by child. +mkdir /dir/subdir +expect 1 NONE +getperm /dir/subdir +write /dir/subfile excl contents +expect 1 NONE +getperm /dir/subfile + +# But for domains, they own it. 
+setperm /dir/subdir 2 READ/WRITE +expect 2 READ/WRITE +getperm /dir/subdir +setid 3 +write /dir/subdir/subfile excl contents +expect 3 READ/WRITE +getperm /dir/subdir/subfile + +# Inheritence works through multiple directories, too. +write /dir/subdir/1/2/3/4 excl contents +expect 3 READ/WRITE +getperm /dir/subdir/1/2/3/4 +mkdir /dir/subdir/a/b/c/d +expect 3 READ/WRITE +getperm /dir/subdir/a/b/c/d diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/testsuite/07watch.test --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/testsuite/07watch.test Thu Aug 25 22:53:20 2005 @@ -0,0 +1,194 @@ +# Watch something, write to it, check watch has fired. +write /test create contents + +1 watch /test token +2 write /test create contents2 +expect 1:/test:token +1 waitwatch +1 ackwatch token +1 close + +# Check that reads don't set it off. +1 watch /test token +expect 2:contents2 +2 read /test +expect 1: waitwatch failed: Connection timed out +1 waitwatch +1 close + +# mkdir, setperm and rm should (also tests watching dirs) +mkdir /dir +1 watch /dir token +2 mkdir /dir/newdir +expect 1:/dir/newdir:token +1 waitwatch +1 ackwatch token +2 setperm /dir/newdir 0 READ +expect 1:/dir/newdir:token +1 waitwatch +1 ackwatch token +2 rm /dir/newdir +expect 1:/dir/newdir:token +1 waitwatch +1 ackwatch token +1 close +2 close + +# We don't get a watch from our own commands. +watch /dir token +mkdir /dir/newdir +expect waitwatch failed: Connection timed out +waitwatch +close + +# ignore watches while doing commands, should work. 
+watch /dir token +1 write /dir/test create contents +expect contents +read /dir/test +expect /dir/test:token +waitwatch +ackwatch token +close + +# watch priority test: all simultaneous +1 watch /dir token1 +3 watch /dir token3 +2 watch /dir token2 +write /dir/test create contents +expect 3:/dir/test:token3 +3 waitwatch +3 ackwatch token3 +expect 2:/dir/test:token2 +2 waitwatch +2 ackwatch token2 +expect 1:/dir/test:token1 +1 waitwatch +1 ackwatch token1 +1 close +2 close +3 close + +# If one dies (without acking), the other should still get ack. +1 watch /dir token1 +2 watch /dir token2 +write /dir/test create contents +expect 2:/dir/test:token2 +2 waitwatch +2 close +expect 1:/dir/test:token1 +1 waitwatch +1 ackwatch token1 +1 close + +# If one dies (without reading at all), the other should still get ack. +1 watch /dir token1 +2 watch /dir token2 +write /dir/test create contents +2 close +expect 1:/dir/test:token1 +1 waitwatch +1 ackwatch token1 +1 close +2 close + +# unwatch +1 watch /dir token1 +1 unwatch /dir token1 +1 watch /dir token2 +2 write /dir/test2 create contents +expect 1:/dir/test2:token2 +1 waitwatch +1 unwatch /dir token2 +1 close +2 close + +# unwatch while watch pending. Other watcher still gets the event. +1 watch /dir token1 +2 watch /dir token2 +write /dir/test create contents +2 unwatch /dir token2 +expect 1:/dir/test:token1 +1 waitwatch +1 ackwatch token1 +1 close +2 close + +# unwatch while watch pending. Should clear this so we get next event. +1 watch /dir token1 +write /dir/test create contents +1 unwatch /dir token1 +1 watch /dir/test token2 +write /dir/test none contents2 +expect 1:/dir/test:token2 +1 waitwatch +1 ackwatch token2 + +# check we only get notified once. +1 watch /test token +2 write /test create contents2 +expect 1:/test:token +1 waitwatch +1 ackwatch token +expect 1: waitwatch failed: Connection timed out +1 waitwatch +1 close + +# watches are queued in order. 
+1 watch / token +2 write /test1 create contents +2 write /test2 create contents +2 write /test3 create contents +expect 1:/test1:token +1 waitwatch +1 ackwatch token +expect 1:/test2:token +1 waitwatch +1 ackwatch token +expect 1:/test3:token +1 waitwatch +1 ackwatch token +1 close + +# Creation of subpaths should be covered correctly. +1 watch / token +2 write /test/subnode create contents2 +2 write /test/subnode/subnode create contents2 +expect 1:/test/subnode:token +1 waitwatch +1 ackwatch token +expect 1:/test/subnode/subnode:token +1 waitwatch +1 ackwatch token +expect 1: waitwatch failed: Connection timed out +1 waitwatch +1 close + +# Watch event must have happened before we registered interest. +1 watch / token +2 write /test/subnode create contents2 +1 watch / token2 0 +expect 1:/test/subnode:token +1 waitwatch +1 ackwatch token +expect 1: waitwatch failed: Connection timed out +1 waitwatch +1 close + +# Rm fires notification on child. +1 watch /test/subnode token +2 rm /test +expect 1:/test/subnode:token +1 waitwatch +1 ackwatch token + +# Watch should not double-send after we ack, even if we did something in between. +1 watch /test2 token +2 write /test2/foo create contents2 +expect 1:/test2/foo:token +1 waitwatch +expect 1:contents2 +1 read /test2/foo +1 ackwatch token +expect 1: waitwatch failed: Connection timed out +1 waitwatch diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/testsuite/08transaction.slowtest --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/testsuite/08transaction.slowtest Thu Aug 25 22:53:20 2005 @@ -0,0 +1,21 @@ +# Test transaction timeouts. Take a second each. + +mkdir /test +write /test/entry1 create contents + +# Transactions can take as long as they want... +start /test +sleep 1100 +rm /test/entry1 +commit +dir /test + +# ... as long as no one is waiting.
+1 start /test +notimeout +2 mkdir /test/dir +1 mkdir /test/dir +expect 1:dir +1 dir /test +expect 1: commit failed: Connection timed out +1 commit diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/testsuite/08transaction.test --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/testsuite/08transaction.test Thu Aug 25 22:53:20 2005 @@ -0,0 +1,96 @@ +# Test transactions. + +mkdir /test + +# Simple transaction: create a file inside transaction. +1 start /test +1 write /test/entry1 create contents +2 dir /test +expect 1:entry1 +1 dir /test +1 commit +expect 2:contents +2 read /test/entry1 + +rm /test/entry1 + +# Create a file and abort transaction. +1 start /test +1 write /test/entry1 create contents +2 dir /test +expect 1:entry1 +1 dir /test +1 abort +2 dir /test + +write /test/entry1 create contents +# Delete in transaction, commit +1 start /test +1 rm /test/entry1 +expect 2:entry1 +2 dir /test +1 dir /test +1 commit +2 dir /test + +# Delete in transaction, abort. +write /test/entry1 create contents +1 start /test +1 rm /test/entry1 +expect 2:entry1 +2 dir /test +1 dir /test +1 abort +expect 2:entry1 +2 dir /test + +# Events inside transactions don't trigger watches until (successful) commit. +mkdir /test/dir +1 watch /test token +2 start /test +2 mkdir /test/dir/sub +expect 1: waitwatch failed: Connection timed out +1 waitwatch +2 close +1 close + +1 watch /test token +2 start /test +2 mkdir /test/dir/sub +2 abort +expect 1: waitwatch failed: Connection timed out +1 waitwatch +1 close + +1 watch /test token +2 start /test +2 mkdir /test/dir/sub +2 commit +expect 1:/test/dir/sub:token +1 waitwatch +1 ackwatch token +1 close + +# Rm inside transaction works like rm outside: children get notified. 
+1 watch /test/dir/sub token +2 start /test +2 rm /test/dir +2 commit +expect 1:/test/dir/sub:token +1 waitwatch +1 ackwatch token +1 close + +# Multiple events from single transaction don't trigger assert +1 watch /test token +2 start /test +2 write /test/1 create contents +2 write /test/2 create contents +2 commit +expect 1:/test/1:token +1 waitwatch +1 ackwatch token +expect 1:/test/2:token +1 waitwatch +1 ackwatch token +1 close diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/testsuite/09domain.test --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/testsuite/09domain.test Thu Aug 25 22:53:20 2005 @@ -0,0 +1,19 @@ +# Test domain communication. + +# Create a domain, write an entry. +expect handle is 1 +introduce 1 100 7 /my/home +1 write /entry1 create contents +expect entry1 +expect tool +dir / +close + +# Release that domain. +release 1 +close + +# Introduce and release by same connection. +expect handle is 2 +introduce 1 100 7 /my/home +release 1 diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/testsuite/10domain-homedir.test --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/testsuite/10domain-homedir.test Thu Aug 25 22:53:20 2005 @@ -0,0 +1,19 @@ +# Test domain "implicit" paths. + +# Create a domain, write an entry using implicit path, read using implicit +mkdir /home +expect handle is 1 +introduce 1 100 7 /home +1 write entry1 create contents +expect contents +read /home/entry1 +expect entry1 +dir /home + +# Place a watch using a relative path: expect relative answer. +1 mkdir foo +1 watch foo token +write /home/foo/bar create contents +expect 1:foo/bar:token +1 waitwatch +1 ackwatch token diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/testsuite/11domain-watch.test --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/testsuite/11domain-watch.test Thu Aug 25 22:53:20 2005 @@ -0,0 +1,52 @@ +# Test watching from a domain. + +# Watch something, write to it, check watch has fired. 
+write /test create contents +mkdir /dir + +expect handle is 1 +introduce 1 100 7 /my/home +1 watch /test token +write /test create contents2 +expect 1:/test:token +1 waitwatch +1 ackwatch token +1 unwatch /test token +release 1 +1 close + +# ignore watches while doing commands, should work. +expect handle is 1 +introduce 1 100 7 /my/home +1 watch /dir token +write /dir/test create contents +1 write /dir/test2 create contents2 +1 write /dir/test3 create contents3 +1 write /dir/test4 create contents4 +expect 1:/dir/test:token +1 waitwatch +1 ackwatch token +release 1 +1 close + +# unwatch +expect handle is 1 +introduce 1 100 7 /my/home +1 watch /dir token1 +1 unwatch /dir token1 +1 watch /dir token2 +write /dir/test2 create contents +expect 1:/dir/test2:token2 +1 waitwatch +1 unwatch /dir token2 +release 1 +1 close + +# unwatch while watch pending. +expect handle is 1 +introduce 1 100 7 /my/home +1 watch /dir token1 +write /dir/test2 create contents +1 unwatch /dir token1 +release 1 +1 close diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/testsuite/12readonly.test --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/testsuite/12readonly.test Thu Aug 25 22:53:20 2005 @@ -0,0 +1,41 @@ +# Test that read only connection can't alter store. + +write /test create contents + +readonly +expect test +expect tool +dir / + +expect contents +read /test +expect 0 READ +getperm /test +watch /test token +unwatch /test token +start / +commit +start / +abort + +# These don't work +expect write failed: Read-only file system +write /test2 create contents +expect write failed: Read-only file system +write /test create contents +expect setperm failed: Read-only file system +setperm /test 100 NONE +expect setperm failed: Read-only file system +setperm /test 100 NONE +expect shutdown failed: Read-only file system +shutdown +expect introduce failed: Read-only file system +introduce 1 100 7 /home + +# Check that watches work like normal. 
+watch / token +1 readwrite +1 write /test create contents +expect /test:token +waitwatch +ackwatch token diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/testsuite/13watch-ack.test --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/testsuite/13watch-ack.test Thu Aug 25 22:53:20 2005 @@ -0,0 +1,22 @@ +# This demonstrates a bug where an xs_acknowledge_watch returns +# EINVAL, because the daemon doesn't track what watch event it sent +# and relies on it being the "first" watch which has an event. +# Watches firing after the first event is sent out will change this. + +# Create three things to watch. +mkdir /test +mkdir /test/1 +mkdir /test/2 +mkdir /test/3 + +# Watch all three, fire event on 2, read watch, fire event on 1 and 3, ack 2. +1 watch /test/1 token1 +1 watch /test/2 token2 +1 watch /test/3 token3 +2 write /test/2 create contents2 +expect 1:/test/2:token2 +1 waitwatch +3 write /test/1 create contents1 +4 write /test/3 create contents3 +1 ackwatch token2 +1 close diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/testsuite/14complexperms.test --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/testsuite/14complexperms.test Thu Aug 25 22:53:20 2005 @@ -0,0 +1,99 @@ +# We should not be able to tell the difference between a node which +# doesn't exist, and a node we don't have permission on, if we don't +# have permission on its directory.
+ +mkdir /dir +setperm /dir 0 NONE + +# First when it doesn't exist +setid 1 +expect *Permission denied +dir /dir/file +expect *Permission denied +read /dir/file +expect *Permission denied +write /dir/file none value +expect *Permission denied +write /dir/file create value +expect *Permission denied +write /dir/file excl value +expect write failed: Invalid argument +write /dir/file crap value +expect *Permission denied +mkdir /dir/file +expect *Permission denied +rm /dir/file +expect *Permission denied +rm /dir +expect *Permission denied +getperm /dir/file +expect *Permission denied +setperm /dir/file 0 NONE +watch /dir/file token +setid 0 +write /dir/file create contents +rm /dir/file +setid 1 +expect waitwatch failed: Connection timed out +waitwatch +unwatch /dir/file token +expect *No such file or directory +unwatch /dir/file token +expect *Permission denied +start /dir/file +expect *No such file or directory +abort +expect *Permission denied +start /dir/file +expect *No such file or directory +commit +expect *Permission denied +introduce 2 100 7 /dir/file + +# Now it exists +setid 0 +write /dir/file create contents + +setid 1 +expect *Permission denied +dir /dir/file +expect *Permission denied +read /dir/file +expect *Permission denied +write /dir/file none value +expect *Permission denied +write /dir/file create value +expect *Permission denied +write /dir/file excl value +expect write failed: Invalid argument +write /dir/file crap value +expect *Permission denied +mkdir /dir/file +expect *Permission denied +rm /dir/file +expect *Permission denied +rm /dir +expect *Permission denied +getperm /dir/file +expect *Permission denied +setperm /dir/file 0 NONE +watch /dir/file token +setid 0 +write /dir/file create contents +rm /dir/file +setid 1 +expect waitwatch failed: Connection timed out +waitwatch +unwatch /dir/file token +expect *No such file or directory +unwatch /dir/file token +expect *Permission denied +start /dir/file +expect *No such file or directory 
+abort +expect *Permission denied +start /dir/file +expect *No such file or directory +commit +expect *Permission denied +introduce 2 100 7 /dir/file diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/testsuite/15nowait.test --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/testsuite/15nowait.test Thu Aug 25 22:53:20 2005 @@ -0,0 +1,25 @@ +# If we don't wait for an ack, we can crash daemon as it never expects to be +# sending out two replies on top of each other. +noackwrite /1 create 1 +noackwrite /2 create 2 +noackwrite /3 create 3 +noackwrite /4 create 4 +noackwrite /5 create 5 +readack +readack +readack +readack +readack + +expect handle is 1 +introduce 1 100 7 /my/home +1 noackwrite /1 create 1 +1 noackwrite /2 create 2 +1 noackwrite /3 create 3 +1 noackwrite /4 create 4 +1 noackwrite /5 create 5 +1 readack +1 readack +1 readack +1 readack +1 readack diff -r 5f1ed597f107 -r 8799d14bef77 tools/xenstore/xs_crashme.c --- /dev/null Wed Aug 24 02:43:18 2005 +++ b/tools/xenstore/xs_crashme.c Thu Aug 25 22:53:20 2005 @@ -0,0 +1,413 @@ +/* Code which randomly corrupts bits going to the daemon. + Copyright (C) 2005 Rusty Russell IBM Corporation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +#include <stdbool.h> +#include <stdio.h> +#include <sys/types.h> +#include <stdarg.h> +#include <string.h> +#include <sys/time.h> +#include "xs.h" +#include "talloc.h" +#include <errno.h> +#include "xenstored.h" + +#define XSTEST +#define RAND_FREQ 128 /* Corruption points are spaced get_randomness() % RAND_FREQ bits apart (about 64 bits on average). */ + +/* jhash.h: Jenkins hash support. + * + * Copyright (C) 1996 Bob Jenkins (bob_jenkins@xxxxxxxxxxxxxxxx) + * + * http://burtleburtle.net/bob/hash/ + * + * These are the credits from Bob's sources: + * + * lookup2.c, by Bob Jenkins, December 1996, Public Domain. + * hash(), hash2(), hash3, and mix() are externally useful functions. + * Routines to test the hash are included if SELF_TEST is defined. + * You can use this free for any purpose. It has no warranty. + * + * Copyright (C) 2003 David S. Miller (davem@xxxxxxxxxx) + * + * I've modified Bob's hash to be useful in the Linux kernel, and + * any bugs present are surely my fault. -DaveM + */ + +/* NOTE: Arguments are modified. */ +#define __jhash_mix(a, b, c) \ +{ \ + a -= b; a -= c; a ^= (c>>13); \ + b -= c; b -= a; b ^= (a<<8); \ + c -= a; c -= b; c ^= (b>>13); \ + a -= b; a -= c; a ^= (c>>12); \ + b -= c; b -= a; b ^= (a<<16); \ + c -= a; c -= b; c ^= (b>>5); \ + a -= b; a -= c; a ^= (c>>3); \ + b -= c; b -= a; b ^= (a<<10); \ + c -= a; c -= b; c ^= (b>>15); \ +} + +/* The golden ratio: an arbitrary value */ +#define JHASH_GOLDEN_RATIO 0x9e3779b9 + +/* The most generic version, hashes an arbitrary sequence + * of bytes. No alignment or length assumptions are made about + * the input key.
+ */ +static inline u32 jhash(const void *key, u32 length, u32 initval) +{ + u32 a, b, c, len; + const u8 *k = key; + + len = length; + a = b = JHASH_GOLDEN_RATIO; + c = initval; + + while (len >= 12) { + a += (k[0] +((u32)k[1]<<8) +((u32)k[2]<<16) +((u32)k[3]<<24)); + b += (k[4] +((u32)k[5]<<8) +((u32)k[6]<<16) +((u32)k[7]<<24)); + c += (k[8] +((u32)k[9]<<8) +((u32)k[10]<<16)+((u32)k[11]<<24)); + + __jhash_mix(a,b,c); + + k += 12; + len -= 12; + } + + c += length; + switch (len) { + case 11: c += ((u32)k[10]<<24); + case 10: c += ((u32)k[9]<<16); + case 9 : c += ((u32)k[8]<<8); + case 8 : b += ((u32)k[7]<<24); + case 7 : b += ((u32)k[6]<<16); + case 6 : b += ((u32)k[5]<<8); + case 5 : b += k[4]; + case 4 : a += ((u32)k[3]<<24); + case 3 : a += ((u32)k[2]<<16); + case 2 : a += ((u32)k[1]<<8); + case 1 : a += k[0]; + }; + + __jhash_mix(a,b,c); + + return c; +} + +/* A special optimized version that handles 1 or more of u32s. + * The length parameter here is the number of u32s in the key. + */ +static inline u32 jhash2(u32 *k, u32 length, u32 initval) +{ + u32 a, b, c, len; + + a = b = JHASH_GOLDEN_RATIO; + c = initval; + len = length; + + while (len >= 3) { + a += k[0]; + b += k[1]; + c += k[2]; + __jhash_mix(a, b, c); + k += 3; len -= 3; + } + + c += length * 4; + + switch (len) { + case 2 : b += k[1]; + case 1 : a += k[0]; + }; + + __jhash_mix(a,b,c); + + return c; +} + + +/* Special ultra-optimized versions that know they are hashing exactly + * 3, 2 or 1 word(s). + * + * NOTE: In particular the "c += length; __jhash_mix(a,b,c);" normally + * done at the end is not done here.
+ */
+static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval)
+{
+	a += JHASH_GOLDEN_RATIO;
+	b += JHASH_GOLDEN_RATIO;
+	c += initval;
+
+	__jhash_mix(a, b, c);
+
+	return c;
+}
+
+static inline u32 jhash_2words(u32 a, u32 b, u32 initval)
+{
+	return jhash_3words(a, b, 0, initval);
+}
+
+static inline u32 jhash_1word(u32 a, u32 initval)
+{
+	return jhash_3words(a, 0, 0, initval);
+}
+
+/* Return the next pseudo-random value and advance the counter in *state.
+ *
+ * The counter is read once and advanced in separate statements.  The
+ * previous form, jhash_1word((*state)++, *state * 1103515243), both
+ * modified and read *state with no sequence point in between (undefined
+ * behaviour), and multiplied in signed int, which can overflow. */
+static unsigned int get_randomness(int *state)
+{
+	u32 cur = (u32)*state;
+
+	*state = (int)(cur + 1);
+	return jhash_1word(cur, (cur + 1) * 1103515243U);
+}
+
+static int state;
+
+/* Copy *msg to output with a possibly shortened len field.
+ *
+ * Lengthening headers is pointless: other end will just wait for more
+ * data and timeout.  We merely shorten the length.
+ *
+ * *next_bit is a bit offset: write_all_choice() subtracts len * CHAR_BIT
+ * from it after every write.  The loop bound is therefore in bits too.
+ * Comparing against the byte count sizeof(*msg) could leave *next_bit
+ * below len * CHAR_BIT, and the caller's subtraction then wrapped the
+ * unsigned counter. */
+static void corrupt_header(char *output, const struct xsd_sockmsg *msg,
+			   unsigned int *next_bit)
+{
+	struct xsd_sockmsg newmsg = *msg;
+
+	while (*next_bit < sizeof(*msg) * CHAR_BIT) {
+		if (newmsg.len)
+			newmsg.len = get_randomness(&state) % newmsg.len;
+		*next_bit += get_randomness(&state) % RAND_FREQ;
+	}
+	memcpy(output, &newmsg, sizeof(newmsg));
+}
+
+#define read_all_choice read_all
+/* Write a corrupted copy of data to fd via xs_write_all().
+ *
+ * A buffer that has the size and alignment of struct xsd_sockmsg is
+ * treated as a header and only has its len shortened; anything else gets
+ * individual bits flipped.  next_bit persists across calls and holds the
+ * bit offset, relative to the start of the current buffer, of the next
+ * corruption point. */
+static bool write_all_choice(int fd, const void *data, unsigned int len)
+{
+	char corrupt_data[len];
+	bool ret;
+	static unsigned int next_bit;
+
+	if (len == sizeof(struct xsd_sockmsg)
+	    && ((unsigned long)data % __alignof__(struct xsd_sockmsg)) == 0)
+		corrupt_header(corrupt_data, data, &next_bit);
+	else {
+		memcpy(corrupt_data, data, len);
+		while (next_bit < len * CHAR_BIT) {
+			corrupt_data[next_bit/CHAR_BIT]
+				^= (1 << (next_bit%CHAR_BIT));
+			next_bit += get_randomness(&state) % RAND_FREQ;
+		}
+	}
+
+	ret = xs_write_all(fd, corrupt_data, len);
+	/* Both branches leave next_bit >= len * CHAR_BIT, so this cannot wrap. */
+	next_bit -= len * CHAR_BIT;
+	return ret;
+}
+
+#include "xs.c"
+
+/* Build a random path: "/" one time in twenty, otherwise one or more
+ * "/<n>" components with n below 15.  The result is talloc-allocated. */
+static char *random_path(void)
+{
+	unsigned int i;
+	char *ret = NULL;
+
+	if (get_randomness(&state) % 20 == 0)
+		return talloc_strdup(NULL, "/");
+
+	for (i = 0; i < 1 || (get_randomness(&state) % 2); i++) {
+		ret = talloc_asprintf_append(ret, "/%i",
+					     get_randomness(&state) % 15);
+	}
+	return ret;
+}
+
+static int
random_flags(int *state)
+{
+	switch (get_randomness(state) % 4) {
+	case 0:
+		return 0;
+	case 1:
+		return O_CREAT;
+	case 2:
+		return O_CREAT|O_EXCL;
+	default:
+		return get_randomness(state);
+	}
+}
+
+/* Issue one randomly chosen xenstore request on h and discard its result.
+ * With verbose set, print the current state and the operation first.
+ * The talloc-allocated path (and, for writes, the contents) are freed
+ * before returning; previously both were leaked on every call. */
+static void do_next_op(struct xs_handle *h, bool verbose)
+{
+	char *name;
+	unsigned int num;
+
+	if (verbose)
+		printf("State %i: ", state);
+
+	name = random_path();
+	switch (get_randomness(&state) % 9) {
+	case 0:
+		if (verbose)
+			printf("DIR %s\n", name);
+		free(xs_directory(h, name, &num));
+		break;
+	case 1:
+		if (verbose)
+			printf("READ %s\n", name);
+		free(xs_read(h, name, &num));
+		break;
+	case 2: {
+		int flags = random_flags(&state);
+		char *contents = talloc_asprintf(NULL, "%i",
+						 get_randomness(&state));
+		unsigned int len = get_randomness(&state)%(strlen(contents)+1);
+		if (verbose)
+			printf("WRITE %s %s %.*s\n", name,
+			       flags == O_CREAT ? "O_CREAT"
+			       : flags == (O_CREAT|O_EXCL) ? "O_CREAT|O_EXCL"
+			       : flags == 0 ? "0" : "CRAPFLAGS",
+			       len, contents);
+		xs_write(h, name, contents, len, flags);
+		talloc_free(contents);
+		break;
+	}
+	case 3:
+		if (verbose)
+			printf("MKDIR %s\n", name);
+		xs_mkdir(h, name);
+		break;
+	case 4:
+		if (verbose)
+			printf("RM %s\n", name);
+		xs_rm(h, name);
+		break;
+	case 5:
+		if (verbose)
+			printf("GETPERMS %s\n", name);
+		free(xs_get_permissions(h, name, &num));
+		break;
+	case 6: {
+		unsigned int i, num = get_randomness(&state)%8;
+		struct xs_permissions perms[num];
+
+		if (verbose)
+			printf("SETPERMS %s: ", name);
+		for (i = 0; i < num; i++) {
+			perms[i].id = get_randomness(&state)%8;
+			perms[i].perms = get_randomness(&state)%4;
+			if (verbose)
+				printf("%i%c ", perms[i].id,
+				       perms[i].perms == XS_PERM_WRITE ? 'W'
+				       : perms[i].perms == XS_PERM_READ ? 'R'
+				       : perms[i].perms ==
+				       (XS_PERM_READ|XS_PERM_WRITE) ? 'B'
+				       : 'N');
+		}
+		if (verbose)
+			printf("\n");
+		xs_set_permissions(h, name, perms, num);
+		break;
+	}
+	case 7: {
+		if (verbose)
+			printf("START %s\n", name);
+		xs_transaction_start(h, name);
+		break;
+	}
+	case 8: {
+		bool abort = (get_randomness(&state) % 2);
+
+		if (verbose)
+			printf("STOP %s\n", abort ? "ABORT" : "COMMIT");
+		xs_transaction_end(h, abort);
+		break;
+	}
+	default:
+		barf("Impossible randomness");
+	}
+	talloc_free(name);
+}
+
+static struct xs_handle *h;
+static void alarmed(int sig __attribute__((unused)))
+{
+	/* We force close on timeout. */
+	close(h->fd);
+}
+
+/* Fork the test daemon and wait until it has written its PID to our pipe.
+ * Returns the child's pid, which is always > 0: pipe(), fork() and
+ * "child exited without writing anything" failures all barf instead of
+ * handing back a bogus pid. */
+static int start_daemon(void)
+{
+	int fds[2];
+	int daemon_pid;
+
+	/* Start daemon. */
+	if (pipe(fds) != 0)
+		barf_perror("Creating pipe to daemon");
+	daemon_pid = fork();
+	if (daemon_pid < 0)
+		barf_perror("Forking daemon");
+	if (daemon_pid) {
+		/* Child writes PID when its ready: we wait for that. */
+		char buffer[20];
+		close(fds[1]);
+		/* 0 is EOF: the child went away before announcing itself. */
+		if (read(fds[0], buffer, sizeof(buffer)) <= 0)
+			barf("Failed to summon daemon");
+		close(fds[0]);
+		return daemon_pid;
+	} else {
+		dup2(fds[1], STDOUT_FILENO);
+		close(fds[0]);
+#if 1
+		execlp("valgrind", "valgrind", "--log-file=/tmp/xs_crashme.vglog", "-q", "./xenstored_test", "--output-pid",
+		       "--no-fork", "--trace-file=/tmp/trace", NULL);
+#else
+		execlp("./xenstored_test", "xenstored_test", "--output-pid",
+		       "--no-fork", NULL);
+#endif
+		exit(1);
+	}
+}
+
+
+int main(int argc, char **argv)
+{
+	unsigned int i;
+	int pid;
+
+	if (argc != 3 && argc != 4)
+		barf("Usage: xs_crashme <iterations> <seed> [pid]");
+
+	if (argc == 3)
+		pid = start_daemon();
+	else {
+		pid = atoi(argv[3]);
+		/* kill() with pid <= 0 targets process groups, not a daemon. */
+		if (pid <= 0)
+			barf("Invalid daemon pid '%s'", argv[3]);
+	}
+
+	state = atoi(argv[2]);
+	h = xs_daemon_open();
+	if (!h)
+		barf_perror("Opening connection to daemon");
+	signal(SIGALRM, alarmed);
+	for (i = 0; i < (unsigned)atoi(argv[1]); i++) {
+		alarm(1);
+		do_next_op(h, false);
+		if (i % (atoi(argv[1]) / 72 ?: 1) == 0) {
+			printf(".");
+			fflush(stdout);
+		}
+		if (kill(pid, 0) != 0)
+			barf_perror("Pinging daemon on iteration %i", i);
+		if (h->fd < 0) {
+			xs_daemon_close(h);
+			h = xs_daemon_open();
+			if (!h)
+				barf_perror("Connecting on iteration %i", i);
+ } + } + kill(pid, SIGTERM); + return 0; +} + diff -r 5f1ed597f107 -r 8799d14bef77 docs/misc/shype4xen_readme.txt --- a/docs/misc/shype4xen_readme.txt Wed Aug 24 02:43:18 2005 +++ /dev/null Thu Aug 25 22:53:20 2005 @@ -1,588 +0,0 @@ -Copyright: IBM Corporation (C) -20 June 2005 -Author: Reiner Sailer - -This document is a very short introduction into the sHype access control -security architecture implementation and how it is perceived by users. It -is a very preliminary draft for the courageous ones to get "their feet wet" -and to be able to give feedback (via the xen-devel/xense-devel mailing lists). - -Install: - -cd into xeno-unstable.bk -(use --dry-run option if you want to test the patch only) -patch -p1 -g0 < *tools.diff -patch -p1 -g0 < *xen.diff - -(no rejects, probably some line offsets) - -make uninstall; make mrproper; make; ./install.sh should install the default -sHype into Xen (rebuild your initrd images if necessary). Reboot. - -Debug output: there are two triggers for debug output: -a) General sHype debug: - xeno-unstable.bk/xen/include/public/acm.h - undefine ACM_DEBUG to switch this debug off - -b) sHype enforcement hook trace: This prints a small trace for each enforcement -hook that is executed. The trigger is in - xeno-unstable.bk/xen/include/acm/acm_hooks.h - undefine ACM_TRACE_MODE to switch this debug off - -1. The default NULL policy -*************************** -When you apply the patches and startup xen, you should at first not notice any -difference because the default policy is the "NULL" policy, which as the name -implies does not enforce anything. - -To display the currently enforced policy, use the policy tool under xeno- -unstable.bk/tools/policy: policy_tool getpolicy. You should see output like the -one below. - -[root@laptop policy]#./policy_tool getpolicy - -Policy dump: -============ -Magic = 1debc. -PolVer = aaaa0000. -Len = 14. -Primary = NULL policy (c=0, off=14). -Secondary = NULL policy (c=0, off=14). 
-No primary policy (NULL). -No secondary policy (NULL). - -Policy dump End. - -Since this is a dump of a binary policy, it's not pretty. The important parts -are the "Primary" and "Secondary" policy fields set to "NULL policy". sHype -currently allows to set two independent policies; thus the two SSID-REF parts -shown in 'xm list'. Right here: primary policy only means this policy is -checked first, the secondary policy is checked if the primary results in -"permitted access". The result of the combined policy is "permitted" if both -policies return permitted (NULL policy always returns permitted). The result is -"denied" if at least one of the policies returns "denied". Look into xeno- -unstable.bk/xen/include/acm/acm_hooks.h for the general hook structure -integrating the policy decisions (if you like, you won't need it for the rest -of the Readme file). - -2. Setting Chinese Wall and Simple Type Enforcement policies: -************************************************************* - -We'll get fast to the point. However, in order to understand what we are doing, -we must at least understand the purpose of the policies that we are going to -enforce. The two policies presented here are just examples and the -implementation encourages adding new policies easily. - -2.1. Chinese Wall policy: "decides whether a domain can be started based on -this domain's ssidref and the ssidrefs of the currently running domains". -Generally, the Chinese wall policy allows specifying certain types (or classes -or categories, whatever the preferred word) that conflict; we usually assign a -type to a workload and the set of types of those workloads running in a domain -make up the type set for this domain. Each domain is assigned a set of types -through its SSID-REF (we register Chinese Wall as primary policy, so the -ssidref used for determining the Chinese Wall types is the one annotated with -"p:" in xm list) since each SSID-REF points at a set of types. 
We'll see how -SSIDREFs are represented in Xen later when we will look at the policy. (A good -read for Chinese Wall is: Brewer/Nash The Chinese Wall Security Policy 1989.) - -So let's assume the Chinese Wall policy we are running distinguishes 10 types: -t0 ... t9. Let us assume further that each SSID-REF points to a set that -includes exactly one type (attached to domains that run workloads of a single -type). SSID-REF 0 points to {t0}, ssidref 1 points to {t1} ... 9 points to -{t9}. [This is actually the example policy we are going to push into xen later] - -Now the Chinese Wall policy allows you to define "Conflict type sets" and it -guarantees that of any conflict set at most one type is "running" at any time. -As an example, we have defined 2 conflict set: {t2, t3} and {t0, t5, t6}. -Specifying these conflict sets, sHype ensures that at most one type of each set -is running (either t2 or t3 but not both; either t0 or t5 or t6 but not -multiple of them). - -The effect is that administrators can define which workload types cannot run -simultaneously on a single Xen system. This is useful to limit the covert -timing channels between such payloads or to ensure that payloads don't -interfere with each other through existing resource dependencies. - -2.2. Simple Type Enforcement (ste) policy: "decides whether two domains can -share data, e.g., setup event channels or grant tables to each other, based on -the two domains' ssidref. This, as the name says, is a simple policy. Think of -each type as of a single color. Each domain has one or more colors, i.e., the -domains ssid for the ste policy points to a set that has set one or multiple -types. Let us assume in our example policy we differentiate 5 colors (types) -and define 5 different ssids referenced by ssidref=0..4. Each ssid shall have -exactly one type set, i.e., describes a uni-color. Only ssid(0) has all types -set, i.e., has all defined colors. 
- -Sharing is enforced by the ste policy by requiring that two domains that want -to establish an event channel or grant pages to each other must have a common -color. Currently all domains communicate through DOM0 by default; i.e., Domain0 -will necessarily have all colors to be able to create domains (thus, we will -assign ssidref(0) to Domain0 in our example below. - -More complex mandatory access control policies governing sharing will follow; -such policies are more sophisticated than the "color" scheme above by allowing -more flexible (and complex :_) access control decisions than "share a color" or -"don't share a color" and will be able to express finer-grained policies. - - -2.3 Binary Policy: -In the future, we will have a policy tool that takes as input a more humane -policy description, using types such as development, home-banking, donated- -Grid, CorpA-Payload ... and translates the respective policy into what we see -today as the binary policy using 1s and 0s and sets of them. For now, we must -live with the binary policy when working with sHype. - - -2.4 Exemplary use of a real sHype policy on Xen. To activate a real policy, -edit the file (yes, this will soon be a compile option): - xeno-unstable.bk/xen/include/public/acm.h - Change: #define ACM_USE_SECURITY_POLICY ACM_NULL_POLICY - To : #define ACM_USE_SECURITY_POLICY ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY - cd xeno-unstable.bk - make mrproper - make uninstall (manually remove /etc/xen.old if necessary) - make - ./install.sh (recreate your kernel initrd's if necessary) - Reboot into new xen.gz - -After booting, check out 'xm dmesg'; should show somewhere in the middle: - -(XEN) acm_init: Enforcing Primary CHINESE WALL policy, Secondary SIMPLE TYPE -ENFORCEMENT policy. - -Even though you can activate those policies in any combination and also -independently, the policy tool currently only supports setting the policy for -the above combination. 
- -Now look at the minimal startup policy with: - xeno-unstable.bk/tools/policytool getpolicy - -You should see something like: - -[root@laptop policy]# ./policy_tool getpolicy - -Policy dump: -============ -Magic = 1debc. -PolVer = aaaa0000. -Len = 36. -Primary = CHINESE WALL policy (c=1, off=14). -Secondary = SIMPLE TYPE ENFORCEMENT policy (c=2, off=2c). - - -Chinese Wall policy: -==================== -Max Types = 1. -Max Ssidrefs = 1. -Max ConfSets = 1. -Ssidrefs Off = 10. -Conflicts Off = 12. -Runing T. Off = 14. -C. Agg. Off = 16. - -SSID To CHWALL-Type matrix: - - ssidref 0: 00 - -Confict Sets: - - c-set 0: 00 - -Running -Types: 00 - -Conflict -Aggregate Set: 00 - - -Simple Type Enforcement policy: -=============================== -Max Types = 1. -Max Ssidrefs = 1. -Ssidrefs Off = 8. - -SSID To STE-Type matrix: - - ssidref 0: 01 - - -Policy dump End. - -This is a minimal policy (of little use), except it will disable starting any -domain that does not have ssidref set to 0x0. The Chinese Wall policy has -nothing to enforce and the ste policy only knows one type, which is set for the -only defined ssidref. - -The item that defines the ssidref in a domain configuration is: - -ssidref = 0x12345678 - -Where ssidref is interpreted as a 32bit number, where the lower 16bits become -the ssidref for the primary policy and the higher 16bits become the ssidref for -the secondary policy. sHype currently supports two policies but this is an -implementation decision and can be extended if necessary. - -This reference defines the security information of a domain. The meaning of the -SSID-REF depends on the policy, so we explain it when we explain the real -policies. - - -Setting a new Security Policy: -****************************** -The policy tool with all its current limitations has one usable example policy -compiled-in. 
Please try at this time to use the setpolicy command: - xeno-unstable.bk/tools/policy/policy_tool setpolicy - -You should see a dump of the policy you are setting. It should say at the very -end: - -Policy successfully set. - -Now try to dump the currently enforced policy, which is the policy we have just -set and the dynamic security state information of this policy -(<<< ... some additional explanations) - -[root@laptop policy]# ./policy_tool getpolicy - -Policy dump: -============ -Magic = 1debc. -PolVer = aaaa0000. -Len = 112. -Primary = CHINESE WALL policy (c=1, off=14). -Secondary = SIMPLE TYPE ENFORCEMENT policy (c=2, off=d8). - - -Chinese Wall policy: -==================== -Max Types = a. -Max Ssidrefs = 5. -Max ConfSets = 2. -Ssidrefs Off = 10. -Conflicts Off = 74. -Runing T. Off = 9c. -C. Agg. Off = b0. - -SSID To CHWALL-Type matrix: - - ssidref 0: 01 00 00 00 00 00 00 00 00 00 <<< type0 is set for ssidref0 - ssidref 1: 00 01 00 00 00 00 00 00 00 00 - ssidref 2: 00 00 01 00 00 00 00 00 00 00 - ssidref 3: 00 00 00 01 00 00 00 00 00 00 - ssidref 4: 00 00 00 00 01 00 00 00 00 00 <<< type4 is set for ssidref4 - <<< types 5-9 are unused -Confict Sets: - - c-set 0: 00 00 01 01 00 00 00 00 00 00 <<< type2 and type3 never run together - c-set 1: 01 00 00 00 00 01 01 00 00 00 <<< only one of types 0, 5 or 6 - <<< can run simultaneously -Running -Types: 01 00 00 00 00 00 00 00 00 00 <<< ref-count for types of running domains - -Conflict -Aggregate Set: 00 00 00 00 00 01 01 00 00 00 <<< aggregated set of types that - <<< cannot run because they - <<< are in conflict set 1 and - <<< (domain 0 is running w t0) - - -Simple Type Enforcement policy: -=============================== -Max Types = 5. -Max Ssidrefs = 5. -Ssidrefs Off = 8. - -SSID To STE-Type matrix: - - ssidref 0: 01 01 01 01 01 <<< ssidref0 points to a set that - <<< has all types set (colors) - ssidref 1: 00 01 00 00 00 <<< ssidref1 has color1 set - ssidref 2: 00 00 01 00 00 <<< ... 
- ssidref 3: 00 00 00 01 00 - ssidref 4: 00 00 00 00 01 - - -Policy dump End. - - -This is a small example policy with which we will demonstrate the enforcement. - -Starting Domains with policy enforcement -======================================== -Now let us play with this policy. - -Define 3 or 4 domain configurations. I use the following config using a ramdisk -only and about 8MBytes of memory for each DomU (test purposes): - -#-------configuration xmsec1------------------------- -kernel = "/boot/vmlinuz-2.6.11-xenU" -ramdisk="/boot/U1_ramdisk.img" -#security reference identifier -ssidref= 0x00010001 -memory = 10 -name = "xmsec1" -cpu = -1 # leave to Xen to pick -# Number of network interfaces. Default is 1. -nics=1 -dhcp="dhcp" -#----------------------------------------------------- - -xmsec2 and xmsec3 look the same except for the name and the ssidref line. Use -your domain config file and add "ssidref = 0x00010001" to the first (xmsec1), -"ssidref= 0x00020002" to the second (call it xmsec2), and "ssidref=0x00030003" -to the third (we will call this one xmsec3). - -First start xmsec1: xm create -c xmsec1 (succeeds) - -Then -[root@laptop policy]# xm list -Name Id Mem(MB) CPU State Time(s) Console -Domain-0 0 620 0 r---- 42.3 s:00/p:00 -xmnosec 1 9 0 -b--- 0.3 9601 s:00/p:05 -xmsec1 2 9 0 -b--- 0.2 9602 s:01/p:01 - -Shows a new domain xmsec1 running with primary (here: chinese wall) ssidref 1 -and secondary (here: simple type enforcement) ssidref 1. The ssidrefs are -independent and can differ for a domain. - -[root@laptop policy]# ./policy_tool getpolicy - -Policy dump: -============ -Magic = 1debc. -PolVer = aaaa0000. -Len = 112. -Primary = CHINESE WALL policy (c=1, off=14). -Secondary = SIMPLE TYPE ENFORCEMENT policy (c=2, off=d8). - - -Chinese Wall policy: -==================== -Max Types = a. -Max Ssidrefs = 5. -Max ConfSets = 2. -Ssidrefs Off = 10. -Conflicts Off = 74. -Runing T. Off = 9c. -C. Agg. Off = b0. 
- -SSID To CHWALL-Type matrix: - - ssidref 0: 01 00 00 00 00 00 00 00 00 00 - ssidref 1: 00 01 00 00 00 00 00 00 00 00 - ssidref 2: 00 00 01 00 00 00 00 00 00 00 - ssidref 3: 00 00 00 01 00 00 00 00 00 00 - ssidref 4: 00 00 00 00 01 00 00 00 00 00 - -Confict Sets: - - c-set 0: 00 00 01 01 00 00 00 00 00 00 - c-set 1: 01 00 00 00 00 01 01 00 00 00 <<< t1 is not part of any c-set - -Running -Types: 01 01 00 00 00 00 00 00 00 00 <<< xmsec1 has ssidref 1->type1 - ^^ <<< ref-count at position 1 incr -Conflict -Aggregate Set: 00 00 00 00 00 01 01 00 00 00 <<< domain 1 was allowed to - <<< start since type 1 was not - <<< in conflict with running - <<< types - -Simple Type Enforcement policy: -=============================== -Max Types = 5. -Max Ssidrefs = 5. -Ssidrefs Off = 8. - -SSID To STE-Type matrix: - - ssidref 0: 01 01 01 01 01 <<< the ste policy does not maintain; we - ssidref 1: 00 01 00 00 00 <-- <<< see that domain xmsec1 has ste - ssidref 2: 00 00 01 00 00 <<< ssidref1->type1 and has this type in - ssidref 3: 00 00 00 01 00 <<< common with dom0 - ssidref 4: 00 00 00 00 01 - - -Policy dump End. - -Look at sHype output in xen dmesg: - -[root@laptop xen]# xm dmesg -. -. -[somewhere near the very end] -(XEN) chwall_init_domain_ssid: determined chwall_ssidref to 1. -(XEN) ste_init_domain_ssid. -(XEN) ste_init_domain_ssid: determined ste_ssidref to 1. -(XEN) acm_init_domain_ssid: Instantiated individual ssid for domain 0x01. -(XEN) chwall_post_domain_create. -(XEN) ste_pre_eventchannel_interdomain. -(XEN) ste_pre_eventchannel_interdomain: (evtchn 0 --> 1) common type #01. -(XEN) shype_authorize_domops. -(XEN) ste_pre_eventchannel_interdomain. -(XEN) ste_pre_eventchannel_interdomain: (evtchn 0 --> 1) common type #01. -(XEN) ste_pre_eventchannel_interdomain. -(XEN) ste_pre_eventchannel_interdomain: (evtchn 0 --> 1) common type #01. 
- - -You can see that the chinese wall policy does not complain and that the ste -policy makes three access control decisions for three event-channels setup -between domain 0 and the new domain 1. Each time, the two domains share the -type1 and setting up the eventchannel is permitted. - - -Starting up a second domain xmsec2: - -[root@laptop xen]# xm create -c xmsec2 -Using config file "xmsec2". -Started domain xmsec2, console on port 9602 -************ REMOTE CONSOLE: CTRL-] TO QUIT ******** -Linux version 2.6.11-xenU (root@xxxxxxxxxxxxxxx) (gcc version 3.4.2 20041017 -(Red Hat 3.4.2-6.fc3)) #1 Wed Mar 30 13:14:31 EST 2005 -. -. -. -[root@laptop policy]# xm list -Name Id Mem(MB) CPU State Time(s) Console -Domain-0 0 620 0 r---- 71.7 s:00/p:00 -xmsec1 1 9 0 -b--- 0.3 9601 s:01/p:01 -xmsec2 2 7 0 -b--- 0.3 9602 s:02/p:02 << our domain runs both policies with ssidref 2 - - -[root@laptop policy]# ./policy_tool getpolicy - -Policy dump: -============ -Magic = 1debc. -PolVer = aaaa0000. -Len = 112. -Primary = CHINESE WALL policy (c=1, off=14). -Secondary = SIMPLE TYPE ENFORCEMENT policy (c=2, off=d8). - - -Chinese Wall policy: -==================== -Max Types = a. -Max Ssidrefs = 5. -Max ConfSets = 2. -Ssidrefs Off = 10. -Conflicts Off = 74. -Runing T. Off = 9c. -C. Agg. Off = b0. 
- -SSID To CHWALL-Type matrix: - - ssidref 0: 01 00 00 00 00 00 00 00 00 00 - ssidref 1: 00 01 00 00 00 00 00 00 00 00 - ssidref 2: 00 00 01 00 00 00 00 00 00 00 <<< our domain has type 2 set - ssidref 3: 00 00 00 01 00 00 00 00 00 00 - ssidref 4: 00 00 00 00 01 00 00 00 00 00 - -Confict Sets: - - c-set 0: 00 00 01 01 00 00 00 00 00 00 <<< t2 is in c-set0 with type 3 - c-set 1: 01 00 00 00 00 01 01 00 00 00 - -Running -Types: 01 01 01 00 00 00 00 00 00 00 <<< t2 is running since the - ^^ <<< current aggregate conflict - <<< set (see above) does not - <<< include type 2 -Conflict -Aggregate Set: 00 00 00 01 00 01 01 00 00 00 <<< type 3 is added to the - <<< conflict aggregate - - -Simple Type Enforcement policy: -=============================== -Max Types = 5. -Max Ssidrefs = 5. -Ssidrefs Off = 8. - -SSID To STE-Type matrix: - - ssidref 0: 01 01 01 01 01 - ssidref 1: 00 01 00 00 00 - ssidref 2: 00 00 01 00 00 - ssidref 3: 00 00 00 01 00 - ssidref 4: 00 00 00 00 01 - - -Policy dump End. - - -The sHype xen dmesg output looks similar to the one above when starting the -first domain. - -Now we start xmsec3 and it has ssidref3. Thus, it tries to run as type3 which -conflicts with running type2 (from xmsec2). As expected, creating this domain -fails for security policy enforcement reasons. - -[root@laptop xen]# xm create -c xmsec3 -Using config file "xmsec3". -Error: Error creating domain: (22, 'Invalid argument') -[root@laptop xen]# - -[root@laptop xen]# xm dmesg -. -. -[somewhere near the very end] -(XEN) chwall_pre_domain_create. -(XEN) chwall_pre_domain_create: CHINESE WALL CONFLICT in type 03. - -xmsec3 ssidref3 points to type3, which is in the current conflict aggregate -set. This domain cannot start until domain xmsec2 is destroyed, at which time -the aggregate conflict set is reduced and type3 is excluded from it. Then, -xmsec3 can start. Of course, afterwards, xmsec2 cannot be restarted. Try it. - -3. 
Policy tool -************** -toos/policy/policy_tool.c - -a) ./policy_tool getpolicy - prints the currently enforced policy - (see for example section 1.) - -b) ./policy_tool setpolicy - sets a predefined and hardcoded security - policy (the one described in section 2.) - -c) ./policy_tool dumpstats - prints some status information about the caching - of access control decisions (number of cache hits - and number of policy evaluations for grant_table - and event channels). - -d) ./policy_tool loadpolicy <binary_policy_file> - sets the policy defined in the <binary_policy_file> - please use the policy_processor that is posted to this - mailing list to create such a binary policy from an XML - policy description - -4. Policy interface: -******************** -The Policy interface is working in "network-byte-order" (big endian). The reason for this -is that policy files/management should be portable and independent of the platforms. - -Our policy interface enables managers to create a single binary policy file in a trusted -environment and distributed it to multiple systems for enforcement. - -5. Booting with a binary policy: -******************************** -The grub configuration file can be adapted to boot the hypervisor with an -already active policy. To do this, a binary policy file - this can be -the same file as used by the policy_tool - should be placed into the boot -partition. The following entry from the grub configuration file shows how -a binary policy can be added to the system during boot time. Note that the -binary policy must be of the same type that the hypervisor was compiled -for. The policy module line should also only be added as the last module -line if XEN was compiled with the access control module (ACM). 
- -title XEN0 3.0 Devel - kernel /xen.gz dom0_mem=400000 - module /vmlinuz-2.6.12-xen0 root=/dev/hda2 ro console=tty0 - module /initrd-2.6.12-xen0.img - module /xen_sample_policy.bin - - -====================end-of file======================================= diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/i386/kernel/timers/Makefile --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/timers/Makefile Wed Aug 24 02:43:18 2005 +++ /dev/null Thu Aug 25 22:53:20 2005 @@ -1,17 +0,0 @@ -# -# Makefile for x86 timers -# - -XENARCH := $(subst ",,$(CONFIG_XENARCH)) - -obj-y := timer_tsc.o -c-obj-y := - -c-link := - -$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)): - @ln -fsn $(srctree)/arch/i386/kernel/timers/$(notdir $@) $@ - -obj-y += $(c-obj-y) - -clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link)) diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/i386/kernel/timers/timer_tsc.c --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/timers/timer_tsc.c Wed Aug 24 02:43:18 2005 +++ /dev/null Thu Aug 25 22:53:20 2005 @@ -1,379 +0,0 @@ -/* - * This code largely moved from arch/i386/kernel/time.c. - * See comments there for proper credits. 
- */ - -#include <linux/spinlock.h> -#include <linux/init.h> -#include <linux/timex.h> -#include <linux/errno.h> -#include <linux/cpufreq.h> -#include <linux/string.h> -#include <linux/jiffies.h> - -#include <asm/timer.h> -#include <asm/io.h> -/* processor.h for distable_tsc flag */ -#include <asm/processor.h> - -#include "io_ports.h" -#include "mach_timer.h" - -#include <asm/hpet.h> - -#ifdef CONFIG_HPET_TIMER -static unsigned long hpet_usec_quotient; -static unsigned long hpet_last; -static struct timer_opts timer_tsc; -#endif - -static inline void cpufreq_delayed_get(void); - -int tsc_disable __initdata = 0; - -extern spinlock_t i8253_lock; - -static int use_tsc; - -static unsigned long long monotonic_base; -static u32 monotonic_offset; -static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; - -/* convert from cycles(64bits) => nanoseconds (64bits) - * basic equation: - * ns = cycles / (freq / ns_per_sec) - * ns = cycles * (ns_per_sec / freq) - * ns = cycles * (10^9 / (cpu_mhz * 10^6)) - * ns = cycles * (10^3 / cpu_mhz) - * - * Then we use scaling math (suggested by george@xxxxxxxxxx) to get: - * ns = cycles * (10^3 * SC / cpu_mhz) / SC - * ns = cycles * cyc2ns_scale / SC - * - * And since SC is a constant power of two, we can convert the div - * into a shift. - * -johnstul@xxxxxxxxxx "math is hard, lets go shopping!" - */ -static unsigned long cyc2ns_scale; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ - -static inline void set_cyc2ns_scale(unsigned long cpu_mhz) -{ - cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; -} - -/* Cached *multiplier* to convert TSC counts to microseconds. - * (see the equation below). - * Equal to 2^32 * (1 / (clocks per usec) ). - * Initialized in time_init. 
- */ -static unsigned long fast_gettimeoffset_quotient; - -extern u32 shadow_tsc_stamp; -extern u64 shadow_system_time; - -static unsigned long get_offset_tsc(void) -{ - register unsigned long eax, edx; - - /* Read the Time Stamp Counter */ - - rdtsc(eax,edx); - - /* .. relative to previous jiffy (32 bits is enough) */ - eax -= shadow_tsc_stamp; - - /* - * Time offset = (tsc_low delta) * fast_gettimeoffset_quotient - * = (tsc_low delta) * (usecs_per_clock) - * = (tsc_low delta) * (usecs_per_jiffy / clocks_per_jiffy) - * - * Using a mull instead of a divl saves up to 31 clock cycles - * in the critical path. - */ - - __asm__("mull %2" - :"=a" (eax), "=d" (edx) - :"rm" (fast_gettimeoffset_quotient), - "0" (eax)); - - /* our adjusted time offset in microseconds */ - return edx; -} - -static unsigned long long monotonic_clock_tsc(void) -{ - unsigned long long last_offset, this_offset, base; - unsigned seq; - - /* atomically read monotonic base & last_offset */ - do { - seq = read_seqbegin(&monotonic_lock); - last_offset = monotonic_offset; - base = monotonic_base; - } while (read_seqretry(&monotonic_lock, seq)); - - /* Read the Time Stamp Counter */ - rdtscll(this_offset); - - /* return the value in ns */ - return base + cycles_2_ns(this_offset - last_offset); -} - -/* - * Scheduler clock - returns current time in nanosec units. - */ -unsigned long long sched_clock(void) -{ - unsigned long long this_offset; - - /* - * In the NUMA case we dont use the TSC as they are not - * synchronized across all CPUs. 
- */ -#ifndef CONFIG_NUMA - if (!use_tsc) -#endif - /* no locking but a rare wrong value is not a big deal */ - return jiffies_64 * (1000000000 / HZ); - - /* Read the Time Stamp Counter */ - rdtscll(this_offset); - - /* return the value in ns */ - return cycles_2_ns(this_offset); -} - - -static void mark_offset_tsc(void) -{ - - /* update the monotonic base value */ - write_seqlock(&monotonic_lock); - monotonic_base = shadow_system_time; - monotonic_offset = shadow_tsc_stamp; - write_sequnlock(&monotonic_lock); -} - -static void delay_tsc(unsigned long loops) -{ - unsigned long bclock, now; - - rdtscl(bclock); - do - { - rep_nop(); - rdtscl(now); - } while ((now-bclock) < loops); -} - -#ifdef CONFIG_HPET_TIMER -static void mark_offset_tsc_hpet(void) -{ - unsigned long long this_offset, last_offset; - unsigned long offset, temp, hpet_current; - - write_seqlock(&monotonic_lock); - last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - /* - * It is important that these two operations happen almost at - * the same time. We do the RDTSC stuff first, since it's - * faster. To avoid any inconsistencies, we need interrupts - * disabled locally. - */ - /* - * Interrupts are just disabled locally since the timer irq - * has the SA_INTERRUPT flag set. 
-arca - */ - /* read Pentium cycle counter */ - - hpet_current = hpet_readl(HPET_COUNTER); - rdtsc(last_tsc_low, last_tsc_high); - - /* lost tick compensation */ - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - if (unlikely(((offset - hpet_last) > hpet_tick) && (hpet_last != 0))) { - int lost_ticks = (offset - hpet_last) / hpet_tick; - jiffies_64 += lost_ticks; - } - hpet_last = hpet_current; - - /* update the monotonic base value */ - this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low; - monotonic_base += cycles_2_ns(this_offset - last_offset); - write_sequnlock(&monotonic_lock); - - /* calculate delay_at_last_interrupt */ - /* - * Time offset = (hpet delta) * ( usecs per HPET clock ) - * = (hpet delta) * ( usecs per tick / HPET clocks per tick) - * = (hpet delta) * ( hpet_usec_quotient ) / (2^32) - * Where, - * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick - */ - delay_at_last_interrupt = hpet_current - offset; - ASM_MUL64_REG(temp, delay_at_last_interrupt, - hpet_usec_quotient, delay_at_last_interrupt); -} -#endif - - -#ifdef CONFIG_CPU_FREQ -#include <linux/workqueue.h> - -static unsigned int cpufreq_delayed_issched = 0; -static unsigned int cpufreq_init = 0; -static struct work_struct cpufreq_delayed_get_work; - -static void handle_cpufreq_delayed_get(void *v) -{ - unsigned int cpu; - for_each_online_cpu(cpu) { - cpufreq_get(cpu); - } - cpufreq_delayed_issched = 0; -} - -/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries - * to verify the CPU frequency the timing core thinks the CPU is running - * at is still correct. - */ -static inline void cpufreq_delayed_get(void) -{ - if (cpufreq_init && !cpufreq_delayed_issched) { - cpufreq_delayed_issched = 1; - printk(KERN_DEBUG "Losing some ticks... 
checking if CPU frequency changed.\n"); - schedule_work(&cpufreq_delayed_get_work); - } -} - -/* If the CPU frequency is scaled, TSC-based delays will need a different - * loops_per_jiffy value to function properly. - */ - -static unsigned int ref_freq = 0; -static unsigned long loops_per_jiffy_ref = 0; - -#ifndef CONFIG_SMP -static unsigned long fast_gettimeoffset_ref = 0; -static unsigned long cpu_khz_ref = 0; -#endif - -static int -time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct cpufreq_freqs *freq = data; - - if (val != CPUFREQ_RESUMECHANGE) - write_seqlock_irq(&xtime_lock); - if (!ref_freq) { - ref_freq = freq->old; - loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; -#ifndef CONFIG_SMP - fast_gettimeoffset_ref = fast_gettimeoffset_quotient; - cpu_khz_ref = cpu_khz; -#endif - } - - if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || - (val == CPUFREQ_RESUMECHANGE)) { - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - cpu_data[freq->cpu].loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); -#ifndef CONFIG_SMP - if (cpu_khz) - cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); - if (use_tsc) { - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { - fast_gettimeoffset_quotient = cpufreq_scale(fast_gettimeoffset_ref, freq->new, ref_freq); - set_cyc2ns_scale(cpu_khz/1000); - } - } -#endif - } - - if (val != CPUFREQ_RESUMECHANGE) - write_sequnlock_irq(&xtime_lock); - - return 0; -} - -static struct notifier_block time_cpufreq_notifier_block = { - .notifier_call = time_cpufreq_notifier -}; - - -static int __init cpufreq_tsc(void) -{ - int ret; - INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); - ret = cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - if (!ret) - cpufreq_init = 1; - return ret; -} -core_initcall(cpufreq_tsc); - -#else /* CONFIG_CPU_FREQ */ -static inline 
void cpufreq_delayed_get(void) { return; } -#endif - - -static int init_tsc(char* override) -{ - u64 __cpu_khz; - - __cpu_khz = HYPERVISOR_shared_info->cpu_freq; - do_div(__cpu_khz, 1000); - cpu_khz = (u32)__cpu_khz; - printk(KERN_INFO "Xen reported: %lu.%03lu MHz processor.\n", - cpu_khz / 1000, cpu_khz % 1000); - - /* (10^6 * 2^32) / cpu_hz = (10^3 * 2^32) / cpu_khz = - (2^32 * 1 / (clocks/us)) */ - { - unsigned long eax=0, edx=1000; - __asm__("divl %2" - :"=a" (fast_gettimeoffset_quotient), "=d" (edx) - :"r" (cpu_khz), - "0" (eax), "1" (edx)); - } - - set_cyc2ns_scale(cpu_khz/1000); - - use_tsc = 1; - - return 0; -} - -static int __init tsc_setup(char *str) -{ - printk(KERN_WARNING "notsc: cannot disable TSC in Xen/Linux.\n"); - return 1; -} -__setup("notsc", tsc_setup); - - - -/************************************************************/ - -/* tsc timer_opts struct */ -struct timer_opts timer_tsc = { - .name = "tsc", - .mark_offset = mark_offset_tsc, - .get_offset = get_offset_tsc, - .monotonic_clock = monotonic_clock_tsc, - .delay = delay_tsc, -}; - -struct init_timer_opts timer_tsc_init = { - .init = init_tsc, - .opts = &timer_tsc, -}; diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/asm-offsets.c --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/asm-offsets.c Wed Aug 24 02:43:18 2005 +++ /dev/null Thu Aug 25 22:53:20 2005 @@ -1,70 +0,0 @@ -/* - * Generate definitions needed by assembly language modules. - * This code generates raw asm output which is post-processed to extract - * and format the required data. 
- */ - -#include <linux/sched.h> -#include <linux/stddef.h> -#include <linux/errno.h> -#include <linux/hardirq.h> -#include <linux/suspend.h> -#include <asm/pda.h> -#include <asm/processor.h> -#include <asm/segment.h> -#include <asm/thread_info.h> -#include <asm/ia32.h> - -#define DEFINE(sym, val) \ - asm volatile("\n->" #sym " %0 " #val : : "i" (val)) - -#define BLANK() asm volatile("\n->" : : ) - -int main(void) -{ -#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry)) - ENTRY(state); - ENTRY(flags); - ENTRY(thread); - ENTRY(pid); - BLANK(); -#undef ENTRY -#define ENTRY(entry) DEFINE(threadinfo_ ## entry, offsetof(struct thread_info, entry)) - ENTRY(flags); - ENTRY(addr_limit); - ENTRY(preempt_count); - BLANK(); -#undef ENTRY -#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry)) - ENTRY(kernelstack); - ENTRY(oldrsp); - ENTRY(pcurrent); - ENTRY(irqrsp); - ENTRY(irqcount); - ENTRY(cpunumber); - ENTRY(irqstackptr); - ENTRY(kernel_mode); - BLANK(); -#undef ENTRY -#ifdef CONFIG_IA32_EMULATION -#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry)) - ENTRY(eax); - ENTRY(ebx); - ENTRY(ecx); - ENTRY(edx); - ENTRY(esi); - ENTRY(edi); - ENTRY(ebp); - ENTRY(esp); - ENTRY(eip); - BLANK(); -#undef ENTRY - DEFINE(IA32_RT_SIGFRAME_sigcontext, - offsetof (struct rt_sigframe32, uc.uc_mcontext)); - BLANK(); -#endif - DEFINE(pbe_address, offsetof(struct pbe, address)); - DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address)); - DEFINE(pbe_next, offsetof(struct pbe, next)); - return 0; -} diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/init_task.c --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/init_task.c Wed Aug 24 02:43:18 2005 +++ /dev/null Thu Aug 25 22:53:20 2005 @@ -1,49 +0,0 @@ -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/init.h> -#include <linux/init_task.h> -#include <linux/fs.h> -#include 
<linux/mqueue.h> - -#include <asm/uaccess.h> -#include <asm/pgtable.h> -#include <asm/desc.h> - -static struct fs_struct init_fs = INIT_FS; -static struct files_struct init_files = INIT_FILES; -static struct signal_struct init_signals = INIT_SIGNALS(init_signals); -static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -struct mm_struct init_mm = INIT_MM(init_mm); - -EXPORT_SYMBOL(init_mm); - -/* - * Initial task structure. - * - * We need to make sure that this is 8192-byte aligned due to the - * way process stacks are handled. This is done by having a special - * "init_task" linker map entry.. - */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; - -/* - * Initial task structure. - * - * All other task structs will be allocated on slabs in fork.c - */ -struct task_struct init_task = INIT_TASK(init_task); - -EXPORT_SYMBOL(init_task); -/* - * per-CPU TSS segments. Threads are completely 'soft' on Linux, - * no more per-task TSS's. The TSS size is kept cacheline-aligned - * so they are allowed to end up in the .data.cacheline_aligned - * section. Since TSS's are completely CPU-local, we want them - * on exact cacheline boundaries, to eliminate cacheline ping-pong. - */ -DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_maxaligned_in_smp; - -#define ALIGN_TO_4K __attribute__((section(".data.init_task"))) diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/kernel/pci-dma.c --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/pci-dma.c Wed Aug 24 02:43:18 2005 +++ /dev/null Thu Aug 25 22:53:20 2005 @@ -1,336 +0,0 @@ -/* - * Dynamic DMA mapping support. - */ - -#include <linux/types.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/pci.h> -#include <linux/module.h> -#include <asm/io.h> -#include <asm-xen/balloon.h> - -/* Map a set of buffers described by scatterlist in streaming - * mode for DMA. 
This is the scatter-gather version of the - * above pci_map_single interface. Here the scatter gather list - * elements are each tagged with the appropriate dma address - * and length. They are obtained via sg_dma_{address,length}(SG). - * - * NOTE: An implementation may be able to use a smaller number of - * DMA address/length pairs than there are SG table elements. - * (for example via virtual mapping capabilities) - * The routine returns the number of addr/length pairs actually - * used, at most nents. - * - * Device ownership issues as mentioned above for pci_map_single are - * the same here. - */ -int dma_map_sg(struct device *hwdev, struct scatterlist *sg, - int nents, int direction) -{ - int i; - - BUG_ON(direction == DMA_NONE); - for (i = 0; i < nents; i++ ) { - struct scatterlist *s = &sg[i]; - BUG_ON(!s->page); - s->dma_address = virt_to_bus(page_address(s->page) +s->offset); - s->dma_length = s->length; - } - return nents; -} - -EXPORT_SYMBOL(dma_map_sg); - -/* Unmap a set of streaming mode DMA translations. - * Again, cpu read rules concerning calls here are the same as for - * pci_unmap_single() above. - */ -void dma_unmap_sg(struct device *dev, struct scatterlist *sg, - int nents, int dir) -{ - int i; - for (i = 0; i < nents; i++) { - struct scatterlist *s = &sg[i]; - BUG_ON(s->page == NULL); - BUG_ON(s->dma_address == 0); - dma_unmap_single(dev, s->dma_address, s->dma_length, dir); - } -} - -EXPORT_SYMBOL(dma_unmap_sg); - -struct dma_coherent_mem { - void *virt_base; - u32 device_base; - int size; - int flags; - unsigned long *bitmap; -}; - -void *dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, unsigned gfp) -{ - void *ret; - unsigned int order = get_order(size); - unsigned long vstart; - - struct dma_coherent_mem *mem = dev ? 
dev->dma_mem : NULL; - - /* ignore region specifiers */ - gfp &= ~(__GFP_DMA | __GFP_HIGHMEM); - - if (mem) { - int page = bitmap_find_free_region(mem->bitmap, mem->size, - order); - if (page >= 0) { - *dma_handle = mem->device_base + (page << PAGE_SHIFT); - ret = mem->virt_base + (page << PAGE_SHIFT); - memset(ret, 0, size); - return ret; - } - if (mem->flags & DMA_MEMORY_EXCLUSIVE) - return NULL; - } - - if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff)) - gfp |= GFP_DMA; - - vstart = __get_free_pages(gfp, order); - ret = (void *)vstart; - if (ret == NULL) - return ret; - - xen_contig_memory(vstart, order); - - memset(ret, 0, size); - *dma_handle = virt_to_bus(ret); - - return ret; -} -EXPORT_SYMBOL(dma_alloc_coherent); - -void dma_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle) -{ - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; - int order = get_order(size); - - if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) { - int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; - - bitmap_release_region(mem->bitmap, page, order); - } else - free_pages((unsigned long)vaddr, order); -} -EXPORT_SYMBOL(dma_free_coherent); - -#if 0 -int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, - dma_addr_t device_addr, size_t size, int flags) -{ - void __iomem *mem_base; - int pages = size >> PAGE_SHIFT; - int bitmap_size = (pages + 31)/32; - - if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) - goto out; - if (!size) - goto out; - if (dev->dma_mem) - goto out; - - /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ - - mem_base = ioremap(bus_addr, size); - if (!mem_base) - goto out; - - dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); - if (!dev->dma_mem) - goto out; - memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem)); - dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL); - if (!dev->dma_mem->bitmap) - goto free1_out; - 
memset(dev->dma_mem->bitmap, 0, bitmap_size); - - dev->dma_mem->virt_base = mem_base; - dev->dma_mem->device_base = device_addr; - dev->dma_mem->size = pages; - dev->dma_mem->flags = flags; - - if (flags & DMA_MEMORY_MAP) - return DMA_MEMORY_MAP; - - return DMA_MEMORY_IO; - - free1_out: - kfree(dev->dma_mem->bitmap); - out: - return 0; -} -EXPORT_SYMBOL(dma_declare_coherent_memory); - -void dma_release_declared_memory(struct device *dev) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - - if(!mem) - return; - dev->dma_mem = NULL; - iounmap(mem->virt_base); - kfree(mem->bitmap); - kfree(mem); -} -EXPORT_SYMBOL(dma_release_declared_memory); - -void *dma_mark_declared_memory_occupied(struct device *dev, - dma_addr_t device_addr, size_t size) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT; - int pos, err; - - if (!mem) - return ERR_PTR(-EINVAL); - - pos = (device_addr - mem->device_base) >> PAGE_SHIFT; - err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages)); - if (err != 0) - return ERR_PTR(err); - return mem->virt_base + (pos << PAGE_SHIFT); -} -EXPORT_SYMBOL(dma_mark_declared_memory_occupied); -#endif - -static LIST_HEAD(dma_map_head); -static DEFINE_SPINLOCK(dma_map_lock); -struct dma_map_entry { - struct list_head list; - dma_addr_t dma; - char *bounce, *host; - size_t size; -}; -#define DMA_MAP_MATCHES(e,d) (((e)->dma<=(d)) && (((e)->dma+(e)->size)>(d))) - -dma_addr_t -dma_map_single(struct device *dev, void *ptr, size_t size, - enum dma_data_direction direction) -{ - struct dma_map_entry *ent; - void *bnc; - dma_addr_t dma; - unsigned long flags; - - if (direction == DMA_NONE) - out_of_line_bug(); - - /* - * Even if size is sub-page, the buffer may still straddle a page - * boundary. Take into account buffer start offset. All other calls are - * conservative and always search the dma_map list if it's non-empty. 
- */ - if (((((unsigned long)ptr) & ~PAGE_MASK) + size) <= PAGE_SIZE) { - dma = virt_to_bus(ptr); - } else { - BUG_ON((bnc = dma_alloc_coherent(dev, size, &dma, 0)) == NULL); - BUG_ON((ent = kmalloc(sizeof(*ent), GFP_KERNEL)) == NULL); - if (direction != DMA_FROM_DEVICE) - memcpy(bnc, ptr, size); - ent->dma = dma; - ent->bounce = bnc; - ent->host = ptr; - ent->size = size; - spin_lock_irqsave(&dma_map_lock, flags); - list_add(&ent->list, &dma_map_head); - spin_unlock_irqrestore(&dma_map_lock, flags); - } - - if ((dma+size) & ~*dev->dma_mask) - out_of_line_bug(); - return dma; -} -EXPORT_SYMBOL(dma_map_single); - -void -dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, - enum dma_data_direction direction) -{ - struct dma_map_entry *ent; - unsigned long flags; - - if (direction == DMA_NONE) - out_of_line_bug(); - - /* Fast-path check: are there any multi-page DMA mappings? */ - if (!list_empty(&dma_map_head)) { - spin_lock_irqsave(&dma_map_lock, flags); - list_for_each_entry ( ent, &dma_map_head, list ) { - if (DMA_MAP_MATCHES(ent, dma_addr)) { - list_del(&ent->list); - break; - } - } - spin_unlock_irqrestore(&dma_map_lock, flags); - if (&ent->list != &dma_map_head) { - BUG_ON(dma_addr != ent->dma); - BUG_ON(size != ent->size); - if (direction != DMA_TO_DEVICE) - memcpy(ent->host, ent->bounce, size); - dma_free_coherent(dev, size, ent->bounce, ent->dma); - kfree(ent); - } - } -} -EXPORT_SYMBOL(dma_unmap_single); - -void -dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ - struct dma_map_entry *ent; - unsigned long flags, off; - - /* Fast-path check: are there any multi-page DMA mappings? 
*/ - if (!list_empty(&dma_map_head)) { - spin_lock_irqsave(&dma_map_lock, flags); - list_for_each_entry ( ent, &dma_map_head, list ) - if (DMA_MAP_MATCHES(ent, dma_handle)) - break; - spin_unlock_irqrestore(&dma_map_lock, flags); - if (&ent->list != &dma_map_head) { - off = dma_handle - ent->dma; - BUG_ON((off + size) > ent->size); - /*if (direction != DMA_TO_DEVICE)*/ - memcpy(ent->host+off, ent->bounce+off, size); - } - } -} -EXPORT_SYMBOL(dma_sync_single_for_cpu); - -void -dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ - struct dma_map_entry *ent; - unsigned long flags, off; - - /* Fast-path check: are there any multi-page DMA mappings? */ - if (!list_empty(&dma_map_head)) { - spin_lock_irqsave(&dma_map_lock, flags); - list_for_each_entry ( ent, &dma_map_head, list ) - if (DMA_MAP_MATCHES(ent, dma_handle)) - break; - spin_unlock_irqrestore(&dma_map_lock, flags); - if (&ent->list != &dma_map_head) { - off = dma_handle - ent->dma; - BUG_ON((off + size) > ent->size); - /*if (direction != DMA_FROM_DEVICE)*/ - memcpy(ent->bounce+off, ent->host+off, size); - } - } - - flush_write_buffers(); -} -EXPORT_SYMBOL(dma_sync_single_for_device); diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/arch/xen/x86_64/mm/ioremap.c --- a/linux-2.6-xen-sparse/arch/xen/x86_64/mm/ioremap.c Wed Aug 24 02:43:18 2005 +++ /dev/null Thu Aug 25 22:53:20 2005 @@ -1,466 +0,0 @@ -/* - * arch/x86_64/mm/ioremap.c - * - * Re-map IO memory to kernel address space so that we can access it. 
- * This is needed for high PCI addresses that aren't mapped in the - * 640k-1MB IO memory area on PC's - * - * (C) Copyright 1995 1996 Linus Torvalds - */ - -#include <linux/vmalloc.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <asm/io.h> -#include <asm/fixmap.h> -#include <asm/cacheflush.h> -#include <asm/tlbflush.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> - -/* - * Reuse arch/xen/i396/mm/ioremap.c. Need to merge later - */ -#ifndef CONFIG_XEN_PHYSDEV_ACCESS - -void * __ioremap(unsigned long phys_addr, unsigned long size, - unsigned long flags) -{ - return NULL; -} - -void *ioremap_nocache (unsigned long phys_addr, unsigned long size) -{ - return NULL; -} - -void iounmap(volatile void __iomem *addr) -{ -} - -void __init *bt_ioremap(unsigned long phys_addr, unsigned long size) -{ - return NULL; -} - -void __init bt_iounmap(void *addr, unsigned long size) -{ -} - -#else - -#if defined(__i386__) -/* - * Does @address reside within a non-highmem page that is local to this virtual - * machine (i.e., not an I/O page, nor a memory page belonging to another VM). - * See the comment that accompanies pte_pfn() in pgtable-2level.h to understand - * why this works. - */ -static inline int is_local_lowmem(unsigned long address) -{ - extern unsigned long max_low_pfn; - unsigned long mfn = address >> PAGE_SHIFT; - unsigned long pfn = mfn_to_pfn(mfn); - return ((pfn < max_low_pfn) && (pfn_to_mfn(pfn) == mfn)); -} -#elif defined(__x86_64__) -/* - * - */ -static inline int is_local_lowmem(unsigned long address) -{ - return 0; -} -#endif - -/* - * Generic mapping function (not visible outside): - */ - -/* - * Remap an arbitrary physical address space into the kernel virtual - * address space. Needed when the kernel wants to access high addresses - * directly. - * - * NOTE! 
We need to allow non-page-aligned mappings too: we will obviously - * have to convert them into an offset in a page-aligned mapping, but the - * caller shouldn't need to know that small detail. - */ -void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags) -{ - void __iomem * addr; - struct vm_struct * area; - unsigned long offset, last_addr; - domid_t domid = DOMID_IO; - - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) - return NULL; - -#ifdef CONFIG_XEN_PRIVILEGED_GUEST - /* - * Don't remap the low PCI/ISA area, it's always mapped.. - */ - if (phys_addr >= 0x0 && last_addr < 0x100000) - return isa_bus_to_virt(phys_addr); -#endif - - /* - * Don't allow anybody to remap normal RAM that we're using.. - */ - if (is_local_lowmem(phys_addr)) { - char *t_addr, *t_end; - struct page *page; - - t_addr = bus_to_virt(phys_addr); - t_end = t_addr + (size - 1); - - for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) - if(!PageReserved(page)) - return NULL; - - domid = DOMID_LOCAL; - } - - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr+1) - phys_addr; - - /* - * Ok, go for it.. 
- */ - area = get_vm_area(size, VM_IOREMAP | (flags << 20)); - if (!area) - return NULL; - area->phys_addr = phys_addr; - addr = (void __iomem *) area->addr; - if (direct_remap_area_pages(&init_mm, (unsigned long) addr, phys_addr, - size, __pgprot(_PAGE_PRESENT | _PAGE_RW | - _PAGE_DIRTY | _PAGE_ACCESSED -#if defined(__x86_64__) - | _PAGE_USER -#endif - | flags), domid)) { - vunmap((void __force *) addr); - return NULL; - } - return (void __iomem *) (offset + (char __iomem *)addr); -} - - -/** - * ioremap_nocache - map bus memory into CPU space - * @offset: bus address of the memory - * @size: size of the resource to map - * - * ioremap_nocache performs a platform specific sequence of operations to - * make bus memory CPU accessible via the readb/readw/readl/writeb/ - * writew/writel functions and the other mmio helpers. The returned - * address is not guaranteed to be usable directly as a virtual - * address. - * - * This version of ioremap ensures that the memory is marked uncachable - * on the CPU as well as honouring existing caching rules from things like - * the PCI bus. Note that there are other caches and buffers on many - * busses. In particular driver authors should read up on PCI writes - * - * It's useful if some control registers are in such an area and - * write combining or read caching is not desirable: - * - * Must be freed with iounmap. - */ - -void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) -{ - unsigned long last_addr; - void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD); - if (!p) - return p; - - /* Guaranteed to be > phys_addr, as per __ioremap() */ - last_addr = phys_addr + size - 1; - - if (is_local_lowmem(last_addr)) { - struct page *ppage = virt_to_page(bus_to_virt(phys_addr)); - unsigned long npages; - - phys_addr &= PAGE_MASK; - - /* This might overflow and become zero.. */ - last_addr = PAGE_ALIGN(last_addr); - - /* .. 
but that's ok, because modulo-2**n arithmetic will make - * the page-aligned "last - first" come out right. - */ - npages = (last_addr - phys_addr) >> PAGE_SHIFT; - - if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) { - iounmap(p); - p = NULL; - } - global_flush_tlb(); - } - - return p; -} - -void iounmap(volatile void __iomem *addr) -{ - struct vm_struct *p; - if ((void __force *) addr <= high_memory) - return; -#ifdef CONFIG_XEN_PRIVILEGED_GUEST - if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN)) - return; -#endif - p = remove_vm_area((void *) (PAGE_MASK & (unsigned long __force) addr)); - if (!p) { - printk("__iounmap: bad address %p\n", addr); - return; - } - - if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) { - /* p->size includes the guard page, but cpa doesn't like that */ - change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)), - (p->size - PAGE_SIZE) >> PAGE_SHIFT, - PAGE_KERNEL); - global_flush_tlb(); - } - kfree(p); -} - -#if defined(__i386__) -void __init *bt_ioremap(unsigned long phys_addr, unsigned long size) -{ - unsigned long offset, last_addr; - unsigned int nrpages; - enum fixed_addresses idx; - - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) - return NULL; - -#ifdef CONFIG_XEN_PRIVILEGED_GUEST - /* - * Don't remap the low PCI/ISA area, it's always mapped.. - */ - if (phys_addr >= 0x0 && last_addr < 0x100000) - return isa_bus_to_virt(phys_addr); -#endif - - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr) - phys_addr; - - /* - * Mappings have to fit in the FIX_BTMAP area. - */ - nrpages = size >> PAGE_SHIFT; - if (nrpages > NR_FIX_BTMAPS) - return NULL; - - /* - * Ok, go for it.. 
- */ - idx = FIX_BTMAP_BEGIN; - while (nrpages > 0) { - set_fixmap(idx, phys_addr); - phys_addr += PAGE_SIZE; - --idx; - --nrpages; - } - return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN)); -} - -void __init bt_iounmap(void *addr, unsigned long size) -{ - unsigned long virt_addr; - unsigned long offset; - unsigned int nrpages; - enum fixed_addresses idx; - - virt_addr = (unsigned long)addr; - if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) - return; -#ifdef CONFIG_XEN_PRIVILEGED_GUEST - if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN)) - return; -#endif - offset = virt_addr & ~PAGE_MASK; - nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; - - idx = FIX_BTMAP_BEGIN; - while (nrpages > 0) { - clear_fixmap(idx); - --idx; - --nrpages; - } -} -#endif /* defined(__i386__) */ - -#endif /* CONFIG_XEN_PHYSDEV_ACCESS */ - -/* These hacky macros avoid phys->machine translations. */ -#define __direct_pte(x) ((pte_t) { (x) } ) -#define __direct_mk_pte(page_nr,pgprot) \ - __direct_pte(((page_nr) << PAGE_SHIFT) | pgprot_val(pgprot)) -#define direct_mk_pte_phys(physpage, pgprot) \ - __direct_mk_pte((physpage) >> PAGE_SHIFT, pgprot) - -static inline void direct_remap_area_pte(pte_t *pte, - unsigned long address, - unsigned long size, - mmu_update_t **v) -{ - unsigned long end; - - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; - if (address >= end) - BUG(); - - do { - (*v)->ptr = virt_to_machine(pte); - (*v)++; - address += PAGE_SIZE; - pte++; - } while (address && (address < end)); -} - -static inline int direct_remap_area_pmd(struct mm_struct *mm, - pmd_t *pmd, - unsigned long address, - unsigned long size, - mmu_update_t **v) -{ - unsigned long end; - - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - if (address >= end) - BUG(); - do { - pte_t *pte = (mm == &init_mm) ? 
- pte_alloc_kernel(mm, pmd, address) : - pte_alloc_map(mm, pmd, address); - if (!pte) - return -ENOMEM; - direct_remap_area_pte(pte, address, end - address, v); - pte_unmap(pte); - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); - return 0; -} - -int __direct_remap_area_pages(struct mm_struct *mm, - unsigned long address, - unsigned long size, - mmu_update_t *v) -{ - pgd_t * dir; - unsigned long end = address + size; - int error; - -#if defined(__i386__) - dir = pgd_offset(mm, address); -#elif defined (__x86_64) - dir = (mm == &init_mm) ? - pgd_offset_k(address): - pgd_offset(mm, address); -#endif - if (address >= end) - BUG(); - spin_lock(&mm->page_table_lock); - do { - pud_t *pud; - pmd_t *pmd; - - error = -ENOMEM; - pud = pud_alloc(mm, dir, address); - if (!pud) - break; - pmd = pmd_alloc(mm, pud, address); - if (!pmd) - break; - error = 0; - direct_remap_area_pmd(mm, pmd, address, end - address, &v); - address = (address + PGDIR_SIZE) & PGDIR_MASK; - dir++; - - } while (address && (address < end)); - spin_unlock(&mm->page_table_lock); - return error; -} - - -int direct_remap_area_pages(struct mm_struct *mm, - unsigned long address, - unsigned long machine_addr, - unsigned long size, - pgprot_t prot, - domid_t domid) -{ - int i; - unsigned long start_address; -#define MAX_DIRECTMAP_MMU_QUEUE 130 - mmu_update_t u[MAX_DIRECTMAP_MMU_QUEUE], *v = u; - - start_address = address; - - flush_cache_all(); - - for (i = 0; i < size; i += PAGE_SIZE) { - if ((v - u) == MAX_DIRECTMAP_MMU_QUEUE) { - /* Fill in the PTE pointers. */ - __direct_remap_area_pages(mm, - start_address, - address-start_address, - u); - - if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0) - return -EFAULT; - v = u; - start_address = address; - } - - /* - * Fill in the machine address: PTE ptr is done later by - * __direct_remap_area_pages(). 
- */ - v->val = (machine_addr & PAGE_MASK) | pgprot_val(prot); - - machine_addr += PAGE_SIZE; - address += PAGE_SIZE; - v++; - } - - if (v != u) { - /* get the ptep's filled in */ - __direct_remap_area_pages(mm, - start_address, - address-start_address, - u); - if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)) - return -EFAULT; - } - - flush_tlb_all(); - - return 0; -} - -EXPORT_SYMBOL(direct_remap_area_pages); diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/blkback/control.c --- a/linux-2.6-xen-sparse/drivers/xen/blkback/control.c Wed Aug 24 02:43:18 2005 +++ /dev/null Thu Aug 25 22:53:20 2005 @@ -1,61 +0,0 @@ -/****************************************************************************** - * arch/xen/drivers/blkif/backend/control.c - * - * Routines for interfacing with the control plane. - * - * Copyright (c) 2004, Keir Fraser - */ - -#include "common.h" - -static void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) -{ - DPRINTK("Received blkif backend message, subtype=%d\n", msg->subtype); - - switch ( msg->subtype ) - { - case CMSG_BLKIF_BE_CREATE: - blkif_create((blkif_be_create_t *)&msg->msg[0]); - break; - case CMSG_BLKIF_BE_DESTROY: - blkif_destroy((blkif_be_destroy_t *)&msg->msg[0]); - break; - case CMSG_BLKIF_BE_CONNECT: - blkif_connect((blkif_be_connect_t *)&msg->msg[0]); - break; - case CMSG_BLKIF_BE_DISCONNECT: - if ( !blkif_disconnect((blkif_be_disconnect_t *)&msg->msg[0],msg->id) ) - return; /* Sending the response is deferred until later. 
*/ - break; - case CMSG_BLKIF_BE_VBD_CREATE: - vbd_create((blkif_be_vbd_create_t *)&msg->msg[0]); - break; - case CMSG_BLKIF_BE_VBD_DESTROY: - vbd_destroy((blkif_be_vbd_destroy_t *)&msg->msg[0]); - break; - default: - DPRINTK("Parse error while reading message subtype %d, len %d\n", - msg->subtype, msg->length); - msg->length = 0; - break; - } - - ctrl_if_send_response(msg); -} - -void blkif_ctrlif_init(void) -{ - ctrl_msg_t cmsg; - blkif_be_driver_status_t st; - - (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx, - CALLBACK_IN_BLOCKING_CONTEXT); - - /* Send a driver-UP notification to the domain controller. */ - cmsg.type = CMSG_BLKIF_BE; - cmsg.subtype = CMSG_BLKIF_BE_DRIVER_STATUS; - cmsg.length = sizeof(blkif_be_driver_status_t); - st.status = BLKIF_DRIVER_STATUS_UP; - memcpy(cmsg.msg, &st, sizeof(st)); - ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); -} diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/drivers/xen/netback/control.c --- a/linux-2.6-xen-sparse/drivers/xen/netback/control.c Wed Aug 24 02:43:18 2005 +++ /dev/null Thu Aug 25 22:53:20 2005 @@ -1,58 +0,0 @@ -/****************************************************************************** - * arch/xen/drivers/netif/backend/control.c - * - * Routines for interfacing with the control plane. 
- * - * Copyright (c) 2004, Keir Fraser - */ - -#include "common.h" - -static void netif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id) -{ - DPRINTK("Received netif backend message, subtype=%d\n", msg->subtype); - - switch ( msg->subtype ) - { - case CMSG_NETIF_BE_CREATE: - netif_create((netif_be_create_t *)&msg->msg[0]); - break; - case CMSG_NETIF_BE_DESTROY: - netif_destroy((netif_be_destroy_t *)&msg->msg[0]); - break; - case CMSG_NETIF_BE_CREDITLIMIT: - netif_creditlimit((netif_be_creditlimit_t *)&msg->msg[0]); - break; - case CMSG_NETIF_BE_CONNECT: - netif_connect((netif_be_connect_t *)&msg->msg[0]); - break; - case CMSG_NETIF_BE_DISCONNECT: - if ( !netif_disconnect((netif_be_disconnect_t *)&msg->msg[0],msg->id) ) - return; /* Sending the response is deferred until later. */ - break; - default: - DPRINTK("Parse error while reading message subtype %d, len %d\n", - msg->subtype, msg->length); - msg->length = 0; - break; - } - - ctrl_if_send_response(msg); -} - -void netif_ctrlif_init(void) -{ - ctrl_msg_t cmsg; - netif_be_driver_status_t st; - - (void)ctrl_if_register_receiver(CMSG_NETIF_BE, netif_ctrlif_rx, - CALLBACK_IN_BLOCKING_CONTEXT); - - /* Send a driver-UP notification to the domain controller. */ - cmsg.type = CMSG_NETIF_BE; - cmsg.subtype = CMSG_NETIF_BE_DRIVER_STATUS; - cmsg.length = sizeof(netif_be_driver_status_t); - st.status = NETIF_DRIVER_STATUS_UP; - memcpy(cmsg.msg, &st, sizeof(st)); - ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE); -} diff -r 5f1ed597f107 -r 8799d14bef77 linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pda.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pda.h Wed Aug 24 02:43:18 2005 +++ /dev/null Thu Aug 25 22:53:20 2005 @@ -1,85 +0,0 @@ -#ifndef X86_64_PDA_H -#define X86_64_PDA_H - -#ifndef __ASSEMBLY__ -#include <linux/stddef.h> -#include <linux/types.h> -#include <linux/cache.h> - -/* Per processor datastructure. 
%gs points to it while the kernel runs */ -struct x8664_pda { - struct task_struct *pcurrent; /* Current process */ - unsigned long data_offset; /* Per cpu data offset from linker address */ - struct x8664_pda *me; /* Pointer to itself */ - unsigned long kernelstack; /* top of kernel stack for current */ - unsigned long oldrsp; /* user rsp for system call */ - unsigned long irqrsp; /* Old rsp for interrupts. */ - int irqcount; /* Irq nesting counter. Starts with -1 */ - int cpunumber; /* Logical CPU number */ - char *irqstackptr; /* top of irqstack */ - unsigned int __softirq_pending; - unsigned int __nmi_count; /* number of NMI on this CPUs */ - unsigned long idle_timestamp; - struct mm_struct *active_mm; - int mmu_state; - unsigned apic_timer_irqs; - int kernel_mode; /* kernel or user mode */ -} ____cacheline_aligned; - - -#define IRQSTACK_ORDER 2 -#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER) - -extern struct x8664_pda cpu_pda[]; - -/* - * There is no fast way to get the base address of the PDA, all the accesses - * have to mention %fs/%gs. So it needs to be done this Torvaldian way. - */ -#define sizeof_field(type,field) (sizeof(((type *)0)->field)) -#define typeof_field(type,field) typeof(((type *)0)->field) - -extern void __bad_pda_field(void); - -#define pda_offset(field) offsetof(struct x8664_pda, field) - -#define pda_to_op(op,field,val) do { \ - switch (sizeof_field(struct x8664_pda, field)) { \ -case 2: \ -asm volatile(op "w %0,%%gs:%P1"::"r" (val),"i"(pda_offset(field)):"memory"); break; \ -case 4: \ -asm volatile(op "l %0,%%gs:%P1"::"r" (val),"i"(pda_offset(field)):"memory"); break; \ -case 8: \ -asm volatile(op "q %0,%%gs:%P1"::"r" (val),"i"(pda_offset(field)):"memory"); break; \ - default: __bad_pda_field(); \ - } \ - } while (0) - -/* - * AK: PDA read accesses should be neither volatile nor have an memory clobber. - * Unfortunately removing them causes all hell to break lose currently. 
- */ -#define pda_from_op(op,field) ({ \ - typedef typeof_field(struct x8664_pda, field) T__; T__ ret__; \ - switch (sizeof_field(struct x8664_pda, field)) { \ -case 2: \ -asm volatile(op "w %%gs:%P1,%0":"=r" (ret__):"i"(pda_offset(field)):"memory"); break;\ -case 4: \ -asm volatile(op "l %%gs:%P1,%0":"=r" (ret__):"i"(pda_offset(field)):"memory"); break;\ -case 8: \ -asm volatile(op "q %%gs:%P1,%0":"=r" (ret__):"i"(pda_offset(field)):"memory"); break;\ - default: __bad_pda_field(); \ - } \ - ret__; }) - - -#define read_pda(field) pda_from_op("mov",field) -#define write_pda(field,val) pda_to_op("mov",field,val) -#define add_pda(field,val) pda_to_op("add",field,val) -#define sub_pda(field,val) pda_to_op("sub",field,val) - -#endif - -#define PDA_STACKOFFSET (5*8) - -#endif diff -r 5f1ed597f107 -r 8799d14bef77 patches/linux-2.6.12/x86_64-linux.patch --- a/patches/linux-2.6.12/x86_64-linux.patch Wed Aug 24 02:43:18 2005 +++ /dev/null Thu Aug 25 22:53:20 2005 @@ -1,68 +0,0 @@ -diff -urN linux-2.6.10-orig/include/asm-x86_64/hw_irq.h linux-2.6.10/include/asm-x86_64/hw_irq.h ---- linux-2.6.10-orig/include/asm-x86_64/hw_irq.h 2005-01-06 00:34:38.000000000 -0500 -+++ linux-2.6.10/include/asm-x86_64/hw_irq.h 2005-02-25 17:45:37.181518088 -0500 -@@ -48,6 +48,7 @@ - * - * Vectors 0xf0-0xf9 are free (reserved for future Linux use). 
- */ -+#ifndef CONFIG_XEN - #define SPURIOUS_APIC_VECTOR 0xff - #define ERROR_APIC_VECTOR 0xfe - #define INVALIDATE_TLB_VECTOR 0xfd -@@ -57,7 +58,7 @@ - #define KDB_VECTOR 0xf9 - - #define THERMAL_APIC_VECTOR 0xf0 -- -+#endif - - /* - * Local APIC timer IRQ vector is on a different priority level, -diff -urN linux-2.6.10-orig/include/asm-x86_64/irq.h linux-2.6.10/include/asm-x86_64/irq.h ---- linux-2.6.10-orig/include/asm-x86_64/irq.h 2005-01-06 00:34:38.000000000 -0500 -+++ linux-2.6.10/include/asm-x86_64/irq.h 2005-02-25 17:45:37.181518088 -0500 -@@ -10,6 +10,9 @@ - * <tomsoft@xxxxxxxxxxxxxxxxxxxxxxxxx> - */ - -+#ifdef CONFIG_XEN -+#include "irq_vectors.h" -+#endif - #define TIMER_IRQ 0 - - /* -@@ -22,6 +25,7 @@ - * the usable vector space is 0x20-0xff (224 vectors) - */ - -+#ifndef CONFIG_XEN - /* - * The maximum number of vectors supported by x86_64 processors - * is limited to 256. For processors other than x86_64, NR_VECTORS -@@ -38,6 +42,7 @@ - #define NR_IRQS 224 - #define NR_IRQ_VECTORS 1024 - #endif -+#endif - - static __inline__ int irq_canonicalize(int irq) - { -diff -urN linux-2.6.10-orig/include/asm-x86_64/posix_types.h linux-2.6.10/include/asm-x86_64/posix_types.h ---- linux-2.6.10-orig/include/asm-x86_64/posix_types.h 2004-10-18 17:55:29.000000000 -0400 -+++ linux-2.6.10/include/asm-x86_64/posix_types.h 2005-02-25 17:45:37.183517784 -0500 -@@ -6,7 +6,7 @@ - * be a little careful about namespace pollution etc. Also, we cannot - * assume GCC is being used. - */ -- -+#ifndef __ASSEMBLY__ - typedef unsigned long __kernel_ino_t; - typedef unsigned int __kernel_mode_t; - typedef unsigned long __kernel_nlink_t; -@@ -115,5 +115,5 @@ - } - - #endif /* defined(__KERNEL__) */ -- -+#endif - #endif diff -r 5f1ed597f107 -r 8799d14bef77 tools/consoled/Makefile --- a/tools/consoled/Makefile Wed Aug 24 02:43:18 2005 +++ /dev/null Thu Aug 25 22:53:20 2005 @@ -1,48 +0,0 @@ -# Makefile for consoled -# based on xcs Makefile -# Anthony Liguori 2005 - -XEN_ROOT=../.. 
-include $(XEN_ROOT)/tools/Rules.mk - -CONSOLED_INSTALL_DIR = /usr/sbin -XC_CONSOLE_INSTALL_DIR = /usr/libexec/xen - -INSTALL = install -INSTALL_PROG = $(INSTALL) -m0755 -INSTALL_DIR = $(INSTALL) -d -m0755 - -CC = gcc -CFLAGS = -Wall -Werror -g3 - -CFLAGS += -I $(XEN_XCS) -CFLAGS += -I $(XEN_LIBXC) -CFLAGS += -I $(XEN_XENSTORE) - -SRCS := -SRCS += main.c utils.c io.c - -HDRS = $(wildcard *.h) -OBJS = $(patsubst %.c,%.o,$(SRCS)) -BIN = consoled xc_console - -all: $(BIN) - -clean: - $(RM) *.a *.so *.o *.rpm $(BIN) - -consoled: $(OBJS) - $(CC) $(CFLAGS) $^ -o $@ -L$(XEN_LIBXC) -L$(XEN_XENSTORE) \ - -lxc -lxenstore - -xc_console: xc_console.o - $(CC) $(CFLAGS) $^ -o $@ -L$(XEN_LIBXC) -L$(XEN_XENSTORE) \ - -lxc -lxenstore - -$(OBJS): $(HDRS) - -install: $(BIN) - $(INSTALL_DIR) -p $(DESTDIR)/$(CONSOLED_INSTALL_DIR) - $(INSTALL_PROG) consoled $(DESTDIR)/$(CONSOLED_INSTALL_DIR) - $(INSTALL_DIR) -p $(DESTDIR)/$(XC_CONSOLE_INSTALL_DIR) - $(INSTALL_PROG) xc_console $(DESTDIR)/$(XC_CONSOLE_INSTALL_DIR) diff -r 5f1ed597f107 -r 8799d14bef77 tools/consoled/io.c --- a/tools/consoled/io.c Wed Aug 24 02:43:18 2005 +++ /dev/null Thu Aug 25 22:53:20 2005 @@ -1,328 +0,0 @@ -/*\ - * Copyright (C) International Business Machines Corp., 2005 - * Author(s): Anthony Liguori <aliguori@xxxxxxxxxx> - * - * Xen Console Daemon - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; under version 2 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -\*/ - -#define _GNU_SOURCE - -#include "utils.h" -#include "io.h" - -#include "xc.h" -#include "xs.h" -#include "xen/io/domain_controller.h" -#include "xcs_proto.h" - -#include <malloc.h> -#include <stdlib.h> -#include <errno.h> -#include <string.h> -#include <sys/select.h> -#include <fcntl.h> -#include <unistd.h> -#include <termios.h> - -#define MAX(a, b) (((a) > (b)) ? (a) : (b)) -#define MIN(a, b) (((a) < (b)) ? (a) : (b)) - -struct buffer -{ - char *data; - size_t size; - size_t capacity; - size_t max_capacity; -}; - -void buffer_append(struct buffer *buffer, const void *data, size_t size) -{ - if ((buffer->capacity - buffer->size) < size) { - buffer->capacity += (size + 1024); - buffer->data = realloc(buffer->data, buffer->capacity); - if (buffer->data == NULL) { - dolog(LOG_ERR, "Memory allocation failed"); - exit(ENOMEM); - } - } - - memcpy(buffer->data + buffer->size, data, size); - buffer->size += size; - - if (buffer->max_capacity && - buffer->size > buffer->max_capacity) { - memmove(buffer->data + (buffer->size - buffer->max_capacity), - buffer->data, buffer->max_capacity); - buffer->data = realloc(buffer->data, buffer->max_capacity); - buffer->capacity = buffer->max_capacity; - } -} - -bool buffer_empty(struct buffer *buffer) -{ - return buffer->size == 0; -} - -void buffer_advance(struct buffer *buffer, size_t size) -{ - size = MIN(size, buffer->size); - memmove(buffer->data, buffer + size, buffer->size - size); - buffer->size -= size; -} - -struct domain -{ - int domid; - int tty_fd; - struct buffer buffer; - struct domain *next; -}; - -static struct domain *dom_head; - -bool domain_is_valid(int domid) -{ - bool ret; - xc_dominfo_t info; - - ret = (xc_domain_getinfo(xc, domid, 1, &info) == 1 && - info.domid == domid); - - return ret; -} - 
-int domain_create_tty(int domid) -{ - char path[1024]; - int master; - - if ((master = getpt()) == -1 || - grantpt(master) == -1 || unlockpt(master) == -1) { - dolog(LOG_ERR, "Failed to create tty for domain-%d", domid); - master = -1; - } else { - const char *slave = ptsname(master); - struct termios term; - - if (tcgetattr(master, &term) != -1) { - cfmakeraw(&term); - tcsetattr(master, TCSAFLUSH, &term); - } - - xs_mkdir(xs, "/console"); - snprintf(path, sizeof(path), "/console/%d", domid); - xs_mkdir(xs, path); - strcat(path, "/tty"); - - xs_write(xs, path, slave, strlen(slave), O_CREAT); - } - - return master; -} - -struct domain *create_domain(int domid) -{ - struct domain *dom; - char *data; - unsigned int len; - char path[1024]; - - dom = (struct domain *)malloc(sizeof(struct domain)); - if (dom == NULL) { - dolog(LOG_ERR, "Out of memory %s:%s():L%d", - __FILE__, __FUNCTION__, __LINE__); - exit(ENOMEM); - } - - dom->domid = domid; - dom->tty_fd = domain_create_tty(domid); - dom->buffer.data = 0; - dom->buffer.size = 0; - dom->buffer.capacity = 0; - dom->buffer.max_capacity = 0; - - snprintf(path, sizeof(path), "/console/%d/limit", domid); - data = xs_read(xs, path, &len); - if (data) { - dom->buffer.max_capacity = strtoul(data, 0, 0); - free(data); - } - - dolog(LOG_DEBUG, "New domain %d", domid); - - return dom; -} - -struct domain *lookup_domain(int domid) -{ - struct domain **pp; - - for (pp = &dom_head; *pp; pp = &(*pp)->next) { - struct domain *dom = *pp; - - if (dom->domid == domid) { - return dom; - } else if (dom->domid > domid) { - *pp = create_domain(domid); - (*pp)->next = dom; - return *pp; - } - } - - *pp = create_domain(domid); - return *pp; -} - -void remove_domain(struct domain *dom) -{ - struct domain **pp; - - dolog(LOG_DEBUG, "Removing domain-%d", dom->domid); - - for (pp = &dom_head; *pp; pp = &(*pp)->next) { - struct domain *d = *pp; - - if (dom->domid == d->domid) { - *pp = d->next; - free(d); - break; - } - } -} - -void 
handle_tty_read(struct domain *dom) -{ - ssize_t len; - xcs_msg_t msg; - - msg.type = XCS_REQUEST; - msg.u.control.remote_dom = dom->domid; - msg.u.control.msg.type = CMSG_CONSOLE; - msg.u.control.msg.subtype = CMSG_CONSOLE_DATA; - msg.u.control.msg.id = 1; - - len = read(dom->tty_fd, msg.u.control.msg.msg, 60); - if (len < 1) { - close(dom->tty_fd); - - if (domain_is_valid(dom->domid)) { - dom->tty_fd = domain_create_tty(dom->domid); - } else { - remove_domain(dom); - } - } else if (domain_is_valid(dom->domid)) { - msg.u.control.msg.length = len; - - if (!write_sync(xcs_data_fd, &msg, sizeof(msg))) { - dolog(LOG_ERR, "Write to xcs failed: %m"); - } - } else { - close(dom->tty_fd); - remove_domain(dom); - } -} - -void handle_tty_write(struct domain *dom) -{ - ssize_t len; - - len = write(dom->tty_fd, dom->buffer.data, dom->buffer.size); - if (len < 1) { - close(dom->tty_fd); - - if (domain_is_valid(dom->domid)) { - dom->tty_fd = domain_create_tty(dom->domid); - } else { - remove_domain(dom); - } - } else { - buffer_advance(&dom->buffer, len); - } -} - -void handle_xcs_msg(int fd) -{ - xcs_msg_t msg; - - if (!read_sync(fd, &msg, sizeof(msg))) { - dolog(LOG_ERR, "read from xcs failed! 
%m"); - } else if (msg.type == XCS_REQUEST) { - struct domain *dom; - - dom = lookup_domain(msg.u.control.remote_dom); - buffer_append(&dom->buffer, - msg.u.control.msg.msg, - msg.u.control.msg.length); - } -} - -static void enum_domains(void) -{ - int domid = 0; - xc_dominfo_t dominfo; - - while (xc_domain_getinfo(xc, domid, 1, &dominfo) == 1) { - lookup_domain(dominfo.domid); - domid = dominfo.domid + 1; - } -} - -void handle_io(void) -{ - fd_set readfds, writefds; - int ret; - int max_fd = -1; - - do { - struct domain *d; - struct timeval tv = { 1, 0 }; - - FD_ZERO(&readfds); - FD_ZERO(&writefds); - - FD_SET(xcs_data_fd, &readfds); - max_fd = MAX(xcs_data_fd, max_fd); - - for (d = dom_head; d; d = d->next) { - if (d->tty_fd != -1) { - FD_SET(d->tty_fd, &readfds); - } - - if (d->tty_fd != -1 && !buffer_empty(&d->buffer)) { - FD_SET(d->tty_fd, &writefds); - } - - max_fd = MAX(d->tty_fd, max_fd); - } - - ret = select(max_fd + 1, &readfds, &writefds, 0, &tv); - enum_domains(); - - if (FD_ISSET(xcs_data_fd, &readfds)) { - handle_xcs_msg(xcs_data_fd); - } - - for (d = dom_head; d; d = d->next) { - if (FD_ISSET(d->tty_fd, &readfds)) { - handle_tty_read(d); - } - - if (FD_ISSET(d->tty_fd, &writefds)) { - handle_tty_write(d); - } - } - } while (ret > -1); -} diff -r 5f1ed597f107 -r 8799d14bef77 tools/consoled/io.h --- a/tools/consoled/io.h Wed Aug 24 02:43:18 2005 +++ /dev/null Thu Aug 25 22:53:20 2005 @@ -1,26 +0,0 @@ -/*\ - * Copyright (C) International Business Machines Corp., 2005 - * Author(s): Anthony Liguori <aliguori@xxxxxxxxxx> - * - * Xen Console Daemon - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; under version 2 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -\*/ - -#ifndef CONSOLED_IO_H -#define CONSOLED_IO_H - -void handle_io(void); - -#endif diff -r 5f1ed597f107 -r 8799d14bef77 tools/consoled/main.c --- a/tools/consoled/main.c Wed Aug 24 02:43:18 2005 +++ /dev/null Thu Aug 25 22:53:20 2005 @@ -1,93 +0,0 @@ -/*\ - * Copyright (C) International Business Machines Corp., 2005 - * Author(s): Anthony Liguori <aliguori@xxxxxxxxxx> - * - * Xen Console Daemon - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; under version 2 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -\*/ - -#include <getopt.h> -#include <stdlib.h> -#include <stdio.h> -#include <errno.h> -#include <unistd.h> -#include <sys/types.h> - -#include "xc.h" -#include "xen/io/domain_controller.h" -#include "xcs_proto.h" - -#include "utils.h" -#include "io.h" - -int main(int argc, char **argv) -{ - const char *sopts = "hVvi"; - struct option lopts[] = { - { "help", 0, 0, 'h' }, - { "version", 0, 0, 'V' }, - { "verbose", 0, 0, 'v' }, - { "interactive", 0, 0, 'i' }, - { 0 }, - }; - bool is_interactive = false; - int ch; - int syslog_option = LOG_CONS; - int syslog_mask = LOG_WARNING; - int opt_ind = 0; - - while ((ch = getopt_long(argc, argv, sopts, lopts, &opt_ind)) != -1) { - switch (ch) { - case 'h': - //usage(argv[0]); - exit(0); - case 'V': - //version(argv[0]); - exit(0); - case 'v': - syslog_option |= LOG_PERROR; - syslog_mask = LOG_DEBUG; - break; - case 'i': - is_interactive = true; - break; - case '?': - fprintf(stderr, - "Try `%s --help' for more information\n", - argv[0]); - exit(EINVAL); - } - } - - if (geteuid() != 0) { - fprintf(stderr, "%s requires root to run.\n", argv[0]); - exit(EPERM); - } - - openlog("consoled", syslog_option, LOG_DAEMON); - setlogmask(syslog_mask); - - if (!is_interactive) { - daemonize("/var/run/consoled.pid"); - } - - xen_setup(); - - handle_io(); - - closelog(); - - return 0; -} diff -r 5f1ed597f107 -r 8799d14bef77 tools/consoled/utils.c --- a/tools/consoled/utils.c Wed Aug 24 02:43:18 2005 +++ /dev/null Thu Aug 25 22:53:20 2005 @@ -1,251 +0,0 @@ -/*\ - * Copyright (C) International Business Machines Corp., 2005 - * Author(s): Anthony Liguori <aliguori@xxxxxxxxxx> - * - * Xen Console Daemon - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public 
License as published by - * the Free Software Foundation; under version 2 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -\*/ - -#include <sys/types.h> -#include <sys/stat.h> -#include <sys/wait.h> -#include <unistd.h> -#include <stdlib.h> -#include <fcntl.h> -#include <err.h> -#include <errno.h> -#include <stdio.h> -#include <getopt.h> -#include <stdbool.h> -#include <sys/socket.h> -#include <sys/un.h> -#include <string.h> - -#include "xc.h" -#include "xen/io/domain_controller.h" -#include "xcs_proto.h" - -#include "utils.h" - -struct xs_handle *xs; -int xc; - -int xcs_ctrl_fd = -1; -int xcs_data_fd = -1; - -bool _read_write_sync(int fd, void *data, size_t size, bool do_read) -{ - size_t offset = 0; - ssize_t len; - - while (offset < size) { - if (do_read) { - len = read(fd, data + offset, size - offset); - } else { - len = write(fd, data + offset, size - offset); - } - - if (len < 1) { - if (len == -1 && (errno == EAGAIN || errno == EINTR)) { - return false; - } - } else { - offset += len; - } - } - - return true; -} - -static int open_domain_socket(const char *path) -{ - struct sockaddr_un addr; - int sock; - size_t addr_len; - - if ((sock = socket(PF_UNIX, SOCK_STREAM, 0)) == -1) { - goto out; - } - - addr.sun_family = AF_UNIX; - strcpy(addr.sun_path, path); - addr_len = sizeof(addr.sun_family) + strlen(XCS_SUN_PATH) + 1; - - if (connect(sock, (struct sockaddr *)&addr, addr_len) == -1) { - goto out_close_sock; - } - - return sock; - - out_close_sock: - close(sock); - out: - return -1; -} - -static void child_exit(int sig) -{ - while 
(waitpid(-1, NULL, WNOHANG) > 0); -} - -void daemonize(const char *pidfile) -{ - pid_t pid; - int fd; - int len; - int i; - char buf[100]; - - if (getppid() == 1) { - return; - } - - if ((pid = fork()) > 0) { - exit(0); - } else if (pid == -1) { - err(errno, "fork() failed"); - } - - setsid(); - - /* redirect fd 0,1,2 to /dev/null */ - if ((fd = open("/dev/null",O_RDWR)) == -1) { - exit(1); - } - - for (i = 0; i <= 2; i++) { - close(i); - dup2(fd, i); - } - - close(fd); - - umask(027); - chdir("/"); - - fd = open(pidfile, O_RDWR | O_CREAT); - if (fd == -1) { - exit(1); - } - - if (lockf(fd, F_TLOCK, 0) == -1) { - exit(1); - } - - len = sprintf(buf, "%d\n", getpid()); - write(fd, buf, len); - - signal(SIGCHLD, child_exit); - signal(SIGTSTP, SIG_IGN); - signal(SIGTTOU, SIG_IGN); - signal(SIGTTIN, SIG_IGN); -} - -/* synchronized send/recv strictly for setting up xcs */ -/* always use asychronize callbacks any other time */ -static bool xcs_send_recv(int fd, xcs_msg_t *msg) -{ - bool ret = false; - - if (!write_sync(fd, msg, sizeof(*msg))) { - dolog(LOG_ERR, "Write failed at %s:%s():L%d? Possible bug.", - __FILE__, __FUNCTION__, __LINE__); - goto out; - } - - if (!read_sync(fd, msg, sizeof(*msg))) { - dolog(LOG_ERR, "Read failed at %s:%s():L%d? Possible bug.", - __FILE__, __FUNCTION__, __LINE__); - goto out; - } - - ret = true; - - out: - return ret; -} - -bool xen_setup(void) -{ - int sock; - xcs_msg_t msg; - - xs = xs_daemon_open(); - if (xs == NULL) { - dolog(LOG_ERR, - "Failed to contact xenstore (%m). Is it running?"); - goto out; - } - - xc = xc_interface_open(); - if (xc == -1) { - dolog(LOG_ERR, "Failed to contact hypervisor (%m)"); - goto out; - } - - sock = open_domain_socket(XCS_SUN_PATH); - if (sock == -1) { - dolog(LOG_ERR, "Failed to contact xcs (%m). Is it running?"); - goto out_close_store; - } - - xcs_ctrl_fd = sock; - - sock = open_domain_socket(XCS_SUN_PATH); - if (sock == -1) { - dolog(LOG_ERR, "Failed to contact xcs (%m). 
Is it running?"); - goto out_close_ctrl; - } - - xcs_data_fd = sock; - - memset(&msg, 0, sizeof(msg)); - msg.type = XCS_CONNECT_CTRL; - if (!xcs_send_recv(xcs_ctrl_fd, &msg) || msg.result != XCS_RSLT_OK) { - dolog(LOG_ERR, "xcs control connect failed. Possible bug."); - goto out_close_data; - } - - msg.type = XCS_CONNECT_DATA; - if (!xcs_send_recv(xcs_data_fd, &msg) || msg.result != XCS_RSLT_OK) { - dolog(LOG_ERR, "xcs data connect failed. Possible bug."); - goto out_close_data; - } - - /* Since the vast majority of control messages are console messages - it's just easier to ignore other messages that try to bind to - a specific type. */ - msg.type = XCS_MSG_BIND; - msg.u.bind.port = PORT_WILDCARD; - msg.u.bind.type = TYPE_WILDCARD; - if (!xcs_send_recv(xcs_ctrl_fd, &msg) || msg.result != XCS_RSLT_OK) { - dolog(LOG_ERR, "xcs vind failed. Possible bug."); - goto out_close_data; - } - - return true; - - out_close_data: - close(xcs_ctrl_fd); - xcs_data_fd = -1; - out_close_ctrl: - close(xcs_ctrl_fd); - xcs_ctrl_fd = -1; - out_close_store: - xs_daemon_close(xs); - out: - return false; -} - diff -r 5f1ed597f107 -r 8799d14bef77 tools/consoled/utils.h --- a/tools/consoled/utils.h Wed Aug 24 02:43:18 2005 +++ /dev/null Thu Aug 25 22:53:20 2005 @@ -1,47 +0,0 @@ -/*\ - * Copyright (C) International Business Machines Corp., 2005 - * Author(s): Anthony Liguori <aliguori@xxxxxxxxxx> - * - * Xen Console Daemon - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; under version 2 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -\*/ - -#ifndef CONSOLED_UTILS_H -#define CONSOLED_UTILS_H - -#include <stdbool.h> -#include <syslog.h> -#include <stdio.h> - -#include "xs.h" - -void daemonize(const char *pidfile); -bool xen_setup(void); -#define read_sync(fd, buffer, size) _read_write_sync(fd, buffer, size, true) -#define write_sync(fd, buffer, size) _read_write_sync(fd, buffer, size, false) -bool _read_write_sync(int fd, void *data, size_t size, bool do_read); - -extern int xcs_ctrl_fd; -extern int xcs_data_fd; -extern struct xs_handle *xs; -extern int xc; - -#if 1 -#define dolog(val, fmt, ...) syslog(val, fmt, ## __VA_ARGS__) -#else -#define dolog(val, fmt, ...) fprintf(stderr, fmt "\n", ## __VA_ARGS__) -#endif - -#endif diff -r 5f1ed597f107 -r 8799d14bef77 tools/consoled/xc_console.c --- a/tools/consoled/xc_console.c Wed Aug 24 02:43:18 2005 +++ /dev/null Thu Aug 25 22:53:20 2005 @@ -1,236 +0,0 @@ -/*\ - * Copyright (C) International Business Machines Corp., 2005 - * Author(s): Anthony Liguori <aliguori@xxxxxxxxxx> - * - * Xen Console Daemon - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; under version 2 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -\*/ - -#include <sys/types.h> -#include <sys/socket.h> -#include <sys/un.h> -#include <stdio.h> -#include <unistd.h> -#include <errno.h> -#include <stdlib.h> -#include <time.h> -#include <fcntl.h> -#include <sys/wait.h> -#include <termios.h> -#include <signal.h> -#include <getopt.h> -#include <sys/select.h> -#include <err.h> -#include <errno.h> -#include <pty.h> - -#include "xc.h" -#include "xs.h" - -#define ESCAPE_CHARACTER 0x1d - -static volatile sig_atomic_t received_signal = 0; - -static void sighandler(int signum) -{ - received_signal = 1; -} - -static bool write_sync(int fd, const void *data, size_t size) -{ - size_t offset = 0; - ssize_t len; - - while (offset < size) { - len = write(fd, data + offset, size - offset); - if (len < 1) { - return false; - } - offset += len; - } - - return true; -} - -static void usage(const char *program) { - printf("Usage: %s [OPTION] DOMID\n" - "Attaches to a virtual domain console\n" - "\n" - " -h, --help display this help and exit\n" - , program); -} - -/* don't worry too much if setting terminal attributes fail */ -static void init_term(int fd, struct termios *old) -{ - struct termios new_term; - - if (tcgetattr(fd, old) == -1) { - perror("tcgetattr() failed"); - return; - } - - new_term = *old; - cfmakeraw(&new_term); - - if (tcsetattr(fd, TCSAFLUSH, &new_term) == -1) { - perror("tcsetattr() failed"); - } -} - -static void restore_term(int fd, struct termios *old) -{ - if (tcsetattr(fd, TCSAFLUSH, old) == -1) { - perror("tcsetattr() failed"); - } -} - -static int console_loop(int xc_handle, domid_t domid, int fd) -{ - int ret; - - do { - fd_set fds; - - FD_ZERO(&fds); - FD_SET(STDIN_FILENO, &fds); - FD_SET(fd, &fds); - - ret = select(fd + 1, &fds, NULL, NULL, NULL); - if (ret == -1) { - if (errno == EINTR || 
errno == EAGAIN) { - continue; - } - perror("select() failed"); - return -1; - } - - if (FD_ISSET(STDIN_FILENO, &fds)) { - ssize_t len; - char msg[60]; - - len = read(STDIN_FILENO, msg, sizeof(msg)); - if (len == 1 && msg[0] == ESCAPE_CHARACTER) { - return 0; - } - - if (len == 0 && len == -1) { - if (len == -1 && - (errno == EINTR || errno == EAGAIN)) { - continue; - } - perror("select() failed"); - return -1; - } - - if (!write_sync(fd, msg, len)) { - perror("write() failed"); - return -1; - } - } - - if (FD_ISSET(fd, &fds)) { - ssize_t len; - char msg[512]; - - len = read(fd, msg, sizeof(msg)); - if (len == 0 || len == -1) { - if (len == -1 && - (errno == EINTR || errno == EAGAIN)) { - continue; - } - perror("select() failed"); - return -1; - } - - if (!write_sync(STDOUT_FILENO, msg, len)) { - perror("write() failed"); - return -1; - } - } - } while (received_signal == 0); - - return 0; -} - -int main(int argc, char **argv) -{ - struct termios attr; - int domid; - int xc_handle; - char *sopt = "hf:pc"; - int ch; - int opt_ind=0; - struct option lopt[] = { - { "help", 0, 0, 'h' }, - { "file", 1, 0, 'f' }, - { "pty", 0, 0, 'p' }, - { "ctty", 0, 0, 'c' }, - { 0 }, - - }; - char *str_pty; - char path[1024]; - int spty; - unsigned int len = 0; - struct xs_handle *xs; - - while((ch = getopt_long(argc, argv, sopt, lopt, &opt_ind)) != -1) { - switch(ch) { - case 'h': - usage(argv[0]); - exit(0); - break; - } - } - - if ((argc - optind) != 1) { - fprintf(stderr, "Invalid number of arguments\n"); - fprintf(stderr, "Try `%s --help' for more information.\n", - argv[0]); - exit(EINVAL); - } - - domid = atoi(argv[optind]); - - xs = xs_daemon_open(); - if (xs == NULL) { - err(errno, "Could not contact XenStore"); - } - - xc_handle = xc_interface_open(); - if (xc_handle == -1) { - err(errno, "xc_interface_open()"); - } - - signal(SIGTERM, sighandler); - - snprintf(path, sizeof(path), "/console/%d/tty", domid); - str_pty = xs_read(xs, path, &len); - if (str_pty == NULL) { - 
err(errno, "Could not read tty from store"); - } - spty = open(str_pty, O_RDWR | O_NOCTTY); - if (spty == -1) { - err(errno, "Could not open tty `%s'", str_pty); - } - free(str_pty); - - init_term(STDIN_FILENO, &attr); - console_loop(xc_handle, domid, spty); - restore_term(STDIN_FILENO, &attr); - - return 0; - } diff -r 5f1ed597f107 -r 8799d14bef77 tools/examples/network --- a/tools/examples/network Wed Aug 24 02:43:18 2005 +++ /dev/null Thu Aug 25 22:53:20 2005 @@ -1,246 +0,0 @@ -#!/bin/sh -#============================================================================ -# Default Xen network start/stop script. -# Xend calls a network script when it starts. -# The script name to use is defined in /etc/xen/xend-config.sxp -# in the network-script field. -# -# This script creates a bridge (default xen-br0), adds a device -# (default eth0) to it, copies the IP addresses from the device -# to the bridge and adjusts the routes accordingly. -# -# If all goes well, this should ensure that networking stays up. -# However, some configurations are upset by this, especially -# NFS roots. If the bridged setup does not meet your needs, -# configure a different script, for example using routing instead. -# -# Usage: -# -# network (start|stop|status) {VAR=VAL}* -# -# Vars: -# -# bridge The bridge to use (default xen-br0). -# netdev The interface to add to the bridge (default eth0). -# antispoof Whether to use iptables to prevent spoofing (default yes). -# -# start: -# Creates the bridge and enslaves netdev to it. -# Copies the IP addresses from netdev to the bridge. -# Deletes the routes to netdev and adds them on bridge. -# -# stop: -# Removes netdev from the bridge. -# Deletes the routes to bridge and adds them to netdev. -# -# status: -# Print ifconfig for netdev and bridge. -# Print routes. -# -#============================================================================ - -# Exit if anything goes wrong. -set -e - -# First arg is the operation. 
-OP=$1 -shift - -# Pull variables in args in to environment. -for arg ; do export "${arg}" ; done - -bridge=${bridge:-xen-br0} -netdev=${netdev:-eth0} -antispoof=${antispoof:-yes} - -echo "*network $OP bridge=$bridge netdev=$netdev antispoof=$antispoof" >&2 - -# Usage: transfer_addrs src dst -# Copy all IP addresses (including aliases) from device $src to device $dst. -transfer_addrs () { - local src=$1 - local dst=$2 - # Don't bother if $dst already has IP addresses. - if ip addr show dev ${dst} | egrep -q '^ *inet ' ; then - return - fi - # Address lines start with 'inet' and have the device in them. - # Replace 'inet' with 'ip addr add' and change the device name $src - # to 'dev $src'. - ip addr show dev ${src} | egrep '^ *inet ' | sed -e " -s/inet/ip addr add/ -s@\([0-9]\+\.[0-9]\+\.[0-9]\+\.[0-9]\+/[0-9]\+\)@\1@ -s/${src}/dev ${dst}/ -" | sh -e - # Remove automatic routes on destionation device - ip route list | sed -ne " -/dev ${dst}\( \|$\)/ { - s/^/ip route del / - p -}" | sh -e -} - -# Usage: del_addrs src -del_addrs () { - local src=$1 - ip addr show dev ${src} | egrep '^ *inet ' | sed -e " -s/inet/ip addr del/ -s@\([0-9]\+\.[0-9]\+\.[0-9]\+\.[0-9]\+\)/[0-9]\+@\1@ -s/${src}/dev ${src}/ -" | sh -e -} - -# Usage: transfer_routes src dst -# Get all IP routes to device $src, delete them, and -# add the same routes to device $dst. -# The original routes have to be deleted, otherwise adding them -# for $dst fails (duplicate routes). -transfer_routes () { - local src=$1 - local dst=$2 - # List all routes and grep the ones with $src in. - # Stick 'ip route del' on the front to delete. - # Change $src to $dst and use 'ip route add' to add. - ip route list | sed -ne " -/dev ${src}\( \|$\)/ { - h - s/^/ip route del / - P - g - s/${src}/${dst}/ - s/^/ip route add / - P - d -}" | sh -e -} - -# Usage: create_bridge bridge -create_bridge () { - local bridge=$1 - - # Don't create the bridge if it already exists. - if ! 
brctl show | grep -q ${bridge} ; then - brctl addbr ${bridge} - brctl stp ${bridge} off - brctl setfd ${bridge} 0 - fi - ifconfig ${bridge} up -} - -# Usage: add_to_bridge bridge dev -add_to_bridge () { - local bridge=$1 - local dev=$2 - # Don't add $dev to $bridge if it's already on a bridge. - if ! brctl show | grep -q ${dev} ; then - brctl addif ${bridge} ${dev} - fi -} - -# Usage: antispoofing dev bridge -# Set the default forwarding policy for $dev to drop. -# Allow forwarding to the bridge. -antispoofing () { - local dev=$1 - local bridge=$2 - - iptables -P FORWARD DROP - iptables -A FORWARD -m physdev --physdev-in ${dev} -j ACCEPT -} - -# Usage: show_status dev bridge -# Print ifconfig and routes. -show_status () { - local dev=$1 - local bridge=$2 - - echo '============================================================' - ifconfig ${dev} - ifconfig ${bridge} - echo ' ' - ip route list - echo ' ' - route -n - echo '============================================================' -} - -op_start () { - if [ "${bridge}" == "null" ] ; then - return - fi - - create_bridge ${bridge} - - if ifconfig 2>/dev/null | grep -q veth0 ; then - return - fi - - if ifconfig veth0 2>/dev/null | grep -q veth0 ; then - # Propagate MAC address and ARP responsibilities to virtual interface. - mac=`ifconfig ${netdev} | grep HWadd | sed -e 's/.*\(..:..:..:..:..:..\).*/\1/'` - ifconfig veth0 down - ifconfig veth0 hw ether ${mac} - ifconfig veth0 arp up - transfer_addrs ${netdev} veth0 - transfer_routes ${netdev} veth0 - del_addrs ${netdev} - ifconfig ${netdev} -arp down - ifconfig ${netdev} hw ether fe:ff:ff:ff:ff:ff up - # Bring up second half of virtual device and attach it to the bridge. - ifconfig vif0.0 up - add_to_bridge ${bridge} vif0.0 - else - transfer_addrs ${netdev} ${bridge} - transfer_routes ${netdev} ${bridge} - fi - - # Attach the real interface to the bridge. 
- add_to_bridge ${bridge} ${netdev} - - if [ ${antispoof} == 'yes' ] ; then - antispoofing ${netdev} ${bridge} - fi -} - -op_stop () { - if [ "${bridge}" == "null" ] ; then - return - fi - - brctl delif ${bridge} ${netdev} - - if ifconfig veth0 2>/dev/null | grep -q veth0 ; then - brctl delif ${bridge} vif0.0 - ifconfig vif0.0 down - mac=`ifconfig veth0 | grep HWadd | sed -e 's/.*\(..:..:..:..:..:..\).*/\1/'` - ifconfig ${netdev} down - ifconfig ${netdev} hw ether ${mac} - ifconfig ${netdev} arp up - transfer_addrs veth0 ${netdev} - transfer_routes veth0 ${netdev} - del_addrs veth0 - ifconfig veth0 -arp down - ifconfig veth0 hw ether 00:00:00:00:00:00 - else - transfer_routes ${bridge} ${netdev} - fi -} - -case ${OP} in - start) - op_start - ;; - - stop) - op_stop - ;; - - status) - show_status ${netdev} ${bridge} - ;; - - *) - echo 'Unknown command: ' ${OP} >&2 - echo 'Valid commands are: start, stop, status' >&2 - exit 1 -esac diff -r 5f1ed597f107 -r 8799d14bef77 tools/ioemu/target-i386-dm/qemu-vgaram-bin.gz --- a/tools/ioemu/target-i386-dm/qemu-vgaram-bin.gz Wed Aug 24 02:43:18 2005 +++ /dev/null Thu Aug 25 22:53:20 2005 @@ -1,7 +0,0 @@ -?ËmB qemu-vgaram-bin íÝÍk\×ÇñCòÂÁMw¢ I -^´?XÊ?#\c?Ô¡ \ No newline at end of file -M7okɲe?L,a9?J?oò'B×Ýô(¨1?.L?Å?´x?lJ1/?ÜçÑ}Ï?3wf?h?s7ßü¸3wFsæ?{î¹ç¾?}öpJ=ÉÉ?dAr\2'yI²&Y?\?lH??¼#ù?äE{Ϫ½Ï_×÷¿/¹bÑϾ,¹*yÊ^ßÌÞ?/yáðuy|#½vø¯òY7äõòüïé?Ã?Èë7öò?¾çP -?~ö}Éç6ÕÜ?|?=.ÿ-«?Õõ3gϵ?^¶×Ûê¬ÿ÷=©³]©³]Y/Ítéð5ymA??ÎÛûß·:=oõ{Åæ_µïsÁ?ë÷xMò¶=Õæ?µÇÃõð°dArM²kÓ?U÷?YÉIËWYf³×j?ïmÁ×yòGÕÅìÈ÷?Ͷß%kC¾î7³vµjmCç?Xï÷GDëýfÞÊD,».ó?ä%ËJ\ݾ??ItzIru/7Óºô;'¤ÿ?ÛË'é?Ì[?×?í}öw¶{·lõ¿bóuÞE?w̶پ.¾Ëc?#??I??lInÙ 
-lo^çy¼íåÛ@YÍr=aýÎU˺µ³9ËY¼ÖlzqozZêðYyí´Ì??z??ùóò·óÒ?×åñ]ißwd=}çÔ?ÿCÉr6Ís15ÛÀ??íÀuàuüyê÷?'íù?ú?J>²åº#¹+¹??¾ÿ?=¿c¯Tþ?$3©iÿº?²òO¥þv1cïü[Ý·½?íWG?Wòýï?¶½®[?ítVÚÔµ?1?äñ¦D§«ò¼'óõñ¹¬¯ýùTÛ?¦Üöò¤âqD>´væù?ä??D?c?ÉbjÆ={¾c?uÞCöüZ`´e·Èàë/IÖ$Ë?K??lþ+?×%oJ?´ùOÛTß¿.yÿÁk³é¨lëoH_{AoJV%ò|Uæ¯ÉãËòøDö9Çlª?·ùG³éÉ?äªä?Äÿö{¿æ9ÉÜÞãÙô¼?3'e<+??¶þF§úü?=~Ú>G?ã?¢Ì?evLtÿ3~4ÝÌZ{Ûµir=ó©Ùvm:?ò½¾~?²6ð?=ßÈÖý?lýÿAò²ä÷?·mÞo%/JÞ°vsFúûW$oJ~gí^_˦OYð¿]²2ÎÛkú«öÞµ÷ýBò|ö=µ|ÝNþ?}O}ÿ/MM_wߦGÃêz¸þ?ã? \ No newline at end of file -IOú?%ɦ,ÿ?×?ÖÙe[.?êöÿ?=~Âê@?eÓê`Ù?¿þæZê{Õ:þ:XýyßöCÚ߯e;#ÛøS{}àIé7gåõfûoÚ?NgÓÙCM¿ªmì¹Û´7?÷?ýs6ÿ?¯÷?#ß×ƵµÉõ§ûºÞÞúß?iO²$iÚ_Oê¯Ðþ¼?õ}ð§?m{>cù8õ÷½?ãÐdåkYÞïz_¸cß#ª\wøxcSòE<þÚ *{>Ëbà2ÇÇ/kRßW÷ÆóÒ?äØ˾Ï?ä?DÇïí?áþ^ç?ýý±,û¯ßï¶ÅÛB?zñcßû©\Üþ^ß?õû?u«ß?mXÝèû.Úß½?ÍÓ1àÖ@?5ÛØë2îyW¦OÚövîÁºjú?ò3uÞ¥CýmýÝCý¾÷²=î}¯uâÉÇbµÖ?fx¼Ý]¢û=??ýßv?;þÖx;÷s~~èÏÏÁ=¨_íw·Òøã?¨ò}ÿ7ªîõµÈö???cu=?%úØW?ùQ+W÷?·RÿøÇóX`ù»VÏ?ï*/ÿB¶®½ü?"?åïë?o?y¢Û_¾¬5?¿,ïz±þµ>"Ï{ÚúùZ}ÿ¤ò£÷½m}_>.?©T×G<?NÛò?õUvÞþ½Ì¼-Ôh?ùwøO=ÚN±Û¿_wÎÏýî¦á>(ª|íïóó |