[Xen-changelog] Merge.
# HG changeset patch # User adsharma@xxxxxxxxxxxxxxxxxxxx # Node ID dfaf788ab18cdd92f626380ddd97a64fa92abbcd # Parent de3576a1c62cea31cad7333af2426eaf65884926 # Parent 3bbc9384be3f408a96baf6db4666bc21cebfb955 Merge. diff -r de3576a1c62c -r dfaf788ab18c extras/mini-os/include/lib.h --- a/extras/mini-os/include/lib.h Thu Aug 25 20:52:38 2005 +++ b/extras/mini-os/include/lib.h Fri Aug 26 20:47:16 2005 @@ -79,36 +79,4 @@ char *strstr(const char *s1, const char *s2); -/* dlmalloc functions */ -struct mallinfo { - int arena; /* non-mmapped space allocated from system */ - int ordblks; /* number of free chunks */ - int smblks; /* number of fastbin blocks */ - int hblks; /* number of mmapped regions */ - int hblkhd; /* space in mmapped regions */ - int usmblks; /* maximum total allocated space */ - int fsmblks; /* space available in freed fastbin blocks */ - int uordblks; /* total allocated space */ - int fordblks; /* total free space */ - int keepcost; /* top-most, releasable (via malloc_trim) space */ -}; - -void *malloc(size_t n); -void *calloc(size_t n_elements, size_t element_size); -void free(void* p); -void *realloc(void* p, size_t n); -void *memalign(size_t alignment, size_t n); -void *valloc(size_t n); -struct mallinfo mallinfo(void); -int mallopt(int parameter_number, int parameter_value); - -void **independent_calloc(size_t n_elements, size_t size, void* chunks[]); -void **independent_comalloc(size_t n_elements, size_t sizes[], void* chunks[]); -void *pvalloc(size_t n); -void cfree(void* p); -int malloc_trim(size_t pad); -size_t malloc_usable_size(void* p); -void malloc_stats(void); - - #endif /* _LIB_H_ */ diff -r de3576a1c62c -r dfaf788ab18c extras/mini-os/include/mm.h --- a/extras/mini-os/include/mm.h Thu Aug 25 20:52:38 2005 +++ b/extras/mini-os/include/mm.h Fri Aug 26 20:47:16 2005 @@ -126,6 +126,18 @@ void init_mm(void); unsigned long alloc_pages(int order); -int is_mfn_mapped(unsigned long mfn); +#define alloc_page() alloc_pages(0); +void free_pages(void *pointer, int order); +//int is_mfn_mapped(unsigned long mfn); + +static __inline__ int get_order(unsigned long size) +{ + int order; + size = (size-1) >> PAGE_SHIFT; + for ( order = 0; size; order++ ) + size >>= 1; + return order; +} + #endif /* _MM_H_ */ diff -r de3576a1c62c -r dfaf788ab18c extras/mini-os/include/types.h --- a/extras/mini-os/include/types.h Thu Aug 25 20:52:38 2005 +++ b/extras/mini-os/include/types.h Fri Aug 26 20:47:16 2005 @@ -49,4 +49,6 @@ typedef unsigned long u_quad_t; typedef unsigned long uintptr_t; #endif + +#define UINT_MAX (~0U) #endif /* _TYPES_H_ */ diff -r de3576a1c62c -r dfaf788ab18c extras/mini-os/mm.c --- a/extras/mini-os/mm.c Thu Aug 25 20:52:38 2005 +++ b/extras/mini-os/mm.c Fri Aug 26 20:47:16 2005 @@ -1,6 +1,7 @@ -/* -*- Mode:C; c-basic-offset:4; tab-width:4 -*- +/* **************************************************************************** * (C) 2003 - Rolf Neugebauer - Intel Research Cambridge + * (C) 2005 - Grzegorz Milos - Intel Research Cambridge **************************************************************************** * * File: mm.c @@ -13,8 +14,6 @@ * Description: memory management related functions * contains buddy page allocator from Xen. 
* - **************************************************************************** - * $Id: c-insert.c,v 1.7 2002/11/08 16:04:34 rn Exp $ **************************************************************************** * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to @@ -40,7 +39,7 @@ #include <mm.h> #include <types.h> #include <lib.h> - +#include <xmalloc.h> #ifdef MM_DEBUG #define DEBUG(_f, _a...) \ @@ -505,6 +504,6 @@ (u_long)to_virt(PFN_PHYS(max_pfn)), PFN_PHYS(max_pfn)); init_page_allocator(PFN_PHYS(start_pfn), PFN_PHYS(max_pfn)); #endif - + printk("MM: done\n"); } diff -r de3576a1c62c -r dfaf788ab18c linux-2.6-xen-sparse/arch/xen/i386/kernel/smpboot.c --- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/smpboot.c Thu Aug 25 20:52:38 2005 +++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/smpboot.c Fri Aug 26 20:47:16 2005 @@ -853,7 +853,7 @@ atomic_set(&init_deasserted, 0); #if 1 - cpu_gdt_descr[cpu].address = __get_free_page(GFP_KERNEL); + cpu_gdt_descr[cpu].address = __get_free_page(GFP_KERNEL|__GFP_ZERO); BUG_ON(cpu_gdt_descr[0].size > PAGE_SIZE); cpu_gdt_descr[cpu].size = cpu_gdt_descr[0].size; printk("GDT: copying %d bytes from %lx to %lx\n", diff -r de3576a1c62c -r dfaf788ab18c linux-2.6-xen-sparse/arch/xen/x86_64/kernel/ldt.c --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/ldt.c Thu Aug 25 20:52:38 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/ldt.c Fri Aug 26 20:47:16 2005 @@ -105,13 +105,18 @@ struct mm_struct * old_mm; int retval = 0; + memset(&mm->context, 0, sizeof(mm->context)); init_MUTEX(&mm->context.sem); - mm->context.size = 0; old_mm = current->mm; if (old_mm && old_mm->context.size > 0) { down(&old_mm->context.sem); retval = copy_ldt(&mm->context, &old_mm->context); up(&old_mm->context.sem); + } + if (retval == 0) { + spin_lock(&mm_unpinned_lock); + list_add(&mm->context.unpinned, &mm_unpinned); + spin_unlock(&mm_unpinned_lock); } return retval; } @@ -133,6 +138,11 @@ else kfree(mm->context.ldt); mm->context.size = 0; + } + if (!mm->context.pinned) { + spin_lock(&mm_unpinned_lock); + list_del(&mm->context.unpinned); + spin_unlock(&mm_unpinned_lock); } } diff -r de3576a1c62c -r dfaf788ab18c linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smpboot.c --- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smpboot.c Thu Aug 25 20:52:38 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/smpboot.c Fri Aug 26 20:47:16 2005 @@ -739,7 +739,7 @@ atomic_set(&init_deasserted, 0); #ifdef CONFIG_XEN - cpu_gdt_descr[cpu].address = __get_free_page(GFP_KERNEL); + cpu_gdt_descr[cpu].address = __get_free_page(GFP_KERNEL|__GFP_ZERO); BUG_ON(cpu_gdt_descr[0].size > PAGE_SIZE); cpu_gdt_descr[cpu].size = cpu_gdt_descr[0].size; memcpy((void *)cpu_gdt_descr[cpu].address, diff -r de3576a1c62c -r dfaf788ab18c linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c --- a/linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c Thu Aug 25 20:52:38 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c Fri Aug 26 20:47:16 2005 @@ -712,6 +712,7 @@ HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); memset(empty_zero_page, 0, sizeof(empty_zero_page)); + init_mm.context.pinned = 1; #ifdef CONFIG_XEN_PHYSDEV_ACCESS { diff -r de3576a1c62c -r dfaf788ab18c linux-2.6-xen-sparse/arch/xen/x86_64/mm/pageattr.c --- a/linux-2.6-xen-sparse/arch/xen/x86_64/mm/pageattr.c Thu Aug 25 20:52:38 2005 +++ b/linux-2.6-xen-sparse/arch/xen/x86_64/mm/pageattr.c Fri Aug 26 20:47:16 2005 @@ -12,19 +12,145 @@ #include 
<asm/uaccess.h> #include <asm/processor.h> #include <asm/tlbflush.h> +#include <asm/io.h> + +#ifdef CONFIG_XEN #include <asm/pgalloc.h> -#include <asm/io.h> +#include <asm/mmu_context.h> + +LIST_HEAD(mm_unpinned); +DEFINE_SPINLOCK(mm_unpinned_lock); + +static inline void mm_walk_set_prot(void *pt, pgprot_t flags) +{ + struct page *page = virt_to_page(pt); + unsigned long pfn = page_to_pfn(page); + + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + pfn_pte(pfn, flags), 0)); +} + +static void mm_walk(struct mm_struct *mm, pgprot_t flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int g,u,m; + + pgd = mm->pgd; + for (g = 0; g <= USER_PTRS_PER_PGD; g++, pgd++) { + if (pgd_none(*pgd)) + continue; + pud = pud_offset(pgd, 0); + if (PTRS_PER_PUD > 1) /* not folded */ + mm_walk_set_prot(pud,flags); + for (u = 0; u < PTRS_PER_PUD; u++, pud++) { + if (pud_none(*pud)) + continue; + pmd = pmd_offset(pud, 0); + if (PTRS_PER_PMD > 1) /* not folded */ + mm_walk_set_prot(pmd,flags); + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { + if (pmd_none(*pmd)) + continue; + pte = pte_offset_kernel(pmd,0); + mm_walk_set_prot(pte,flags); + } + } + } +} + +void mm_pin(struct mm_struct *mm) +{ + spin_lock(&mm->page_table_lock); + + mm_walk(mm, PAGE_KERNEL_RO); + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)mm->pgd, + pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO), + UVMF_TLB_FLUSH)); + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)__user_pgd(mm->pgd), + pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, PAGE_KERNEL_RO), + UVMF_TLB_FLUSH)); + xen_pgd_pin(__pa(mm->pgd)); /* kernel */ + xen_pgd_pin(__pa(__user_pgd(mm->pgd))); /* user */ + mm->context.pinned = 1; + spin_lock(&mm_unpinned_lock); + list_del(&mm->context.unpinned); + spin_unlock(&mm_unpinned_lock); + + spin_unlock(&mm->page_table_lock); +} + +void mm_unpin(struct mm_struct *mm) +{ + spin_lock(&mm->page_table_lock); + + xen_pgd_unpin(__pa(mm->pgd)); + xen_pgd_unpin(__pa(__user_pgd(mm->pgd))); + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)mm->pgd, + pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0)); + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)__user_pgd(mm->pgd), + pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, PAGE_KERNEL), 0)); + mm_walk(mm, PAGE_KERNEL); + xen_tlb_flush(); + mm->context.pinned = 0; + spin_lock(&mm_unpinned_lock); + list_add(&mm->context.unpinned, &mm_unpinned); + spin_unlock(&mm_unpinned_lock); + + spin_unlock(&mm->page_table_lock); +} + +void mm_pin_all(void) +{ + while (!list_empty(&mm_unpinned)) + mm_pin(list_entry(mm_unpinned.next, struct mm_struct, + context.unpinned)); +} + +void _arch_exit_mmap(struct mm_struct *mm) +{ + struct task_struct *tsk = current; + + task_lock(tsk); + + /* + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() + * *much* faster this way, as no tlb flushes means bigger wrpt batches. 
+ */ + if ( tsk->active_mm == mm ) + { + tsk->active_mm = &init_mm; + atomic_inc(&init_mm.mm_count); + + switch_mm(mm, &init_mm, tsk); + + atomic_dec(&mm->mm_count); + BUG_ON(atomic_read(&mm->mm_count) == 0); + } + + task_unlock(tsk); + + if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) ) + mm_unpin(mm); +} void pte_free(struct page *pte) { - pte_t *ptep; - - ptep = pfn_to_kaddr(page_to_pfn(pte)); - - xen_pte_unpin(__pa(ptep)); - make_page_writable(ptep); - __free_page(pte); -} + unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT); + + if (!pte_write(*virt_to_ptep(va))) + BUG_ON(HYPERVISOR_update_va_mapping( + va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0)); + __free_page(pte); +} +#endif /* CONFIG_XEN */ static inline pte_t *lookup_address(unsigned long address) { @@ -78,7 +204,7 @@ } else asm volatile("wbinvd":::"memory"); if (address) - __flush_tlb_one((unsigned long) address); + __flush_tlb_one(address); else __flush_tlb_all(); } @@ -166,14 +292,17 @@ BUG(); /* on x86-64 the direct mapping set at boot is not using 4k pages */ -// BUG_ON(PageReserved(kpte_page)); /* * ..., but the XEN guest kernels (currently) do: * If the pte was reserved, it means it was created at boot * time (not via split_large_page) and in turn we must not * replace it with a large page. */ - if (!PageReserved(kpte_page)) { +#ifndef CONFIG_XEN + BUG_ON(PageReserved(kpte_page)); +#else + if (!PageReserved(kpte_page)) +#endif switch (page_count(kpte_page)) { case 1: save_page(address, kpte_page); @@ -182,7 +311,6 @@ case 0: BUG(); /* memleak and failed 2M page regeneration */ } - } return 0; } diff -r de3576a1c62c -r dfaf788ab18c linux-2.6-xen-sparse/drivers/xen/blkback/interface.c --- a/linux-2.6-xen-sparse/drivers/xen/blkback/interface.c Thu Aug 25 20:52:38 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blkback/interface.c Fri Aug 26 20:47:16 2005 @@ -124,6 +124,7 @@ if (blkif->blk_ring.sring) { unmap_frontend_page(blkif); vfree(blkif->blk_ring.sring); + blkif->blk_ring.sring = NULL; } kmem_cache_free(blkif_cachep, blkif); diff -r de3576a1c62c -r dfaf788ab18c linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c --- a/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c Thu Aug 25 20:52:38 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/blkfront/blkfront.c Fri Aug 26 20:47:16 2005 @@ -1258,6 +1258,7 @@ err = talk_to_backend(dev, info); if (err) { kfree(info); + dev->data = NULL; return err; } diff -r de3576a1c62c -r dfaf788ab18c linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c --- a/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c Thu Aug 25 20:52:38 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/netfront/netfront.c Fri Aug 26 20:47:16 2005 @@ -939,18 +939,12 @@ static int destroy_netdev(struct net_device *netdev) { - struct net_private *np = NULL; #ifdef CONFIG_PROC_FS xennet_proc_delif(netdev); #endif unregister_netdev(netdev); - - np = netdev_priv(netdev); - list_del(&np->list); - - kfree(netdev); return 0; } @@ -1244,11 +1238,16 @@ } info = netdev_priv(netdev); + dev->data = info; + err = talk_to_backend(dev, info); if (err) { destroy_netdev(netdev); + kfree(netdev); + dev->data = NULL; return err; } + /* Call once in case entries already there. 
*/ watch_for_status(&info->watch, info->watch.node); diff -r de3576a1c62c -r dfaf788ab18c linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c --- a/linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c Thu Aug 25 20:52:38 2005 +++ b/linux-2.6-xen-sparse/drivers/xen/privcmd/privcmd.c Fri Aug 26 20:47:16 2005 @@ -63,16 +63,19 @@ "popl %%edi; popl %%esi; popl %%edx; popl %%ecx; popl %%ebx" : "=a" (ret) : "0" (&hypercall) : "memory" ); #elif defined (__x86_64__) - __asm__ __volatile__ ( - "movq %5,%%r10; movq %6,%%r8;" TRAP_INSTR - : "=a" (ret) - : "a" ((unsigned long)hypercall.op), - "D" ((unsigned long)hypercall.arg[0]), - "S" ((unsigned long)hypercall.arg[1]), - "d" ((unsigned long)hypercall.arg[2]), - "g" ((unsigned long)hypercall.arg[3]), - "g" ((unsigned long)hypercall.arg[4]) - : "r11","rcx","r8","r10","memory"); + { + long ign1, ign2, ign3; + __asm__ __volatile__ ( + "movq %5,%%r10; movq %6,%%r8;" TRAP_INSTR + : "=a" (ret), "=D" (ign1), "=S" (ign2), "=d" (ign3) + : "0" ((unsigned long)hypercall.op), + "1" ((unsigned long)hypercall.arg[0]), + "2" ((unsigned long)hypercall.arg[1]), + "3" ((unsigned long)hypercall.arg[2]), + "g" ((unsigned long)hypercall.arg[3]), + "g" ((unsigned long)hypercall.arg[4]) + : "r11","rcx","r8","r10","memory"); + } #endif } break; diff -r de3576a1c62c -r dfaf788ab18c linux-2.6-xen-sparse/include/asm-xen/asm-i386/hypercall.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/hypercall.h Thu Aug 25 20:52:38 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/hypercall.h Fri Aug 26 20:47:16 2005 @@ -29,551 +29,362 @@ #ifndef __HYPERCALL_H__ #define __HYPERCALL_H__ + #include <asm-xen/xen-public/xen.h> -/* - * Assembler stubs for hyper-calls. - */ +#define _hypercall0(type, name) \ +({ \ + long __res; \ + asm volatile ( \ + TRAP_INSTR \ + : "=a" (__res) \ + : "0" (__HYPERVISOR_##name) \ + : "memory" ); \ + (type)__res; \ +}) + +#define _hypercall1(type, name, a1) \ +({ \ + long __res, __ign1; \ + asm volatile ( \ + TRAP_INSTR \ + : "=a" (__res), "=b" (__ign1) \ + : "0" (__HYPERVISOR_##name), "1" ((long)(a1)) \ + : "memory" ); \ + (type)__res; \ +}) + +#define _hypercall2(type, name, a1, a2) \ +({ \ + long __res, __ign1, __ign2; \ + asm volatile ( \ + TRAP_INSTR \ + : "=a" (__res), "=b" (__ign1), "=c" (__ign2) \ + : "0" (__HYPERVISOR_##name), "1" ((long)(a1)), \ + "2" ((long)(a2)) \ + : "memory" ); \ + (type)__res; \ +}) + +#define _hypercall3(type, name, a1, a2, a3) \ +({ \ + long __res, __ign1, __ign2, __ign3; \ + asm volatile ( \ + TRAP_INSTR \ + : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \ + "=d" (__ign3) \ + : "0" (__HYPERVISOR_##name), "1" ((long)(a1)), \ + "2" ((long)(a2)), "3" ((long)(a3)) \ + : "memory" ); \ + (type)__res; \ +}) + +#define _hypercall4(type, name, a1, a2, a3, a4) \ +({ \ + long __res, __ign1, __ign2, __ign3, __ign4; \ + asm volatile ( \ + TRAP_INSTR \ + : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \ + "=d" (__ign3), "=S" (__ign4) \ + : "0" (__HYPERVISOR_##name), "1" ((long)(a1)), \ + "2" ((long)(a2)), "3" ((long)(a3)), \ + "4" ((long)(a4)) \ + : "memory" ); \ + (type)__res; \ +}) + +#define _hypercall5(type, name, a1, a2, a3, a4, a5) \ +({ \ + long __res, __ign1, __ign2, __ign3, __ign4, __ign5; \ + asm volatile ( \ + TRAP_INSTR \ + : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \ + "=d" (__ign3), "=S" (__ign4), "=D" (__ign5) \ + : "0" (__HYPERVISOR_##name), "1" ((long)(a1)), \ + "2" ((long)(a2)), "3" ((long)(a3)), \ + "4" ((long)(a4)), "5" ((long)(a5)) \ + : "memory" ); \ + (type)__res; \ +}) static inline int 
HYPERVISOR_set_trap_table( - trap_info_t *table) -{ - int ret; - unsigned long ignore; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ignore) - : "0" (__HYPERVISOR_set_trap_table), "1" (table) - : "memory" ); - - return ret; + trap_info_t *table) +{ + return _hypercall1(int, set_trap_table, table); } static inline int HYPERVISOR_mmu_update( - mmu_update_t *req, int count, int *success_count, domid_t domid) -{ - int ret; - unsigned long ign1, ign2, ign3, ign4; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4) - : "0" (__HYPERVISOR_mmu_update), "1" (req), "2" (count), - "3" (success_count), "4" (domid) - : "memory" ); - - return ret; + mmu_update_t *req, int count, int *success_count, domid_t domid) +{ + return _hypercall4(int, mmu_update, req, count, success_count, domid); } static inline int HYPERVISOR_mmuext_op( - struct mmuext_op *op, int count, int *success_count, domid_t domid) -{ - int ret; - unsigned long ign1, ign2, ign3, ign4; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4) - : "0" (__HYPERVISOR_mmuext_op), "1" (op), "2" (count), - "3" (success_count), "4" (domid) - : "memory" ); - - return ret; + struct mmuext_op *op, int count, int *success_count, domid_t domid) +{ + return _hypercall4(int, mmuext_op, op, count, success_count, domid); } static inline int HYPERVISOR_set_gdt( - unsigned long *frame_list, int entries) -{ - int ret; - unsigned long ign1, ign2; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2) - : "0" (__HYPERVISOR_set_gdt), "1" (frame_list), "2" (entries) - : "memory" ); - - - return ret; + unsigned long *frame_list, int entries) +{ + return _hypercall2(int, set_gdt, frame_list, entries); } static inline int HYPERVISOR_stack_switch( - unsigned long ss, unsigned long esp) -{ - int ret; - unsigned long ign1, ign2; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2) - : "0" (__HYPERVISOR_stack_switch), "1" (ss), "2" (esp) - : "memory" ); - - return ret; + unsigned long ss, unsigned long esp) +{ + return _hypercall2(int, stack_switch, ss, esp); } static inline int HYPERVISOR_set_callbacks( - unsigned long event_selector, unsigned long event_address, - unsigned long failsafe_selector, unsigned long failsafe_address) -{ - int ret; - unsigned long ign1, ign2, ign3, ign4; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4) - : "0" (__HYPERVISOR_set_callbacks), "1" (event_selector), - "2" (event_address), "3" (failsafe_selector), "4" (failsafe_address) - : "memory" ); - - return ret; + unsigned long event_selector, unsigned long event_address, + unsigned long failsafe_selector, unsigned long failsafe_address) +{ + return _hypercall4(int, set_callbacks, + event_selector, event_address, + failsafe_selector, failsafe_address); } static inline int HYPERVISOR_fpu_taskswitch( - int set) -{ - int ret; - unsigned long ign; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign) - : "0" (__HYPERVISOR_fpu_taskswitch), "1" (set) - : "memory" ); - - return ret; + int set) +{ + return _hypercall1(int, fpu_taskswitch, set); } static inline int HYPERVISOR_yield( - void) -{ - int ret; - unsigned long ign; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign) - : "0" (__HYPERVISOR_sched_op), "1" (SCHEDOP_yield) - : "memory", "ecx" ); - - return ret; + void) +{ + return _hypercall2(int, sched_op, SCHEDOP_yield, 0); } static 
inline int HYPERVISOR_block( - void) -{ - int ret; - unsigned long ign1; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1) - : "0" (__HYPERVISOR_sched_op), "1" (SCHEDOP_block) - : "memory", "ecx" ); - - return ret; + void) +{ + return _hypercall2(int, sched_op, SCHEDOP_block, 0); } static inline int HYPERVISOR_shutdown( - void) -{ - int ret; - unsigned long ign1; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1) - : "0" (__HYPERVISOR_sched_op), - "1" (SCHEDOP_shutdown | (SHUTDOWN_poweroff << SCHEDOP_reasonshift)) - : "memory", "ecx" ); - - return ret; + void) +{ + return _hypercall2(int, sched_op, SCHEDOP_shutdown | + (SHUTDOWN_poweroff << SCHEDOP_reasonshift), 0); } static inline int HYPERVISOR_reboot( - void) -{ - int ret; - unsigned long ign1; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1) - : "0" (__HYPERVISOR_sched_op), - "1" (SCHEDOP_shutdown | (SHUTDOWN_reboot << SCHEDOP_reasonshift)) - : "memory", "ecx" ); - - return ret; -} - -static inline int -HYPERVISOR_suspend( - unsigned long srec) -{ - int ret; - unsigned long ign1, ign2; - - /* NB. On suspend, control software expects a suspend record in %esi. */ - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=S" (ign2) - : "0" (__HYPERVISOR_sched_op), - "b" (SCHEDOP_shutdown | (SHUTDOWN_suspend << SCHEDOP_reasonshift)), - "S" (srec) : "memory", "ecx"); - - return ret; + void) +{ + return _hypercall2(int, sched_op, SCHEDOP_shutdown | + (SHUTDOWN_reboot << SCHEDOP_reasonshift), 0); } static inline int HYPERVISOR_crash( - void) -{ - int ret; - unsigned long ign1; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1) - : "0" (__HYPERVISOR_sched_op), - "1" (SCHEDOP_shutdown | (SHUTDOWN_crash << SCHEDOP_reasonshift)) - : "memory", "ecx" ); - - return ret; + void) +{ + return _hypercall2(int, sched_op, SCHEDOP_shutdown | + (SHUTDOWN_crash << SCHEDOP_reasonshift), 0); } static inline long HYPERVISOR_set_timer_op( - u64 timeout) -{ - int ret; - unsigned long timeout_hi = (unsigned long)(timeout>>32); - unsigned long timeout_lo = (unsigned long)timeout; - unsigned long ign1, ign2; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2) - : "0" (__HYPERVISOR_set_timer_op), "b" (timeout_lo), "c" (timeout_hi) - : "memory"); - - return ret; + u64 timeout) +{ + unsigned long timeout_hi = (unsigned long)(timeout>>32); + unsigned long timeout_lo = (unsigned long)timeout; + return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi); } static inline int HYPERVISOR_dom0_op( - dom0_op_t *dom0_op) -{ - int ret; - unsigned long ign1; - - dom0_op->interface_version = DOM0_INTERFACE_VERSION; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1) - : "0" (__HYPERVISOR_dom0_op), "1" (dom0_op) - : "memory"); - - return ret; + dom0_op_t *dom0_op) +{ + dom0_op->interface_version = DOM0_INTERFACE_VERSION; + return _hypercall1(int, dom0_op, dom0_op); } static inline int HYPERVISOR_set_debugreg( - int reg, unsigned long value) -{ - int ret; - unsigned long ign1, ign2; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2) - : "0" (__HYPERVISOR_set_debugreg), "1" (reg), "2" (value) - : "memory" ); - - return ret; + int reg, unsigned long value) +{ + return _hypercall2(int, set_debugreg, reg, value); } static inline unsigned long HYPERVISOR_get_debugreg( - int reg) -{ - unsigned long ret; - unsigned long ign; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign) - : "0" (__HYPERVISOR_get_debugreg), "1" (reg) - : 
"memory" ); - - return ret; + int reg) +{ + return _hypercall1(unsigned long, get_debugreg, reg); } static inline int HYPERVISOR_update_descriptor( - u64 ma, u64 desc) -{ - int ret; - unsigned long ign1, ign2, ign3, ign4; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4) - : "0" (__HYPERVISOR_update_descriptor), - "1" ((unsigned long)ma), "2" ((unsigned long)(ma>>32)), - "3" ((unsigned long)desc), "4" ((unsigned long)(desc>>32)) - : "memory" ); - - return ret; + u64 ma, u64 desc) +{ + return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32); } static inline int HYPERVISOR_dom_mem_op( - unsigned int op, unsigned long *extent_list, - unsigned long nr_extents, unsigned int extent_order) -{ - int ret; - unsigned long ign1, ign2, ign3, ign4, ign5; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4), - "=D" (ign5) - : "0" (__HYPERVISOR_dom_mem_op), "1" (op), "2" (extent_list), - "3" (nr_extents), "4" (extent_order), "5" (DOMID_SELF) - : "memory" ); - - return ret; + unsigned int op, unsigned long *extent_list, + unsigned long nr_extents, unsigned int extent_order) +{ + return _hypercall5(int, dom_mem_op, op, extent_list, + nr_extents, extent_order, DOMID_SELF); } static inline int HYPERVISOR_multicall( - void *call_list, int nr_calls) -{ - int ret; - unsigned long ign1, ign2; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2) - : "0" (__HYPERVISOR_multicall), "1" (call_list), "2" (nr_calls) - : "memory" ); - - return ret; + void *call_list, int nr_calls) +{ + return _hypercall2(int, multicall, call_list, nr_calls); } static inline int HYPERVISOR_update_va_mapping( - unsigned long va, pte_t new_val, unsigned long flags) -{ - int ret; - unsigned long ign1, ign2, ign3, ign4; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4) - : "0" (__HYPERVISOR_update_va_mapping), - "1" (va), "2" ((new_val).pte_low), + unsigned long va, pte_t new_val, unsigned long flags) +{ + unsigned long pte_hi = 0; #ifdef CONFIG_X86_PAE - "3" ((new_val).pte_high), -#else - "3" (0), + pte_hi = new_val.pte_high; #endif - "4" (flags) - : "memory" ); - - return ret; + return _hypercall4(int, update_va_mapping, va, + new_val.pte_low, pte_hi, flags); } static inline int HYPERVISOR_event_channel_op( - void *op) -{ - int ret; - unsigned long ignore; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ignore) - : "0" (__HYPERVISOR_event_channel_op), "1" (op) - : "memory" ); - - return ret; + void *op) +{ + return _hypercall1(int, event_channel_op, op); } static inline int HYPERVISOR_xen_version( - int cmd) -{ - int ret; - unsigned long ignore; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ignore) - : "0" (__HYPERVISOR_xen_version), "1" (cmd) - : "memory" ); - - return ret; + int cmd) +{ + return _hypercall1(int, xen_version, cmd); } static inline int HYPERVISOR_console_io( - int cmd, int count, char *str) -{ - int ret; - unsigned long ign1, ign2, ign3; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3) - : "0" (__HYPERVISOR_console_io), "1" (cmd), "2" (count), "3" (str) - : "memory" ); - - return ret; + int cmd, int count, char *str) +{ + return _hypercall3(int, console_io, cmd, count, str); } static inline int HYPERVISOR_physdev_op( - void *physdev_op) -{ - int ret; - unsigned long ign; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign) - : "0" 
(__HYPERVISOR_physdev_op), "1" (physdev_op) - : "memory" ); - - return ret; + void *physdev_op) +{ + return _hypercall1(int, physdev_op, physdev_op); } static inline int HYPERVISOR_grant_table_op( - unsigned int cmd, void *uop, unsigned int count) -{ - int ret; - unsigned long ign1, ign2, ign3; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3) - : "0" (__HYPERVISOR_grant_table_op), "1" (cmd), "2" (uop), "3" (count) - : "memory" ); - - return ret; + unsigned int cmd, void *uop, unsigned int count) +{ + return _hypercall3(int, grant_table_op, cmd, uop, count); } static inline int HYPERVISOR_update_va_mapping_otherdomain( - unsigned long va, pte_t new_val, unsigned long flags, domid_t domid) -{ - int ret; - unsigned long ign1, ign2, ign3, ign4, ign5; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), - "=S" (ign4), "=D" (ign5) - : "0" (__HYPERVISOR_update_va_mapping_otherdomain), - "1" (va), "2" ((new_val).pte_low), + unsigned long va, pte_t new_val, unsigned long flags, domid_t domid) +{ + unsigned long pte_hi = 0; #ifdef CONFIG_X86_PAE - "3" ((new_val).pte_high), -#else - "3" (0), + pte_hi = new_val.pte_high; #endif - "4" (flags), "5" (domid) : - "memory" ); - - return ret; + return _hypercall5(int, update_va_mapping_otherdomain, va, + new_val.pte_low, pte_hi, flags, domid); } static inline int HYPERVISOR_vm_assist( - unsigned int cmd, unsigned int type) -{ - int ret; - unsigned long ign1, ign2; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2) - : "0" (__HYPERVISOR_vm_assist), "1" (cmd), "2" (type) - : "memory" ); - - return ret; + unsigned int cmd, unsigned int type) +{ + return _hypercall2(int, vm_assist, cmd, type); } static inline int HYPERVISOR_boot_vcpu( - unsigned long vcpu, vcpu_guest_context_t *ctxt) -{ - int ret; - unsigned long ign1, ign2; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2) - : "0" (__HYPERVISOR_boot_vcpu), "1" (vcpu), "2" (ctxt) - : "memory"); - - return ret; + unsigned long vcpu, vcpu_guest_context_t *ctxt) +{ + return _hypercall2(int, boot_vcpu, vcpu, ctxt); +} + +static inline int +HYPERVISOR_vcpu_up( + int vcpu) +{ + return _hypercall2(int, sched_op, SCHEDOP_vcpu_up | + (vcpu << SCHEDOP_vcpushift), 0); +} + +static inline int +HYPERVISOR_vcpu_pickle( + int vcpu, vcpu_guest_context_t *ctxt) +{ + return _hypercall2(int, sched_op, SCHEDOP_vcpu_pickle | + (vcpu << SCHEDOP_vcpushift), ctxt); +} + +static inline int +HYPERVISOR_suspend( + unsigned long srec) +{ + int ret; + unsigned long ign1, ign2; + + /* On suspend, control software expects a suspend record in %esi. */ + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1), "=S" (ign2) + : "0" (__HYPERVISOR_sched_op), + "1" (SCHEDOP_shutdown | (SHUTDOWN_suspend << + SCHEDOP_reasonshift)), + "2" (srec) : "memory", "ecx"); + + return ret; } static inline int HYPERVISOR_vcpu_down( - int vcpu) -{ - int ret; - unsigned long ign1; - /* Yes, I really do want to clobber edx here: when we resume a - vcpu after unpickling a multi-processor domain, it returns - here, but clobbers all of the call clobbered registers. 
*/ - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1) - : "0" (__HYPERVISOR_sched_op), - "1" (SCHEDOP_vcpu_down | (vcpu << SCHEDOP_vcpushift)) - : "memory", "ecx", "edx" ); - - return ret; -} - -static inline int -HYPERVISOR_vcpu_up( - int vcpu) -{ - int ret; - unsigned long ign1; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1) - : "0" (__HYPERVISOR_sched_op), - "1" (SCHEDOP_vcpu_up | (vcpu << SCHEDOP_vcpushift)) - : "memory", "ecx" ); - - return ret; -} - -static inline int -HYPERVISOR_vcpu_pickle( - int vcpu, vcpu_guest_context_t *ctxt) -{ - int ret; - unsigned long ign1, ign2; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret), "=b" (ign1), "=c" (ign2) - : "0" (__HYPERVISOR_sched_op), - "1" (SCHEDOP_vcpu_pickle | (vcpu << SCHEDOP_vcpushift)), - "2" (ctxt) - : "memory" ); - - return ret; + int vcpu) +{ + int ret; + unsigned long ign1; + /* Yes, I really do want to clobber edx here: when we resume a + vcpu after unpickling a multi-processor domain, it returns + here, but clobbers all of the call clobbered registers. */ + __asm__ __volatile__ ( + TRAP_INSTR + : "=a" (ret), "=b" (ign1) + : "0" (__HYPERVISOR_sched_op), + "1" (SCHEDOP_vcpu_down | (vcpu << SCHEDOP_vcpushift)) + : "memory", "ecx", "edx" ); + return ret; } #endif /* __HYPERCALL_H__ */ + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r de3576a1c62c -r dfaf788ab18c linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/hypercall.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/hypercall.h Thu Aug 25 20:52:38 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/hypercall.h Fri Aug 26 20:47:16 2005 @@ -4,6 +4,10 @@ * Linux-specific hypervisor handling. * * Copyright (c) 2002-2004, K A Fraser + * + * 64-bit updates: + * Benjamin Liu <benjamin.liu@xxxxxxxxx> + * Jun Nakajima <jun.nakajima@xxxxxxxxx> * * This file may be distributed separately from the Linux kernel, or * incorporated into other software packages, subject to the following license: @@ -26,497 +30,331 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ -/* - * Benjamin Liu <benjamin.liu@xxxxxxxxx> - * Jun Nakajima <jun.nakajima@xxxxxxxxx> - * Ported to x86-64. - * - */ #ifndef __HYPERCALL_H__ #define __HYPERCALL_H__ + #include <asm-xen/xen-public/xen.h> #define __syscall_clobber "r11","rcx","memory" -/* - * Assembler stubs for hyper-calls. 
- */ +#define _hypercall0(type, name) \ +({ \ + long __res; \ + asm volatile ( \ + TRAP_INSTR \ + : "=a" (__res) \ + : "0" (__HYPERVISOR_##name) \ + : __syscall_clobber ); \ + (type)__res; \ +}) + +#define _hypercall1(type, name, a1) \ +({ \ + long __res, __ign1; \ + asm volatile ( \ + TRAP_INSTR \ + : "=a" (__res), "=D" (__ign1) \ + : "0" (__HYPERVISOR_##name), "1" ((long)(a1)) \ + : __syscall_clobber ); \ + (type)__res; \ +}) + +#define _hypercall2(type, name, a1, a2) \ +({ \ + long __res, __ign1, __ign2; \ + asm volatile ( \ + TRAP_INSTR \ + : "=a" (__res), "=D" (__ign1), "=S" (__ign2) \ + : "0" (__HYPERVISOR_##name), "1" ((long)(a1)), \ + "2" ((long)(a2)) \ + : __syscall_clobber ); \ + (type)__res; \ +}) + +#define _hypercall3(type, name, a1, a2, a3) \ +({ \ + long __res, __ign1, __ign2, __ign3; \ + asm volatile ( \ + TRAP_INSTR \ + : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \ + "=d" (__ign3) \ + : "0" (__HYPERVISOR_##name), "1" ((long)(a1)), \ + "2" ((long)(a2)), "3" ((long)(a3)) \ + : __syscall_clobber ); \ + (type)__res; \ +}) + +#define _hypercall4(type, name, a1, a2, a3, a4) \ +({ \ + long __res, __ign1, __ign2, __ign3; \ + asm volatile ( \ + "movq %8,%%r10; " TRAP_INSTR \ + : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \ + "=d" (__ign3) \ + : "0" (__HYPERVISOR_##name), "1" ((long)(a1)), \ + "2" ((long)(a2)), "3" ((long)(a3)), \ + "g" ((long)(a4)) \ + : __syscall_clobber, "r10" ); \ + (type)__res; \ +}) + +#define _hypercall5(type, name, a1, a2, a3, a4, a5) \ +({ \ + long __res, __ign1, __ign2, __ign3; \ + asm volatile ( \ + "movq %8,%%r10; movq %9,%%r8; " TRAP_INSTR \ + : "=a" (__res), "=D" (__ign1), "=S" (__ign2), \ + "=d" (__ign3) \ + : "0" (__HYPERVISOR_##name), "1" ((long)(a1)), \ + "2" ((long)(a2)), "3" ((long)(a3)), \ + "g" ((long)(a4)), "g" ((long)(a5)) \ + : __syscall_clobber, "r10", "r8" ); \ + (type)__res; \ +}) + static inline int HYPERVISOR_set_trap_table( - trap_info_t *table) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_set_trap_table), "D" (table) - : __syscall_clobber ); - - return ret; + trap_info_t *table) +{ + return _hypercall1(int, set_trap_table, table); } static inline int HYPERVISOR_mmu_update( - mmu_update_t *req, int count, int *success_count, domid_t domid) -{ - int ret; - - __asm__ __volatile__ ( - "movq %5, %%r10;" TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_mmu_update), "D" (req), "S" ((long)count), - "d" (success_count), "g" ((unsigned long)domid) - : __syscall_clobber, "r10" ); - - return ret; + mmu_update_t *req, int count, int *success_count, domid_t domid) +{ + return _hypercall4(int, mmu_update, req, count, success_count, domid); } static inline int HYPERVISOR_mmuext_op( - struct mmuext_op *op, int count, int *success_count, domid_t domid) -{ - int ret; - - __asm__ __volatile__ ( - "movq %5, %%r10;" TRAP_INSTR - : "=a" (ret) - : "0" (__HYPERVISOR_mmuext_op), "D" (op), "S" ((long)count), - "d" (success_count), "g" ((unsigned long)domid) - : __syscall_clobber, "r10" ); - - return ret; + struct mmuext_op *op, int count, int *success_count, domid_t domid) +{ + return _hypercall4(int, mmuext_op, op, count, success_count, domid); } static inline int HYPERVISOR_set_gdt( - unsigned long *frame_list, int entries) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_set_gdt), "D" (frame_list), "S" ((long)entries) - : __syscall_clobber ); - - - return ret; -} + unsigned long *frame_list, int entries) +{ + return 
_hypercall2(int, set_gdt, frame_list, entries); +} + static inline int HYPERVISOR_stack_switch( - unsigned long ss, unsigned long esp) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_stack_switch), "D" (ss), "S" (esp) - : __syscall_clobber ); - - return ret; + unsigned long ss, unsigned long esp) +{ + return _hypercall2(int, stack_switch, ss, esp); } static inline int HYPERVISOR_set_callbacks( - unsigned long event_address, unsigned long failsafe_address, - unsigned long syscall_address) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_set_callbacks), "D" (event_address), - "S" (failsafe_address), "d" (syscall_address) - : __syscall_clobber ); - - return ret; + unsigned long event_address, unsigned long failsafe_address, + unsigned long syscall_address) +{ + return _hypercall3(int, set_callbacks, + event_address, failsafe_address, syscall_address); } static inline int HYPERVISOR_fpu_taskswitch( - int set) -{ - int ret; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) : "0" ((unsigned long)__HYPERVISOR_fpu_taskswitch), - "D" ((unsigned long) set) : __syscall_clobber ); - - return ret; + int set) +{ + return _hypercall1(int, fpu_taskswitch, set); } static inline int HYPERVISOR_yield( - void) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_sched_op), "D" ((unsigned long)SCHEDOP_yield) - : __syscall_clobber ); - - return ret; + void) +{ + return _hypercall2(int, sched_op, SCHEDOP_yield, 0); } static inline int HYPERVISOR_block( - void) -{ - int ret; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_sched_op), "D" ((unsigned long)SCHEDOP_block) - : __syscall_clobber ); - - return ret; + void) +{ + return _hypercall2(int, sched_op, SCHEDOP_block, 0); } static inline int HYPERVISOR_shutdown( - void) -{ - int ret; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_sched_op), - "D" ((unsigned long)(SCHEDOP_shutdown | (SHUTDOWN_poweroff << SCHEDOP_reasonshift))) - : __syscall_clobber ); - - return ret; + void) +{ + return _hypercall2(int, sched_op, SCHEDOP_shutdown | + (SHUTDOWN_poweroff << SCHEDOP_reasonshift), 0); } static inline int HYPERVISOR_reboot( - void) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_sched_op), - "D" ((unsigned long)(SCHEDOP_shutdown | (SHUTDOWN_reboot << SCHEDOP_reasonshift))) - : __syscall_clobber ); - - return ret; -} - -static inline int -HYPERVISOR_suspend( - unsigned long srec) -{ - int ret; - - /* NB. On suspend, control software expects a suspend record in %esi. */ - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_sched_op), - "D" ((unsigned long)(SCHEDOP_shutdown | (SHUTDOWN_suspend << SCHEDOP_reasonshift))), - "S" (srec) - : __syscall_clobber ); - - return ret; -} - -/* - * We can have the timeout value in a single argument for the hypercall, but - * that will break the common code. 
- */ + void) +{ + return _hypercall2(int, sched_op, SCHEDOP_shutdown | + (SHUTDOWN_reboot << SCHEDOP_reasonshift), 0); +} + static inline long HYPERVISOR_set_timer_op( - u64 timeout) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_set_timer_op), - "D" (timeout) - : __syscall_clobber ); - - return ret; + u64 timeout) +{ + return _hypercall1(long, set_timer_op, timeout); } static inline int HYPERVISOR_dom0_op( - dom0_op_t *dom0_op) -{ - int ret; - - dom0_op->interface_version = DOM0_INTERFACE_VERSION; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_dom0_op), "D" (dom0_op) - : __syscall_clobber ); - - return ret; + dom0_op_t *dom0_op) +{ + dom0_op->interface_version = DOM0_INTERFACE_VERSION; + return _hypercall1(int, dom0_op, dom0_op); } static inline int HYPERVISOR_set_debugreg( - int reg, unsigned long value) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_set_debugreg), "D" ((unsigned long)reg), "S" (value) - : __syscall_clobber ); - - return ret; + int reg, unsigned long value) +{ + return _hypercall2(int, set_debugreg, reg, value); } static inline unsigned long HYPERVISOR_get_debugreg( - int reg) -{ - unsigned long ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_get_debugreg), "D" ((unsigned long)reg) - : __syscall_clobber ); - - return ret; + int reg) +{ + return _hypercall1(unsigned long, get_debugreg, reg); } static inline int HYPERVISOR_update_descriptor( - unsigned long ma, unsigned long word) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_update_descriptor), "D" (ma), - "S" (word) - : __syscall_clobber ); - - return ret; + unsigned long ma, unsigned long word) +{ + return _hypercall2(int, update_descriptor, ma, word); } static inline int HYPERVISOR_dom_mem_op( - unsigned int op, unsigned long *extent_list, - unsigned long nr_extents, unsigned int extent_order) -{ - int ret; - - __asm__ __volatile__ ( - "movq %5,%%r10; movq %6,%%r8;" TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_dom_mem_op), "D" ((unsigned long)op), "S" (extent_list), - "d" (nr_extents), "g" ((unsigned long) extent_order), "g" ((unsigned long) DOMID_SELF) - : __syscall_clobber,"r8","r10"); - - return ret; + unsigned int op, unsigned long *extent_list, + unsigned long nr_extents, unsigned int extent_order) +{ + return _hypercall5(int, dom_mem_op, op, extent_list, + nr_extents, extent_order, DOMID_SELF); } static inline int HYPERVISOR_multicall( - void *call_list, int nr_calls) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_multicall), "D" (call_list), "S" ((unsigned long)nr_calls) - : __syscall_clobber); - - return ret; + void *call_list, int nr_calls) +{ + return _hypercall2(int, multicall, call_list, nr_calls); } static inline int HYPERVISOR_update_va_mapping( - unsigned long page_nr, pte_t new_val, unsigned long flags) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_update_va_mapping), - "D" (page_nr), "S" (new_val.pte), "d" (flags) - : __syscall_clobber); - - return ret; + unsigned long va, pte_t new_val, unsigned long flags) +{ + return _hypercall3(int, update_va_mapping, va, new_val.pte, flags); } static inline int HYPERVISOR_event_channel_op( - void *op) -{ - int ret; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" 
(ret) - : "0" ((unsigned long)__HYPERVISOR_event_channel_op), "D" (op) - : __syscall_clobber); - - return ret; + void *op) +{ + return _hypercall1(int, event_channel_op, op); } static inline int HYPERVISOR_xen_version( - int cmd) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_xen_version), "D" ((unsigned long)cmd) - : __syscall_clobber); - - return ret; + int cmd) +{ + return _hypercall1(int, xen_version, cmd); } static inline int HYPERVISOR_console_io( - int cmd, int count, char *str) -{ - int ret; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_console_io), "D" ((unsigned long)cmd), "S" ((unsigned long)count), "d" (str) - : __syscall_clobber); - - return ret; + int cmd, int count, char *str) +{ + return _hypercall3(int, console_io, cmd, count, str); } static inline int HYPERVISOR_physdev_op( - void *physdev_op) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_physdev_op), "D" (physdev_op) - : __syscall_clobber); - - return ret; + void *physdev_op) +{ + return _hypercall1(int, physdev_op, physdev_op); } static inline int HYPERVISOR_grant_table_op( - unsigned int cmd, void *uop, unsigned int count) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_grant_table_op), "D" ((unsigned long)cmd), "S" ((unsigned long)uop), "d" (count) - : __syscall_clobber); - - return ret; + unsigned int cmd, void *uop, unsigned int count) +{ + return _hypercall3(int, grant_table_op, cmd, uop, count); } static inline int HYPERVISOR_update_va_mapping_otherdomain( - unsigned long page_nr, pte_t new_val, unsigned long flags, domid_t domid) -{ - int ret; - - __asm__ __volatile__ ( - "movq %5, %%r10;" TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_update_va_mapping_otherdomain), - "D" (page_nr), "S" (new_val.pte), "d" (flags), "g" ((unsigned long)domid) - : __syscall_clobber,"r10"); - - return ret; + unsigned long va, pte_t new_val, unsigned long flags, domid_t domid) +{ + return _hypercall4(int, update_va_mapping_otherdomain, va, + new_val.pte, flags, domid); } static inline int HYPERVISOR_vm_assist( - unsigned int cmd, unsigned int type) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_vm_assist), "D" ((unsigned long)cmd), "S" ((unsigned long)type) - : __syscall_clobber); - - return ret; + unsigned int cmd, unsigned int type) +{ + return _hypercall2(int, vm_assist, cmd, type); +} + +static inline int +HYPERVISOR_boot_vcpu( + unsigned long vcpu, vcpu_guest_context_t *ctxt) +{ + return _hypercall2(int, boot_vcpu, vcpu, ctxt); +} + +static inline int +HYPERVISOR_vcpu_up( + int vcpu) +{ + return _hypercall2(int, sched_op, SCHEDOP_vcpu_up | + (vcpu << SCHEDOP_vcpushift), 0); +} + +static inline int +HYPERVISOR_vcpu_pickle( + int vcpu, vcpu_guest_context_t *ctxt) +{ + return _hypercall2(int, sched_op, SCHEDOP_vcpu_pickle | + (vcpu << SCHEDOP_vcpushift), ctxt); } static inline int HYPERVISOR_switch_to_user(void) { - int ret; - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) : "0" ((unsigned long)__HYPERVISOR_switch_to_user) : __syscall_clobber ); - - return ret; -} - -static inline int -HYPERVISOR_boot_vcpu( - unsigned long vcpu, vcpu_guest_context_t *ctxt) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" (__HYPERVISOR_boot_vcpu), "D" (vcpu), "S" (ctxt) - : __syscall_clobber); - - return ret; + return 
_hypercall0(int, switch_to_user); } static inline int HYPERVISOR_set_segment_base( - int reg, unsigned long value) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_set_segment_base), "D" ((unsigned long)reg), "S" (value) - : __syscall_clobber ); - - return ret; -} - -static inline int -HYPERVISOR_vcpu_pickle( - int vcpu, vcpu_guest_context_t *ctxt) -{ - int ret; - - __asm__ __volatile__ ( - TRAP_INSTR - : "=a" (ret) - : "0" ((unsigned long)__HYPERVISOR_sched_op), - "D" ((unsigned long)SCHEDOP_vcpu_pickle | (vcpu << SCHEDOP_vcpushift)), - "S" ((unsigned long)ctxt) - : __syscall_clobber ); - - return ret; + int reg, unsigned long value) +{ + return _hypercall2(int, set_segment_base, reg, value); +} + +static inline int +HYPERVISOR_suspend( + unsigned long srec) +{ + return _hypercall2(int, sched_op, SCHEDOP_shutdown | + (SHUTDOWN_suspend << SCHEDOP_reasonshift), srec); } #endif /* __HYPERCALL_H__ */ + +/* + * Local variables: + * c-file-style: "linux" + * indent-tabs-mode: t + * c-indent-level: 8 + * c-basic-offset: 8 + * tab-width: 8 + * End: + */ diff -r de3576a1c62c -r dfaf788ab18c linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu_context.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu_context.h Thu Aug 25 20:52:38 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu_context.h Fri Aug 26 20:47:16 2005 @@ -58,6 +58,9 @@ } } +extern void mm_pin(struct mm_struct *mm); +extern void mm_unpin(struct mm_struct *mm); +void mm_pin_all(void); static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) @@ -66,6 +69,9 @@ struct mmuext_op _op[3], *op = _op; if (likely(prev != next)) { + if (!next->context.pinned) + mm_pin(next); + /* stop flush ipis for the previous mm */ clear_bit(cpu, &prev->cpu_vm_mask); #if 0 /* XEN: no lazy tlb */ diff -r de3576a1c62c -r dfaf788ab18c linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgalloc.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgalloc.h Thu Aug 25 20:52:38 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgalloc.h Fri Aug 26 20:47:16 2005 @@ -21,12 +21,27 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) { - set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT))); + if (unlikely((mm)->context.pinned)) { + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT), + pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0)); + set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT))); + } else { + *(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)); + } } static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) { - set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd))); + if (unlikely((mm)->context.pinned)) { + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)pmd, + pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT, + PAGE_KERNEL_RO), 0)); + set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd))); + } else { + *(pud) = __pud(_PAGE_TABLE | __pa(pmd)); + } } /* @@ -35,53 +50,54 @@ */ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) { - set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud))); - set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud))); -} - -extern __inline__ pmd_t *get_pmd(void) -{ - pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); - if (!pmd) - return NULL; - make_page_readonly(pmd); - xen_pmd_pin(__pa(pmd)); - return pmd; + if (unlikely((mm)->context.pinned)) { + BUG_ON(HYPERVISOR_update_va_mapping( 
+ (unsigned long)pud, + pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT, + PAGE_KERNEL_RO), 0)); + set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud))); + set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud))); + } else { + *(pgd) = __pgd(_PAGE_TABLE | __pa(pud)); + *(__user_pgd(pgd)) = *(pgd); + } } extern __inline__ void pmd_free(pmd_t *pmd) { - BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); - xen_pmd_unpin(__pa(pmd)); - make_page_writable(pmd); + pte_t *ptep = virt_to_ptep(pmd); + + if (!pte_write(*ptep)) { + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)pmd, + pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT, PAGE_KERNEL), + 0)); + } free_page((unsigned long)pmd); } static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { pmd_t *pmd = (pmd_t *) get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); - if (!pmd) - return NULL; - make_page_readonly(pmd); - xen_pmd_pin(__pa(pmd)); return pmd; } static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { pud_t *pud = (pud_t *) get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); - if (!pud) - return NULL; - make_page_readonly(pud); - xen_pud_pin(__pa(pud)); return pud; } static inline void pud_free(pud_t *pud) { - BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); - xen_pud_unpin(__pa(pud)); - make_page_writable(pud); + pte_t *ptep = virt_to_ptep(pud); + + if (!pte_write(*ptep)) { + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)pud, + pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT, PAGE_KERNEL), + 0)); + } free_page((unsigned long)pud); } @@ -107,10 +123,6 @@ (PTRS_PER_PGD - boundary) * sizeof(pgd_t)); memset(__user_pgd(pgd), 0, PAGE_SIZE); /* clean up user pgd */ - make_pages_readonly(pgd, 2); - - xen_pgd_pin(__pa(pgd)); /* kernel */ - xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */ /* * Set level3_user_pgt for vsyscall area */ @@ -121,31 +133,45 @@ static inline void pgd_free(pgd_t *pgd) { - BUG_ON((unsigned long)pgd & (PAGE_SIZE-1)); - xen_pgd_unpin(__pa(pgd)); - xen_pgd_unpin(__pa(__user_pgd(pgd))); - make_pages_writable(pgd, 2); + pte_t *ptep = virt_to_ptep(pgd); + + if (!pte_write(*ptep)) { + xen_pgd_unpin(__pa(pgd)); + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)pgd, + pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL), + 0)); + } + + ptep = virt_to_ptep(__user_pgd(pgd)); + + if (!pte_write(*ptep)) { + xen_pgd_unpin(__pa(__user_pgd(pgd))); + BUG_ON(HYPERVISOR_update_va_mapping( + (unsigned long)__user_pgd(pgd), + pfn_pte(virt_to_phys(__user_pgd(pgd))>>PAGE_SHIFT, + PAGE_KERNEL), + 0)); + } + free_pages((unsigned long)pgd, 1); } static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); - if (!pte) - return NULL; - make_page_readonly(pte); - xen_pte_pin(__pa(pte)); + if (pte) + make_page_readonly(pte); + return pte; } static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) { - pte_t *pte = (void *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); - if (!pte) - return NULL; - make_page_readonly(pte); - xen_pte_pin(__pa(pte)); - return virt_to_page((unsigned long)pte); + struct page *pte; + + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); + return pte; } /* Should really implement gc for free page table pages. 
This could be diff -r de3576a1c62c -r dfaf788ab18c linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h --- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h Thu Aug 25 20:52:38 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h Fri Aug 26 20:47:16 2005 @@ -18,7 +18,7 @@ #define __flush_tlb_all() __flush_tlb_global() -#define __flush_tlb_one(addr) xen_invlpg(addr) +#define __flush_tlb_one(addr) xen_invlpg((unsigned long)addr) /* diff -r de3576a1c62c -r dfaf788ab18c tools/python/xen/util/Brctl.py --- a/tools/python/xen/util/Brctl.py Thu Aug 25 20:52:38 2005 +++ b/tools/python/xen/util/Brctl.py Fri Aug 26 20:47:16 2005 @@ -76,6 +76,7 @@ def bridge_del(bridge): """Delete a bridge. """ + cmd(CMD_IFCONFIG, '%s down' % bridge) cmd(CMD_BRCTL, 'delbr %s' % bridge) def routes(): diff -r de3576a1c62c -r dfaf788ab18c tools/python/xen/xend/XendVnet.py --- a/tools/python/xen/xend/XendVnet.py Thu Aug 25 20:52:38 2005 +++ b/tools/python/xen/xend/XendVnet.py Fri Aug 26 20:47:16 2005 @@ -22,7 +22,7 @@ from xen.xend import sxp from xen.xend.XendError import XendError from xen.xend.XendLogging import log -from xen.xend.xenstore import XenNode, DBMap +from xen.xend.xenstore import XenNode, DBMap, DBVar def vnet_cmd(cmd): out = None @@ -38,17 +38,40 @@ class XendVnetInfo: vifctl_ops = {'up': 'vif.add', 'down': 'vif.del'} + + __exports__ = [ + DBVar('id', ty='str'), + DBVar('dbid', ty='str'), + DBVar('config', ty='sxpr'), + ] - def __init__(self, config): - self.config = config - self.id = sxp.child_value(config, 'id') - self.id = str(self.id) + def __init__(self, db, config=None): + if config: + self.id = sxp.child_value(config, 'id') + self.id = str(self.id) + self.dbid = self.id.replace(':', '-') + self.db = db.addChild(self.dbid) + self.config = config + else: + self.db = db + self.importFromDB() + config = self.config + self.bridge = sxp.child_value(config, 'bridge') if not self.bridge: self.bridge = "vnet%s" % self.id self.vnetif = sxp.child_value(config, 'vnetif') if not self.vnetif: - self.vnetif = "vnetif%s" % self.id + self.vnetif = "vnif%s" % self.id + + def saveToDB(self, save=False, sync=False): + self.db.saveDB(save=save, sync=sync) + + def exportToDB(self, save=False, sync=False): + self.db.exportToDB(self, fields=self.__exports__, save=save, sync=sync) + + def importFromDB(self): + self.db.importFromDB(self, fields=self.__exports__) def sxpr(self): return self.config @@ -64,7 +87,9 @@ log.info("Deleting vnet %s", self.id) Brctl.vif_bridge_rem({'bridge': self.bridge, 'vif': self.vnetif}) Brctl.bridge_del(self.bridge) - return vnet_cmd(['vnet.del', self.id]) + val = vnet_cmd(['vnet.del', self.id]) + self.db.delete() + return val def vifctl(self, op, vif, vmac): try: @@ -82,16 +107,18 @@ def __init__(self): # Table of vnet info indexed by vnet id. self.vnet = {} - self.dbmap = DBMap(db=XenNode(self.dbpath)) - self.dbmap.readDB() - for vnetdb in self.dbmap.values(): - config = vnetdb.config - info = XendVnetInfo(config) - self.vnet[info.id] = info + self.db = DBMap(db=XenNode(self.dbpath)) + self.db.readDB() + for vnetdb in self.db.values(): try: + info = XendVnetInfo(vnetdb) + self.vnet[info.id] = info info.configure() except XendError, ex: log.warning("Failed to configure vnet %s: %s", str(info.id), str(ex)) + except Exception, ex: + log.exception("Vnet error") + vnetdb.delete() def vnet_of_bridge(self, bridge): """Get the vnet for a bridge (if any). 
@@ -128,9 +155,9 @@ @param config: config """ - info = XendVnetInfo(config) + info = XendVnetInfo(self.db, config=config) self.vnet[info.id] = info - self.dbmap["%s/config" % info.id] = info.sxpr() + info.saveToDB() info.configure() def vnet_delete(self, id): @@ -141,7 +168,6 @@ info = self.vnet_get(id) if info: del self.vnet[id] - self.dbmap.delete(id) info.delete() def instance(): diff -r de3576a1c62c -r dfaf788ab18c tools/python/xen/xend/server/SrvVnetDir.py --- a/tools/python/xen/xend/server/SrvVnetDir.py Thu Aug 25 20:52:38 2005 +++ b/tools/python/xen/xend/server/SrvVnetDir.py Fri Aug 26 20:47:16 2005 @@ -19,6 +19,7 @@ from xen.xend.Args import FormFn from xen.xend import PrettyPrint from xen.xend import XendVnet +from xen.xend.XendError import XendError from xen.web.SrvDir import SrvDir diff -r de3576a1c62c -r dfaf788ab18c tools/python/xen/xm/main.py --- a/tools/python/xen/xm/main.py Thu Aug 25 20:52:38 2005 +++ b/tools/python/xen/xm/main.py Fri Aug 26 20:47:16 2005 @@ -104,6 +104,11 @@ network-limit <DomId> <Vif> <Credit> <Period> Limit the transmission rate of a virtual network interface network-list <DomId> List virtual network interfaces for a domain + + Vnet commands: + vnet-list [-l|--long] list vnets + vnet-create <config> create a vnet from a config file + vnet-delete <vnetid> delete a vnet For a short list of subcommands run 'xm help' For more help on xm see the xm(1) man page @@ -546,6 +551,47 @@ from xen.xend.XendClient import server server.xend_domain_device_destroy(dom, 'vbd', dev) + +def xm_vnet_list(args): + from xen.xend.XendClient import server + try: + (options, params) = getopt(args, 'l', ['long']) + except GetoptError, opterr: + err(opterr) + sys.exit(1) + + use_long = 0 + for (k, v) in options: + if k in ['-l', '--long']: + use_long = 1 + + if params: + use_long = 1 + vnets = params + else: + vnets = server.xend_vnets() + + for vnet in vnets: + try: + if use_long: + info = server.xend_vnet(vnet) + PrettyPrint.prettyprint(info) + else: + print vnet + except Exception, ex: + print vnet, ex + +def xm_vnet_create(args): + arg_check(args, 1, "vnet-create") + conf = args[0] + from xen.xend.XendClient import server + server.xend_vnet_create(conf) + +def xm_vnet_delete(args): + arg_check(args, 1, "vnet-delete") + vnet = args[0] + from xen.xend.XendClient import server + server.xend_vnet_delete(vnet) commands = { # console commands @@ -592,7 +638,11 @@ "block-refresh": xm_block_refresh, # network "network-limit": xm_network_limit, - "network-list": xm_network_list + "network-list": xm_network_list, + # vnet + "vnet-list": xm_vnet_list, + "vnet-create": xm_vnet_create, + "vnet-delete": xm_vnet_delete, } aliases = { diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/00INSTALL --- a/tools/vnet/00INSTALL Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/00INSTALL Fri Aug 26 20:47:16 2005 @@ -1,14 +1,34 @@ -To compile and install run "make install"; if it fails or you need to reinstall -run "make clean" first or the build will fail, at least that is what I have -found under 2.6.10. +make + - compile in local dirs. The module is in vnet-module/vnet_module.ko. -Other important items: +make dist + - compile and install into $(XEN_ROOT)/dist/install, + - where XEN_ROOT is the root of the xen tree. + +make install + - compile and install into system. + +The xen0 kernel must have been compiled before building the vnet module. 
+The vnet module installs to + /lib/modules/<kernel version>-xen0/kernel/xen/vnet_module.ko + +The vnet module should be loaded before starting xend, or +xend will fail to create any persistent vnets it has in its configuration. +The script network-vnet is a modified version of the xen network script +that loads the module if it's not already loaded. + +The module uses kernel crypto functions, and these need to be +enabled in the xen0 kernel config. They should be on by default - +if they're not you will get compile or insmod errors (see below). + +Kernel config options: + 1) You will need to have your xen0 kernel compiled with HMAC_SUPPORT 2.6.x = (MAIN MENU: Cryptographic Options -> HMAC Support) BEFORE running "make install". -2) You will want at least some of the other alogorithms listed under +2) You will want at least some of the other algorithms listed under "Cryptographic Options" for the kernel compiled as modules. 3) You will want the networking IPsec/VLAN options compiled in as modules @@ -23,9 +43,5 @@ 802.1Q VLAN Support -4) The module (vnet_module) will not properly load from the command line - with a "modprobe vnet_module". Use network-vnet to properly configure - your system and load the module for you. - Please refer to the additional documentation found in tools/vnet/doc for proper syntax and config file parameters. diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/Makefile --- a/tools/vnet/Makefile Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/Makefile Fri Aug 26 20:47:16 2005 @@ -1,18 +1,21 @@ +# -*- mode: Makefile; -*- -export LINUX_SERIES ?=2.6 +ifndef VNET_ROOT +export VNET_ROOT = $(shell pwd) +include $(VNET_ROOT)/Make.env +endif -# Root path to install in. -# Set to '/' to install relative to filesystem root. -export prefix?=$(shell cd ../../dist/install && pwd) +.PHONY: all compile install dist clean pristine +.PHONY: gc-all gc-install gc-clean -.PHONY: all compile -.PHONY: gc-install gc-clean gc-prstine -.PHONY: libxutil vnetd vnet-module install dist clean pristine +SUBDIRS:= +SUBDIRS+= examples +SUBDIRS+= gc +SUBDIRS+= libxutil +SUBDIRS+= vnetd +SUBDIRS+= vnet-module all: compile - -compile: libxutil vnetd vnet-module -#compile: vnet-module gc.tar.gz: wget http://www.hpl.hp.com/personal/Hans_Boehm/gc/gc_source/$@ @@ -21,36 +24,39 @@ tar xfz gc.tar.gz ln -sf gc?.? gc -gc-install: gc - (cd gc && make test && ./configure --prefix=`pwd`/install) +$(GC_LIB_A): gc + (cd gc && ./configure --prefix=$(GC_DIR) ) make -C gc - make -C gc install + DESTDIR="" make -C gc install + +gc-all: $(GC_LIB_A) + +gc-install: gc-clean: - -$(MAKE) -C gc clean + -@$(RM) -r gc?.? gc -gc-pristine: - -rm -rf gc?.? gc +submak = $(MAKE) -C $(patsubst %-$(1),%,$(@)) $(1) +subtgt = $(patsubst %,%-$(1),$(SUBDIRS)) -libxutil: - $(MAKE) -C libxutil +%-all: + $(call submak,all) -vnetd: gc-install - $(MAKE) -C vnetd +%-clean: + -$(call submak,clean) -vnet-module: - $(MAKE) -C vnet-module +%-install: + $(call submak,install) -install: compile - $(MAKE) -C libxutil install - $(MAKE) -C vnetd install - $(MAKE) -C vnet-module install - $(MAKE) -C examples install +compile: $(call subtgt,all) -clean: - -$(MAKE) -C libxutil clean - -$(MAKE) -C vnetd clean - -$(MAKE) -C vnet-module clean - -rm -rf gc?.? 
gc +install: DESTDIR= +install: dist -pristine: clean gc-pristine +dist: compile $(call subtgt,install) + +clean: $(call subtgt,clean) + -@$(RM) -r build + +pristine: clean + -@$(RM) gc.tar.gz diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/doc/vnet-module.txt --- a/tools/vnet/doc/vnet-module.txt Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/doc/vnet-module.txt Fri Aug 26 20:47:16 2005 @@ -1,20 +1,33 @@ Vnet Module Command Interface Mike Wray <mike.wray@xxxxxx> -2004/09/17 +2005/08/25 When insmod the vnet-module creates /proc/vnet/policy which can be used to control the module by writing commands into it. The return code from the command should be returned by close. +Xend uses these commands to implement its vnet interface. The commands are: -(vnet.add (id <id>) [(security { none | auth | conf } )] ) +(vnet.add (id <id>) [(vnetif <ifname>)] [(security { none | auth | conf } )] ) Create the vnet with id <id> and the given security level (default none). +Vnet ids are 128-bit and can be specified as 8 fields of 1 to 4 hex digits +separated by colons. A vnet id with no colons is treated as one with the first +7 fields zero. Examples: + +1500 - equivalent to 0:0:0:0:0:0:0:1500 +aaff:0:0:0:0:0:77:88 + Security levels: - none: no security - auth: message authentication (IPSEC hmac) - conf: message confidentiality (IPSEC hmac and encryption) + +The <ifname> is the name of the network device created for the vnet. +If not given it defaults to vnif<N>, where <N> is the hex for the +8-th field in the id. Note that network device names can have a +maximum of 14 characters. (vnet.del (id <id>)) @@ -31,11 +44,17 @@ Remove the vif with MAC address <macaddr> from the vnet with id <vnetid>. The vnet module will stop responding to VARP for the vif. +(vif.print) + +Print the known vnets, vifs and varp cache on the console. + Examples: To create vnet 10 with no security: echo '(vnet.add (id 10))' > /proc/vnet/policy + +This creates a device vnif0010. To create vnet 11 with message authentication: diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/doc/vnet-xend.txt --- a/tools/vnet/doc/vnet-xend.txt Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/doc/vnet-xend.txt Fri Aug 26 20:47:16 2005 @@ -3,11 +3,13 @@ Mike Wray <mike.wray@xxxxxx> +2005/08/25 + 0) Introduction --------------- Vnets provide virtual private LANs for virtual machines. -This is done using bridging and tunneling. A virtual interface +This is done using bridging and multipoint tunneling. A virtual interface on a vnet can only see other interfaces on the same vnet - it cannot see the real network, and the real network cannot see it either. @@ -32,13 +34,16 @@ Restart xend. +Alternatively insert the vnet module using vnet-insert, +preferably before xend starts. + 2) Creating vnets ----------------- Xend already implements commands to add/remove vnets and bridge to them. To add a vnet use -xm call vnet_add <vnet config file> +xm vnet-create <vnet config file> For example, if vnet97.sxp contains: @@ -46,7 +51,7 @@ do -xm call vnet_add vnet97.sxp +xm vnet-create vnet97.sxp This will define a vnet with id 97 and no security. The bridge for the vnet is called vnet97 and the virtual interface for it is vnetif97. @@ -64,31 +69,35 @@ Once configured, vnets are persistent in the xend database. 
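On the vnet id format described in vnet-module.txt above: ids are now 128 bits wide, written as up to 8 colon-separated fields of 1 to 4 hex digits, with a colon-free id standing for one whose first 7 fields are zero. Inside the module this maps onto the VnetId union that the changeset adds to if_varp.h (the hunk appears further below). A sketch of the type with the colon-free shorthand written out -- the struct is copied from the patch, the helper is illustrative only:

    #include <stdint.h>
    #include <string.h>

    /* VnetId as added to if_varp.h: 128 bits, addressable as bytes,
     * 16-bit fields, or 32-bit words. */
    typedef struct VnetId {
        union {
            uint8_t  vnet8[16];
            uint16_t vnet16[8];
            uint32_t vnet32[4];
        } u;
    } __attribute__((packed)) VnetId;

    /* Illustrative: "1500" (no colons) is read as 0:0:0:0:0:0:0:1500,
     * so only the 8th 16-bit field is set; the vnet device is then
     * named from that field's hex value, e.g. vnif1500. */
    static void vnetid_from_short(VnetId *v, uint16_t last_field)
    {
        memset(v, 0, sizeof(*v));
        v->u.vnet16[7] = last_field;   /* byte-order handling elided */
    }

This also lines up with the rename from vnetif<N> to vnif<N> in XendVnet.py earlier in the patch, given the 14-character limit on network device names noted above.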
To remove a vnet use -xm call vnet_delete <vnet id> +xm vnet-delete <vnet id> To list vnets use -xm call vnets +xm vnet-list To get information on a vnet id use -xm call vnet <vnet id> +xm vnet-list <vnet id> 3) Troubleshooting ------------------ The vnet module should appear in 'lsmod'. -If a vnet has been configured it should appear in the output of 'xm call vnets'. +If a vnet has been configured it should appear in the output of 'xm vnet-list'. Its bridge and interface should appear in 'ifconfig'. It should also show in 'brctl show', with its attached interfaces. -You can 'see into' a vnet from dom0 if you put an IP address on the bridge. +You can 'see into' a vnet from dom0 if you put an IP address on the bridge +and configure its MAC address as a vif. For example, if you have vnet97 with a vm with ip addr 10.0.0.12 on it, -then +and <mac> is the MAC address of vnet97 (use ifconfig), then +echo '(vif.add (vnet 97) (vmac <mac>))' >/proc/vnet/policy ifconfig vnet97 10.0.0.20 up should let you ping 10.0.0.12 via the vnet97 bridge. +This works even if the vm with vif 10.0.0.12 is on another +machine (it only works locally if you don't use vif.add). 4) Examples ----------- @@ -104,11 +113,11 @@ (linux (kernel /boot/vmlinuz-2.6-xenU) (ip 10.0.0.12:1.2.3.4::::eth0:off) - (root /dev/hda1) + (root /dev/sda1) (args 'rw fastboot 4') ) ) - (device (vbd (uname phy:hda2) (dev hda1) (mode w))) + (device (vbd (uname phy:hda2) (dev sda1) (mode w))) (device (vif (mac aa:00:00:11:00:12) (bridge vnet97))) ) @@ -123,11 +132,11 @@ (linux (kernel /boot/vmlinuz-2.6-xenU) (ip 10.0.0.11:1.2.3.4::::eth0:off) - (root /dev/hda1) + (root /dev/sda1) (args 'rw fastboot 4') ) ) - (device (vbd (uname phy:hda3) (dev hda1) (mode w))) + (device (vbd (uname phy:hda3) (dev sda1) (mode w))) (device (vif (mac aa:00:00:11:00:11) (bridge vnet97))) ) diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/examples/Makefile --- a/tools/vnet/examples/Makefile Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/examples/Makefile Fri Aug 26 20:47:16 2005 @@ -2,11 +2,14 @@ #============================================================================ XEN_SCRIPT_DIR:=/etc/xen/scripts + +.PHONY: all install clean all: install: install -m 0755 -d $(DESTDIR)$(XEN_SCRIPT_DIR) install -m 0554 network-vnet $(DESTDIR)$(XEN_SCRIPT_DIR) + install -m 0554 vnet-insert $(DESTDIR)$(XEN_SCRIPT_DIR) clean: \ No newline at end of file diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/examples/network-vnet --- a/tools/vnet/examples/network-vnet Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/examples/network-vnet Fri Aug 26 20:47:16 2005 @@ -1,218 +1,10 @@ #!/bin/sh -#============================================================================ -# Default Xen network start/stop script. -# Xend calls a network script when it starts. -# The script name to use is defined in /etc/xen/xend-config.sxp -# in the network-script field. -# -# This script creates a bridge (default xen-br0), adds a device -# (default eth0) to it, copies the IP addresses from the device -# to the bridge and adjusts the routes accordingly. -# -# If all goes well, this should ensure that networking stays up. -# However, some configurations are upset by this, especially -# NFS roots. If the bridged setup does not meet your needs, -# configure a different script, for example using routing instead. -# -# Usage: -# -# network (start|stop|status) {VAR=VAL}* -# -# Vars: -# -# bridge The bridge to use (default xen-br0). -# netdev The interface to add to the bridge (default eth0). 
-# antispoof Whether to use iptables to prevent spoofing (default yes). -# -# start: -# Creates the bridge and enslaves netdev to it. -# Copies the IP addresses from netdev to the bridge. -# Deletes the routes to netdev and adds them on bridge. -# -# stop: -# Removes netdev from the bridge. -# Deletes the routes to bridge and adds them to netdev. -# -# status: -# Print ifconfig for netdev and bridge. -# Print routes. -# -#============================================================================ +scriptdir=/etc/xen/scripts/ -# Exit if anything goes wrong. -set -e +case ${1} in + start) + ${scriptdir}/vnet-insert + ;; +esac -# First arg is the operation. -OP=$1 -shift - -# Pull variables in args in to environment. -for arg ; do export "${arg}" ; done - -bridge=${bridge:-xen-br0} -netdev=${netdev:-eth0} -antispoof=${antispoof:-yes} - -echo "network $OP bridge=$bridge netdev=$netdev antispoof=$antispoof" - -# Usage: transfer_addrs src dst -# Copy all IP addresses (including aliases) from device $src to device $dst. -transfer_addrs () { - local src=$1 - local dst=$2 - # Don't bother if $dst already has IP addresses. - if ip addr show dev ${dst} | egrep -q '^ *inet' ; then - return - fi - # Address lines start with 'inet' and have the device in them. - # Replace 'inet' with 'ip addr add' and change the device name $src - # to 'dev $src'. Remove netmask as we'll add routes later. - ip addr show dev ${src} | egrep '^ *inet' | sed -e " -s/inet/ip addr add/ -s@\([0-9]\+\.[0-9]\+\.[0-9]\+\.[0-9]\+\)/[0-9]\+@\1@ -s/${src}/dev ${dst}/ -" | sh -e -} - -# Usage: transfer_routes src dst -# Get all IP routes to device $src, delete them, and -# add the same routes to device $dst. -# The original routes have to be deleted, otherwise adding them -# for $dst fails (duplicate routes). -transfer_routes () { - local src=$1 - local dst=$2 - # List all routes and grep the ones with $src in. - # Stick 'ip route del' on the front to delete. - # Change $src to $dst and use 'ip route add' to add. - ip route list | grep ${src} | sed -e " -h -s/^/ip route del / -P -g -s/${src}/${dst}/ -s/^/ip route add / -P -d -" | sh -e -} - -# Usage: create_bridge dev bridge -# Create bridge $bridge and add device $dev to it. -create_bridge () { - local dev=$1 - local bridge=$2 - - # Don't create the bridge if it already exists. - if ! brctl show | grep -q ${bridge} ; then - brctl addbr ${bridge} - brctl stp ${bridge} off - brctl setfd ${bridge} 0 - fi - ifconfig ${bridge} up -} - -# Usage: antispoofing dev bridge -# Set the default forwarding policy for $dev to drop. -# Allow forwarding to the bridge. -antispoofing () { - local dev=$1 - local bridge=$2 - - iptables -P FORWARD DROP - iptables -A FORWARD -m physdev --physdev-in ${dev} -j ACCEPT -} - -# Usage: show_status dev bridge -# Print ifconfig and routes. -show_status () { - local dev=$1 - local bridge=$2 - - echo '============================================================' - ifconfig ${dev} - ifconfig ${bridge} - echo ' ' - ip route list - echo ' ' - route -n - echo '============================================================' -} - -# Insert the vnet module if it can be found and -# it's not already there. 
-vnet_insert () { - local module="vnet_module" - local mod_dir=/lib/modules/$(uname -r)/kernel - local mod_path="${mod_dir}/${module}" - local mod_obj="" - - for ext in ".o" ".ko" ; do - f=${mod_path}${ext} - if [ -f ${f} ] ; then - mod_obj=$f - break - fi - done - if [ "${mod_obj}" == "" ] ; then - return - fi - if lsmod | grep -q ${module} ; then - echo "VNET: ${module} loaded" - else - echo "VNET: Loading ${module}..." - insmod ${mod_obj} - fi -} - -op_start () { - if [ "${bridge}" == "null" ] ; then - return - fi - # Create the bridge and give it the interface IP addresses. - # Move the interface routes onto the bridge. - create_bridge ${netdev} ${bridge} - transfer_addrs ${netdev} ${bridge} - transfer_routes ${netdev} ${bridge} - # Don't add $dev to $bridge if it's already on a bridge. - if ! brctl show | grep -q ${netdev} ; then - brctl addif ${bridge} ${netdev} - fi - - if [ ${antispoof} == 'yes' ] ; then - antispoofing ${netdev} ${bridge} - fi - - vnet_insert -} - -op_stop () { - if [ "${bridge}" == "null" ] ; then - return - fi - # Remove the interface from the bridge. - # Move the routes back to the interface. - brctl delif ${bridge} ${netdev} - transfer_routes ${bridge} ${netdev} - - # It's not our place to be enabling forwarding... -} - -case ${OP} in - start) - op_start - ;; - - stop) - op_stop - ;; - - status) - show_status ${netdev} ${bridge} - ;; - - *) - echo 'Unknown command: ' ${OP} - echo 'Valid commands are: start, stop, status' - exit 1 -esac +${scriptdir}/network-bridge "$@" diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/examples/vnet97.sxp --- a/tools/vnet/examples/vnet97.sxp Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/examples/vnet97.sxp Fri Aug 26 20:47:16 2005 @@ -1,3 +1,2 @@ # Vnet configuration for a vnet with id 97 and no security. -# Configure using 'xm call vnet_add vnet97.sxp'. (vnet (id 97) (bridge vnet97) (vnetif vnetif97) (security none)) diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/examples/vnet98.sxp --- a/tools/vnet/examples/vnet98.sxp Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/examples/vnet98.sxp Fri Aug 26 20:47:16 2005 @@ -1,3 +1,2 @@ # Vnet configuration for a vnet with id 98 and message authentication. -# Configure using 'xm call vnet_add vnet98.sxp'. (vnet (id 98) (bridge vnet98) (vnetif vnetif98) (security auth)) diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/examples/vnet99.sxp --- a/tools/vnet/examples/vnet99.sxp Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/examples/vnet99.sxp Fri Aug 26 20:47:16 2005 @@ -1,3 +1,2 @@ # Vnet configuration for a vnet with id 99 and message confidentiality. -# Configure using 'xm call vnet_add vnet99.sxp'. (vnet (id 99) (bridge vnet99) (vnetif vnetif99) (security conf)) diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/libxutil/Makefile --- a/tools/vnet/libxutil/Makefile Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/libxutil/Makefile Fri Aug 26 20:47:16 2005 @@ -1,5 +1,8 @@ +ifndef VNET_ROOT +export VNET_ROOT = $(shell cd .. && pwd) +include $(VNET_ROOT)/Make.env +endif -XEN_ROOT = ../../.. INSTALL = install INSTALL_DATA = $(INSTALL) -m0644 INSTALL_PROG = $(INSTALL) -m0755 @@ -15,6 +18,7 @@ LIB_SRCS += hash_table.c LIB_SRCS += iostream.c LIB_SRCS += lexis.c +LIB_SRCS += mem_stream.c LIB_SRCS += string_stream.c LIB_SRCS += sxpr.c LIB_SRCS += sxpr_parser.c @@ -26,6 +30,7 @@ PIC_OBJS := $(LIB_SRCS:.c=.opic) CFLAGS += -Wall -Werror -O3 -fno-strict-aliasing +CFLAGS += -g # Get gcc to generate the dependencies for us. 
CFLAGS += -Wp,-MD,.$(@F).d @@ -39,6 +44,7 @@ LIB += libxutil.a all: build + build: check-for-zlib $(MAKE) $(LIB) @@ -70,8 +76,8 @@ ln -sf libxutil.so.$(MAJOR) $(DESTDIR)/usr/$(LIBDIR)/libxutil.so clean: - $(RM) *.a *.so* *.o *.opic *.rpm - $(RM) *~ - $(RM) $(DEPS) + -@$(RM) *.a *.so* *.o *.opic *.rpm + -@$(RM) *~ + -@$(RM) $(DEPS) -include $(DEPS) diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/libxutil/debug.h --- a/tools/vnet/libxutil/debug.h Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/libxutil/debug.h Fri Aug 26 20:47:16 2005 @@ -49,9 +49,9 @@ #ifdef DEBUG #define dprintf(fmt, args...) fprintf(stdout, "%d [DBG] " MODULE_NAME ">%s" fmt, getpid(), __FUNCTION__, ##args) -#define wprintf(fmt, args...) fprintf(stderr, "%d [WRN] " MODULE_NAME ">%s" fmt, getpid(),__FUNCTION__, ##args) -#define iprintf(fmt, args...) fprintf(stderr, "%d [INF] " MODULE_NAME ">%s" fmt, getpid(),__FUNCTION__, ##args) -#define eprintf(fmt, args...) fprintf(stderr, "%d [ERR] " MODULE_NAME ">%s" fmt, getpid(),__FUNCTION__, ##args) +#define wprintf(fmt, args...) fprintf(stderr, "%d [WRN] " MODULE_NAME ">%s" fmt, getpid(), __FUNCTION__, ##args) +#define iprintf(fmt, args...) fprintf(stderr, "%d [INF] " MODULE_NAME ">%s" fmt, getpid(), __FUNCTION__, ##args) +#define eprintf(fmt, args...) fprintf(stderr, "%d [ERR] " MODULE_NAME ">%s" fmt, getpid(), __FUNCTION__, ##args) #else diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/libxutil/sxpr.c --- a/tools/vnet/libxutil/sxpr.c Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/libxutil/sxpr.c Fri Aug 26 20:47:16 2005 @@ -405,7 +405,6 @@ #endif /* USE_GC */ /** Create a new atom with the given name. - * Makes an integer sxpr if the name can be parsed as an int. * * @param name the name * @return new atom @@ -414,7 +413,8 @@ Sxpr n, obj = ONOMEM; long v; - if(convert_atol(name, &v) == 0){ + // Don't always want to do this. + if(0 && convert_atol(name, &v) == 0){ obj = OINT(v); } else { n = string_new(name); diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/libxutil/sxpr.h --- a/tools/vnet/libxutil/sxpr.h Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/libxutil/sxpr.h Fri Aug 26 20:47:16 2005 @@ -228,7 +228,9 @@ * * @param val pointer */ -#define PTR(val) OBJP(T_UINT, (void*)(val)) +static inline Sxpr PTR(void *val){ + return OBJP(T_UINT, (void*)(val)); +} /** Allocate some memory and return an sxpr containing it. * Returns ONOMEM if allocation failed. @@ -237,7 +239,9 @@ * @param ty typecode * @return sxpr */ -#define halloc(_n, _ty) OBJP(_ty, allocate(_n)) +static inline Sxpr halloc(int n, int ty){ + return OBJP(ty, allocate(n)); +} /** Allocate an sxpr containing a pointer to the given type. * diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/libxutil/sxpr_parser.c --- a/tools/vnet/libxutil/sxpr_parser.c Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/libxutil/sxpr_parser.c Fri Aug 26 20:47:16 2005 @@ -472,7 +472,14 @@ } int Parser_atom(Parser *p){ - Sxpr obj = atom_new(peek_token(p)); + Sxpr obj; + long v; + if(Parser_flags(p, PARSE_INT) && + convert_atol(peek_token(p), &v) == 0){ + obj = OINT(v); + } else { + obj = atom_new(peek_token(p)); + } return Parser_set_value(p, obj); } diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/libxutil/sxpr_parser.h --- a/tools/vnet/libxutil/sxpr_parser.h Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/libxutil/sxpr_parser.h Fri Aug 26 20:47:16 2005 @@ -89,15 +89,17 @@ /** Parser flags. */ -//enum { -//}; +enum { + /** Convert integer atoms to ints. */ + PARSE_INT=1, +}; /** Raise some parser flags. 
* * @param in parser * @param flags flags mask */ -inline static void Parser_flags_raise(Parser *in, int flags){ +static inline void Parser_flags_raise(Parser *in, int flags){ in->flags |= flags; } @@ -106,7 +108,7 @@ * @param in parser * @param flags flags mask */ -inline static void Parser_flags_lower(Parser *in, int flags){ +static inline void Parser_flags_lower(Parser *in, int flags){ in->flags &= ~flags; } @@ -114,8 +116,12 @@ * * @param in parser */ -inline static void Parser_flags_clear(Parser *in){ +static inline void Parser_flags_clear(Parser *in){ in->flags = 0; +} + +static inline int Parser_flags(Parser *in, int flags){ + return in->flags & flags; } extern void Parser_free(Parser *z); diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/libxutil/sys_string.c --- a/tools/vnet/libxutil/sys_string.c Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/libxutil/sys_string.c Fri Aug 26 20:47:16 2005 @@ -27,6 +27,31 @@ #include "allocate.h" #include "sys_string.h" + +#ifdef __KERNEL__ + +#define deferr(_err) case _err: return #_err + +extern char *strerror(int err) +{ + switch(err){ + deferr(EPERM); + deferr(ENOENT); + deferr(ESRCH); + deferr(EINTR); + deferr(EIO); + deferr(EINVAL); + deferr(ENOMEM); + deferr(EACCES); + deferr(EFAULT); + deferr(EBUSY); + + default: + return "ERROR"; + } +} + +#endif /** Set the base to use for converting a string to a number. Base is * hex if starts with 0x, otherwise decimal. diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/libxutil/sys_string.h --- a/tools/vnet/libxutil/sys_string.h Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/libxutil/sys_string.h Fri Aug 26 20:47:16 2005 @@ -31,6 +31,8 @@ #include <linux/types.h> #include <stdarg.h> #include "allocate.h" + +extern char *strerror(int err); #if 0 static inline int tolower(int c){ diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/vnet-module/00README --- a/tools/vnet/vnet-module/00README Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/vnet-module/00README Fri Aug 26 20:47:16 2005 @@ -6,11 +6,11 @@ The makefiles use the following variables, which can be set in your env or on the make command line: -LINUX_SERIES: linux release to compile for, 2.4 (default), or 2.6. -XEN_ROOT: root of the xen tree containing kernel source. +LINUX_SERIES: linux release to compile for: 2.4, or 2.6 (default). +XEN_ROOT: root of the xen tree containing kernel source. KERNEL_VERSION: kernel version, default got from XEN_ROOT. -KERNEL_MINOR: kernel minor version, default -xen0. -KERNEL_SRC: path to kernel source, default linux-<VERSION> under XEN_ROOT. +KERNEL_MINOR: kernel minor version, default -xen0. +KERNEL_SRC: path to kernel source, default linux-<VERSION> under XEN_ROOT. *) For 2.4 kernel diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/vnet-module/Makefile --- a/tools/vnet/vnet-module/Makefile Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/vnet-module/Makefile Fri Aug 26 20:47:16 2005 @@ -18,9 +18,13 @@ # 59 Temple Place, suite 330, Boston, MA 02111-1307 USA #============================================================================ +ifndef VNET_ROOT +export VNET_ROOT = $(shell cd .. && pwd) +include $(VNET_ROOT)/Make.env +endif + #============================================================================ ifeq ($(src),) -LINUX_SERIES ?=2.6 include Makefile-$(LINUX_SERIES) @@ -45,7 +49,7 @@ # Setup explicit rules for them using the kbuild C compile rule. # File names in the lib dir. -remote_srcs = $(foreach file,$(VNET_LIB_SRC),$(LIB_DIR)/$(file)) +remote_srcs = $(foreach file,$(VNET_LIB_SRC),$(LIBXUTIL_DIR)/$(file)) # Equivalent file names here. 
local_srcs = $(foreach file,$(VNET_LIB_SRC),$(src)/$(file)) @@ -54,12 +58,12 @@ local_objs = $(local_srcs:.c=.o) # Make the local objects depend on compiling the remote sources. -$(local_objs): $(src)/%.o: $(LIB_DIR)/%.c +$(local_objs): $(src)/%.o: $(LIBXUTIL_DIR)/%.c $(call if_changed_rule,cc_o_c) #---------------------------------------------------------------------------- -vpath %.h $(LIB_DIR) -EXTRA_CFLAGS += -I $(LIB_DIR) +vpath %.h $(LIBXUTIL_DIR) +EXTRA_CFLAGS += -I $(LIBXUTIL_DIR) EXTRA_CFLAGS += -I $(src) endif diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/vnet-module/Makefile-2.4 --- a/tools/vnet/vnet-module/Makefile-2.4 Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/vnet-module/Makefile-2.4 Fri Aug 26 20:47:16 2005 @@ -21,7 +21,7 @@ #============================================================================ # Vnet module makefile for 2.4 series kernels. -LINUX_SERIES ?=2.4 +LINUX_SERIES =2.4 include Makefile.ver KERNEL_MODULE := vnet_module.o @@ -37,9 +37,9 @@ vpath %.h $(KERNEL_SRC)/include INCLUDES+= -I $(KERNEL_SRC)/include -vpath %.h $(LIB_DIR) -vpath %.c $(LIB_DIR) -INCLUDES += -I $(LIB_DIR) +vpath %.h $(LIBXUTIL_DIR) +vpath %.c $(LIBXUTIL_DIR) +INCLUDES += -I $(LIBXUTIL_DIR) INCLUDES+= -I . @@ -61,6 +61,7 @@ CFLAGS += -Wno-unused-function CFLAGS += -Wno-unused-parameter +CFLAGS += -g CFLAGS += -O2 CFLAGS += -fno-strict-aliasing CFLAGS += -fno-common @@ -90,8 +91,8 @@ .PHONY: clean clean: - @rm -f *.a *.o *.ko *~ - @rm -f $(VNET_DEP) .*.cmd *.mod.? - @rm -rf .tmp_versions + -@$(RM) *.a *.o *.ko *~ + -@$(RM) $(VNET_DEP) .*.cmd *.mod.? + -@$(RM) -r .tmp_versions -include $(VNET_DEP) diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/vnet-module/Makefile-2.6 --- a/tools/vnet/vnet-module/Makefile-2.6 Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/vnet-module/Makefile-2.6 Fri Aug 26 20:47:16 2005 @@ -21,7 +21,7 @@ #============================================================================ # Vnet module makefile for 2.6 series kernels. -LINUX_SERIES ?=2.6 +LINUX_SERIES =2.6 include Makefile.ver KERNEL_MODULE = vnet_module.ko @@ -38,13 +38,14 @@ .PHONY: install install-module modules_install install install-module modules_install: module - install -m 0755 -d $(DESTDIR)$(KERNEL_MODULE_DIR)/xen - install -m 0554 $(KERNEL_MODULE) $(DESTDIR)$(KERNEL_MODULE_DIR)/xen + install -m 0755 -d $(DESTDIR)$(KERNEL_MODULE_DIR) + install -m 0554 $(KERNEL_MODULE) $(DESTDIR)$(KERNEL_MODULE_DIR) .PHONY: clean clean: - @$(MAKE) -C $(KERNEL_SRC) M=$(PWD) clean - @rm -f *.a *.o *.ko *~ .*.d .*.cmd *.mod.? + -@$(MAKE) -C $(KERNEL_SRC) M=$(PWD) clean + -@$(RM) *.a *.o *.ko *~ .*.d .*.cmd *.mod.? + -@$(RM) -r .tmp_versions TAGS: etags *.c *.h diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/vnet-module/Makefile.ver --- a/tools/vnet/vnet-module/Makefile.ver Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/vnet-module/Makefile.ver Fri Aug 26 20:47:16 2005 @@ -18,22 +18,11 @@ # 59 Temple Place, suite 330, Boston, MA 02111-1307 USA #============================================================================ -#---------------------------------------------------------------------------- -# Xeno/xen. - -# Root of xen tree. -XEN_ROOT ?=../../.. - -# Path to relativize the install. Set to / -# to install relative to filesystem root. 
-prefix ?=$(XEN_ROOT)/install/ -#---------------------------------------------------------------------------- - LINUX_SERIES ?=2.6 KERNEL_MINOR ?=-xen0 -LINUX_VERSION ?= $(shell ( /bin/ls -ld $(XEN_ROOT)/linux-$(LINUX_SERIES).*-xen-sparse ) 2>/dev/null | \ - sed -e 's!^.*linux-\(.\+\)-xen-sparse!\1!' ) +LINUX_VERSION ?= $(shell ( /bin/ls -ld $(XEN_ROOT)/linux-$(LINUX_SERIES).*-xen0 ) 2>/dev/null | \ + sed -e 's!^.*linux-\(.\+\)-xen0!\1!' ) ifeq ($(LINUX_VERSION),) $(error Kernel source for linux $(LINUX_SERIES) not found) diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/vnet-module/Makefile.vnet --- a/tools/vnet/vnet-module/Makefile.vnet Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/vnet-module/Makefile.vnet Fri Aug 26 20:47:16 2005 @@ -23,8 +23,6 @@ else SRC_DIR=$(src)/ endif - -LIB_DIR := $(SRC_DIR)../libxutil VNET_SRC := VNET_SRC += esp.c diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/vnet-module/etherip.c --- a/tools/vnet/vnet-module/etherip.c Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/vnet-module/etherip.c Fri Aug 26 20:47:16 2005 @@ -42,6 +42,7 @@ #include <vnet.h> #include <varp.h> #include <if_varp.h> +#include <varp.h> #include <skb_util.h> #define MODULE_NAME "VNET" @@ -53,22 +54,18 @@ * The etherip protocol is used to transport Ethernet frames in IP packets. */ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) -#define MAC_ETH(_skb) ((struct ethhdr *)(_skb)->mac.raw) +/** Get the vnet label from an etherip header. + * + * @param hdr header + * @@param vnet (in net order) + */ +void etheriphdr_get_vnet(struct etheriphdr *hdr, VnetId *vnet){ +#ifdef CONFIG_ETHERIP_EXT + *vnet = *(VnetId*)hdr->vnet; #else -#define MAC_ETH(_skb) ((_skb)->mac.ethernet) -#endif - -/** Get the vnet label from an etherip header. - * - * @param hdr header - * @return vnet (in host order) - */ -int etheriphdr_get_vnet(struct etheriphdr *hdr){ -#ifdef CONFIG_ETHERIP_EXT - return ntohl(hdr->vnet); -#else - return hdr->reserved; + *vnet = (VnetId){}; + vnet->u.vnet16[7] = (unsigned short)hdr->reserved; + #endif } @@ -76,15 +73,15 @@ * Also sets the etherip version. * * @param hdr header - * @param vnet vnet label (in host order) - */ -void etheriphdr_set_vnet(struct etheriphdr *hdr, int vnet){ + * @param vnet vnet label (in net order) + */ +void etheriphdr_set_vnet(struct etheriphdr *hdr, VnetId *vnet){ #ifdef CONFIG_ETHERIP_EXT - hdr->version = 4; - hdr->vnet = htonl(vnet); + hdr->version = ETHERIP_VERSION; + *(VnetId*)hdr->vnet = *vnet; #else - hdr->version = 3; - hdr->reserved = vnet & 0x0fff; + hdr->version = ETHERIP_VERSION; + hdr->reserved = (vnet->u.vnet16[7] & 0x0fff); #endif } @@ -119,12 +116,12 @@ const int ip_n = sizeof(struct iphdr); const int eth_n = ETH_HLEN; int head_n = 0; - int vnet = tunnel->key.vnet; + VnetId *vnet = &tunnel->key.vnet; struct etheriphdr *etheriph; struct ethhdr *ethh; u32 saddr = 0; - dprintf("> skb=%p vnet=%d\n", skb, vnet); + //dprintf("> skb=%p vnet=%d\n", skb, vnet); head_n = etherip_n + ip_n + eth_n; err = skb_make_room(&skb, skb, head_n, 0); if(err) goto exit; @@ -133,7 +130,7 @@ //if(err) goto exit; // The original ethernet header. - ethh = MAC_ETH(skb); + ethh = eth_hdr(skb); //print_skb_data(__FUNCTION__, 0, skb, skb->mac.raw, skb->len); // Null the pointer as we are pushing a new IP header. skb->mac.raw = NULL; @@ -155,7 +152,7 @@ skb->nh.iph->ttl = 64; // Linux default time-to-live. skb->nh.iph->protocol = IPPROTO_ETHERIP; // IP protocol number. skb->nh.iph->saddr = saddr; // Source address. - skb->nh.iph->daddr = tunnel->key.addr; // Destination address. 
+ skb->nh.iph->daddr = tunnel->key.addr.u.ip4.s_addr; // Destination address. skb->nh.iph->check = 0; // Ethernet header will be filled-in by device. @@ -213,15 +210,18 @@ struct etheriphdr *etheriph; struct ethhdr *ethhdr; Vnet *vinfo = NULL; - u32 vnet; - - ethhdr = MAC_ETH(skb); - if(MULTICAST(skb->nh.iph->daddr) && - (skb->nh.iph->daddr != varp_mcast_addr)){ + VnetId vnet = {}; + u32 saddr, daddr; + char vnetbuf[VNET_ID_BUF]; + + saddr = skb->nh.iph->saddr; + daddr = skb->nh.iph->daddr; + ethhdr = eth_hdr(skb); + if(MULTICAST(daddr) && (daddr != varp_mcast_addr)){ // Ignore multicast packets not addressed to us. - dprintf("> dst=%u.%u.%u.%u varp_mcast_addr=%u.%u.%u.%u\n", - NIPQUAD(skb->nh.iph->daddr), - NIPQUAD(varp_mcast_addr)); + dprintf("> Ignoring mcast skb: src=%u.%u.%u.%u dst=%u.%u.%u.%u" + " varp_mcast_addr=%u.%u.%u.%u\n", + NIPQUAD(saddr), NIPQUAD(daddr), NIPQUAD(varp_mcast_addr)); goto exit; } ip_n = (skb->nh.iph->ihl << 2); @@ -229,7 +229,8 @@ // skb->data points at ethernet header. //dprintf("> len=%d\n", skb->len); if (!pskb_may_pull(skb, eth_n + ip_n)){ - wprintf("> Malformed skb\n"); + wprintf("> Malformed skb (eth+ip) src=%u.%u.%u.%u\n", + NIPQUAD(saddr)); err = -EINVAL; goto exit; } @@ -237,18 +238,30 @@ } // Assume skb->data points at etherip header. etheriph = (void*)skb->data; - if(!pskb_may_pull(skb, etherip_n)){ - wprintf("> Malformed skb\n"); + if(etheriph->version != ETHERIP_VERSION){ + wprintf("> Bad etherip version=%d src=%u.%u.%u.%u\n", + etheriph->version, + NIPQUAD(saddr)); err = -EINVAL; goto exit; } - vnet = etheriphdr_get_vnet(etheriph); - dprintf("> Rcvd skb=%p vnet=%d\n", skb, vnet); + if(!pskb_may_pull(skb, etherip_n)){ + wprintf("> Malformed skb (etherip) src=%u.%u.%u.%u\n", + NIPQUAD(saddr)); + err = -EINVAL; + goto exit; + } + etheriphdr_get_vnet(etheriph, &vnet); + dprintf("> Rcvd skb vnet=%s src=%u.%u.%u.%u\n", + VnetId_ntoa(&vnet, vnetbuf), + NIPQUAD(saddr)); // If vnet is secure, context must include IPSEC ESP. - err = vnet_check_context(vnet, SKB_CONTEXT(skb), &vinfo); + err = vnet_check_context(&vnet, SKB_CONTEXT(skb), &vinfo); Vnet_decref(vinfo); if(err){ - wprintf("> Failed security check\n"); + wprintf("> Failed security check vnet=%s src=%u.%u.%u.%u\n", + VnetId_ntoa(&vnet, vnetbuf), + NIPQUAD(saddr)); goto exit; } mine = 1; @@ -258,19 +271,29 @@ // Know source ip, vnet, vmac, so could update varp cache. // But if traffic comes to us over a vnetd tunnel this points the coa // at the vnetd rather than the endpoint. So don't do it. - //varp_update(htonl(vnet), MAC_ETH(skb)->h_source, skb->nh.iph->saddr); + //varp_update(vnet, eth_hdr(skb)->h_source, skb->nh.iph->saddr); // Assuming a standard Ethernet frame. + // Should check for protocol? Support ETH_P_8021Q too. skb->nh.raw = skb_pull(skb, ETH_HLEN); + + dprintf("> Unpacked vnet=%s srcmac=" MACFMT " dstmac=" MACFMT "\n", + VnetId_ntoa(&vnet, vnetbuf), + MAC6TUPLE(eth_hdr(skb)->h_source), + MAC6TUPLE(eth_hdr(skb)->h_dest)); #ifdef CONFIG_NETFILTER #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) // This stops our new pkt header being clobbered by a subsequent - // call to nf_bridge_maybe_copy_header. Just replicate the - // corresponding nf_bridge_save_header. + // call to nf_bridge_maybe_copy_header. + // Code from nf_bridge_save_header() modidifed to use h_proto + // instead of skb->protocol. if(skb->nf_bridge){ + // Hmm. Standard ethernet header is ETH_HLEN (14), + // VLAN header (802.1q) is VLAN_ETH_HLEN (18). + // Where does 16 come from? 
int header_size = 16; - if(MAC_ETH(skb)->h_proto == __constant_htons(ETH_P_8021Q)) { + if(eth_hdr(skb)->h_proto == __constant_htons(ETH_P_8021Q)) { header_size = 18; } memcpy(skb->nf_bridge->data, skb->data - header_size, header_size); @@ -279,7 +302,7 @@ #endif if(1){ - struct ethhdr *eth = MAC_ETH(skb); + struct ethhdr *eth = eth_hdr(skb); // Devices use eth_type_trans() to set skb->pkt_type and skb->protocol. // Set them from contained ethhdr, or leave as received? // 'Ware use of hard_header_len in eth_type_trans(). @@ -310,6 +333,7 @@ } dst_release(skb->dst); skb->dst = NULL; + #ifdef CONFIG_NETFILTER nf_conntrack_put(skb->nfct); skb->nfct = NULL; @@ -321,7 +345,7 @@ //print_skb_data(__FUNCTION__, 0, skb, skb->mac.raw, skb->len + ETH_HLEN); - err = vnet_skb_recv(skb, vnet, (Vmac*)MAC_ETH(skb)->h_dest); + err = vnet_skb_recv(skb, &vnet, (Vmac*)eth_hdr(skb)->h_dest); exit: if(mine) err = 1; dprintf("< skb=%p err=%d\n", skb, err); diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/vnet-module/if_etherip.h --- a/tools/vnet/vnet-module/if_etherip.h Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/vnet-module/if_etherip.h Fri Aug 26 20:47:16 2005 @@ -18,15 +18,30 @@ */ #ifndef _VNET_IF_ETHERIP_H_ #define _VNET_IF_ETHERIP_H_ -/*----------------------------------------------------------------------------*/ + +#define CONFIG_ETHERIP_EXT + #ifdef CONFIG_ETHERIP_EXT + +#define ETHERIP_VERSION 4 + struct etheriphdr { - __u8 version; - __u32 vnet; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u16 reserved:12, + version:4; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u16 version:4, + reserved:12; +#else +#error "Please fix <asm/byteorder.h>" +#endif + __u8 vnet[16]; } __attribute__ ((packed)); -/*----------------------------------------------------------------------------*/ #else + +#define ETHERIP_VERSION 3 + struct etheriphdr { #if defined(__LITTLE_ENDIAN_BITFIELD) @@ -42,10 +57,9 @@ }; #endif + #ifndef IPPROTO_ETHERIP #define IPPROTO_ETHERIP 97 #endif -/*----------------------------------------------------------------------------*/ - #endif /* ! _VNET_IF_ETHERIP_H_ */ diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/vnet-module/if_varp.h --- a/tools/vnet/vnet-module/if_varp.h Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/vnet-module/if_varp.h Fri Aug 26 20:47:16 2005 @@ -20,6 +20,14 @@ #ifndef _VNET_IF_VARP_H #define _VNET_IF_VARP_H +/* Need struct in_addr, struct in6_addr. */ +#ifdef __KERNEL__ +#include <linux/in.h> +#include <linux/in6.h> +#else +#include <netinet/in.h> +#endif + typedef struct Vmac { unsigned char mac[ETH_ALEN]; } Vmac; @@ -30,17 +38,35 @@ VARP_OP_ANNOUNCE = 2, }; +typedef struct VnetId { + union { + uint8_t vnet8[16]; + uint16_t vnet16[8]; + uint32_t vnet32[4]; + } u; +} __attribute__((packed)) VnetId; + +typedef struct VarpAddr { + uint8_t family; // AF_INET or AF_INET6. + union { + uint8_t raw[16]; + struct in_addr ip4; + struct in6_addr ip6; + } u; +} __attribute__((packed)) VarpAddr; + typedef struct VnetMsgHdr { uint16_t id; uint16_t opcode; } __attribute__((packed)) VnetMsgHdr; typedef struct VarpHdr { - VnetMsgHdr vnetmsghdr; - uint32_t vnet; - Vmac vmac; - uint32_t addr; + VnetMsgHdr hdr; + VnetId vnet; + Vmac vmac; + VarpAddr addr; } __attribute__((packed)) VarpHdr; + /** Default address for varp/vnet broadcasts: 224.10.0.1 */ #define VARP_MCAST_ADDR 0xe00a0001 @@ -48,6 +74,4 @@ /** UDP port to use for varp protocol. */ #define VARP_PORT 1798 - - -#endif /* ! _VNET_IF_VARP_H */ +#endif /* ! 
_VNET_IF_VARP_H */ diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/vnet-module/skb_util.h --- a/tools/vnet/vnet-module/skb_util.h Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/vnet-module/skb_util.h Fri Aug 26 20:47:16 2005 @@ -19,7 +19,9 @@ #ifndef _VNET_SKB_UTIL_H_ #define _VNET_SKB_UTIL_H_ -struct sk_buff; +#include <net/route.h> +#include <linux/skbuff.h> + struct scatterlist; extern int skb_make_room(struct sk_buff **pskb, struct sk_buff *skb, int head_n, int tail_n); @@ -40,4 +42,53 @@ extern void print_skb_data(char *msg, int count, struct sk_buff *skb, u8 *data, int len); +/* The mac.ethernet field went away in 2.6 in favour of eth_hdr(). + */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) +#else +static inline struct ethhdr *eth_hdr(const struct sk_buff *skb) +{ + return (struct ethhdr *)skb->mac.raw; +} #endif + + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + +static inline int skb_route(struct sk_buff *skb, struct rtable **prt){ + int err = 0; + struct flowi fl = { + .nl_u = { + .ip4_u = { + .daddr = skb->nh.iph->daddr, + .saddr = skb->nh.iph->saddr, + .tos = skb->nh.iph->tos, + } + } + }; + + if(skb->dev){ + fl.oif = skb->dev->ifindex; + } + err = ip_route_output_key(prt, &fl); + return err; +} + +#else + +static inline int skb_route(struct sk_buff *skb, struct rtable **prt){ + int err = 0; + struct rt_key key = { }; + key.dst = skb->nh.iph->daddr; + key.src = skb->nh.iph->saddr; + key.tos = skb->nh.iph->tos; + if(skb->dev){ + key.oif = skb->dev->ifindex; + } + err = ip_route_output_key(prt, &key); + return err; +} + +#endif + +#endif diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/vnet-module/tunnel.c --- a/tools/vnet/vnet-module/tunnel.c Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/vnet-module/tunnel.c Fri Aug 26 20:47:16 2005 @@ -36,7 +36,7 @@ #include "hash_table.h" #define MODULE_NAME "VNET" -//#define DEBUG 1 +#define DEBUG 1 #undef DEBUG #include "debug.h" @@ -56,11 +56,9 @@ } } -int Tunnel_create(TunnelType *type, u32 vnet, u32 addr, Tunnel *base, Tunnel **val){ +int Tunnel_create(TunnelType *type, VnetId *vnet, VarpAddr *addr, Tunnel *base, Tunnel **val){ int err = 0; Tunnel *tunnel = NULL; - dprintf("> type=%s vnet=%d addr=" IPFMT " base=%s\n", - type->name, vnet, NIPQUAD(addr), (base ? 
base->type->name : "ip")); if(!type || !type->open || !type->send || !type->close){ err = -EINVAL; goto exit; @@ -71,8 +69,8 @@ goto exit; } atomic_set(&tunnel->refcount, 1); - tunnel->key.vnet = vnet; - tunnel->key.addr = addr; + tunnel->key.vnet = *vnet; + tunnel->key.addr = *addr; tunnel->type = type; tunnel->data = NULL; tunnel->send_stats = (TunnelStats){}; @@ -89,7 +87,7 @@ return err; } -int Tunnel_open(TunnelType *type, u32 vnet, u32 addr, Tunnel *base, Tunnel **tunnel){ +int Tunnel_open(TunnelType *type, VnetId *vnet, VarpAddr *addr, Tunnel *base, Tunnel **tunnel){ int err = 0; dprintf(">\n"); @@ -123,15 +121,16 @@ static inline Hashcode tunnel_table_key_hash_fn(void *k){ TunnelKey *key = k; Hashcode h = 0; - h = hash_2ul(key->vnet, key->addr); + h = VnetId_hash(h, &key->vnet); + h = VarpAddr_hash(h, &key->addr); return h; } static int tunnel_table_key_equal_fn(void *k1, void *k2){ TunnelKey *key1 = k1; TunnelKey *key2 = k2; - return (key1->vnet == key2->vnet) - && (key1->addr == key2->addr); + return VnetId_eq(&key1->vnet, &key2->vnet) && + VarpAddr_eq(&key1->addr, &key2->addr); } static void tunnel_table_entry_free_fn(HashTable *table, HTEntry *entry){ @@ -165,9 +164,9 @@ * @param addr destination address * @return tunnel state or NULL */ -Tunnel * Tunnel_lookup(u32 vnet, u32 addr){ +Tunnel * Tunnel_lookup(VnetId *vnet, VarpAddr *addr){ Tunnel *tunnel = NULL; - TunnelKey key = {.vnet = vnet, .addr = addr }; + TunnelKey key = {.vnet = *vnet, .addr = *addr }; dprintf(">\n"); tunnel = HashTable_get(tunnel_table, &key); Tunnel_incref(tunnel); @@ -199,23 +198,16 @@ */ int Tunnel_send(Tunnel *tunnel, struct sk_buff *skb){ int err = 0; - int len; dprintf("> tunnel=%p skb=%p\n", tunnel, skb); - len = skb->len; if(tunnel){ + int len = skb->len; dprintf("> type=%s type->send...\n", tunnel->type->name); + // Must not refer to skb after sending - might have been freed. err = tunnel->type->send(tunnel, skb); - // Must not refer to skb after sending - might have been freed. 
TunnelStats_update(&tunnel->send_stats, len, err); } else { - struct net_device *dev = NULL; - err = vnet_get_device(DEVICE, &dev); - if(err) goto exit; - skb->dev = dev; err = skb_xmit(skb); - dev_put(dev); - } - exit: + } dprintf("< err=%d\n", err); return err; } @@ -225,4 +217,8 @@ } void __exit tunnel_module_exit(void){ -} + if(tunnel_table){ + HashTable_free(tunnel_table); + tunnel_table = NULL; + } +} diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/vnet-module/tunnel.h --- a/tools/vnet/vnet-module/tunnel.h Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/vnet-module/tunnel.h Fri Aug 26 20:47:16 2005 @@ -22,6 +22,7 @@ #include <linux/types.h> #include <linux/slab.h> #include <asm/atomic.h> +#include <if_varp.h> struct sk_buff; struct Tunnel; @@ -41,8 +42,8 @@ } TunnelStats; typedef struct TunnelKey { - u32 vnet; - u32 addr; + VnetId vnet; + VarpAddr addr; } TunnelKey; typedef struct Tunnel { @@ -87,13 +88,15 @@ } extern int Tunnel_init(void); -extern Tunnel * Tunnel_lookup(u32 vnet, u32 addr); +extern Tunnel * Tunnel_lookup(struct VnetId *vnet, struct VarpAddr *addr); extern int Tunnel_add(Tunnel *tunnel); extern int Tunnel_del(Tunnel *tunnel); extern int Tunnel_send(Tunnel *tunnel, struct sk_buff *skb); -extern int Tunnel_create(TunnelType *type, u32 vnet, u32 addr, Tunnel *base, Tunnel **tunnelp); -extern int Tunnel_open(TunnelType *type, u32 vnet, u32 addr, Tunnel *base, Tunnel **tunnelp); +extern int Tunnel_create(TunnelType *type, struct VnetId *vnet, struct VarpAddr *addr, + Tunnel *base, Tunnel **tunnelp); +extern int Tunnel_open(TunnelType *type, struct VnetId *vnet, struct VarpAddr *addr, + Tunnel *base, Tunnel **tunnelp); extern int tunnel_module_init(void); extern void tunnel_module_exit(void); diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/vnet-module/varp.c --- a/tools/vnet/vnet-module/varp.c Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/vnet-module/varp.c Fri Aug 26 20:47:16 2005 @@ -40,26 +40,20 @@ #include <tunnel.h> #include <vnet.h> #include <vif.h> +#include <if_varp.h> #include <varp.h> -#include <if_varp.h> +#include <vnet.h> #include "allocate.h" #include "hash_table.h" #include "sys_net.h" #include "sys_string.h" +#include "skb_util.h" #define MODULE_NAME "VARP" -//#define DEBUG 1 +#define DEBUG 1 #undef DEBUG #include "debug.h" - - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) -// The 'ethernet' field in the skb->mac union went away. -#define MAC_ETH(_skb) ((struct ethhdr *)(_skb)->mac.raw) -#else -#define MAC_ETH(_skb) ((_skb)->mac.ethernet) -#endif /** @file VARP: Virtual ARP. * @@ -121,8 +115,8 @@ /** Key for varp entries. */ typedef struct VarpKey { - /** Vnet id (host order). */ - u32 vnet; + /** Vnet id (network order). */ + VnetId vnet; /** Virtual MAC address. */ Vmac vmac; } VarpKey; @@ -132,7 +126,7 @@ /** Key for the entry. */ VarpKey key; /** Care-of address for the key. */ - u32 addr; + VarpAddr addr; /** Last-updated timestamp. */ unsigned long timestamp; /** State. */ @@ -152,8 +146,6 @@ struct sk_buff_head queue; /** Maximum size of the queue. */ int queue_max; - - int locks; } VarpEntry; /** The varp cache. Varp entries indexed by VarpKey. */ @@ -181,14 +173,10 @@ /** Multicast address (network order). */ u32 varp_mcast_addr = 0; -/** Unicast address (network order). */ -u32 varp_ucast_addr = 0; - /** UDP port (network order). */ u16 varp_port = 0; -/** Network device to use. 
*/ -char *varp_device = DEVICE; +char *varp_device = "xen-br0"; #define VarpTable_read_lock(z, flags) do{ (flags) = 0; down(&(z)->lock); } while(0) #define VarpTable_read_unlock(z, flags) do{ (flags) = 0; up(&(z)->lock); } while(0) @@ -199,7 +187,10 @@ #define VarpEntry_unlock(ventry, flags) write_unlock_irqrestore(&(ventry)->lock, (flags)) void VarpTable_sweep(VarpTable *z, int all); +void VarpTable_flush(VarpTable *z); void VarpTable_print(VarpTable *z); + +#include "./varp_util.c" /** Print the varp cache (if debug on). */ @@ -209,14 +200,53 @@ #endif } +/** Flush the varp cache. + */ +void varp_flush(void){ + VarpTable_flush(varp_table); +} + +static int device_ucast_addr(const char *device, uint32_t *addr) +{ + int err; + struct net_device *dev = NULL; + + err = vnet_get_device(device, &dev); + if(err) goto exit; + err = vnet_get_device_address(dev, addr); + exit: + if(err){ + *addr = 0; + } + return err; +} + +/** Get the unicast address of the varp device. + */ +int varp_ucast_addr(uint32_t *addr) +{ + int err = -ENODEV; + const char *devices[] = { varp_device, "eth0", "eth1", "eth2", NULL }; + const char **p; + for(p = devices; err && *p; p++){ + err = device_ucast_addr(*p, addr); + } + return err; +} + /** Print varp info and the varp cache. */ void varp_print(void){ + uint32_t addr = 0; + varp_ucast_addr(&addr); + printk(KERN_INFO "=== VARP ===============================================================\n"); printk(KERN_INFO "varp_device %s\n", varp_device); printk(KERN_INFO "varp_mcast_addr " IPFMT "\n", NIPQUAD(varp_mcast_addr)); - printk(KERN_INFO "varp_ucast_addr " IPFMT "\n", NIPQUAD(varp_ucast_addr)); + printk(KERN_INFO "varp_ucast_addr " IPFMT "\n", NIPQUAD(addr)); printk(KERN_INFO "varp_port %d\n", ntohs(varp_port)); + vnet_print(); + vif_print(); VarpTable_print(varp_table); printk(KERN_INFO "========================================================================\n"); } @@ -246,18 +276,43 @@ int err = 0; struct in_device *in_dev; - //printk("%s>\n", __FUNCTION__); in_dev = in_dev_get(dev); if(!in_dev){ - err = -EIO; + err = -ENODEV; goto exit; } *addr = in_dev->ifa_list->ifa_address; in_dev_put(in_dev); exit: - //printk("%s< err=%d\n", __FUNCTION__, err); - return err; -} + return err; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + +static inline int addr_route(u32 daddr, struct rtable **prt){ + int err = 0; + struct flowi fl = { + .nl_u = { + .ip4_u = { + .daddr = daddr, + } + } + }; + + err = ip_route_output_key(prt, &fl); + return err; +} + +#else + +static inline int addr_route(u32 daddr, struct rtable **prt){ + int err = 0; + struct rt_key key = { .dst = daddr }; + err = ip_route_output_key(prt, &key); + return err; +} + +#endif #ifndef LL_RESERVED_SPACE #define HH_DATA_MOD 16 @@ -270,12 +325,12 @@ * @param opcode varp opcode (host order) * @param dev device (may be null) * @param skb skb being replied to (may be null) - * @param vnet vnet id (in host order) + * @param vnet vnet id (in network order) * @param vmac vmac (in network order) * @return 0 on success, error code otherwise */ int varp_send(u16 opcode, struct net_device *dev, struct sk_buff *skbin, - u32 vnet, Vmac *vmac){ + VnetId *vnet, Vmac *vmac){ int err = 0; int link_n = 0; int ip_n = sizeof(struct iphdr); @@ -285,45 +340,53 @@ struct in_device *in_dev = NULL; VarpHdr *varph = NULL; u8 macbuf[6] = {}; - u8 *smac, *dmac; + u8 *smac, *dmac = macbuf; u32 saddr, daddr; u16 sport, dport; - - dmac = macbuf; - dprintf("> opcode=%d vnet=%d vmac=" MACFMT "\n", - opcode, ntohl(vnet), 
MAC6TUPLE(vmac->mac)); - if(!dev){ - //todo: should use routing for daddr to get device. - err = vnet_get_device(varp_device, &dev); - if(err) goto exit; - } - link_n = LL_RESERVED_SPACE(dev); - in_dev = in_dev_get(dev); - if(!in_dev) goto exit; - - smac = dev->dev_addr; - saddr = in_dev->ifa_list->ifa_address; - +#if defined(DEBUG) + char vnetbuf[VNET_ID_BUF]; +#endif + + dprintf("> opcode=%d vnet= %s vmac=" MACFMT "\n", + opcode, VnetId_ntoa(vnet, vnetbuf), MAC6TUPLE(vmac->mac)); + + dport = varp_port; if(skbin){ - dmac = MAC_ETH(skbin)->h_source; + daddr = skbin->nh.iph->saddr; + dmac = eth_hdr(skbin)->h_source; sport = skbin->h.uh->dest; - daddr = skbin->nh.iph->saddr; - //dport = skbin->h.uh->source; - dport = varp_port; } else { - if(!in_dev) goto exit; if(MULTICAST(varp_mcast_addr)){ daddr = varp_mcast_addr; ip_eth_mc_map(daddr, dmac); } else { - daddr = in_dev->ifa_list->ifa_broadcast; - dmac = dev->broadcast; + daddr = INADDR_BROADCAST; } sport = varp_port; - dport = varp_port; + } + + if(!dev){ + struct rtable *rt = NULL; + err = addr_route(daddr, &rt); + if(err) goto exit; + dev = rt->u.dst.dev; + } + + in_dev = in_dev_get(dev); + if(!in_dev){ + err = -ENODEV; + goto exit; + } + link_n = LL_RESERVED_SPACE(dev); + saddr = in_dev->ifa_list->ifa_address; + smac = dev->dev_addr; + if(daddr == INADDR_BROADCAST){ + daddr = in_dev->ifa_list->ifa_broadcast; + dmac = dev->broadcast; } in_dev_put(in_dev); + dprintf("> dev=%s\n", dev->name); dprintf("> smac=" MACFMT " dmac=" MACFMT "\n", MAC6TUPLE(smac), MAC6TUPLE(dmac)); dprintf("> saddr=" IPFMT " daddr=" IPFMT "\n", NIPQUAD(saddr), NIPQUAD(daddr)); dprintf("> sport=%u dport=%u\n", ntohs(sport), ntohs(dport)); @@ -368,11 +431,12 @@ // Varp header. varph = (void*)skb_put(skbout, varp_n); *varph = (VarpHdr){}; - varph->vnetmsghdr.id = htons(VARP_ID); - varph->vnetmsghdr.opcode = htons(opcode); - varph->vnet = htonl(vnet); + varph->hdr.id = htons(VARP_ID); + varph->hdr.opcode = htons(opcode); + varph->vnet = *vnet; varph->vmac = *vmac; - varph->addr = saddr; + varph->addr.family = AF_INET; + varph->addr.u.ip4.s_addr = saddr; err = skb_xmit(skbout); @@ -385,16 +449,13 @@ /** Send a varp request for the vnet and destination mac of a packet. * * @param skb packet - * @param vnet vnet (in host order) + * @param vnet vnet (in network order) * @return 0 on success, error code otherwise */ -int varp_solicit(struct sk_buff *skb, int vnet){ - int err = 0; - dprintf("> skb=%p\n", skb); - varp_dprint(); +int varp_solicit(struct sk_buff *skb, VnetId *vnet){ + int err = 0; err = varp_send(VARP_OP_REQUEST, NULL, NULL, - vnet, (Vmac*)MAC_ETH(skb)->h_dest); - dprintf("< err=%d\n", err); + vnet, (Vmac*)eth_hdr(skb)->h_dest); return err; } @@ -430,22 +491,26 @@ */ void VarpEntry_print(VarpEntry *ventry){ if(ventry){ - char *c, *d; + char *state, *flags; + char vnetbuf[VNET_ID_BUF]; + char addrbuf[VARP_ADDR_BUF]; + switch(ventry->state){ - case VARP_STATE_INCOMPLETE: c = "INC"; break; - case VARP_STATE_REACHABLE: c = "RCH"; break; - case VARP_STATE_FAILED: c = "FLD"; break; - default: c = "UNK"; break; + case VARP_STATE_INCOMPLETE: state = "INC"; break; + case VARP_STATE_REACHABLE: state = "RCH"; break; + case VARP_STATE_FAILED: state = "FLD"; break; + default: state = "UNK"; break; } - d = (VarpEntry_get_flags(ventry, VARP_FLAG_PROBING) ? "P" : " "); - - printk(KERN_INFO "VENTRY(%p ref=%1d %s %s vnet=%d vmac=" MACFMT " addr=" IPFMT " q=%d t=%lu)\n", + flags = (VarpEntry_get_flags(ventry, VARP_FLAG_PROBING) ? 
"P" : " "); + + printk(KERN_INFO "VENTRY(%p ref=%1d %s %s vnet=%s vmac=" MACFMT + " addr=%s q=%3d t=%lu)\n", ventry, atomic_read(&ventry->refcount), - c, d, - ventry->key.vnet, + state, flags, + VnetId_ntoa(&ventry->key.vnet, vnetbuf), MAC6TUPLE(ventry->key.vmac.mac), - NIPQUAD(ventry->addr), + VarpAddr_ntoa(&ventry->addr, addrbuf), skb_queue_len(&ventry->queue), ventry->timestamp); } else { @@ -469,7 +534,6 @@ void VarpEntry_incref(VarpEntry *z){ if(!z) return; atomic_inc(&z->refcount); - //dprintf("> "); VarpEntry_print(z); } /** Decrement reference count, freeing if zero. @@ -478,9 +542,7 @@ */ void VarpEntry_decref(VarpEntry *z){ if(!z) return; - //dprintf("> "); VarpEntry_print(z); if(atomic_dec_and_test(&z->refcount)){ - //dprintf("> freeing %p...\n", z); VarpEntry_free(z); } } @@ -499,7 +561,7 @@ /** Schedule the varp entry timer. * Must increment the reference count before doing - * this the first time, so the ventry won' be freed + * this the first time, so the ventry won't be freed * before the timer goes off. * * @param ventry varp entry @@ -538,7 +600,7 @@ atomic_inc(&ventry->probes); VarpEntry_unlock(ventry, flags); locked = 0; - varp_solicit(skb, ventry->key.vnet); + varp_solicit(skb, &ventry->key.vnet); } else { dprintf("> empty queue.\n"); } @@ -568,7 +630,7 @@ * @param vmac virtual MAC address (copied) * @return ventry or null */ -VarpEntry * VarpEntry_new(u32 vnet, Vmac *vmac){ +VarpEntry * VarpEntry_new(VnetId *vnet, Vmac *vmac){ VarpEntry *z = ALLOCATE(VarpEntry); if(z){ unsigned long now = jiffies; @@ -584,7 +646,7 @@ z->timestamp = now; z->error = varp_error_fn; - z->key.vnet = vnet; + z->key.vnet = *vnet; z->key.vmac = *vmac; } return z; @@ -598,15 +660,9 @@ */ Hashcode varp_key_hash_fn(void *k){ VarpKey *key = k; - Hashcode h; - h = hash_2ul(key->vnet, - (key->vmac.mac[0] << 24) | - (key->vmac.mac[1] << 16) | - (key->vmac.mac[2] << 8) | - (key->vmac.mac[3] )); - h = hash_hul(h, - (key->vmac.mac[4] << 8) | - (key->vmac.mac[5] )); + Hashcode h = 0; + h = VnetId_hash(h, &key->vnet); + h = Vmac_hash(h, &key->vmac); return h; } @@ -620,8 +676,8 @@ int varp_key_equal_fn(void *k1, void *k2){ VarpKey *key1 = k1; VarpKey *key2 = k2; - return (key1->vnet == key2->vnet) - && (memcmp(key1->vmac.mac, key2->vmac.mac, ETH_ALEN) == 0); + return (VnetId_eq(&key1->vnet, &key2->vnet) && + Vmac_eq(&key1->vmac, &key2->vmac)); } /** Free an entry in the varp cache. @@ -670,12 +726,10 @@ */ static void varp_table_timer_fn(unsigned long arg){ VarpTable *z = (VarpTable *)arg; - //dprintf("> z=%p\n", z); if(z){ VarpTable_sweep(z, 0); VarpTable_schedule(z); } - //dprintf("<\n"); } /** Print a varp table. @@ -687,7 +741,6 @@ VarpEntry *ventry; unsigned long flags, vflags; - //dprintf(">\n"); VarpTable_read_lock(z, flags); HashTable_for_each(entry, varp_table->table){ ventry = entry->value; @@ -696,7 +749,6 @@ VarpEntry_unlock(ventry, vflags); } VarpTable_read_unlock(z, flags); - //dprintf("<\n"); } /** Create a varp table. 
@@ -735,7 +787,7 @@ * @param vmac virtual MAC address (copied) * @return new entry or null */ -VarpEntry * VarpTable_add(VarpTable *z, u32 vnet, Vmac *vmac){ +VarpEntry * VarpTable_add(VarpTable *z, VnetId *vnet, Vmac *vmac){ int err = -ENOMEM; VarpEntry *ventry; HTEntry *entry; @@ -743,7 +795,6 @@ ventry = VarpEntry_new(vnet, vmac); if(!ventry) goto exit; - //dprintf("> "); VarpEntry_print(ventry); VarpTable_write_lock(z, flags); entry = HashTable_add(z->table, ventry, ventry); VarpTable_write_unlock(z, flags); @@ -775,19 +826,20 @@ * @param vmac virtual MAC addres * @return entry found or null */ -VarpEntry * VarpTable_lookup(VarpTable *z, u32 vnet, Vmac *vmac){ +VarpEntry * VarpTable_lookup(VarpTable *z, VnetId *vnet, Vmac *vmac){ unsigned long flags; - VarpKey key = { .vnet = vnet, .vmac = *vmac }; + VarpKey key = { .vnet = *vnet, .vmac = *vmac }; VarpEntry *ventry; VarpTable_read_lock(z, flags); ventry = HashTable_get(z->table, &key); + if(ventry) VarpEntry_incref(ventry); VarpTable_read_unlock(z, flags); - if(ventry) VarpEntry_incref(ventry); return ventry; } /** Handle output for a reachable ventry. * Send the skb using the tunnel to the care-of address. + * Assumes the ventry lock is held. * * @param ventry varp entry * @param skb skb to send @@ -796,12 +848,12 @@ int VarpEntry_send(VarpEntry *ventry, struct sk_buff *skb){ int err = 0; unsigned long flags = 0; - u32 addr; + VarpAddr addr; dprintf("> skb=%p\n", skb); addr = ventry->addr; VarpEntry_unlock(ventry, flags); - err = vnet_tunnel_send(ventry->key.vnet, addr, skb); + err = vnet_tunnel_send(&ventry->key.vnet, &addr, skb); VarpEntry_lock(ventry, flags); dprintf("< err=%d\n", err); return err; @@ -811,6 +863,7 @@ * If the entry is still incomplete, queue the skb, otherwise * send it. If the queue is full, dequeue and free an old skb to * make room for the new one. + * Assumes the ventry lock is held. 
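/* [Note] VarpTable_lookup() above also moves the VarpEntry_incref() call
 * inside the read lock. Taking the reference only after unlocking left a
 * window in which a concurrent sweep or remove could drop the last reference
 * and free the entry before the caller pinned it. The safe shape, as now
 * used:
 *
 *     VarpTable_read_lock(z, flags);
 *     ventry = HashTable_get(z->table, &key);
 *     if(ventry) VarpEntry_incref(ventry);   // pin while still locked
 *     VarpTable_read_unlock(z, flags);
 *     // ... use ventry, then VarpEntry_decref(ventry) when done
 */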
* * @param ventry varp entry * @param skb skb to send @@ -820,7 +873,7 @@ int err = 0; unsigned long flags = 0; - dprintf("> skb=%p\n", skb); //VarpEntry_print(ventry); + dprintf("> skb=%p\n", skb); ventry->state = VARP_STATE_INCOMPLETE; atomic_set(&ventry->probes, 1); if(!VarpEntry_get_flags(ventry, VARP_FLAG_PROBING)){ @@ -829,7 +882,7 @@ VarpEntry_schedule(ventry); } VarpEntry_unlock(ventry, flags); - varp_solicit(skb, ventry->key.vnet); + varp_solicit(skb, &ventry->key.vnet); VarpEntry_lock(ventry, flags); if(ventry->state == VARP_STATE_INCOMPLETE){ @@ -837,7 +890,7 @@ struct sk_buff *oldskb; oldskb = ventry->queue.next; __skb_unlink(oldskb, &ventry->queue); - dprintf("> purging skb=%p\n", oldskb); + dprintf("> dropping skb=%p\n", oldskb); kfree_skb(oldskb); } __skb_queue_tail(&ventry->queue, skb); @@ -893,33 +946,39 @@ * @param state state * @return 0 on success, error code otherwise */ -int VarpEntry_update(VarpEntry *ventry, u32 addr, int state){ +int VarpEntry_update(VarpEntry *ventry, VarpAddr *addr, int state){ int err = 0; unsigned long now = jiffies; unsigned long flags; dprintf("> addr=" IPFMT " state=%d\n", NIPQUAD(addr), state); - //VarpEntry_print(ventry); VarpEntry_lock(ventry, flags); if(VarpEntry_get_flags(ventry, VARP_FLAG_PERMANENT)) goto exit; - ventry->addr = addr; + ventry->addr = *addr; ventry->timestamp = now; ventry->state = state; VarpEntry_process_queue(ventry); exit: - //dprintf("> "); VarpEntry_print(ventry); VarpEntry_unlock(ventry, flags); dprintf("< err=%d\n", err); return err; } -int VarpTable_update(VarpTable *z, int vnet, Vmac *vmac, u32 addr, +int VarpTable_update(VarpTable *z, VnetId *vnet, Vmac *vmac, VarpAddr *addr, int state, int force){ int err = 0; VarpEntry *ventry; +#ifdef DEBUG + char vnetbuf[VNET_ID_BUF]; + char addrbuf[VARP_ADDR_BUF]; +#endif - dprintf("> vnet=%d mac=" MACFMT " addr=" IPFMT " state=%d force=%d\n", - vnet, MAC6TUPLE(vmac->mac), NIPQUAD(addr), state, force); + dprintf("> vnet=%s mac=" MACFMT " addr=%s state=%d force=%d\n", + VnetId_ntoa(vnet, vnetbuf), + MAC6TUPLE(vmac->mac), + VarpAddr_ntoa(addr, addrbuf), + state, + force); ventry = VarpTable_lookup(z, vnet, vmac); if(force && !ventry){ dprintf("> No entry, adding\n"); @@ -945,10 +1004,10 @@ * @return 0 on success, -ENOENT if no entry found */ int VarpTable_update_entry(VarpTable *z, VarpHdr *varph, int state){ - return VarpTable_update(z, ntohl(varph->vnet), &varph->vmac, varph->addr, state, 0); -} - -int varp_update(int vnet, unsigned char *vmac, u32 addr){ + return VarpTable_update(z, &varph->vnet, &varph->vmac, &varph->addr, state, 0); +} + +int varp_update(VnetId *vnet, unsigned char *vmac, VarpAddr *addr){ if(!varp_table){ return -ENOSYS; } @@ -971,7 +1030,6 @@ unsigned long old = now - VARP_ENTRY_TTL; unsigned long flags, vflags; - //dprintf(">\n"); VarpTable_read_lock(z, flags); HashTable_for_each(entry, varp_table->table){ ventry = entry->value; @@ -984,7 +1042,36 @@ VarpEntry_unlock(ventry, vflags); } VarpTable_read_unlock(z, flags); - //dprintf("<\n"); +} + +/** Flush the varp table. + * Remove old unreachable varp entries with empty queues. + * Permanent entries are not removed. 
+ * + * @param z table + */ +void VarpTable_flush(VarpTable *z){ + HashTable_for_decl(entry); + VarpEntry *ventry; + unsigned long now = jiffies; + unsigned long old = now - VARP_ENTRY_TTL; + unsigned long flags, vflags; + int flush; + + VarpTable_write_lock(z, flags); + HashTable_for_each(entry, varp_table->table){ + ventry = entry->value; + VarpEntry_lock(ventry, vflags); + flush = (!VarpEntry_get_flags(ventry, VARP_FLAG_PERMANENT) && + (ventry->timestamp < old) && + (ventry->state != VARP_STATE_REACHABLE) && + (skb_queue_len(&ventry->queue) == 0)); + VarpEntry_unlock(ventry, vflags); + if(flush){ + VarpTable_remove(z, ventry); + } + } + VarpTable_write_unlock(z, flags); } /** Handle a varp request. Look for a vif with the requested @@ -997,14 +1084,13 @@ */ int varp_handle_request(struct sk_buff *skb, VarpHdr *varph){ int err = -ENOENT; - u32 vnet; + VnetId *vnet; Vmac *vmac; Vif *vif = NULL; dprintf(">\n"); - vnet = ntohl(varph->vnet); + vnet = &varph->vnet; vmac = &varph->vmac; - dprintf("> vnet=%d vmac=" MACFMT "\n", vnet, MAC6TUPLE(vmac->mac)); if(vif_lookup(vnet, vmac, &vif)) goto exit; varp_send(VARP_OP_ANNOUNCE, skb->dev, skb, vnet, vmac); vif_decref(vif); @@ -1026,7 +1112,7 @@ err = -ENOSYS; goto exit; } - err = varp_send(VARP_OP_ANNOUNCE, dev, NULL, vif->vnet, &vif->vmac); + err = varp_send(VARP_OP_ANNOUNCE, dev, NULL, &vif->vnet, &vif->vmac); exit: dprintf("< err=%d\n", err); return err; @@ -1067,7 +1153,7 @@ (skb->nh.iph->daddr != varp_mcast_addr)){ // Ignore multicast packets not addressed to us. err = 0; - dprintf("> daddr=" IPFMT " mcaddr=" IPFMT "\n", + dprintf("> Ignoring daddr=" IPFMT " mcaddr=" IPFMT "\n", NIPQUAD(skb->nh.iph->daddr), NIPQUAD(varp_mcast_addr)); goto exit; } @@ -1076,23 +1162,29 @@ goto exit; } mine = 1; - if(varph->vnetmsghdr.id != htons(VARP_ID)){ + if(varph->hdr.id != htons(VARP_ID)){ // It's not varp at all - ignore it. - wprintf("> Unknown id: %d \n", ntohs(varph->vnetmsghdr.id)); + wprintf("> Invalid varp id: %d, expected %d \n", + ntohs(varph->hdr.id), + VARP_ID); goto exit; } - if(1){ +#ifdef DEBUG + { + char vnetbuf[VNET_ID_BUF]; + char addrbuf[VARP_ADDR_BUF]; dprintf("> saddr=" IPFMT " daddr=" IPFMT "\n", NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr)); dprintf("> sport=%u dport=%u\n", ntohs(skb->h.uh->source), ntohs(skb->h.uh->dest)); - dprintf("> opcode=%d vnet=%u vmac=" MACFMT " addr=" IPFMT "\n", - ntohs(varph->vnetmsghdr.opcode), - ntohl(varph->vnet), + dprintf("> opcode=%d vnet=%s vmac=" MACFMT " addr=%s\n", + ntohs(varph->hdr.opcode), + VnetId_ntoa(&varph->vnet, vnetbuf), MAC6TUPLE(varph->vmac.mac), - NIPQUAD(varph->addr)); + VarpAddr_ntoa(&varph->addr, addrbuf)); varp_dprint(); } - switch(ntohs(varph->vnetmsghdr.opcode)){ +#endif + switch(ntohs(varph->hdr.opcode)){ case VARP_OP_REQUEST: err = varp_handle_request(skb, varph); break; @@ -1100,8 +1192,8 @@ err = varp_handle_announce(skb, varph); break; default: - wprintf("> Unknown opcode: %d \n", ntohs(varph->vnetmsghdr.opcode)); - break; + wprintf("> Unknown opcode: %d \n", ntohs(varph->hdr.opcode)); + break; } exit: if(mine) err = 1; @@ -1112,30 +1204,32 @@ /** Send an outgoing packet on the appropriate vnet tunnel. 
* * @param skb outgoing message - * @param vnet vnet (host order) + * @param vnet vnet (network order) * @return 0 on success, error code otherwise */ -int varp_output(struct sk_buff *skb, u32 vnet){ +int varp_output(struct sk_buff *skb, VnetId *vnet){ int err = 0; unsigned char *mac = NULL; Vmac *vmac = NULL; VarpEntry *ventry = NULL; - dprintf("> skb=%p vnet=%u\n", skb, vnet); + dprintf(">\n"); if(!varp_table){ err = -ENOSYS; goto exit; } - dprintf("> skb.mac=%p\n", skb->mac.raw); if(!skb->mac.raw){ wprintf("> No ethhdr in skb!\n"); err = -EINVAL; goto exit; } - mac = MAC_ETH(skb)->h_dest; + mac = eth_hdr(skb)->h_dest; vmac = (Vmac*)mac; if(mac_is_multicast(mac)){ - err = vnet_tunnel_send(vnet, varp_mcast_addr, skb); + VarpAddr addr = {}; + addr.family = AF_INET; + addr.u.ip4.s_addr = varp_mcast_addr; + err = vnet_tunnel_send(vnet, &addr, skb); } else { ventry = VarpTable_lookup(varp_table, vnet, vmac); if(!ventry){ @@ -1165,7 +1259,7 @@ int err = 0; varp_close(); varp_mcast_addr = addr; - err = varp_open(varp_mcast_addr, varp_ucast_addr, varp_port); + err = varp_open(varp_mcast_addr, varp_port); return err; } @@ -1191,7 +1285,6 @@ */ int varp_init(void){ int err = 0; - struct net_device *dev = NULL; dprintf(">\n"); varp_table = VarpTable_new(); @@ -1200,18 +1293,10 @@ goto exit; } varp_init_mcast_addr(varp_mcaddr); - err = vnet_get_device(varp_device, &dev); - dprintf("> vnet_get_device(%s)=%d\n", varp_device, err); - if(err) goto exit; - err = vnet_get_device_address(dev, &varp_ucast_addr); - dprintf("> vnet_get_device_address()=%d\n", err); - if(err) goto exit; varp_port = htons(VARP_PORT); - err = varp_open(varp_mcast_addr, varp_ucast_addr, varp_port); - dprintf("> varp_open()=%d\n", err); + err = varp_open(varp_mcast_addr, varp_port); exit: - if(dev) dev_put(dev); dprintf("< err=%d\n", err); return err; } diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/vnet-module/varp.h --- a/tools/vnet/vnet-module/varp.h Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/vnet-module/varp.h Fri Aug 26 20:47:16 2005 @@ -19,6 +19,10 @@ #ifndef _VNET_VARP_H #define _VNET_VARP_H +#include "hash_table.h" +#include "if_varp.h" +#include "varp_util.h" + #define CONFIG_VARP_GRATUITOUS 1 @@ -26,29 +30,26 @@ struct sk_buff; struct Vif; -#define DEVICE "xen-br0" - extern int vnet_get_device(const char *name, struct net_device **dev); extern int vnet_get_device_address(struct net_device *dev, u32 *addr); extern int varp_handle_message(struct sk_buff *skb); -extern int varp_output(struct sk_buff *skb, u32 vnet); -extern int varp_update(int vnet, unsigned char *vmac, u32 addr); +extern int varp_output(struct sk_buff *skb, struct VnetId *vnet); +extern int varp_update(struct VnetId *vnet, unsigned char *vmac, struct VarpAddr *addr); extern int varp_init(void); extern void varp_exit(void); -extern int varp_open(u32 mcaddr, u32 addr, u16 port); +extern int varp_open(u32 mcaddr, u16 port); extern void varp_close(void); extern int varp_set_mcast_addr(u32 addr); extern void varp_print(void); +extern void varp_flush(void); extern int varp_announce_vif(struct net_device *dev, struct Vif *vif); -//extern int varp_announce_vifs(struct net_device *dev, struct task_struct *domain); extern u32 varp_mcast_addr; - /* MAC broadcast addr is ff-ff-ff-ff-ff-ff (all 1's). * MAC multicast addr has low bit 1, i.e. 01-00-00-00-00-00. 
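/* [Example] varp_output() above branches on mac_is_multicast() before either
 * tunnelling to the multicast group or doing a VARP cache lookup. Per the
 * comment ending varp.h, the test is just the I/G bit (low bit of the first
 * octet); broadcast ff:ff:ff:ff:ff:ff has it set too, so broadcasts take the
 * multicast path. A minimal sketch (hypothetical stand-in for the real
 * helper):
 *
 *     static inline int mac_is_multicast_sketch(const unsigned char *mac){
 *         return mac[0] & 0x01;
 *     }
 */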
diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/vnet-module/varp_socket.c --- a/tools/vnet/vnet-module/varp_socket.c Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/vnet-module/varp_socket.c Fri Aug 26 20:47:16 2005 @@ -177,7 +177,7 @@ /*============================================================================*/ /** Socket flags. */ -enum { +enum VsockFlag { VSOCK_REUSE = 1, VSOCK_BIND = 2, VSOCK_CONNECT = 4, @@ -256,28 +256,13 @@ */ int setsock_multicast(int sock, uint32_t saddr){ int err = 0; - struct net_device *dev = NULL; - u32 addr = 0; struct ip_mreqn mreq = {}; int mloop = 0; - err = vnet_get_device(DEVICE, &dev); - if(err){ - eprintf("> error getting device: %d %d\n", err, errno); - goto exit; - } - err = vnet_get_device_address(dev, &addr); - if(err){ - eprintf("> error getting device address: %d %d\n", err, errno); - goto exit; - } // See 'man 7 ip' for these options. mreq.imr_multiaddr.s_addr = saddr; // IP multicast address. - //mreq.imr_address.s_addr = addr; // Interface IP address. mreq.imr_address.s_addr = INADDR_ANY; // Interface IP address. mreq.imr_ifindex = 0; // Interface index (0 means any). - dprintf("> saddr=%u.%u.%u.%u addr=%u.%u.%u.%u ifindex=%d\n", - NIPQUAD(saddr), NIPQUAD(addr), mreq.imr_ifindex); err = setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &mloop, sizeof(mloop)); if(err < 0){ eprintf("> setsockopt IP_MULTICAST_LOOP: %d %d\n", err, errno); @@ -305,7 +290,7 @@ } /** Create a socket. - * The flags can include VSOCK_REUSE, VSOCK_BROADCAST, VSOCK_CONNECT. + * The flags can include values from enum VsockFlag. * * @param socktype socket type * @param saddr address @@ -368,19 +353,15 @@ /** Open the varp multicast socket. * * @param mcaddr multicast address - * @param saddr address * @param port port * @param val return parameter for the socket * @return 0 on success, error code otherwise */ -int varp_mcast_open(uint32_t mcaddr, uint32_t saddr, uint16_t port, int *val){ +int varp_mcast_open(uint32_t mcaddr, uint16_t port, int *val){ int err = 0; int flags = VSOCK_REUSE; int multicast = MULTICAST(mcaddr); int sock = 0; - struct sockaddr_in addr_in; - struct sockaddr *addr = (struct sockaddr *)&addr_in; - int addr_n = sizeof(addr_in); dprintf(">\n"); flags |= VSOCK_MULTICAST; @@ -392,23 +373,6 @@ err = setsock_multicast_ttl(sock, 1); if(err < 0) goto exit; } - if(0){ - addr_in.sin_family = AF_INET; - addr_in.sin_addr.s_addr = saddr; - addr_in.sin_port = port; - err = bind(sock, addr, addr_n); - if(err < 0){ - eprintf("> bind: %d %d\n", err, errno); - goto exit; - } - } - if(0){ - struct sockaddr_in self = {}; - int self_n; - getsockname(sock, (struct sockaddr *)&self, &self_n); - dprintf("> sockname sock=%d addr=%u.%u.%u.%u port=%d\n", - sock, NIPQUAD(saddr), ntohs(port)); - } exit: if(err){ shutdown(sock, 2); @@ -427,7 +391,7 @@ */ int varp_ucast_open(uint32_t addr, u16 port, int *val){ int err = 0; - int flags = VSOCK_BIND | VSOCK_REUSE; + int flags = (VSOCK_BIND | VSOCK_REUSE); dprintf(">\n"); err = create_socket(SOCK_DGRAM, addr, port, flags, val); dprintf("< err=%d val=%d\n", err, *val); @@ -536,7 +500,6 @@ err = sock_add_wait_queue(varp_mcast_sock, &mcast_wait); err = sock_add_wait_queue(varp_ucast_sock, &ucast_wait); for(n = 1; atomic_read(&varp_run) == 1; n++){ - //dprintf("> n=%d\n", n); count = 0; count += handle_sock_skb(varp_mcast_sock); count += handle_sock_skb(varp_ucast_sock); @@ -609,20 +572,18 @@ /** Open the varp sockets and start the thread handling them. 
* * @param mcaddr multicast address - * @param addr unicast address * @param port port * @return 0 on success, error code otherwise */ -int varp_open(u32 mcaddr, u32 addr, u16 port){ +int varp_open(u32 mcaddr, u16 port){ int err = 0; mm_segment_t oldfs; //MOD_INC_USE_COUNT; - dprintf("> mcaddr=%u.%u.%u.%u addr=%u.%u.%u.%u port=%u\n", - NIPQUAD(mcaddr), NIPQUAD(addr), ntohs(port)); - //MOD_INC_USE_COUNT; + dprintf("> mcaddr=%u.%u.%u.%u port=%u\n", + NIPQUAD(mcaddr), ntohs(port)); oldfs = change_fs(KERNEL_DS); - err = varp_mcast_open(mcaddr, addr, port, &varp_mcast_sock); + err = varp_mcast_open(mcaddr, port, &varp_mcast_sock); if(err < 0 ) goto exit; err = varp_ucast_open(INADDR_ANY, port, &varp_ucast_sock); if(err < 0 ) goto exit; diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/vnet-module/vif.c --- a/tools/vnet/vnet-module/vif.c Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/vnet-module/vif.c Fri Aug 26 20:47:16 2005 @@ -22,6 +22,7 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/string.h> +#include <linux/version.h> #include <linux/net.h> #include <linux/in.h> @@ -33,11 +34,14 @@ #include <net/protocol.h> #include <net/route.h> #include <linux/skbuff.h> +#include <linux/spinlock.h> #include <etherip.h> #include <if_varp.h> #include <vnet_dev.h> #include <vif.h> +#include <varp.h> + #include "allocate.h" #include "hash_table.h" #include "sys_net.h" @@ -50,6 +54,27 @@ /** Table of vifs indexed by VifKey. */ HashTable *vif_table = NULL; +rwlock_t vif_table_lock = RW_LOCK_UNLOCKED; + +#define vif_read_lock(flags) read_lock_irqsave(&vif_table_lock, (flags)) +#define vif_read_unlock(flags) read_unlock_irqrestore(&vif_table_lock, (flags)) +#define vif_write_lock(flags) write_lock_irqsave(&vif_table_lock, (flags)) +#define vif_write_unlock(flags) write_unlock_irqrestore(&vif_table_lock, (flags)) + +void vif_print(void){ + HashTable_for_decl(entry); + Vif *vif; + unsigned long flags; + char vnetbuf[VNET_ID_BUF]; + + vif_read_lock(flags); + HashTable_for_each(entry, vif_table){ + vif = entry->value; + printk(KERN_INFO "VIF(vnet=%s vmac=" MACFMT ")\n", + VnetId_ntoa(&vif->vnet, vnetbuf), MAC6TUPLE(vif->vmac.mac)); + } + vif_read_unlock(flags); +} void vif_decref(Vif *vif){ if(!vif) return; @@ -71,18 +96,11 @@ */ Hashcode vif_key_hash_fn(void *k){ VifKey *key = k; - Hashcode h; - h = hash_2ul(key->vnet, - (key->vmac.mac[0] << 24) | - (key->vmac.mac[1] << 16) | - (key->vmac.mac[2] << 8) | - (key->vmac.mac[3] )); - h = hash_hul(h, - (key->vmac.mac[4] << 8) | - (key->vmac.mac[5] )); + Hashcode h = 0; + h = VnetId_hash(h, &key->vnet); + h = Vmac_hash(h, &key->vmac); return h; } - /** Test equality for keys in the vif table. * Compares vnet and mac. @@ -94,7 +112,8 @@ int vif_key_equal_fn(void *k1, void *k2){ VifKey *key1 = k1; VifKey *key2 = k2; - return (key1->vnet == key2->vnet) && (memcmp(key1->vmac.mac, key2->vmac.mac, ETH_ALEN) == 0); + return (VnetId_eq(&key1->vnet , &key2->vnet) && + Vmac_eq(&key1->vmac, &key2->vmac)); } /** Free an entry in the vif table. 
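/* [Note] The vif table is now guarded by vif_table_lock through the macros
 * defined above; all access follows the usual irqsave discipline. Readers
 * (vif_lookup, vif_print) take the read side, e.g.:
 *
 *     unsigned long flags;
 *     vif_read_lock(flags);
 *     entry = HashTable_get_entry(vif_table, &key);
 *     ...
 *     vif_read_unlock(flags);
 *
 * while vif_add()/vif_remove() below wrap HashTable_add/HashTable_remove in
 * vif_write_lock()/vif_write_unlock(), so lookups never observe a
 * half-updated table.
 */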
@@ -118,13 +137,13 @@ * @param mac MAC address * @return 0 on success, -ENOENT otherwise */ -int vif_lookup(int vnet, Vmac *vmac, Vif **vif){ - int err = 0; - VifKey key = {}; +int vif_lookup(VnetId *vnet, Vmac *vmac, Vif **vif){ + int err = 0; + VifKey key = { .vnet = *vnet, .vmac = *vmac }; HTEntry *entry = NULL; + unsigned long flags; - key.vnet = vnet; - key.vmac = *vmac; + vif_read_lock(flags); entry = HashTable_get_entry(vif_table, &key); if(entry){ *vif = entry->value; @@ -133,7 +152,7 @@ *vif = NULL; err = -ENOENT; } - //dprintf("< err=%d addr=" IPFMT "\n", err, NIPQUAD(*coaddr)); + vif_read_unlock(flags); return err; } @@ -143,10 +162,12 @@ * @param mac MAC address * @return 0 on success, negative error code otherwise */ -int vif_add(int vnet, Vmac *vmac, Vif **val){ +int vif_add(VnetId *vnet, Vmac *vmac, Vif **val){ int err = 0; Vif *vif = NULL; HTEntry *entry; + unsigned long flags; + dprintf("> vnet=%d\n", vnet); vif = ALLOCATE(Vif); if(!vif){ @@ -154,9 +175,11 @@ goto exit; } atomic_set(&vif->refcount, 1); - vif->vnet = vnet; + vif->vnet = *vnet; vif->vmac = *vmac; + vif_write_lock(flags); entry = HashTable_add(vif_table, vif, vif); + vif_write_unlock(flags); if(!entry){ err = -ENOMEM; deallocate(vif); @@ -177,22 +200,14 @@ * @param coaddr return parameter for care-of address * @return number of entries deleted, or negative error code */ -int vif_remove(int vnet, Vmac *vmac){ - int err = 0; - VifKey key = { .vnet = vnet, .vmac = *vmac }; - //dprintf("> vnet=%d addr=%u.%u.%u.%u\n", vnet, NIPQUAD(coaddr)); +int vif_remove(VnetId *vnet, Vmac *vmac){ + int err = 0; + VifKey key = { .vnet = *vnet, .vmac = *vmac }; + unsigned long flags; + + vif_write_lock(flags); err = HashTable_remove(vif_table, &key); - //dprintf("< err=%d\n", err); - return err; -} - -int vif_find(int vnet, Vmac *vmac, int create, Vif **vif){ - int err = 0; - - err = vif_lookup(vnet, vmac, vif); - if(err && create){ - err = vif_add(vnet, vmac, vif); - } + vif_write_unlock(flags); return err; } @@ -200,15 +215,15 @@ HashTable_clear(vif_table); } -int vif_create(int vnet, Vmac *vmac, Vif **vif){ +int vif_create(VnetId *vnet, Vmac *vmac, Vif **vif){ int err = 0; dprintf(">\n"); - if(!vif_lookup(vnet, vmac, vif)){ + if(vif_lookup(vnet, vmac, vif) == 0){ + vif_decref(*vif); err = -EEXIST; goto exit; } - dprintf("> vif_add...\n"); err = vif_add(vnet, vmac, vif); exit: if(err){ @@ -218,25 +233,6 @@ return err; } -/** Create a vif. - * - * @param vnet vnet id - * @param mac mac address (as a string) - * @return 0 on success, error code otherwise - */ -int mkvif(int vnet, char *mac){ - int err = 0; - Vmac vmac = {}; - Vif *vif = NULL; - dprintf("> vnet=%d mac=%s\n", vnet, mac); - err = mac_aton(mac, vmac.mac); - if(err) goto exit; - err = vif_create(vnet, &vmac, &vif); - exit: - dprintf("< err=%d\n", err); - return err; -} - /** Initialize the vif table. * * @return 0 on success, error code otherwise @@ -250,12 +246,9 @@ goto exit; } vif_table->entry_free_fn = vif_entry_free_fn; - vif_table->key_hash_fn = vif_key_hash_fn; - vif_table->key_equal_fn = vif_key_equal_fn; - - // Some vifs for testing. 
- //mkvif(1, "aa:00:00:00:20:11"); - //mkvif(2, "aa:00:00:00:20:12"); + vif_table->key_hash_fn = vif_key_hash_fn; + vif_table->key_equal_fn = vif_key_equal_fn; + exit: if(err < 0) wprintf("< err=%d\n", err); dprintf("< err=%d\n", err); diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/vnet-module/vif.h --- a/tools/vnet/vnet-module/vif.h Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/vnet-module/vif.h Fri Aug 26 20:47:16 2005 @@ -24,12 +24,12 @@ /** Key for entries in the vif table. */ typedef struct VifKey { - int vnet; + VnetId vnet; Vmac vmac; } VifKey; typedef struct Vif { - int vnet; + VnetId vnet; Vmac vmac; struct net_device *dev; atomic_t refcount; @@ -38,15 +38,17 @@ struct HashTable; extern struct HashTable *vif_table; +extern void vif_print(void); + extern void vif_decref(Vif *vif); extern void vif_incref(Vif *vif); -extern int vif_create(int vnet, Vmac *vmac, Vif **vif); +extern int vif_create(struct VnetId *vnet, Vmac *vmac, Vif **vif); -extern int vif_add(int vnet, Vmac *vmac, Vif **vif); -extern int vif_lookup(int vnet, Vmac *vmac, Vif **vif); -extern int vif_remove(int vnet, Vmac *vmac); -extern int vif_find(int vnet, Vmac *vmac, int create, Vif **vif); +extern int vif_create(VnetId *vnet, Vmac *vmac, Vif **vif); +extern int vif_add(struct VnetId *vnet, Vmac *vmac, Vif **vif); +extern int vif_lookup(struct VnetId *vnet, Vmac *vmac, Vif **vif); +extern int vif_remove(struct VnetId *vnet, Vmac *vmac); extern void vif_purge(void); extern int vif_init(void); diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/vnet-module/vnet.c --- a/tools/vnet/vnet-module/vnet.c Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/vnet-module/vnet.c Fri Aug 26 20:47:16 2005 @@ -47,6 +47,7 @@ #include <random.h> #include <tunnel.h> +#include <skb_util.h> #include <vnet_dev.h> #include <vnet.h> #include <vif.h> @@ -70,7 +71,7 @@ /** Key for entries in the vnet address table. */ typedef struct VnetAddrKey { /** Vnet id. */ - int vnet; + VnetId vnet; /** MAC address. */ unsigned char mac[ETH_ALEN]; } VnetAddrKey; @@ -88,7 +89,6 @@ void Vnet_decref(Vnet *info){ if(!info) return; if(atomic_dec_and_test(&info->refcount)){ - dprintf("> free vnet=%u\n", info->vnet); vnet_dev_remove(info); deallocate(info); } @@ -101,6 +101,28 @@ void Vnet_incref(Vnet *info){ if(!info) return; atomic_inc(&info->refcount); +} + +void Vnet_print(Vnet *info) +{ + char vnetbuf[VNET_ID_BUF]; + + printk(KERN_INFO "VNET(vnet=%s device=%s security=%c%c)\n", + VnetId_ntoa(&info->vnet, vnetbuf), + info->device, + ((info->security & SA_AUTH) ? 'a' : '-'), + ((info->security & SA_CONF) ? 'c' : '-')); +} + +void vnet_print(void) +{ + HashTable_for_decl(entry); + Vnet *info; + + HashTable_for_each(entry, vnet_table){ + info = entry->value; + Vnet_print(info); + } } /** Allocate a vnet, setting reference count to 1. @@ -129,7 +151,7 @@ HTEntry *entry = NULL; // Vnet_del(info->vnet); //todo: Delete existing vnet info? Vnet_incref(info); - entry = HashTable_add(vnet_table, HKEY(info->vnet), info); + entry = HashTable_add(vnet_table, &info->vnet, info); if(!entry){ err = -ENOMEM; Vnet_decref(info); @@ -142,8 +164,8 @@ * @param vnet id of vnet to remove * @return number of vnets removed */ -int Vnet_del(vnetid_t vnet){ - return HashTable_remove(vnet_table, HKEY(vnet)); +int Vnet_del(VnetId *vnet){ + return HashTable_remove(vnet_table, vnet); } /** Lookup a vnet by id. 
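/* [Example] With the vnet id widened from a u32 to a VnetId struct, the vnet
 * table is keyed by pointer-to-struct rather than the packed HKEY() integer,
 * which is why vnet_init() below installs vnet_key_hash_fn and
 * vnet_key_equal_fn. A caller-side sketch of the resulting lookup pattern
 * (illustrative only):
 *
 *     Vnet *info = NULL;
 *     VnetId id = toVnetId(VNET_VIF);      // widen a 32-bit id
 *     if(Vnet_lookup(&id, &info) == 0){
 *         ... use info ...
 *         Vnet_decref(info);               // Vnet_lookup took a reference
 *     }
 */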
@@ -153,17 +175,14 @@ * @param info return parameter for vnet * @return 0 on sucess, -ENOENT if no vnet found */ -int Vnet_lookup(vnetid_t vnet, Vnet **info){ - int err = 0; - dprintf("> vnet=%u info=%p\n", vnet, info); - dprintf("> vnet_table=%p\n",vnet_table); - *info = HashTable_get(vnet_table, HKEY(vnet)); +int Vnet_lookup(VnetId *vnet, Vnet **info){ + int err = 0; + *info = HashTable_get(vnet_table, vnet); if(*info){ Vnet_incref(*info); } else { err = -ENOENT; } - dprintf("< err=%d\n", err); return err; } @@ -191,22 +210,33 @@ */ static int vnet_setup(void){ int err = 0; - int i, n = 5; //20; + int i, n = 3; int security = vnet_security_default; + uint32_t vnetid; Vnet *vnet; - dprintf(">\n"); for(i=0; i<n; i++){ err = Vnet_alloc(&vnet); if(err) break; - vnet->vnet = VNET_VIF + i; - vnet->security = (vnet->vnet > 10 ? security : 0); - //err = Vnet_add(vnet); + vnetid = VNET_VIF + i; + vnet->vnet = toVnetId(vnetid); + sprintf(vnet->device, "vnif%04x", vnetid); + vnet->security = (vnetid > 10 ? security : 0); err = Vnet_create(vnet); if(err) break; } - dprintf("< err=%d\n", err); - return err; + return err; +} + +int vnet_key_equal_fn(void *k1, void *k2){ + VnetId *key1 = k1; + VnetId *key2 = k2; + return VnetId_eq(key1, key2); +} + +Hashcode vnet_key_hash_fn(void *k){ + VnetId *key = k; + return VnetId_hash(0, key); } /** Initialize the vnet table and the physical vnet. @@ -216,18 +246,18 @@ int vnet_init(void){ int err = 0; - dprintf(">\n"); vnet_table = HashTable_new(0); - dprintf("> vnet_table=%p\n", vnet_table); if(!vnet_table){ err = -ENOMEM; goto exit; } + vnet_table->key_equal_fn = vnet_key_equal_fn; + vnet_table->key_hash_fn = vnet_key_hash_fn; vnet_table->entry_free_fn = vnet_entry_free_fn; err = Vnet_alloc(&vnet_physical); if(err) goto exit; - vnet_physical->vnet = VNET_PHYS; + vnet_physical->vnet = toVnetId(VNET_PHYS); vnet_physical->security = 0; err = Vnet_add(vnet_physical); if(err) goto exit; @@ -237,7 +267,6 @@ if(err) goto exit; err = vif_init(); exit: - if(err < 0) wprintf("< err=%d\n", err); return err; } @@ -248,50 +277,28 @@ vnet_table = NULL; } -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) - -static inline int skb_route(struct sk_buff *skb, struct rtable **prt){ - int err = 0; - struct flowi fl = { - .oif = skb->dev->ifindex, - .nl_u = { - .ip4_u = { - .daddr = skb->nh.iph->daddr, - .saddr = skb->nh.iph->saddr, - .tos = skb->nh.iph->tos, - } - } - }; - - err = ip_route_output_key(prt, &fl); - return err; -} - -#else - -static inline int skb_route(struct sk_buff *skb, struct rtable **prt){ - int err = 0; - struct rt_key key = { }; - key.dst = skb->nh.iph->daddr; - key.src = skb->nh.iph->saddr; - key.tos = skb->nh.iph->tos; - key.oif = skb->dev->ifindex; - err = ip_route_output_key(prt, &key); - return err; -} - -#endif - inline int skb_xmit(struct sk_buff *skb){ int err = 0; struct rtable *rt = NULL; - dprintf("> skb=%p dev=%s\n", skb, skb->dev->name); - + dprintf(">\n"); skb->protocol = htons(ETH_P_IP); err = skb_route(skb, &rt); - if(err) goto exit; + if(err){ + wprintf("> skb_route=%d\n", err); + wprintf("> dev=%s idx=%d src=%u.%u.%u.%u dst=%u.%u.%u.%u tos=%d\n", + (skb->dev ? skb->dev->name : "???"), + (skb->dev ? 
skb->dev->ifindex : -1), + NIPQUAD(skb->nh.iph->saddr), + NIPQUAD(skb->nh.iph->daddr), + skb->nh.iph->tos); + + goto exit; + } skb->dst = &rt->u.dst; + if(!skb->dev){ + skb->dev = rt->u.dst.dev; + } ip_select_ident(skb->nh.iph, &rt->u.dst, NULL); @@ -317,39 +324,27 @@ * * @todo fixme */ -int vnet_skb_send(struct sk_buff *skb, u32 vnet){ - int err = 0; - Vif *vif = NULL; - - dprintf("> skb=%p vnet=%u\n", skb, vnet); - if(vnet == VNET_PHYS || !vnet){ - // For completeness, send direct to the network. - if(skb->dev){ - err = skb_xmit(skb); - } else { - // Can't assume eth0 - might be nbe-br or other. Need to route. - struct net_device *dev = NULL; - err = vnet_get_device(DEVICE, &dev); - if(err) goto exit; - skb->dev = dev; - err = skb_xmit(skb); - dev_put(dev); - } +int vnet_skb_send(struct sk_buff *skb, VnetId *vnet){ + int err = 0; + VnetId vnet_phys = toVnetId(VNET_PHYS); + + dprintf(">\n"); + skb->dev = NULL; + if(!vnet || VnetId_eq(vnet, &vnet_phys)){ + // No vnet or physical vnet, send direct to the network. + skb_xmit(skb); } else { - dprintf("> varp_output\n"); err = varp_output(skb, vnet); } - //dprintf("< err=%d\n", err); - exit: - if(vif) vif_decref(vif); dprintf("< err=%d\n", err); return err; } /** Receive an skb for a vnet. + * We make the skb come out of the vif for the vnet, and + * let ethernet bridging forward it to related interfaces. * If the dest is broadcast, goes to all vifs on the vnet. - * If the dest is unicast, goes to addressed vif on vnet. - * For each vif we set the packet dev and receive the packet. + * If the dest is unicast, goes to the addressed vif on the vnet. * * The packet must have skb->mac.raw set and skb->data must point * after the device (ethernet) header. @@ -359,139 +354,19 @@ * @param vmac packet vmac * @return 0 on success, error code otherwise */ -#if 1 -int vnet_skb_recv(struct sk_buff *skb, u32 vnet, Vmac *vmac){ - // Receive the skb for a vnet. - // We make the skb come out of the vif for the vnet, and - // let ethernet bridging forward it to related interfaces. +int vnet_skb_recv(struct sk_buff *skb, VnetId *vnet, Vmac *vmac){ int err = 0; Vnet *info = NULL; - dprintf("> vnet=%u mac=%s\n", vnet, mac_ntoa(vmac->mac)); err = Vnet_lookup(vnet, &info); if(err) goto exit; skb->dev = info->dev; - dprintf("> netif_rx dev=%s\n", skb->dev->name); netif_rx(skb); exit: if(info) Vnet_decref(info); if(err){ - kfree_skb(skb); - } - dprintf("< err=%d\n", err); - return err; -} - -#else -int vnet_skb_recv(struct sk_buff *skb, u32 vnet, Vmac *vmac){ - int err = 0; - Vif *vif = NULL; - - dprintf("> vnet=%u mac=%s\n", vnet, mac_ntoa(vmac->mac)); - if(mac_is_multicast(vmac->mac)){ - HashTable_for_decl(entry); - int count = 0; - struct sk_buff *new_skb; - - HashTable_for_each(entry, vif_table){ - vif = entry->value; - if(vif->vnet != vnet) continue; - count++; - new_skb = skb_copy(skb, GFP_ATOMIC); - if(!new_skb) break; - new_skb->dev = vif->dev; - dprintf("> %d] netif_rx dev=%s\n", count, new_skb->dev->name); - netif_rx(new_skb); - } kfree_skb(skb); - } else { - err = vif_lookup(vnet, vmac, &vif); - if(err){ - kfree_skb(skb); - goto exit; - } - skb->dev = vif->dev; - dprintf("> netif_rx dev=%s\n", skb->dev->name); - netif_rx(skb); - } - exit: - dprintf("< err=%d\n", err); - return err; -} -#endif - -/** Check validity of an incoming IP frame. - * - * @param skb frame - * @return 0 if ok, error code otherwise - * - * @todo fixme Can prob skip most of this because linux will have done it. - * @todo Only need the vnet skb context check. 
- */ -int check_ip_frame(struct sk_buff *skb){ - int err = -EINVAL; - struct iphdr* iph; - struct net_device *dev; - __u32 len; - __u16 check; - -#if 0 - if(skb->context){ - // Todo: After ESP want to skip most checks (including checksum), - // Todo: but in general may not want to skip all checks on detunnel. - //dprintf("> Skip check, has context\n"); - err = 0; - goto exit; - } -#endif - // Check we have enough for an ip header - the skb passed should - // have data pointing at the eth header and skb->len should include - // that. skb->nh should already have been set. Let the indvidual - // protocol handlers worry about the exact ip header len - // (i.e. whether any ip options are set). - dev = skb->dev; - - if(skb->len < ETH_HLEN + sizeof(struct iphdr)){ - wprintf("> packet too short for ip header\n"); - goto exit; - } - - iph = skb->nh.iph; - /* - * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum. - * - * Is the datagram acceptable? - * - * 1. Length at least the size of an ip header - * 2. Version of 4 - * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums] - * 4. Doesn't have a bogus length - */ - if (iph->ihl < 5 || iph->version != 4){ - wprintf("> len and version check failed\n"); - goto exit; - } - if(skb->len < ETH_HLEN + (iph->ihl << 2)){ - wprintf("> packet too short for given ihl\n"); - goto exit; - } - - check = iph->check; - //iph->check = 0; - //iph->check = compute_cksum((__u16 *)iph, (iph->ihl << 1)); - if(iph->check != check){ - wprintf("> invalid checksum\n"); - goto exit; - } - - len = ntohs(iph->tot_len); - if (skb->len < len + ETH_HLEN || len < (iph->ihl << 2)){ - wprintf("> packet too short for tot_len\n"); - goto exit; - } - skb->h.raw = skb->nh.raw + (iph->ihl << 2); - err = 0; - exit: + } return err; } @@ -539,14 +414,13 @@ * * @todo Need to check that the sa provides the correct security level. */ -int vnet_check_context(int vnet, SkbContext *context, Vnet **val){ +int vnet_check_context(VnetId *vnet, SkbContext *context, Vnet **val){ int err = 0; Vnet *info = NULL; SAState *sa = NULL; err = Vnet_lookup(vnet, &info); if(err){ - wprintf("> No vnet %d\n", vnet); goto exit; } if(!info->security) goto exit; @@ -556,7 +430,8 @@ goto exit; } if(context->protocol != IPPROTO_ESP){ - wprintf("> Invalid protocol: wanted %d, got %d\n", IPPROTO_ESP, context->protocol); + wprintf("> Invalid protocol: wanted %d, got %d\n", + IPPROTO_ESP, context->protocol); goto exit; } sa = context->data; @@ -586,13 +461,11 @@ */ static void sa_tunnel_close(Tunnel *tunnel){ SAState *sa; - dprintf(">\n"); if(!tunnel) return; sa = tunnel->data; if(!sa) return; SAState_decref(sa); tunnel->data = NULL; - dprintf("<\n"); } /** Packet send function for SA tunnels. 
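/* [Example] The deleted check_ip_frame() above verified the IPv4 header
 * checksum via compute_cksum(); per its own @todo, Linux has already done
 * these checks by the time the frame reaches this code. For reference, the
 * header checksum is the 16-bit one's complement of the one's-complement sum
 * of the header words (RFC 1071), computed with the check field zeroed. A
 * minimal sketch:
 *
 *     static u16 cksum_sketch(const u16 *words, int n){
 *         u32 sum = 0;
 *         while(n--) sum += *words++;
 *         sum = (sum & 0xffff) + (sum >> 16);  // fold the carries back in
 *         sum = (sum & 0xffff) + (sum >> 16);
 *         return (u16)~sum;
 *     }
 *
 * The removed code called this with n = iph->ihl << 1 (ihl counts 32-bit
 * words, so twice that many 16-bit words).
 */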
@@ -604,7 +477,6 @@ static int sa_tunnel_send(Tunnel *tunnel, struct sk_buff *skb){ int err = -EINVAL; SAState *sa; - //dprintf("> tunnel=%p\n", tunnel); if(!tunnel){ wprintf("> Null tunnel!\n"); goto exit; @@ -616,7 +488,6 @@ } err = SAState_send(sa, skb, tunnel->base); exit: - //dprintf("< err=%d\n", err); return err; } @@ -638,7 +509,7 @@ * @param tunnel return parameter * @return 0 on success, error code otherwise */ -int vnet_tunnel_open(u32 vnet, u32 addr, Tunnel **tunnel){ +int vnet_tunnel_open(VnetId *vnet, VarpAddr *addr, Tunnel **tunnel){ extern TunnelType *etherip_tunnel_type; int err = 0; Vnet *info = NULL; @@ -646,20 +517,17 @@ Tunnel *sa_tunnel = NULL; Tunnel *etherip_tunnel = NULL; - dprintf("> vnet=%u addr=" IPFMT "\n", vnet, NIPQUAD(addr)); err = Vnet_lookup(vnet, &info); - dprintf("> Vnet_lookup=%d\n", err); if(err) goto exit; if(info->security){ SAState *sa = NULL; - dprintf("> security=%d\n", info->security); + //FIXME: Assuming IPv4 for now. + u32 ipaddr = addr->u.ip4.s_addr; err = Tunnel_create(sa_tunnel_type, vnet, addr, base_tunnel, &sa_tunnel); if(err) goto exit; - dprintf("> sa_tunnel=%p\n", sa_tunnel); - err = sa_create(info->security, 0, IPPROTO_ESP, addr, &sa); + err = sa_create(info->security, 0, IPPROTO_ESP, ipaddr, &sa); if(err) goto exit; sa_tunnel->data = sa; - dprintf("> sa=%p\n", sa); base_tunnel = sa_tunnel; } err = Tunnel_create(etherip_tunnel_type, vnet, addr, base_tunnel, &etherip_tunnel); @@ -673,7 +541,6 @@ } else { *tunnel = etherip_tunnel; } - dprintf("< err=%d\n", err); return err; } @@ -685,14 +552,12 @@ * @param tunnel return parameter * @return 0 on success, error code otherwise */ -int vnet_tunnel_lookup(u32 vnet, u32 addr, Tunnel **tunnel){ - int err = 0; - dprintf("> vnet=%d addr=" IPFMT "\n", vnet, NIPQUAD(addr)); +int vnet_tunnel_lookup(VnetId *vnet, VarpAddr *addr, Tunnel **tunnel){ + int err = 0; *tunnel = Tunnel_lookup(vnet, addr); if(!*tunnel){ err = vnet_tunnel_open(vnet, addr, tunnel); } - dprintf("< err=%d\n", err); return err; } @@ -703,16 +568,14 @@ * @param skb packet * @return 0 on success, error code otherwise */ -int vnet_tunnel_send(vnetid_t vnet, vnetaddr_t addr, struct sk_buff *skb){ +int vnet_tunnel_send(VnetId *vnet, VarpAddr *addr, struct sk_buff *skb){ int err = 0; Tunnel *tunnel = NULL; - dprintf("> vnet=%u addr=" IPFMT "\n", vnet, NIPQUAD(addr)); err = vnet_tunnel_lookup(vnet, addr, &tunnel); if(err) goto exit; err = Tunnel_send(tunnel, skb); Tunnel_decref(tunnel); exit: - dprintf("< err=%d\n", err); return err; } @@ -722,7 +585,7 @@ vnet_exit(); esp_module_exit(); etherip_module_exit(); - tunnel_module_init(); + tunnel_module_exit(); random_module_exit(); } @@ -753,12 +616,13 @@ sa_algorithm_probe_all(); err = sa_table_init(); if(err) wprintf("> sa_table_init err=%d\n", err); + if(err) goto exit; ProcFS_init(); exit: if(err < 0){ vnet_module_exit(); - } - if(err < 0) wprintf("< err=%d\n", err); + wprintf("< err=%d\n", err); + } return err; } diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/vnet-module/vnet.h --- a/tools/vnet/vnet-module/vnet.h Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/vnet-module/vnet.h Fri Aug 26 20:47:16 2005 @@ -29,17 +29,15 @@ struct Vif; struct net_device; -typedef uint32_t vnetid_t; -typedef uint32_t vnetaddr_t; - /** Vnet property record. */ typedef struct Vnet { /** Reference count. */ atomic_t refcount; /** Vnet id. */ - vnetid_t vnet; + struct VnetId vnet; /** Security flag. If true the vnet requires ESP.
*/ int security; + char device[IFNAMSIZ]; struct net_device *dev; struct net_device *bridge; @@ -51,30 +49,28 @@ int recursion; } Vnet; -extern int Vnet_lookup(vnetid_t id, Vnet **vnet); -extern int Vnet_add(Vnet *vnet); -extern int Vnet_del(vnetid_t vnet); -extern void Vnet_incref(Vnet *); -extern void Vnet_decref(Vnet *); -extern int Vnet_alloc(Vnet **vnet); +extern void vnet_print(void); +extern void Vnet_print(Vnet *info); + +extern int Vnet_lookup(struct VnetId *vnet, struct Vnet **info); +extern int Vnet_add(struct Vnet *info); +extern int Vnet_del(struct VnetId *vnet); +extern void Vnet_incref(struct Vnet *info); +extern void Vnet_decref(struct Vnet *info); +extern int Vnet_alloc(struct Vnet **info); extern Vnet *vnet_physical; extern int skb_xmit(struct sk_buff *skb); -extern int vnet_skb_send(struct sk_buff *skb, u32 vnet); -extern int vnet_skb_recv(struct sk_buff *skb, u32 vnet, struct Vmac *vmac); +extern int vnet_skb_send(struct sk_buff *skb, struct VnetId *vnet); +extern int vnet_skb_recv(struct sk_buff *skb, struct VnetId *vnet, struct Vmac *vmac); -extern int vnet_check_context(int vnet, SkbContext *context, Vnet **vinfo); +extern int vnet_check_context(struct VnetId *vnet, SkbContext *context, Vnet **vinfo); -extern int vnet_tunnel_open(vnetid_t vnet, vnetaddr_t addr, Tunnel **tunnel); -extern int vnet_tunnel_lookup(vnetid_t vnet, vnetaddr_t addr, Tunnel **tunnel); -extern int vnet_tunnel_send(vnetid_t vnet, vnetaddr_t addr, struct sk_buff *skb); +extern int vnet_tunnel_open(struct VnetId *vnet, struct VarpAddr *addr, Tunnel **tunnel); +extern int vnet_tunnel_lookup(struct VnetId *vnet, struct VarpAddr *addr, Tunnel **tunnel); +extern int vnet_tunnel_send(struct VnetId *vnet, struct VarpAddr *addr, struct sk_buff *skb); extern int vnet_init(void); - -enum { - HANDLE_OK = 1, - HANDLE_NO = 0, -}; extern int vnet_sa_security(u32 spi, int protocol, u32 addr); struct SAState; diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/vnet-module/vnet_dev.c --- a/tools/vnet/vnet-module/vnet_dev.c Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/vnet-module/vnet_dev.c Fri Aug 26 20:47:16 2005 @@ -48,15 +48,9 @@ #undef DEBUG #include "debug.h" -#define VNETIF_FMT "vnetif%u" -#define VNETBR_FMT "vnet%u" - #ifndef CONFIG_BRIDGE #error Must configure ethernet bridging in Network Options #endif - -#include <linux/../../net/bridge/br_private.h> -#define dev_bridge(_dev) ((struct net_bridge *)(_dev)->priv) static void vnet_dev_destructor(struct net_device *dev){ dprintf(">\n"); @@ -113,135 +107,16 @@ Vnet *vnet = (void*)dev->priv; dprintf(">\n"); - dprintf("> vnet=%d\n", vnet->vnet); - snprintf(dev->name, IFNAMSIZ - 1, VNETIF_FMT, vnet->vnet); - if(__dev_get_by_name(dev->name)){ + if(__dev_get_by_name(vnet->device)){ err = -ENOMEM; - } + wprintf("> vnet device name in use: %s\n", vnet->device); + } + strcpy(dev->name, vnet->device); dprintf("< err=%d\n", err); return err; } -//============================================================================ -#ifdef CONFIG_VNET_BRIDGE - -#define BRIDGE DEVICE - -void vnet_bridge_fini(Vnet *vnet){ - if(!vnet) return; - if(vnet->bridge){ - br_del_bridge(vnet->bridge->name); - vnet->bridge = NULL; - } -} - -/** Create the bridge for a vnet, and add the - * vnet interface to it. 
- * - * @param vnet vnet - * @return 0 on success, error code otherwise - */ -int vnet_bridge_init(Vnet *vnet){ - int err = 0; - char bridge[IFNAMSIZ] = {}; - struct net_bridge *br; - vnet->bridge = NULL; - snprintf(bridge, IFNAMSIZ - 1, VNETBR_FMT, vnet->vnet); - rtnl_lock(); - err = br_add_bridge(bridge); - rtnl_unlock(); - if(err){ - dprintf("> Error creating vnet bridge %s: err=%d\n", bridge, err); - goto exit; - } - vnet->bridge = __dev_get_by_name(bridge); - if(!vnet->bridge){ - wprintf("> Vnet bridge %s is null!\n", bridge); - err = -EINVAL; - goto exit; - } - br = dev_bridge(vnet->bridge); - br->stp_enabled = 0; - br->bridge_hello_time = 0; - br->hello_time = 0; - br->bridge_forward_delay = 0; - br->forward_delay = 0; - rtnl_lock(); - err = br_add_if(br, vnet->dev); - rtnl_unlock(); - if(err){ - dprintf("> Error adding vif %s to vnet bridge %s: err=%d\n", - vnet->dev->name, bridge, err); - goto exit; - } - rtnl_lock(); - dev_open(vnet->dev); - dev_open(vnet->bridge); - rtnl_unlock(); - exit: - if(err){ - if(vnet->bridge){ - rtnl_lock(); - br_del_bridge(bridge); - rtnl_unlock(); - vnet->bridge = NULL; - } - } - return err; -} - - -/** Add an interface to the bridge for a vnet. - * - * @param vnet vnet - * @param dev interface - * @return 0 on success, error code otherwise - */ -int vnet_add_if(Vnet *vnet, struct net_device *dev){ - int err = 0; - struct net_device *brdev; - - dprintf(">\n"); - if(!vnet->bridge){ - err = -EINVAL; - goto exit; - } - // Delete the interface from the default bridge. - // todo: Really want to delete it from any bridge it's in. - if(!vnet_get_device(BRIDGE, &brdev)){ - rtnl_lock(); - br_del_if(dev_bridge(brdev), dev); - rtnl_unlock(); - } - dprintf("> br_add_if %s %s\n", vnet->bridge->name, dev->name); - rtnl_lock(); - dev_open(dev); - dev_open(vnet->bridge); - err = br_add_if(dev_bridge(vnet->bridge), dev); - rtnl_unlock(); - exit: - dprintf("< err=%d\n", err); - return err; -} - -int vnet_del_if(Vnet *vnet, struct net_device *dev){ - int err = 0; - - dprintf(">\n"); - if(!vnet->bridge){ - err = -EINVAL; - goto exit; - } - rtnl_lock(); - br_del_if(dev_bridge(vnet->bridge), dev); - rtnl_unlock(); - exit: - dprintf("< err=%d\n", err); - return err; -} - - -/** Create the bridge and virtual interface for a vnet. +/** Create the virtual interface for a vnet. * * @param info vnet * @return 0 on success, error code otherwise @@ -249,25 +124,13 @@ int Vnet_create(Vnet *info){ int err = 0; - dprintf("> %u\n", info->vnet); err = vnet_dev_add(info); if(err) goto exit; - dprintf("> vnet_bridge_init\n"); - err = vnet_bridge_init(info); - if(err) goto exit; - dprintf("> Vnet_add...\n"); err = Vnet_add(info); exit: - if(err){ - dprintf("> vnet_bridge_fini...\n"); - vnet_bridge_fini(info); - } - dprintf("< err=%d\n", err); return err; } - - /** Remove the net device for a vnet. * Clears the dev field of the vnet. * Safe to call if the vnet or its dev are null. 
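/* [Note] With the CONFIG_VNET_BRIDGE code removed, Vnet_create() above only
 * registers the vnif device and adds the table entry; putting the device on
 * a bridge is left to user space (brctl, per the comments in vnet_ioctl.c
 * below). The interface name is now fixed when the vnet is created rather
 * than derived in vnet_dev_set_name(): unless overridden it is "vnif%04x"
 * over the low 16 bits of the id, as in eval_vnet_add() below:
 *
 *     char dev[IFNAMSIZ] = {};
 *     snprintf(dev, IFNAMSIZ-1, "vnif%04x", ntohs(vnet.u.vnet16[7]));
 *     // e.g. low id bits 0x0101 -> "vnif0101"
 */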
@@ -276,75 +139,13 @@ */ void vnet_dev_remove(Vnet *vnet){ if(!vnet) return; - dprintf("> vnet=%u\n", vnet->vnet); - if(vnet->bridge){ - dprintf("> br_del_bridge(%s)\n", vnet->bridge->name); - rtnl_lock(); - br_del_bridge(vnet->bridge->name); - rtnl_unlock(); - vnet->bridge = NULL; - } if(vnet->dev){ //dev_put(vnet->dev); dprintf("> unregister_netdev(%s)\n", vnet->dev->name); unregister_netdev(vnet->dev); vnet->dev = NULL; } - dprintf("<\n"); -} - -//============================================================================ -#else -//============================================================================ - -/** Create the virtual interface for a vnet. - * - * @param info vnet - * @return 0 on success, error code otherwise - */ -int Vnet_create(Vnet *info){ - int err = 0; - - dprintf("> %u\n", info->vnet); - err = vnet_dev_add(info); - if(err) goto exit; - dprintf("> Vnet_add...\n"); - err = Vnet_add(info); - exit: - dprintf("< err=%d\n", err); - return err; -} - -int vnet_add_if(Vnet *vnet, struct net_device *dev){ - int err = -ENOSYS; - return err; -} - - -int vnet_del_if(Vnet *vnet, struct net_device *dev){ - int err = 0; - return err; -} - -/** Remove the net device for a vnet. - * Clears the dev field of the vnet. - * Safe to call if the vnet or its dev are null. - * - * @param vnet vnet - */ -void vnet_dev_remove(Vnet *vnet){ - if(!vnet) return; - dprintf("> vnet=%u\n", vnet->vnet); - if(vnet->dev){ - //dev_put(vnet->dev); - dprintf("> unregister_netdev(%s)\n", vnet->dev->name); - unregister_netdev(vnet->dev); - vnet->dev = NULL; - } - dprintf("<\n"); -} -#endif -//============================================================================ +} static int vnet_dev_open(struct net_device *dev){ int err = 0; @@ -365,6 +166,7 @@ static int vnet_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev){ int err = 0; Vnet *vnet = dev->priv; + int len = 0; dprintf("> skb=%p\n", skb); if(vnet->recursion++) { @@ -385,12 +187,14 @@ skb->mac.raw = skb->data; } //dev->trans_start = jiffies; - err = vnet_skb_send(skb, vnet->vnet); + len = skb->len; + // Must not use skb pointer after vnet_skb_send(). 
+ err = vnet_skb_send(skb, &vnet->vnet); if(err < 0){ vnet->stats.tx_errors++; } else { vnet->stats.tx_packets++; - vnet->stats.tx_bytes += skb->len; + vnet->stats.tx_bytes += len; } exit: vnet->recursion--; @@ -416,43 +220,48 @@ struct net_device *dev, unsigned short type, void *daddr, void *saddr, unsigned len){ int err = 0; - dprintf("> skb=%p ethhdr=%p dev=%s len=%u\n", - skb, skb->mac.raw, dev->name, len); - if(saddr){ - dprintf("> saddr=" MACFMT "\n", MAC6TUPLE((unsigned char*)saddr)); - } else { - dprintf("> saddr=NULL\n"); - } - if(daddr){ - dprintf("> daddr=" MACFMT "\n", MAC6TUPLE((unsigned char*)daddr)); - } else { - dprintf("> daddr=NULL\n"); - } + err = eth_hard_header(skb, dev, type, daddr, saddr, len); - dprintf("> eth_hard_header=%d\n", err); + if(err) goto exit; skb->mac.raw = skb->data; - dprintf("> src=" MACFMT " dst=" MACFMT "\n", - MAC6TUPLE(skb->mac.ethernet->h_source), - MAC6TUPLE(skb->mac.ethernet->h_dest)); - dprintf("< err=%d\n", err); + exit: + return err; +} + +void vnet_default_mac(unsigned char *mac) +{ + static unsigned val = 1; + mac[0] = 0xAA; + mac[1] = 0xFF; + mac[2] = (unsigned char)((val >> 24) & 0xff); + mac[3] = (unsigned char)((val >> 16) & 0xff); + mac[4] = (unsigned char)((val >> 8) & 0xff); + mac[5] = (unsigned char)((val ) & 0xff); + val++; +} + +int vnet_device_mac(const char *device, unsigned char *mac){ + int err; + struct net_device *dev; + + err = vnet_get_device(device, &dev); + if(err) goto exit; + memcpy(mac, dev->dev_addr, ETH_ALEN); + dev_put(dev); + exit: return err; } void vnet_dev_mac(unsigned char *mac){ - static unsigned val = 1; - struct net_device *dev; - - if(vnet_get_device(DEVICE, &dev)){ - mac[0] = 0xAA; - mac[1] = 0xFF; - mac[2] = (unsigned char)((val >> 24) & 0xff); - mac[3] = (unsigned char)((val >> 16) & 0xff); - mac[4] = (unsigned char)((val >> 8) & 0xff); - mac[5] = (unsigned char)((val ) & 0xff); - val++; - } else { - memcpy(mac, dev->dev_addr, ETH_ALEN); - dev_put(dev); + const char *devices[] = { "eth0", "eth1", "eth2", NULL }; + const char **pdev; + int err = -ENODEV; + + for(pdev = devices; err && *pdev; pdev++){ + err = vnet_device_mac(*pdev, mac); + } + if(err){ + vnet_default_mac(mac); } } @@ -463,7 +272,9 @@ dprintf(">\n"); ether_setup(dev); - if(!eth_hard_header) eth_hard_header = dev->hard_header; + if(!eth_hard_header){ + eth_hard_header = dev->hard_header; + } dev->hard_header = vnet_dev_hard_header; dev->open = vnet_dev_open; @@ -507,7 +318,10 @@ if(vnet->dev) goto exit; vnet->header_n = sizeof(struct iphdr) + sizeof(struct etheriphdr); dev = kmalloc(sizeof(struct net_device), GFP_ATOMIC); - if(!dev){ err = -ENOMEM; goto exit; } + if(!dev){ + err = -ENOMEM; + goto exit; + } *dev = (struct net_device){}; dev->priv = vnet; vnet->dev = dev; @@ -515,9 +329,10 @@ err = vnet_dev_set_name(dev); if(err) goto exit; vnet_dev_init(dev); - dprintf("> name=%s, register_netdev...\n", dev->name); err = register_netdev(dev); - dprintf("> register_netdev=%d\n", err); + if(err){ + wprintf("> register_netdev(%s) = %d\n", dev->name, err); + } if(err) goto exit; rtnl_lock(); dev_open(dev); diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/vnet-module/vnet_dev.h --- a/tools/vnet/vnet-module/vnet_dev.h Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/vnet-module/vnet_dev.h Fri Aug 26 20:47:16 2005 @@ -20,12 +20,9 @@ #define _VNET_VNET_DEV_H_ struct Vnet; -struct net_device; extern int vnet_dev_add(struct Vnet *vnet); extern void vnet_dev_remove(struct Vnet *vnet); extern int Vnet_create(struct Vnet *info); -extern int vnet_add_if(struct 
Vnet *vnet, struct net_device *dev); -extern int vnet_del_if(struct Vnet *vnet, struct net_device *dev); #endif diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/vnet-module/vnet_ioctl.c --- a/tools/vnet/vnet-module/vnet_ioctl.c Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/vnet-module/vnet_ioctl.c Fri Aug 26 20:47:16 2005 @@ -59,7 +59,7 @@ on the kernel interface being available to us (it's not exported @!$"%!). Create a vnet N: -- create the vnet device vnetifN: using commands to /proc, kernel api +- create the vnet device vnifN: using commands to /proc, kernel api - create the vnet bridge vnetN: using brctl in user-space - for best results something should keep track of the mapping vnet id <-> bridge name @@ -312,7 +312,6 @@ err = Parser_input(parser, NULL, 0); if(err) goto exit; obj = parser->val; - objprint(iostdout, obj, 0); IOStream_print(iostdout, "\n"); for(l = obj; CONSP(l); l = CDR(l)){ err = eval(CAR(l)); if(err) break; @@ -451,6 +450,7 @@ return err; } +#if 0 static int intof(Sxpr exp, int *v){ int err = 0; char *s; @@ -473,6 +473,24 @@ err = intof(val, v); return err; } +#endif + +static int vnetof(Sxpr exp, VnetId *v){ + int err = 0; + char *s; + err = stringof(exp, &s); + if(err) goto exit; + err = VnetId_aton(s, v); + exit: + return err; +} + +static int child_vnet(Sxpr exp, Sxpr key, VnetId *v){ + int err = 0; + Sxpr val = sxpr_child_value(exp, key, ONONE); + err = vnetof(val, v); + return err; +} static int macof(Sxpr exp, unsigned char *v){ int err = 0; @@ -515,20 +533,27 @@ * It is an error if a vnet with the same id exists. * * @param vnet vnet id + * @param device vnet device name * @param security security level * @return 0 on success, error code otherwise */ -static int ctrl_vnet_add(int vnet, int security){ +static int ctrl_vnet_add(VnetId *vnet, char *device, int security){ int err = 0; Vnet *vnetinfo = NULL; + + if(strlen(device) >= IFNAMSIZ){ + err = -EINVAL; + goto exit; + } if(Vnet_lookup(vnet, &vnetinfo) == 0){ err = -EEXIST; goto exit; } err = Vnet_alloc(&vnetinfo); if(err) goto exit; - vnetinfo->vnet = vnet; + vnetinfo->vnet = *vnet; vnetinfo->security = security; + strcpy(vnetinfo->device, device); err = Vnet_create(vnetinfo); exit: if(vnetinfo) Vnet_decref(vnetinfo); @@ -540,9 +565,15 @@ * @param vnet vnet id * @return 0 on success, error code otherwise */ -static int ctrl_vnet_del(int vnet){ +static int ctrl_vnet_del(VnetId *vnet){ int err = -ENOSYS; // Can't delete if there are any vifs on the vnet. + + // Need to flush vif entries for the deleted vnet. + // Need to flush varp entries for the deleted vnet. + // Note that (un)register_netdev() hold rtnl_lock() around + // (un)register_netdevice(). + //Vnet_del(vnet); return err; } @@ -553,7 +584,7 @@ * @param vmac mac address * @return 0 on success, error code otherwise */ -static int ctrl_vif_add(int vnet, Vmac *vmac){ +static int ctrl_vif_add(VnetId *vnet, Vmac *vmac){ int err = 0; Vnet *vnetinfo = NULL; Vif *vif = NULL; @@ -561,7 +592,7 @@ dprintf(">\n"); err = Vnet_lookup(vnet, &vnetinfo); if(err) goto exit; - err = vif_add(vnet, vmac, &vif); + err = vif_create(vnet, vmac, &vif); exit: if(vnetinfo) Vnet_decref(vnetinfo); if(vif) vif_decref(vif); @@ -569,46 +600,13 @@ return err; } -/** Add net device 'vifname' to the bridge for 'vnet' and - * create an entry for a vif with the given vnet and vmac. - * This is used when device 'vifname' is a virtual device - * connected to a vif in a vm. - * - * @param vifname name of device to bridge +/** Delete a vif. 
+ * * @param vnet vnet id * @param vmac mac address * @return 0 on success, error code otherwise */ -static int ctrl_vif_conn(char *vifname, int vnet, Vmac *vmac){ - int err = 0; - Vnet *vnetinfo = NULL; - struct net_device *vifdev = NULL; - Vif *vif = NULL; - - dprintf("> %s\n", vifname); - err = Vnet_lookup(vnet, &vnetinfo); - if(err) goto exit; - err = vif_add(vnet, vmac, &vif); - if(err) goto exit; - err = vnet_get_device(vifname, &vifdev); - if(err) goto exit; - vif->dev = vifdev; - err = vnet_add_if(vnetinfo, vifdev); - exit: - if(vnetinfo) Vnet_decref(vnetinfo); - if(vif) vif_decref(vif); - if(vifdev) dev_put(vifdev); - dprintf("< err=%d\n", err); - return err; -} - -/** Delete a vif. - * - * @param vnet vnet id - * @param vmac mac address - * @return 0 on success, error code otherwise - */ -static int ctrl_vif_del(int vnet, Vmac *vmac){ +static int ctrl_vif_del(VnetId *vnet, Vmac *vmac){ int err = 0; Vnet *vnetinfo = NULL; Vif *vif = NULL; @@ -618,10 +616,6 @@ if(err) goto exit; err = vif_lookup(vnet, vmac, &vif); if(err) goto exit; - if(vif->dev){ - vnet_del_if(vnetinfo, vif->dev); - vif->dev = NULL; - } vif_remove(vnet, vmac); exit: if(vnetinfo) Vnet_decref(vnetinfo); @@ -652,21 +646,37 @@ return err; } -/** (vnet.add (id <id>) [(security { none | auth | conf } )] ) +/** (varp.flush) + */ +static int eval_varp_flush(Sxpr exp){ + int err = 0; + varp_flush(); + return err; +} + +/** (vnet.add (id <id>) + * [(vnetif <name>)] + * [(security { none | auth | conf } )] + * ) */ static int eval_vnet_add(Sxpr exp){ int err = 0; Sxpr oid = intern("id"); Sxpr osecurity = intern("security"); + Sxpr ovnetif = intern("vnetif"); Sxpr csecurity; - int id; - char *security; + VnetId vnet = {}; + char *device = NULL; + char dev[IFNAMSIZ] = {}; + char *security = NULL; int sec; - err = child_int(exp, oid, &id); - if(err) goto exit; - if(id < VNET_VIF){ - err = -EINVAL; - goto exit; + + err = child_vnet(exp, oid, &vnet); + if(err) goto exit; + child_string(exp, ovnetif, &device); + if(!device){ + snprintf(dev, IFNAMSIZ-1, "vnif%04x", ntohs(vnet.u.vnet16[7])); + device = dev; } csecurity = sxpr_child_value(exp, osecurity, intern("none")); err = stringof(csecurity, &security); @@ -681,8 +691,7 @@ err = -EINVAL; goto exit; } - dprintf("> vnet id=%d\n", id); - err = ctrl_vnet_add(id, sec); + err = ctrl_vnet_add(&vnet, device, sec); exit: dprintf("< err=%d\n", err); return err; @@ -698,11 +707,11 @@ static int eval_vnet_del(Sxpr exp){ int err = 0; Sxpr oid = intern("id"); - int id; - - err = child_int(exp, oid, &id); - if(err) goto exit; - err = ctrl_vnet_del(id); + VnetId vnet = {}; + + err = child_vnet(exp, oid, &vnet); + if(err) goto exit; + err = ctrl_vnet_del(&vnet); exit: return err; } @@ -713,55 +722,32 @@ int err = 0; Sxpr ovnet = intern("vnet"); Sxpr ovmac = intern("vmac"); - int vnet; + VnetId vnet = {}; Vmac vmac = {}; - err = child_int(exp, ovnet, &vnet); + err = child_vnet(exp, ovnet, &vnet); if(err) goto exit; err = child_mac(exp, ovmac, vmac.mac); if(err) goto exit; - err = ctrl_vif_add(vnet, &vmac); - exit: - return err; -} - -/** (vif.conn (vif <name>) (vnet <id>) (vmac <mac>)) - */ -static int eval_vif_conn(Sxpr exp){ - int err = 0; - Sxpr ovif = intern("vif"); + err = ctrl_vif_add(&vnet, &vmac); + exit: + return err; +} + +/** (vif.del (vnet <vnet>) (vmac <macaddr>)) + */ +static int eval_vif_del(Sxpr exp){ + int err = 0; Sxpr ovnet = intern("vnet"); Sxpr ovmac = intern("vmac"); - char *vif = NULL; - int vnet = 0; + VnetId vnet = {}; Vmac vmac = {}; - err = child_string(exp, ovif, 
&vif); - if(err) goto exit; - err = child_int(exp, ovnet, &vnet); + err = child_vnet(exp, ovnet, &vnet); if(err) goto exit; err = child_mac(exp, ovmac, vmac.mac); - dprintf("> connect vif=%s vnet=%d\n", vif, vnet); - err = ctrl_vif_conn(vif, vnet, &vmac); - exit: - dprintf("< err=%d\n", err); - return err; -} - -/** (vif.del (vnet <vnet>) (vmac <macaddr>)) - */ -static int eval_vif_del(Sxpr exp){ - int err = 0; - Sxpr ovnet = intern("vnet"); - Sxpr ovmac = intern("vmac"); - int vnet; - Vmac vmac = {}; - - err = child_int(exp, ovnet, &vnet); - if(err) goto exit; - err = child_mac(exp, ovmac, vmac.mac); - if(err) goto exit; - err = ctrl_vif_del(vnet, &vmac); + if(err) goto exit; + err = ctrl_vif_del(&vnet, &vmac); exit: return err; } @@ -776,23 +762,23 @@ SxprEval defs[] = { { intern("varp.print"), eval_varp_print }, { intern("varp.mcaddr"), eval_varp_mcaddr }, + { intern("varp.flush"), eval_varp_flush }, { intern("vif.add"), eval_vif_add }, - { intern("vif.conn"), eval_vif_conn }, { intern("vif.del"), eval_vif_del }, { intern("vnet.add"), eval_vnet_add }, { intern("vnet.del"), eval_vnet_del }, { ONONE, NULL } }; SxprEval *def; - dprintf(">\n"); - err = -EINVAL; + iprintf("> "); objprint(iostdout, exp, 0); IOStream_print(iostdout, "\n"); + err = -ENOSYS; for(def = defs; !NONEP(def->elt); def++){ if(sxpr_elementp(exp, def->elt)){ err = def->fn(exp); break; } } - dprintf("< err=%d\n", err); + iprintf("< err=%d\n", err); return err; } diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/vnetd/Makefile --- a/tools/vnet/vnetd/Makefile Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/vnetd/Makefile Fri Aug 26 20:47:16 2005 @@ -16,32 +16,29 @@ # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA #---------------------------------------------------------------------------- +VNET_ROOT = $(shell cd .. && pwd) +include $(VNET_ROOT)/Make.env + all: vnetd #---------------------------------------------------------------------------- -XEN_ROOT = ../../.. include $(XEN_ROOT)/tools/Rules.mk VNETD_INSTALL_DIR = /usr/sbin -LIB_DIR = ../libxutil -VNET_DIR = ../vnet-module - -INCLUDES += -I$(LIB_DIR) -INCLUDES += -I$(VNET_DIR) +INCLUDES += -I$(LIBXUTIL_DIR) +INCLUDES += -I$(VNET_MODULE_DIR) #---------------------------------------------------------------------------- # GC. 
-GC_DIR:=../gc/install -GC_INCLUDE:= $(GC_DIR)/include -GC_LIB_DIR:=$(GC_DIR)/lib INCLUDES += -I$(GC_INCLUDE) #LIBS += -L$(GC_LIB_DIR) CPPFLAGS += -D USE_GC #---------------------------------------------------------------------------- +CFLAGS += -g CFLAGS += -Wall CFLAGS += $(INCLUDES) $(LIBS) @@ -51,7 +48,7 @@ CFLAGS += -Wp,-MD,.$(@F).d PROG_DEP = .*.d -vpath %.c $(LIB_DIR) +vpath %.c $(LIBXUTIL_DIR) IPATHS:=$(INCLUDES:-I=) vpath %.h $(IPATHS) @@ -83,9 +80,9 @@ VNETD_OBJ := $(VNETD_SRC:.c=.o) -#VNETD_LIBS:= $(GC_LIB_DIR)/libgc.so.1.0.2 +#VNETD_LIBS:= $(GC_LIB_SO) #VNETD_LIBS:= -lgc -VNETD_LIBS:= $(GC_LIB_DIR)/libgc.a +VNETD_LIBS:= $(GC_LIB_A) vnetd: $(VNETD_OBJ) $(CC) $(CFLAGS) -o $@ $^ $(VNETD_LIBS) -ldl -lpthread @@ -95,8 +92,8 @@ install -m 0755 vnetd $(DESTDIR)$(VNETD_INSTALL_DIR) clean: - -rm -f *.a *.o *~ - -rm -f vnetd - -rm -f $(PROG_DEP) + -@$(RM) *.a *.o *~ + -@$(RM) vnetd + -@$(RM) $(PROG_DEP) -include $(PROG_DEP) diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/vnetd/vcache.c --- a/tools/vnet/vnetd/vcache.c Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/vnetd/vcache.c Fri Aug 26 20:47:16 2005 @@ -44,6 +44,8 @@ #undef DEBUG #include "debug.h" +#include "varp_util.c" + static VarpCache *vcache = NULL; void IPMessageQueue_init(IPMessageQueue *queue, int maxlen){ @@ -97,16 +99,20 @@ * @param vmac vmac (in network order) * @return 0 on success, error code otherwise */ -int varp_send(Conn *conn, uint16_t opcode, uint32_t vnet, Vmac *vmac, uint32_t addr){ +int varp_send(Conn *conn, uint16_t opcode, VnetId *vnet, Vmac *vmac, VarpAddr *addr){ int err = 0; int varp_n = sizeof(VarpHdr); VarpHdr varph = {}; - - varph.vnetmsghdr.id = htons(VARP_ID); - varph.vnetmsghdr.opcode = htons(opcode); - varph.vnet = vnet; - varph.vmac = *vmac; - varph.addr = addr; +#ifdef DEBUG + char vnetbuf[VNET_ID_BUF]; + char addrbuf[VARP_ADDR_BUF]; +#endif + + varph.hdr.id = htons(VARP_ID); + varph.hdr.opcode = htons(opcode); + varph.vnet = *vnet; + varph.vmac = *vmac; + varph.addr = *addr; if(0){ struct sockaddr_in self; @@ -117,8 +123,10 @@ } dprintf("> addr=%s opcode=%d\n", inet_ntoa(conn->addr.sin_addr), opcode); - dprintf("> vnet=%d vmac=" MACFMT " addr=" IPFMT "\n", - ntohl(vnet), MAC6TUPLE(vmac->mac), NIPQUAD(addr)); + dprintf("> vnet=%s vmac=" MACFMT " addr=%s\n", + VnetId_ntoa(vnet, vnetbuf), + MAC6TUPLE(vmac->mac), + VarpAddr_ntoa(addr, addrbuf)); err = marshal_bytes(conn->out, &varph, varp_n); marshal_flush(conn->out); dprintf("< err=%d\n", err); @@ -157,21 +165,24 @@ */ void VCEntry_print(VCEntry *ventry){ if(ventry){ - char *c, *d; + char *state, *flags; + char vnetbuf[VNET_ID_BUF]; + char addrbuf[VARP_ADDR_BUF]; + switch(ventry->state){ - case VCACHE_STATE_INCOMPLETE: c = "INC"; break; - case VCACHE_STATE_REACHABLE: c = "RCH"; break; - case VCACHE_STATE_FAILED: c = "FLD"; break; - default: c = "UNK"; break; - } - d = (VCEntry_get_flags(ventry, VCACHE_FLAG_PROBING) ? "P" : " "); - - printf("VENTRY(%p %s %s vnet=%d vmac=" MACFMT " addr=" IPFMT " time=%g)\n", + case VCACHE_STATE_INCOMPLETE: state = "INC"; break; + case VCACHE_STATE_REACHABLE: state = "RCH"; break; + case VCACHE_STATE_FAILED: state = "FLD"; break; + default: state = "UNK"; break; + } + flags = (VCEntry_get_flags(ventry, VCACHE_FLAG_PROBING) ? 
"P" : " "); + + printf("VENTRY(%p %s %s vnet=%s vmac=" MACFMT " addr=%s time=%g)\n", ventry, - c, d, - ntohl(ventry->key.vnet), + state, flags, + VnetId_ntoa(&ventry->key.vnet, vnetbuf), MAC6TUPLE(ventry->key.vmac.mac), - NIPQUAD(ventry->addr), + VarpAddr_ntoa(&ventry->addr, addrbuf), ventry->timestamp); } else { printf("VENTRY: Null!\n"); @@ -239,11 +250,11 @@ * @param vmac virtual MAC address (copied) * @return ventry or null */ -VCEntry * VCEntry_new(uint32_t vnet, Vmac *vmac){ +VCEntry * VCEntry_new(VnetId *vnet, Vmac *vmac){ VCEntry *z = ALLOCATE(VCEntry); z->state = VCACHE_STATE_INCOMPLETE; z->timestamp = time_now(); - z->key.vnet = vnet; + z->key.vnet = *vnet; z->key.vmac = *vmac; return z; } @@ -256,15 +267,9 @@ */ Hashcode vcache_key_hash_fn(void *k){ VCKey *key = k; - Hashcode h; - h = hash_2ul(key->vnet, - (key->vmac.mac[0] << 24) | - (key->vmac.mac[1] << 16) | - (key->vmac.mac[2] << 8) | - (key->vmac.mac[3] )); - h = hash_hul(h, - (key->vmac.mac[4] << 8) | - (key->vmac.mac[5] )); + Hashcode h = 0; + h = VnetId_hash(h, &key->vnet); + h = Vmac_hash(h, &key->vmac); return h; } @@ -278,8 +283,8 @@ int vcache_key_equal_fn(void *k1, void *k2){ VCKey *key1 = k1; VCKey *key2 = k2; - return (key1->vnet == key2->vnet) - && (memcmp(key1->vmac.mac, key2->vmac.mac, ETH_ALEN) == 0); + return (VnetId_eq(&key1->vnet , &key2->vnet) && + Vmac_eq(&key1->vmac, &key2->vmac)); } void VarpCache_schedule(VarpCache *z); @@ -351,7 +356,7 @@ * @param vmac virtual MAC address (copied) * @return new entry or null */ -VCEntry * VarpCache_add(VarpCache *z, uint32_t vnet, Vmac *vmac){ +VCEntry * VarpCache_add(VarpCache *z, VnetId *vnet, Vmac *vmac){ VCEntry *ventry; HTEntry *entry; @@ -378,8 +383,8 @@ * @param vmac virtual MAC addres * @return entry found or null */ -VCEntry * VarpCache_lookup(VarpCache *z, uint32_t vnet, Vmac *vmac){ - VCKey key = { .vnet = vnet, .vmac = *vmac }; +VCEntry * VarpCache_lookup(VarpCache *z, VnetId *vnet, Vmac *vmac){ + VCKey key = { .vnet = *vnet, .vmac = *vmac }; VCEntry *ventry; ventry = HashTable_get(z->table, &key); return ventry; @@ -389,13 +394,15 @@ dprintf(">\n"); if(VCEntry_get_flags(ventry, VCACHE_FLAG_LOCAL_PROBE)){ dprintf("> local probe\n"); - varp_send(vnetd->bcast_conn, VARP_OP_REQUEST, ventry->key.vnet, &ventry->key.vmac, ventry->addr); + varp_send(vnetd->bcast_conn, VARP_OP_REQUEST, + &ventry->key.vnet, &ventry->key.vmac, &ventry->addr); } if(VCEntry_get_flags(ventry, VCACHE_FLAG_REMOTE_PROBE)){ ConnList *l; dprintf("> remote probe\n"); for(l = vnetd->connections; l; l = l->next){ - varp_send(l->conn, VARP_OP_REQUEST, ventry->key.vnet, &ventry->key.vmac, ventry->addr); + varp_send(l->conn, VARP_OP_REQUEST, + &ventry->key.vnet, &ventry->key.vmac, &ventry->addr); } } @@ -440,7 +447,8 @@ IPMessage *msg; while((msg = IPMessageQueue_pop(&ventry->queue))){ dprintf("> announce\n"); - varp_send(msg->conn, VARP_OP_ANNOUNCE, ventry->key.vnet, &ventry->key.vmac, ventry->addr); + varp_send(msg->conn, VARP_OP_ANNOUNCE, + &ventry->key.vnet, &ventry->key.vmac, &ventry->addr); } } exit: @@ -459,7 +467,7 @@ VCEntry *ventry; dprintf(">\n"); - ventry = VarpCache_lookup(z, varph->vnet, &varph->vmac); + ventry = VarpCache_lookup(z, &varph->vnet, &varph->vmac); if(ventry){ err = VCEntry_update(ventry, msg, varph, state); } else { @@ -503,14 +511,14 @@ * @param local whether it's local or not */ void vcache_forward_varp(VarpHdr *varph, int local){ - uint16_t opcode = ntohs(varph->vnetmsghdr.opcode); + uint16_t opcode = ntohs(varph->hdr.opcode); if(local){ ConnList *l; for(l = 
vnetd->connections; l; l = l->next){ - varp_send(l->conn, opcode, varph->vnet, &varph->vmac, varph->addr); + varp_send(l->conn, opcode, &varph->vnet, &varph->vmac, &varph->addr); } } else { - varp_send(vnetd->bcast_conn, opcode, varph->vnet, &varph->vmac, varph->addr); + varp_send(vnetd->bcast_conn, opcode, &varph->vnet, &varph->vmac, &varph->addr); } } @@ -531,13 +539,13 @@ #else int vcache_handle_request(IPMessage *msg, VarpHdr *varph, int local){ int err = -ENOENT; - uint32_t vnet; + VnetId *vnet; Vmac *vmac; VCEntry *ventry = NULL; int reply = 0; dprintf(">\n"); - vnet = htonl(varph->vnet); + vnet = &varph->vnet; vmac = &varph->vmac; ventry = VarpCache_lookup(vcache, vnet, vmac); if(!ventry){ @@ -605,13 +613,18 @@ VarpHdr *varph = &vmsg->varp.varph; dprintf(">\n"); - if(1){ +#ifdef DEBUG + { + char vnetbuf[VNET_ID_BUF]; dprintf("> src=%s:%d\n", inet_ntoa(msg->saddr.sin_addr), ntohs(msg->saddr.sin_port)); dprintf("> dst=%s:%d\n", inet_ntoa(msg->daddr.sin_addr), ntohs(msg->daddr.sin_port)); - dprintf("> opcode=%d vnet=%u vmac=" MACFMT "\n", - ntohs(varph->opcode), ntohl(varph->vnet), MAC6TUPLE(varph->vmac.mac)); - } - switch(ntohs(varph->vnetmsghdr.opcode)){ + dprintf("> opcode=%d vnet=%s vmac=" MACFMT "\n", + ntohs(varph->opcode), + VnetId_ntoa(&varph->vnet, vnetbuf), + MAC6TUPLE(varph->vmac.mac)); + } +#endif + switch(ntohs(varph->hdr.opcode)){ case VARP_OP_REQUEST: err = vcache_handle_request(msg, varph, local); break; diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/vnetd/vcache.h --- a/tools/vnet/vnetd/vcache.h Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/vnetd/vcache.h Fri Aug 26 20:47:16 2005 @@ -93,7 +93,7 @@ /** Key for varp cache entries. */ typedef struct VCKey { /** Vnet id (network order). */ - uint32_t vnet; + VnetId vnet; /** Virtual MAC address. */ Vmac vmac; } VCKey; @@ -103,7 +103,7 @@ VCKey key; /** Care-of address for the key. */ - uint32_t addr; + VarpAddr addr; /** Alias coa if we are a gateway. */ //uint32_t gateway; @@ -111,7 +111,7 @@ //uint32_t encaps; /** Where this entry came from. */ - uint32_t source; + VarpAddr source; /** Last-updated timestamp. 
*/ double timestamp; diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/vnetd/vnetd.c --- a/tools/vnet/vnetd/vnetd.c Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/vnetd/vnetd.c Fri Aug 26 20:47:16 2005 @@ -112,7 +112,6 @@ #include <sys/wait.h> #include <sys/select.h> -//#include </usr/include/linux/ip.h> // For struct iphdr; #include <linux/ip.h> // For struct iphdr; #include <linux/if_ether.h> @@ -492,22 +491,16 @@ dprintf("> addr=%s protocol=%d n=%d\n", inet_ntoa(conn->addr.sin_addr), protocol, data_n); string_stream_init(io, &sdata, buf, sizeof(buf)); - dprintf("> 10\n"); err = marshal_uint16(io, VNET_FWD_ID); if(err < 0) goto exit; - dprintf("> 20\n"); err = marshal_uint16(io, 0); if(err < 0) goto exit; - dprintf("> 30\n"); err = marshal_uint16(io, protocol); if(err < 0) goto exit; - dprintf("> 40\n"); err = marshal_uint16(io, data_n); if(err < 0) goto exit; - dprintf("> 50\n"); err = marshal_bytes(io, data, data_n); if(err < 0) goto exit; - dprintf("> 60 bytes=%d\n", IOStream_get_written(io)); err = IOStream_write(conn->out, buf, IOStream_get_written(io)); IOStream_flush(conn->out); exit: @@ -978,7 +971,7 @@ int err = 0; uint32_t addr = INADDR_ANY; uint16_t port = vnetd->port; - int flags = VSOCK_BIND | VSOCK_REUSE; + int flags = (VSOCK_BIND | VSOCK_REUSE); err = create_socket(SOCK_DGRAM, addr, port, flags, val); return err; } @@ -1162,7 +1155,7 @@ err = vnetd_broadcast_conn(vnetd, &vnetd->bcast_conn); if(err < 0) goto exit; { - int flags = VSOCK_BROADCAST | VSOCK_MULTICAST; + int flags = (VSOCK_BROADCAST | VSOCK_MULTICAST); uint32_t mcaddr = vnetd->mcast_addr.sin_addr.s_addr; err = vnetd_raw_socket(IPPROTO_ETHERIP, flags, mcaddr, &vnetd->etherip_sock); diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/vnetd/vnetd.h --- a/tools/vnet/vnetd/vnetd.h Thu Aug 25 20:52:38 2005 +++ b/tools/vnet/vnetd/vnetd.h Fri Aug 26 20:47:16 2005 @@ -20,6 +20,7 @@ #include <asm/types.h> #include <linux/if_ether.h> #include "if_varp.h" +#include "varp_util.h" #include "connection.h" #include "sxpr.h" diff -r de3576a1c62c -r dfaf788ab18c tools/xenstat/libxenstat/src/xen-interface.c --- a/tools/xenstat/libxenstat/src/xen-interface.c Thu Aug 25 20:52:38 2005 +++ b/tools/xenstat/libxenstat/src/xen-interface.c Fri Aug 26 20:47:16 2005 @@ -59,14 +59,15 @@ } /* Make simple xen version hypervisor calls */ -static int xi_make_xen_version_hypercall(xi_handle *handle, long *vnum, xen_extraversion_t *ver) +static int xi_make_xen_version_hypercall(xi_handle *handle, long *vnum, + xen_extraversion_t *ver) { privcmd_hypercall_t privcmd; multicall_entry_t multicall[2]; int ret = 0; /* set up for doing hypercall */ - privcmd.op = __HYPERVISOR_multicall; + privcmd.op = __HYPERVISOR_multicall; privcmd.arg[0] = (unsigned long)multicall; privcmd.arg[1] = 2; @@ -75,7 +76,7 @@ multicall[0].args[0] = (unsigned long)XENVER_version; /* second to get xen version flag */ - multicall[1].op = __HYPERVISOR_xen_version; + multicall[1].op = __HYPERVISOR_xen_version; multicall[1].args[0] = (unsigned long)XENVER_extraversion; multicall[1].args[1] = (unsigned long)ver; @@ -104,7 +105,8 @@ } /* Make Xen Dom0 op hypervisor call */ -static int xi_make_dom0_op(xi_handle *handle, dom0_op_t *dom_op, int dom_opcode) +static int xi_make_dom0_op(xi_handle *handle, dom0_op_t *dom_op, + int dom_opcode) { privcmd_hypercall_t privcmd; int ret = 0; @@ -191,11 +193,10 @@ } /* gets xen version information from hypervisor */ -int xi_get_xen_version(xi_handle *handle, long *vnum, xen_extraversion_t *ver) -{ - - /* gets the XENVER_version and XENVER_extraversion */ - 
if (xi_make_xen_version_hypercall( handle, vnum, ver) < 0) {; +int xi_get_xen_version(xi_handle *handle, long *vnum, xen_extraversion_t *ver) +{ + /* gets the XENVER_version and XENVER_extraversion */ + if (xi_make_xen_version_hypercall( handle, vnum, ver) < 0) { perror("XEN VERSION Hypercall failed"); return -1; } diff -r de3576a1c62c -r dfaf788ab18c tools/xenstat/libxenstat/src/xenstat.c --- a/tools/xenstat/libxenstat/src/xenstat.c Thu Aug 25 20:52:38 2005 +++ b/tools/xenstat/libxenstat/src/xenstat.c Fri Aug 26 20:47:16 2005 @@ -27,23 +27,24 @@ /* * Types */ +#define SHORT_ASC_LEN 5 /* length of 65535 */ +#define VERSION_SIZE (2 * SHORT_ASC_LEN + 1 + sizeof(xen_extraversion_t) + 1) + struct xenstat_handle { xi_handle *xihandle; int page_size; FILE *procnetdev; + char xen_version[VERSION_SIZE]; /* xen version running on this node */ }; -#define SHORT_ASC_LEN 5 /* length of 65535 */ -#define VERSION_SIZE (2 * SHORT_ASC_LEN + 1 + sizeof(xen_extraversion_t) + 1) - struct xenstat_node { + xenstat_handle *handle; unsigned int flags; unsigned long long cpu_hz; unsigned int num_cpus; unsigned long long tot_mem; unsigned long long free_mem; unsigned int num_domains; - char xen_version[VERSION_SIZE]; /* xen version running on this node */ xenstat_domain *domains; /* Array of length num_domains */ }; @@ -83,8 +84,7 @@ */ /* Called to collect the information for the node and all the domains on * it. When called, the domain information has already been collected. */ -typedef int (*xenstat_collect_func)(xenstat_handle * handle, - xenstat_node * node); +typedef int (*xenstat_collect_func)(xenstat_node * node); /* Called to free the information collected by the collect function. The free * function will only be called on a xenstat_node if that node includes * information collected by the corresponding collector. 
*/ @@ -101,20 +101,23 @@ xenstat_uninit_func uninit; } xenstat_collector; -static int xenstat_collect_vcpus(xenstat_handle * handle, - xenstat_node * node); -static int xenstat_collect_networks(xenstat_handle * handle, - xenstat_node * node); +static int xenstat_collect_vcpus(xenstat_node * node); +static int xenstat_collect_networks(xenstat_node * node); +static int xenstat_collect_xen_version(xenstat_node * node); static void xenstat_free_vcpus(xenstat_node * node); static void xenstat_free_networks(xenstat_node * node); +static void xenstat_free_xen_version(xenstat_node * node); static void xenstat_uninit_vcpus(xenstat_handle * handle); static void xenstat_uninit_networks(xenstat_handle * handle); +static void xenstat_uninit_xen_version(xenstat_handle * handle); static xenstat_collector collectors[] = { { XENSTAT_VCPU, xenstat_collect_vcpus, xenstat_free_vcpus, xenstat_uninit_vcpus }, { XENSTAT_NETWORK, xenstat_collect_networks, - xenstat_free_networks, xenstat_uninit_networks } + xenstat_free_networks, xenstat_uninit_networks }, + { XENSTAT_XEN_VERSION, xenstat_collect_xen_version, + xenstat_free_xen_version, xenstat_uninit_xen_version } }; #define NUM_COLLECTORS (sizeof(collectors)/sizeof(xenstat_collector)) @@ -169,8 +172,6 @@ #define DOMAIN_CHUNK_SIZE 256 xenstat_node *node; dom0_physinfo_t physinfo; - xen_extraversion_t version; - long vnum = 0; dom0_getdomaininfo_t domaininfo[DOMAIN_CHUNK_SIZE]; unsigned int num_domains, new_domains; unsigned int i; @@ -180,19 +181,14 @@ if (node == NULL) return NULL; + /* Store the handle in the node for later access */ + node->handle = handle; + /* Get information about the physical system */ if (xi_get_physinfo(handle->xihandle, &physinfo) < 0) { free(node); return NULL; } - - /* Get the xen version number and xen version tag */ - if (xi_get_xen_version(handle->xihandle, &vnum, &version) < 0) { - free(node); - return NULL; - } - snprintf(node->xen_version, VERSION_SIZE, - "%ld.%ld%s\n", ((vnum >> 16) & 0xFFFF), vnum & 0xFFFF, (char *)version); node->cpu_hz = ((unsigned long long)physinfo.cpu_khz) * 1000ULL; node->num_cpus = @@ -259,7 +255,7 @@ for (i = 0; i < NUM_COLLECTORS; i++) { if ((flags & collectors[i].flag) == collectors[i].flag) { node->flags |= collectors[i].flag; - if(collectors[i].collect(handle, node) == 0) { + if(collectors[i].collect(node) == 0) { xenstat_free_node(node); return NULL; } @@ -306,9 +302,9 @@ return NULL; } -const char *xenstat_node_xen_ver(xenstat_node * node) -{ - return node->xen_version; +const char *xenstat_node_xen_version(xenstat_node * node) +{ + return node->handle->xen_version; } unsigned long long xenstat_node_tot_mem(xenstat_node * node) @@ -434,7 +430,7 @@ * VCPU functions */ /* Collect information about VCPUs */ -static int xenstat_collect_vcpus(xenstat_handle * handle, xenstat_node * node) +static int xenstat_collect_vcpus(xenstat_node * node) { unsigned int i, vcpu; /* Fill in VCPU information */ @@ -447,10 +443,9 @@ for (vcpu = 0; vcpu < node->domains[i].num_vcpus; vcpu++) { /* FIXME: need to be using a more efficient mechanism*/ long long vcpu_time; - vcpu_time = - xi_get_vcpu_usage(handle->xihandle, - node->domains[i].id, - vcpu); + vcpu_time = xi_get_vcpu_usage(node->handle->xihandle, + node->domains[i].id, + vcpu); if (vcpu_time < 0) return 0; node->domains[i].vcpus[vcpu].ns = vcpu_time; @@ -490,40 +485,40 @@ "bytes packets errs drop fifo colls carrier compressed\n"; /* Collect information about networks */ -static int xenstat_collect_networks(xenstat_handle * handle, - xenstat_node * node) 
+static int xenstat_collect_networks(xenstat_node * node) { /* Open and validate /proc/net/dev if we haven't already */ - if (handle->procnetdev == NULL) { + if (node->handle->procnetdev == NULL) { char header[sizeof(PROCNETDEV_HEADER)]; - handle->procnetdev = fopen("/proc/net/dev", "r"); - if (handle->procnetdev == NULL) { + node->handle->procnetdev = fopen("/proc/net/dev", "r"); + if (node->handle->procnetdev == NULL) { perror("Error opening /proc/net/dev"); - return 1; + return 0; } /* Validate the format of /proc/net/dev */ if (fread(header, sizeof(PROCNETDEV_HEADER) - 1, 1, - handle->procnetdev) != 1) { + node->handle->procnetdev) != 1) { perror("Error reading /proc/net/dev header"); - return 1; + return 0; } header[sizeof(PROCNETDEV_HEADER) - 1] = '\0'; if (strcmp(header, PROCNETDEV_HEADER) != 0) { fprintf(stderr, "Unexpected /proc/net/dev format\n"); - return 1; + return 0; } } /* Fill in networks */ /* FIXME: optimize this */ - fseek(handle->procnetdev, sizeof(PROCNETDEV_HEADER) - 1, SEEK_SET); + fseek(node->handle->procnetdev, sizeof(PROCNETDEV_HEADER) - 1, + SEEK_SET); while (1) { xenstat_domain *domain; xenstat_network net; unsigned int domid; - int ret = fscanf(handle->procnetdev, + int ret = fscanf(node->handle->procnetdev, "vif%u.%u:%llu%llu%llu%llu%*u%*u%*u%*u" "%llu%llu%llu%llu%*u%*u%*u%*u\n", &domid, &net.id, @@ -536,7 +531,7 @@ if (ret != 10) { unsigned int c; do { - c = fgetc(handle->procnetdev); + c = fgetc(node->handle->procnetdev); } while (c != '\n' && c != EOF); if (c == EOF) break; @@ -563,7 +558,7 @@ sizeof(xenstat_network)); } if (domain->networks == NULL) - return 1; + return 0; domain->networks[domain->num_networks - 1] = net; } @@ -638,3 +633,37 @@ { return network->tdrop; } + +/* + * Xen version functions + */ + +/* Collect Xen version information */ +static int xenstat_collect_xen_version(xenstat_node * node) +{ + long vnum = 0; + xen_extraversion_t version; + + /* Collect Xen version information if not already collected */ + if (node->handle->xen_version[0] == '\0') { + /* Get the Xen version number and extraversion string */ + if (xi_get_xen_version(node->handle->xihandle, + &vnum, &version) < 0) + return 0; + /* Format the version information as a string and store it */ + snprintf(node->handle->xen_version, VERSION_SIZE, "%ld.%ld%s", + ((vnum >> 16) & 0xFFFF), vnum & 0xFFFF, version); + } + + return 1; +} + +/* Free Xen version information in node - nothing to do */ +static void xenstat_free_xen_version(xenstat_node * node) +{ +} + +/* Free Xen version information in handle - nothing to do */ +static void xenstat_uninit_xen_version(xenstat_handle * handle) +{ +} diff -r de3576a1c62c -r dfaf788ab18c tools/xenstat/libxenstat/src/xenstat.h --- a/tools/xenstat/libxenstat/src/xenstat.h Thu Aug 25 20:52:38 2005 +++ b/tools/xenstat/libxenstat/src/xenstat.h Fri Aug 26 20:47:16 2005 @@ -31,10 +31,13 @@ /* Release the handle to libxc, free resources, etc. 
*/ void xenstat_uninit(xenstat_handle * handle); -/* Get all available information about a node */ +/* Flags for types of information to collect in xenstat_get_node */ #define XENSTAT_VCPU 0x1 #define XENSTAT_NETWORK 0x2 -#define XENSTAT_ALL (XENSTAT_VCPU|XENSTAT_NETWORK) +#define XENSTAT_XEN_VERSION 0x4 +#define XENSTAT_ALL (XENSTAT_VCPU|XENSTAT_NETWORK|XENSTAT_XEN_VERSION) + +/* Get all available information about a node */ xenstat_node *xenstat_get_node(xenstat_handle * handle, unsigned int flags); /* Free the information */ @@ -51,8 +54,9 @@ /* Get the domain with the given index; used to loop over all domains. */ xenstat_domain *xenstat_node_domain_by_index(xenstat_node * node, unsigned index); + /* Get xen version of the node */ -const char *xenstat_node_xen_ver(xenstat_node * node); +const char *xenstat_node_xen_version(xenstat_node * node); /* Get amount of total memory on a node */ unsigned long long xenstat_node_tot_mem(xenstat_node * node); diff -r de3576a1c62c -r dfaf788ab18c tools/xenstat/xentop/xentop.c --- a/tools/xenstat/xentop/xentop.c Thu Aug 25 20:52:38 2005 +++ b/tools/xenstat/xentop/xentop.c Fri Aug 26 20:47:16 2005 @@ -519,8 +519,8 @@ print("%4u", xenstat_domain_num_networks(domain)); } -/* Compares number of total network tx bytes of two domains, returning -1,0,1 for - * <,=,> */ +/* Compares number of total network tx bytes of two domains, returning -1,0,1 + * for <,=,> */ static int compare_net_tx(xenstat_domain *domain1, xenstat_domain *domain2) { return -compare(tot_net_bytes(domain1, FALSE), @@ -533,8 +533,8 @@ print("%8llu", tot_net_bytes(domain, FALSE)/1024); } -/* Compares number of total network rx bytes of two domains, returning -1,0,1 for - * <,=,> */ +/* Compares number of total network rx bytes of two domains, returning -1,0,1 + * for <,=,> */ static int compare_net_rx(xenstat_domain *domain1, xenstat_domain *domain2) { return -compare(tot_net_bytes(domain1, TRUE), @@ -555,7 +555,7 @@ int i = 0; xenstat_network *network; unsigned num_networks = 0; - unsigned long long total = 0; + unsigned long long total = 0; /* How many networks? 
*/ num_networks = xenstat_domain_num_networks(domain); @@ -564,12 +564,13 @@ for (i=0; i < num_networks; i++) { /* Next get the network information */ network = xenstat_domain_network(domain,i); - if (rx_flag) + if (rx_flag) total += xenstat_network_rbytes(network); - else + else total += xenstat_network_tbytes(network); } - return (total); + + return total; } /* Compares security id (ssid) of two domains, returning -1,0,1 for <,=,> */ @@ -592,6 +593,7 @@ #define TIME_STR_LEN 9 const char *TIME_STR_FORMAT = "%H:%M:%S"; char time_str[TIME_STR_LEN]; + const char *ver_str; unsigned run = 0, block = 0, pause = 0, crash = 0, dying = 0, shutdown = 0; unsigned i, num_domains = 0; @@ -602,7 +604,8 @@ strftime(time_str, TIME_STR_LEN, TIME_STR_FORMAT, localtime(&curtime.tv_sec)); num_domains = xenstat_node_num_domains(cur_node); - print("xentop - %s\n", time_str); + ver_str = xenstat_node_xen_version(cur_node); + print("xentop - %s Xen %s\n", time_str, ver_str); /* Tabulate what states domains are in for summary */ for (i=0; i < num_domains; i++) { diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/Makefile --- a/xen/arch/ia64/Makefile Thu Aug 25 20:52:38 2005 +++ b/xen/arch/ia64/Makefile Fri Aug 26 20:47:16 2005 @@ -14,8 +14,11 @@ irq_ia64.o irq_lsapic.o vhpt.o xenasm.o hyperprivop.o dom_fw.o \ grant_table.o sn_console.o +# TMP holder to contain *.0 moved out of CONFIG_VTI +OBJS += vmx_init.o + ifeq ($(CONFIG_VTI),y) -OBJS += vmx_init.o vmx_virt.o vmx_vcpu.o vmx_process.o vmx_vsa.o vmx_ivt.o \ +OBJS += vmx_virt.o vmx_vcpu.o vmx_process.o vmx_vsa.o vmx_ivt.o\ vmx_phy_mode.o vmx_utility.o vmx_interrupt.o vmx_entry.o vmmu.o \ vtlb.o mmio.o vlsapic.o vmx_hypercall.o mm.o vmx_support.o pal_emul.o endif diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/asm-offsets.c --- a/xen/arch/ia64/asm-offsets.c Thu Aug 25 20:52:38 2005 +++ b/xen/arch/ia64/asm-offsets.c Fri Aug 26 20:47:16 2005 @@ -296,4 +296,11 @@ //DEFINE(IA64_TIME_SOURCE_MMIO64, TIME_SOURCE_MMIO64); //DEFINE(IA64_TIME_SOURCE_MMIO32, TIME_SOURCE_MMIO32); //DEFINE(IA64_TIMESPEC_TV_NSEC_OFFSET, offsetof (struct timespec, tv_nsec)); + DEFINE(IA64_KR_CURRENT_OFFSET, offsetof (cpu_kr_ia64_t, _kr[IA64_KR_CURRENT])); + DEFINE(IA64_KR_PT_BASE_OFFSET, offsetof (cpu_kr_ia64_t, _kr[IA64_KR_PT_BASE])); + DEFINE(IA64_KR_IO_BASE_OFFSET, offsetof (cpu_kr_ia64_t, _kr[IA64_KR_IO_BASE])); + DEFINE(IA64_KR_PERCPU_DATA_OFFSET, offsetof (cpu_kr_ia64_t, _kr[IA64_KR_PER_CPU_DATA])); + DEFINE(IA64_KR_IO_BASE_OFFSET, offsetof (cpu_kr_ia64_t, _kr[IA64_KR_IO_BASE])); + DEFINE(IA64_KR_CURRENT_STACK_OFFSET, offsetof (cpu_kr_ia64_t, _kr[IA64_KR_CURRENT_STACK])); + } diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/domain.c --- a/xen/arch/ia64/domain.c Thu Aug 25 20:52:38 2005 +++ b/xen/arch/ia64/domain.c Fri Aug 26 20:47:16 2005 @@ -38,25 +38,17 @@ #include <asm/vcpu.h> /* for function declarations */ #include <public/arch-ia64.h> -#ifdef CONFIG_VTI #include <asm/vmx.h> #include <asm/vmx_vcpu.h> #include <asm/vmx_vpd.h> #include <asm/pal.h> #include <public/io/ioreq.h> -#endif // CONFIG_VTI #define CONFIG_DOMAIN0_CONTIGUOUS unsigned long dom0_start = -1L; -#ifdef CONFIG_VTI unsigned long dom0_size = 512*1024*1024; //FIXME: Should be configurable //FIXME: alignment should be 256MB, lest Linux use a 256MB page size unsigned long dom0_align = 256*1024*1024; -#else // CONFIG_VTI -unsigned long dom0_size = 512*1024*1024; //FIXME: Should be configurable -//FIXME: alignment should be 256MB, lest Linux use a 256MB page size -unsigned long dom0_align = 64*1024*1024; -#endif // 
CONFIG_VTI #ifdef DOMU_BUILD_STAGING unsigned long domU_staging_size = 32*1024*1024; //FIXME: Should be configurable unsigned long domU_staging_start; @@ -187,60 +179,6 @@ memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96); } -#ifdef CONFIG_VTI -void arch_do_createdomain(struct vcpu *v) -{ - struct domain *d = v->domain; - struct thread_info *ti = alloc_thread_info(v); - - /* Clear thread_info to clear some important fields, like preempt_count */ - memset(ti, 0, sizeof(struct thread_info)); - init_switch_stack(v); - - /* Shared info area is required to be allocated at domain - * creation, since control panel will write some I/O info - * between front end and back end to that area. However for - * vmx domain, our design is to let domain itself to allcoate - * shared info area, to keep machine page contiguous. So this - * page will be released later when domainN issues request - * after up. - */ - d->shared_info = (void *)alloc_xenheap_page(); - /* Now assume all vcpu info and event indicators can be - * held in one shared page. Definitely later we need to - * consider more about it - */ - - memset(d->shared_info, 0, PAGE_SIZE); - d->shared_info->vcpu_data[v->vcpu_id].arch.privregs = - alloc_xenheap_pages(get_order(sizeof(mapped_regs_t))); - printf("arch_vcpu_info=%p\n", d->shared_info->vcpu_data[0].arch.privregs); - memset(d->shared_info->vcpu_data[v->vcpu_id].arch.privregs, 0, PAGE_SIZE); - v->vcpu_info = &d->shared_info->vcpu_data[v->vcpu_id]; - /* Mask all events, and specific port will be unmasked - * when customer subscribes to it. - */ - if(v == d->vcpu[0]) { - memset(&d->shared_info->evtchn_mask[0], 0xff, - sizeof(d->shared_info->evtchn_mask)); - } - - /* Allocate per-domain vTLB and vhpt */ - v->arch.vtlb = init_domain_tlb(v); - - /* Physical->machine page table will be allocated when - * final setup, since we have no the maximum pfn number in - * this stage - */ - - /* FIXME: This is identity mapped address for xenheap. - * Do we need it at all? - */ - d->xen_vastart = XEN_START_ADDR; - d->xen_vaend = XEN_END_ADDR; - d->arch.breakimm = 0x1000; -} -#else // CONFIG_VTI void arch_do_createdomain(struct vcpu *v) { struct domain *d = v->domain; @@ -263,11 +201,26 @@ v->vcpu_info = &(d->shared_info->vcpu_data[0]); d->max_pages = (128UL*1024*1024)/PAGE_SIZE; // 128MB default // FIXME - if ((d->arch.metaphysical_rr0 = allocate_metaphysical_rr0()) == -1UL) + +#ifdef CONFIG_VTI + /* Per-domain vTLB and vhpt implementation. Now vmx domain will stick + * to this solution. Maybe it can be deferred until we know created + * one as vmx domain */ + v->arch.vtlb = init_domain_tlb(v); +#endif + + /* We may also need emulation rid for region4, though it's unlikely + * to see guest issue uncacheable access in metaphysical mode. But + * keep such info here may be more sane. 
+ */ + if (((d->arch.metaphysical_rr0 = allocate_metaphysical_rr()) == -1UL) + || ((d->arch.metaphysical_rr4 = allocate_metaphysical_rr()) == -1UL)) BUG(); VCPU(v, metaphysical_mode) = 1; v->arch.metaphysical_rr0 = d->arch.metaphysical_rr0; + v->arch.metaphysical_rr4 = d->arch.metaphysical_rr4; v->arch.metaphysical_saved_rr0 = d->arch.metaphysical_rr0; + v->arch.metaphysical_saved_rr4 = d->arch.metaphysical_rr4; #define DOMAIN_RID_BITS_DEFAULT 18 if (!allocate_rid_range(d,DOMAIN_RID_BITS_DEFAULT)) // FIXME BUG(); @@ -292,7 +245,6 @@ return -ENOMEM; } } -#endif // CONFIG_VTI void arch_getdomaininfo_ctxt(struct vcpu *v, struct vcpu_guest_context *c) { @@ -312,16 +264,28 @@ c->shared = v->domain->shared_info->arch; } -#ifndef CONFIG_VTI int arch_set_info_guest(struct vcpu *v, struct vcpu_guest_context *c) { struct pt_regs *regs = (struct pt_regs *) ((unsigned long) v + IA64_STK_OFFSET) - 1; + struct domain *d = v->domain; + int i, rc, ret; + unsigned long progress = 0; printf("arch_set_info_guest\n"); + if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) ) + return 0; + + if (c->flags & VGCF_VMX_GUEST) { + if (!vmx_enabled) { + printk("No VMX hardware feature for vmx domain.\n"); + return -EINVAL; + } + + vmx_setup_platform(v, c); + } + *regs = c->regs; - regs->cr_ipsr = IA64_PSR_IT|IA64_PSR_DT|IA64_PSR_RT|IA64_PSR_IC|IA64_PSR_I|IA64_PSR_DFH|IA64_PSR_BN|IA64_PSR_SP|IA64_PSR_DI; - regs->cr_ipsr |= 2UL << IA64_PSR_CPL0_BIT; - regs->ar_rsc |= (2 << 2); /* force PL2/3 */ + new_thread(v, regs->cr_iip, 0, 0); v->vcpu_info->arch.evtchn_vector = c->vcpu.evtchn_vector; if ( c->vcpu.privregs && copy_from_user(v->vcpu_info->arch.privregs, @@ -330,100 +294,13 @@ return -EFAULT; } - init_all_rr(v); - - // this should be in userspace - regs->r28 = dom_fw_setup(v->domain,"nomca nosmp xencons=tty0 console=tty0 root=/dev/hda1",256L); //FIXME v->arch.domain_itm_last = -1L; - VCPU(v, banknum) = 1; - VCPU(v, metaphysical_mode) = 1; - - v->domain->shared_info->arch = c->shared; + d->shared_info->arch = c->shared; + + /* Don't redo final setup */ + set_bit(_VCPUF_initialised, &v->vcpu_flags); return 0; } -#else // CONFIG_VTI -int arch_set_info_guest( - struct vcpu *v, struct vcpu_guest_context *c) -{ - struct domain *d = v->domain; - int i, rc, ret; - unsigned long progress = 0; - shared_iopage_t *sp; - - if ( test_bit(_VCPUF_initialised, &v->vcpu_flags) ) - return 0; - - /* Lazy FP not implemented yet */ - clear_bit(_VCPUF_fpu_initialised, &v->vcpu_flags); - if ( c->flags & VGCF_FPU_VALID ) - set_bit(_VCPUF_fpu_initialised, &v->vcpu_flags); - - /* Sync d/i cache conservatively, after domain N is loaded */ - ret = ia64_pal_cache_flush(3, 0, &progress, NULL); - if (ret != PAL_STATUS_SUCCESS) - panic("PAL CACHE FLUSH failed for dom[%d].\n", - v->domain->domain_id); - DPRINTK("Sync i/d cache for dom%d image SUCC\n", - v->domain->domain_id); - - /* Physical mode emulation initialization, including - * emulation ID allcation and related memory request - */ - physical_mode_init(v); - - /* FIXME: only support PMT table continuously by far */ - d->arch.pmt = __va(c->pt_base); - d->arch.max_pfn = c->pt_max_pfn; - d->arch.vmx_platform.shared_page_va = __va(c->share_io_pg); - sp = get_sp(d); - memset((char *)sp,0,PAGE_SIZE); - /* FIXME: temp due to old CP */ - sp->sp_global.eport = 2; -#ifdef V_IOSAPIC_READY - sp->vcpu_number = 1; -#endif - /* TEMP */ - d->arch.vmx_platform.pib_base = 0xfee00000UL; - - - if (c->flags & VGCF_VMX_GUEST) { - if (!vmx_enabled) - panic("No VMX hardware feature for vmx domain.\n"); - - 
vmx_final_setup_domain(d); - - /* One more step to enable interrupt assist */ - set_bit(ARCH_VMX_INTR_ASSIST, &v->arch.arch_vmx.flags); - } - - vlsapic_reset(v); - vtm_init(v); - - /* Only open one port for I/O and interrupt emulation */ - if (v == d->vcpu[0]) { - memset(&d->shared_info->evtchn_mask[0], 0xff, - sizeof(d->shared_info->evtchn_mask)); - clear_bit(iopacket_port(d), &d->shared_info->evtchn_mask[0]); - } - /* Setup domain context. Actually IA-64 is a bit different with - * x86, with almost all system resources better managed by HV - * directly. CP only needs to provide start IP of guest, which - * ideally is the load address of guest Firmware. - */ - new_thread(v, c->guest_iip, 0, 0); - - - d->xen_vastart = XEN_START_ADDR; - d->xen_vaend = XEN_END_ADDR; - d->arch.breakimm = 0x1000 + d->domain_id; - v->arch._thread.on_ustack = 0; - - /* Don't redo final setup */ - set_bit(_VCPUF_initialised, &v->vcpu_flags); - - return 0; -} -#endif // CONFIG_VTI void arch_do_boot_vcpu(struct vcpu *v) { @@ -443,7 +320,8 @@ printf("domain_relinquish_resources: not implemented\n"); } -#ifdef CONFIG_VTI +// heavily leveraged from linux/arch/ia64/kernel/process.c:copy_thread() +// and linux/arch/ia64/kernel/process.c:kernel_thread() void new_thread(struct vcpu *v, unsigned long start_pc, unsigned long start_stack, @@ -453,7 +331,6 @@ struct pt_regs *regs; struct ia64_boot_param *bp; extern char saved_command_line[]; - //char *dom0_cmdline = "BOOT_IMAGE=scsi0:\EFI\redhat\xenlinux nomca root=/dev/sdb1 ro"; #ifdef CONFIG_DOMAIN0_CONTIGUOUS @@ -471,61 +348,31 @@ regs->cr_ipsr |= 2UL << IA64_PSR_CPL0_BIT; // domain runs at PL2 } regs->cr_iip = start_pc; - regs->cr_ifs = 0; /* why? - matthewc */ + regs->cr_ifs = 1UL << 63; /* or clear? */ regs->ar_fpsr = FPSR_DEFAULT; + if (VMX_DOMAIN(v)) { +#ifdef CONFIG_VTI vmx_init_all_rr(v); - } else - init_all_rr(v); - - if (VMX_DOMAIN(v)) { - if (d == dom0) { + if (d == dom0) VMX_VPD(v,vgr[12]) = dom_fw_setup(d,saved_command_line,256L); - printk("new_thread, done with dom_fw_setup\n"); - } /* Virtual processor context setup */ VMX_VPD(v, vpsr) = IA64_PSR_BN; VPD_CR(v, dcr) = 0; +#endif } else { - regs->r28 = dom_fw_setup(d,saved_command_line,256L); + init_all_rr(v); + if (d == dom0) + regs->r28 = dom_fw_setup(d,saved_command_line,256L); + else { + regs->ar_rsc |= (2 << 2); /* force PL2/3 */ + regs->r28 = dom_fw_setup(d,"nomca nosmp xencons=tty0 console=tty0 root=/dev/hda1",256L); //FIXME + } VCPU(v, banknum) = 1; VCPU(v, metaphysical_mode) = 1; d->shared_info->arch.flags = (d == dom0) ? 
(SIF_INITDOMAIN|SIF_PRIVILEGED|SIF_BLK_BE_DOMAIN|SIF_NET_BE_DOMAIN|SIF_USB_BE_DOMAIN) : 0; } } -#else // CONFIG_VTI - -// heavily leveraged from linux/arch/ia64/kernel/process.c:copy_thread() -// and linux/arch/ia64/kernel/process.c:kernel_thread() -void new_thread(struct vcpu *v, - unsigned long start_pc, - unsigned long start_stack, - unsigned long start_info) -{ - struct domain *d = v->domain; - struct pt_regs *regs; - struct ia64_boot_param *bp; - extern char saved_command_line[]; - -#ifdef CONFIG_DOMAIN0_CONTIGUOUS - if (d == dom0) start_pc += dom0_start; -#endif - - regs = (struct pt_regs *) ((unsigned long) v + IA64_STK_OFFSET) - 1; - regs->cr_ipsr = ia64_getreg(_IA64_REG_PSR) - | IA64_PSR_BITS_TO_SET | IA64_PSR_BN - & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_RI | IA64_PSR_IS); - regs->cr_ipsr |= 2UL << IA64_PSR_CPL0_BIT; // domain runs at PL2 - regs->cr_iip = start_pc; - regs->cr_ifs = 1UL << 63; - regs->ar_fpsr = FPSR_DEFAULT; - init_all_rr(v); - regs->r28 = dom_fw_setup(d,saved_command_line,256L); //FIXME - VCPU(v, banknum) = 1; - VCPU(v, metaphysical_mode) = 1; - d->shared_info->arch.flags = (d == dom0) ? (SIF_INITDOMAIN|SIF_PRIVILEGED|SIF_BLK_BE_DOMAIN|SIF_NET_BE_DOMAIN|SIF_USB_BE_DOMAIN) : 0; -} -#endif // CONFIG_VTI static struct page * map_new_domain0_page(unsigned long mpaddr) { @@ -903,44 +750,6 @@ } #endif -#ifdef CONFIG_VTI -/* Up to whether domain is vmx one, different context may be setup - * here. - */ -void -post_arch_do_create_domain(struct vcpu *v, int vmx_domain) -{ - struct domain *d = v->domain; - - if (!vmx_domain) { - d->shared_info = (void*)alloc_xenheap_page(); - if (!d->shared_info) - panic("Allocate share info for non-vmx domain failed.\n"); - d->shared_info_va = 0xfffd000000000000; - - printk("Build shared info for non-vmx domain\n"); - build_shared_info(d); - /* Setup start info area */ - } -} - -/* For VMX domain, this is invoked when kernel model in domain - * request actively - */ -void build_shared_info(struct domain *d) -{ - int i; - - /* Set up shared-info area. */ - update_dom_time(d); - - /* Mask all upcalls... */ - for ( i = 0; i < MAX_VIRT_CPUS; i++ ) - d->shared_info->vcpu_data[i].evtchn_upcall_mask = 1; - - /* ... */ -} - /* * Domain 0 has direct access to all devices absolutely. However * the major point of this stub here, is to allow alloc_dom_mem @@ -959,182 +768,12 @@ unsigned long initrd_start, unsigned long initrd_len, char *cmdline) { - char *dst; - int i, rc; - unsigned long pfn, mfn; - unsigned long nr_pt_pages; - unsigned long count; - unsigned long alloc_start, alloc_end; - struct pfn_info *page = NULL; - start_info_t *si; - struct vcpu *v = d->vcpu[0]; - struct domain_setup_info dsi; - unsigned long p_start; - unsigned long pkern_start; - unsigned long pkern_entry; - unsigned long pkern_end; - unsigned long ret; - unsigned long progress = 0; - -//printf("construct_dom0: starting\n"); - /* Sanity! 
*/ -#ifndef CLONE_DOMAIN0 - if ( d != dom0 ) - BUG(); - if ( test_bit(_DOMF_constructed, &d->domain_flags) ) - BUG(); -#endif - - printk("##Dom0: 0x%lx, domain: 0x%lx\n", (u64)dom0, (u64)d); - memset(&dsi, 0, sizeof(struct domain_setup_info)); - - printk("*** LOADING DOMAIN 0 ***\n"); - - alloc_start = dom0_start; - alloc_end = dom0_start + dom0_size; - d->tot_pages = d->max_pages = (alloc_end - alloc_start)/PAGE_SIZE; - image_start = __va(ia64_boot_param->initrd_start); - image_len = ia64_boot_param->initrd_size; - - dsi.image_addr = (unsigned long)image_start; - dsi.image_len = image_len; - rc = parseelfimage(&dsi); - if ( rc != 0 ) - return rc; - - /* Temp workaround */ - if (running_on_sim) - dsi.xen_section_string = (char *)1; - - if ((!vmx_enabled) && !dsi.xen_section_string) { - printk("Lack of hardware support for unmodified vmx dom0\n"); - panic(""); - } - - if (vmx_enabled && !dsi.xen_section_string) { - printk("Dom0 is vmx domain!\n"); - vmx_dom0 = 1; - } - - p_start = dsi.v_start; - pkern_start = dsi.v_kernstart; - pkern_end = dsi.v_kernend; - pkern_entry = dsi.v_kernentry; - - printk("p_start=%lx, pkern_start=%lx, pkern_end=%lx, pkern_entry=%lx\n", - p_start,pkern_start,pkern_end,pkern_entry); - - if ( (p_start & (PAGE_SIZE-1)) != 0 ) - { - printk("Initial guest OS must load to a page boundary.\n"); - return -EINVAL; - } - - printk("METAPHYSICAL MEMORY ARRANGEMENT:\n" - " Kernel image: %lx->%lx\n" - " Entry address: %lx\n" - " Init. ramdisk: (NOT IMPLEMENTED YET)\n", - pkern_start, pkern_end, pkern_entry); - - if ( (pkern_end - pkern_start) > (d->max_pages * PAGE_SIZE) ) - { - printk("Initial guest OS requires too much space\n" - "(%luMB is greater than %luMB limit)\n", - (pkern_end-pkern_start)>>20, (d->max_pages<<PAGE_SHIFT)>>20); - return -ENOMEM; - } - - // Other sanity check about Dom0 image - - /* Construct a frame-allocation list for the initial domain, since these - * pages are allocated by boot allocator and pfns are not set properly - */ - for ( mfn = (alloc_start>>PAGE_SHIFT); - mfn < (alloc_end>>PAGE_SHIFT); - mfn++ ) - { - page = &frame_table[mfn]; - page_set_owner(page, d); - page->u.inuse.type_info = 0; - page->count_info = PGC_allocated | 1; - list_add_tail(&page->list, &d->page_list); - - /* Construct 1:1 mapping */ - machine_to_phys_mapping[mfn] = mfn; - } - - post_arch_do_create_domain(v, vmx_dom0); - - /* Load Dom0 image to its own memory */ - loaddomainelfimage(d,image_start); - - /* Copy the initial ramdisk. */ - - /* Sync d/i cache conservatively */ - ret = ia64_pal_cache_flush(4, 0, &progress, NULL); - if (ret != PAL_STATUS_SUCCESS) - panic("PAL CACHE FLUSH failed for dom0.\n"); - printk("Sync i/d cache for dom0 image SUCC\n"); - - /* Physical mode emulation initialization, including - * emulation ID allcation and related memory request - */ - physical_mode_init(v); - /* Dom0's pfn is equal to mfn, so there's no need to allocate pmt - * for dom0 - */ - d->arch.pmt = NULL; - - /* Give up the VGA console if DOM0 is configured to grab it. 
*/ - if (cmdline != NULL) - console_endboot(strstr(cmdline, "tty0") != NULL); - - /* VMX specific construction for Dom0, if hardware supports VMX - * and Dom0 is unmodified image - */ - printk("Dom0: 0x%lx, domain: 0x%lx\n", (u64)dom0, (u64)d); - if (vmx_dom0) - vmx_final_setup_domain(dom0); - - /* vpd is ready now */ - vlsapic_reset(v); - vtm_init(v); - - set_bit(_DOMF_constructed, &d->domain_flags); - new_thread(v, pkern_entry, 0, 0); - - physdev_init_dom0(d); - // FIXME: Hack for keyboard input -#ifdef CLONE_DOMAIN0 -if (d == dom0) -#endif - serial_input_init(); - if (d == dom0) { - VCPU(v, delivery_mask[0]) = -1L; - VCPU(v, delivery_mask[1]) = -1L; - VCPU(v, delivery_mask[2]) = -1L; - VCPU(v, delivery_mask[3]) = -1L; - } - else __set_bit(0x30,VCPU(v, delivery_mask)); - - return 0; -} - - -#else //CONFIG_VTI - -int construct_dom0(struct domain *d, - unsigned long image_start, unsigned long image_len, - unsigned long initrd_start, unsigned long initrd_len, - char *cmdline) -{ char *dst; int i, rc; unsigned long pfn, mfn; unsigned long nr_pt_pages; unsigned long count; - //l2_pgentry_t *l2tab, *l2start; - //l1_pgentry_t *l1tab = NULL, *l1start = NULL; + unsigned long alloc_start, alloc_end; struct pfn_info *page = NULL; start_info_t *si; struct vcpu *v = d->vcpu[0]; @@ -1144,6 +783,7 @@ unsigned long pkern_start; unsigned long pkern_entry; unsigned long pkern_end; + unsigned long ret, progress = 0; //printf("construct_dom0: starting\n"); /* Sanity! */ @@ -1158,7 +798,9 @@ printk("*** LOADING DOMAIN 0 ***\n"); - d->max_pages = dom0_size/PAGE_SIZE; + alloc_start = dom0_start; + alloc_end = dom0_start + dom0_size; + d->tot_pages = d->max_pages = dom0_size/PAGE_SIZE; image_start = __va(ia64_boot_param->initrd_start); image_len = ia64_boot_param->initrd_size; //printk("image_start=%lx, image_len=%lx\n",image_start,image_len); @@ -1171,6 +813,23 @@ if ( rc != 0 ) return rc; +#ifdef CONFIG_VTI + /* Temp workaround */ + if (running_on_sim) + dsi.xen_section_string = (char *)1; + + /* Check whether dom0 is vti domain */ + if ((!vmx_enabled) && !dsi.xen_section_string) { + printk("Lack of hardware support for unmodified vmx dom0\n"); + panic(""); + } + + if (vmx_enabled && !dsi.xen_section_string) { + printk("Dom0 is vmx domain!\n"); + vmx_dom0 = 1; + } +#endif + p_start = dsi.v_start; pkern_start = dsi.v_kernstart; pkern_end = dsi.v_kernend; @@ -1214,13 +873,42 @@ for ( i = 0; i < MAX_VIRT_CPUS; i++ ) d->shared_info->vcpu_data[i].evtchn_upcall_mask = 1; +#ifdef CONFIG_VTI + /* Construct a frame-allocation list for the initial domain, since these + * pages are allocated by boot allocator and pfns are not set properly + */ + for ( mfn = (alloc_start>>PAGE_SHIFT); + mfn < (alloc_end>>PAGE_SHIFT); + mfn++ ) + { + page = &frame_table[mfn]; + page_set_owner(page, d); + page->u.inuse.type_info = 0; + page->count_info = PGC_allocated | 1; + list_add_tail(&page->list, &d->page_list); + + /* Construct 1:1 mapping */ + machine_to_phys_mapping[mfn] = mfn; + } + + /* Dom0's pfn is equal to mfn, so there's no need to allocate pmt + * for dom0 + */ + d->arch.pmt = NULL; +#endif + /* Copy the OS image. */ - //(void)loadelfimage(image_start); loaddomainelfimage(d,image_start); /* Copy the initial ramdisk. 
*/ //if ( initrd_len != 0 ) // memcpy((void *)vinitrd_start, initrd_start, initrd_len); + + /* Sync d/i cache conservatively */ + ret = ia64_pal_cache_flush(4, 0, &progress, NULL); + if (ret != PAL_STATUS_SUCCESS) + panic("PAL CACHE FLUSH failed for dom0.\n"); + printk("Sync i/d cache for dom0 image SUCC\n"); #if 0 /* Set up start info area. */ @@ -1257,14 +945,21 @@ #endif /* Give up the VGA console if DOM0 is configured to grab it. */ -#ifdef IA64 if (cmdline != NULL) -#endif - console_endboot(strstr(cmdline, "tty0") != NULL); + console_endboot(strstr(cmdline, "tty0") != NULL); + + /* VMX specific construction for Dom0, if hardware supports VMX + * and Dom0 is unmodified image + */ + printk("Dom0: 0x%lx, domain: 0x%lx\n", (u64)dom0, (u64)d); + if (vmx_dom0) + vmx_final_setup_domain(dom0); set_bit(_DOMF_constructed, &d->domain_flags); new_thread(v, pkern_entry, 0, 0); + physdev_init_dom0(d); + // FIXME: Hack for keyboard input #ifdef CLONE_DOMAIN0 if (d == dom0) @@ -1280,7 +975,6 @@ return 0; } -#endif // CONFIG_VTI // FIXME: When dom0 can construct domains, this goes away (or is rewritten) int construct_domU(struct domain *d, diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/hyperprivop.S --- a/xen/arch/ia64/hyperprivop.S Thu Aug 25 20:52:38 2005 +++ b/xen/arch/ia64/hyperprivop.S Fri Aug 26 20:47:16 2005 @@ -73,7 +73,8 @@ ld4 r20=[r20] ;; cmp.eq p7,p0=r0,r20 (p7) br.cond.sptk.many 1f - mov r20=IA64_KR(CURRENT);; + movl r20=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;; + ld8 r20=[r20];; adds r21=IA64_VCPU_IRR0_OFFSET,r20; adds r22=IA64_VCPU_IRR0_OFFSET+8,r20;; ld8 r23=[r21],16; ld8 r24=[r22],16;; @@ -257,7 +258,8 @@ st8 [r21]=r20 ;; // leave cr.ifs alone for later rfi // set iip to go to domain IVA break instruction vector - mov r22=IA64_KR(CURRENT);; + movl r22=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;; + ld8 r22=[r22];; adds r22=IA64_VCPU_IVA_OFFSET,r22;; ld8 r23=[r22];; movl r24=0x3000;; @@ -306,7 +308,7 @@ mov r28=IA64_TIMER_VECTOR;; cmp.ne p6,p0=r28,r30 (p6) br.cond.spnt.few rp;; - movl r20=(PERCPU_ADDR)+IA64_CPUINFO_ITM_NEXT_OFFSET;; + movl r20=THIS_CPU(cpu_info)+IA64_CPUINFO_ITM_NEXT_OFFSET;; ld8 r26=[r20];; mov r27=ar.itc;; adds r27=200,r27;; // safety margin @@ -340,7 +342,8 @@ (p6) br.cond.spnt.few fast_tick_reflect_done;; extr.u r27=r20,0,6 // r27 has low 6 bits of itv.vector extr.u r26=r20,6,2;; // r26 has irr index of itv.vector - mov r19=IA64_KR(CURRENT);; + movl r19=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;; + ld8 r19=[r19];; adds r22=IA64_VCPU_DOMAIN_ITM_LAST_OFFSET,r19 adds r23=IA64_VCPU_DOMAIN_ITM_OFFSET,r19;; ld8 r24=[r22];; @@ -581,7 +584,8 @@ st8 [r18]=r0;; // FIXME: need to save iipa and isr to be arch-compliant // set iip to go to domain IVA break instruction vector - mov r22=IA64_KR(CURRENT);; + movl r22=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;; + ld8 r22=[r22];; adds r22=IA64_VCPU_IVA_OFFSET,r22;; ld8 r23=[r22];; add r20=r20,r23;; @@ -803,7 +807,8 @@ // r18=&vpsr.i|vpsr.ic, r21==vpsr, r22=vcr.iip // make sure none of these get trashed in case going to just_do_rfi - mov r30=IA64_KR(CURRENT);; + movl r30=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;; + ld8 r30=[r30];; adds r24=IA64_VCPU_INSVC3_OFFSET,r30;; mov r25=192 adds r16=IA64_VCPU_IRR3_OFFSET,r30;; @@ -1010,7 +1015,8 @@ ld4 r21=[r20];; cmp.eq p7,p0=r21,r0 // meta==0? 
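	// p7 is set when the metaphysical flag is already clear, i.e. the
	// vcpu is already running in virtual mode and the rr0 reload below
	// is skipped; note that "current" is now fetched through the per-CPU
	// cpu_kr shadow (movl + ld8) rather than a direct IA64_KR(CURRENT) read.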
(p7) br.spnt.many 1f ;; // already in virtual mode - mov r22=IA64_KR(CURRENT);; + movl r22=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;; + ld8 r22=[r22];; adds r22=IA64_VCPU_META_SAVED_RR0_OFFSET,r22;; ld4 r23=[r22];; mov rr[r0]=r23;; @@ -1045,7 +1051,8 @@ ld4 r21=[r20];; cmp.ne p7,p0=r21,r0 // meta==0? (p7) br.spnt.many 1f ;; // already in metaphysical mode - mov r22=IA64_KR(CURRENT);; + movl r22=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;; + ld8 r22=[r22];; adds r22=IA64_VCPU_META_RR0_OFFSET,r22;; ld4 r23=[r22];; mov rr[r0]=r23;; @@ -1137,7 +1144,8 @@ (p7) adds r20=XSI_PEND_OFS-XSI_PSR_IC_OFS,r18 ;; (p7) st4 [r20]=r0;; (p7) br.spnt.many 1f ;; - mov r30=IA64_KR(CURRENT);; + movl r30=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;; + ld8 r30=[r30];; adds r24=IA64_VCPU_INSVC3_OFFSET,r30;; mov r25=192 adds r22=IA64_VCPU_IRR3_OFFSET,r30;; @@ -1242,7 +1250,8 @@ adds r21=1,r21;; st8 [r20]=r21;; #endif - mov r22=IA64_KR(CURRENT);; + movl r22=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;; + ld8 r22=[r22];; adds r22=IA64_VCPU_INSVC3_OFFSET,r22;; ld8 r23=[r22];; cmp.eq p6,p0=r23,r0;; @@ -1305,9 +1314,10 @@ adds r21=1,r21;; st8 [r20]=r21;; #endif - movl r20=(PERCPU_ADDR)+IA64_CPUINFO_ITM_NEXT_OFFSET;; + movl r20=THIS_CPU(cpu_info)+IA64_CPUINFO_ITM_NEXT_OFFSET;; ld8 r21=[r20];; - mov r20=IA64_KR(CURRENT);; + movl r20=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;; + ld8 r20=[r20];; adds r20=IA64_VCPU_DOMAIN_ITM_OFFSET,r20;; st8 [r20]=r8;; cmp.geu p6,p0=r21,r8;; @@ -1378,7 +1388,8 @@ st8 [r20]=r21;; #endif extr.u r26=r9,8,24 // r26 = r9.rid - mov r20=IA64_KR(CURRENT);; + movl r20=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;; + ld8 r20=[r20];; adds r21=IA64_VCPU_STARTING_RID_OFFSET,r20;; ld4 r22=[r21];; adds r21=IA64_VCPU_ENDING_RID_OFFSET,r20;; @@ -1544,7 +1555,8 @@ mov ar.lc=r30 ;; mov r29=cr.ipsr mov r30=cr.iip;; - mov r27=IA64_KR(CURRENT);; + movl r27=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;; + ld8 r27=[r27];; adds r25=IA64_VCPU_DTLB_OFFSET,r27 adds r26=IA64_VCPU_ITLB_OFFSET,r27;; ld8 r24=[r25] diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/ivt.S --- a/xen/arch/ia64/ivt.S Thu Aug 25 20:52:38 2005 +++ b/xen/arch/ia64/ivt.S Fri Aug 26 20:47:16 2005 @@ -136,7 +136,11 @@ ;; rsm psr.dt // use physical addressing for data mov r31=pr // save the predicate registers +#ifdef XEN + movl r19=THIS_CPU(cpu_kr)+IA64_KR_PT_BASE_OFFSET;; +#else mov r19=IA64_KR(PT_BASE) // get page table base address +#endif shl r21=r16,3 // shift bit 60 into sign bit shr.u r17=r16,61 // get the region number into r17 ;; @@ -503,7 +507,11 @@ * Clobbered: b0, r18, r19, r21, psr.dt (cleared) */ rsm psr.dt // switch to using physical data addressing +#ifdef XEN + movl r19=THIS_CPU(cpu_kr)+IA64_KR_PT_BASE_OFFSET;; +#else mov r19=IA64_KR(PT_BASE) // get the page table base address +#endif shl r21=r16,3 // shift bit 60 into sign bit ;; shr.u r17=r16,61 // get the region number into r17 @@ -833,7 +841,9 @@ cmp4.eq p7,p0=r0,r19 (p7) br.sptk.many fast_hyperprivop ;; - mov r22=IA64_KR(CURRENT);; + movl r22=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;; + ld8 r22 = [r22] + ;; adds r22=IA64_VCPU_BREAKIMM_OFFSET,r22;; ld4 r23=[r22];; cmp4.eq p6,p7=r23,r17 // Xen-reserved breakimm? @@ -842,7 +852,8 @@ br.sptk.many fast_break_reflect ;; #endif - mov r16=IA64_KR(CURRENT) // r16 = current task; 12 cycle read lat. + movl r16=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;; + ld8 r16=[r16] mov r17=cr.iim mov r18=__IA64_BREAK_SYSCALL mov r21=ar.fpsr @@ -934,7 +945,7 @@ // FIXME: this is a hack... 
use cpuinfo.ksoftirqd because its // not used anywhere else and we need a place to stash ivr and // there's no registers available unused by SAVE_MIN/REST - movl r29=(PERCPU_ADDR)+IA64_CPUINFO_KSOFTIRQD_OFFSET;; + movl r29=THIS_CPU(cpu_info)+IA64_CPUINFO_KSOFTIRQD_OFFSET;; st8 [r29]=r30;; movl r28=slow_interrupt;; mov r29=rp;; @@ -954,7 +965,7 @@ ;; alloc r14=ar.pfs,0,0,2,0 // must be first in an insn group #ifdef XEN - movl out0=(PERCPU_ADDR)+IA64_CPUINFO_KSOFTIRQD_OFFSET;; + movl out0=THIS_CPU(cpu_info)+IA64_CPUINFO_KSOFTIRQD_OFFSET;; ld8 out0=[out0];; #else mov out0=cr.ivr // pass cr.ivr as first arg diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/linux-xen/entry.S --- a/xen/arch/ia64/linux-xen/entry.S Thu Aug 25 20:52:38 2005 +++ b/xen/arch/ia64/linux-xen/entry.S Fri Aug 26 20:47:16 2005 @@ -191,7 +191,8 @@ adds r22=IA64_TASK_THREAD_KSP_OFFSET,r13 movl r25=init_task - mov r27=IA64_KR(CURRENT_STACK) + movl r27=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_STACK_OFFSET;; + ld8 r27=[r27] adds r21=IA64_TASK_THREAD_KSP_OFFSET,in0 #ifdef XEN dep r20=0,in0,60,4 // physical address of "next" @@ -214,7 +215,8 @@ ;; (p6) srlz.d ld8 sp=[r21] // load kernel stack pointer of new task - mov IA64_KR(CURRENT)=in0 // update "current" application register + movl r8=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;; + st8 [r8]=in0 mov r8=r13 // return pointer to previously running task mov r13=in0 // set "current" pointer ;; @@ -233,7 +235,8 @@ ;; cmp.eq p7,p0=r25,r23 ;; -(p7) mov IA64_KR(CURRENT_STACK)=r26 // remember last page we mapped... +(p7) movl r8=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_STACK_OFFSET;; +(p7) st8 [r8]=r26 (p7) br.cond.sptk .done #endif rsm psr.ic // interrupts (psr.i) are already disabled here @@ -247,8 +250,8 @@ mov cr.ifa=in0 // VA of next task... ;; mov r25=IA64_TR_CURRENT_STACK - mov IA64_KR(CURRENT_STACK)=r26 // remember last page we mapped... - ;; + movl r8=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_STACK_OFFSET;; + st8 [r8]=r26 itr.d dtr[r25]=r23 // wire in new mapping... br.cond.sptk .done END(ia64_switch_to) @@ -947,7 +950,8 @@ ldf.fill f11=[r2] bsw.0 // switch back to bank 0 (no stop bit required beforehand...) 
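	// The (pUStk) load of "current" just below follows the same two-step
	// pattern used throughout this changeset: movl the address of the
	// per-CPU cpu_kr shadow plus IA64_KR_CURRENT_OFFSET, then ld8 the
	// value, replacing the old one-instruction IA64_KR(CURRENT) read.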
;; -(pUStk) mov r18=IA64_KR(CURRENT)// M2 (12 cycle read latency) +(pUStk) movl r18=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;; +(pUStk) ld8 r18=[r18] adds r16=PT(CR_IPSR)+16,r12 adds r17=PT(CR_IIP)+16,r12 diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/linux-xen/head.S --- a/xen/arch/ia64/linux-xen/head.S Thu Aug 25 20:52:38 2005 +++ b/xen/arch/ia64/linux-xen/head.S Fri Aug 26 20:47:16 2005 @@ -226,7 +226,7 @@ bsw.1 ;; #else // CONFIG_VTI - mov IA64_KR(CURRENT)=r2 // virtual address + mov IA64_KR(CURRENT)=r2 mov IA64_KR(CURRENT_STACK)=r16 #endif // CONFIG_VTI mov r13=r2 diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/linux-xen/irq_ia64.c --- a/xen/arch/ia64/linux-xen/irq_ia64.c Thu Aug 25 20:52:38 2005 +++ b/xen/arch/ia64/linux-xen/irq_ia64.c Fri Aug 26 20:47:16 2005 @@ -265,7 +265,7 @@ */ vmx_irq_exit(); if ( wake_dom0 && current != dom0 ) - domain_wake(dom0->vcpu[0]); + vcpu_wake(dom0->vcpu[0]); } #endif diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/linux-xen/setup.c --- a/xen/arch/ia64/linux-xen/setup.c Thu Aug 25 20:52:38 2005 +++ b/xen/arch/ia64/linux-xen/setup.c Fri Aug 26 20:47:16 2005 @@ -51,9 +51,7 @@ #include <asm/smp.h> #include <asm/system.h> #include <asm/unistd.h> -#ifdef CONFIG_VTI #include <asm/vmx.h> -#endif // CONFIG_VTI #include <asm/io.h> #if defined(CONFIG_SMP) && (IA64_CPU_SIZE > PAGE_SIZE) @@ -66,6 +64,7 @@ #endif DEFINE_PER_CPU(struct cpuinfo_ia64, cpu_info); +DEFINE_PER_CPU(cpu_kr_ia64_t, cpu_kr); DEFINE_PER_CPU(unsigned long, local_per_cpu_offset); DEFINE_PER_CPU(unsigned long, ia64_phys_stacked_size_p8); unsigned long ia64_cycles_per_usec; @@ -401,9 +400,9 @@ cpu_physical_id(0) = hard_smp_processor_id(); #endif -#ifdef CONFIG_VTI +#ifdef XEN identify_vmx_feature(); -#endif // CONFIG_VTI +#endif cpu_init(); /* initialize the bootstrap CPU */ @@ -599,7 +598,7 @@ c->unimpl_va_mask = ~((7L<<61) | ((1L << (impl_va_msb + 1)) - 1)); c->unimpl_pa_mask = ~((1L<<63) | ((1L << phys_addr_size) - 1)); -#ifdef CONFIG_VTI +#ifdef XEN /* If vmx feature is on, do necessary initialization for vmx */ if (vmx_enabled) vmx_init_env(); diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/linux/minstate.h --- a/xen/arch/ia64/linux/minstate.h Thu Aug 25 20:52:38 2005 +++ b/xen/arch/ia64/linux/minstate.h Fri Aug 26 20:47:16 2005 @@ -61,7 +61,9 @@ ;; #ifdef MINSTATE_VIRT -# define MINSTATE_GET_CURRENT(reg) mov reg=IA64_KR(CURRENT) +# define MINSTATE_GET_CURRENT(reg) \ + movl reg=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;;\ + ld8 reg=[reg] # define MINSTATE_START_SAVE_MIN MINSTATE_START_SAVE_MIN_VIRT # define MINSTATE_END_SAVE_MIN MINSTATE_END_SAVE_MIN_VIRT #endif @@ -170,7 +172,8 @@ ;; \ .mem.offset 0,0; st8.spill [r16]=r13,16; \ .mem.offset 8,0; st8.spill [r17]=r21,16; /* save ar.fpsr */ \ - mov r13=IA64_KR(CURRENT); /* establish `current' */ \ + movl r13=THIS_CPU(cpu_kr)+IA64_KR_CURRENT_OFFSET;; \ + ld8 r13=[r13]; /* establish 'current' */ \ ;; \ .mem.offset 0,0; st8.spill [r16]=r15,16; \ .mem.offset 8,0; st8.spill [r17]=r14,16; \ diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/regionreg.c --- a/xen/arch/ia64/regionreg.c Thu Aug 25 20:52:38 2005 +++ b/xen/arch/ia64/regionreg.c Fri Aug 26 20:47:16 2005 @@ -29,9 +29,6 @@ #define MAX_RID_BLOCKS (1 << (IA64_MAX_IMPL_RID_BITS-IA64_MIN_IMPL_RID_BITS)) #define RIDS_PER_RIDBLOCK MIN_RIDS -// This is the one global memory representation of the default Xen region reg -ia64_rr xen_rr; - #if 0 // following already defined in include/asm-ia64/gcc_intrin.h // it should probably be ifdef'd out from there to ensure all region @@ -65,7 +62,7 @@ 
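// Renamed from allocate_metaphysical_rr0(): arch_do_createdomain() now
// calls this allocator twice per domain, once for the rr0 rid and once
// for the new rr4 emulation rid.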
// returns -1 if none available -unsigned long allocate_metaphysical_rr0(void) +unsigned long allocate_metaphysical_rr(void) { ia64_rr rrv; @@ -79,17 +76,6 @@ { // fix this when the increment allocation mechanism is fixed. return 1; -} - - -void init_rr(void) -{ - xen_rr.rrval = 0; - xen_rr.ve = 0; - xen_rr.rid = allocate_reserved_rid(); - xen_rr.ps = PAGE_SHIFT; - - printf("initialized xen_rr.rid=0x%lx\n", xen_rr.rid); } /************************************* @@ -186,34 +172,6 @@ return 1; } - -// This function is purely for performance... apparently scrambling -// bits in the region id makes for better hashing, which means better -// use of the VHPT, which means better performance -// Note that the only time a RID should be mangled is when it is stored in -// a region register; anytime it is "viewable" outside of this module, -// it should be unmangled - -// NOTE: this function is also implemented in assembly code in hyper_set_rr!! -// Must ensure these two remain consistent! -static inline unsigned long -vmMangleRID(unsigned long RIDVal) -{ - union bits64 { unsigned char bytes[4]; unsigned long uint; }; - - union bits64 t; - unsigned char tmp; - - t.uint = RIDVal; - tmp = t.bytes[1]; - t.bytes[1] = t.bytes[3]; - t.bytes[3] = tmp; - - return t.uint; -} - -// since vmMangleRID is symmetric, use it for unmangling also -#define vmUnmangleRID(x) vmMangleRID(x) static inline void set_rr_no_srlz(unsigned long rr, unsigned long rrval) diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/vcpu.c --- a/xen/arch/ia64/vcpu.c Thu Aug 25 20:52:38 2005 +++ b/xen/arch/ia64/vcpu.c Fri Aug 26 20:47:16 2005 @@ -14,9 +14,7 @@ #include <asm/tlb.h> #include <asm/processor.h> #include <asm/delay.h> -#ifdef CONFIG_VTI #include <asm/vmx_vcpu.h> -#endif // CONFIG_VTI typedef union { struct ia64_psr ia64_psr; diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/vmmu.c --- a/xen/arch/ia64/vmmu.c Thu Aug 25 20:52:38 2005 +++ b/xen/arch/ia64/vmmu.c Fri Aug 26 20:47:16 2005 @@ -81,10 +81,10 @@ /* * The VRN bits of va stand for which rr to get. */ -rr_t vmmu_get_rr(VCPU *vcpu, u64 va) -{ - rr_t vrr; - vmx_vcpu_get_rr(vcpu, va, &vrr.value); +ia64_rr vmmu_get_rr(VCPU *vcpu, u64 va) +{ + ia64_rr vrr; + vmx_vcpu_get_rr(vcpu, va, &vrr.rrval); return vrr; } @@ -240,7 +240,7 @@ u64 saved_itir, saved_ifa, saved_rr; u64 pages; thash_data_t mtlb; - rr_t vrr; + ia64_rr vrr; unsigned int cl = tlb->cl; mtlb.ifa = tlb->vadr; @@ -264,7 +264,7 @@ /* Only access memory stack which is mapped by TR, * after rr is switched. */ - ia64_set_rr(mtlb.ifa, vmx_vrrtomrr(d, vrr.value)); + ia64_set_rr(mtlb.ifa, vmx_vrrtomrr(d, vrr.rrval)); ia64_srlz_d(); if ( cl == ISIDE_TLB ) { ia64_itci(mtlb.page_flags); @@ -287,12 +287,12 @@ u64 hash_addr, tag; unsigned long psr; struct vcpu *v = current; - rr_t vrr; + ia64_rr vrr; saved_pta = ia64_getreg(_IA64_REG_CR_PTA); saved_rr0 = ia64_get_rr(0); - vrr.value = saved_rr0; + vrr.rrval = saved_rr0; vrr.rid = rid; vrr.ps = ps; @@ -300,7 +300,7 @@ // TODO: Set to enforce lazy mode local_irq_save(psr); ia64_setreg(_IA64_REG_CR_PTA, pta.val); - ia64_set_rr(0, vmx_vrrtomrr(v, vrr.value)); + ia64_set_rr(0, vmx_vrrtomrr(v, vrr.rrval)); ia64_srlz_d(); hash_addr = ia64_thash(va); @@ -318,19 +318,19 @@ u64 hash_addr, tag; u64 psr; struct vcpu *v = current; - rr_t vrr; + ia64_rr vrr; // TODO: Set to enforce lazy mode saved_pta = ia64_getreg(_IA64_REG_CR_PTA); saved_rr0 = ia64_get_rr(0); - vrr.value = saved_rr0; + vrr.rrval = saved_rr0; vrr.rid = rid; vrr.ps = ps; va = (va << 3) >> 3; // set VRN to 0. 
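
The vmmu.c hunks above are part of a mechanical conversion from the VTI-private rr_t type to the common ia64_rr, with the 64-bit accessor renamed from .value to .rrval. For reference, a standalone sketch of the region register layout involved, matching the rr_t definition this changeset deletes from asm-ia64/tlb.h further down; bitfield ordering is compiler-defined, so treat this as illustrative rather than a drop-in:

    #include <stdio.h>
    #include <stdint.h>

    typedef union {
        uint64_t rrval;              /* was '.value' in the old rr_t */
        struct {
            uint64_t ve  : 1;        /* VHPT walker enable */
            uint64_t rv1 : 1;        /* reserved */
            uint64_t ps  : 6;        /* log2 page size */
            uint64_t rid : 24;       /* region id */
            uint64_t rv2 : 32;       /* reserved */
        };
    } ia64_rr;

    int main(void)
    {
        ia64_rr rr = { .rrval = 0 };
        rr.ve  = 1;                  /* enable VHPT walker */
        rr.ps  = 12;                 /* 4K pages */
        rr.rid = 0x1234;             /* example RID */
        printf("rr = %#llx\n", (unsigned long long)rr.rrval);
        return 0;
    }
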
local_irq_save(psr); ia64_setreg(_IA64_REG_CR_PTA, pta.val); - ia64_set_rr(0, vmx_vrrtomrr(v, vrr.value)); + ia64_set_rr(0, vmx_vrrtomrr(v, vrr.rrval)); ia64_srlz_d(); tag = ia64_ttag(va); @@ -354,15 +354,15 @@ { u64 saved_rr0; u64 psr; - rr_t vrr; + ia64_rr vrr; va = (va << 3) >> 3; // set VRN to 0. saved_rr0 = ia64_get_rr(0); - vrr.value = saved_rr0; + vrr.rrval = saved_rr0; vrr.rid = rid; vrr.ps = ps; local_irq_save(psr); - ia64_set_rr( 0, vmx_vrrtomrr(current,vrr.value) ); + ia64_set_rr( 0, vmx_vrrtomrr(current,vrr.rrval) ); ia64_srlz_d(); ia64_ptcl(va, ps << 2); ia64_set_rr( 0, saved_rr0 ); @@ -421,14 +421,14 @@ u64 gpip; // guest physical IP u64 mpa; thash_data_t *tlb; - rr_t vrr; + ia64_rr vrr; u64 mfn; if ( !(VMX_VPD(vcpu, vpsr) & IA64_PSR_IT) ) { // I-side physical mode gpip = gip; } else { - vmx_vcpu_get_rr(vcpu, gip, &vrr.value); + vmx_vcpu_get_rr(vcpu, gip, &vrr.rrval); tlb = vtlb_lookup_ex (vmx_vcpu_get_vtlb(vcpu), vrr.rid, gip, ISIDE_TLB ); if ( tlb == NULL ) panic("No entry found in ITLB\n"); @@ -448,7 +448,7 @@ thash_data_t data, *ovl; thash_cb_t *hcb; search_section_t sections; - rr_t vrr; + ia64_rr vrr; hcb = vmx_vcpu_get_vtlb(vcpu); data.page_flags=pte & ~PAGE_FLAGS_RV_MASK; @@ -481,7 +481,7 @@ thash_data_t data, *ovl; thash_cb_t *hcb; search_section_t sections; - rr_t vrr; + ia64_rr vrr; hcb = vmx_vcpu_get_vtlb(vcpu); data.page_flags=pte & ~PAGE_FLAGS_RV_MASK; @@ -511,7 +511,7 @@ { thash_cb_t *hcb; - rr_t vrr; + ia64_rr vrr; u64 preferred_size; vmx_vcpu_get_rr(vcpu, va, &vrr); @@ -527,7 +527,7 @@ thash_data_t data, *ovl; thash_cb_t *hcb; search_section_t sections; - rr_t vrr; + ia64_rr vrr; hcb = vmx_vcpu_get_vtlb(vcpu); data.page_flags=pte & ~PAGE_FLAGS_RV_MASK; @@ -559,7 +559,7 @@ thash_data_t data, *ovl; thash_cb_t *hcb; search_section_t sections; - rr_t vrr; + ia64_rr vrr; hcb = vmx_vcpu_get_vtlb(vcpu); diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/vmx_init.c --- a/xen/arch/ia64/vmx_init.c Thu Aug 25 20:52:38 2005 +++ b/xen/arch/ia64/vmx_init.c Fri Aug 26 20:47:16 2005 @@ -22,6 +22,9 @@ */ /* + * 05/08/16 Kun tian (Kevin Tian) <kevin.tian@xxxxxxxxx>: + * Disable doubling mapping + * * 05/03/23 Kun Tian (Kevin Tian) <kevin.tian@xxxxxxxxx>: * Simplied design in first step: * - One virtual environment @@ -39,6 +42,7 @@ #include <xen/lib.h> #include <asm/vmmu.h> #include <public/arch-ia64.h> +#include <public/io/ioreq.h> #include <asm/vmx_phy_mode.h> #include <asm/processor.h> #include <asm/vmx.h> @@ -126,8 +130,43 @@ else ASSERT(tmp_base != __vsa_base); +#ifdef XEN_DBL_MAPPING /* Init stub for rr7 switch */ vmx_init_double_mapping_stub(); +#endif +} + +void vmx_setup_platform(struct vcpu *v, struct vcpu_guest_context *c) +{ + struct domain *d = v->domain; + shared_iopage_t *sp; + + ASSERT(d != dom0); /* only for non-privileged vti domain */ + d->arch.vmx_platform.shared_page_va = __va(c->share_io_pg); + sp = get_sp(d); + memset((char *)sp,0,PAGE_SIZE); + /* FIXME: temp due to old CP */ + sp->sp_global.eport = 2; +#ifdef V_IOSAPIC_READY + sp->vcpu_number = 1; +#endif + /* TEMP */ + d->arch.vmx_platform.pib_base = 0xfee00000UL; + + /* One more step to enable interrupt assist */ + set_bit(ARCH_VMX_INTR_ASSIST, &v->arch.arch_vmx.flags); + /* Only open one port for I/O and interrupt emulation */ + if (v == d->vcpu[0]) { + memset(&d->shared_info->evtchn_mask[0], 0xff, + sizeof(d->shared_info->evtchn_mask)); + clear_bit(iopacket_port(d), &d->shared_info->evtchn_mask[0]); + } + + /* FIXME: only support PMT table continuously by far */ + d->arch.pmt = __va(c->pt_base); + 
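
vmx_setup_platform(), begun above and finished just below, wires a VTI domain to its device model: it maps the shared I/O request page, then masks every event channel and re-enables only the single port used for I/O and interrupt emulation. The same mask-all-then-unmask-one bitmap pattern as a self-contained sketch; the port number and bitmap size are illustrative, where the real code operates on the guest's shared_info evtchn_mask and iopacket_port(d):

    #include <stdio.h>
    #include <string.h>

    #define BITS_PER_LONG (8 * sizeof(unsigned long))
    #define NR_PORTS      1024

    static unsigned long evtchn_mask[NR_PORTS / BITS_PER_LONG];

    static void clear_bit_ul(unsigned n, unsigned long *bm)
    {
        bm[n / BITS_PER_LONG] &= ~(1UL << (n % BITS_PER_LONG));
    }

    int main(void)
    {
        unsigned io_port = 2;                            /* eport = 2 above */
        memset(evtchn_mask, 0xff, sizeof(evtchn_mask));  /* mask everything */
        clear_bit_ul(io_port, evtchn_mask);              /* except I/O port */
        printf("word0 = %#lx\n", evtchn_mask[0]);
        return 0;
    }
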
d->arch.max_pfn = c->pt_max_pfn; + + vmx_final_setup_domain(d); } typedef union { @@ -171,7 +210,7 @@ } - +#ifdef CONFIG_VTI /* * Create a VP on intialized VMX environment. */ @@ -190,6 +229,7 @@ panic("ia64_pal_vp_create failed. \n"); } +#ifdef XEN_DBL_MAPPING void vmx_init_double_mapping_stub(void) { u64 base, psr; @@ -206,6 +246,7 @@ ia64_srlz_i(); printk("Add TR mapping for rr7 switch stub, with physical: 0x%lx\n", (u64)(__pa(base))); } +#endif /* Other non-context related tasks can be done in context switch */ void @@ -219,12 +260,14 @@ if (status != PAL_STATUS_SUCCESS) panic("Save vp status failed\n"); +#ifdef XEN_DBL_MAPPING /* FIXME: Do we really need purge double mapping for old vcpu? * Since rid is completely different between prev and next, * it's not overlap and thus no MCA possible... */ dom_rr7 = vmx_vrrtomrr(v, VMX(v, vrr[7])); vmx_purge_double_mapping(dom_rr7, KERNEL_START, (u64)v->arch.vtlb->ts->vhpt->hash); +#endif /* Need to save KR when domain switch, though HV itself doesn;t * use them. @@ -252,12 +295,14 @@ if (status != PAL_STATUS_SUCCESS) panic("Restore vp status failed\n"); +#ifdef XEN_DBL_MAPPING dom_rr7 = vmx_vrrtomrr(v, VMX(v, vrr[7])); pte_xen = pte_val(pfn_pte((xen_pstart >> PAGE_SHIFT), PAGE_KERNEL)); pte_vhpt = pte_val(pfn_pte((__pa(v->arch.vtlb->ts->vhpt->hash) >> PAGE_SHIFT), PAGE_KERNEL)); vmx_insert_double_mapping(dom_rr7, KERNEL_START, (u64)v->arch.vtlb->ts->vhpt->hash, pte_xen, pte_vhpt); +#endif ia64_set_kr(0, v->arch.arch_vmx.vkr[0]); ia64_set_kr(1, v->arch.arch_vmx.vkr[1]); @@ -271,6 +316,7 @@ * anchored in vcpu */ } +#ifdef XEN_DBL_MAPPING /* Purge old double mapping and insert new one, due to rr7 change */ void vmx_change_double_mapping(struct vcpu *v, u64 oldrr7, u64 newrr7) @@ -287,6 +333,8 @@ vhpt_base, pte_xen, pte_vhpt); } +#endif // XEN_DBL_MAPPING +#endif // CONFIG_VTI /* * Initialize VMX envirenment for guest. 
Only the 1st vp/vcpu @@ -307,12 +355,21 @@ v->arch.arch_vmx.vpd = vpd; vpd->virt_env_vaddr = vm_buffer; +#ifdef CONFIG_VTI /* v->arch.schedule_tail = arch_vmx_do_launch; */ vmx_create_vp(v); /* Set this ed to be vmx */ set_bit(ARCH_VMX_VMCS_LOADED, &v->arch.arch_vmx.flags); + /* Physical mode emulation initialization, including + * emulation ID allcation and related memory request + */ + physical_mode_init(v); + + vlsapic_reset(v); + vtm_init(v); +#endif + /* Other vmx specific initialization work */ } - diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/vmx_phy_mode.c --- a/xen/arch/ia64/vmx_phy_mode.c Thu Aug 25 20:52:38 2005 +++ b/xen/arch/ia64/vmx_phy_mode.c Fri Aug 26 20:47:16 2005 @@ -104,22 +104,8 @@ UINT64 psr; struct domain * d = vcpu->domain; - vcpu->domain->arch.emul_phy_rr0.rid = XEN_RR7_RID+((d->domain_id)<<3); - /* FIXME */ -#if 0 - vcpu->domain->arch.emul_phy_rr0.ps = 28; /* set page size to 256M */ -#endif - vcpu->domain->arch.emul_phy_rr0.ps = EMUL_PHY_PAGE_SHIFT; /* set page size to 4k */ - vcpu->domain->arch.emul_phy_rr0.ve = 1; /* enable VHPT walker on this region */ - - vcpu->domain->arch.emul_phy_rr4.rid = XEN_RR7_RID + ((d->domain_id)<<3) + 4; - vcpu->domain->arch.emul_phy_rr4.ps = EMUL_PHY_PAGE_SHIFT; /* set page size to 4k */ - vcpu->domain->arch.emul_phy_rr4.ve = 1; /* enable VHPT walker on this region */ - vcpu->arch.old_rsc = 0; vcpu->arch.mode_flags = GUEST_IN_PHY; - - return; } extern u64 get_mfn(domid_t domid, u64 gpfn, u64 pages); @@ -246,8 +232,12 @@ vmx_load_all_rr(VCPU *vcpu) { unsigned long psr; + ia64_rr phy_rr; psr = ia64_clear_ic(); + + phy_rr.ps = EMUL_PHY_PAGE_SHIFT; + phy_rr.ve = 1; /* WARNING: not allow co-exist of both virtual mode and physical * mode in same region @@ -255,10 +245,10 @@ if (is_physical_mode(vcpu)) { if (vcpu->arch.mode_flags & GUEST_PHY_EMUL) panic("Unexpected domain switch in phy emul\n"); - ia64_set_rr((VRN0 << VRN_SHIFT), - vcpu->domain->arch.emul_phy_rr0.rrval); - ia64_set_rr((VRN4 << VRN_SHIFT), - vcpu->domain->arch.emul_phy_rr4.rrval); + phy_rr.rid = vcpu->domain->arch.metaphysical_rr0; + ia64_set_rr((VRN0 << VRN_SHIFT), phy_rr.rrval); + phy_rr.rid = vcpu->domain->arch.metaphysical_rr4; + ia64_set_rr((VRN4 << VRN_SHIFT), phy_rr.rrval); } else { ia64_set_rr((VRN0 << VRN_SHIFT), vmx_vrrtomrr(vcpu, VMX(vcpu, vrr[VRN0]))); @@ -284,13 +274,18 @@ switch_to_physical_rid(VCPU *vcpu) { UINT64 psr; + ia64_rr phy_rr; + + phy_rr.ps = EMUL_PHY_PAGE_SHIFT; + phy_rr.ve = 1; /* Save original virtual mode rr[0] and rr[4] */ - psr=ia64_clear_ic(); - ia64_set_rr(VRN0<<VRN_SHIFT, vcpu->domain->arch.emul_phy_rr0.rrval); + phy_rr.rid = vcpu->domain->arch.metaphysical_rr0; + ia64_set_rr(VRN0<<VRN_SHIFT, phy_rr.rrval); ia64_srlz_d(); - ia64_set_rr(VRN4<<VRN_SHIFT, vcpu->domain->arch.emul_phy_rr4.rrval); + phy_rr.rid = vcpu->domain->arch.metaphysical_rr4; + ia64_set_rr(VRN4<<VRN_SHIFT, phy_rr.rrval); ia64_srlz_d(); ia64_set_psr(psr); diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/vmx_vcpu.c --- a/xen/arch/ia64/vmx_vcpu.c Thu Aug 25 20:52:38 2005 +++ b/xen/arch/ia64/vmx_vcpu.c Fri Aug 26 20:47:16 2005 @@ -234,9 +234,11 @@ case VRN7: VMX(vcpu,mrr7)=vmx_vrrtomrr(vcpu,val); /* Change double mapping for this domain */ +#ifdef XEN_DBL_MAPPING vmx_change_double_mapping(vcpu, vmx_vrrtomrr(vcpu,oldrr.rrval), vmx_vrrtomrr(vcpu,newrr.rrval)); +#endif break; default: ia64_set_rr(reg,vmx_vrrtomrr(vcpu,val)); diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/vtlb.c --- a/xen/arch/ia64/vtlb.c Thu Aug 25 20:52:38 2005 +++ b/xen/arch/ia64/vtlb.c Fri Aug 26 
20:47:16 2005 @@ -283,7 +283,7 @@ thash_data_t *vhpt) { u64 pages,mfn; - rr_t vrr; + ia64_rr vrr; ASSERT ( hcb->ht == THASH_VHPT ); vrr = (hcb->get_rr_fn)(hcb->vcpu,va); @@ -361,7 +361,7 @@ { thash_data_t *hash_table, *cch; int flag; - rr_t vrr; + ia64_rr vrr; u64 gppn; u64 ppns, ppne; @@ -397,7 +397,7 @@ static void vhpt_insert(thash_cb_t *hcb, thash_data_t *entry, u64 va) { thash_data_t *hash_table, *cch; - rr_t vrr; + ia64_rr vrr; hash_table = (hcb->hash_func)(hcb->pta, va, entry->rid, entry->ps); @@ -425,7 +425,7 @@ void thash_insert(thash_cb_t *hcb, thash_data_t *entry, u64 va) { thash_data_t *hash_table; - rr_t vrr; + ia64_rr vrr; vrr = (hcb->get_rr_fn)(hcb->vcpu,entry->vadr); if ( entry->ps != vrr.ps && entry->tc ) { @@ -556,7 +556,7 @@ thash_data_t *hash_table; thash_internal_t *priv = &hcb->priv; u64 tag; - rr_t vrr; + ia64_rr vrr; priv->_curva = va & ~(size-1); priv->_eva = priv->_curva + size; @@ -580,7 +580,7 @@ thash_data_t *hash_table; thash_internal_t *priv = &hcb->priv; u64 tag; - rr_t vrr; + ia64_rr vrr; priv->_curva = va & ~(size-1); priv->_eva = priv->_curva + size; @@ -633,7 +633,7 @@ thash_data_t *ovl; thash_internal_t *priv = &hcb->priv; u64 addr,rr_psize; - rr_t vrr; + ia64_rr vrr; if ( priv->s_sect.tr ) { ovl = vtr_find_next_overlap (hcb); @@ -665,7 +665,7 @@ thash_data_t *ovl; thash_internal_t *priv = &hcb->priv; u64 addr,rr_psize; - rr_t vrr; + ia64_rr vrr; vrr = (hcb->get_rr_fn)(hcb->vcpu,priv->_curva); rr_psize = PSIZE(vrr.ps); @@ -800,7 +800,7 @@ { thash_data_t *hash_table, *cch; u64 tag; - rr_t vrr; + ia64_rr vrr; ASSERT ( hcb->ht == THASH_VTLB ); diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/xenirq.c --- a/xen/arch/ia64/xenirq.c Thu Aug 25 20:52:38 2005 +++ b/xen/arch/ia64/xenirq.c Fri Aug 26 20:47:16 2005 @@ -50,7 +50,7 @@ #endif //FIXME: TEMPORARY HACK!!!! vcpu_pend_interrupt(dom0->vcpu[0],vector); - domain_wake(dom0->vcpu[0]); + vcpu_wake(dom0->vcpu[0]); return(1); } return(0); diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/xenmem.c --- a/xen/arch/ia64/xenmem.c Thu Aug 25 20:52:38 2005 +++ b/xen/arch/ia64/xenmem.c Fri Aug 26 20:47:16 2005 @@ -30,8 +30,8 @@ */ #ifdef CONFIG_VTI unsigned long *mpt_table; -unsigned long *mpt_table_size; -#endif +unsigned long mpt_table_size; +#endif // CONFIG_VTI void paging_init (void) @@ -53,21 +53,6 @@ printk("machine to physical table: 0x%lx\n", (u64)mpt_table); memset(mpt_table, INVALID_M2P_ENTRY, mpt_table_size); - - /* Any more setup here? On VMX enabled platform, - * there's no need to keep guest linear pg table, - * and read only mpt table. MAP cache is not used - * in this stage, and later it will be in region 5. - * IO remap is in region 6 with identity mapping. 
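
Two details in the hunks above deserve a note. In vmx_phy_mode.c the cached per-domain emul_phy_rr0/emul_phy_rr4 values are gone; the physical-mode region register is now assembled on the fly from the domain's metaphysical_rr0/metaphysical_rr4 RIDs plus fixed ps/ve bits. And in xenmem.c, mpt_table_size changes from unsigned long * to unsigned long: it is a byte count used as the length argument to memset(), so the pointer type was simply wrong. A sketch of the on-the-fly construction, with illustrative values; note the sketch zeroes the union first, whereas the patch leaves phy_rr's reserved bits as whatever was on the stack, which seems worth a second look:

    #include <stdio.h>
    #include <stdint.h>

    typedef union {
        uint64_t rrval;
        struct {
            uint64_t ve  : 1;   /* VHPT walker enable */
            uint64_t rv1 : 1;
            uint64_t ps  : 6;   /* log2 page size */
            uint64_t rid : 24;
            uint64_t rv2 : 32;
        };
    } ia64_rr;

    #define EMUL_PHY_PAGE_SHIFT 12   /* 4K, as in the hunk above */

    /* Build the rr value used while the guest runs in emulated
     * physical mode; only the RID varies per domain and region. */
    static uint64_t make_metaphysical_rr(unsigned rid)
    {
        ia64_rr rr = { .rrval = 0 };
        rr.ps  = EMUL_PHY_PAGE_SHIFT;
        rr.ve  = 1;
        rr.rid = rid;
        return rr.rrval;
    }

    int main(void)
    {
        printf("phy rr0 = %#llx\n",
               (unsigned long long)make_metaphysical_rr(0x100));
        return 0;
    }
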
- */ - /* HV_tlb_init(); */ - -#else // CONFIG_VTI - - /* Allocate and map the machine-to-phys table */ - if ((pg = alloc_domheap_pages(NULL, 10, 0)) == NULL) - panic("Not enough memory to bootstrap Xen.\n"); - memset(page_to_virt(pg), 0x55, 16UL << 20); #endif // CONFIG_VTI /* Other mapping setup */ diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/xenmisc.c --- a/xen/arch/ia64/xenmisc.c Thu Aug 25 20:52:38 2005 +++ b/xen/arch/ia64/xenmisc.c Fri Aug 26 20:47:16 2005 @@ -58,9 +58,7 @@ /* calls in xen/common code that are unused on ia64 */ -void sync_lazy_execstate_cpu(unsigned int cpu) {} -void sync_lazy_execstate_mask(cpumask_t mask) {} -void sync_lazy_execstate_all(void) {} +void sync_vcpu_execstate(struct vcpu *v) {} #ifdef CONFIG_VTI int grant_table_create(struct domain *d) { return 0; } @@ -340,7 +338,8 @@ loop: printf("$$$$$ PANIC in domain %d (k6=%p): ", - v->domain->domain_id, ia64_get_kr(IA64_KR_CURRENT)); + v->domain->domain_id, + __get_cpu_var(cpu_kr)._kr[IA64_KR_CURRENT]); va_start(args, fmt); (void)vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/xensetup.c --- a/xen/arch/ia64/xensetup.c Thu Aug 25 20:52:38 2005 +++ b/xen/arch/ia64/xensetup.c Fri Aug 26 20:47:16 2005 @@ -183,11 +183,6 @@ printk("xen image pstart: 0x%lx, xenheap pend: 0x%lx\n", xen_pstart, xenheap_phys_end); -#ifdef CONFIG_VTI - /* If we want to enable vhpt for all regions, related initialization - * for HV TLB must be done earlier before first TLB miss - */ -#endif // CONFIG_VTI /* Find next hole */ firsthole_start = 0; efi_memmap_walk(xen_find_first_hole, &firsthole_start); @@ -267,6 +262,14 @@ do_initcalls(); printk("About to call sort_main_extable()\n"); sort_main_extable(); + + /* surrender usage of kernel registers to domain, use percpu area instead */ + __get_cpu_var(cpu_kr)._kr[IA64_KR_IO_BASE] = ia64_get_kr(IA64_KR_IO_BASE); + __get_cpu_var(cpu_kr)._kr[IA64_KR_PER_CPU_DATA] = ia64_get_kr(IA64_KR_PER_CPU_DATA); + __get_cpu_var(cpu_kr)._kr[IA64_KR_CURRENT_STACK] = ia64_get_kr(IA64_KR_CURRENT_STACK); + __get_cpu_var(cpu_kr)._kr[IA64_KR_FPU_OWNER] = ia64_get_kr(IA64_KR_FPU_OWNER); + __get_cpu_var(cpu_kr)._kr[IA64_KR_CURRENT] = ia64_get_kr(IA64_KR_CURRENT); + __get_cpu_var(cpu_kr)._kr[IA64_KR_PT_BASE] = ia64_get_kr(IA64_KR_PT_BASE); /* Create initial domain 0. */ printk("About to call do_createdomain()\n"); diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/xentime.c --- a/xen/arch/ia64/xentime.c Thu Aug 25 20:52:38 2005 +++ b/xen/arch/ia64/xentime.c Fri Aug 26 20:47:16 2005 @@ -162,14 +162,14 @@ if (domain0_ready && vcpu_timer_expired(dom0->vcpu[0])) { vcpu_pend_timer(dom0->vcpu[0]); //vcpu_set_next_timer(dom0->vcpu[0]); - domain_wake(dom0->vcpu[0]); + vcpu_wake(dom0->vcpu[0]); } if (!is_idle_task(current->domain) && current->domain != dom0) { if (vcpu_timer_expired(current)) { vcpu_pend_timer(current); // ensure another timer interrupt happens even if domain doesn't vcpu_set_next_timer(current); - domain_wake(current); + vcpu_wake(current); } } raise_actimer_softirq(); diff -r de3576a1c62c -r dfaf788ab18c xen/arch/x86/audit.c --- a/xen/arch/x86/audit.c Thu Aug 25 20:52:38 2005 +++ b/xen/arch/x86/audit.c Fri Aug 26 20:47:16 2005 @@ -735,7 +735,6 @@ if ( d != current->domain ) domain_pause(d); - sync_lazy_execstate_all(); // Maybe we should just be using BIGLOCK? 
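
The xensetup.c hunk above snapshots six ar.k* kernel registers into the per-CPU cpu_kr area before dom0 is created; from then on Xen reads the memory copy (as the xenmisc.c panic path above now does) and the hardware registers can be surrendered to the domain. The six assignments cannot trivially be folded into a loop because ia64_get_kr() appears to expand to register-specific inline assembly that wants a compile-time register number, which would explain the unrolled form. The handoff idea itself, as a standalone sketch with the hardware access stubbed as an array:

    #include <stdio.h>
    #include <stdint.h>

    enum { KR_IO_BASE, KR_PER_CPU_DATA, KR_CURRENT_STACK,
           KR_FPU_OWNER, KR_CURRENT, KR_PT_BASE, NR_KR };

    /* Stand-ins: hw[] models the ar.k* registers, cpu_kr[] the
     * per-CPU memory copy Xen switches to. */
    static uint64_t hw[NR_KR] = { 0x10, 0x20, 0x30, 0x40, 0x50, 0x60 };
    static uint64_t cpu_kr[NR_KR];

    int main(void)
    {
        int i;
        for (i = 0; i < NR_KR; i++)
            cpu_kr[i] = hw[i];        /* snapshot before surrendering */
        /* From here on the hypervisor reads the copy... */
        printf("current = %#llx\n",
               (unsigned long long)cpu_kr[KR_CURRENT]);
        /* ...and the guest may repurpose the real ar.k* registers. */
        return 0;
    }
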
// diff -r de3576a1c62c -r dfaf788ab18c xen/arch/x86/domain.c --- a/xen/arch/x86/domain.c Thu Aug 25 20:52:38 2005 +++ b/xen/arch/x86/domain.c Fri Aug 26 20:47:16 2005 @@ -885,27 +885,22 @@ return switch_required; } -void sync_lazy_execstate_cpu(unsigned int cpu) -{ +void sync_vcpu_execstate(struct vcpu *v) +{ + unsigned int cpu = v->processor; + + if ( !cpu_isset(cpu, v->domain->cpumask) ) + return; + if ( cpu == smp_processor_id() ) + { (void)__sync_lazy_execstate(); + } else + { + /* Other cpus call __sync_lazy_execstate from flush ipi handler. */ flush_tlb_mask(cpumask_of_cpu(cpu)); -} - -void sync_lazy_execstate_mask(cpumask_t mask) -{ - if ( cpu_isset(smp_processor_id(), mask) ) - (void)__sync_lazy_execstate(); - /* Other cpus call __sync_lazy_execstate from flush ipi handler. */ - flush_tlb_mask(mask); -} - -void sync_lazy_execstate_all(void) -{ - __sync_lazy_execstate(); - /* Other cpus call __sync_lazy_execstate from flush ipi handler. */ - flush_tlb_mask(cpu_online_map); + } } unsigned long __hypercall_create_continuation( diff -r de3576a1c62c -r dfaf788ab18c xen/arch/x86/x86_32/asm-offsets.c --- a/xen/arch/x86/x86_32/asm-offsets.c Thu Aug 25 20:52:38 2005 +++ b/xen/arch/x86/x86_32/asm-offsets.c Fri Aug 26 20:47:16 2005 @@ -71,6 +71,9 @@ OFFSET(VCPUINFO_upcall_mask, vcpu_info_t, evtchn_upcall_mask); BLANK(); + DEFINE(CPUINFO_sizeof, sizeof(struct cpu_info)); + BLANK(); + OFFSET(TRAPBOUNCE_error_code, struct trap_bounce, error_code); OFFSET(TRAPBOUNCE_cr2, struct trap_bounce, cr2); OFFSET(TRAPBOUNCE_flags, struct trap_bounce, flags); diff -r de3576a1c62c -r dfaf788ab18c xen/arch/x86/x86_32/entry.S --- a/xen/arch/x86/x86_32/entry.S Thu Aug 25 20:52:38 2005 +++ b/xen/arch/x86/x86_32/entry.S Fri Aug 26 20:47:16 2005 @@ -60,6 +60,11 @@ #include <asm/apicdef.h> #include <asm/page.h> #include <public/xen.h> + +#define GET_GUEST_REGS(reg) \ + movl $~(STACK_SIZE-1),reg; \ + andl %esp,reg; \ + orl $(STACK_SIZE-CPUINFO_sizeof),reg; #define GET_CURRENT(reg) \ movl $STACK_SIZE-4, reg; \ @@ -279,7 +284,41 @@ GET_CURRENT(%ebx) andl $(NR_hypercalls-1),%eax PERFC_INCR(PERFC_hypercalls, %eax) +#ifndef NDEBUG + /* Deliberately corrupt parameter regs not used by this hypercall. */ + pushl %eax + pushl UREGS_eip+4(%esp) + pushl 28(%esp) # EBP + pushl 28(%esp) # EDI + pushl 28(%esp) # ESI + pushl 28(%esp) # EDX + pushl 28(%esp) # ECX + pushl 28(%esp) # EBX + movzb hypercall_args_table(,%eax,1),%ecx + leal (%esp,%ecx,4),%edi + subl $6,%ecx + negl %ecx + movl %eax,%esi + movl $0xDEADBEEF,%eax + rep stosl + movl %esi,%eax +#endif call *hypercall_table(,%eax,4) +#ifndef NDEBUG + /* Deliberately corrupt parameter regs used by this hypercall. 
*/ + addl $24,%esp # Shadow parameters + popl %ecx # Shadow EIP + cmpl %ecx,UREGS_eip(%esp) + popl %ecx # Shadow hypercall index + jne skip_clobber # If EIP has changed then don't clobber + movzb hypercall_args_table(,%ecx,1),%ecx + movl %esp,%edi + movl %eax,%esi + movl $0xDEADBEEF,%eax + rep stosl + movl %esi,%eax +skip_clobber: +#endif movl %eax,UREGS_eax(%esp) # save the return value test_all_events: @@ -680,12 +719,14 @@ do_arch_sched_op: # Ensure we return success even if we return via schedule_tail() xorl %eax,%eax - movl %eax,UREGS_eax+4(%esp) + GET_GUEST_REGS(%ecx) + movl %eax,UREGS_eax(%ecx) jmp do_sched_op do_switch_vm86: - # Discard the return address - addl $4,%esp + # Reset the stack pointer + GET_GUEST_REGS(%ecx) + movl %ecx,%esp # GS:ESI == Ring-1 stack activation movl UREGS_esp(%esp),%esi @@ -774,3 +815,36 @@ .rept NR_hypercalls-((.-hypercall_table)/4) .long do_ni_hypercall .endr + +ENTRY(hypercall_args_table) + .byte 1 /* do_set_trap_table */ /* 0 */ + .byte 4 /* do_mmu_update */ + .byte 2 /* do_set_gdt */ + .byte 2 /* do_stack_switch */ + .byte 4 /* do_set_callbacks */ + .byte 1 /* do_fpu_taskswitch */ /* 5 */ + .byte 2 /* do_arch_sched_op */ + .byte 1 /* do_dom0_op */ + .byte 2 /* do_set_debugreg */ + .byte 1 /* do_get_debugreg */ + .byte 4 /* do_update_descriptor */ /* 10 */ + .byte 0 /* do_ni_hypercall */ + .byte 5 /* do_dom_mem_op */ + .byte 2 /* do_multicall */ + .byte 4 /* do_update_va_mapping */ + .byte 2 /* do_set_timer_op */ /* 15 */ + .byte 1 /* do_event_channel_op */ + .byte 1 /* do_xen_version */ + .byte 3 /* do_console_io */ + .byte 1 /* do_physdev_op */ + .byte 3 /* do_grant_table_op */ /* 20 */ + .byte 2 /* do_vm_assist */ + .byte 5 /* do_update_va_mapping_otherdomain */ + .byte 0 /* do_switch_vm86 */ + .byte 2 /* do_boot_vcpu */ + .byte 0 /* do_ni_hypercall */ /* 25 */ + .byte 4 /* do_mmuext_op */ + .byte 1 /* do_acm_op */ + .rept NR_hypercalls-(.-hypercall_args_table) + .byte 0 /* do_ni_hypercall */ + .endr diff -r de3576a1c62c -r dfaf788ab18c xen/arch/x86/x86_64/asm-offsets.c --- a/xen/arch/x86/x86_64/asm-offsets.c Thu Aug 25 20:52:38 2005 +++ b/xen/arch/x86/x86_64/asm-offsets.c Fri Aug 26 20:47:16 2005 @@ -71,6 +71,9 @@ OFFSET(VCPUINFO_upcall_mask, vcpu_info_t, evtchn_upcall_mask); BLANK(); + DEFINE(CPUINFO_sizeof, sizeof(struct cpu_info)); + BLANK(); + OFFSET(TRAPBOUNCE_error_code, struct trap_bounce, error_code); OFFSET(TRAPBOUNCE_cr2, struct trap_bounce, cr2); OFFSET(TRAPBOUNCE_flags, struct trap_bounce, flags); diff -r de3576a1c62c -r dfaf788ab18c xen/arch/x86/x86_64/entry.S --- a/xen/arch/x86/x86_64/entry.S Thu Aug 25 20:52:38 2005 +++ b/xen/arch/x86/x86_64/entry.S Fri Aug 26 20:47:16 2005 @@ -11,6 +11,11 @@ #include <asm/apicdef.h> #include <asm/page.h> #include <public/xen.h> + +#define GET_GUEST_REGS(reg) \ + movq $~(STACK_SIZE-1),reg; \ + andq %rsp,reg; \ + orq $(STACK_SIZE-CPUINFO_sizeof),reg; #define GET_CURRENT(reg) \ movq $STACK_SIZE-8, reg; \ @@ -120,10 +125,42 @@ /*hypercall:*/ movq %r10,%rcx andq $(NR_hypercalls-1),%rax +#ifndef NDEBUG + /* Deliberately corrupt parameter regs not used by this hypercall. 
*/ + pushq %rdi; pushq %rsi; pushq %rdx; pushq %rcx; pushq %r8 ; pushq %r9 + leaq hypercall_args_table(%rip),%r10 + movq $6,%rcx + sub (%r10,%rax,1),%cl + movq %rsp,%rdi + movl $0xDEADBEEF,%eax + rep stosq + popq %r9 ; popq %r8 ; popq %rcx; popq %rdx; popq %rsi; popq %rdi + movq UREGS_rax(%rsp),%rax + andq $(NR_hypercalls-1),%rax + pushq %rax + pushq UREGS_rip+8(%rsp) +#endif leaq hypercall_table(%rip),%r10 PERFC_INCR(PERFC_hypercalls, %rax) callq *(%r10,%rax,8) - movq %rax,UREGS_rax(%rsp) # save the return value +#ifndef NDEBUG + /* Deliberately corrupt parameter regs used by this hypercall. */ + popq %r10 # Shadow RIP + cmpq %r10,UREGS_rip(%rsp) + popq %rcx # Shadow hypercall index + jne skip_clobber /* If RIP has changed then don't clobber. */ + leaq hypercall_args_table(%rip),%r10 + movb (%r10,%rcx,1),%cl + movl $0xDEADBEEF,%r10d + cmpb $1,%cl; jb skip_clobber; movq %r10,UREGS_rdi(%rsp) + cmpb $2,%cl; jb skip_clobber; movq %r10,UREGS_rsi(%rsp) + cmpb $3,%cl; jb skip_clobber; movq %r10,UREGS_rdx(%rsp) + cmpb $4,%cl; jb skip_clobber; movq %r10,UREGS_r10(%rsp) + cmpb $5,%cl; jb skip_clobber; movq %r10,UREGS_r8(%rsp) + cmpb $6,%cl; jb skip_clobber; movq %r10,UREGS_r9(%rsp) +skip_clobber: +#endif + movq %rax,UREGS_rax(%rsp) # save the return value /* %rbx: struct vcpu */ test_all_events: @@ -538,7 +575,8 @@ do_arch_sched_op: # Ensure we return success even if we return via schedule_tail() xorl %eax,%eax - movq %rax,UREGS_rax+8(%rsp) + GET_GUEST_REGS(%r10) + movq %rax,UREGS_rax(%r10) jmp do_sched_op .data @@ -597,3 +635,36 @@ .rept NR_hypercalls-((.-hypercall_table)/4) .quad do_ni_hypercall .endr + +ENTRY(hypercall_args_table) + .byte 1 /* do_set_trap_table */ /* 0 */ + .byte 4 /* do_mmu_update */ + .byte 2 /* do_set_gdt */ + .byte 2 /* do_stack_switch */ + .byte 3 /* do_set_callbacks */ + .byte 1 /* do_fpu_taskswitch */ /* 5 */ + .byte 2 /* do_arch_sched_op */ + .byte 1 /* do_dom0_op */ + .byte 2 /* do_set_debugreg */ + .byte 1 /* do_get_debugreg */ + .byte 2 /* do_update_descriptor */ /* 10 */ + .byte 0 /* do_ni_hypercall */ + .byte 5 /* do_dom_mem_op */ + .byte 2 /* do_multicall */ + .byte 3 /* do_update_va_mapping */ + .byte 1 /* do_set_timer_op */ /* 15 */ + .byte 1 /* do_event_channel_op */ + .byte 1 /* do_xen_version */ + .byte 3 /* do_console_io */ + .byte 1 /* do_physdev_op */ + .byte 3 /* do_grant_table_op */ /* 20 */ + .byte 2 /* do_vm_assist */ + .byte 4 /* do_update_va_mapping_otherdomain */ + .byte 0 /* do_switch_to_user */ + .byte 2 /* do_boot_vcpu */ + .byte 2 /* do_set_segment_base */ /* 25 */ + .byte 4 /* do_mmuext_op */ + .byte 1 /* do_acm_op */ + .rept NR_hypercalls-(.-hypercall_args_table) + .byte 0 /* do_ni_hypercall */ + .endr diff -r de3576a1c62c -r dfaf788ab18c xen/common/domain.c --- a/xen/common/domain.c Thu Aug 25 20:52:38 2005 +++ b/xen/common/domain.c Fri Aug 26 20:47:16 2005 @@ -152,10 +152,7 @@ /* Make sure that every vcpu is descheduled before we finalise. */ for_each_vcpu ( d, v ) - while ( test_bit(_VCPUF_running, &v->vcpu_flags) ) - cpu_relax(); - - sync_lazy_execstate_mask(d->cpumask); + vcpu_sleep_sync(v); BUG_ON(!cpus_empty(d->cpumask)); sync_pagetable_state(d); @@ -209,7 +206,7 @@ /* Put every vcpu to sleep, but don't wait (avoids inter-vcpu deadlock). 
*/ for_each_vcpu ( d, v ) - domain_sleep_nosync(v); + vcpu_sleep_nosync(v); } @@ -226,7 +223,7 @@ for_each_vcpu ( d, v ) { set_bit(_VCPUF_ctrl_pause, &v->vcpu_flags); - domain_sleep_nosync(v); + vcpu_sleep_nosync(v); } send_guest_virq(dom0->vcpu[0], VIRQ_DEBUGGER); @@ -275,7 +272,7 @@ { BUG_ON(v == current); atomic_inc(&v->pausecnt); - domain_sleep_sync(v); + vcpu_sleep_sync(v); } void domain_pause(struct domain *d) @@ -286,7 +283,7 @@ { BUG_ON(v == current); atomic_inc(&v->pausecnt); - domain_sleep_sync(v); + vcpu_sleep_sync(v); } } @@ -294,7 +291,7 @@ { BUG_ON(v == current); if ( atomic_dec_and_test(&v->pausecnt) ) - domain_wake(v); + vcpu_wake(v); } void domain_unpause(struct domain *d) @@ -313,7 +310,7 @@ { BUG_ON(v == current); if ( !test_and_set_bit(_VCPUF_ctrl_pause, &v->vcpu_flags) ) - domain_sleep_sync(v); + vcpu_sleep_sync(v); } } @@ -324,7 +321,7 @@ for_each_vcpu ( d, v ) { if ( test_and_clear_bit(_VCPUF_ctrl_pause, &v->vcpu_flags) ) - domain_wake(v); + vcpu_wake(v); } } @@ -413,7 +410,7 @@ /* domain_unpause_by_systemcontroller */ if ( test_and_clear_bit(_VCPUF_ctrl_pause, &v->vcpu_flags) ) - domain_wake(v); + vcpu_wake(v); xfree(c); return 0; diff -r de3576a1c62c -r dfaf788ab18c xen/common/multicall.c --- a/xen/common/multicall.c Thu Aug 25 20:52:38 2005 +++ b/xen/common/multicall.c Fri Aug 26 20:47:16 2005 @@ -45,6 +45,18 @@ do_multicall_call(&mcs->call); +#ifndef NDEBUG + { + /* + * Deliberately corrupt the contents of the multicall structure. + * The caller must depend only on the 'result' field on return. + */ + multicall_entry_t corrupt; + memset(&corrupt, 0xAA, sizeof(corrupt)); + (void)__copy_to_user(&call_list[i], &corrupt, sizeof(corrupt)); + } +#endif + if ( unlikely(__put_user(mcs->call.result, &call_list[i].result)) ) { DPRINTK("Error writing result back to multicall block.\n"); diff -r de3576a1c62c -r dfaf788ab18c xen/common/schedule.c --- a/xen/common/schedule.c Thu Aug 25 20:52:38 2005 +++ b/xen/common/schedule.c Fri Aug 26 20:47:16 2005 @@ -193,7 +193,7 @@ TRACE_2D(TRC_SCHED_DOM_REM, v->domain->domain_id, v->vcpu_id); } -void domain_sleep_nosync(struct vcpu *v) +void vcpu_sleep_nosync(struct vcpu *v) { unsigned long flags; @@ -205,18 +205,23 @@ TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id); } -void domain_sleep_sync(struct vcpu *v) -{ - domain_sleep_nosync(v); - - while ( test_bit(_VCPUF_running, &v->vcpu_flags) && !domain_runnable(v) ) +void vcpu_sleep_sync(struct vcpu *v) +{ + vcpu_sleep_nosync(v); + + /* + * We can be sure that the VCPU is finally descheduled after the running + * flag is cleared and the scheduler lock is released. 
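
The loop guarded by this comment continues in the hunk just below: vcpu_sleep_sync() spins only while all three conditions hold, the running flag is set, the vcpu has not become runnable again, and the per-CPU scheduler lock is still held. Per the new comment, descheduling is complete once the flag is cleared and the lock released; sync_vcpu_execstate() then forces any lazily-retained state out, which on x86 (domain.c hunk above) is a local __sync_lazy_execstate() or a flush IPI to the vcpu's CPU.

The multicall.c hunk above applies the same debug-build tactic as the 0xDEADBEEF register clobbering added to both entry.S files: deliberately destroy state a guest must not rely on, so buggy guests fail loudly rather than by luck. Each multicall entry is overwritten with 0xAA after execution and only the result field is rewritten. A guest-eye illustration of the resulting contract; the structure layout here is a simplified stand-in, not the real public header:

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    typedef struct {                 /* simplified stand-in layout */
        uint64_t op, args[7];
        uint64_t result;
    } multicall_entry_t;

    int main(void)
    {
        multicall_entry_t e = { .op = 1 };
        uint64_t r = 42;                 /* hypervisor runs the call... */
        memset(&e, 0xAA, sizeof(e));     /* ...then corrupts the entry */
        e.result = r;                    /* only 'result' is rewritten */
        /* Correct guests read e.result and nothing else. */
        printf("result = %llu\n", (unsigned long long)e.result);
        return 0;
    }
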
+ */ + while ( test_bit(_VCPUF_running, &v->vcpu_flags) + && !domain_runnable(v) + && spin_is_locked(&schedule_data[v->processor].schedule_lock) ) cpu_relax(); - if ( cpu_isset(v->processor, v->domain->cpumask) ) - sync_lazy_execstate_cpu(v->processor); -} - -void domain_wake(struct vcpu *v) + sync_vcpu_execstate(v); +} + +void vcpu_wake(struct vcpu *v) { unsigned long flags; @@ -293,7 +298,7 @@ return -ESRCH; clear_bit(_VCPUF_down, &target->vcpu_flags); /* wake vcpu */ - domain_wake(target); + vcpu_wake(target); return 0; } @@ -457,10 +462,10 @@ } } } - } while (!succ); - //spin_lock_irq(&schedule_data[d->vcpu[0]->processor].schedule_lock); + } while ( !succ ); + SCHED_OP(adjdom, d, cmd); - //spin_unlock_irq(&schedule_data[d->vcpu[0]->processor].schedule_lock); + for (cpu = 0; cpu < NR_CPUS; cpu++) if (__get_cpu_bit(cpu, have_lock)) spin_unlock(&schedule_data[cpu].schedule_lock); @@ -520,7 +525,8 @@ perfc_incrc(sched_ctx); #if defined(WAKE_HISTO) - if ( !is_idle_task(next->domain) && next->wokenup ) { + if ( !is_idle_task(next->domain) && next->wokenup ) + { ulong diff = (ulong)(now - next->wokenup); diff /= (ulong)MILLISECS(1); if (diff <= BUCKETS-2) schedule_data[cpu].hist[diff]++; diff -r de3576a1c62c -r dfaf788ab18c xen/include/asm-ia64/domain.h --- a/xen/include/asm-ia64/domain.h Thu Aug 25 20:52:38 2005 +++ b/xen/include/asm-ia64/domain.h Fri Aug 26 20:47:16 2005 @@ -3,39 +3,28 @@ #include <linux/thread_info.h> #include <asm/tlb.h> -#ifdef CONFIG_VTI #include <asm/vmx_vpd.h> #include <asm/vmmu.h> #include <asm/regionreg.h> #include <public/arch-ia64.h> #include <asm/vmx_platform.h> -#endif // CONFIG_VTI #include <xen/list.h> extern void arch_do_createdomain(struct vcpu *); extern void domain_relinquish_resources(struct domain *); -#ifdef CONFIG_VTI -struct trap_bounce { - // TO add, FIXME Eddie -}; - -#define PMT_SIZE (32L*1024*1024) // 32M for PMT -#endif // CONFIG_VTI - struct arch_domain { struct mm_struct *active_mm; struct mm_struct *mm; int metaphysical_rr0; + int metaphysical_rr4; int starting_rid; /* first RID assigned to domain */ int ending_rid; /* one beyond highest RID assigned to domain */ int rid_bits; /* number of virtual rid bits (default: 18) */ int breakimm; -#ifdef CONFIG_VTI + int imp_va_msb; - ia64_rr emul_phy_rr0; - ia64_rr emul_phy_rr4; unsigned long *pmt; /* physical to machine table */ /* * max_pfn is the maximum page frame in guest physical space, including @@ -44,7 +33,7 @@ */ unsigned long max_pfn; struct virutal_platform_def vmx_platform; -#endif //CONFIG_VTI + u64 xen_vastart; u64 xen_vaend; u64 shared_info_va; @@ -78,15 +67,15 @@ #endif void *regs; /* temporary until find a better way to do privops */ int metaphysical_rr0; // from arch_domain (so is pinned) + int metaphysical_rr4; // from arch_domain (so is pinned) int metaphysical_saved_rr0; // from arch_domain (so is pinned) + int metaphysical_saved_rr4; // from arch_domain (so is pinned) int breakimm; // from arch_domain (so is pinned) int starting_rid; /* first RID assigned to domain */ int ending_rid; /* one beyond highest RID assigned to domain */ struct mm_struct *active_mm; struct thread_struct _thread; // this must be last -#ifdef CONFIG_VTI - void (*schedule_tail) (struct vcpu *); - struct trap_bounce trap_bounce; + thash_cb_t *vtlb; char irq_new_pending; char irq_new_condition; // vpsr.i/vtpr change, check for pending VHPI @@ -94,9 +83,7 @@ //for phycial emulation unsigned long old_rsc; int mode_flags; - struct arch_vmx_struct arch_vmx; /* Virtual Machine Extensions */ -#endif // CONFIG_VTI 
}; #define active_mm arch.active_mm diff -r de3576a1c62c -r dfaf788ab18c xen/include/asm-ia64/linux-xen/asm/pal.h --- a/xen/include/asm-ia64/linux-xen/asm/pal.h Thu Aug 25 20:52:38 2005 +++ b/xen/include/asm-ia64/linux-xen/asm/pal.h Fri Aug 26 20:47:16 2005 @@ -1559,9 +1559,7 @@ return iprv.status; } -#ifdef CONFIG_VTI #include <asm/vmx_pal.h> -#endif // CONFIG_VTI #endif /* __ASSEMBLY__ */ #endif /* _ASM_IA64_PAL_H */ diff -r de3576a1c62c -r dfaf788ab18c xen/include/asm-ia64/linux-xen/asm/processor.h --- a/xen/include/asm-ia64/linux-xen/asm/processor.h Thu Aug 25 20:52:38 2005 +++ b/xen/include/asm-ia64/linux-xen/asm/processor.h Fri Aug 26 20:47:16 2005 @@ -183,6 +183,22 @@ DECLARE_PER_CPU(struct cpuinfo_ia64, cpu_info); +typedef union { + struct { + __u64 kr0; + __u64 kr1; + __u64 kr2; + __u64 kr3; + __u64 kr4; + __u64 kr5; + __u64 kr6; + __u64 kr7; + }; + __u64 _kr[8]; +} cpu_kr_ia64_t; + +DECLARE_PER_CPU(cpu_kr_ia64_t, cpu_kr); + /* * The "local" data variable. It refers to the per-CPU data of the currently executing * CPU, much like "current" points to the per-task data of the currently executing task. diff -r de3576a1c62c -r dfaf788ab18c xen/include/asm-ia64/mmu_context.h --- a/xen/include/asm-ia64/mmu_context.h Thu Aug 25 20:52:38 2005 +++ b/xen/include/asm-ia64/mmu_context.h Fri Aug 26 20:47:16 2005 @@ -2,11 +2,7 @@ #define __ASM_MMU_CONTEXT_H //dummy file to resolve non-arch-indep include #ifdef XEN -#ifndef CONFIG_VTI #define IA64_REGION_ID_KERNEL 0 -#else // CONFIG_VTI -#define IA64_REGION_ID_KERNEL 0x1e0000 /* Start from all 1 in highest 4 bits */ -#endif // CONFIG_VTI #define ia64_rid(ctx,addr) (((ctx) << 3) | (addr >> 61)) #ifndef __ASSEMBLY__ diff -r de3576a1c62c -r dfaf788ab18c xen/include/asm-ia64/privop.h --- a/xen/include/asm-ia64/privop.h Thu Aug 25 20:52:38 2005 +++ b/xen/include/asm-ia64/privop.h Fri Aug 26 20:47:16 2005 @@ -133,7 +133,6 @@ struct { unsigned long qp:6, r1:7, un7:7, r3:7, x6:6, x3:3, un1:1, major:4; }; } INST64_M46; -#ifdef CONFIG_VTI typedef union U_INST64_M47 { IA64_INST inst; struct { unsigned long qp:6, un14:14, r3:7, x6:6, x3:3, un1:1, major:4; }; @@ -168,8 +167,6 @@ IA64_INST inst; struct { unsigned long qp:6, f1:7, un7:7, r3:7, x:1, hint:2, x6:6, m:1, major:4; }; } INST64_M6; - -#endif // CONFIG_VTI typedef union U_INST64 { IA64_INST inst; @@ -182,14 +179,12 @@ INST64_I26 I26; // mov register to ar (I unit) INST64_I27 I27; // mov immediate to ar (I unit) INST64_I28 I28; // mov from ar (I unit) -#ifdef CONFIG_VTI - INST64_M1 M1; // ld integer + INST64_M1 M1; // ld integer INST64_M2 M2; INST64_M3 M3; - INST64_M4 M4; // st integer + INST64_M4 M4; // st integer INST64_M5 M5; - INST64_M6 M6; // ldfd floating pointer -#endif // CONFIG_VTI + INST64_M6 M6; // ldfd floating pointer INST64_M28 M28; // purge translation cache entry INST64_M29 M29; // mov register to ar (M unit) INST64_M30 M30; // mov immediate to ar (M unit) @@ -204,9 +199,7 @@ INST64_M44 M44; // set/reset system mask INST64_M45 M45; // translation purge INST64_M46 M46; // translation access (tpa,tak) -#ifdef CONFIG_VTI INST64_M47 M47; // purge translation entry -#endif // CONFIG_VTI } INST64; #define MASK_41 ((UINT64)0x1ffffffffff) diff -r de3576a1c62c -r dfaf788ab18c xen/include/asm-ia64/regionreg.h --- a/xen/include/asm-ia64/regionreg.h Thu Aug 25 20:52:38 2005 +++ b/xen/include/asm-ia64/regionreg.h Fri Aug 26 20:47:16 2005 @@ -1,12 +1,6 @@ #ifndef _REGIONREG_H_ #define _REGIONREG_H_ -#ifdef CONFIG_VTI -#define XEN_DEFAULT_RID 0xf00000 -#define DOMAIN_RID_SHIFT 20 -#define 
DOMAIN_RID_MASK (~(1U<<DOMAIN_RID_SHIFT -1)) -#else //CONFIG_VTI #define XEN_DEFAULT_RID 7 -#endif // CONFIG_VTI #define IA64_MIN_IMPL_RID_MSB 17 #define _REGION_ID(x) ({ia64_rr _v; _v.rrval = (long) (x); _v.rid;}) #define _REGION_PAGE_SIZE(x) ({ia64_rr _v; _v.rrval = (long) (x); _v.ps;}) @@ -42,4 +36,32 @@ int set_one_rr(unsigned long rr, unsigned long val); +// This function is purely for performance... apparently scrambling +// bits in the region id makes for better hashing, which means better +// use of the VHPT, which means better performance +// Note that the only time a RID should be mangled is when it is stored in +// a region register; anytime it is "viewable" outside of this module, +// it should be unmangled + +// NOTE: this function is also implemented in assembly code in hyper_set_rr!! +// Must ensure these two remain consistent! +static inline unsigned long +vmMangleRID(unsigned long RIDVal) +{ + union bits64 { unsigned char bytes[4]; unsigned long uint; }; + + union bits64 t; + unsigned char tmp; + + t.uint = RIDVal; + tmp = t.bytes[1]; + t.bytes[1] = t.bytes[3]; + t.bytes[3] = tmp; + + return t.uint; +} + +// since vmMangleRID is symmetric, use it for unmangling also +#define vmUnmangleRID(x) vmMangleRID(x) + #endif /* !_REGIONREG_H_ */ diff -r de3576a1c62c -r dfaf788ab18c xen/include/asm-ia64/serial.h --- a/xen/include/asm-ia64/serial.h Thu Aug 25 20:52:38 2005 +++ b/xen/include/asm-ia64/serial.h Fri Aug 26 20:47:16 2005 @@ -1,20 +1,0 @@ -#ifndef __ASM_SERIAL_H__ -#define __ASM_SERIAL_H__ - -#include <asm/regs.h> -#include <asm/irq.h> -#include <xen/serial.h> -#include <asm/hpsim_ssc.h> - -#ifndef CONFIG_VTI -#define arch_serial_putc(_uart, _c) \ - ( platform_is_hp_ski() ? (ia64_ssc(c,0,0,0,SSC_PUTCHAR), 1) : \ - ( longs_peak_putc(c), 1 )) -#else -#define arch_serial_putc(_uart, _c) \ - ( platform_is_hp_ski() ? (ia64_ssc(c,0,0,0,SSC_PUTCHAR), 1) : \ - ( (inb((_uart)->io_base + LSR) & LSR_THRE) ? 
\ - (outb((_c), (_uart)->io_base + THR), 1) : 0 )) -#endif - -#endif /* __ASM_SERIAL_H__ */ diff -r de3576a1c62c -r dfaf788ab18c xen/include/asm-ia64/tlb.h --- a/xen/include/asm-ia64/tlb.h Thu Aug 25 20:52:38 2005 +++ b/xen/include/asm-ia64/tlb.h Fri Aug 26 20:47:16 2005 @@ -35,17 +35,4 @@ unsigned long rid; } TR_ENTRY; -#ifdef CONFIG_VTI -typedef union { - unsigned long value; - struct { - unsigned long ve : 1; - unsigned long rv1 : 1; - unsigned long ps : 6; - unsigned long rid : 24; - unsigned long rv2 : 32; - }; -} rr_t; -#endif // CONFIG_VTI - #endif diff -r de3576a1c62c -r dfaf788ab18c xen/include/asm-ia64/vmmu.h --- a/xen/include/asm-ia64/vmmu.h Thu Aug 25 20:52:38 2005 +++ b/xen/include/asm-ia64/vmmu.h Fri Aug 26 20:47:16 2005 @@ -23,10 +23,11 @@ #ifndef XEN_TLBthash_H #define XEN_TLBthash_H -#include "xen/config.h" -#include "xen/types.h" -#include "public/xen.h" -#include "asm/tlb.h" +#include <xen/config.h> +#include <xen/types.h> +#include <public/xen.h> +#include <asm/tlb.h> +#include <asm/regionreg.h> //#define THASH_TLB_TR 0 //#define THASH_TLB_TC 1 @@ -152,7 +153,7 @@ typedef u64 *(GET_MFN_FN)(domid_t d, u64 gpfn, u64 pages); typedef void *(REM_NOTIFIER_FN)(struct hash_cb *hcb, thash_data_t *entry); typedef void (RECYCLE_FN)(struct hash_cb *hc, u64 para); -typedef rr_t (GET_RR_FN)(struct vcpu *vcpu, u64 reg); +typedef ia64_rr (GET_RR_FN)(struct vcpu *vcpu, u64 reg); typedef thash_data_t *(FIND_OVERLAP_FN)(struct thash_cb *hcb, u64 va, u64 ps, int rid, char cl, search_section_t s_sect); typedef thash_data_t *(FIND_NEXT_OVL_FN)(struct thash_cb *hcb); @@ -329,7 +330,7 @@ extern u64 machine_thash(PTA pta, u64 va, u64 rid, u64 ps); extern void purge_machine_tc_by_domid(domid_t domid); extern void machine_tlb_insert(struct vcpu *d, thash_data_t *tlb); -extern rr_t vmmu_get_rr(struct vcpu *vcpu, u64 va); +extern ia64_rr vmmu_get_rr(struct vcpu *vcpu, u64 va); extern thash_cb_t *init_domain_tlb(struct vcpu *d); #define VTLB_DEBUG diff -r de3576a1c62c -r dfaf788ab18c xen/include/asm-ia64/vmx.h --- a/xen/include/asm-ia64/vmx.h Thu Aug 25 20:52:38 2005 +++ b/xen/include/asm-ia64/vmx.h Fri Aug 26 20:47:16 2005 @@ -32,10 +32,12 @@ extern void vmx_init_double_mapping_stub(void); extern void vmx_save_state(struct vcpu *v); extern void vmx_load_state(struct vcpu *v); +extern void vmx_setup_platform(struct vcpu *v, struct vcpu_guest_context *c); +#ifdef XEN_DBL_MAPPING extern vmx_insert_double_mapping(u64,u64,u64,u64,u64); extern void vmx_purge_double_mapping(u64, u64, u64); extern void vmx_change_double_mapping(struct vcpu *v, u64 oldrr7, u64 newrr7); - +#endif extern void vmx_wait_io(void); extern void vmx_io_assist(struct vcpu *v); diff -r de3576a1c62c -r dfaf788ab18c xen/include/asm-ia64/vmx_vcpu.h --- a/xen/include/asm-ia64/vmx_vcpu.h Thu Aug 25 20:52:38 2005 +++ b/xen/include/asm-ia64/vmx_vcpu.h Fri Aug 26 20:47:16 2005 @@ -308,7 +308,9 @@ vtm=&(vcpu->arch.arch_vmx.vtm); VPD_CR(vcpu,itm)=val; +#ifdef CONFIG_VTI vtm_interruption_update(vcpu, vtm); +#endif return IA64_NO_FAULT; } static inline @@ -414,7 +416,9 @@ IA64FAULT vmx_vcpu_set_eoi(VCPU *vcpu, u64 val) { +#ifdef CONFIG_VTI guest_write_eoi(vcpu); +#endif return IA64_NO_FAULT; } @@ -424,7 +428,9 @@ { VPD_CR(vcpu,itv)=val; +#ifdef CONFIG_VTI vtm_set_itv(vcpu); +#endif return IA64_NO_FAULT; } static inline @@ -465,13 +471,17 @@ static inline IA64FAULT vmx_vcpu_set_itc(VCPU *vcpu, UINT64 val) { +#ifdef CONFIG_VTI vtm_set_itc(vcpu, val); +#endif return IA64_NO_FAULT; } static inline IA64FAULT vmx_vcpu_get_itc(VCPU *vcpu,UINT64 *val) { 
+#ifdef CONFIG_VTI *val = vtm_get_itc(vcpu); +#endif return IA64_NO_FAULT; } static inline @@ -584,15 +594,22 @@ return (IA64_NO_FAULT); } +/* Another hash performance algorithm */ #define redistribute_rid(rid) (((rid) & ~0xffff) | (((rid) << 8) & 0xff00) | (((rid) >> 8) & 0xff)) static inline unsigned long -vmx_vrrtomrr(VCPU *vcpu,unsigned long val) +vmx_vrrtomrr(VCPU *v, unsigned long val) { ia64_rr rr; u64 rid; + rr.rrval=val; + rr.rid = vmMangleRID(v->arch.starting_rid + rr.rid); +/* Disable this rid allocation algorithm for now */ +#if 0 rid=(((u64)vcpu->domain->domain_id)<<DOMAIN_RID_SHIFT) + rr.rid; rr.rid = redistribute_rid(rid); +#endif + rr.ve=1; return rr.rrval; } diff -r de3576a1c62c -r dfaf788ab18c xen/include/asm-ia64/vmx_vpd.h --- a/xen/include/asm-ia64/vmx_vpd.h Thu Aug 25 20:52:38 2005 +++ b/xen/include/asm-ia64/vmx_vpd.h Fri Aug 26 20:47:16 2005 @@ -61,12 +61,6 @@ unsigned long lrr1; unsigned long rsv6[46]; } cr_t; - -void vmx_enter_scheduler(void); - -//FIXME: Map for LID to vcpu, Eddie -#define MAX_NUM_LPS (1UL<<16) -extern struct vcpu *lid_edt[MAX_NUM_LPS]; struct arch_vmx_struct { // struct virutal_platform_def vmx_platform; diff -r de3576a1c62c -r dfaf788ab18c xen/include/asm-ia64/xenprocessor.h --- a/xen/include/asm-ia64/xenprocessor.h Thu Aug 25 20:52:38 2005 +++ b/xen/include/asm-ia64/xenprocessor.h Fri Aug 26 20:47:16 2005 @@ -50,16 +50,11 @@ __u64 ri : 2; __u64 ed : 1; __u64 bn : 1; -#ifdef CONFIG_VTI __u64 ia : 1; __u64 vm : 1; __u64 reserved5 : 17; -#else // CONFIG_VTI - __u64 reserved4 : 19; -#endif // CONFIG_VTI }; -#ifdef CONFIG_VTI /* vmx like above but expressed as bitfields for more efficient access: */ typedef union{ __u64 val; @@ -218,6 +213,4 @@ ret; \ }) -#endif // CONFIG_VTI - #endif // _ASM_IA64_XENPROCESSOR_H diff -r de3576a1c62c -r dfaf788ab18c xen/include/asm-x86/mm.h --- a/xen/include/asm-x86/mm.h Thu Aug 25 20:52:38 2005 +++ b/xen/include/asm-x86/mm.h Fri Aug 26 20:47:16 2005 @@ -370,6 +370,8 @@ void propagate_page_fault(unsigned long addr, u16 error_code); +extern int __sync_lazy_execstate(void); + /* * Caller must own d's BIGLOCK, is responsible for flushing the TLB, and must * hold a reference to the page. diff -r de3576a1c62c -r dfaf788ab18c xen/include/xen/sched.h --- a/xen/include/xen/sched.h Thu Aug 25 20:52:38 2005 +++ b/xen/include/xen/sched.h Fri Aug 26 20:47:16 2005 @@ -245,18 +245,16 @@ long sched_ctl(struct sched_ctl_cmd *); long sched_adjdom(struct sched_adjdom_cmd *); int sched_id(); -void domain_wake(struct vcpu *d); -void domain_sleep_nosync(struct vcpu *d); -void domain_sleep_sync(struct vcpu *d); - -/* - * Force loading of currently-executing domain state on the specified set - * of CPUs. This is used to counteract lazy state switching where required. - */ -extern void sync_lazy_execstate_cpu(unsigned int cpu); -extern void sync_lazy_execstate_mask(cpumask_t mask); -extern void sync_lazy_execstate_all(void); -extern int __sync_lazy_execstate(void); +void vcpu_wake(struct vcpu *d); +void vcpu_sleep_nosync(struct vcpu *d); +void vcpu_sleep_sync(struct vcpu *d); + +/* + * Force synchronisation of given VCPU's state. If it is currently descheduled, + * this call will ensure that all its state is committed to memory and that + * no CPU is using critical state (e.g., page tables) belonging to the VCPU. + */ +extern void sync_vcpu_execstate(struct vcpu *v); /* * Called by the scheduler to switch to another VCPU. 
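
The rewritten vmx_vrrtomrr() above now derives the machine RID by offsetting the guest's virtual RID into the domain's allocated range and scrambling it with vmMangleRID(), the byte swap moved into regionreg.h earlier in this changeset; as the comment there says, scrambled RIDs hash better in the VHPT. Because the swap exchanges bytes 1 and 3, applying it twice is the identity, which is why unmangling reuses the same function. (Note the disabled #if 0 branch still refers to the old parameter name vcpu and to DOMAIN_RID_SHIFT, which this changeset removes, so it would need touching up before being re-enabled.) A standalone check of both properties on a little-endian machine, with an illustrative starting_rid:

    #include <stdio.h>

    /* Byte-swap RID scrambler, as in regionreg.h above. */
    static unsigned long vmMangleRID(unsigned long v)
    {
        union { unsigned char bytes[4]; unsigned long uint; } t;
        unsigned char tmp;
        t.uint = v;
        tmp = t.bytes[1];
        t.bytes[1] = t.bytes[3];
        t.bytes[3] = tmp;
        return t.uint;
    }

    int main(void)
    {
        unsigned long starting_rid = 0x1000;   /* illustrative */
        unsigned long vrid = 0x42;             /* guest's view */
        unsigned long mrid = vmMangleRID(starting_rid + vrid);
        printf("mangled   = %#lx\n", mrid);
        printf("roundtrip = %#lx\n", vmMangleRID(mrid));  /* == 0x1042 */
        return 0;
    }
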
On entry, although @@ -268,7 +266,7 @@ * The callee must ensure that the local CPU is no longer running in @prev's * context, and that the context is saved to memory, before returning. * Alternatively, if implementing lazy context switching, it suffices to ensure - * that invoking __sync_lazy_execstate() will switch and commit @prev's state. + * that invoking sync_vcpu_execstate() will switch and commit @prev's state. */ extern void context_switch( struct vcpu *prev, @@ -287,7 +285,8 @@ extern void continue_running( struct vcpu *same); -int idle_cpu(int cpu); /* Is CPU 'cpu' idle right now? */ +/* Is CPU 'cpu' idle right now? */ +int idle_cpu(int cpu); void startup_cpu_idle_loop(void); @@ -410,7 +409,7 @@ static inline void vcpu_unblock(struct vcpu *v) { if ( test_and_clear_bit(_VCPUF_blocked, &v->vcpu_flags) ) - domain_wake(v); + vcpu_wake(v); } #define IS_PRIV(_d) \ diff -r de3576a1c62c -r dfaf788ab18c extras/mini-os/include/xmalloc.h --- /dev/null Thu Aug 25 20:52:38 2005 +++ b/extras/mini-os/include/xmalloc.h Fri Aug 26 20:47:16 2005 @@ -0,0 +1,23 @@ +#ifndef __XMALLOC_H__ +#define __XMALLOC_H__ + +/* Allocate space for typed object. */ +#define xmalloc(_type) ((_type *)_xmalloc(sizeof(_type), __alignof__(_type))) + +/* Allocate space for array of typed objects. */ +#define xmalloc_array(_type, _num) ((_type *)_xmalloc_array(sizeof(_type), __alignof__(_type), _num)) + +/* Free any of the above. */ +extern void xfree(const void *); + +/* Underlying functions */ +extern void *_xmalloc(size_t size, size_t align); +static inline void *_xmalloc_array(size_t size, size_t align, size_t num) +{ + /* Check for overflow. */ + if (size && num > UINT_MAX / size) + return NULL; + return _xmalloc(size * num, align); +} + +#endif /* __XMALLOC_H__ */ diff -r de3576a1c62c -r dfaf788ab18c extras/mini-os/lib/xmalloc.c --- /dev/null Thu Aug 25 20:52:38 2005 +++ b/extras/mini-os/lib/xmalloc.c Fri Aug 26 20:47:16 2005 @@ -0,0 +1,219 @@ +/* + **************************************************************************** + * (C) 2005 - Grzegorz Milos - Intel Research Cambridge + **************************************************************************** + * + * File: xmaloc.c + * Author: Grzegorz Milos (gm281@xxxxxxxxx) + * Changes: + * + * Date: Aug 2005 + * + * Environment: Xen Minimal OS + * Description: simple memory allocator + * + **************************************************************************** + * Simple allocator for Mini-os. If larger than a page, simply use the + * page-order allocator. + * + * Copy of the allocator for Xen by Rusty Russell: + * Copyright (C) 2005 Rusty Russell IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
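
The new xmalloc.h above guards xmalloc_array() against multiplication overflow: if num > UINT_MAX / size, the product size * num would exceed the allocator's limit (and wrap a 32-bit size_t), so _xmalloc() could silently hand back a too-small block. The leading "size &&" term avoids a division by zero for zero-sized types. A standalone demonstration of the check catching a wrap that a naive multiply would miss; the allocation itself is stubbed out:

    #include <stdio.h>
    #include <stddef.h>
    #include <limits.h>

    /* The guard from xmalloc.h above, with allocation stubbed. */
    static void *checked_array_alloc(size_t size, size_t num)
    {
        if (size && num > UINT_MAX / size)
            return NULL;                 /* product would overflow */
        return (void *)1;                /* stand-in for _xmalloc() */
    }

    int main(void)
    {
        /* 2^31 eight-byte elements: size*num blows past UINT_MAX. */
        size_t num = (size_t)1 << 31;
        printf("guarded: %p\n", checked_array_alloc(8, num));   /* NULL */
        printf("sane:    %p\n", checked_array_alloc(8, 16));
        return 0;
    }
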
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <os.h> +#include <mm.h> +#include <types.h> +#include <lib.h> +#include <list.h> + +static LIST_HEAD(freelist); +/* static spinlock_t freelist_lock = SPIN_LOCK_UNLOCKED; */ + +struct xmalloc_hdr +{ + /* Total including this hdr. */ + size_t size; + struct list_head freelist; +} __cacheline_aligned; + +static void maybe_split(struct xmalloc_hdr *hdr, size_t size, size_t block) +{ + struct xmalloc_hdr *extra; + size_t leftover = block - size; + + /* If enough is left to make a block, put it on free list. */ + if ( leftover >= (2 * sizeof(struct xmalloc_hdr)) ) + { + extra = (struct xmalloc_hdr *)((unsigned long)hdr + size); + extra->size = leftover; + list_add(&extra->freelist, &freelist); + } + else + { + size = block; + } + + hdr->size = size; + /* Debugging aid. */ + hdr->freelist.next = hdr->freelist.prev = NULL; +} + +static void *xmalloc_new_page(size_t size) +{ + struct xmalloc_hdr *hdr; + /* unsigned long flags; */ + + hdr = (struct xmalloc_hdr *)alloc_page(); + if ( hdr == NULL ) + return NULL; + + /* spin_lock_irqsave(&freelist_lock, flags); */ + maybe_split(hdr, size, PAGE_SIZE); + /* spin_unlock_irqrestore(&freelist_lock, flags); */ + + return hdr+1; +} + +/* Big object? Just use the page allocator. */ +static void *xmalloc_whole_pages(size_t size) +{ + struct xmalloc_hdr *hdr; + unsigned int pageorder = get_order(size); + + hdr = (struct xmalloc_hdr *)alloc_pages(pageorder); + if ( hdr == NULL ) + return NULL; + + hdr->size = (1 << (pageorder + PAGE_SHIFT)); + /* Debugging aid. */ + hdr->freelist.next = hdr->freelist.prev = NULL; + + return hdr+1; +} + +/* Return size, increased to alignment with align. */ +static inline size_t align_up(size_t size, size_t align) +{ + return (size + align - 1) & ~(align - 1); +} + +void *_xmalloc(size_t size, size_t align) +{ + struct xmalloc_hdr *i; + /* unsigned long flags; */ + + /* Add room for header, pad to align next header. */ + size += sizeof(struct xmalloc_hdr); + size = align_up(size, __alignof__(struct xmalloc_hdr)); + + /* For big allocs, give them whole pages. */ + if ( size >= PAGE_SIZE ) + return xmalloc_whole_pages(size); + + /* Search free list. */ + /* spin_lock_irqsave(&freelist_lock, flags); */ + list_for_each_entry( i, &freelist, freelist ) + { + if ( i->size < size ) + continue; + list_del(&i->freelist); + maybe_split(i, size, i->size); + /* spin_unlock_irqrestore(&freelist_lock, flags); */ + return i+1; + } + /* spin_unlock_irqrestore(&freelist_lock, flags); */ + + /* Alloc a new page and return from that. */ + return xmalloc_new_page(size); +} + +void xfree(const void *p) +{ + /* unsigned long flags; */ + struct xmalloc_hdr *i, *tmp, *hdr; + + if ( p == NULL ) + return; + + hdr = (struct xmalloc_hdr *)p - 1; + + /* We know hdr will be on same page. */ + if(((long)p & PAGE_MASK) != ((long)hdr & PAGE_MASK)) + { + printk("Header should be on the same page\n"); + *(int*)0=0; + } + + /* Not previously freed. */ + if(hdr->freelist.next || hdr->freelist.prev) + { + printk("Should not be previously freed\n"); + *(int*)0=0; + } + + /* Big allocs free directly. */ + if ( hdr->size >= PAGE_SIZE ) + { + free_pages(hdr, get_order(hdr->size)); + return; + } + + /* Merge with other free block, or put in list. 
*/ + /* spin_lock_irqsave(&freelist_lock, flags); */ + list_for_each_entry_safe( i, tmp, &freelist, freelist ) + { + unsigned long _i = (unsigned long)i; + unsigned long _hdr = (unsigned long)hdr; + + /* Do not merge across page boundaries. */ + if ( ((_i ^ _hdr) & PAGE_MASK) != 0 ) + continue; + + /* We follow this block? Swallow it. */ + if ( (_i + i->size) == _hdr ) + { + list_del(&i->freelist); + i->size += hdr->size; + hdr = i; + } + + /* We precede this block? Swallow it. */ + if ( (_hdr + hdr->size) == _i ) + { + list_del(&i->freelist); + hdr->size += i->size; + } + } + + /* Did we merge an entire page? */ + if ( hdr->size == PAGE_SIZE ) + { + if((((unsigned long)hdr) & (PAGE_SIZE-1)) != 0) + { + printk("Bug\n"); + *(int*)0=0; + } + free_pages(hdr, 0); + } + else + { + list_add(&hdr->freelist, &freelist); + } + + /* spin_unlock_irqrestore(&freelist_lock, flags); */ +} + diff -r de3576a1c62c -r dfaf788ab18c linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu.h --- /dev/null Thu Aug 25 20:52:38 2005 +++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu.h Fri Aug 26 20:47:16 2005 @@ -0,0 +1,33 @@ +#ifndef __x86_64_MMU_H +#define __x86_64_MMU_H + +#include <linux/spinlock.h> +#include <asm/semaphore.h> + +/* + * The x86_64 doesn't have a mmu context, but + * we put the segment information here. + * + * cpu_vm_mask is used to optimize ldt flushing. + */ +typedef struct { + void *ldt; + rwlock_t ldtlock; + int size; + struct semaphore sem; +#ifdef CONFIG_XEN + unsigned pinned:1; + struct list_head unpinned; +#endif +} mm_context_t; + +#ifdef CONFIG_XEN +extern struct list_head mm_unpinned; +extern spinlock_t mm_unpinned_lock; + +/* mm/memory.c:exit_mmap hook */ +extern void _arch_exit_mmap(struct mm_struct *mm); +#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm) +#endif + +#endif diff -r de3576a1c62c -r dfaf788ab18c tools/xenstore/testsuite/vg-suppressions --- /dev/null Thu Aug 25 20:52:38 2005 +++ b/tools/xenstore/testsuite/vg-suppressions Fri Aug 26 20:47:16 2005 @@ -0,0 +1,9 @@ +{ + Glibc goes boom from _start (Debian glibc 2.3.5-3) + Memcheck:Cond + obj:/lib/ld-2.3.5.so + obj:/lib/ld-2.3.5.so + obj:/lib/ld-2.3.5.so + obj:/lib/ld-2.3.5.so + obj:/lib/ld-2.3.5.so +} diff -r de3576a1c62c -r dfaf788ab18c extras/mini-os/lib/malloc.c --- a/extras/mini-os/lib/malloc.c Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,5697 +0,0 @@ -/* -*- Mode:C; c-basic-offset:4; tab-width:4 -*- - **************************************************************************** - * (C) 2003 - Rolf Neugebauer - Intel Research Cambridge - **************************************************************************** - * - * File: malloc.c - * Author: Rolf Neugebauer (neugebar@xxxxxxxxxxxxx) - * Changes: - * - * Date: Aug 2003 - * - * Environment: Xen Minimal OS - * Description: Library functions, maloc at al - * - **************************************************************************** - * $Id: c-insert.c,v 1.7 2002/11/08 16:04:34 rn Exp $ - **************************************************************************** - */ - -#include <os.h> -#include <mm.h> -#include <types.h> -#include <lib.h> - -/* standard compile option */ -#define HAVE_MEMCOPY 1 -#define USE_MEMCPY 1 -#undef HAVE_MMAP -#undef MMAP_CLEARS -#undef HAVE_MREMAP -#define malloc_getpagesize PAGE_SIZE -#undef HAVE_USR_INCLUDE_MALLOC_H -#define LACKS_UNISTD_H 1 -#define LACKS_SYS_PARAM_H 1 -#define LACKS_SYS_MMAN_H 1 -#define LACKS_FCNTL_H 1 - - -/* page allocator interface */ -#define MORECORE more_core -#define 
MORECORE_CONTIGUOUS 0 -#define MORECORE_FAILURE 0 -#define MORECORE_CANNOT_TRIM 1 - -static void *more_core(size_t n) -{ - static void *last; - unsigned long order, num_pages; - void *ret; - - if (n == 0) - return last; - - n = PFN_UP(n); - for ( order = 0; n > 1; order++ ) - n >>= 1; - ret = (void *)alloc_pages(order); - - /* work out pointer to end of chunk */ - if ( ret ) - { - num_pages = 1 << order; - last = (char *)ret + (num_pages * PAGE_SIZE); - } - - return ret; -} - -/* other options commented out below */ -#define __STD_C 1 -#define Void_t void -#define assert(x) ((void)0) - -#define CHUNK_SIZE_T unsigned long -#define PTR_UINT unsigned long -#define INTERNAL_SIZE_T size_t -#define SIZE_SZ (sizeof(INTERNAL_SIZE_T)) -#define MALLOC_ALIGNMENT (2 * SIZE_SZ) -#define MALLOC_ALIGN_MASK (MALLOC_ALIGNMENT - 1) -#define TRIM_FASTBINS 0 - -#define M_MXFAST 1 -#define DEFAULT_MXFAST 64 -#define M_TRIM_THRESHOLD -1 -#define DEFAULT_TRIM_THRESHOLD (256 * 1024) -#define M_TOP_PAD -2 -#define DEFAULT_TOP_PAD (0) -#define M_MMAP_THRESHOLD -3 -#define DEFAULT_MMAP_THRESHOLD (256 * 1024) -#define M_MMAP_MAX -4 -#define DEFAULT_MMAP_MAX (0) -#define MALLOC_FAILURE_ACTION printf("malloc failure\n") - -#define cALLOc public_cALLOc -#define fREe public_fREe -#define cFREe public_cFREe -#define mALLOc public_mALLOc -#define mEMALIGn public_mEMALIGn -#define rEALLOc public_rEALLOc -#define vALLOc public_vALLOc -#define pVALLOc public_pVALLOc -#define mALLINFo public_mALLINFo -#define mALLOPt public_mALLOPt -#define mTRIm public_mTRIm -#define mSTATs public_mSTATs -#define mUSABLe public_mUSABLe -#define iCALLOc public_iCALLOc -#define iCOMALLOc public_iCOMALLOc - -#define public_cALLOc calloc -#define public_fREe free -#define public_cFREe cfree -#define public_mALLOc malloc -#define public_mEMALIGn memalign -#define public_rEALLOc realloc -#define public_vALLOc valloc -#define public_pVALLOc pvalloc -#define public_mALLINFo mallinfo -#define public_mALLOPt mallopt -#define public_mTRIm malloc_trim -#define public_mSTATs malloc_stats -#define public_mUSABLe malloc_usable_size -#define public_iCALLOc independent_calloc -#define public_iCOMALLOc independent_comalloc - - -/* - This is a version (aka dlmalloc) of malloc/free/realloc written by - Doug Lea and released to the public domain. Use, modify, and - redistribute this code without permission or acknowledgement in any - way you wish. Send questions, comments, complaints, performance - data, etc to dl@xxxxxxxxxxxxx - -* VERSION 2.7.2 Sat Aug 17 09:07:30 2002 Doug Lea (dl at gee) - - Note: There may be an updated version of this malloc obtainable at - ftp://gee.cs.oswego.edu/pub/misc/malloc.c - Check before installing! - -* Quickstart - - This library is all in one file to simplify the most common usage: - ftp it, compile it (-O), and link it into another program. All - of the compile-time options default to reasonable values for use on - most unix platforms. Compile -DWIN32 for reasonable defaults on windows. - You might later want to step through various compile-time and dynamic - tuning options. - - For convenience, an include file for code using this malloc is at: - ftp://gee.cs.oswego.edu/pub/misc/malloc-2.7.1.h - You don't really need this .h file unless you call functions not - defined in your system include files. The .h file contains only the - excerpts from this file needed for using this malloc on ANSI C/C++ - systems, so long as you haven't changed compile-time options about - naming and tuning parameters. 
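
The more_core() function being deleted above was the glue that grafted dlmalloc onto the Mini-OS page allocator: MORECORE normally maps to sbrk(), so the port rounded each request to a power-of-two page count, grabbed it with alloc_pages(), and remembered the end of the last chunk so a zero-byte probe could return the current break. The replacement xmalloc drops this emulation and calls the page allocator directly. One sharp edge worth noting: the rounding loop appears to round a non-power-of-two page count down, so a three-page request yields order 1, i.e. two pages. A standalone reproduction of that arithmetic:

    #include <stdio.h>
    #include <stddef.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)
    #define PFN_UP(x)  (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)

    /* Rounding loop from the deleted more_core(): derive an
     * allocation order from a byte count. */
    static unsigned long bytes_to_order(size_t n)
    {
        unsigned long order, pages = PFN_UP(n);
        for (order = 0; pages > 1; order++)
            pages >>= 1;
        return order;
    }

    int main(void)
    {
        printf("order(4096)  = %lu\n", bytes_to_order(4096));   /* 0: 1 page  */
        printf("order(8192)  = %lu\n", bytes_to_order(8192));   /* 1: 2 pages */
        printf("order(12288) = %lu\n", bytes_to_order(12288));  /* 1: 3 pages
                                                                   rounded down */
        return 0;
    }
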
If you do, then you can create your - own malloc.h that does include all settings by cutting at the point - indicated below. - -* Why use this malloc? - - This is not the fastest, most space-conserving, most portable, or - most tunable malloc ever written. However it is among the fastest - while also being among the most space-conserving, portable and tunable. - Consistent balance across these factors results in a good general-purpose - allocator for malloc-intensive programs. - - The main properties of the algorithms are: - * For large (>= 512 bytes) requests, it is a pure best-fit allocator, - with ties normally decided via FIFO (i.e. least recently used). - * For small (<= 64 bytes by default) requests, it is a caching - allocator, that maintains pools of quickly recycled chunks. - * In between, and for combinations of large and small requests, it does - the best it can trying to meet both goals at once. - * For very large requests (>= 128KB by default), it relies on system - memory mapping facilities, if supported. - - For a longer but slightly out of date high-level description, see - http://gee.cs.oswego.edu/dl/html/malloc.html - - You may already by default be using a C library containing a malloc - that is based on some version of this malloc (for example in - linux). You might still want to use the one in this file in order to - customize settings or to avoid overheads associated with library - versions. - -* Contents, described in more detail in "description of public routines" below. - - Standard (ANSI/SVID/...) functions: - malloc(size_t n); - calloc(size_t n_elements, size_t element_size); - free(Void_t* p); - realloc(Void_t* p, size_t n); - memalign(size_t alignment, size_t n); - valloc(size_t n); - mallinfo() - mallopt(int parameter_number, int parameter_value) - - Additional functions: - independent_calloc(size_t n_elements, size_t size, Void_t* chunks[]); - independent_comalloc(size_t n_elements, size_t sizes[], Void_t* chunks[]); - pvalloc(size_t n); - cfree(Void_t* p); - malloc_trim(size_t pad); - malloc_usable_size(Void_t* p); - malloc_stats(); - -* Vital statistics: - - Supported pointer representation: 4 or 8 bytes - Supported size_t representation: 4 or 8 bytes - Note that size_t is allowed to be 4 bytes even if pointers are 8. - You can adjust this by defining INTERNAL_SIZE_T - - Alignment: 2 * sizeof(size_t) (default) - (i.e., 8 byte alignment with 4byte size_t). This suffices for - nearly all current machines and C compilers. However, you can - define MALLOC_ALIGNMENT to be wider than this if necessary. - - Minimum overhead per allocated chunk: 4 or 8 bytes - Each malloced chunk has a hidden word of overhead holding size - and status information. - - Minimum allocated size: 4-byte ptrs: 16 bytes (including 4 overhead) - 8-byte ptrs: 24/32 bytes (including, 4/8 overhead) - - When a chunk is freed, 12 (for 4byte ptrs) or 20 (for 8 byte - ptrs but 4 byte size) or 24 (for 8/8) additional bytes are - needed; 4 (8) for a trailing size field and 8 (16) bytes for - free list pointers. Thus, the minimum allocatable size is - 16/24/32 bytes. - - Even a request for zero bytes (i.e., malloc(0)) returns a - pointer to something of the minimum allocatable size. 
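[To make the size arithmetic above concrete, here is a standalone sketch, not part of this patch: it reproduces the request-to-chunk-size rounding using the same scheme as the request2size macro defined later in this file. SIZE_SZ and MALLOC_ALIGNMENT mirror the macros above; pad_request is a hypothetical name.

#include <stdio.h>
#include <stddef.h>

#define SIZE_SZ          (sizeof(size_t))      /* one bookkeeping word  */
#define MALLOC_ALIGNMENT (2 * SIZE_SZ)         /* default alignment     */
#define ALIGN_MASK       (MALLOC_ALIGNMENT - 1)
#define MIN_CHUNK        (4 * SIZE_SZ)         /* prev_size,size,fd,bk  */

/* Round a user request up to a usable chunk size: add one size word of
 * overhead, align, and never go below the minimum chunk size. */
static size_t pad_request(size_t req)
{
    size_t sz = (req + SIZE_SZ + ALIGN_MASK) & ~ALIGN_MASK;
    return sz < MIN_CHUNK ? MIN_CHUNK : sz;
}

int main(void)
{
    size_t reqs[] = { 0, 1, 13, 24, 100 };
    for (size_t i = 0; i < sizeof reqs / sizeof reqs[0]; i++)
        printf("request %3zu -> chunk %3zu bytes\n",
               reqs[i], pad_request(reqs[i]));
    return 0;
}

On a 64-bit target this prints the 32-byte minimum chunk for every request
up to 24 bytes, matching the minimum allocatable sizes quoted above.]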
- - The maximum overhead wastage (i.e., number of extra bytes - allocated than were requested in malloc) is less than or equal - to the minimum size, except for requests >= mmap_threshold that - are serviced via mmap(), where the worst case wastage is 2 * - sizeof(size_t) bytes plus the remainder from a system page (the - minimal mmap unit); typically 4096 or 8192 bytes. - - Maximum allocated size: 4-byte size_t: 2^32 minus about two pages - 8-byte size_t: 2^64 minus about two pages - - It is assumed that (possibly signed) size_t values suffice to - represent chunk sizes. `Possibly signed' is due to the fact - that `size_t' may be defined on a system as either a signed or - an unsigned type. The ISO C standard says that it must be - unsigned, but a few systems are known not to adhere to this. - Additionally, even when size_t is unsigned, sbrk (which is by - default used to obtain memory from system) accepts signed - arguments, and may not be able to handle size_t-wide arguments - with negative sign bit. Generally, values that would - appear as negative after accounting for overhead and alignment - are supported only via mmap(), which does not have this - limitation. - - Requests for sizes outside the allowed range will perform an optional - failure action and then return null. (Requests may also - also fail because a system is out of memory.) - - Thread-safety: NOT thread-safe unless USE_MALLOC_LOCK defined - - When USE_MALLOC_LOCK is defined, wrappers are created to - surround every public call with either a pthread mutex or - a win32 spinlock (depending on WIN32). This is not - especially fast, and can be a major bottleneck. - It is designed only to provide minimal protection - in concurrent environments, and to provide a basis for - extensions. If you are using malloc in a concurrent program, - you would be far better off obtaining ptmalloc, which is - derived from a version of this malloc, and is well-tuned for - concurrent programs. (See http://www.malloc.de) Note that - even when USE_MALLOC_LOCK is defined, you can can guarantee - full thread-safety only if no threads acquire memory through - direct calls to MORECORE or other system-level allocators. - - Compliance: I believe it is compliant with the 1997 Single Unix Specification - (See http://www.opennc.org). Also SVID/XPG, ANSI C, and probably - others as well. - -* Synopsis of compile-time options: - - People have reported using previous versions of this malloc on all - versions of Unix, sometimes by tweaking some of the defines - below. It has been tested most extensively on Solaris and - Linux. It is also reported to work on WIN32 platforms. - People also report using it in stand-alone embedded systems. - - The implementation is in straight, hand-tuned ANSI C. It is not - at all modular. (Sorry!) It uses a lot of macros. To be at all - usable, this code should be compiled using an optimizing compiler - (for example gcc -O3) that can simplify expressions and control - paths. (FAQ: some macros import variables as arguments rather than - declare locals because people reported that some debuggers - otherwise get confused.) 
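[Because these options are plain preprocessor symbols, one common way to
apply them is a small wrapper translation unit that defines the desired
symbols and then includes the allocator source; this is a hypothetical
sketch, and the vendored file name dlmalloc.c is assumed, not prescribed:

/* my_malloc.c -- hypothetical wrapper TU: pick compile-time options,
 * then pull in the allocator source so the whole file sees them. */
#define USE_DL_PREFIX            /* export dlmalloc()/dlfree() etc.   */
#define DEBUG 1                  /* enable the internal assert checks */
#define REALLOC_ZERO_BYTES_FREES /* make realloc(p, 0) act like free  */
#include "dlmalloc.c"

Building this one file with an optimizing compiler, as recommended above,
yields a dl-prefixed allocator that can coexist with the system malloc.]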
- - OPTION DEFAULT VALUE - - Compilation Environment options: - - __STD_C derived from C compiler defines - WIN32 NOT defined - HAVE_MEMCPY defined - USE_MEMCPY 1 if HAVE_MEMCPY is defined - HAVE_MMAP defined as 1 - MMAP_CLEARS 1 - HAVE_MREMAP 0 unless linux defined - malloc_getpagesize derived from system #includes, or 4096 if not - HAVE_USR_INCLUDE_MALLOC_H NOT defined - LACKS_UNISTD_H NOT defined unless WIN32 - LACKS_SYS_PARAM_H NOT defined unless WIN32 - LACKS_SYS_MMAN_H NOT defined unless WIN32 - LACKS_FCNTL_H NOT defined - - Changing default word sizes: - - INTERNAL_SIZE_T size_t - MALLOC_ALIGNMENT 2 * sizeof(INTERNAL_SIZE_T) - PTR_UINT unsigned long - CHUNK_SIZE_T unsigned long - - Configuration and functionality options: - - USE_DL_PREFIX NOT defined - USE_PUBLIC_MALLOC_WRAPPERS NOT defined - USE_MALLOC_LOCK NOT defined - DEBUG NOT defined - REALLOC_ZERO_BYTES_FREES NOT defined - MALLOC_FAILURE_ACTION errno = ENOMEM, if __STD_C defined, else no-op - TRIM_FASTBINS 0 - FIRST_SORTED_BIN_SIZE 512 - - Options for customizing MORECORE: - - MORECORE sbrk - MORECORE_CONTIGUOUS 1 - MORECORE_CANNOT_TRIM NOT defined - MMAP_AS_MORECORE_SIZE (1024 * 1024) - - Tuning options that are also dynamically changeable via mallopt: - - DEFAULT_MXFAST 64 - DEFAULT_TRIM_THRESHOLD 256 * 1024 - DEFAULT_TOP_PAD 0 - DEFAULT_MMAP_THRESHOLD 256 * 1024 - DEFAULT_MMAP_MAX 65536 - - There are several other #defined constants and macros that you - probably don't want to touch unless you are extending or adapting malloc. -*/ - -/* RN: XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX */ -#if 0 - -/* - WIN32 sets up defaults for MS environment and compilers. - Otherwise defaults are for unix. -*/ - -/* #define WIN32 */ - -#ifdef WIN32 - -#define WIN32_LEAN_AND_MEAN -#include <windows.h> - -/* Win32 doesn't supply or need the following headers */ -#define LACKS_UNISTD_H -#define LACKS_SYS_PARAM_H -#define LACKS_SYS_MMAN_H - -/* Use the supplied emulation of sbrk */ -#define MORECORE sbrk -#define MORECORE_CONTIGUOUS 1 -#define MORECORE_FAILURE ((void*)(-1)) - -/* Use the supplied emulation of mmap and munmap */ -#define HAVE_MMAP 1 -#define MUNMAP_FAILURE (-1) -#define MMAP_CLEARS 1 - -/* These values don't really matter in windows mmap emulation */ -#define MAP_PRIVATE 1 -#define MAP_ANONYMOUS 2 -#define PROT_READ 1 -#define PROT_WRITE 2 - -/* Emulation functions defined at the end of this file */ - -/* If USE_MALLOC_LOCK, use supplied critical-section-based lock functions */ -#ifdef USE_MALLOC_LOCK -static int slwait(int *sl); -static int slrelease(int *sl); -#endif - -static long getpagesize(void); -static long getregionsize(void); -static void *sbrk(long size); -static void *mmap(void *ptr, long size, long prot, long type, long handle, long arg); -static long munmap(void *ptr, long size); - -static void vminfo (unsigned long*free, unsigned long*reserved, unsigned long*committed); -static int cpuinfo (int whole, unsigned long*kernel, unsigned long*user); - -#endif - -/* - __STD_C should be nonzero if using ANSI-standard C compiler, a C++ - compiler, or a C compiler sufficiently close to ANSI to get away - with it. 
-*/ - -#ifndef __STD_C -#if defined(__STDC__) || defined(_cplusplus) -#define __STD_C 1 -#else -#define __STD_C 0 -#endif -#endif /*__STD_C*/ - - -/* - Void_t* is the pointer type that malloc should say it returns -*/ - -#ifndef Void_t -#if (__STD_C || defined(WIN32)) -#define Void_t void -#else -#define Void_t char -#endif -#endif /*Void_t*/ - -#if __STD_C -#include <stddef.h> /* for size_t */ -#else -#include <sys/types.h> -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -/* define LACKS_UNISTD_H if your system does not have a <unistd.h>. */ - -/* #define LACKS_UNISTD_H */ - -#ifndef LACKS_UNISTD_H -#include <unistd.h> -#endif - -/* define LACKS_SYS_PARAM_H if your system does not have a <sys/param.h>. */ - -/* #define LACKS_SYS_PARAM_H */ - - -#include <stdio.h> /* needed for malloc_stats */ -#include <errno.h> /* needed for optional MALLOC_FAILURE_ACTION */ - - -/* - Debugging: - - Because freed chunks may be overwritten with bookkeeping fields, this - malloc will often die when freed memory is overwritten by user - programs. This can be very effective (albeit in an annoying way) - in helping track down dangling pointers. - - If you compile with -DDEBUG, a number of assertion checks are - enabled that will catch more memory errors. You probably won't be - able to make much sense of the actual assertion errors, but they - should help you locate incorrectly overwritten memory. The - checking is fairly extensive, and will slow down execution - noticeably. Calling malloc_stats or mallinfo with DEBUG set will - attempt to check every non-mmapped allocated and free chunk in the - course of computing the summmaries. (By nature, mmapped regions - cannot be checked very much automatically.) - - Setting DEBUG may also be helpful if you are trying to modify - this code. The assertions in the check routines spell out in more - detail the assumptions and invariants underlying the algorithms. - - Setting DEBUG does NOT provide an automated mechanism for checking - that all accesses to malloced memory stay within their - bounds. However, there are several add-ons and adaptations of this - or other mallocs available that do this. -*/ - -#if DEBUG -#include <assert.h> -#else -#define assert(x) ((void)0) -#endif - -/* - The unsigned integer type used for comparing any two chunk sizes. - This should be at least as wide as size_t, but should not be signed. -*/ - -#ifndef CHUNK_SIZE_T -#define CHUNK_SIZE_T unsigned long -#endif - -/* - The unsigned integer type used to hold addresses when they are are - manipulated as integers. Except that it is not defined on all - systems, intptr_t would suffice. -*/ -#ifndef PTR_UINT -#define PTR_UINT unsigned long -#endif - - -/* - INTERNAL_SIZE_T is the word-size used for internal bookkeeping - of chunk sizes. - - The default version is the same as size_t. - - While not strictly necessary, it is best to define this as an - unsigned type, even if size_t is a signed type. This may avoid some - artificial size limitations on some systems. - - On a 64-bit machine, you may be able to reduce malloc overhead by - defining INTERNAL_SIZE_T to be a 32 bit `unsigned int' at the - expense of not being able to handle more than 2^32 of malloced - space. If this limitation is acceptable, you are encouraged to set - this unless you are on a platform requiring 16byte alignments. In - this case the alignment requirements turn out to negate any - potential advantages of decreasing size_t word size. 
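[The overhead saving described in the preceding paragraph is easy to
quantify; this hypothetical check (assuming an LP64 platform with a
32-bit int) simply prints the two candidate bookkeeping word sizes:

#include <stdio.h>

int main(void)
{
    /* Per-chunk header word with the default INTERNAL_SIZE_T (size_t)
     * versus a 32-bit unsigned int on a 64-bit build. */
    printf("size_t header word:       %zu bytes\n", sizeof(size_t));
    printf("unsigned int header word: %zu bytes\n", sizeof(unsigned int));
    /* The smaller word halves bookkeeping overhead per chunk, at the
     * price of capping total malloced space at 2^32 bytes. */
    return 0;
}]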
- - Implementors: Beware of the possible combinations of: - - INTERNAL_SIZE_T might be signed or unsigned, might be 32 or 64 bits, - and might be the same width as int or as long - - size_t might have different width and signedness as INTERNAL_SIZE_T - - int and long might be 32 or 64 bits, and might be the same width - To deal with this, most comparisons and difference computations - among INTERNAL_SIZE_Ts should cast them to CHUNK_SIZE_T, being - aware of the fact that casting an unsigned int to a wider long does - not sign-extend. (This also makes checking for negative numbers - awkward.) Some of these casts result in harmless compiler warnings - on some systems. -*/ - -#ifndef INTERNAL_SIZE_T -#define INTERNAL_SIZE_T size_t -#endif - -/* The corresponding word size */ -#define SIZE_SZ (sizeof(INTERNAL_SIZE_T)) - - - -/* - MALLOC_ALIGNMENT is the minimum alignment for malloc'ed chunks. - It must be a power of two at least 2 * SIZE_SZ, even on machines - for which smaller alignments would suffice. It may be defined as - larger than this though. Note however that code and data structures - are optimized for the case of 8-byte alignment. -*/ - - -#ifndef MALLOC_ALIGNMENT -#define MALLOC_ALIGNMENT (2 * SIZE_SZ) -#endif - -/* The corresponding bit mask value */ -#define MALLOC_ALIGN_MASK (MALLOC_ALIGNMENT - 1) - - - -/* - REALLOC_ZERO_BYTES_FREES should be set if a call to - realloc with zero bytes should be the same as a call to free. - Some people think it should. Otherwise, since this malloc - returns a unique pointer for malloc(0), so does realloc(p, 0). -*/ - -/* #define REALLOC_ZERO_BYTES_FREES */ - -/* - TRIM_FASTBINS controls whether free() of a very small chunk can - immediately lead to trimming. Setting to true (1) can reduce memory - footprint, but will almost always slow down programs that use a lot - of small chunks. - - Define this only if you are willing to give up some speed to more - aggressively reduce system-level memory footprint when releasing - memory in programs that use many small chunks. You can get - essentially the same effect by setting MXFAST to 0, but this can - lead to even greater slowdowns in programs using many small chunks. - TRIM_FASTBINS is an in-between compile-time option, that disables - only those chunks bordering topmost memory from being placed in - fastbins. -*/ - -#ifndef TRIM_FASTBINS -#define TRIM_FASTBINS 0 -#endif - - -/* - USE_DL_PREFIX will prefix all public routines with the string 'dl'. - This is necessary when you only want to use this malloc in one part - of a program, using your regular system malloc elsewhere. -*/ - -/* #define USE_DL_PREFIX */ - - -/* - USE_MALLOC_LOCK causes wrapper functions to surround each - callable routine with pthread mutex lock/unlock. - - USE_MALLOC_LOCK forces USE_PUBLIC_MALLOC_WRAPPERS to be defined -*/ - - -/* #define USE_MALLOC_LOCK */ - - -/* - If USE_PUBLIC_MALLOC_WRAPPERS is defined, every public routine is - actually a wrapper function that first calls MALLOC_PREACTION, then - calls the internal routine, and follows it with - MALLOC_POSTACTION. This is needed for locking, but you can also use - this, without USE_MALLOC_LOCK, for purposes of interception, - instrumentation, etc. It is a sad fact that using wrappers often - noticeably degrades performance of malloc-intensive programs. -*/ - -#ifdef USE_MALLOC_LOCK -#define USE_PUBLIC_MALLOC_WRAPPERS -#else -/* #define USE_PUBLIC_MALLOC_WRAPPERS */ -#endif - - -/* - Two-phase name translation. - All of the actual routines are given mangled names. 
- When wrappers are used, they become the public callable versions. - When DL_PREFIX is used, the callable names are prefixed. -*/ - -#ifndef USE_PUBLIC_MALLOC_WRAPPERS -#define cALLOc public_cALLOc -#define fREe public_fREe -#define cFREe public_cFREe -#define mALLOc public_mALLOc -#define mEMALIGn public_mEMALIGn -#define rEALLOc public_rEALLOc -#define vALLOc public_vALLOc -#define pVALLOc public_pVALLOc -#define mALLINFo public_mALLINFo -#define mALLOPt public_mALLOPt -#define mTRIm public_mTRIm -#define mSTATs public_mSTATs -#define mUSABLe public_mUSABLe -#define iCALLOc public_iCALLOc -#define iCOMALLOc public_iCOMALLOc -#endif - -#ifdef USE_DL_PREFIX -#define public_cALLOc dlcalloc -#define public_fREe dlfree -#define public_cFREe dlcfree -#define public_mALLOc dlmalloc -#define public_mEMALIGn dlmemalign -#define public_rEALLOc dlrealloc -#define public_vALLOc dlvalloc -#define public_pVALLOc dlpvalloc -#define public_mALLINFo dlmallinfo -#define public_mALLOPt dlmallopt -#define public_mTRIm dlmalloc_trim -#define public_mSTATs dlmalloc_stats -#define public_mUSABLe dlmalloc_usable_size -#define public_iCALLOc dlindependent_calloc -#define public_iCOMALLOc dlindependent_comalloc -#else /* USE_DL_PREFIX */ -#define public_cALLOc calloc -#define public_fREe free -#define public_cFREe cfree -#define public_mALLOc malloc -#define public_mEMALIGn memalign -#define public_rEALLOc realloc -#define public_vALLOc valloc -#define public_pVALLOc pvalloc -#define public_mALLINFo mallinfo -#define public_mALLOPt mallopt -#define public_mTRIm malloc_trim -#define public_mSTATs malloc_stats -#define public_mUSABLe malloc_usable_size -#define public_iCALLOc independent_calloc -#define public_iCOMALLOc independent_comalloc -#endif /* USE_DL_PREFIX */ - - -/* - HAVE_MEMCPY should be defined if you are not otherwise using - ANSI STD C, but still have memcpy and memset in your C library - and want to use them in calloc and realloc. Otherwise simple - macro versions are defined below. - - USE_MEMCPY should be defined as 1 if you actually want to - have memset and memcpy called. People report that the macro - versions are faster than libc versions on some systems. - - Even if USE_MEMCPY is set to 1, loops to copy/clear small chunks - (of <= 36 bytes) are manually unrolled in realloc and calloc. -*/ - -#define HAVE_MEMCPY - -#ifndef USE_MEMCPY -#ifdef HAVE_MEMCPY -#define USE_MEMCPY 1 -#else -#define USE_MEMCPY 0 -#endif -#endif - - -#if (__STD_C || defined(HAVE_MEMCPY)) - -#ifdef WIN32 -/* On Win32 memset and memcpy are already declared in windows.h */ -#else -#if __STD_C -void* memset(void*, int, size_t); -void* memcpy(void*, const void*, size_t); -#else -Void_t* memset(); -Void_t* memcpy(); -#endif -#endif -#endif - -/* - MALLOC_FAILURE_ACTION is the action to take before "return 0" when - malloc fails to be able to return memory, either because memory is - exhausted or because of illegal arguments. - - By default, sets errno if running on STD_C platform, else does nothing. -*/ - -#ifndef MALLOC_FAILURE_ACTION -#if __STD_C -#define MALLOC_FAILURE_ACTION \ - errno = ENOMEM; - -#else -#define MALLOC_FAILURE_ACTION -#endif -#endif - -/* - MORECORE-related declarations. By default, rely on sbrk -*/ - - -#ifdef LACKS_UNISTD_H -#if !defined(__FreeBSD__) && !defined(__OpenBSD__) && !defined(__NetBSD__) -#if __STD_C -extern Void_t* sbrk(ptrdiff_t); -#else -extern Void_t* sbrk(); -#endif -#endif -#endif - -/* - MORECORE is the name of the routine to call to obtain more memory - from the system. 
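[As a self-contained illustration, before the general guidance: any
monotonic, sbrk-shaped routine can serve as MORECORE. This sketch is
hypothetical, not from this tree; it doles out a fixed static arena and
reports failure with the sbrk-style (void *)-1:

#include <stddef.h>

#define ARENA_BYTES (1 << 20)
static char arena[ARENA_BYTES];
static size_t brk_off;

/* Hypothetical MORECORE: n == 0 queries the current break, a positive n
 * grows it, and exhaustion returns (void *)-1 (MORECORE_FAILURE). */
static void *arena_morecore(ptrdiff_t n)
{
    if (n == 0)
        return arena + brk_off;             /* query current break    */
    if (n < 0 || (size_t)n > ARENA_BYTES - brk_off)
        return (void *)-1;                  /* cannot trim or grow    */
    void *p = arena + brk_off;
    brk_off += (size_t)n;
    return p;                               /* contiguous, like sbrk  */
}

int main(void)
{
    void *p = arena_morecore(4096);         /* grow by one page       */
    void *q = arena_morecore(0);            /* query the new break    */
    return (p != (void *)-1 && (char *)q == (char *)p + 4096) ? 0 : 1;
}

A build using it would define MORECORE=arena_morecore and, because the
arena can never be given back, MORECORE_CANNOT_TRIM; the mini-os
more_core() removed earlier in this patch plays exactly this role on top
of alloc_pages().]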
See below for general guidance on writing - alternative MORECORE functions, as well as a version for WIN32 and a - sample version for pre-OSX macos. -*/ - -#ifndef MORECORE -#define MORECORE sbrk -#endif - -/* - MORECORE_FAILURE is the value returned upon failure of MORECORE - as well as mmap. Since it cannot be an otherwise valid memory address, - and must reflect values of standard sys calls, you probably ought not - try to redefine it. -*/ - -#ifndef MORECORE_FAILURE -#define MORECORE_FAILURE (-1) -#endif - -/* - If MORECORE_CONTIGUOUS is true, take advantage of fact that - consecutive calls to MORECORE with positive arguments always return - contiguous increasing addresses. This is true of unix sbrk. Even - if not defined, when regions happen to be contiguous, malloc will - permit allocations spanning regions obtained from different - calls. But defining this when applicable enables some stronger - consistency checks and space efficiencies. -*/ - -#ifndef MORECORE_CONTIGUOUS -#define MORECORE_CONTIGUOUS 1 -#endif - -/* - Define MORECORE_CANNOT_TRIM if your version of MORECORE - cannot release space back to the system when given negative - arguments. This is generally necessary only if you are using - a hand-crafted MORECORE function that cannot handle negative arguments. -*/ - -/* #define MORECORE_CANNOT_TRIM */ - - -/* - Define HAVE_MMAP as true to optionally make malloc() use mmap() to - allocate very large blocks. These will be returned to the - operating system immediately after a free(). Also, if mmap - is available, it is used as a backup strategy in cases where - MORECORE fails to provide space from system. - - This malloc is best tuned to work with mmap for large requests. - If you do not have mmap, operations involving very large chunks (1MB - or so) may be slower than you'd like. -*/ - -#ifndef HAVE_MMAP -#define HAVE_MMAP 1 -#endif - -#if HAVE_MMAP -/* - Standard unix mmap using /dev/zero clears memory so calloc doesn't - need to. -*/ - -#ifndef MMAP_CLEARS -#define MMAP_CLEARS 1 -#endif - -#else /* no mmap */ -#ifndef MMAP_CLEARS -#define MMAP_CLEARS 0 -#endif -#endif - - -/* - MMAP_AS_MORECORE_SIZE is the minimum mmap size argument to use if - sbrk fails, and mmap is used as a backup (which is done only if - HAVE_MMAP). The value must be a multiple of page size. This - backup strategy generally applies only when systems have "holes" in - address space, so sbrk cannot perform contiguous expansion, but - there is still space available on system. On systems for which - this is known to be useful (i.e. most linux kernels), this occurs - only when programs allocate huge amounts of memory. Between this, - and the fact that mmap regions tend to be limited, the size should - be large, to avoid too many mmap calls and thus avoid running out - of kernel resources. -*/ - -#ifndef MMAP_AS_MORECORE_SIZE -#define MMAP_AS_MORECORE_SIZE (1024 * 1024) -#endif - -/* - Define HAVE_MREMAP to make realloc() use mremap() to re-allocate - large blocks. This is currently only possible on Linux with - kernel versions newer than 1.3.77. -*/ - -#ifndef HAVE_MREMAP -#ifdef linux -#define HAVE_MREMAP 1 -#else -#define HAVE_MREMAP 0 -#endif - -#endif /* HAVE_MMAP */ - - -/* - The system page size. To the extent possible, this malloc manages - memory from the system in page-size units. Note that this value is - cached during initialization into a field of malloc_state. So even - if malloc_getpagesize is a function, it is only called once. 
- - The following mechanics for getpagesize were adapted from bsd/gnu - getpagesize.h. If none of the system-probes here apply, a value of - 4096 is used, which should be OK: If they don't apply, then using - the actual value probably doesn't impact performance. -*/ - - -#ifndef malloc_getpagesize - -#ifndef LACKS_UNISTD_H -# include <unistd.h> -#endif - -# ifdef _SC_PAGESIZE /* some SVR4 systems omit an underscore */ -# ifndef _SC_PAGE_SIZE -# define _SC_PAGE_SIZE _SC_PAGESIZE -# endif -# endif - -# ifdef _SC_PAGE_SIZE -# define malloc_getpagesize sysconf(_SC_PAGE_SIZE) -# else -# if defined(BSD) || defined(DGUX) || defined(HAVE_GETPAGESIZE) - extern size_t getpagesize(); -# define malloc_getpagesize getpagesize() -# else -# ifdef WIN32 /* use supplied emulation of getpagesize */ -# define malloc_getpagesize getpagesize() -# else -# ifndef LACKS_SYS_PARAM_H -# include <sys/param.h> -# endif -# ifdef EXEC_PAGESIZE -# define malloc_getpagesize EXEC_PAGESIZE -# else -# ifdef NBPG -# ifndef CLSIZE -# define malloc_getpagesize NBPG -# else -# define malloc_getpagesize (NBPG * CLSIZE) -# endif -# else -# ifdef NBPC -# define malloc_getpagesize NBPC -# else -# ifdef PAGESIZE -# define malloc_getpagesize PAGESIZE -# else /* just guess */ -# define malloc_getpagesize (4096) -# endif -# endif -# endif -# endif -# endif -# endif -# endif -#endif - -/* - This version of malloc supports the standard SVID/XPG mallinfo - routine that returns a struct containing usage properties and - statistics. It should work on any SVID/XPG compliant system that has - a /usr/include/malloc.h defining struct mallinfo. (If you'd like to - install such a thing yourself, cut out the preliminary declarations - as described above and below and save them in a malloc.h file. But - there's no compelling reason to bother to do this.) - - The main declaration needed is the mallinfo struct that is returned - (by-copy) by mallinfo(). The SVID/XPG malloinfo struct contains a - bunch of fields that are not even meaningful in this version of - malloc. These fields are are instead filled by mallinfo() with - other numbers that might be of interest. - - HAVE_USR_INCLUDE_MALLOC_H should be set if you have a - /usr/include/malloc.h file that includes a declaration of struct - mallinfo. If so, it is included; else an SVID2/XPG2 compliant - version is declared below. These must be precisely the same for - mallinfo() to work. The original SVID version of this struct, - defined on most systems with mallinfo, declares all fields as - ints. But some others define as unsigned long. If your system - defines the fields using a type of different width than listed here, - you must #include your system version and #define - HAVE_USR_INCLUDE_MALLOC_H. -*/ - -/* #define HAVE_USR_INCLUDE_MALLOC_H */ - -#ifdef HAVE_USR_INCLUDE_MALLOC_H -#include "/usr/include/malloc.h" -#else - -/* SVID2/XPG mallinfo structure */ - -struct mallinfo { - int arena; /* non-mmapped space allocated from system */ - int ordblks; /* number of free chunks */ - int smblks; /* number of fastbin blocks */ - int hblks; /* number of mmapped regions */ - int hblkhd; /* space in mmapped regions */ - int usmblks; /* maximum total allocated space */ - int fsmblks; /* space available in freed fastbin blocks */ - int uordblks; /* total allocated space */ - int fordblks; /* total free space */ - int keepcost; /* top-most, releasable (via malloc_trim) space */ -}; - -/* - SVID/XPG defines four standard parameter numbers for mallopt, - normally defined in malloc.h. 
Only one of these (M_MXFAST) is used - in this malloc. The others (M_NLBLKS, M_GRAIN, M_KEEP) don't apply, - so setting them has no effect. But this malloc also supports other - options in mallopt described below. -*/ -#endif - - -/* ---------- description of public routines ------------ */ - -/* - malloc(size_t n) - Returns a pointer to a newly allocated chunk of at least n bytes, or null - if no space is available. Additionally, on failure, errno is - set to ENOMEM on ANSI C systems. - - If n is zero, malloc returns a minumum-sized chunk. (The minimum - size is 16 bytes on most 32bit systems, and 24 or 32 bytes on 64bit - systems.) On most systems, size_t is an unsigned type, so calls - with negative arguments are interpreted as requests for huge amounts - of space, which will often fail. The maximum supported value of n - differs across systems, but is in all cases less than the maximum - representable value of a size_t. -*/ -#if __STD_C -Void_t* public_mALLOc(size_t); -#else -Void_t* public_mALLOc(); -#endif - -/* - free(Void_t* p) - Releases the chunk of memory pointed to by p, that had been previously - allocated using malloc or a related routine such as realloc. - It has no effect if p is null. It can have arbitrary (i.e., bad!) - effects if p has already been freed. - - Unless disabled (using mallopt), freeing very large spaces will - when possible, automatically trigger operations that give - back unused memory to the system, thus reducing program footprint. -*/ -#if __STD_C -void public_fREe(Void_t*); -#else -void public_fREe(); -#endif - -/* - calloc(size_t n_elements, size_t element_size); - Returns a pointer to n_elements * element_size bytes, with all locations - set to zero. -*/ -#if __STD_C -Void_t* public_cALLOc(size_t, size_t); -#else -Void_t* public_cALLOc(); -#endif - -/* - realloc(Void_t* p, size_t n) - Returns a pointer to a chunk of size n that contains the same data - as does chunk p up to the minimum of (n, p's size) bytes, or null - if no space is available. - - The returned pointer may or may not be the same as p. The algorithm - prefers extending p when possible, otherwise it employs the - equivalent of a malloc-copy-free sequence. - - If p is null, realloc is equivalent to malloc. - - If space is not available, realloc returns null, errno is set (if on - ANSI) and p is NOT freed. - - if n is for fewer bytes than already held by p, the newly unused - space is lopped off and freed if possible. Unless the #define - REALLOC_ZERO_BYTES_FREES is set, realloc with a size argument of - zero (re)allocates a minimum-sized chunk. - - Large chunks that were internally obtained via mmap will always - be reallocated using malloc-copy-free sequences unless - the system supports MREMAP (currently only linux). - - The old unix realloc convention of allowing the last-free'd chunk - to be used as an argument to realloc is not supported. -*/ -#if __STD_C -Void_t* public_rEALLOc(Void_t*, size_t); -#else -Void_t* public_rEALLOc(); -#endif - -/* - memalign(size_t alignment, size_t n); - Returns a pointer to a newly allocated chunk of n bytes, aligned - in accord with the alignment argument. - - The alignment argument should be a power of two. If the argument is - not a power of two, the nearest greater power is used. - 8-byte alignment is guaranteed by normal malloc calls, so don't - bother calling memalign with an argument of 8 or less. - - Overreliance on memalign is a sure way to fragment space. 
-*/ -#if __STD_C -Void_t* public_mEMALIGn(size_t, size_t); -#else -Void_t* public_mEMALIGn(); -#endif - -/* - valloc(size_t n); - Equivalent to memalign(pagesize, n), where pagesize is the page - size of the system. If the pagesize is unknown, 4096 is used. -*/ -#if __STD_C -Void_t* public_vALLOc(size_t); -#else -Void_t* public_vALLOc(); -#endif - - - -/* - mallopt(int parameter_number, int parameter_value) - Sets tunable parameters The format is to provide a - (parameter-number, parameter-value) pair. mallopt then sets the - corresponding parameter to the argument value if it can (i.e., so - long as the value is meaningful), and returns 1 if successful else - 0. SVID/XPG/ANSI defines four standard param numbers for mallopt, - normally defined in malloc.h. Only one of these (M_MXFAST) is used - in this malloc. The others (M_NLBLKS, M_GRAIN, M_KEEP) don't apply, - so setting them has no effect. But this malloc also supports four - other options in mallopt. See below for details. Briefly, supported - parameters are as follows (listed defaults are for "typical" - configurations). - - Symbol param # default allowed param values - M_MXFAST 1 64 0-80 (0 disables fastbins) - M_TRIM_THRESHOLD -1 256*1024 any (-1U disables trimming) - M_TOP_PAD -2 0 any - M_MMAP_THRESHOLD -3 256*1024 any (or 0 if no MMAP support) - M_MMAP_MAX -4 65536 any (0 disables use of mmap) -*/ -#if __STD_C -int public_mALLOPt(int, int); -#else -int public_mALLOPt(); -#endif - - -/* - mallinfo() - Returns (by copy) a struct containing various summary statistics: - - arena: current total non-mmapped bytes allocated from system - ordblks: the number of free chunks - smblks: the number of fastbin blocks (i.e., small chunks that - have been freed but not use resused or consolidated) - hblks: current number of mmapped regions - hblkhd: total bytes held in mmapped regions - usmblks: the maximum total allocated space. This will be greater - than current total if trimming has occurred. - fsmblks: total bytes held in fastbin blocks - uordblks: current total allocated space (normal or mmapped) - fordblks: total free space - keepcost: the maximum number of bytes that could ideally be released - back to system via malloc_trim. ("ideally" means that - it ignores page restrictions etc.) - - Because these fields are ints, but internal bookkeeping may - be kept as longs, the reported values may wrap around zero and - thus be inaccurate. -*/ -#if __STD_C -struct mallinfo public_mALLINFo(void); -#else -struct mallinfo public_mALLINFo(); -#endif - -/* - independent_calloc(size_t n_elements, size_t element_size, Void_t* chunks[]); - - independent_calloc is similar to calloc, but instead of returning a - single cleared space, it returns an array of pointers to n_elements - independent elements that can hold contents of size elem_size, each - of which starts out cleared, and can be independently freed, - realloc'ed etc. The elements are guaranteed to be adjacently - allocated (this is not guaranteed to occur with multiple callocs or - mallocs), which may also improve cache locality in some - applications. - - The "chunks" argument is optional (i.e., may be null, which is - probably the most typical usage). If it is null, the returned array - is itself dynamically allocated and should also be freed when it is - no longer needed. Otherwise, the chunks array must be of at least - n_elements in length. It is filled in with the pointers to the - chunks. 
- - In either case, independent_calloc returns this pointer array, or - null if the allocation failed. If n_elements is zero and "chunks" - is null, it returns a chunk representing an array with zero elements - (which should be freed if not wanted). - - Each element must be individually freed when it is no longer - needed. If you'd like to instead be able to free all at once, you - should instead use regular calloc and assign pointers into this - space to represent elements. (In this case though, you cannot - independently free elements.) - - independent_calloc simplifies and speeds up implementations of many - kinds of pools. It may also be useful when constructing large data - structures that initially have a fixed number of fixed-sized nodes, - but the number is not known at compile time, and some of the nodes - may later need to be freed. For example: - - struct Node { int item; struct Node* next; }; - - struct Node* build_list() { - struct Node** pool; - int n = read_number_of_nodes_needed(); - if (n <= 0) return 0; - pool = (struct Node**)(independent_calloc(n, sizeof(struct Node), 0); - if (pool == 0) die(); - // organize into a linked list... - struct Node* first = pool[0]; - for (i = 0; i < n-1; ++i) - pool[i]->next = pool[i+1]; - free(pool); // Can now free the array (or not, if it is needed later) - return first; - } -*/ -#if __STD_C -Void_t** public_iCALLOc(size_t, size_t, Void_t**); -#else -Void_t** public_iCALLOc(); -#endif - -/* - independent_comalloc(size_t n_elements, size_t sizes[], Void_t* chunks[]); - - independent_comalloc allocates, all at once, a set of n_elements - chunks with sizes indicated in the "sizes" array. It returns - an array of pointers to these elements, each of which can be - independently freed, realloc'ed etc. The elements are guaranteed to - be adjacently allocated (this is not guaranteed to occur with - multiple callocs or mallocs), which may also improve cache locality - in some applications. - - The "chunks" argument is optional (i.e., may be null). If it is null - the returned array is itself dynamically allocated and should also - be freed when it is no longer needed. Otherwise, the chunks array - must be of at least n_elements in length. It is filled in with the - pointers to the chunks. - - In either case, independent_comalloc returns this pointer array, or - null if the allocation failed. If n_elements is zero and chunks is - null, it returns a chunk representing an array with zero elements - (which should be freed if not wanted). - - Each element must be individually freed when it is no longer - needed. If you'd like to instead be able to free all at once, you - should instead use a single regular malloc, and assign pointers at - particular offsets in the aggregate space. (In this case though, you - cannot independently free elements.) - - independent_comallac differs from independent_calloc in that each - element may have a different size, and also that it does not - automatically clear elements. - - independent_comalloc can be used to speed up allocation in cases - where several structs or objects must always be allocated at the - same time. For example: - - struct Head { ... } - struct Foot { ... 
} - - void send_message(char* msg) { - int msglen = strlen(msg); - size_t sizes[3] = { sizeof(struct Head), msglen, sizeof(struct Foot) }; - void* chunks[3]; - if (independent_comalloc(3, sizes, chunks) == 0) - die(); - struct Head* head = (struct Head*)(chunks[0]); - char* body = (char*)(chunks[1]); - struct Foot* foot = (struct Foot*)(chunks[2]); - // ... - } - - In general though, independent_comalloc is worth using only for - larger values of n_elements. For small values, you probably won't - detect enough difference from series of malloc calls to bother. - - Overuse of independent_comalloc can increase overall memory usage, - since it cannot reuse existing noncontiguous small chunks that - might be available for some of the elements. -*/ -#if __STD_C -Void_t** public_iCOMALLOc(size_t, size_t*, Void_t**); -#else -Void_t** public_iCOMALLOc(); -#endif - - -/* - pvalloc(size_t n); - Equivalent to valloc(minimum-page-that-holds(n)), that is, - round up n to nearest pagesize. - */ -#if __STD_C -Void_t* public_pVALLOc(size_t); -#else -Void_t* public_pVALLOc(); -#endif - -/* - cfree(Void_t* p); - Equivalent to free(p). - - cfree is needed/defined on some systems that pair it with calloc, - for odd historical reasons (such as: cfree is used in example - code in the first edition of K&R). -*/ -#if __STD_C -void public_cFREe(Void_t*); -#else -void public_cFREe(); -#endif - -/* - malloc_trim(size_t pad); - - If possible, gives memory back to the system (via negative - arguments to sbrk) if there is unused memory at the `high' end of - the malloc pool. You can call this after freeing large blocks of - memory to potentially reduce the system-level memory requirements - of a program. However, it cannot guarantee to reduce memory. Under - some allocation patterns, some large free blocks of memory will be - locked between two used chunks, so they cannot be given back to - the system. - - The `pad' argument to malloc_trim represents the amount of free - trailing space to leave untrimmed. If this argument is zero, - only the minimum amount of memory to maintain internal data - structures will be left (one page or less). Non-zero arguments - can be supplied to maintain enough trailing space to service - future expected allocations without having to re-obtain memory - from the system. - - Malloc_trim returns 1 if it actually released any memory, else 0. - On systems that do not support "negative sbrks", it will always - rreturn 0. -*/ -#if __STD_C -int public_mTRIm(size_t); -#else -int public_mTRIm(); -#endif - -/* - malloc_usable_size(Void_t* p); - - Returns the number of bytes you can actually use in - an allocated chunk, which may be more than you requested (although - often not) due to alignment and minimum size constraints. - You can use this many bytes without worrying about - overwriting other allocated objects. This is not a particularly great - programming practice. malloc_usable_size can be more useful in - debugging and assertions, for example: - - p = malloc(n); - assert(malloc_usable_size(p) >= 256); - -*/ -#if __STD_C -size_t public_mUSABLe(Void_t*); -#else -size_t public_mUSABLe(); -#endif - -/* - malloc_stats(); - Prints on stderr the amount of space obtained from the system (both - via sbrk and mmap), the maximum amount (which may be more than - current if malloc_trim and/or munmap got called), and the current - number of bytes allocated via malloc (or realloc, etc) but not yet - freed. Note that this is the number of bytes allocated, not the - number requested. 
It will be larger than the number requested - because of alignment and bookkeeping overhead. Because it includes - alignment wastage as being in use, this figure may be greater than - zero even when no user-level chunks are allocated. - - The reported current and maximum system memory can be inaccurate if - a program makes other calls to system memory allocation functions - (normally sbrk) outside of malloc. - - malloc_stats prints only the most commonly interesting statistics. - More information can be obtained by calling mallinfo. - -*/ -#if __STD_C -void public_mSTATs(); -#else -void public_mSTATs(); -#endif - -/* mallopt tuning options */ - -/* - M_MXFAST is the maximum request size used for "fastbins", special bins - that hold returned chunks without consolidating their spaces. This - enables future requests for chunks of the same size to be handled - very quickly, but can increase fragmentation, and thus increase the - overall memory footprint of a program. - - This malloc manages fastbins very conservatively yet still - efficiently, so fragmentation is rarely a problem for values less - than or equal to the default. The maximum supported value of MXFAST - is 80. You wouldn't want it any higher than this anyway. Fastbins - are designed especially for use with many small structs, objects or - strings -- the default handles structs/objects/arrays with sizes up - to 16 4byte fields, or small strings representing words, tokens, - etc. Using fastbins for larger objects normally worsens - fragmentation without improving speed. - - M_MXFAST is set in REQUEST size units. It is internally used in - chunksize units, which adds padding and alignment. You can reduce - M_MXFAST to 0 to disable all use of fastbins. This causes the malloc - algorithm to be a closer approximation of fifo-best-fit in all cases, - not just for larger requests, but will generally cause it to be - slower. -*/ - - -/* M_MXFAST is a standard SVID/XPG tuning option, usually listed in malloc.h */ -#ifndef M_MXFAST -#define M_MXFAST 1 -#endif - -#ifndef DEFAULT_MXFAST -#define DEFAULT_MXFAST 64 -#endif - - -/* - M_TRIM_THRESHOLD is the maximum amount of unused top-most memory - to keep before releasing via malloc_trim in free(). - - Automatic trimming is mainly useful in long-lived programs. - Because trimming via sbrk can be slow on some systems, and can - sometimes be wasteful (in cases where programs immediately - afterward allocate more large chunks) the value should be high - enough so that your overall system performance would improve by - releasing this much memory. - - The trim threshold and the mmap control parameters (see below) - can be traded off with one another. Trimming and mmapping are - two different ways of releasing unused memory back to the - system. Between these two, it is often possible to keep - system-level demands of a long-lived program down to a bare - minimum. For example, in one test suite of sessions measuring - the XF86 X server on Linux, using a trim threshold of 128K and a - mmap threshold of 192K led to near-minimal long term resource - consumption. - - If you are using this malloc in a long-lived program, it should - pay to experiment with these values. As a rough guide, you - might set to a value close to the average size of a process - (program) running on your system. Releasing this much memory - would allow such a process to run in memory. 
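[Concretely, such tuning is done at run time through mallopt; a
hypothetical long-lived program adopting the 128K/192K figures from the
X server measurement above might do:

#include <malloc.h>  /* M_TRIM_THRESHOLD, M_MMAP_THRESHOLD, mallopt() */

/* Hypothetical start-up tuning for a long-lived program. */
static int tune_for_long_life(void)
{
    int ok = 1;
    ok &= mallopt(M_TRIM_THRESHOLD, 128 * 1024);  /* trim above 128K */
    ok &= mallopt(M_MMAP_THRESHOLD, 192 * 1024);  /* mmap above 192K */
    return ok;   /* mallopt returns 1 on success, 0 on failure */
}

int main(void)
{
    return tune_for_long_life() ? 0 : 1;
}]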
Generally, it's - worth it to tune for trimming rather tham memory mapping when a - program undergoes phases where several large chunks are - allocated and released in ways that can reuse each other's - storage, perhaps mixed with phases where there are no such - chunks at all. And in well-behaved long-lived programs, - controlling release of large blocks via trimming versus mapping - is usually faster. - - However, in most programs, these parameters serve mainly as - protection against the system-level effects of carrying around - massive amounts of unneeded memory. Since frequent calls to - sbrk, mmap, and munmap otherwise degrade performance, the default - parameters are set to relatively high values that serve only as - safeguards. - - The trim value must be greater than page size to have any useful - effect. To disable trimming completely, you can set to - (unsigned long)(-1) - - Trim settings interact with fastbin (MXFAST) settings: Unless - TRIM_FASTBINS is defined, automatic trimming never takes place upon - freeing a chunk with size less than or equal to MXFAST. Trimming is - instead delayed until subsequent freeing of larger chunks. However, - you can still force an attempted trim by calling malloc_trim. - - Also, trimming is not generally possible in cases where - the main arena is obtained via mmap. - - Note that the trick some people use of mallocing a huge space and - then freeing it at program startup, in an attempt to reserve system - memory, doesn't have the intended effect under automatic trimming, - since that memory will immediately be returned to the system. -*/ - -#define M_TRIM_THRESHOLD -1 - -#ifndef DEFAULT_TRIM_THRESHOLD -#define DEFAULT_TRIM_THRESHOLD (256 * 1024) -#endif - -/* - M_TOP_PAD is the amount of extra `padding' space to allocate or - retain whenever sbrk is called. It is used in two ways internally: - - * When sbrk is called to extend the top of the arena to satisfy - a new malloc request, this much padding is added to the sbrk - request. - - * When malloc_trim is called automatically from free(), - it is used as the `pad' argument. - - In both cases, the actual amount of padding is rounded - so that the end of the arena is always a system page boundary. - - The main reason for using padding is to avoid calling sbrk so - often. Having even a small pad greatly reduces the likelihood - that nearly every malloc request during program start-up (or - after trimming) will invoke sbrk, which needlessly wastes - time. - - Automatic rounding-up to page-size units is normally sufficient - to avoid measurable overhead, so the default is 0. However, in - systems where sbrk is relatively slow, it can pay to increase - this value, at the expense of carrying around more memory than - the program needs. -*/ - -#define M_TOP_PAD -2 - -#ifndef DEFAULT_TOP_PAD -#define DEFAULT_TOP_PAD (0) -#endif - -/* - M_MMAP_THRESHOLD is the request size threshold for using mmap() - to service a request. Requests of at least this size that cannot - be allocated using already-existing space will be serviced via mmap. - (If enough normal freed space already exists it is used instead.) - - Using mmap segregates relatively large chunks of memory so that - they can be individually obtained and released from the host - system. A request serviced through mmap is never reused by any - other request (at least not directly; the system may just so - happen to remap successive requests to the same locations). - - Segregating space in this way has the benefits that: - - 1. 
Mmapped space can ALWAYS be individually released back - to the system, which helps keep the system level memory - demands of a long-lived program low. - 2. Mapped memory can never become `locked' between - other chunks, as can happen with normally allocated chunks, which - means that even trimming via malloc_trim would not release them. - 3. On some systems with "holes" in address spaces, mmap can obtain - memory that sbrk cannot. - - However, it has the disadvantages that: - - 1. The space cannot be reclaimed, consolidated, and then - used to service later requests, as happens with normal chunks. - 2. It can lead to more wastage because of mmap page alignment - requirements - 3. It causes malloc performance to be more dependent on host - system memory management support routines which may vary in - implementation quality and may impose arbitrary - limitations. Generally, servicing a request via normal - malloc steps is faster than going through a system's mmap. - - The advantages of mmap nearly always outweigh disadvantages for - "large" chunks, but the value of "large" varies across systems. The - default is an empirically derived value that works well in most - systems. -*/ - -#define M_MMAP_THRESHOLD -3 - -#ifndef DEFAULT_MMAP_THRESHOLD -#define DEFAULT_MMAP_THRESHOLD (256 * 1024) -#endif - -/* - M_MMAP_MAX is the maximum number of requests to simultaneously - service using mmap. This parameter exists because -. Some systems have a limited number of internal tables for - use by mmap, and using more than a few of them may degrade - performance. - - The default is set to a value that serves only as a safeguard. - Setting to 0 disables use of mmap for servicing large requests. If - HAVE_MMAP is not set, the default value is 0, and attempts to set it - to non-zero values in mallopt will fail. -*/ - -#define M_MMAP_MAX -4 - -#ifndef DEFAULT_MMAP_MAX -#if HAVE_MMAP -#define DEFAULT_MMAP_MAX (65536) -#else -#define DEFAULT_MMAP_MAX (0) -#endif -#endif - -#ifdef __cplusplus -}; /* end of extern "C" */ -#endif - - -/* RN XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX */ -#endif - -/* - ======================================================================== - To make a fully customizable malloc.h header file, cut everything - above this line, put into file malloc.h, edit to suit, and #include it - on the next line, as well as in programs that use this malloc. 
- ======================================================================== -*/ - -/* #include "malloc.h" */ - -/* --------------------- public wrappers ---------------------- */ - -#ifdef USE_PUBLIC_MALLOC_WRAPPERS - -/* Declare all routines as internal */ -#if __STD_C -static Void_t* mALLOc(size_t); -static void fREe(Void_t*); -static Void_t* rEALLOc(Void_t*, size_t); -static Void_t* mEMALIGn(size_t, size_t); -static Void_t* vALLOc(size_t); -static Void_t* pVALLOc(size_t); -static Void_t* cALLOc(size_t, size_t); -static Void_t** iCALLOc(size_t, size_t, Void_t**); -static Void_t** iCOMALLOc(size_t, size_t*, Void_t**); -static void cFREe(Void_t*); -static int mTRIm(size_t); -static size_t mUSABLe(Void_t*); -static void mSTATs(); -static int mALLOPt(int, int); -static struct mallinfo mALLINFo(void); -#else -static Void_t* mALLOc(); -static void fREe(); -static Void_t* rEALLOc(); -static Void_t* mEMALIGn(); -static Void_t* vALLOc(); -static Void_t* pVALLOc(); -static Void_t* cALLOc(); -static Void_t** iCALLOc(); -static Void_t** iCOMALLOc(); -static void cFREe(); -static int mTRIm(); -static size_t mUSABLe(); -static void mSTATs(); -static int mALLOPt(); -static struct mallinfo mALLINFo(); -#endif - -/* - MALLOC_PREACTION and MALLOC_POSTACTION should be - defined to return 0 on success, and nonzero on failure. - The return value of MALLOC_POSTACTION is currently ignored - in wrapper functions since there is no reasonable default - action to take on failure. -*/ - - -#ifdef USE_MALLOC_LOCK - -#ifdef WIN32 - -static int mALLOC_MUTEx; -#define MALLOC_PREACTION slwait(&mALLOC_MUTEx) -#define MALLOC_POSTACTION slrelease(&mALLOC_MUTEx) - -#else - -#include <pthread.h> - -static pthread_mutex_t mALLOC_MUTEx = PTHREAD_MUTEX_INITIALIZER; - -#define MALLOC_PREACTION pthread_mutex_lock(&mALLOC_MUTEx) -#define MALLOC_POSTACTION pthread_mutex_unlock(&mALLOC_MUTEx) - -#endif /* USE_MALLOC_LOCK */ - -#else - -/* Substitute anything you like for these */ - -#define MALLOC_PREACTION (0) -#define MALLOC_POSTACTION (0) - -#endif - -Void_t* public_mALLOc(size_t bytes) { - Void_t* m; - if (MALLOC_PREACTION != 0) { - return 0; - } - m = mALLOc(bytes); - if (MALLOC_POSTACTION != 0) { - } - return m; -} - -void public_fREe(Void_t* m) { - if (MALLOC_PREACTION != 0) { - return; - } - fREe(m); - if (MALLOC_POSTACTION != 0) { - } -} - -Void_t* public_rEALLOc(Void_t* m, size_t bytes) { - if (MALLOC_PREACTION != 0) { - return 0; - } - m = rEALLOc(m, bytes); - if (MALLOC_POSTACTION != 0) { - } - return m; -} - -Void_t* public_mEMALIGn(size_t alignment, size_t bytes) { - Void_t* m; - if (MALLOC_PREACTION != 0) { - return 0; - } - m = mEMALIGn(alignment, bytes); - if (MALLOC_POSTACTION != 0) { - } - return m; -} - -Void_t* public_vALLOc(size_t bytes) { - Void_t* m; - if (MALLOC_PREACTION != 0) { - return 0; - } - m = vALLOc(bytes); - if (MALLOC_POSTACTION != 0) { - } - return m; -} - -Void_t* public_pVALLOc(size_t bytes) { - Void_t* m; - if (MALLOC_PREACTION != 0) { - return 0; - } - m = pVALLOc(bytes); - if (MALLOC_POSTACTION != 0) { - } - return m; -} - -Void_t* public_cALLOc(size_t n, size_t elem_size) { - Void_t* m; - if (MALLOC_PREACTION != 0) { - return 0; - } - m = cALLOc(n, elem_size); - if (MALLOC_POSTACTION != 0) { - } - return m; -} - - -Void_t** public_iCALLOc(size_t n, size_t elem_size, Void_t** chunks) { - Void_t** m; - if (MALLOC_PREACTION != 0) { - return 0; - } - m = iCALLOc(n, elem_size, chunks); - if (MALLOC_POSTACTION != 0) { - } - return m; -} - -Void_t** public_iCOMALLOc(size_t n, size_t sizes[], 
Void_t** chunks) { - Void_t** m; - if (MALLOC_PREACTION != 0) { - return 0; - } - m = iCOMALLOc(n, sizes, chunks); - if (MALLOC_POSTACTION != 0) { - } - return m; -} - -void public_cFREe(Void_t* m) { - if (MALLOC_PREACTION != 0) { - return; - } - cFREe(m); - if (MALLOC_POSTACTION != 0) { - } -} - -int public_mTRIm(size_t s) { - int result; - if (MALLOC_PREACTION != 0) { - return 0; - } - result = mTRIm(s); - if (MALLOC_POSTACTION != 0) { - } - return result; -} - -size_t public_mUSABLe(Void_t* m) { - size_t result; - if (MALLOC_PREACTION != 0) { - return 0; - } - result = mUSABLe(m); - if (MALLOC_POSTACTION != 0) { - } - return result; -} - -void public_mSTATs() { - if (MALLOC_PREACTION != 0) { - return; - } - mSTATs(); - if (MALLOC_POSTACTION != 0) { - } -} - -struct mallinfo public_mALLINFo() { - struct mallinfo m; - if (MALLOC_PREACTION != 0) { - struct mallinfo nm = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - return nm; - } - m = mALLINFo(); - if (MALLOC_POSTACTION != 0) { - } - return m; -} - -int public_mALLOPt(int p, int v) { - int result; - if (MALLOC_PREACTION != 0) { - return 0; - } - result = mALLOPt(p, v); - if (MALLOC_POSTACTION != 0) { - } - return result; -} - -#endif - - - -/* ------------- Optional versions of memcopy ---------------- */ - - -#if USE_MEMCPY - -/* - Note: memcpy is ONLY invoked with non-overlapping regions, - so the (usually slower) memmove is not needed. -*/ - -#define MALLOC_COPY(dest, src, nbytes) memcpy(dest, src, nbytes) -#define MALLOC_ZERO(dest, nbytes) memset(dest, 0, nbytes) - -#else /* !USE_MEMCPY */ - -/* Use Duff's device for good zeroing/copying performance. */ - -#define MALLOC_ZERO(charp, nbytes) \ -do { \ - INTERNAL_SIZE_T* mzp = (INTERNAL_SIZE_T*)(charp); \ - CHUNK_SIZE_T mctmp = (nbytes)/sizeof(INTERNAL_SIZE_T); \ - long mcn; \ - if (mctmp < 8) mcn = 0; else { mcn = (mctmp-1)/8; mctmp %= 8; } \ - switch (mctmp) { \ - case 0: for(;;) { *mzp++ = 0; \ - case 7: *mzp++ = 0; \ - case 6: *mzp++ = 0; \ - case 5: *mzp++ = 0; \ - case 4: *mzp++ = 0; \ - case 3: *mzp++ = 0; \ - case 2: *mzp++ = 0; \ - case 1: *mzp++ = 0; if(mcn <= 0) break; mcn--; } \ - } \ -} while(0) - -#define MALLOC_COPY(dest,src,nbytes) \ -do { \ - INTERNAL_SIZE_T* mcsrc = (INTERNAL_SIZE_T*) src; \ - INTERNAL_SIZE_T* mcdst = (INTERNAL_SIZE_T*) dest; \ - CHUNK_SIZE_T mctmp = (nbytes)/sizeof(INTERNAL_SIZE_T); \ - long mcn; \ - if (mctmp < 8) mcn = 0; else { mcn = (mctmp-1)/8; mctmp %= 8; } \ - switch (mctmp) { \ - case 0: for(;;) { *mcdst++ = *mcsrc++; \ - case 7: *mcdst++ = *mcsrc++; \ - case 6: *mcdst++ = *mcsrc++; \ - case 5: *mcdst++ = *mcsrc++; \ - case 4: *mcdst++ = *mcsrc++; \ - case 3: *mcdst++ = *mcsrc++; \ - case 2: *mcdst++ = *mcsrc++; \ - case 1: *mcdst++ = *mcsrc++; if(mcn <= 0) break; mcn--; } \ - } \ -} while(0) - -#endif - -/* ------------------ MMAP support ------------------ */ - - -#if HAVE_MMAP - -#ifndef LACKS_FCNTL_H -#include <fcntl.h> -#endif - -#ifndef LACKS_SYS_MMAN_H -#include <sys/mman.h> -#endif - -#if !defined(MAP_ANONYMOUS) && defined(MAP_ANON) -#define MAP_ANONYMOUS MAP_ANON -#endif - -/* - Nearly all versions of mmap support MAP_ANONYMOUS, - so the following is unlikely to be needed, but is - supplied just in case. -*/ - -#ifndef MAP_ANONYMOUS - -static int dev_zero_fd = -1; /* Cached file descriptor for /dev/zero. */ - -#define MMAP(addr, size, prot, flags) ((dev_zero_fd < 0) ? 
\ - (dev_zero_fd = open("/dev/zero", O_RDWR), \ - mmap((addr), (size), (prot), (flags), dev_zero_fd, 0)) : \ - mmap((addr), (size), (prot), (flags), dev_zero_fd, 0)) - -#else - -#define MMAP(addr, size, prot, flags) \ - (mmap((addr), (size), (prot), (flags)|MAP_ANONYMOUS, -1, 0)) - -#endif - - -#endif /* HAVE_MMAP */ - - -/* - ----------------------- Chunk representations ----------------------- -*/ - - -/* - This struct declaration is misleading (but accurate and necessary). - It declares a "view" into memory allowing access to necessary - fields at known offsets from a given base. See explanation below. -*/ - -struct malloc_chunk { - - INTERNAL_SIZE_T prev_size; /* Size of previous chunk (if free). */ - INTERNAL_SIZE_T size; /* Size in bytes, including overhead. */ - - struct malloc_chunk* fd; /* double links -- used only if free. */ - struct malloc_chunk* bk; -}; - - -typedef struct malloc_chunk* mchunkptr; - -/* - malloc_chunk details: - - (The following includes lightly edited explanations by Colin Plumb.) - - Chunks of memory are maintained using a `boundary tag' method as - described in e.g., Knuth or Standish. (See the paper by Paul - Wilson ftp://ftp.cs.utexas.edu/pub/garbage/allocsrv.ps for a - survey of such techniques.) Sizes of free chunks are stored both - in the front of each chunk and at the end. This makes - consolidating fragmented chunks into bigger chunks very fast. The - size fields also hold bits representing whether chunks are free or - in use. - - An allocated chunk looks like this: - - - chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Size of previous chunk, if allocated | | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Size of chunk, in bytes |P| - mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | User data starts here... . - . . - . (malloc_usable_space() bytes) . - . | -nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Size of chunk | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - - - Where "chunk" is the front of the chunk for the purpose of most of - the malloc code, but "mem" is the pointer that is returned to the - user. "Nextchunk" is the beginning of the next contiguous chunk. - - Chunks always begin on even word boundries, so the mem portion - (which is returned to the user) is also on an even word boundary, and - thus at least double-word aligned. - - Free chunks are stored in circular doubly-linked lists, and look like this: - - chunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Size of previous chunk | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - `head:' | Size of chunk, in bytes |P| - mem-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Forward pointer to next chunk in list | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Back pointer to previous chunk in list | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Unused space (may be 0 bytes long) . - . . - . | -nextchunk-> +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - `foot:' | Size of chunk, in bytes | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - - The P (PREV_INUSE) bit, stored in the unused low-order bit of the - chunk size (which is always a multiple of two words), is an in-use - bit for the *previous* chunk. 
If that bit is *clear*, then the - word before the current chunk size contains the previous chunk - size, and can be used to find the front of the previous chunk. - The very first chunk allocated always has this bit set, - preventing access to non-existent (or non-owned) memory. If - prev_inuse is set for any given chunk, then you CANNOT determine - the size of the previous chunk, and might even get a memory - addressing fault when trying to do so. - - Note that the `foot' of the current chunk is actually represented - as the prev_size of the NEXT chunk. This makes it easier to - deal with alignments etc but can be very confusing when trying - to extend or adapt this code. - - The two exceptions to all this are - - 1. The special chunk `top' doesn't bother using the - trailing size field since there is no next contiguous chunk - that would have to index off it. After initialization, `top' - is forced to always exist. If it would become less than - MINSIZE bytes long, it is replenished. - - 2. Chunks allocated via mmap, which have the second-lowest-order - bit (IS_MMAPPED) set in their size fields. Because they are - allocated one-by-one, each must contain its own trailing size field. - -*/ - -/* - ---------- Size and alignment checks and conversions ---------- -*/ - -/* conversion from malloc headers to user pointers, and back */ - -#define chunk2mem(p) ((Void_t*)((char*)(p) + 2*SIZE_SZ)) -#define mem2chunk(mem) ((mchunkptr)((char*)(mem) - 2*SIZE_SZ)) - -/* The smallest possible chunk */ -#define MIN_CHUNK_SIZE (sizeof(struct malloc_chunk)) - -/* The smallest size we can malloc is an aligned minimal chunk */ - -#define MINSIZE \ - (CHUNK_SIZE_T)(((MIN_CHUNK_SIZE+MALLOC_ALIGN_MASK) & ~MALLOC_ALIGN_MASK)) - -/* Check if m has acceptable alignment */ - -#define aligned_OK(m) (((PTR_UINT)((m)) & (MALLOC_ALIGN_MASK)) == 0) - - -/* - Check if a request is so large that it would wrap around zero when - padded and aligned. To simplify some other code, the bound is made - low enough so that adding MINSIZE will also not wrap around sero. -*/ - -#define REQUEST_OUT_OF_RANGE(req) \ - ((CHUNK_SIZE_T)(req) >= \ - (CHUNK_SIZE_T)(INTERNAL_SIZE_T)(-2 * MINSIZE)) - -/* pad request bytes into a usable size -- internal version */ - -#define request2size(req) \ - (((req) + SIZE_SZ + MALLOC_ALIGN_MASK < MINSIZE) ? \ - MINSIZE : \ - ((req) + SIZE_SZ + MALLOC_ALIGN_MASK) & ~MALLOC_ALIGN_MASK) - -/* Same, except also perform argument check */ - -#define checked_request2size(req, sz) \ - if (REQUEST_OUT_OF_RANGE(req)) { \ - MALLOC_FAILURE_ACTION; \ - return 0; \ - } \ - (sz) = request2size(req); - -/* - --------------- Physical chunk operations --------------- -*/ - - -/* size field is or'ed with PREV_INUSE when previous adjacent chunk in use */ -#define PREV_INUSE 0x1 - -/* extract inuse bit of previous chunk */ -#define prev_inuse(p) ((p)->size & PREV_INUSE) - - -/* size field is or'ed with IS_MMAPPED if the chunk was obtained with mmap() */ -#define IS_MMAPPED 0x2 - -/* check for mmap()'ed chunk */ -#define chunk_is_mmapped(p) ((p)->size & IS_MMAPPED) - -/* - Bits to mask off when extracting size - - Note: IS_MMAPPED is intentionally not masked off from size field in - macros for which mmapped chunks should never be seen. This should - cause helpful core dumps to occur if it is tried by accident by - people extending or adapting this malloc. 
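To make the flag packing concrete, here is a minimal standalone sketch (illustrative only; the underscore-suffixed names are hypothetical stand-ins for the PREV_INUSE/IS_MMAPPED macros defined just below):

    #include <assert.h>
    #include <stddef.h>

    /* Both flags ride in the low bits of the size word, which is free
       because chunk sizes are always multiples of the alignment. */
    #define PREV_INUSE_ 0x1u
    #define IS_MMAPPED_ 0x2u
    #define SIZE_BITS_  (PREV_INUSE_ | IS_MMAPPED_)

    int main(void) {
        size_t field = 48u | PREV_INUSE_;            /* 48-byte chunk, prev in use */
        assert((field & ~(size_t)SIZE_BITS_) == 48); /* chunksize()        */
        assert(field & PREV_INUSE_);                 /* prev_inuse()       */
        assert(!(field & IS_MMAPPED_));              /* chunk_is_mmapped() */
        return 0;
    }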
-*/ -#define SIZE_BITS (PREV_INUSE|IS_MMAPPED) - -/* Get size, ignoring use bits */ -#define chunksize(p) ((p)->size & ~(SIZE_BITS)) - - -/* Ptr to next physical malloc_chunk. */ -#define next_chunk(p) ((mchunkptr)( ((char*)(p)) + ((p)->size & ~PREV_INUSE) )) - -/* Ptr to previous physical malloc_chunk */ -#define prev_chunk(p) ((mchunkptr)( ((char*)(p)) - ((p)->prev_size) )) - -/* Treat space at ptr + offset as a chunk */ -#define chunk_at_offset(p, s) ((mchunkptr)(((char*)(p)) + (s))) - -/* extract p's inuse bit */ -#define inuse(p)\ -((((mchunkptr)(((char*)(p))+((p)->size & ~PREV_INUSE)))->size) & PREV_INUSE) - -/* set/clear chunk as being inuse without otherwise disturbing */ -#define set_inuse(p)\ -((mchunkptr)(((char*)(p)) + ((p)->size & ~PREV_INUSE)))->size |= PREV_INUSE - -#define clear_inuse(p)\ -((mchunkptr)(((char*)(p)) + ((p)->size & ~PREV_INUSE)))->size &= ~(PREV_INUSE) - - -/* check/set/clear inuse bits in known places */ -#define inuse_bit_at_offset(p, s)\ - (((mchunkptr)(((char*)(p)) + (s)))->size & PREV_INUSE) - -#define set_inuse_bit_at_offset(p, s)\ - (((mchunkptr)(((char*)(p)) + (s)))->size |= PREV_INUSE) - -#define clear_inuse_bit_at_offset(p, s)\ - (((mchunkptr)(((char*)(p)) + (s)))->size &= ~(PREV_INUSE)) - - -/* Set size at head, without disturbing its use bit */ -#define set_head_size(p, s) ((p)->size = (((p)->size & PREV_INUSE) | (s))) - -/* Set size/use field */ -#define set_head(p, s) ((p)->size = (s)) - -/* Set size at footer (only when chunk is not in use) */ -#define set_foot(p, s) (((mchunkptr)((char*)(p) + (s)))->prev_size = (s)) - - -/* - -------------------- Internal data structures -------------------- - - All internal state is held in an instance of malloc_state defined - below. There are no other static variables, except in two optional - cases: - * If USE_MALLOC_LOCK is defined, the mALLOC_MUTEx declared above. - * If HAVE_MMAP is true, but mmap doesn't support - MAP_ANONYMOUS, a dummy file descriptor for mmap. - - Beware of lots of tricks that minimize the total bookkeeping space - requirements. The result is a little over 1K bytes (for 4byte - pointers and size_t.) -*/ - -/* - Bins - - An array of bin headers for free chunks. Each bin is doubly - linked. The bins are approximately proportionally (log) spaced. - There are a lot of these bins (128). This may look excessive, but - works very well in practice. Most bins hold sizes that are - unusual as malloc request sizes, but are more usual for fragments - and consolidated sets of chunks, which is what these bins hold, so - they can be found quickly. All procedures maintain the invariant - that no consolidated chunk physically borders another one, so each - chunk in a list is known to be preceeded and followed by either - inuse chunks or the ends of memory. - - Chunks in bins are kept in size order, with ties going to the - approximately least recently used chunk. Ordering isn't needed - for the small bins, which all contain the same-sized chunks, but - facilitates best-fit allocation for larger chunks. These lists - are just sequential. Keeping them in order almost never requires - enough traversal to warrant using fancier ordered data - structures. - - Chunks of the same size are linked with the most - recently freed at the front, and allocations are taken from the - back. This results in LRU (FIFO) allocation order, which tends - to give each chunk an equal opportunity to be consolidated with - adjacent freed chunks, resulting in larger free chunks and less - fragmentation. 
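The physical chunk operations above reduce to plain pointer arithmetic on the size and prev_size words. A standalone sketch of the forward and backward walk, assuming flag bits are ignored and using a toy buffer in place of the heap:

    #include <assert.h>
    #include <stddef.h>

    /* Two adjacent toy "chunks" carved from one aligned buffer; each
       begins with the {prev_size, size} pair boundary tags rely on. */
    typedef struct { size_t prev_size; size_t size; } hdr;

    int main(void) {
        static size_t arena[16];             /* zeroed, suitably aligned */
        hdr *a = (hdr *)arena;
        a->size = 32;                        /* chunk A occupies 32 bytes */
        hdr *b = (hdr *)((char *)arena + 32);/* next_chunk(a)             */
        b->prev_size = 32;                   /* A's footer, stored in B   */
        b->size = 48;
        assert((hdr *)((char *)b - b->prev_size) == a); /* prev_chunk(b) */
        return 0;
    }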
- - To simplify use in double-linked lists, each bin header acts - as a malloc_chunk. This avoids special-casing for headers. - But to conserve space and improve locality, we allocate - only the fd/bk pointers of bins, and then use repositioning tricks - to treat these as the fields of a malloc_chunk*. -*/ - -typedef struct malloc_chunk* mbinptr; - -/* addressing -- note that bin_at(0) does not exist */ -#define bin_at(m, i) ((mbinptr)((char*)&((m)->bins[(i)<<1]) - (SIZE_SZ<<1))) - -/* analog of ++bin */ -#define next_bin(b) ((mbinptr)((char*)(b) + (sizeof(mchunkptr)<<1))) - -/* Reminders about list directionality within bins */ -#define first(b) ((b)->fd) -#define last(b) ((b)->bk) - -/* Take a chunk off a bin list */ -#define unlink(P, BK, FD) { \ - FD = P->fd; \ - BK = P->bk; \ - FD->bk = BK; \ - BK->fd = FD; \ -} - -/* - Indexing - - Bins for sizes < 512 bytes contain chunks of all the same size, spaced - 8 bytes apart. Larger bins are approximately logarithmically spaced: - - 64 bins of size 8 - 32 bins of size 64 - 16 bins of size 512 - 8 bins of size 4096 - 4 bins of size 32768 - 2 bins of size 262144 - 1 bin of size what's left - - The bins top out around 1MB because we expect to service large - requests via mmap. -*/ - -#define NBINS 96 -#define NSMALLBINS 32 -#define SMALLBIN_WIDTH 8 -#define MIN_LARGE_SIZE 256 - -#define in_smallbin_range(sz) \ - ((CHUNK_SIZE_T)(sz) < (CHUNK_SIZE_T)MIN_LARGE_SIZE) - -#define smallbin_index(sz) (((unsigned)(sz)) >> 3) - -/* - Compute index for size. We expect this to be inlined when - compiled with optimization, else not, which works out well. -*/ -static int largebin_index(unsigned int sz) { - unsigned int x = sz >> SMALLBIN_WIDTH; - unsigned int m; /* bit position of highest set bit of m */ - - if (x >= 0x10000) return NBINS-1; - - /* On intel, use BSRL instruction to find highest bit */ -#if defined(__GNUC__) && defined(i386) - - __asm__("bsrl %1,%0\n\t" - : "=r" (m) - : "g" (x)); - -#else - { - /* - Based on branch-free nlz algorithm in chapter 5 of Henry - S. Warren Jr's book "Hacker's Delight". - */ - - unsigned int n = ((x - 0x100) >> 16) & 8; - x <<= n; - m = ((x - 0x1000) >> 16) & 4; - n += m; - x <<= m; - m = ((x - 0x4000) >> 16) & 2; - n += m; - x = (x << m) >> 14; - m = 13 - n + (x & ~(x>>1)); - } -#endif - - /* Use next 2 bits to create finer-granularity bins */ - return NSMALLBINS + (m << 2) + ((sz >> (m + 6)) & 3); -} - -#define bin_index(sz) \ - ((in_smallbin_range(sz)) ? smallbin_index(sz) : largebin_index(sz)) - -/* - FIRST_SORTED_BIN_SIZE is the chunk size corresponding to the - first bin that is maintained in sorted order. This must - be the smallest size corresponding to a given bin. - - Normally, this should be MIN_LARGE_SIZE. But you can weaken - best fit guarantees to sometimes speed up malloc by increasing value. - Doing this means that malloc may choose a chunk that is - non-best-fitting by up to the width of the bin. - - Some useful cutoff values: - 512 - all bins sorted - 2560 - leaves bins <= 64 bytes wide unsorted - 12288 - leaves bins <= 512 bytes wide unsorted - 65536 - leaves bins <= 4096 bytes wide unsorted - 262144 - leaves bins <= 32768 bytes wide unsorted - -1 - no bins sorted (not recommended!) -*/ - -#define FIRST_SORTED_BIN_SIZE MIN_LARGE_SIZE -/* #define FIRST_SORTED_BIN_SIZE 65536 */ - -/* - Unsorted chunks - - All remainders from chunk splits, as well as all returned chunks, - are first placed in the "unsorted" bin. 
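The branch-free step in largebin_index is easy to test in isolation. A standalone sketch (hypothetical name high_bit; the allocator first shifts the size right by SMALLBIN_WIDTH, so the 16-bit input range matches the NBINS-1 cutoff), checked against a naive floor(log2) loop:

    #include <assert.h>

    static int high_bit(unsigned int x) {    /* valid for 1 <= x <= 0xFFFF */
        unsigned int m, n = ((x - 0x100) >> 16) & 8;
        x <<= n;
        m = ((x - 0x1000) >> 16) & 4;  n += m;  x <<= m;
        m = ((x - 0x4000) >> 16) & 2;  n += m;  x = (x << m) >> 14;
        return 13 - n + (x & ~(x >> 1));
    }

    int main(void) {
        unsigned int v, t;
        int ref;
        for (v = 1; v <= 0xFFFF; ++v) {
            for (ref = 0, t = v; t >>= 1; ) ++ref;   /* floor(log2(v)) */
            assert(high_bit(v) == ref);
        }
        return 0;
    }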
They are then placed - in regular bins after malloc gives them ONE chance to be used before - binning. So, basically, the unsorted_chunks list acts as a queue, - with chunks being placed on it in free (and malloc_consolidate), - and taken off (to be either used or placed in bins) in malloc. -*/ - -/* The otherwise unindexable 1-bin is used to hold unsorted chunks. */ -#define unsorted_chunks(M) (bin_at(M, 1)) - -/* - Top - - The top-most available chunk (i.e., the one bordering the end of - available memory) is treated specially. It is never included in - any bin, is used only if no other chunk is available, and is - released back to the system if it is very large (see - M_TRIM_THRESHOLD). Because top initially - points to its own bin with initial zero size, thus forcing - extension on the first malloc request, we avoid having any special - code in malloc to check whether it even exists yet. But we still - need to do so when getting memory from system, so we make - initial_top treat the bin as a legal but unusable chunk during the - interval between initialization and the first call to - sYSMALLOc. (This is somewhat delicate, since it relies on - the 2 preceding words to be zero during this interval as well.) -*/ - -/* Conveniently, the unsorted bin can be used as dummy top on first call */ -#define initial_top(M) (unsorted_chunks(M)) - -/* - Binmap - - To help compensate for the large number of bins, a one-level index - structure is used for bin-by-bin searching. `binmap' is a - bitvector recording whether bins are definitely empty so they can - be skipped over during during traversals. The bits are NOT always - cleared as soon as bins are empty, but instead only - when they are noticed to be empty during traversal in malloc. -*/ - -/* Conservatively use 32 bits per map word, even if on 64bit system */ -#define BINMAPSHIFT 5 -#define BITSPERMAP (1U << BINMAPSHIFT) -#define BINMAPSIZE (NBINS / BITSPERMAP) - -#define idx2block(i) ((i) >> BINMAPSHIFT) -#define idx2bit(i) ((1U << ((i) & ((1U << BINMAPSHIFT)-1)))) - -#define mark_bin(m,i) ((m)->binmap[idx2block(i)] |= idx2bit(i)) -#define unmark_bin(m,i) ((m)->binmap[idx2block(i)] &= ~(idx2bit(i))) -#define get_binmap(m,i) ((m)->binmap[idx2block(i)] & idx2bit(i)) - -/* - Fastbins - - An array of lists holding recently freed small chunks. Fastbins - are not doubly linked. It is faster to single-link them, and - since chunks are never removed from the middles of these lists, - double linking is not necessary. Also, unlike regular bins, they - are not even processed in FIFO order (they use faster LIFO) since - ordering doesn't much matter in the transient contexts in which - fastbins are normally used. - - Chunks in fastbins keep their inuse bit set, so they cannot - be consolidated with other free chunks. malloc_consolidate - releases all chunks in fastbins and consolidates them with - other free chunks. -*/ - -typedef struct malloc_chunk* mfastbinptr; - -/* offset 2 to use otherwise unindexable first 2 bins */ -#define fastbin_index(sz) ((((unsigned int)(sz)) >> 3) - 2) - -/* The maximum fastbin request size we support */ -#define MAX_FAST_SIZE 80 - -#define NFASTBINS (fastbin_index(request2size(MAX_FAST_SIZE))+1) - -/* - FASTBIN_CONSOLIDATION_THRESHOLD is the size of a chunk in free() - that triggers automatic consolidation of possibly-surrounding - fastbin chunks. This is a heuristic, so the exact value should not - matter too much. 
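The binmap bookkeeping described above is ordinary bit-array arithmetic. A standalone sketch using the constants from the text (96 bins, 32 bits per map word; underscore names are local stand-ins):

    #include <assert.h>

    #define BINMAPSHIFT_  5
    #define BINMAPSIZE_   (96 >> BINMAPSHIFT_)    /* 3 map words */
    #define idx2block_(i) ((i) >> BINMAPSHIFT_)
    #define idx2bit_(i)   (1U << ((i) & 31))

    int main(void) {
        unsigned int binmap[BINMAPSIZE_] = {0};
        int i = 70;                               /* some bin index */
        binmap[idx2block_(i)] |= idx2bit_(i);     /* mark_bin    */
        assert(binmap[2] & (1U << 6));            /* 70 = 2*32+6 */
        binmap[idx2block_(i)] &= ~idx2bit_(i);    /* unmark_bin  */
        assert(binmap[2] == 0);
        return 0;
    }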
It is defined at half the default trim threshold as a - compromise heuristic to only attempt consolidation if it is likely - to lead to trimming. However, it is not dynamically tunable, since - consolidation reduces fragmentation surrounding loarge chunks even - if trimming is not used. -*/ - -#define FASTBIN_CONSOLIDATION_THRESHOLD \ - ((unsigned long)(DEFAULT_TRIM_THRESHOLD) >> 1) - -/* - Since the lowest 2 bits in max_fast don't matter in size comparisons, - they are used as flags. -*/ - -/* - ANYCHUNKS_BIT held in max_fast indicates that there may be any - freed chunks at all. It is set true when entering a chunk into any - bin. -*/ - -#define ANYCHUNKS_BIT (1U) - -#define have_anychunks(M) (((M)->max_fast & ANYCHUNKS_BIT)) -#define set_anychunks(M) ((M)->max_fast |= ANYCHUNKS_BIT) -#define clear_anychunks(M) ((M)->max_fast &= ~ANYCHUNKS_BIT) - -/* - FASTCHUNKS_BIT held in max_fast indicates that there are probably - some fastbin chunks. It is set true on entering a chunk into any - fastbin, and cleared only in malloc_consolidate. -*/ - -#define FASTCHUNKS_BIT (2U) - -#define have_fastchunks(M) (((M)->max_fast & FASTCHUNKS_BIT)) -#define set_fastchunks(M) ((M)->max_fast |= (FASTCHUNKS_BIT|ANYCHUNKS_BIT)) -#define clear_fastchunks(M) ((M)->max_fast &= ~(FASTCHUNKS_BIT)) - -/* - Set value of max_fast. - Use impossibly small value if 0. -*/ - -#define set_max_fast(M, s) \ - (M)->max_fast = (((s) == 0)? SMALLBIN_WIDTH: request2size(s)) | \ - ((M)->max_fast & (FASTCHUNKS_BIT|ANYCHUNKS_BIT)) - -#define get_max_fast(M) \ - ((M)->max_fast & ~(FASTCHUNKS_BIT | ANYCHUNKS_BIT)) - - -/* - morecore_properties is a status word holding dynamically discovered - or controlled properties of the morecore function -*/ - -#define MORECORE_CONTIGUOUS_BIT (1U) - -#define contiguous(M) \ - (((M)->morecore_properties & MORECORE_CONTIGUOUS_BIT)) -#define noncontiguous(M) \ - (((M)->morecore_properties & MORECORE_CONTIGUOUS_BIT) == 0) -#define set_contiguous(M) \ - ((M)->morecore_properties |= MORECORE_CONTIGUOUS_BIT) -#define set_noncontiguous(M) \ - ((M)->morecore_properties &= ~MORECORE_CONTIGUOUS_BIT) - - -/* - ----------- Internal state representation and initialization ----------- -*/ - -struct malloc_state { - - /* The maximum chunk size to be eligible for fastbin */ - INTERNAL_SIZE_T max_fast; /* low 2 bits used as flags */ - - /* Fastbins */ - mfastbinptr fastbins[NFASTBINS]; - - /* Base of the topmost chunk -- not otherwise kept in a bin */ - mchunkptr top; - - /* The remainder from the most recent split of a small request */ - mchunkptr last_remainder; - - /* Normal bins packed as described above */ - mchunkptr bins[NBINS * 2]; - - /* Bitmap of bins. Trailing zero map handles cases of largest binned size */ - unsigned int binmap[BINMAPSIZE+1]; - - /* Tunable parameters */ - CHUNK_SIZE_T trim_threshold; - INTERNAL_SIZE_T top_pad; - INTERNAL_SIZE_T mmap_threshold; - - /* Memory map support */ - int n_mmaps; - int n_mmaps_max; - int max_n_mmaps; - - /* Cache malloc_getpagesize */ - unsigned int pagesize; - - /* Track properties of MORECORE */ - unsigned int morecore_properties; - - /* Statistics */ - INTERNAL_SIZE_T mmapped_mem; - INTERNAL_SIZE_T sbrked_mem; - INTERNAL_SIZE_T max_sbrked_mem; - INTERNAL_SIZE_T max_mmapped_mem; - INTERNAL_SIZE_T max_total_mem; -}; - -typedef struct malloc_state *mstate; - -/* - There is exactly one instance of this struct in this malloc. - If you are adapting this malloc in a way that does NOT use a static - malloc_state, you MUST explicitly zero-fill it before using. 
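As with the chunk size word, the two status bits simply ride in the low bits of max_fast, which is safe because real fastbin limits are multiples of 8. A standalone sketch, with a bare size_t standing in for the struct field:

    #include <assert.h>
    #include <stddef.h>

    #define ANYCHUNKS_BIT_  1u
    #define FASTCHUNKS_BIT_ 2u

    int main(void) {
        size_t max_fast = 64;                          /* limit, low bits clear */
        max_fast |= FASTCHUNKS_BIT_ | ANYCHUNKS_BIT_;  /* set_fastchunks()  */
        assert(max_fast & FASTCHUNKS_BIT_);            /* have_fastchunks() */
        /* get_max_fast(): strip flags before any size comparison */
        assert((max_fast & ~(size_t)(FASTCHUNKS_BIT_ | ANYCHUNKS_BIT_)) == 64);
        max_fast &= ~(size_t)FASTCHUNKS_BIT_;          /* clear_fastchunks() */
        assert(!(max_fast & FASTCHUNKS_BIT_));
        return 0;
    }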
This - malloc relies on the property that malloc_state is initialized to - all zeroes (as is true of C statics). -*/ - -static struct malloc_state av_; /* never directly referenced */ - -/* - All uses of av_ are via get_malloc_state(). - At most one "call" to get_malloc_state is made per invocation of - the public versions of malloc and free, but other routines - that in turn invoke malloc and/or free may call more then once. - Also, it is called in check* routines if DEBUG is set. -*/ - -#define get_malloc_state() (&(av_)) - -/* - Initialize a malloc_state struct. - - This is called only from within malloc_consolidate, which needs - be called in the same contexts anyway. It is never called directly - outside of malloc_consolidate because some optimizing compilers try - to inline it at all call points, which turns out not to be an - optimization at all. (Inlining it in malloc_consolidate is fine though.) -*/ - -#if __STD_C -static void malloc_init_state(mstate av) -#else -static void malloc_init_state(av) mstate av; -#endif -{ - int i; - mbinptr bin; - - /* Establish circular links for normal bins */ - for (i = 1; i < NBINS; ++i) { - bin = bin_at(av,i); - bin->fd = bin->bk = bin; - } - - av->top_pad = DEFAULT_TOP_PAD; - av->n_mmaps_max = DEFAULT_MMAP_MAX; - av->mmap_threshold = DEFAULT_MMAP_THRESHOLD; - av->trim_threshold = DEFAULT_TRIM_THRESHOLD; - -#if MORECORE_CONTIGUOUS - set_contiguous(av); -#else - set_noncontiguous(av); -#endif - - - set_max_fast(av, DEFAULT_MXFAST); - - av->top = initial_top(av); - av->pagesize = malloc_getpagesize; -} - -/* - Other internal utilities operating on mstates -*/ - -static Void_t* sYSMALLOc(INTERNAL_SIZE_T, mstate); -#ifndef MORECORE_CANNOT_TRIM -static int sYSTRIm(size_t, mstate); -#endif -static void malloc_consolidate(mstate); -static Void_t** iALLOc(size_t, size_t*, int, Void_t**); - -/* - Debugging support - - These routines make a number of assertions about the states - of data structures that should be true at all times. If any - are not true, it's very likely that a user program has somehow - trashed memory. (It's also possible that there is a coding error - in malloc. In which case, please report it!) -*/ - -#if ! DEBUG - -#define check_chunk(P) -#define check_free_chunk(P) -#define check_inuse_chunk(P) -#define check_remalloced_chunk(P,N) -#define check_malloced_chunk(P,N) -#define check_malloc_state() - -#else -#define check_chunk(P) do_check_chunk(P) -#define check_free_chunk(P) do_check_free_chunk(P) -#define check_inuse_chunk(P) do_check_inuse_chunk(P) -#define check_remalloced_chunk(P,N) do_check_remalloced_chunk(P,N) -#define check_malloced_chunk(P,N) do_check_malloced_chunk(P,N) -#define check_malloc_state() do_check_malloc_state() - -/* - Properties of all chunks -*/ - -#if __STD_C -static void do_check_chunk(mchunkptr p) -#else -static void do_check_chunk(p) mchunkptr p; -#endif -{ - mstate av = get_malloc_state(); - CHUNK_SIZE_T sz = chunksize(p); - /* min and max possible addresses assuming contiguous allocation */ - char* max_address = (char*)(av->top) + chunksize(av->top); - char* min_address = max_address - av->sbrked_mem; - - if (!chunk_is_mmapped(p)) { - - /* Has legal address ... 
*/ - if (p != av->top) { - if (contiguous(av)) { - assert(((char*)p) >= min_address); - assert(((char*)p + sz) <= ((char*)(av->top))); - } - } - else { - /* top size is always at least MINSIZE */ - assert((CHUNK_SIZE_T)(sz) >= MINSIZE); - /* top predecessor always marked inuse */ - assert(prev_inuse(p)); - } - - } - else { -#if HAVE_MMAP - /* address is outside main heap */ - if (contiguous(av) && av->top != initial_top(av)) { - assert(((char*)p) < min_address || ((char*)p) > max_address); - } - /* chunk is page-aligned */ - assert(((p->prev_size + sz) & (av->pagesize-1)) == 0); - /* mem is aligned */ - assert(aligned_OK(chunk2mem(p))); -#else - /* force an appropriate assert violation if debug set */ - assert(!chunk_is_mmapped(p)); -#endif - } -} - -/* - Properties of free chunks -*/ - -#if __STD_C -static void do_check_free_chunk(mchunkptr p) -#else -static void do_check_free_chunk(p) mchunkptr p; -#endif -{ - mstate av = get_malloc_state(); - - INTERNAL_SIZE_T sz = p->size & ~PREV_INUSE; - mchunkptr next = chunk_at_offset(p, sz); - - do_check_chunk(p); - - /* Chunk must claim to be free ... */ - assert(!inuse(p)); - assert (!chunk_is_mmapped(p)); - - /* Unless a special marker, must have OK fields */ - if ((CHUNK_SIZE_T)(sz) >= MINSIZE) - { - assert((sz & MALLOC_ALIGN_MASK) == 0); - assert(aligned_OK(chunk2mem(p))); - /* ... matching footer field */ - assert(next->prev_size == sz); - /* ... and is fully consolidated */ - assert(prev_inuse(p)); - assert (next == av->top || inuse(next)); - - /* ... and has minimally sane links */ - assert(p->fd->bk == p); - assert(p->bk->fd == p); - } - else /* markers are always of size SIZE_SZ */ - assert(sz == SIZE_SZ); -} - -/* - Properties of inuse chunks -*/ - -#if __STD_C -static void do_check_inuse_chunk(mchunkptr p) -#else -static void do_check_inuse_chunk(p) mchunkptr p; -#endif -{ - mstate av = get_malloc_state(); - mchunkptr next; - do_check_chunk(p); - - if (chunk_is_mmapped(p)) - return; /* mmapped chunks have no next/prev */ - - /* Check whether it claims to be in use ... */ - assert(inuse(p)); - - next = next_chunk(p); - - /* ... and is surrounded by OK chunks. - Since more things can be checked with free chunks than inuse ones, - if an inuse chunk borders them and debug is on, it's worth doing them. - */ - if (!prev_inuse(p)) { - /* Note that we cannot even look at prev unless it is not inuse */ - mchunkptr prv = prev_chunk(p); - assert(next_chunk(prv) == p); - do_check_free_chunk(prv); - } - - if (next == av->top) { - assert(prev_inuse(next)); - assert(chunksize(next) >= MINSIZE); - } - else if (!inuse(next)) - do_check_free_chunk(next); -} - -/* - Properties of chunks recycled from fastbins -*/ - -#if __STD_C -static void do_check_remalloced_chunk(mchunkptr p, INTERNAL_SIZE_T s) -#else -static void do_check_remalloced_chunk(p, s) mchunkptr p; INTERNAL_SIZE_T s; -#endif -{ - INTERNAL_SIZE_T sz = p->size & ~PREV_INUSE; - - do_check_inuse_chunk(p); - - /* Legal size ... */ - assert((sz & MALLOC_ALIGN_MASK) == 0); - assert((CHUNK_SIZE_T)(sz) >= MINSIZE); - /* ... and alignment */ - assert(aligned_OK(chunk2mem(p))); - /* chunk is less than MINSIZE more than request */ - assert((long)(sz) - (long)(s) >= 0); - assert((long)(sz) - (long)(s + MINSIZE) < 0); -} - -/* - Properties of nonrecycled chunks at the point they are malloced -*/ - -#if __STD_C -static void do_check_malloced_chunk(mchunkptr p, INTERNAL_SIZE_T s) -#else -static void do_check_malloced_chunk(p, s) mchunkptr p; INTERNAL_SIZE_T s; -#endif -{ - /* same as recycled case ... 
*/ - do_check_remalloced_chunk(p, s); - - /* - ... plus, must obey implementation invariant that prev_inuse is - always true of any allocated chunk; i.e., that each allocated - chunk borders either a previously allocated and still in-use - chunk, or the base of its memory arena. This is ensured - by making all allocations from the the `lowest' part of any found - chunk. This does not necessarily hold however for chunks - recycled via fastbins. - */ - - assert(prev_inuse(p)); -} - - -/* - Properties of malloc_state. - - This may be useful for debugging malloc, as well as detecting user - programmer errors that somehow write into malloc_state. - - If you are extending or experimenting with this malloc, you can - probably figure out how to hack this routine to print out or - display chunk addresses, sizes, bins, and other instrumentation. -*/ - -static void do_check_malloc_state() -{ - mstate av = get_malloc_state(); - int i; - mchunkptr p; - mchunkptr q; - mbinptr b; - unsigned int binbit; - int empty; - unsigned int idx; - INTERNAL_SIZE_T size; - CHUNK_SIZE_T total = 0; - int max_fast_bin; - - /* internal size_t must be no wider than pointer type */ - assert(sizeof(INTERNAL_SIZE_T) <= sizeof(char*)); - - /* alignment is a power of 2 */ - assert((MALLOC_ALIGNMENT & (MALLOC_ALIGNMENT-1)) == 0); - - /* cannot run remaining checks until fully initialized */ - if (av->top == 0 || av->top == initial_top(av)) - return; - - /* pagesize is a power of 2 */ - assert((av->pagesize & (av->pagesize-1)) == 0); - - /* properties of fastbins */ - - /* max_fast is in allowed range */ - assert(get_max_fast(av) <= request2size(MAX_FAST_SIZE)); - - max_fast_bin = fastbin_index(av->max_fast); - - for (i = 0; i < NFASTBINS; ++i) { - p = av->fastbins[i]; - - /* all bins past max_fast are empty */ - if (i > max_fast_bin) - assert(p == 0); - - while (p != 0) { - /* each chunk claims to be inuse */ - do_check_inuse_chunk(p); - total += chunksize(p); - /* chunk belongs in this bin */ - assert(fastbin_index(chunksize(p)) == i); - p = p->fd; - } - } - - if (total != 0) - assert(have_fastchunks(av)); - else if (!have_fastchunks(av)) - assert(total == 0); - - /* check normal bins */ - for (i = 1; i < NBINS; ++i) { - b = bin_at(av,i); - - /* binmap is accurate (except for bin 1 == unsorted_chunks) */ - if (i >= 2) { - binbit = get_binmap(av,i); - empty = last(b) == b; - if (!binbit) - assert(empty); - else if (!empty) - assert(binbit); - } - - for (p = last(b); p != b; p = p->bk) { - /* each chunk claims to be free */ - do_check_free_chunk(p); - size = chunksize(p); - total += size; - if (i >= 2) { - /* chunk belongs in bin */ - idx = bin_index(size); - assert(idx == i); - /* lists are sorted */ - if ((CHUNK_SIZE_T) size >= (CHUNK_SIZE_T)(FIRST_SORTED_BIN_SIZE)) { - assert(p->bk == b || - (CHUNK_SIZE_T)chunksize(p->bk) >= - (CHUNK_SIZE_T)chunksize(p)); - } - } - /* chunk is followed by a legal chain of inuse chunks */ - for (q = next_chunk(p); - (q != av->top && inuse(q) && - (CHUNK_SIZE_T)(chunksize(q)) >= MINSIZE); - q = next_chunk(q)) - do_check_inuse_chunk(q); - } - } - - /* top chunk is OK */ - check_chunk(av->top); - - /* sanity checks for statistics */ - - assert(total <= (CHUNK_SIZE_T)(av->max_total_mem)); - assert(av->n_mmaps >= 0); - assert(av->n_mmaps <= av->max_n_mmaps); - - assert((CHUNK_SIZE_T)(av->sbrked_mem) <= - (CHUNK_SIZE_T)(av->max_sbrked_mem)); - - assert((CHUNK_SIZE_T)(av->mmapped_mem) <= - (CHUNK_SIZE_T)(av->max_mmapped_mem)); - - assert((CHUNK_SIZE_T)(av->max_total_mem) >= - 
(CHUNK_SIZE_T)(av->mmapped_mem) + (CHUNK_SIZE_T)(av->sbrked_mem)); -} -#endif - - -/* ----------- Routines dealing with system allocation -------------- */ - -/* - sysmalloc handles malloc cases requiring more memory from the system. - On entry, it is assumed that av->top does not have enough - space to service request for nb bytes, thus requiring that av->top - be extended or replaced. -*/ - -#if __STD_C -static Void_t* sYSMALLOc(INTERNAL_SIZE_T nb, mstate av) -#else -static Void_t* sYSMALLOc(nb, av) INTERNAL_SIZE_T nb; mstate av; -#endif -{ - mchunkptr old_top; /* incoming value of av->top */ - INTERNAL_SIZE_T old_size; /* its size */ - char* old_end; /* its end address */ - - long size; /* arg to first MORECORE or mmap call */ - char* brk; /* return value from MORECORE */ - - long correction; /* arg to 2nd MORECORE call */ - char* snd_brk; /* 2nd return val */ - - INTERNAL_SIZE_T front_misalign; /* unusable bytes at front of new space */ - INTERNAL_SIZE_T end_misalign; /* partial page left at end of new space */ - char* aligned_brk; /* aligned offset into brk */ - - mchunkptr p; /* the allocated/returned chunk */ - mchunkptr remainder; /* remainder from allocation */ - CHUNK_SIZE_T remainder_size; /* its size */ - - CHUNK_SIZE_T sum; /* for updating stats */ - - size_t pagemask = av->pagesize - 1; - - /* - If there is space available in fastbins, consolidate and retry - malloc from scratch rather than getting memory from system. This - can occur only if nb is in smallbin range so we didn't consolidate - upon entry to malloc. It is much easier to handle this case here - than in malloc proper. - */ - - if (have_fastchunks(av)) { - assert(in_smallbin_range(nb)); - malloc_consolidate(av); - return mALLOc(nb - MALLOC_ALIGN_MASK); - } - - -#if HAVE_MMAP - - /* - If have mmap, and the request size meets the mmap threshold, and - the system supports mmap, and there are few enough currently - allocated mmapped regions, try to directly map this request - rather than expanding top. - */ - - if ((CHUNK_SIZE_T)(nb) >= (CHUNK_SIZE_T)(av->mmap_threshold) && - (av->n_mmaps < av->n_mmaps_max)) { - - char* mm; /* return value from mmap call*/ - - /* - Round up size to nearest page. For mmapped chunks, the overhead - is one SIZE_SZ unit larger than for normal chunks, because there - is no following chunk whose prev_size field could be used. - */ - size = (nb + SIZE_SZ + MALLOC_ALIGN_MASK + pagemask) & ~pagemask; - - /* Don't try if size wraps around 0 */ - if ((CHUNK_SIZE_T)(size) > (CHUNK_SIZE_T)(nb)) { - - mm = (char*)(MMAP(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE)); - - if (mm != (char*)(MORECORE_FAILURE)) { - - /* - The offset to the start of the mmapped region is stored - in the prev_size field of the chunk. This allows us to adjust - returned start address to meet alignment requirements here - and in memalign(), and still be able to compute proper - address argument for later munmap in free() and realloc(). 
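The fix-up just described is a few lines of mask arithmetic. A standalone sketch, assuming MALLOC_ALIGNMENT = 16 and a two-word (8-byte) header, with a deliberately misaligned toy base address so the correction branch is exercised:

    #include <assert.h>
    #include <stddef.h>

    int main(void) {
        size_t align_mask = 15;                  /* MALLOC_ALIGN_MASK      */
        size_t mm = 0x1004;                      /* toy mmap return value  */
        size_t misalign = (mm + 8) & align_mask; /* front_misalign of mem  */
        size_t correction = misalign ? 16 - misalign : 0;
        size_t chunk = mm + correction;          /* aligned chunk start    */
        /* p->prev_size = correction, so free() can recover the base: */
        assert(chunk - correction == mm);
        assert(((chunk + 8) & align_mask) == 0); /* user mem now aligned   */
        return 0;
    }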
- */ - - front_misalign = (INTERNAL_SIZE_T)chunk2mem(mm) & MALLOC_ALIGN_MASK; - if (front_misalign > 0) { - correction = MALLOC_ALIGNMENT - front_misalign; - p = (mchunkptr)(mm + correction); - p->prev_size = correction; - set_head(p, (size - correction) |IS_MMAPPED); - } - else { - p = (mchunkptr)mm; - p->prev_size = 0; - set_head(p, size|IS_MMAPPED); - } - - /* update statistics */ - - if (++av->n_mmaps > av->max_n_mmaps) - av->max_n_mmaps = av->n_mmaps; - - sum = av->mmapped_mem += size; - if (sum > (CHUNK_SIZE_T)(av->max_mmapped_mem)) - av->max_mmapped_mem = sum; - sum += av->sbrked_mem; - if (sum > (CHUNK_SIZE_T)(av->max_total_mem)) - av->max_total_mem = sum; - - check_chunk(p); - - return chunk2mem(p); - } - } - } -#endif - - /* Record incoming configuration of top */ - - old_top = av->top; - old_size = chunksize(old_top); - old_end = (char*)(chunk_at_offset(old_top, old_size)); - - brk = snd_brk = (char*)(MORECORE_FAILURE); - - /* - If not the first time through, we require old_size to be - at least MINSIZE and to have prev_inuse set. - */ - - assert((old_top == initial_top(av) && old_size == 0) || - ((CHUNK_SIZE_T) (old_size) >= MINSIZE && - prev_inuse(old_top))); - - /* Precondition: not enough current space to satisfy nb request */ - assert((CHUNK_SIZE_T)(old_size) < (CHUNK_SIZE_T)(nb + MINSIZE)); - - /* Precondition: all fastbins are consolidated */ - assert(!have_fastchunks(av)); - - - /* Request enough space for nb + pad + overhead */ - - size = nb + av->top_pad + MINSIZE; - - /* - If contiguous, we can subtract out existing space that we hope to - combine with new space. We add it back later only if - we don't actually get contiguous space. - */ - - if (contiguous(av)) - size -= old_size; - - /* - Round to a multiple of page size. - If MORECORE is not contiguous, this ensures that we only call it - with whole-page arguments. And if MORECORE is contiguous and - this is not first time through, this preserves page-alignment of - previous calls. Otherwise, we correct to page-align below. - */ - - size = (size + pagemask) & ~pagemask; - - /* - Don't try to call MORECORE if argument is so big as to appear - negative. Note that since mmap takes size_t arg, it may succeed - below even if we cannot call MORECORE. - */ - - if (size > 0) - brk = (char*)(MORECORE(size)); - - /* - If have mmap, try using it as a backup when MORECORE fails or - cannot be used. This is worth doing on systems that have "holes" in - address space, so sbrk cannot extend to give contiguous space, but - space is available elsewhere. Note that we ignore mmap max count - and threshold limits, since the space will not be used as a - segregated mmap region. - */ - -#if HAVE_MMAP - if (brk == (char*)(MORECORE_FAILURE)) { - - /* Cannot merge with old top, so add its size back in */ - if (contiguous(av)) - size = (size + old_size + pagemask) & ~pagemask; - - /* If we are relying on mmap as backup, then use larger units */ - if ((CHUNK_SIZE_T)(size) < (CHUNK_SIZE_T)(MMAP_AS_MORECORE_SIZE)) - size = MMAP_AS_MORECORE_SIZE; - - /* Don't try if size wraps around 0 */ - if ((CHUNK_SIZE_T)(size) > (CHUNK_SIZE_T)(nb)) { - - brk = (char*)(MMAP(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE)); - - if (brk != (char*)(MORECORE_FAILURE)) { - - /* We do not need, and cannot use, another sbrk call to find end */ - snd_brk = brk + size; - - /* - Record that we no longer have a contiguous sbrk region. - After the first time mmap is used as backup, we do not - ever rely on contiguous space since this could incorrectly - bridge regions. 
- */ - set_noncontiguous(av); - } - } - } -#endif - - if (brk != (char*)(MORECORE_FAILURE)) { - av->sbrked_mem += size; - - /* - If MORECORE extends previous space, we can likewise extend top size. - */ - - if (brk == old_end && snd_brk == (char*)(MORECORE_FAILURE)) { - set_head(old_top, (size + old_size) | PREV_INUSE); - } - - /* - Otherwise, make adjustments: - - * If the first time through or noncontiguous, we need to call sbrk - just to find out where the end of memory lies. - - * We need to ensure that all returned chunks from malloc will meet - MALLOC_ALIGNMENT - - * If there was an intervening foreign sbrk, we need to adjust sbrk - request size to account for fact that we will not be able to - combine new space with existing space in old_top. - - * Almost all systems internally allocate whole pages at a time, in - which case we might as well use the whole last page of request. - So we allocate enough more memory to hit a page boundary now, - which in turn causes future contiguous calls to page-align. - */ - - else { - front_misalign = 0; - end_misalign = 0; - correction = 0; - aligned_brk = brk; - - /* - If MORECORE returns an address lower than we have seen before, - we know it isn't really contiguous. This and some subsequent - checks help cope with non-conforming MORECORE functions and - the presence of "foreign" calls to MORECORE from outside of - malloc or by other threads. We cannot guarantee to detect - these in all cases, but cope with the ones we do detect. - */ - if (contiguous(av) && old_size != 0 && brk < old_end) { - set_noncontiguous(av); - } - - /* handle contiguous cases */ - if (contiguous(av)) { - - /* - We can tolerate forward non-contiguities here (usually due - to foreign calls) but treat them as part of our space for - stats reporting. - */ - if (old_size != 0) - av->sbrked_mem += brk - old_end; - - /* Guarantee alignment of first new chunk made from this space */ - - front_misalign = (INTERNAL_SIZE_T)chunk2mem(brk) & MALLOC_ALIGN_MASK; - if (front_misalign > 0) { - - /* - Skip over some bytes to arrive at an aligned position. - We don't need to specially mark these wasted front bytes. - They will never be accessed anyway because - prev_inuse of av->top (and any chunk created from its start) - is always true after initialization. - */ - - correction = MALLOC_ALIGNMENT - front_misalign; - aligned_brk += correction; - } - - /* - If this isn't adjacent to existing space, then we will not - be able to merge with old_top space, so must add to 2nd request. - */ - - correction += old_size; - - /* Extend the end address to hit a page boundary */ - end_misalign = (INTERNAL_SIZE_T)(brk + size + correction); - correction += ((end_misalign + pagemask) & ~pagemask) - end_misalign; - - assert(correction >= 0); - snd_brk = (char*)(MORECORE(correction)); - - if (snd_brk == (char*)(MORECORE_FAILURE)) { - /* - If can't allocate correction, try to at least find out current - brk. It might be enough to proceed without failing. - */ - correction = 0; - snd_brk = (char*)(MORECORE(0)); - } - else if (snd_brk < brk) { - /* - If the second call gives noncontiguous space even though - it says it won't, the only course of action is to ignore - results of second call, and conservatively estimate where - the first call left us. Also set noncontiguous, so this - won't happen again, leaving at most one hole. - - Note that this check is intrinsically incomplete. 
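Both the first and second MORECORE requests use the same rounding idiom. A standalone sketch with a 4096-byte page and arbitrary toy values:

    #include <assert.h>
    #include <stddef.h>

    int main(void) {
        size_t pagemask = 4096 - 1;
        size_t size = 10000;
        size_t rounded = (size + pagemask) & ~pagemask;
        assert(rounded == 12288);               /* rounded up to 3 pages */
        assert((rounded & pagemask) == 0);      /* page-aligned          */
        /* end_misalign correction: extend a request so the region
           ends exactly on a page boundary */
        size_t end = 0x5123;                    /* toy brk + size value  */
        size_t correction = ((end + pagemask) & ~pagemask) - end;
        assert((end + correction) % 4096 == 0);
        return 0;
    }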
Because - MORECORE is allowed to give more space than we ask for, - there is no reliable way to detect a noncontiguity - producing a forward gap for the second call. - */ - snd_brk = brk + size; - correction = 0; - set_noncontiguous(av); - } - - } - - /* handle non-contiguous cases */ - else { - /* MORECORE/mmap must correctly align */ - assert(aligned_OK(chunk2mem(brk))); - - /* Find out current end of memory */ - if (snd_brk == (char*)(MORECORE_FAILURE)) { - snd_brk = (char*)(MORECORE(0)); - av->sbrked_mem += snd_brk - brk - size; - } - } - - /* Adjust top based on results of second sbrk */ - if (snd_brk != (char*)(MORECORE_FAILURE)) { - av->top = (mchunkptr)aligned_brk; - set_head(av->top, (snd_brk - aligned_brk + correction) | PREV_INUSE); - av->sbrked_mem += correction; - - /* - If not the first time through, we either have a - gap due to foreign sbrk or a non-contiguous region. Insert a - double fencepost at old_top to prevent consolidation with space - we don't own. These fenceposts are artificial chunks that are - marked as inuse and are in any case too small to use. We need - two to make sizes and alignments work out. - */ - - if (old_size != 0) { - /* - Shrink old_top to insert fenceposts, keeping size a - multiple of MALLOC_ALIGNMENT. We know there is at least - enough space in old_top to do this. - */ - old_size = (old_size - 3*SIZE_SZ) & ~MALLOC_ALIGN_MASK; - set_head(old_top, old_size | PREV_INUSE); - - /* - Note that the following assignments completely overwrite - old_top when old_size was previously MINSIZE. This is - intentional. We need the fencepost, even if old_top otherwise gets - lost. - */ - chunk_at_offset(old_top, old_size )->size = - SIZE_SZ|PREV_INUSE; - - chunk_at_offset(old_top, old_size + SIZE_SZ)->size = - SIZE_SZ|PREV_INUSE; - - /* - If possible, release the rest, suppressing trimming. - */ - if (old_size >= MINSIZE) { - INTERNAL_SIZE_T tt = av->trim_threshold; - av->trim_threshold = (INTERNAL_SIZE_T)(-1); - fREe(chunk2mem(old_top)); - av->trim_threshold = tt; - } - } - } - } - - /* Update statistics */ - sum = av->sbrked_mem; - if (sum > (CHUNK_SIZE_T)(av->max_sbrked_mem)) - av->max_sbrked_mem = sum; - - sum += av->mmapped_mem; - if (sum > (CHUNK_SIZE_T)(av->max_total_mem)) - av->max_total_mem = sum; - - check_malloc_state(); - - /* finally, do the allocation */ - - p = av->top; - size = chunksize(p); - - /* check that one of the above allocation paths succeeded */ - if ((CHUNK_SIZE_T)(size) >= (CHUNK_SIZE_T)(nb + MINSIZE)) { - remainder_size = size - nb; - remainder = chunk_at_offset(p, nb); - av->top = remainder; - set_head(p, nb | PREV_INUSE); - set_head(remainder, remainder_size | PREV_INUSE); - check_malloced_chunk(p, nb); - return chunk2mem(p); - } - - } - - /* catch all failure paths */ - MALLOC_FAILURE_ACTION; - return 0; -} - - - - -#ifndef MORECORE_CANNOT_TRIM -/* - sYSTRIm is an inverse of sorts to sYSMALLOc. It gives memory back - to the system (via negative arguments to sbrk) if there is unused - memory at the `high' end of the malloc pool. It is called - automatically by free() when top space exceeds the trim - threshold. It is also called by the public malloc_trim routine. It - returns 1 if it actually released any memory, else 0. 
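The amount sYSTRIm releases reduces to one line of integer arithmetic. A standalone sketch with a 4096-byte page, MINSIZE = 16, and a top chunk of exactly six pages, in which case one page stays behind:

    #include <assert.h>

    int main(void) {
        long pagesz = 4096, MINSIZE_ = 16, pad = 0;
        long top_size = 6 * 4096;          /* 24576 bytes in top */
        long extra = ((top_size - pad - MINSIZE_ + (pagesz - 1)) / pagesz - 1)
                     * pagesz;
        assert(extra == 5 * 4096);         /* release 5 pages, keep 1 */
        return 0;
    }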
-*/ - -#if __STD_C -static int sYSTRIm(size_t pad, mstate av) -#else -static int sYSTRIm(pad, av) size_t pad; mstate av; -#endif -{ - long top_size; /* Amount of top-most memory */ - long extra; /* Amount to release */ - long released; /* Amount actually released */ - char* current_brk; /* address returned by pre-check sbrk call */ - char* new_brk; /* address returned by post-check sbrk call */ - size_t pagesz; - - pagesz = av->pagesize; - top_size = chunksize(av->top); - - /* Release in pagesize units, keeping at least one page */ - extra = ((top_size - pad - MINSIZE + (pagesz-1)) / pagesz - 1) * pagesz; - - if (extra > 0) { - - /* - Only proceed if end of memory is where we last set it. - This avoids problems if there were foreign sbrk calls. - */ - current_brk = (char*)(MORECORE(0)); - if (current_brk == (char*)(av->top) + top_size) { - - /* - Attempt to release memory. We ignore MORECORE return value, - and instead call again to find out where new end of memory is. - This avoids problems if first call releases less than we asked, - of if failure somehow altered brk value. (We could still - encounter problems if it altered brk in some very bad way, - but the only thing we can do is adjust anyway, which will cause - some downstream failure.) - */ - - MORECORE(-extra); - new_brk = (char*)(MORECORE(0)); - - if (new_brk != (char*)MORECORE_FAILURE) { - released = (long)(current_brk - new_brk); - - if (released != 0) { - /* Success. Adjust top. */ - av->sbrked_mem -= released; - set_head(av->top, (top_size - released) | PREV_INUSE); - check_malloc_state(); - return 1; - } - } - } - } - return 0; -} -#endif - -/* - ------------------------------ malloc ------------------------------ -*/ - - -#if __STD_C -Void_t* mALLOc(size_t bytes) -#else - Void_t* mALLOc(bytes) size_t bytes; -#endif -{ - mstate av = get_malloc_state(); - - INTERNAL_SIZE_T nb; /* normalized request size */ - unsigned int idx; /* associated bin index */ - mbinptr bin; /* associated bin */ - mfastbinptr* fb; /* associated fastbin */ - - mchunkptr victim; /* inspected/selected chunk */ - INTERNAL_SIZE_T size; /* its size */ - int victim_index; /* its bin index */ - - mchunkptr remainder; /* remainder from a split */ - CHUNK_SIZE_T remainder_size; /* its size */ - - unsigned int block; /* bit map traverser */ - unsigned int bit; /* bit map traverser */ - unsigned int map; /* current word of binmap */ - - mchunkptr fwd; /* misc temp for linking */ - mchunkptr bck; /* misc temp for linking */ - - /* - Convert request size to internal form by adding SIZE_SZ bytes - overhead plus possibly more to obtain necessary alignment and/or - to obtain a size of at least MINSIZE, the smallest allocatable - size. Also, checked_request2size traps (returning 0) request sizes - that are so large that they wrap around zero when padded and - aligned. - */ - - checked_request2size(bytes, nb); - - /* - Bypass search if no frees yet - */ - if (!have_anychunks(av)) { - if (av->max_fast == 0) /* initialization check */ - malloc_consolidate(av); - goto use_top; - } - - /* - If the size qualifies as a fastbin, first check corresponding bin. - */ - - if ((CHUNK_SIZE_T)(nb) <= (CHUNK_SIZE_T)(av->max_fast)) { - fb = &(av->fastbins[(fastbin_index(nb))]); - if ( (victim = *fb) != 0) { - *fb = victim->fd; - check_remalloced_chunk(victim, nb); - return chunk2mem(victim); - } - } - - /* - If a small request, check regular bin. Since these "smallbins" - hold one size each, no searching within bins is necessary. 
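The normalization performed by checked_request2size is easy to exercise on its own. A sketch assuming SIZE_SZ = 4 and 8-byte alignment (so MINSIZE = 16), using a hypothetical helper req2size equivalent to the macro:

    #include <assert.h>
    #include <stddef.h>

    static size_t req2size(size_t req) {
        size_t sz = (req + 4 + 7) & ~(size_t)7; /* add size word, round up */
        return sz < 16 ? 16 : sz;               /* enforce minimum chunk   */
    }

    int main(void) {
        assert(req2size(0)  == 16);   /* minimum chunk            */
        assert(req2size(12) == 16);   /* 12+4 = 16, fits exactly  */
        assert(req2size(13) == 24);   /* 13+4 = 17, rounds to 24  */
        assert(req2size(20) == 24);   /* 20+4 = 24                */
        return 0;
    }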
- (For a large request, we need to wait until unsorted chunks are - processed to find best fit. But for small ones, fits are exact - anyway, so we can check now, which is faster.) - */ - - if (in_smallbin_range(nb)) { - idx = smallbin_index(nb); - bin = bin_at(av,idx); - - if ( (victim = last(bin)) != bin) { - bck = victim->bk; - set_inuse_bit_at_offset(victim, nb); - bin->bk = bck; - bck->fd = bin; - - check_malloced_chunk(victim, nb); - return chunk2mem(victim); - } - } - - /* - If this is a large request, consolidate fastbins before continuing. - While it might look excessive to kill all fastbins before - even seeing if there is space available, this avoids - fragmentation problems normally associated with fastbins. - Also, in practice, programs tend to have runs of either small or - large requests, but less often mixtures, so consolidation is not - invoked all that often in most programs. And the programs that - it is called frequently in otherwise tend to fragment. - */ - - else { - idx = largebin_index(nb); - if (have_fastchunks(av)) - malloc_consolidate(av); - } - - /* - Process recently freed or remaindered chunks, taking one only if - it is exact fit, or, if this a small request, the chunk is remainder from - the most recent non-exact fit. Place other traversed chunks in - bins. Note that this step is the only place in any routine where - chunks are placed in bins. - */ - - while ( (victim = unsorted_chunks(av)->bk) != unsorted_chunks(av)) { - bck = victim->bk; - size = chunksize(victim); - - /* - If a small request, try to use last remainder if it is the - only chunk in unsorted bin. This helps promote locality for - runs of consecutive small requests. This is the only - exception to best-fit, and applies only when there is - no exact fit for a small chunk. 
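The split-and-reattach step seen here, and again in the bin scans below, is the same size arithmetic everywhere. A standalone sketch with toy numbers, assuming MINSIZE = 16:

    #include <assert.h>
    #include <stddef.h>

    int main(void) {
        size_t MINSIZE_ = 16;
        size_t size = 104, nb = 40;            /* free chunk vs. request  */
        size_t remainder_size = size - nb;
        assert(remainder_size >= MINSIZE_);    /* big enough: split it    */
        /* remainder = chunk_at_offset(victim, nb) */
        size_t victim = 0x2000;
        assert(victim + nb == 0x2028);         /* tail chunk starts here  */
        /* had the tail been under MINSIZE, the whole chunk would be
           handed out instead ("exhaust") to avoid an unusable sliver */
        assert(104 - 96 < MINSIZE_);
        return 0;
    }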
- */ - - if (in_smallbin_range(nb) && - bck == unsorted_chunks(av) && - victim == av->last_remainder && - (CHUNK_SIZE_T)(size) > (CHUNK_SIZE_T)(nb + MINSIZE)) { - - /* split and reattach remainder */ - remainder_size = size - nb; - remainder = chunk_at_offset(victim, nb); - unsorted_chunks(av)->bk = unsorted_chunks(av)->fd = remainder; - av->last_remainder = remainder; - remainder->bk = remainder->fd = unsorted_chunks(av); - - set_head(victim, nb | PREV_INUSE); - set_head(remainder, remainder_size | PREV_INUSE); - set_foot(remainder, remainder_size); - - check_malloced_chunk(victim, nb); - return chunk2mem(victim); - } - - /* remove from unsorted list */ - unsorted_chunks(av)->bk = bck; - bck->fd = unsorted_chunks(av); - - /* Take now instead of binning if exact fit */ - - if (size == nb) { - set_inuse_bit_at_offset(victim, size); - check_malloced_chunk(victim, nb); - return chunk2mem(victim); - } - - /* place chunk in bin */ - - if (in_smallbin_range(size)) { - victim_index = smallbin_index(size); - bck = bin_at(av, victim_index); - fwd = bck->fd; - } - else { - victim_index = largebin_index(size); - bck = bin_at(av, victim_index); - fwd = bck->fd; - - if (fwd != bck) { - /* if smaller than smallest, place first */ - if ((CHUNK_SIZE_T)(size) < (CHUNK_SIZE_T)(bck->bk->size)) { - fwd = bck; - bck = bck->bk; - } - else if ((CHUNK_SIZE_T)(size) >= - (CHUNK_SIZE_T)(FIRST_SORTED_BIN_SIZE)) { - - /* maintain large bins in sorted order */ - size |= PREV_INUSE; /* Or with inuse bit to speed comparisons */ - while ((CHUNK_SIZE_T)(size) < (CHUNK_SIZE_T)(fwd->size)) - fwd = fwd->fd; - bck = fwd->bk; - } - } - } - - mark_bin(av, victim_index); - victim->bk = bck; - victim->fd = fwd; - fwd->bk = victim; - bck->fd = victim; - } - - /* - If a large request, scan through the chunks of current bin to - find one that fits. (This will be the smallest that fits unless - FIRST_SORTED_BIN_SIZE has been changed from default.) This is - the only step where an unbounded number of chunks might be - scanned without doing anything useful with them. However the - lists tend to be short. - */ - - if (!in_smallbin_range(nb)) { - bin = bin_at(av, idx); - - for (victim = last(bin); victim != bin; victim = victim->bk) { - size = chunksize(victim); - - if ((CHUNK_SIZE_T)(size) >= (CHUNK_SIZE_T)(nb)) { - remainder_size = size - nb; - unlink(victim, bck, fwd); - - /* Exhaust */ - if (remainder_size < MINSIZE) { - set_inuse_bit_at_offset(victim, size); - check_malloced_chunk(victim, nb); - return chunk2mem(victim); - } - /* Split */ - else { - remainder = chunk_at_offset(victim, nb); - unsorted_chunks(av)->bk = unsorted_chunks(av)->fd = remainder; - remainder->bk = remainder->fd = unsorted_chunks(av); - set_head(victim, nb | PREV_INUSE); - set_head(remainder, remainder_size | PREV_INUSE); - set_foot(remainder, remainder_size); - check_malloced_chunk(victim, nb); - return chunk2mem(victim); - } - } - } - } - - /* - Search for a chunk by scanning bins, starting with next largest - bin. This search is strictly by best-fit; i.e., the smallest - (with ties going to approximately the least recently used) chunk - that fits is selected. - - The bitmap avoids needing to check that most blocks are nonempty. - */ - - ++idx; - bin = bin_at(av,idx); - block = idx2block(idx); - map = av->binmap[block]; - bit = idx2bit(idx); - - for (;;) { - - /* Skip rest of block if there are no more set bits in this block. 
*/ - if (bit > map || bit == 0) { - do { - if (++block >= BINMAPSIZE) /* out of bins */ - goto use_top; - } while ( (map = av->binmap[block]) == 0); - - bin = bin_at(av, (block << BINMAPSHIFT)); - bit = 1; - } - - /* Advance to bin with set bit. There must be one. */ - while ((bit & map) == 0) { - bin = next_bin(bin); - bit <<= 1; - assert(bit != 0); - } - - /* Inspect the bin. It is likely to be non-empty */ - victim = last(bin); - - /* If a false alarm (empty bin), clear the bit. */ - if (victim == bin) { - av->binmap[block] = map &= ~bit; /* Write through */ - bin = next_bin(bin); - bit <<= 1; - } - - else { - size = chunksize(victim); - - /* We know the first chunk in this bin is big enough to use. */ - assert((CHUNK_SIZE_T)(size) >= (CHUNK_SIZE_T)(nb)); - - remainder_size = size - nb; - - /* unlink */ - bck = victim->bk; - bin->bk = bck; - bck->fd = bin; - - /* Exhaust */ - if (remainder_size < MINSIZE) { - set_inuse_bit_at_offset(victim, size); - check_malloced_chunk(victim, nb); - return chunk2mem(victim); - } - - /* Split */ - else { - remainder = chunk_at_offset(victim, nb); - - unsorted_chunks(av)->bk = unsorted_chunks(av)->fd = remainder; - remainder->bk = remainder->fd = unsorted_chunks(av); - /* advertise as last remainder */ - if (in_smallbin_range(nb)) - av->last_remainder = remainder; - - set_head(victim, nb | PREV_INUSE); - set_head(remainder, remainder_size | PREV_INUSE); - set_foot(remainder, remainder_size); - check_malloced_chunk(victim, nb); - return chunk2mem(victim); - } - } - } - - use_top: - /* - If large enough, split off the chunk bordering the end of memory - (held in av->top). Note that this is in accord with the best-fit - search rule. In effect, av->top is treated as larger (and thus - less well fitting) than any other available chunk since it can - be extended to be as large as necessary (up to system - limitations). - - We require that av->top always exists (i.e., has size >= - MINSIZE) after initialization, so if it would otherwise be - exhuasted by current request, it is replenished. (The main - reason for ensuring it exists is that we may need MINSIZE space - to put in fenceposts in sysmalloc.) - */ - - victim = av->top; - size = chunksize(victim); - - if ((CHUNK_SIZE_T)(size) >= (CHUNK_SIZE_T)(nb + MINSIZE)) { - remainder_size = size - nb; - remainder = chunk_at_offset(victim, nb); - av->top = remainder; - set_head(victim, nb | PREV_INUSE); - set_head(remainder, remainder_size | PREV_INUSE); - - check_malloced_chunk(victim, nb); - return chunk2mem(victim); - } - - /* - If no space in top, relay to handle system-dependent cases - */ - return sYSMALLOc(nb, av); -} - -/* - ------------------------------ free ------------------------------ -*/ - -#if __STD_C -void fREe(Void_t* mem) -#else -void fREe(mem) Void_t* mem; -#endif -{ - mstate av = get_malloc_state(); - - mchunkptr p; /* chunk corresponding to mem */ - INTERNAL_SIZE_T size; /* its size */ - mfastbinptr* fb; /* associated fastbin */ - mchunkptr nextchunk; /* next contiguous chunk */ - INTERNAL_SIZE_T nextsize; /* its size */ - int nextinuse; /* true if nextchunk is used */ - INTERNAL_SIZE_T prevsize; /* size of previous contiguous chunk */ - mchunkptr bck; /* misc temp for linking */ - mchunkptr fwd; /* misc temp for linking */ - - /* free(0) has no effect */ - if (mem != 0) { - p = mem2chunk(mem); - size = chunksize(p); - - check_inuse_chunk(p); - - /* - If eligible, place chunk on a fastbin so it can be found - and used quickly in malloc. 
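The fastbin push just below (and the matching pop in malloc above) is a plain singly linked LIFO stack threaded through the fd field. A standalone sketch:

    #include <assert.h>
    #include <stddef.h>

    struct chunk { struct chunk *fd; };  /* toy stand-in for malloc_chunk */

    int main(void) {
        struct chunk a = {0}, b = {0};
        struct chunk *fb = NULL;          /* empty fastbin          */
        a.fd = fb; fb = &a;               /* free(a): push          */
        b.fd = fb; fb = &b;               /* free(b): push          */
        struct chunk *victim = fb;        /* malloc(): pop          */
        fb = victim->fd;
        assert(victim == &b && fb == &a); /* LIFO order, no bk link */
        return 0;
    }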
- */ - - if ((CHUNK_SIZE_T)(size) <= (CHUNK_SIZE_T)(av->max_fast) - -#if TRIM_FASTBINS - /* - If TRIM_FASTBINS set, don't place chunks - bordering top into fastbins - */ - && (chunk_at_offset(p, size) != av->top) -#endif - ) { - - set_fastchunks(av); - fb = &(av->fastbins[fastbin_index(size)]); - p->fd = *fb; - *fb = p; - } - - /* - Consolidate other non-mmapped chunks as they arrive. - */ - - else if (!chunk_is_mmapped(p)) { - set_anychunks(av); - - nextchunk = chunk_at_offset(p, size); - nextsize = chunksize(nextchunk); - - /* consolidate backward */ - if (!prev_inuse(p)) { - prevsize = p->prev_size; - size += prevsize; - p = chunk_at_offset(p, -((long) prevsize)); - unlink(p, bck, fwd); - } - - if (nextchunk != av->top) { - /* get and clear inuse bit */ - nextinuse = inuse_bit_at_offset(nextchunk, nextsize); - set_head(nextchunk, nextsize); - - /* consolidate forward */ - if (!nextinuse) { - unlink(nextchunk, bck, fwd); - size += nextsize; - } - - /* - Place the chunk in unsorted chunk list. Chunks are - not placed into regular bins until after they have - been given one chance to be used in malloc. - */ - - bck = unsorted_chunks(av); - fwd = bck->fd; - p->bk = bck; - p->fd = fwd; - bck->fd = p; - fwd->bk = p; - - set_head(p, size | PREV_INUSE); - set_foot(p, size); - - check_free_chunk(p); - } - - /* - If the chunk borders the current high end of memory, - consolidate into top - */ - - else { - size += nextsize; - set_head(p, size | PREV_INUSE); - av->top = p; - check_chunk(p); - } - - /* - If freeing a large space, consolidate possibly-surrounding - chunks. Then, if the total unused topmost memory exceeds trim - threshold, ask malloc_trim to reduce top. - - Unless max_fast is 0, we don't know if there are fastbins - bordering top, so we cannot tell for sure whether threshold - has been reached unless fastbins are consolidated. But we - don't want to consolidate on each free. As a compromise, - consolidation is performed if FASTBIN_CONSOLIDATION_THRESHOLD - is reached. - */ - - if ((CHUNK_SIZE_T)(size) >= FASTBIN_CONSOLIDATION_THRESHOLD) { - if (have_fastchunks(av)) - malloc_consolidate(av); - -#ifndef MORECORE_CANNOT_TRIM - if ((CHUNK_SIZE_T)(chunksize(av->top)) >= - (CHUNK_SIZE_T)(av->trim_threshold)) - sYSTRIm(av->top_pad, av); -#endif - } - - } - /* - If the chunk was allocated via mmap, release via munmap() - Note that if HAVE_MMAP is false but chunk_is_mmapped is - true, then user must have overwritten memory. There's nothing - we can do to catch this error unless DEBUG is set, in which case - check_inuse_chunk (above) will have triggered error. - */ - - else { -#if HAVE_MMAP - int ret; - INTERNAL_SIZE_T offset = p->prev_size; - av->n_mmaps--; - av->mmapped_mem -= (size + offset); - ret = munmap((char*)p - offset, size + offset); - /* munmap returns non-zero on failure */ - assert(ret == 0); -#endif - } - } -} - -/* - ------------------------- malloc_consolidate ------------------------- - - malloc_consolidate is a specialized version of free() that tears - down chunks held in fastbins. Free itself cannot be used for this - purpose since, among other things, it might place chunks back onto - fastbins. So, instead, we need to use a minor variant of the same - code. - - Also, because this routine needs to be called the first time through - malloc anyway, it turns out to be the perfect place to trigger - initialization code. 
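The initialization trigger works because C statics start zeroed, so max_fast == 0 can double as an "uninitialized" sentinel. A standalone sketch of that pattern (all names are local stand-ins):

    #include <assert.h>
    #include <stddef.h>

    struct state { size_t max_fast; int inited; };
    static struct state av_;             /* zero-filled, like av_ above */

    static void consolidate(struct state *av) {
        if (av->max_fast != 0) {
            /* normal fastbin teardown would happen here */
        } else {
            av->max_fast = 64;           /* malloc_init_state stand-in */
            av->inited = 1;
        }
    }

    int main(void) {
        consolidate(&av_);               /* first call initializes */
        assert(av_.inited && av_.max_fast == 64);
        return 0;
    }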
-*/ - -#if __STD_C -static void malloc_consolidate(mstate av) -#else -static void malloc_consolidate(av) mstate av; -#endif -{ - mfastbinptr* fb; /* current fastbin being consolidated */ - mfastbinptr* maxfb; /* last fastbin (for loop control) */ - mchunkptr p; /* current chunk being consolidated */ - mchunkptr nextp; /* next chunk to consolidate */ - mchunkptr unsorted_bin; /* bin header */ - mchunkptr first_unsorted; /* chunk to link to */ - - /* These have same use as in free() */ - mchunkptr nextchunk; - INTERNAL_SIZE_T size; - INTERNAL_SIZE_T nextsize; - INTERNAL_SIZE_T prevsize; - int nextinuse; - mchunkptr bck; - mchunkptr fwd; - - /* - If max_fast is 0, we know that av hasn't - yet been initialized, in which case do so below - */ - - if (av->max_fast != 0) { - clear_fastchunks(av); - - unsorted_bin = unsorted_chunks(av); - - /* - Remove each chunk from fast bin and consolidate it, placing it - then in unsorted bin. Among other reasons for doing this, - placing in unsorted bin avoids needing to calculate actual bins - until malloc is sure that chunks aren't immediately going to be - reused anyway. - */ - - maxfb = &(av->fastbins[fastbin_index(av->max_fast)]); - fb = &(av->fastbins[0]); - do { - if ( (p = *fb) != 0) { - *fb = 0; - - do { - check_inuse_chunk(p); - nextp = p->fd; - - /* Slightly streamlined version of consolidation code in free() */ - size = p->size & ~PREV_INUSE; - nextchunk = chunk_at_offset(p, size); - nextsize = chunksize(nextchunk); - - if (!prev_inuse(p)) { - prevsize = p->prev_size; - size += prevsize; - p = chunk_at_offset(p, -((long) prevsize)); - unlink(p, bck, fwd); - } - - if (nextchunk != av->top) { - nextinuse = inuse_bit_at_offset(nextchunk, nextsize); - set_head(nextchunk, nextsize); - - if (!nextinuse) { - size += nextsize; - unlink(nextchunk, bck, fwd); - } - - first_unsorted = unsorted_bin->fd; - unsorted_bin->fd = p; - first_unsorted->bk = p; - - set_head(p, size | PREV_INUSE); - p->bk = unsorted_bin; - p->fd = first_unsorted; - set_foot(p, size); - } - - else { - size += nextsize; - set_head(p, size | PREV_INUSE); - av->top = p; - } - - } while ( (p = nextp) != 0); - - } - } while (fb++ != maxfb); - } - else { - malloc_init_state(av); - check_malloc_state(); - } -} - -/* - ------------------------------ realloc ------------------------------ -*/ - - -#if __STD_C -Void_t* rEALLOc(Void_t* oldmem, size_t bytes) -#else -Void_t* rEALLOc(oldmem, bytes) Void_t* oldmem; size_t bytes; -#endif -{ - mstate av = get_malloc_state(); - - INTERNAL_SIZE_T nb; /* padded request size */ - - mchunkptr oldp; /* chunk corresponding to oldmem */ - INTERNAL_SIZE_T oldsize; /* its size */ - - mchunkptr newp; /* chunk to return */ - INTERNAL_SIZE_T newsize; /* its size */ - Void_t* newmem; /* corresponding user mem */ - - mchunkptr next; /* next contiguous chunk after oldp */ - - mchunkptr remainder; /* extra space at end of newp */ - CHUNK_SIZE_T remainder_size; /* its size */ - - mchunkptr bck; /* misc temp for linking */ - mchunkptr fwd; /* misc temp for linking */ - - CHUNK_SIZE_T copysize; /* bytes to copy */ - unsigned int ncopies; /* INTERNAL_SIZE_T words to copy */ - INTERNAL_SIZE_T* s; /* copy source */ - INTERNAL_SIZE_T* d; /* copy destination */ - - -#ifdef REALLOC_ZERO_BYTES_FREES - if (bytes == 0) { - fREe(oldmem); - return 0; - } -#endif - - /* realloc of null is supposed to be same as malloc */ - if (oldmem == 0) return mALLOc(bytes); - - checked_request2size(bytes, nb); - - oldp = mem2chunk(oldmem); - oldsize = chunksize(oldp); - - check_inuse_chunk(oldp); 
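  /*
    From here rEALLOc tries, in order: reuse of oldp when it is already
    big enough, in-place growth into av->top, absorption of a free
    successor chunk, and finally allocate-copy-free.  One property worth
    making explicit, sketched with a hypothetical caller:

        q = rEALLOc(p, n);
        if (q == 0)
            ... p is still valid and must eventually be freed ...

    failure is propagated as 0 without disturbing the original chunk.
  */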
- - if (!chunk_is_mmapped(oldp)) { - - if ((CHUNK_SIZE_T)(oldsize) >= (CHUNK_SIZE_T)(nb)) { - /* already big enough; split below */ - newp = oldp; - newsize = oldsize; - } - - else { - next = chunk_at_offset(oldp, oldsize); - - /* Try to expand forward into top */ - if (next == av->top && - (CHUNK_SIZE_T)(newsize = oldsize + chunksize(next)) >= - (CHUNK_SIZE_T)(nb + MINSIZE)) { - set_head_size(oldp, nb); - av->top = chunk_at_offset(oldp, nb); - set_head(av->top, (newsize - nb) | PREV_INUSE); - return chunk2mem(oldp); - } - - /* Try to expand forward into next chunk; split off remainder below */ - else if (next != av->top && - !inuse(next) && - (CHUNK_SIZE_T)(newsize = oldsize + chunksize(next)) >= - (CHUNK_SIZE_T)(nb)) { - newp = oldp; - unlink(next, bck, fwd); - } - - /* allocate, copy, free */ - else { - newmem = mALLOc(nb - MALLOC_ALIGN_MASK); - if (newmem == 0) - return 0; /* propagate failure */ - - newp = mem2chunk(newmem); - newsize = chunksize(newp); - - /* - Avoid copy if newp is next chunk after oldp. - */ - if (newp == next) { - newsize += oldsize; - newp = oldp; - } - else { - /* - Unroll copy of <= 36 bytes (72 if 8byte sizes) - We know that contents have an odd number of - INTERNAL_SIZE_T-sized words; minimally 3. - */ - - copysize = oldsize - SIZE_SZ; - s = (INTERNAL_SIZE_T*)(oldmem); - d = (INTERNAL_SIZE_T*)(newmem); - ncopies = copysize / sizeof(INTERNAL_SIZE_T); - assert(ncopies >= 3); - - if (ncopies > 9) - MALLOC_COPY(d, s, copysize); - - else { - *(d+0) = *(s+0); - *(d+1) = *(s+1); - *(d+2) = *(s+2); - if (ncopies > 4) { - *(d+3) = *(s+3); - *(d+4) = *(s+4); - if (ncopies > 6) { - *(d+5) = *(s+5); - *(d+6) = *(s+6); - if (ncopies > 8) { - *(d+7) = *(s+7); - *(d+8) = *(s+8); - } - } - } - } - - fREe(oldmem); - check_inuse_chunk(newp); - return chunk2mem(newp); - } - } - } - - /* If possible, free extra space in old or extended chunk */ - - assert((CHUNK_SIZE_T)(newsize) >= (CHUNK_SIZE_T)(nb)); - - remainder_size = newsize - nb; - - if (remainder_size < MINSIZE) { /* not enough extra to split off */ - set_head_size(newp, newsize); - set_inuse_bit_at_offset(newp, newsize); - } - else { /* split remainder */ - remainder = chunk_at_offset(newp, nb); - set_head_size(newp, nb); - set_head(remainder, remainder_size | PREV_INUSE); - /* Mark remainder as inuse so free() won't complain */ - set_inuse_bit_at_offset(remainder, remainder_size); - fREe(chunk2mem(remainder)); - } - - check_inuse_chunk(newp); - return chunk2mem(newp); - } - - /* - Handle mmap cases - */ - - else { -#if HAVE_MMAP - -#if HAVE_MREMAP - INTERNAL_SIZE_T offset = oldp->prev_size; - size_t pagemask = av->pagesize - 1; - char *cp; - CHUNK_SIZE_T sum; - - /* Note the extra SIZE_SZ overhead */ - newsize = (nb + offset + SIZE_SZ + pagemask) & ~pagemask; - - /* don't need to remap if still within same page */ - if (oldsize == newsize - offset) - return oldmem; - - cp = (char*)mremap((char*)oldp - offset, oldsize + offset, newsize, 1); - - if (cp != (char*)MORECORE_FAILURE) { - - newp = (mchunkptr)(cp + offset); - set_head(newp, (newsize - offset)|IS_MMAPPED); - - assert(aligned_OK(chunk2mem(newp))); - assert((newp->prev_size == offset)); - - /* update statistics */ - sum = av->mmapped_mem += newsize - oldsize; - if (sum > (CHUNK_SIZE_T)(av->max_mmapped_mem)) - av->max_mmapped_mem = sum; - sum += av->sbrked_mem; - if (sum > (CHUNK_SIZE_T)(av->max_total_mem)) - av->max_total_mem = sum; - - return chunk2mem(newp); - } -#endif - - /* Note the extra SIZE_SZ overhead. 
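    For the size accounting in this branch, note (using the conventions
    of this file) that an mmapped chunk of chunksize S can satisfy
    requests only up to S - 2*SIZE_SZ, while an ordinary chunk can serve
    S - SIZE_SZ, because an ordinary chunk borrows the prev_size field of
    its in-use successor; mUSABLe below applies the same rule, and it is
    why the comparison against nb carries an extra SIZE_SZ here.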
*/ - if ((CHUNK_SIZE_T)(oldsize) >= (CHUNK_SIZE_T)(nb + SIZE_SZ)) - newmem = oldmem; /* do nothing */ - else { - /* Must alloc, copy, free. */ - newmem = mALLOc(nb - MALLOC_ALIGN_MASK); - if (newmem != 0) { - MALLOC_COPY(newmem, oldmem, oldsize - 2*SIZE_SZ); - fREe(oldmem); - } - } - return newmem; - -#else - /* If !HAVE_MMAP, but chunk_is_mmapped, user must have overwritten mem */ - check_malloc_state(); - MALLOC_FAILURE_ACTION; - return 0; -#endif - } -} - -/* - ------------------------------ memalign ------------------------------ -*/ - -#if __STD_C -Void_t* mEMALIGn(size_t alignment, size_t bytes) -#else -Void_t* mEMALIGn(alignment, bytes) size_t alignment; size_t bytes; -#endif -{ - INTERNAL_SIZE_T nb; /* padded request size */ - char* m; /* memory returned by malloc call */ - mchunkptr p; /* corresponding chunk */ - char* brk; /* alignment point within p */ - mchunkptr newp; /* chunk to return */ - INTERNAL_SIZE_T newsize; /* its size */ - INTERNAL_SIZE_T leadsize; /* leading space before alignment point */ - mchunkptr remainder; /* spare room at end to split off */ - CHUNK_SIZE_T remainder_size; /* its size */ - INTERNAL_SIZE_T size; - - /* If need less alignment than we give anyway, just relay to malloc */ - - if (alignment <= MALLOC_ALIGNMENT) return mALLOc(bytes); - - /* Otherwise, ensure that it is at least a minimum chunk size */ - - if (alignment < MINSIZE) alignment = MINSIZE; - - /* Make sure alignment is power of 2 (in case MINSIZE is not). */ - if ((alignment & (alignment - 1)) != 0) { - size_t a = MALLOC_ALIGNMENT * 2; - while ((CHUNK_SIZE_T)a < (CHUNK_SIZE_T)alignment) a <<= 1; - alignment = a; - } - - checked_request2size(bytes, nb); - - /* - Strategy: find a spot within that chunk that meets the alignment - request, and then possibly free the leading and trailing space. - */ - - - /* Call malloc with worst case padding to hit alignment. */ - - m = (char*)(mALLOc(nb + alignment + MINSIZE)); - - if (m == 0) return 0; /* propagate failure */ - - p = mem2chunk(m); - - if ((((PTR_UINT)(m)) % alignment) != 0) { /* misaligned */ - - /* - Find an aligned spot inside chunk. Since we need to give back - leading space in a chunk of at least MINSIZE, if the first - calculation places us at a spot with less than MINSIZE leader, - we can move to the next aligned spot -- we've allocated enough - total room so that this is always possible. 
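    (A worked instance of the rounding below, with made-up numbers: for
    m == 0x1008 and alignment == 0x40,

        (m + alignment - 1) & -alignment  ==  0x1047 & ~0x3f  ==  0x1040

    the first 64-byte boundary at or above m.  If that boundary leaves a
    leader smaller than MINSIZE, one further alignment step is taken; the
    nb + alignment + MINSIZE padding requested above guarantees the chunk
    is large enough for this.)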
- */ - - brk = (char*)mem2chunk((PTR_UINT)(((PTR_UINT)(m + alignment - 1)) & - -((signed long) alignment))); - if ((CHUNK_SIZE_T)(brk - (char*)(p)) < MINSIZE) - brk += alignment; - - newp = (mchunkptr)brk; - leadsize = brk - (char*)(p); - newsize = chunksize(p) - leadsize; - - /* For mmapped chunks, just adjust offset */ - if (chunk_is_mmapped(p)) { - newp->prev_size = p->prev_size + leadsize; - set_head(newp, newsize|IS_MMAPPED); - return chunk2mem(newp); - } - - /* Otherwise, give back leader, use the rest */ - set_head(newp, newsize | PREV_INUSE); - set_inuse_bit_at_offset(newp, newsize); - set_head_size(p, leadsize); - fREe(chunk2mem(p)); - p = newp; - - assert (newsize >= nb && - (((PTR_UINT)(chunk2mem(p))) % alignment) == 0); - } - - /* Also give back spare room at the end */ - if (!chunk_is_mmapped(p)) { - size = chunksize(p); - if ((CHUNK_SIZE_T)(size) > (CHUNK_SIZE_T)(nb + MINSIZE)) { - remainder_size = size - nb; - remainder = chunk_at_offset(p, nb); - set_head(remainder, remainder_size | PREV_INUSE); - set_head_size(p, nb); - fREe(chunk2mem(remainder)); - } - } - - check_inuse_chunk(p); - return chunk2mem(p); -} - -/* - ------------------------------ calloc ------------------------------ -*/ - -#if __STD_C -Void_t* cALLOc(size_t n_elements, size_t elem_size) -#else -Void_t* cALLOc(n_elements, elem_size) size_t n_elements; size_t elem_size; -#endif -{ - mchunkptr p; - CHUNK_SIZE_T clearsize; - CHUNK_SIZE_T nclears; - INTERNAL_SIZE_T* d; - - Void_t* mem = mALLOc(n_elements * elem_size); - - if (mem != 0) { - p = mem2chunk(mem); - - if (!chunk_is_mmapped(p)) - { - /* - Unroll clear of <= 36 bytes (72 if 8byte sizes) - We know that contents have an odd number of - INTERNAL_SIZE_T-sized words; minimally 3. - */ - - d = (INTERNAL_SIZE_T*)mem; - clearsize = chunksize(p) - SIZE_SZ; - nclears = clearsize / sizeof(INTERNAL_SIZE_T); - assert(nclears >= 3); - - if (nclears > 9) - MALLOC_ZERO(d, clearsize); - - else { - *(d+0) = 0; - *(d+1) = 0; - *(d+2) = 0; - if (nclears > 4) { - *(d+3) = 0; - *(d+4) = 0; - if (nclears > 6) { - *(d+5) = 0; - *(d+6) = 0; - if (nclears > 8) { - *(d+7) = 0; - *(d+8) = 0; - } - } - } - } - } -#if ! MMAP_CLEARS - else - { - d = (INTERNAL_SIZE_T*)mem; - /* - Note the additional SIZE_SZ - */ - clearsize = chunksize(p) - 2*SIZE_SZ; - MALLOC_ZERO(d, clearsize); - } -#endif - } - return mem; -} - -/* - ------------------------------ cfree ------------------------------ -*/ - -#if __STD_C -void cFREe(Void_t *mem) -#else -void cFREe(mem) Void_t *mem; -#endif -{ - fREe(mem); -} - -/* - ------------------------- independent_calloc ------------------------- -*/ - -#if __STD_C -Void_t** iCALLOc(size_t n_elements, size_t elem_size, Void_t* chunks[]) -#else -Void_t** iCALLOc(n_elements, elem_size, chunks) size_t n_elements; size_t elem_size; Void_t* chunks[]; -#endif -{ - size_t sz = elem_size; /* serves as 1-element array */ - /* opts arg of 3 means all elements are same size, and should be cleared */ - return iALLOc(n_elements, &sz, 3, chunks); -} - -/* - ------------------------- independent_comalloc ------------------------- -*/ - -#if __STD_C -Void_t** iCOMALLOc(size_t n_elements, size_t sizes[], Void_t* chunks[]) -#else -Void_t** iCOMALLOc(n_elements, sizes, chunks) size_t n_elements; size_t sizes[]; Void_t* chunks[]; -#endif -{ - return iALLOc(n_elements, sizes, 0, chunks); -} - - -/* - ------------------------------ ialloc ------------------------------ - ialloc provides common support for independent_X routines, handling all of - the combinations that can result. 
- - The opts arg has: - bit 0 set if all elements are same size (using sizes[0]) - bit 1 set if elements should be zeroed -*/ - - -#if __STD_C -static Void_t** iALLOc(size_t n_elements, - size_t* sizes, - int opts, - Void_t* chunks[]) -#else -static Void_t** iALLOc(n_elements, sizes, opts, chunks) size_t n_elements; size_t* sizes; int opts; Void_t* chunks[]; -#endif -{ - mstate av = get_malloc_state(); - INTERNAL_SIZE_T element_size; /* chunksize of each element, if all same */ - INTERNAL_SIZE_T contents_size; /* total size of elements */ - INTERNAL_SIZE_T array_size; /* request size of pointer array */ - Void_t* mem; /* malloced aggregate space */ - mchunkptr p; /* corresponding chunk */ - INTERNAL_SIZE_T remainder_size; /* remaining bytes while splitting */ - Void_t** marray; /* either "chunks" or malloced ptr array */ - mchunkptr array_chunk; /* chunk for malloced ptr array */ - int mmx; /* to disable mmap */ - INTERNAL_SIZE_T size; - size_t i; - - /* Ensure initialization */ - if (av->max_fast == 0) malloc_consolidate(av); - - /* compute array length, if needed */ - if (chunks != 0) { - if (n_elements == 0) - return chunks; /* nothing to do */ - marray = chunks; - array_size = 0; - } - else { - /* if empty req, must still return chunk representing empty array */ - if (n_elements == 0) - return (Void_t**) mALLOc(0); - marray = 0; - array_size = request2size(n_elements * (sizeof(Void_t*))); - } - - /* compute total element size */ - if (opts & 0x1) { /* all-same-size */ - element_size = request2size(*sizes); - contents_size = n_elements * element_size; - } - else { /* add up all the sizes */ - element_size = 0; - contents_size = 0; - for (i = 0; i != n_elements; ++i) - contents_size += request2size(sizes[i]); - } - - /* subtract out alignment bytes from total to minimize overallocation */ - size = contents_size + array_size - MALLOC_ALIGN_MASK; - - /* - Allocate the aggregate chunk. - But first disable mmap so malloc won't use it, since - we would not be able to later free/realloc space internal - to a segregated mmap region. 
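    A hypothetical use of the comalloc entry point served here, to make
    the aggregate-then-split behaviour concrete:

        size_t sizes[3] = { sizeof(struct hdr), 4096, sizeof(struct tlr) };
        void *parts[3];
        if (iCOMALLOc(3, sizes, parts) != 0) {
            // parts[0..2] were carved from one contiguous chunk, yet
            // each must later be acceptable to fREe() on its own
        }

    (struct hdr and struct tlr are invented.)  Because every element ends
    up on the ordinary free path, the backing chunk cannot come from a
    segregated mmap region.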
- */ - mmx = av->n_mmaps_max; /* disable mmap */ - av->n_mmaps_max = 0; - mem = mALLOc(size); - av->n_mmaps_max = mmx; /* reset mmap */ - if (mem == 0) - return 0; - - p = mem2chunk(mem); - assert(!chunk_is_mmapped(p)); - remainder_size = chunksize(p); - - if (opts & 0x2) { /* optionally clear the elements */ - MALLOC_ZERO(mem, remainder_size - SIZE_SZ - array_size); - } - - /* If not provided, allocate the pointer array as final part of chunk */ - if (marray == 0) { - array_chunk = chunk_at_offset(p, contents_size); - marray = (Void_t**) (chunk2mem(array_chunk)); - set_head(array_chunk, (remainder_size - contents_size) | PREV_INUSE); - remainder_size = contents_size; - } - - /* split out elements */ - for (i = 0; ; ++i) { - marray[i] = chunk2mem(p); - if (i != n_elements-1) { - if (element_size != 0) - size = element_size; - else - size = request2size(sizes[i]); - remainder_size -= size; - set_head(p, size | PREV_INUSE); - p = chunk_at_offset(p, size); - } - else { /* the final element absorbs any overallocation slop */ - set_head(p, remainder_size | PREV_INUSE); - break; - } - } - -#if DEBUG - if (marray != chunks) { - /* final element must have exactly exhausted chunk */ - if (element_size != 0) - assert(remainder_size == element_size); - else - assert(remainder_size == request2size(sizes[i])); - check_inuse_chunk(mem2chunk(marray)); - } - - for (i = 0; i != n_elements; ++i) - check_inuse_chunk(mem2chunk(marray[i])); -#endif - - return marray; -} - - -/* - ------------------------------ valloc ------------------------------ -*/ - -#if __STD_C -Void_t* vALLOc(size_t bytes) -#else -Void_t* vALLOc(bytes) size_t bytes; -#endif -{ - /* Ensure initialization */ - mstate av = get_malloc_state(); - if (av->max_fast == 0) malloc_consolidate(av); - return mEMALIGn(av->pagesize, bytes); -} - -/* - ------------------------------ pvalloc ------------------------------ -*/ - - -#if __STD_C -Void_t* pVALLOc(size_t bytes) -#else -Void_t* pVALLOc(bytes) size_t bytes; -#endif -{ - mstate av = get_malloc_state(); - size_t pagesz; - - /* Ensure initialization */ - if (av->max_fast == 0) malloc_consolidate(av); - pagesz = av->pagesize; - return mEMALIGn(pagesz, (bytes + pagesz - 1) & ~(pagesz - 1)); -} - - -/* - ------------------------------ malloc_trim ------------------------------ -*/ - -#if __STD_C -int mTRIm(size_t pad) -#else -int mTRIm(pad) size_t pad; -#endif -{ - mstate av = get_malloc_state(); - /* Ensure initialization/consolidation */ - malloc_consolidate(av); - -#ifndef MORECORE_CANNOT_TRIM - return sYSTRIm(pad, av); -#else - return 0; -#endif -} - - -/* - ------------------------- malloc_usable_size ------------------------- -*/ - -#if __STD_C -size_t mUSABLe(Void_t* mem) -#else -size_t mUSABLe(mem) Void_t* mem; -#endif -{ - mchunkptr p; - if (mem != 0) { - p = mem2chunk(mem); - if (chunk_is_mmapped(p)) - return chunksize(p) - 2*SIZE_SZ; - else if (inuse(p)) - return chunksize(p) - SIZE_SZ; - } - return 0; -} - -/* - ------------------------------ mallinfo ------------------------------ -*/ - -struct mallinfo mALLINFo() -{ - mstate av = get_malloc_state(); - struct mallinfo mi; - int i; - mbinptr b; - mchunkptr p; - INTERNAL_SIZE_T avail; - INTERNAL_SIZE_T fastavail; - int nblocks; - int nfastblocks; - - /* Ensure initialization */ - if (av->top == 0) malloc_consolidate(av); - - check_malloc_state(); - - /* Account for top */ - avail = chunksize(av->top); - nblocks = 1; /* top always exists */ - - /* traverse fastbins */ - nfastblocks = 0; - fastavail = 0; - - for (i = 0; i < NFASTBINS; ++i) 
{ - for (p = av->fastbins[i]; p != 0; p = p->fd) { - ++nfastblocks; - fastavail += chunksize(p); - } - } - - avail += fastavail; - - /* traverse regular bins */ - for (i = 1; i < NBINS; ++i) { - b = bin_at(av, i); - for (p = last(b); p != b; p = p->bk) { - ++nblocks; - avail += chunksize(p); - } - } - - mi.smblks = nfastblocks; - mi.ordblks = nblocks; - mi.fordblks = avail; - mi.uordblks = av->sbrked_mem - avail; - mi.arena = av->sbrked_mem; - mi.hblks = av->n_mmaps; - mi.hblkhd = av->mmapped_mem; - mi.fsmblks = fastavail; - mi.keepcost = chunksize(av->top); - mi.usmblks = av->max_total_mem; - return mi; -} - -/* - ------------------------------ malloc_stats ------------------------------ -*/ - -void mSTATs() -{ - struct mallinfo mi = mALLINFo(); - -#ifdef WIN32 - { - CHUNK_SIZE_T free, reserved, committed; - vminfo (&free, &reserved, &committed); - fprintf(stderr, "free bytes = %10lu\n", - free); - fprintf(stderr, "reserved bytes = %10lu\n", - reserved); - fprintf(stderr, "committed bytes = %10lu\n", - committed); - } -#endif - -/* RN XXX */ - printf("max system bytes = %10lu\n", - (CHUNK_SIZE_T)(mi.usmblks)); - printf("system bytes = %10lu\n", - (CHUNK_SIZE_T)(mi.arena + mi.hblkhd)); - printf("in use bytes = %10lu\n", - (CHUNK_SIZE_T)(mi.uordblks + mi.hblkhd)); - -#ifdef WIN32 - { - CHUNK_SIZE_T kernel, user; - if (cpuinfo (TRUE, &kernel, &user)) { - fprintf(stderr, "kernel ms = %10lu\n", - kernel); - fprintf(stderr, "user ms = %10lu\n", - user); - } - } -#endif -} - - -/* - ------------------------------ mallopt ------------------------------ -*/ - -#if __STD_C -int mALLOPt(int param_number, int value) -#else -int mALLOPt(param_number, value) int param_number; int value; -#endif -{ - mstate av = get_malloc_state(); - /* Ensure initialization/consolidation */ - malloc_consolidate(av); - - switch(param_number) { - case M_MXFAST: - if (value >= 0 && value <= MAX_FAST_SIZE) { - set_max_fast(av, value); - return 1; - } - else - return 0; - - case M_TRIM_THRESHOLD: - av->trim_threshold = value; - return 1; - - case M_TOP_PAD: - av->top_pad = value; - return 1; - - case M_MMAP_THRESHOLD: - av->mmap_threshold = value; - return 1; - - case M_MMAP_MAX: -#if !HAVE_MMAP - if (value != 0) - return 0; -#endif - av->n_mmaps_max = value; - return 1; - - default: - return 0; - } -} - - -/* - -------------------- Alternative MORECORE functions -------------------- -*/ - - -/* - General Requirements for MORECORE. - - The MORECORE function must have the following properties: - - If MORECORE_CONTIGUOUS is false: - - * MORECORE must allocate in multiples of pagesize. It will - only be called with arguments that are multiples of pagesize. - - * MORECORE(0) must return an address that is at least - MALLOC_ALIGNMENT aligned. (Page-aligning always suffices.) - - else (i.e. If MORECORE_CONTIGUOUS is true): - - * Consecutive calls to MORECORE with positive arguments - return increasing addresses, indicating that space has been - contiguously extended. - - * MORECORE need not allocate in multiples of pagesize. - Calls to MORECORE need not have args of multiples of pagesize. - - * MORECORE need not page-align. - - In either case: - - * MORECORE may allocate more memory than requested. (Or even less, - but this will generally result in a malloc failure.) - - * MORECORE must not allocate memory when given argument zero, but - instead return one past the end address of memory from previous - nonzero call. 
This malloc does NOT call MORECORE(0) - until at least one call with positive arguments is made, so - the initial value returned is not important. - - * Even though consecutive calls to MORECORE need not return contiguous - addresses, it must be OK for malloc'ed chunks to span multiple - regions in those cases where they do happen to be contiguous. - - * MORECORE need not handle negative arguments -- it may instead - just return MORECORE_FAILURE when given negative arguments. - Negative arguments are always multiples of pagesize. MORECORE - must not misinterpret negative args as large positive unsigned - args. You can suppress all such calls from even occurring by defining - MORECORE_CANNOT_TRIM. - - There is some variation across systems about the type of the - argument to sbrk/MORECORE. If size_t is unsigned, then it cannot - actually be size_t, because sbrk supports negative args, so it is - normally the signed type of the same width as size_t (sometimes - declared as "intptr_t", and sometimes "ptrdiff_t"). It doesn't much - matter though. Internally, we use "long" as arguments, which should - work across all reasonable possibilities. - - Additionally, if MORECORE ever returns failure for a positive - request, and HAVE_MMAP is true, then mmap is used as a noncontiguous - system allocator. This is a useful backup strategy for systems with - holes in address spaces -- in this case sbrk cannot contiguously - expand the heap, but mmap may be able to map noncontiguous space. - - If you'd like mmap to ALWAYS be used, you can define MORECORE to be - a function that always returns MORECORE_FAILURE. - - Malloc only has limited ability to detect failures of MORECORE - to supply contiguous space when it says it can. In particular, - multithreaded programs that do not use locks may result in - race conditions across calls to MORECORE that result in gaps - that cannot be detected as such, and subsequent corruption. - - If you are using this malloc with something other than sbrk (or its - emulation) to supply memory regions, you probably want to set - MORECORE_CONTIGUOUS as false. As an example, here is a custom - allocator kindly contributed for pre-OSX macOS. It uses virtually - but not necessarily physically contiguous non-paged memory (locked - in, present and won't get swapped out). You can use it by - uncommenting this section, adding some #includes, and setting up the - appropriate defines above: - - #define MORECORE osMoreCore - #define MORECORE_CONTIGUOUS 0 - - There is also a shutdown routine that should somehow be called for - cleanup upon program exit.
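    The mmap-only configuration mentioned above needs nothing more than a
    stub; a minimal sketch (failMoreCore is an invented name):

        static void *failMoreCore(int size)
        {
            return (void *) MORECORE_FAILURE;  // force the mmap backup path
        }

        #define MORECORE failMoreCore
        #define MORECORE_CONTIGUOUS 0

    The pool-based osMoreCore below shows the opposite, fully customized
    extreme.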
- - #define MAX_POOL_ENTRIES 100 - #define MINIMUM_MORECORE_SIZE (64 * 1024) - static int next_os_pool; - void *our_os_pools[MAX_POOL_ENTRIES]; - - void *osMoreCore(int size) - { - void *ptr = 0; - static void *sbrk_top = 0; - - if (size > 0) - { - if (size < MINIMUM_MORECORE_SIZE) - size = MINIMUM_MORECORE_SIZE; - if (CurrentExecutionLevel() == kTaskLevel) - ptr = PoolAllocateResident(size + RM_PAGE_SIZE, 0); - if (ptr == 0) - { - return (void *) MORECORE_FAILURE; - } - // save ptrs so they can be freed during cleanup - our_os_pools[next_os_pool] = ptr; - next_os_pool++; - ptr = (void *) ((((CHUNK_SIZE_T) ptr) + RM_PAGE_MASK) & ~RM_PAGE_MASK); - sbrk_top = (char *) ptr + size; - return ptr; - } - else if (size < 0) - { - // we don't currently support shrink behavior - return (void *) MORECORE_FAILURE; - } - else - { - return sbrk_top; - } - } - - // cleanup any allocated memory pools - // called as last thing before shutting down driver - - void osCleanupMem(void) - { - void **ptr; - - for (ptr = our_os_pools; ptr < &our_os_pools[MAX_POOL_ENTRIES]; ptr++) - if (*ptr) - { - PoolDeallocate(*ptr); - *ptr = 0; - } - } - -*/ - - -/* - -------------------------------------------------------------- - - Emulation of sbrk for win32. - Donated by J. Walter <Walter@xxxxxxxxxxxx>. - For additional information about this code, and malloc on Win32, see - http://www.genesys-e.de/jwalter/ -*/ - - -#ifdef WIN32 - -#ifdef _DEBUG -/* #define TRACE */ -#endif - -/* Support for USE_MALLOC_LOCK */ -#ifdef USE_MALLOC_LOCK - -/* Wait for spin lock */ -static int slwait (int *sl) { - while (InterlockedCompareExchange ((void **) sl, (void *) 1, (void *) 0) != 0) - Sleep (0); - return 0; -} - -/* Release spin lock */ -static int slrelease (int *sl) { - InterlockedExchange (sl, 0); - return 0; -} - -#ifdef NEEDED -/* Spin lock for emulation code */ -static int g_sl; -#endif - -#endif /* USE_MALLOC_LOCK */ - -/* getpagesize for windows */ -static long getpagesize (void) { - static long g_pagesize = 0; - if (! g_pagesize) { - SYSTEM_INFO system_info; - GetSystemInfo (&system_info); - g_pagesize = system_info.dwPageSize; - } - return g_pagesize; -} -static long getregionsize (void) { - static long g_regionsize = 0; - if (! g_regionsize) { - SYSTEM_INFO system_info; - GetSystemInfo (&system_info); - g_regionsize = system_info.dwAllocationGranularity; - } - return g_regionsize; -} - -/* A region list entry */ -typedef struct _region_list_entry { - void *top_allocated; - void *top_committed; - void *top_reserved; - long reserve_size; - struct _region_list_entry *previous; -} region_list_entry; - -/* Allocate and link a region entry in the region list */ -static int region_list_append (region_list_entry **last, void *base_reserved, long reserve_size) { - region_list_entry *next = HeapAlloc (GetProcessHeap (), 0, sizeof (region_list_entry)); - if (! next) - return FALSE; - next->top_allocated = (char *) base_reserved; - next->top_committed = (char *) base_reserved; - next->top_reserved = (char *) base_reserved + reserve_size; - next->reserve_size = reserve_size; - next->previous = *last; - *last = next; - return TRUE; -} -/* Free and unlink the last region entry from the region list */ -static int region_list_remove (region_list_entry **last) { - region_list_entry *previous = (*last)->previous; - if (! 
HeapFree (GetProcessHeap (), sizeof (region_list_entry), *last)) - return FALSE; - *last = previous; - return TRUE; -} - -#define CEIL(size,to) (((size)+(to)-1)&~((to)-1)) -#define FLOOR(size,to) ((size)&~((to)-1)) - -#define SBRK_SCALE 0 -/* #define SBRK_SCALE 1 */ -/* #define SBRK_SCALE 2 */ -/* #define SBRK_SCALE 4 */ - -/* sbrk for windows */ -static void *sbrk (long size) { - static long g_pagesize, g_my_pagesize; - static long g_regionsize, g_my_regionsize; - static region_list_entry *g_last; - void *result = (void *) MORECORE_FAILURE; -#ifdef TRACE - printf ("sbrk %d\n", size); -#endif -#if defined (USE_MALLOC_LOCK) && defined (NEEDED) - /* Wait for spin lock */ - slwait (&g_sl); -#endif - /* First time initialization */ - if (! g_pagesize) { - g_pagesize = getpagesize (); - g_my_pagesize = g_pagesize << SBRK_SCALE; - } - if (! g_regionsize) { - g_regionsize = getregionsize (); - g_my_regionsize = g_regionsize << SBRK_SCALE; - } - if (! g_last) { - if (! region_list_append (&g_last, 0, 0)) - goto sbrk_exit; - } - /* Assert invariants */ - assert (g_last); - assert ((char *) g_last->top_reserved - g_last->reserve_size <= (char *) g_last->top_allocated && - g_last->top_allocated <= g_last->top_committed); - assert ((char *) g_last->top_reserved - g_last->reserve_size <= (char *) g_last->top_committed && - g_last->top_committed <= g_last->top_reserved && - (unsigned) g_last->top_committed % g_pagesize == 0); - assert ((unsigned) g_last->top_reserved % g_regionsize == 0); - assert ((unsigned) g_last->reserve_size % g_regionsize == 0); - /* Allocation requested? */ - if (size >= 0) { - /* Allocation size is the requested size */ - long allocate_size = size; - /* Compute the size to commit */ - long to_commit = (char *) g_last->top_allocated + allocate_size - (char *) g_last->top_committed; - /* Do we reach the commit limit? */ - if (to_commit > 0) { - /* Round size to commit */ - long commit_size = CEIL (to_commit, g_my_pagesize); - /* Compute the size to reserve */ - long to_reserve = (char *) g_last->top_committed + commit_size - (char *) g_last->top_reserved; - /* Do we reach the reserve limit? */ - if (to_reserve > 0) { - /* Compute the remaining size to commit in the current region */ - long remaining_commit_size = (char *) g_last->top_reserved - (char *) g_last->top_committed; - if (remaining_commit_size > 0) { - /* Assert preconditions */ - assert ((unsigned) g_last->top_committed % g_pagesize == 0); - assert (0 < remaining_commit_size && remaining_commit_size % g_pagesize == 0); { - /* Commit this */ - void *base_committed = VirtualAlloc (g_last->top_committed, remaining_commit_size, - MEM_COMMIT, PAGE_READWRITE); - /* Check returned pointer for consistency */ - if (base_committed != g_last->top_committed) - goto sbrk_exit; - /* Assert postconditions */ - assert ((unsigned) base_committed % g_pagesize == 0); -#ifdef TRACE - printf ("Commit %p %d\n", base_committed, remaining_commit_size); -#endif - /* Adjust the regions commit top */ - g_last->top_committed = (char *) base_committed + remaining_commit_size; - } - } { - /* Now we are going to search and reserve. 
*/ - int contiguous = -1; - int found = FALSE; - MEMORY_BASIC_INFORMATION memory_info; - void *base_reserved; - long reserve_size; - do { - /* Assume contiguous memory */ - contiguous = TRUE; - /* Round size to reserve */ - reserve_size = CEIL (to_reserve, g_my_regionsize); - /* Start with the current region's top */ - memory_info.BaseAddress = g_last->top_reserved; - /* Assert preconditions */ - assert ((unsigned) memory_info.BaseAddress % g_pagesize == 0); - assert (0 < reserve_size && reserve_size % g_regionsize == 0); - while (VirtualQuery (memory_info.BaseAddress, &memory_info, sizeof (memory_info))) { - /* Assert postconditions */ - assert ((unsigned) memory_info.BaseAddress % g_pagesize == 0); -#ifdef TRACE - printf ("Query %p %d %s\n", memory_info.BaseAddress, memory_info.RegionSize, - memory_info.State == MEM_FREE ? "FREE": - (memory_info.State == MEM_RESERVE ? "RESERVED": - (memory_info.State == MEM_COMMIT ? "COMMITTED": "?"))); -#endif - /* Region is free, well aligned and big enough: we are done */ - if (memory_info.State == MEM_FREE && - (unsigned) memory_info.BaseAddress % g_regionsize == 0 && - memory_info.RegionSize >= (unsigned) reserve_size) { - found = TRUE; - break; - } - /* From now on we can't get contiguous memory! */ - contiguous = FALSE; - /* Recompute size to reserve */ - reserve_size = CEIL (allocate_size, g_my_regionsize); - memory_info.BaseAddress = (char *) memory_info.BaseAddress + memory_info.RegionSize; - /* Assert preconditions */ - assert ((unsigned) memory_info.BaseAddress % g_pagesize == 0); - assert (0 < reserve_size && reserve_size % g_regionsize == 0); - } - /* Search failed? */ - if (! found) - goto sbrk_exit; - /* Assert preconditions */ - assert ((unsigned) memory_info.BaseAddress % g_regionsize == 0); - assert (0 < reserve_size && reserve_size % g_regionsize == 0); - /* Try to reserve this */ - base_reserved = VirtualAlloc (memory_info.BaseAddress, reserve_size, - MEM_RESERVE, PAGE_NOACCESS); - if (! base_reserved) { - int rc = GetLastError (); - if (rc != ERROR_INVALID_ADDRESS) - goto sbrk_exit; - } - /* A null pointer signals (hopefully) a race condition with another thread. */ - /* In this case, we try again. */ - } while (! base_reserved); - /* Check returned pointer for consistency */ - if (memory_info.BaseAddress && base_reserved != memory_info.BaseAddress) - goto sbrk_exit; - /* Assert postconditions */ - assert ((unsigned) base_reserved % g_regionsize == 0); -#ifdef TRACE - printf ("Reserve %p %d\n", base_reserved, reserve_size); -#endif - /* Did we get contiguous memory? */ - if (contiguous) { - long start_size = (char *) g_last->top_committed - (char *) g_last->top_allocated; - /* Adjust allocation size */ - allocate_size -= start_size; - /* Adjust the regions allocation top */ - g_last->top_allocated = g_last->top_committed; - /* Recompute the size to commit */ - to_commit = (char *) g_last->top_allocated + allocate_size - (char *) g_last->top_committed; - /* Round size to commit */ - commit_size = CEIL (to_commit, g_my_pagesize); - } - /* Append the new region to the list */ - if (! region_list_append (&g_last, base_reserved, reserve_size)) - goto sbrk_exit; - /* Didn't we get contiguous memory? */ - if (! 
contiguous) { - /* Recompute the size to commit */ - to_commit = (char *) g_last->top_allocated + allocate_size - (char *) g_last->top_committed; - /* Round size to commit */ - commit_size = CEIL (to_commit, g_my_pagesize); - } - } - } - /* Assert preconditions */ - assert ((unsigned) g_last->top_committed % g_pagesize == 0); - assert (0 < commit_size && commit_size % g_pagesize == 0); { - /* Commit this */ - void *base_committed = VirtualAlloc (g_last->top_committed, commit_size, - MEM_COMMIT, PAGE_READWRITE); - /* Check returned pointer for consistency */ - if (base_committed != g_last->top_committed) - goto sbrk_exit; - /* Assert postconditions */ - assert ((unsigned) base_committed % g_pagesize == 0); -#ifdef TRACE - printf ("Commit %p %d\n", base_committed, commit_size); -#endif - /* Adjust the regions commit top */ - g_last->top_committed = (char *) base_committed + commit_size; - } - } - /* Adjust the regions allocation top */ - g_last->top_allocated = (char *) g_last->top_allocated + allocate_size; - result = (char *) g_last->top_allocated - size; - /* Deallocation requested? */ - } else if (size < 0) { - long deallocate_size = - size; - /* As long as we have a region to release */ - while ((char *) g_last->top_allocated - deallocate_size < (char *) g_last->top_reserved - g_last->reserve_size) { - /* Get the size to release */ - long release_size = g_last->reserve_size; - /* Get the base address */ - void *base_reserved = (char *) g_last->top_reserved - release_size; - /* Assert preconditions */ - assert ((unsigned) base_reserved % g_regionsize == 0); - assert (0 < release_size && release_size % g_regionsize == 0); { - /* Release this */ - int rc = VirtualFree (base_reserved, 0, - MEM_RELEASE); - /* Check returned code for consistency */ - if (! rc) - goto sbrk_exit; -#ifdef TRACE - printf ("Release %p %d\n", base_reserved, release_size); -#endif - } - /* Adjust deallocation size */ - deallocate_size -= (char *) g_last->top_allocated - (char *) base_reserved; - /* Remove the old region from the list */ - if (! region_list_remove (&g_last)) - goto sbrk_exit; - } { - /* Compute the size to decommit */ - long to_decommit = (char *) g_last->top_committed - ((char *) g_last->top_allocated - deallocate_size); - if (to_decommit >= g_my_pagesize) { - /* Compute the size to decommit */ - long decommit_size = FLOOR (to_decommit, g_my_pagesize); - /* Compute the base address */ - void *base_committed = (char *) g_last->top_committed - decommit_size; - /* Assert preconditions */ - assert ((unsigned) base_committed % g_pagesize == 0); - assert (0 < decommit_size && decommit_size % g_pagesize == 0); { - /* Decommit this */ - int rc = VirtualFree ((char *) base_committed, decommit_size, - MEM_DECOMMIT); - /* Check returned code for consistency */ - if (! 
rc) - goto sbrk_exit; -#ifdef TRACE - printf ("Decommit %p %d\n", base_committed, decommit_size); -#endif - } - /* Adjust deallocation size and regions commit and allocate top */ - deallocate_size -= (char *) g_last->top_allocated - (char *) base_committed; - g_last->top_committed = base_committed; - g_last->top_allocated = base_committed; - } - } - /* Adjust regions allocate top */ - g_last->top_allocated = (char *) g_last->top_allocated - deallocate_size; - /* Check for underflow */ - if ((char *) g_last->top_reserved - g_last->reserve_size > (char *) g_last->top_allocated || - g_last->top_allocated > g_last->top_committed) { - /* Adjust regions allocate top */ - g_last->top_allocated = (char *) g_last->top_reserved - g_last->reserve_size; - goto sbrk_exit; - } - result = g_last->top_allocated; - } - /* Assert invariants */ - assert (g_last); - assert ((char *) g_last->top_reserved - g_last->reserve_size <= (char *) g_last->top_allocated && - g_last->top_allocated <= g_last->top_committed); - assert ((char *) g_last->top_reserved - g_last->reserve_size <= (char *) g_last->top_committed && - g_last->top_committed <= g_last->top_reserved && - (unsigned) g_last->top_committed % g_pagesize == 0); - assert ((unsigned) g_last->top_reserved % g_regionsize == 0); - assert ((unsigned) g_last->reserve_size % g_regionsize == 0); - -sbrk_exit: -#if defined (USE_MALLOC_LOCK) && defined (NEEDED) - /* Release spin lock */ - slrelease (&g_sl); -#endif - return result; -} - -/* mmap for windows */ -static void *mmap (void *ptr, long size, long prot, long type, long handle, long arg) { - static long g_pagesize; - static long g_regionsize; -#ifdef TRACE - printf ("mmap %d\n", size); -#endif -#if defined (USE_MALLOC_LOCK) && defined (NEEDED) - /* Wait for spin lock */ - slwait (&g_sl); -#endif - /* First time initialization */ - if (! g_pagesize) - g_pagesize = getpagesize (); - if (! g_regionsize) - g_regionsize = getregionsize (); - /* Assert preconditions */ - assert ((unsigned) ptr % g_regionsize == 0); - assert (size % g_pagesize == 0); - /* Allocate this */ - ptr = VirtualAlloc (ptr, size, - MEM_RESERVE | MEM_COMMIT | MEM_TOP_DOWN, PAGE_READWRITE); - if (! ptr) { - ptr = (void *) MORECORE_FAILURE; - goto mmap_exit; - } - /* Assert postconditions */ - assert ((unsigned) ptr % g_regionsize == 0); -#ifdef TRACE - printf ("Commit %p %d\n", ptr, size); -#endif -mmap_exit: -#if defined (USE_MALLOC_LOCK) && defined (NEEDED) - /* Release spin lock */ - slrelease (&g_sl); -#endif - return ptr; -} - -/* munmap for windows */ -static long munmap (void *ptr, long size) { - static long g_pagesize; - static long g_regionsize; - int rc = MUNMAP_FAILURE; -#ifdef TRACE - printf ("munmap %p %d\n", ptr, size); -#endif -#if defined (USE_MALLOC_LOCK) && defined (NEEDED) - /* Wait for spin lock */ - slwait (&g_sl); -#endif - /* First time initialization */ - if (! g_pagesize) - g_pagesize = getpagesize (); - if (! g_regionsize) - g_regionsize = getregionsize (); - /* Assert preconditions */ - assert ((unsigned) ptr % g_regionsize == 0); - assert (size % g_pagesize == 0); - /* Free this */ - if (! 
VirtualFree (ptr, 0, - MEM_RELEASE)) - goto munmap_exit; - rc = 0; -#ifdef TRACE - printf ("Release %p %d\n", ptr, size); -#endif -munmap_exit: -#if defined (USE_MALLOC_LOCK) && defined (NEEDED) - /* Release spin lock */ - slrelease (&g_sl); -#endif - return rc; -} - -static void vminfo (CHUNK_SIZE_T *free, CHUNK_SIZE_T *reserved, CHUNK_SIZE_T *committed) { - MEMORY_BASIC_INFORMATION memory_info; - memory_info.BaseAddress = 0; - *free = *reserved = *committed = 0; - while (VirtualQuery (memory_info.BaseAddress, &memory_info, sizeof (memory_info))) { - switch (memory_info.State) { - case MEM_FREE: - *free += memory_info.RegionSize; - break; - case MEM_RESERVE: - *reserved += memory_info.RegionSize; - break; - case MEM_COMMIT: - *committed += memory_info.RegionSize; - break; - } - memory_info.BaseAddress = (char *) memory_info.BaseAddress + memory_info.RegionSize; - } -} - -static int cpuinfo (int whole, CHUNK_SIZE_T *kernel, CHUNK_SIZE_T *user) { - if (whole) { - __int64 creation64, exit64, kernel64, user64; - int rc = GetProcessTimes (GetCurrentProcess (), - (FILETIME *) &creation64, - (FILETIME *) &exit64, - (FILETIME *) &kernel64, - (FILETIME *) &user64); - if (! rc) { - *kernel = 0; - *user = 0; - return FALSE; - } - *kernel = (CHUNK_SIZE_T) (kernel64 / 10000); - *user = (CHUNK_SIZE_T) (user64 / 10000); - return TRUE; - } else { - __int64 creation64, exit64, kernel64, user64; - int rc = GetThreadTimes (GetCurrentThread (), - (FILETIME *) &creation64, - (FILETIME *) &exit64, - (FILETIME *) &kernel64, - (FILETIME *) &user64); - if (! rc) { - *kernel = 0; - *user = 0; - return FALSE; - } - *kernel = (CHUNK_SIZE_T) (kernel64 / 10000); - *user = (CHUNK_SIZE_T) (user64 / 10000); - return TRUE; - } -} - -#endif /* WIN32 */ - -/* ------------------------------------------------------------ -History: - V2.7.2 Sat Aug 17 09:07:30 2002 Doug Lea (dl at gee) - * Fix malloc_state bitmap array misdeclaration - - V2.7.1 Thu Jul 25 10:58:03 2002 Doug Lea (dl at gee) - * Allow tuning of FIRST_SORTED_BIN_SIZE - * Use PTR_UINT as type for all ptr->int casts. Thanks to John Belmonte. - * Better detection and support for non-contiguousness of MORECORE. - Thanks to Andreas Mueller, Conal Walsh, and Wolfram Gloger - * Bypass most of malloc if no frees. Thanks To Emery Berger. - * Fix freeing of old top non-contiguous chunk im sysmalloc. - * Raised default trim and map thresholds to 256K. - * Fix mmap-related #defines. Thanks to Lubos Lunak. - * Fix copy macros; added LACKS_FCNTL_H. Thanks to Neal Walfield. - * Branch-free bin calculation - * Default trim and mmap thresholds now 256K. - - V2.7.0 Sun Mar 11 14:14:06 2001 Doug Lea (dl at gee) - * Introduce independent_comalloc and independent_calloc. - Thanks to Michael Pachos for motivation and help. - * Make optional .h file available - * Allow > 2GB requests on 32bit systems. - * new WIN32 sbrk, mmap, munmap, lock code from <Walter@xxxxxxxxxxxx>. - Thanks also to Andreas Mueller <a.mueller at paradatec.de>, - and Anonymous. - * Allow override of MALLOC_ALIGNMENT (Thanks to Ruud Waij for - helping test this.) - * memalign: check alignment arg - * realloc: don't try to shift chunks backwards, since this - leads to more fragmentation in some programs and doesn't - seem to help in any others. 
- * Collect all cases in malloc requiring system memory into sYSMALLOc - * Use mmap as backup to sbrk - * Place all internal state in malloc_state - * Introduce fastbins (although similar to 2.5.1) - * Many minor tunings and cosmetic improvements - * Introduce USE_PUBLIC_MALLOC_WRAPPERS, USE_MALLOC_LOCK - * Introduce MALLOC_FAILURE_ACTION, MORECORE_CONTIGUOUS - Thanks to Tony E. Bennett <tbennett@xxxxxxxxxx> and others. - * Include errno.h to support default failure action. - - V2.6.6 Sun Dec 5 07:42:19 1999 Doug Lea (dl at gee) - * return null for negative arguments - * Added Several WIN32 cleanups from Martin C. Fong <mcfong at yahoo.com> - * Add 'LACKS_SYS_PARAM_H' for those systems without 'sys/param.h' - (e.g. WIN32 platforms) - * Cleanup header file inclusion for WIN32 platforms - * Cleanup code to avoid Microsoft Visual C++ compiler complaints - * Add 'USE_DL_PREFIX' to quickly allow co-existence with existing - memory allocation routines - * Set 'malloc_getpagesize' for WIN32 platforms (needs more work) - * Use 'assert' rather than 'ASSERT' in WIN32 code to conform to - usage of 'assert' in non-WIN32 code - * Improve WIN32 'sbrk()' emulation's 'findRegion()' routine to - avoid infinite loop - * Always call 'fREe()' rather than 'free()' - - V2.6.5 Wed Jun 17 15:57:31 1998 Doug Lea (dl at gee) - * Fixed ordering problem with boundary-stamping - - V2.6.3 Sun May 19 08:17:58 1996 Doug Lea (dl at gee) - * Added pvalloc, as recommended by H.J. Liu - * Added 64bit pointer support mainly from Wolfram Gloger - * Added anonymously donated WIN32 sbrk emulation - * Malloc, calloc, getpagesize: add optimizations from Raymond Nijssen - * malloc_extend_top: fix mask error that caused wastage after - foreign sbrks - * Add linux mremap support code from HJ Liu - - V2.6.2 Tue Dec 5 06:52:55 1995 Doug Lea (dl at gee) - * Integrated most documentation with the code. - * Add support for mmap, with help from - Wolfram Gloger (Gloger@xxxxxxxxxxxxxxxxxxx). - * Use last_remainder in more cases. - * Pack bins using idea from colin@xxxxxxxxxxxxxxx - * Use ordered bins instead of best-fit threshhold - * Eliminate block-local decls to simplify tracing and debugging. - * Support another case of realloc via move into top - * Fix error occuring when initial sbrk_base not word-aligned. - * Rely on page size for units instead of SBRK_UNIT to - avoid surprises about sbrk alignment conventions. - * Add mallinfo, mallopt. Thanks to Raymond Nijssen - (raymond@xxxxxxxxxxxxx) for the suggestion. - * Add `pad' argument to malloc_trim and top_pad mallopt parameter. - * More precautions for cases where other routines call sbrk, - courtesy of Wolfram Gloger (Gloger@xxxxxxxxxxxxxxxxxxx). - * Added macros etc., allowing use in linux libc from - H.J. Lu (hjl@xxxxxxxxxxxxxx) - * Inverted this history list - - V2.6.1 Sat Dec 2 14:10:57 1995 Doug Lea (dl at gee) - * Re-tuned and fixed to behave more nicely with V2.6.0 changes. - * Removed all preallocation code since under current scheme - the work required to undo bad preallocations exceeds - the work saved in good cases for most test programs. - * No longer use return list or unconsolidated bins since - no scheme using them consistently outperforms those that don't - given above changes. - * Use best fit for very large chunks to prevent some worst-cases. - * Added some support for debugging - - V2.6.0 Sat Nov 4 07:05:23 1995 Doug Lea (dl at gee) - * Removed footers when chunks are in use. Thanks to - Paul Wilson (wilson@xxxxxxxxxxxx) for the suggestion. 
- - V2.5.4 Wed Nov 1 07:54:51 1995 Doug Lea (dl at gee) - * Added malloc_trim, with help from Wolfram Gloger - (wmglo@xxxxxxxxxxxxxxxxxxxxxxxx). - - V2.5.3 Tue Apr 26 10:16:01 1994 Doug Lea (dl at g) - - V2.5.2 Tue Apr 5 16:20:40 1994 Doug Lea (dl at g) - * realloc: try to expand in both directions - * malloc: swap order of clean-bin strategy; - * realloc: only conditionally expand backwards - * Try not to scavenge used bins - * Use bin counts as a guide to preallocation - * Occasionally bin return list chunks in first scan - * Add a few optimizations from colin@xxxxxxxxxxxxxxx - - V2.5.1 Sat Aug 14 15:40:43 1993 Doug Lea (dl at g) - * faster bin computation & slightly different binning - * merged all consolidations to one part of malloc proper - (eliminating old malloc_find_space & malloc_clean_bin) - * Scan 2 returns chunks (not just 1) - * Propagate failure in realloc if malloc returns 0 - * Add stuff to allow compilation on non-ANSI compilers - from kpv@xxxxxxxxxxxxxxxx - - V2.5 Sat Aug 7 07:41:59 1993 Doug Lea (dl at g.oswego.edu) - * removed potential for odd address access in prev_chunk - * removed dependency on getpagesize.h - * misc cosmetics and a bit more internal documentation - * anticosmetics: mangled names in macros to evade debugger strangeness - * tested on sparc, hp-700, dec-mips, rs6000 - with gcc & native cc (hp, dec only) allowing - Detlefs & Zorn comparison study (in SIGPLAN Notices.) - - Trial version Fri Aug 28 13:14:29 1992 Doug Lea (dl at g.oswego.edu) - * Based loosely on libg++-1.2X malloc. (It retains some of the overall - structure of old version, but most details differ.) - -*/ diff -r de3576a1c62c -r dfaf788ab18c tools/vnet/INSTALL --- a/tools/vnet/INSTALL Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,31 +0,0 @@ -To compile and install run "make install"; if it fails or you need to reinstall -run "make clean" first or the build will fail, at least that is what I have -found under 2.6.10. - -Other important items: -1) You will need to have your xen0 kernel compiled with HMAC_SUPPORT - 2.6.x = (MAIN MENU: Cryptographic Options -> HMAC Support) - BEFORE running "make install". - -2) You will want at least some of the other alogorithms listed under - "Cryptographic Options" for the kernel compiled as modules. - -3) You will want the networking IPsec/VLAN options compiled in as modules - 2.6.x = (MAIN MENU: Device Drivers -> Networking Support -> - Networking Options -> - IP: AH transformation - IP: ESP transformation - IP: IPComp transformation - IP: tunnel transformation - - IPsec user configuration interface - - 802.1Q VLAN Support - -4) The module (vnet_module) will not properly load from the command line - with a "modprobe vnet_module". Use network-vnet to properly configure - your system and load the module for you. - -Please refer to the additional documentation found in tools/vnet/doc for -proper syntax and config file parameters. 
- diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/cpumask.h --- a/xen/arch/ia64/patch/linux-2.6.11/cpumask.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,12 +0,0 @@ ---- ../../linux-2.6.11/include/linux/cpumask.h 2005-03-02 00:38:00.000000000 -0700 -+++ include/asm-ia64/linux/cpumask.h 2005-04-28 13:21:20.000000000 -0600 -@@ -342,7 +342,9 @@ - */ - - extern cpumask_t cpu_possible_map; -+#ifndef XEN - extern cpumask_t cpu_online_map; -+#endif - extern cpumask_t cpu_present_map; - - #if NR_CPUS > 1 diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/efi.c --- a/xen/arch/ia64/patch/linux-2.6.11/efi.c Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,50 +0,0 @@ ---- ../../linux-2.6.11/arch/ia64/kernel/efi.c 2005-03-02 00:37:47.000000000 -0700 -+++ arch/ia64/efi.c 2005-06-09 06:15:36.000000000 -0600 -@@ -320,6 +320,16 @@ - if (!(md->attribute & EFI_MEMORY_WB)) - continue; - -+#ifdef XEN -+// this works around a problem in the ski bootloader -+{ -+ extern long running_on_sim; -+ if (running_on_sim && md->type != EFI_CONVENTIONAL_MEMORY) -+ continue; -+} -+// this is a temporary hack to avoid CONFIG_VIRTUAL_MEM_MAP -+ if (md->phys_addr >= 0x100000000) continue; -+#endif - /* - * granule_addr is the base of md's first granule. - * [granule_addr - first_non_wb_addr) is guaranteed to -@@ -719,6 +729,30 @@ - return 0; - } - -+#ifdef XEN -+// variation of efi_get_iobase which returns entire memory descriptor -+efi_memory_desc_t * -+efi_get_io_md (void) -+{ -+ void *efi_map_start, *efi_map_end, *p; -+ efi_memory_desc_t *md; -+ u64 efi_desc_size; -+ -+ efi_map_start = __va(ia64_boot_param->efi_memmap); -+ efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; -+ efi_desc_size = ia64_boot_param->efi_memdesc_size; -+ -+ for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { -+ md = p; -+ if (md->type == EFI_MEMORY_MAPPED_IO_PORT_SPACE) { -+ if (md->attribute & EFI_MEMORY_UC) -+ return md; -+ } -+ } -+ return 0; -+} -+#endif -+ - u32 - efi_mem_type (unsigned long phys_addr) - { diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/entry.S --- a/xen/arch/ia64/patch/linux-2.6.11/entry.S Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,237 +0,0 @@ ---- ../../linux-2.6.11/arch/ia64/kernel/entry.S 2005-03-02 00:37:50.000000000 -0700 -+++ arch/ia64/entry.S 2005-05-23 16:49:23.000000000 -0600 -@@ -46,6 +46,7 @@ - - #include "minstate.h" - -+#ifndef XEN - /* - * execve() is special because in case of success, we need to - * setup a null register window frame. -@@ -174,6 +175,7 @@ - mov rp=loc0 - br.ret.sptk.many rp - END(sys_clone) -+#endif /* !XEN */ - - /* - * prev_task <- ia64_switch_to(struct task_struct *next) -@@ -191,7 +193,11 @@ - movl r25=init_task - mov r27=IA64_KR(CURRENT_STACK) - adds r21=IA64_TASK_THREAD_KSP_OFFSET,in0 -+#ifdef XEN -+ dep r20=0,in0,60,4 // physical address of "next" -+#else - dep r20=0,in0,61,3 // physical address of "next" -+#endif - ;; - st8 [r22]=sp // save kernel stack pointer of old task - shr.u r26=r20,IA64_GRANULE_SHIFT -@@ -220,6 +226,16 @@ - br.ret.sptk.many rp // boogie on out in new context - - .map: -+#ifdef XEN -+ // avoid overlapping with kernel TR -+ movl r25=KERNEL_START -+ dep r23=0,in0,0,KERNEL_TR_PAGE_SHIFT -+ ;; -+ cmp.eq p7,p0=r25,r23 -+ ;; -+(p7) mov IA64_KR(CURRENT_STACK)=r26 // remember last page we mapped... 
-+(p7) br.cond.sptk .done -+#endif - rsm psr.ic // interrupts (psr.i) are already disabled here - movl r25=PAGE_KERNEL - ;; -@@ -376,7 +392,11 @@ - * - b7 holds address to return to - * - must not touch r8-r11 - */ -+#ifdef XEN -+GLOBAL_ENTRY(load_switch_stack) -+#else - ENTRY(load_switch_stack) -+#endif - .prologue - .altrp b7 - -@@ -470,6 +490,7 @@ - br.cond.sptk.many b7 - END(load_switch_stack) - -+#ifndef XEN - GLOBAL_ENTRY(__ia64_syscall) - .regstk 6,0,0,0 - mov r15=in5 // put syscall number in place -@@ -588,6 +609,7 @@ - } - .ret4: br.cond.sptk ia64_leave_kernel - END(ia64_strace_leave_kernel) -+#endif - - GLOBAL_ENTRY(ia64_ret_from_clone) - PT_REGS_UNWIND_INFO(0) -@@ -604,6 +626,15 @@ - */ - br.call.sptk.many rp=ia64_invoke_schedule_tail - } -+#ifdef XEN -+ // new domains are cloned but not exec'ed so switch to user mode here -+ cmp.ne pKStk,pUStk=r0,r0 -+#ifdef CONFIG_VTI -+ br.cond.spnt ia64_leave_hypervisor -+#else // CONFIG_VTI -+ br.cond.spnt ia64_leave_kernel -+#endif // CONFIG_VTI -+#else - .ret8: - adds r2=TI_FLAGS+IA64_TASK_SIZE,r13 - ;; -@@ -614,6 +645,7 @@ - ;; - cmp.ne p6,p0=r2,r0 - (p6) br.cond.spnt .strace_check_retval -+#endif - ;; // added stop bits to prevent r8 dependency - END(ia64_ret_from_clone) - // fall through -@@ -700,19 +732,27 @@ - .work_processed_syscall: - adds r2=PT(LOADRS)+16,r12 - adds r3=PT(AR_BSPSTORE)+16,r12 -+#ifdef XEN -+ ;; -+#else - adds r18=TI_FLAGS+IA64_TASK_SIZE,r13 - ;; - (p6) ld4 r31=[r18] // load current_thread_info()->flags -+#endif - ld8 r19=[r2],PT(B6)-PT(LOADRS) // load ar.rsc value for "loadrs" - mov b7=r0 // clear b7 - ;; - ld8 r23=[r3],PT(R11)-PT(AR_BSPSTORE) // load ar.bspstore (may be garbage) - ld8 r18=[r2],PT(R9)-PT(B6) // load b6 -+#ifndef XEN - (p6) and r15=TIF_WORK_MASK,r31 // any work other than TIF_SYSCALL_TRACE? -+#endif - ;; - mov r16=ar.bsp // M2 get existing backing store pointer -+#ifndef XEN - (p6) cmp4.ne.unc p6,p0=r15, r0 // any special work pending? - (p6) br.cond.spnt .work_pending_syscall -+#endif - ;; - // start restoring the state saved on the kernel stack (struct pt_regs): - ld8 r9=[r2],PT(CR_IPSR)-PT(R9) -@@ -757,7 +797,11 @@ - ;; - ld8.fill r12=[r2] // restore r12 (sp) - ld8.fill r15=[r3] // restore r15 -+#ifdef XEN -+ movl r3=THIS_CPU(ia64_phys_stacked_size_p8) -+#else - addl r3=THIS_CPU(ia64_phys_stacked_size_p8),r0 -+#endif - ;; - (pUStk) ld4 r3=[r3] // r3 = cpu_data->phys_stacked_size_p8 - (pUStk) st1 [r14]=r17 -@@ -814,9 +858,18 @@ - (pUStk) cmp.eq.unc p6,p0=r0,r0 // p6 <- pUStk - #endif - .work_processed_kernel: -+#ifdef XEN -+ alloc loc0=ar.pfs,0,1,1,0 -+ adds out0=16,r12 -+ ;; -+(p6) br.call.sptk.many b0=deliver_pending_interrupt -+ mov ar.pfs=loc0 -+ mov r31=r0 -+#else - adds r17=TI_FLAGS+IA64_TASK_SIZE,r13 - ;; - (p6) ld4 r31=[r17] // load current_thread_info()->flags -+#endif - adds r21=PT(PR)+16,r12 - ;; - -@@ -828,17 +881,20 @@ - ld8 r28=[r2],8 // load b6 - adds r29=PT(R24)+16,r12 - -- ld8.fill r16=[r3],PT(AR_CSD)-PT(R16) -+ ld8.fill r16=[r3] - adds r30=PT(AR_CCV)+16,r12 - (p6) and r19=TIF_WORK_MASK,r31 // any work other than TIF_SYSCALL_TRACE? - ;; -+ adds r3=PT(AR_CSD)-PT(R16),r3 - ld8.fill r24=[r29] - ld8 r15=[r30] // load ar.ccv - (p6) cmp4.ne.unc p6,p0=r19, r0 // any special work pending? 
- ;; - ld8 r29=[r2],16 // load b7 - ld8 r30=[r3],16 // load ar.csd -+#ifndef XEN - (p6) br.cond.spnt .work_pending -+#endif - ;; - ld8 r31=[r2],16 // load ar.ssd - ld8.fill r8=[r3],16 -@@ -934,7 +990,11 @@ - shr.u r18=r19,16 // get byte size of existing "dirty" partition - ;; - mov r16=ar.bsp // get existing backing store pointer -+#ifdef XEN -+ movl r17=THIS_CPU(ia64_phys_stacked_size_p8) -+#else - addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0 -+#endif - ;; - ld4 r17=[r17] // r17 = cpu_data->phys_stacked_size_p8 - (pKStk) br.cond.dpnt skip_rbs_switch -@@ -1069,6 +1129,7 @@ - mov pr=r31,-1 // I0 - rfi // B - -+#ifndef XEN - /* - * On entry: - * r20 = ¤t->thread_info->pre_count (if CONFIG_PREEMPT) -@@ -1130,6 +1191,7 @@ - ld8 r8=[r2] - ld8 r10=[r3] - br.cond.sptk.many .work_processed_syscall // re-check -+#endif - - END(ia64_leave_kernel) - -@@ -1166,6 +1228,7 @@ - br.ret.sptk.many rp - END(ia64_invoke_schedule_tail) - -+#ifndef XEN - /* - * Setup stack and call do_notify_resume_user(). Note that pSys and pNonSys need to - * be set up by the caller. We declare 8 input registers so the system call -@@ -1264,6 +1327,7 @@ - mov ar.unat=r9 - br.many b7 - END(sys_rt_sigreturn) -+#endif - - GLOBAL_ENTRY(ia64_prepare_handle_unaligned) - .prologue -@@ -1278,6 +1342,7 @@ - br.cond.sptk.many rp // goes to ia64_leave_kernel - END(ia64_prepare_handle_unaligned) - -+#ifndef XEN - // - // unw_init_running(void (*callback)(info, arg), void *arg) - // -@@ -1585,3 +1650,4 @@ - data8 sys_ni_syscall - - .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls -+#endif diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/entry.h --- a/xen/arch/ia64/patch/linux-2.6.11/entry.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,37 +0,0 @@ ---- /home/adsharma/disk2/xen-ia64/test3.bk/xen/../../linux-2.6.11/arch/ia64/kernel/entry.h 2005-03-01 23:38:07.000000000 -0800 -+++ /home/adsharma/disk2/xen-ia64/test3.bk/xen/arch/ia64/entry.h 2005-05-18 14:00:53.000000000 -0700 -@@ -7,6 +7,12 @@ - #define PRED_LEAVE_SYSCALL 1 /* TRUE iff leave from syscall */ - #define PRED_KERNEL_STACK 2 /* returning to kernel-stacks? */ - #define PRED_USER_STACK 3 /* returning to user-stacks? */ -+#ifdef CONFIG_VTI -+#define PRED_EMUL 2 /* Need to save r4-r7 for inst emulation */ -+#define PRED_NON_EMUL 3 /* No need to save r4-r7 for normal path */ -+#define PRED_BN0 6 /* Guest is in bank 0 */ -+#define PRED_BN1 7 /* Guest is in bank 1 */ -+#endif // CONFIG_VTI - #define PRED_SYSCALL 4 /* inside a system call? 
*/ - #define PRED_NON_SYSCALL 5 /* complement of PRED_SYSCALL */ - -@@ -17,12 +23,21 @@ - # define pLvSys PASTE(p,PRED_LEAVE_SYSCALL) - # define pKStk PASTE(p,PRED_KERNEL_STACK) - # define pUStk PASTE(p,PRED_USER_STACK) -+#ifdef CONFIG_VTI -+# define pEml PASTE(p,PRED_EMUL) -+# define pNonEml PASTE(p,PRED_NON_EMUL) -+# define pBN0 PASTE(p,PRED_BN0) -+# define pBN1 PASTE(p,PRED_BN1) -+#endif // CONFIG_VTI - # define pSys PASTE(p,PRED_SYSCALL) - # define pNonSys PASTE(p,PRED_NON_SYSCALL) - #endif - - #define PT(f) (IA64_PT_REGS_##f##_OFFSET) - #define SW(f) (IA64_SWITCH_STACK_##f##_OFFSET) -+#ifdef CONFIG_VTI -+#define VPD(f) (VPD_##f##_START_OFFSET) -+#endif // CONFIG_VTI - - #define PT_REGS_SAVES(off) \ - .unwabi 3, 'i'; \ diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/gcc_intrin.h --- a/xen/arch/ia64/patch/linux-2.6.11/gcc_intrin.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,69 +0,0 @@ ---- /home/adsharma/disk2/xen-ia64/test3.bk/xen/../../linux-2.6.11/include/asm-ia64/gcc_intrin.h 2005-03-01 23:38:08.000000000 -0800 -+++ /home/adsharma/disk2/xen-ia64/test3.bk/xen/include/asm-ia64/gcc_intrin.h 2005-05-18 14:00:53.000000000 -0700 -@@ -368,6 +368,66 @@ - #define ia64_mf() asm volatile ("mf" ::: "memory") - #define ia64_mfa() asm volatile ("mf.a" ::: "memory") - -+#ifdef CONFIG_VTI -+/* -+ * Flushrs instruction stream. -+ */ -+#define ia64_flushrs() asm volatile ("flushrs;;":::"memory") -+ -+#define ia64_loadrs() asm volatile ("loadrs;;":::"memory") -+ -+#define ia64_get_rsc() \ -+({ \ -+ unsigned long val; \ -+ asm volatile ("mov %0=ar.rsc;;" : "=r"(val) :: "memory"); \ -+ val; \ -+}) -+ -+#define ia64_set_rsc(val) \ -+ asm volatile ("mov ar.rsc=%0;;" :: "r"(val) : "memory") -+ -+#define ia64_get_bspstore() \ -+({ \ -+ unsigned long val; \ -+ asm volatile ("mov %0=ar.bspstore;;" : "=r"(val) :: "memory"); \ -+ val; \ -+}) -+ -+#define ia64_set_bspstore(val) \ -+ asm volatile ("mov ar.bspstore=%0;;" :: "r"(val) : "memory") -+ -+#define ia64_get_rnat() \ -+({ \ -+ unsigned long val; \ -+ asm volatile ("mov %0=ar.rnat;" : "=r"(val) :: "memory"); \ -+ val; \ -+}) -+ -+#define ia64_set_rnat(val) \ -+ asm volatile ("mov ar.rnat=%0;;" :: "r"(val) : "memory") -+ -+#define ia64_ttag(addr) \ -+({ \ -+ __u64 ia64_intri_res; \ -+ asm volatile ("ttag %0=%1" : "=r"(ia64_intri_res) : "r" (addr)); \ -+ ia64_intri_res; \ -+}) -+ -+#define ia64_get_dcr() \ -+({ \ -+ __u64 result; \ -+ asm volatile ("mov %0=cr.dcr" : "=r"(result) : ); \ -+ result; \ -+}) -+ -+#define ia64_set_dcr(val) \ -+({ \ -+ asm volatile ("mov cr.dcr=%0" :: "r"(val) ); \ -+}) -+ -+#endif // CONFIG_VTI -+ -+ - #define ia64_invala() asm volatile ("invala" ::: "memory") - - #define ia64_thash(addr) \ diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/hardirq.h --- a/xen/arch/ia64/patch/linux-2.6.11/hardirq.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,14 +0,0 @@ ---- ../../linux-2.6.11/include/linux/hardirq.h 2005-03-02 00:38:00.000000000 -0700 -+++ include/asm-ia64/linux/hardirq.h 2005-04-28 16:34:39.000000000 -0600 -@@ -60,7 +60,11 @@ - */ - #define in_irq() (hardirq_count()) - #define in_softirq() (softirq_count()) -+#ifndef XEN - #define in_interrupt() (irq_count()) -+#else -+#define in_interrupt() 0 // FIXME LATER -+#endif - - #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) - # define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != kernel_locked()) diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/head.S --- 
a/xen/arch/ia64/patch/linux-2.6.11/head.S Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,120 +0,0 @@ ---- /home/adsharma/disk2/xen-ia64/xeno-unstable-rebase.bk/xen/../../linux-2.6.11/arch/ia64/kernel/head.S 2005-03-01 23:38:13.000000000 -0800 -+++ /home/adsharma/disk2/xen-ia64/xeno-unstable-rebase.bk/xen/arch/ia64/head.S 2005-05-18 12:40:50.000000000 -0700 -@@ -76,21 +76,21 @@ - * We initialize all of them to prevent inadvertently assuming - * something about the state of address translation early in boot. - */ -- mov r6=((ia64_rid(IA64_REGION_ID_KERNEL, (0<<61)) << 8) | (PAGE_SHIFT << 2) | 1) -+ movl r6=((ia64_rid(IA64_REGION_ID_KERNEL, (0<<61)) << 8) | (PAGE_SHIFT << 2) | 1) - movl r7=(0<<61) -- mov r8=((ia64_rid(IA64_REGION_ID_KERNEL, (1<<61)) << 8) | (PAGE_SHIFT << 2) | 1) -+ movl r8=((ia64_rid(IA64_REGION_ID_KERNEL, (1<<61)) << 8) | (PAGE_SHIFT << 2) | 1) - movl r9=(1<<61) -- mov r10=((ia64_rid(IA64_REGION_ID_KERNEL, (2<<61)) << 8) | (PAGE_SHIFT << 2) | 1) -+ movl r10=((ia64_rid(IA64_REGION_ID_KERNEL, (2<<61)) << 8) | (PAGE_SHIFT << 2) | 1) - movl r11=(2<<61) -- mov r12=((ia64_rid(IA64_REGION_ID_KERNEL, (3<<61)) << 8) | (PAGE_SHIFT << 2) | 1) -+ movl r12=((ia64_rid(IA64_REGION_ID_KERNEL, (3<<61)) << 8) | (PAGE_SHIFT << 2) | 1) - movl r13=(3<<61) -- mov r14=((ia64_rid(IA64_REGION_ID_KERNEL, (4<<61)) << 8) | (PAGE_SHIFT << 2) | 1) -+ movl r14=((ia64_rid(IA64_REGION_ID_KERNEL, (4<<61)) << 8) | (PAGE_SHIFT << 2) | 1) - movl r15=(4<<61) -- mov r16=((ia64_rid(IA64_REGION_ID_KERNEL, (5<<61)) << 8) | (PAGE_SHIFT << 2) | 1) -+ movl r16=((ia64_rid(IA64_REGION_ID_KERNEL, (5<<61)) << 8) | (PAGE_SHIFT << 2) | 1) - movl r17=(5<<61) -- mov r18=((ia64_rid(IA64_REGION_ID_KERNEL, (6<<61)) << 8) | (IA64_GRANULE_SHIFT << 2)) -+ movl r18=((ia64_rid(IA64_REGION_ID_KERNEL, (6<<61)) << 8) | (IA64_GRANULE_SHIFT << 2)) - movl r19=(6<<61) -- mov r20=((ia64_rid(IA64_REGION_ID_KERNEL, (7<<61)) << 8) | (IA64_GRANULE_SHIFT << 2)) -+ movl r20=((ia64_rid(IA64_REGION_ID_KERNEL, (7<<61)) << 8) | (IA64_GRANULE_SHIFT << 2)) - movl r21=(7<<61) - ;; - mov rr[r7]=r6 -@@ -129,8 +129,13 @@ - /* - * Switch into virtual mode: - */ -+#ifdef CONFIG_VTI -+ movl r16=(IA64_PSR_IT|IA64_PSR_IC|IA64_PSR_DT|IA64_PSR_RT|IA64_PSR_DFH \ -+ |IA64_PSR_DI) -+#else // CONFIG_VTI - movl r16=(IA64_PSR_IT|IA64_PSR_IC|IA64_PSR_DT|IA64_PSR_RT|IA64_PSR_DFH|IA64_PSR_BN \ - |IA64_PSR_DI) -+#endif // CONFIG_VTI - ;; - mov cr.ipsr=r16 - movl r17=1f -@@ -143,7 +148,11 @@ - 1: // now we are in virtual mode - - // set IVT entry point---can't access I/O ports without it -+#ifdef CONFIG_VTI -+ movl r3=vmx_ia64_ivt -+#else // CONFIG_VTI - movl r3=ia64_ivt -+#endif // CONFIG_VTI - ;; - mov cr.iva=r3 - movl r2=FPSR_DEFAULT -@@ -187,7 +196,11 @@ - dep r18=0,r3,0,12 - ;; - or r18=r17,r18 -+#ifdef XEN -+ dep r2=-1,r3,60,4 // IMVA of task -+#else - dep r2=-1,r3,61,3 // IMVA of task -+#endif - ;; - mov r17=rr[r2] - shr.u r16=r3,IA64_GRANULE_SHIFT -@@ -207,8 +220,15 @@ - - .load_current: - // load the "current" pointer (r13) and ar.k6 with the current task -+#ifdef CONFIG_VTI -+ mov r21=r2 // virtual address -+ ;; -+ bsw.1 -+ ;; -+#else // CONFIG_VTI - mov IA64_KR(CURRENT)=r2 // virtual address - mov IA64_KR(CURRENT_STACK)=r16 -+#endif // CONFIG_VTI - mov r13=r2 - /* - * Reserve space at the top of the stack for "struct pt_regs". 
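
The first head.S hunk above (mov changed to movl for the region-register values) is an instruction-encoding fix as much as a Xen change: the add-form mov rX=imm carries only a 22-bit signed immediate, while movl is the long-instruction form that can load any 64-bit constant, so it is the safe choice for the composed (rid << 8) | (ps << 2) | ve words. A sketch of the value being built, using the architected region-register field layout:

    #include <stdint.h>

    /* rr.ve is bit 0, rr.ps is bits 2..7, rr.rid starts at bit 8. */
    static inline uint64_t rr_value(uint64_t rid, unsigned page_shift)
    {
        return (rid << 8) | ((uint64_t)page_shift << 2) | 1;
    }

The remaining head.S hunks are the usual bit-60 adjustments (dep r2=-1,r3,60,4 in place of 61,3) plus the CONFIG_VTI entry points such as vmx_ia64_ivt.
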
Kernel threads -@@ -227,7 +247,11 @@ - ;; - mov ar.rsc=0x3 // place RSE in eager mode - -+#ifdef XEN -+(isBP) dep r28=-1,r28,60,4 // make address virtual -+#else - (isBP) dep r28=-1,r28,61,3 // make address virtual -+#endif - (isBP) movl r2=ia64_boot_param - ;; - (isBP) st8 [r2]=r28 // save the address of the boot param area passed by the bootloader -@@ -254,7 +278,9 @@ - br.call.sptk.many b0=console_print - - self: hint @pause -+ ;; - br.sptk.many self // endless loop -+ ;; - END(_start) - - GLOBAL_ENTRY(ia64_save_debug_regs) -@@ -850,7 +876,11 @@ - * intermediate precision so that we can produce a full 64-bit result. - */ - GLOBAL_ENTRY(sched_clock) -+#ifdef XEN -+ movl r8=THIS_CPU(cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET -+#else - addl r8=THIS_CPU(cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0 -+#endif - mov.m r9=ar.itc // fetch cycle-counter (35 cyc) - ;; - ldf8 f8=[r8] diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/hpsim_ssc.h --- a/xen/arch/ia64/patch/linux-2.6.11/hpsim_ssc.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,26 +0,0 @@ ---- /home/adsharma/disk2/xen-ia64/xeno-unstable-rebase.bk/xen/../../linux-2.6.11/arch/ia64/hp/sim/hpsim_ssc.h 2005-03-01 23:38:17.000000000 -0800 -+++ /home/adsharma/disk2/xen-ia64/xeno-unstable-rebase.bk/xen/include/asm-ia64/hpsim_ssc.h 2005-05-18 12:40:19.000000000 -0700 -@@ -33,4 +33,23 @@ - */ - extern long ia64_ssc (long arg0, long arg1, long arg2, long arg3, int nr); - -+#ifdef XEN -+/* Note: These are declared in linux/arch/ia64/hp/sim/simscsi.c but belong -+ * in linux/include/asm-ia64/hpsim_ssc.h, hence their addition here */ -+#define SSC_OPEN 50 -+#define SSC_CLOSE 51 -+#define SSC_READ 52 -+#define SSC_WRITE 53 -+#define SSC_GET_COMPLETION 54 -+#define SSC_WAIT_COMPLETION 55 -+ -+#define SSC_WRITE_ACCESS 2 -+#define SSC_READ_ACCESS 1 -+ -+struct ssc_disk_req { -+ unsigned long addr; -+ unsigned long len; -+}; -+#endif -+ - #endif /* _IA64_PLATFORM_HPSIM_SSC_H */ diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/ia64regs.h --- a/xen/arch/ia64/patch/linux-2.6.11/ia64regs.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,38 +0,0 @@ ---- /home/adsharma/disk2/xen-ia64/test3.bk/xen/../../linux-2.6.11/include/asm-ia64/ia64regs.h 2005-03-01 23:38:07.000000000 -0800 -+++ /home/adsharma/disk2/xen-ia64/test3.bk/xen/include/asm-ia64/ia64regs.h 2005-05-18 14:00:53.000000000 -0700 -@@ -87,6 +87,35 @@ - #define _IA64_REG_CR_LRR0 4176 - #define _IA64_REG_CR_LRR1 4177 - -+#ifdef CONFIG_VTI -+#define IA64_REG_CR_DCR 0 -+#define IA64_REG_CR_ITM 1 -+#define IA64_REG_CR_IVA 2 -+#define IA64_REG_CR_PTA 8 -+#define IA64_REG_CR_IPSR 16 -+#define IA64_REG_CR_ISR 17 -+#define IA64_REG_CR_IIP 19 -+#define IA64_REG_CR_IFA 20 -+#define IA64_REG_CR_ITIR 21 -+#define IA64_REG_CR_IIPA 22 -+#define IA64_REG_CR_IFS 23 -+#define IA64_REG_CR_IIM 24 -+#define IA64_REG_CR_IHA 25 -+#define IA64_REG_CR_LID 64 -+#define IA64_REG_CR_IVR 65 -+#define IA64_REG_CR_TPR 66 -+#define IA64_REG_CR_EOI 67 -+#define IA64_REG_CR_IRR0 68 -+#define IA64_REG_CR_IRR1 69 -+#define IA64_REG_CR_IRR2 70 -+#define IA64_REG_CR_IRR3 71 -+#define IA64_REG_CR_ITV 72 -+#define IA64_REG_CR_PMV 73 -+#define IA64_REG_CR_CMCV 74 -+#define IA64_REG_CR_LRR0 80 -+#define IA64_REG_CR_LRR1 81 -+#endif // CONFIG_VTI -+ - /* Indirect Registers for getindreg() and setindreg() */ - - #define _IA64_REG_INDR_CPUID 9000 /* getindreg only */ diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/interrupt.h --- 
a/xen/arch/ia64/patch/linux-2.6.11/interrupt.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,27 +0,0 @@ ---- /home/adsharma/disk2/xen-ia64/xeno-unstable-rebase.bk/xen/../../linux-2.6.11/include/linux/interrupt.h 2005-03-01 23:38:09.000000000 -0800 -+++ /home/adsharma/disk2/xen-ia64/xeno-unstable-rebase.bk/xen/include/asm-ia64/linux/interrupt.h 2005-05-18 12:40:50.000000000 -0700 -@@ -33,6 +33,7 @@ - #define IRQ_HANDLED (1) - #define IRQ_RETVAL(x) ((x) != 0) - -+#ifndef XEN - struct irqaction { - irqreturn_t (*handler)(int, void *, struct pt_regs *); - unsigned long flags; -@@ -49,6 +50,7 @@ - irqreturn_t (*handler)(int, void *, struct pt_regs *), - unsigned long, const char *, void *); - extern void free_irq(unsigned int, void *); -+#endif - - - #ifdef CONFIG_GENERIC_HARDIRQS -@@ -121,7 +123,7 @@ - }; - - asmlinkage void do_softirq(void); --extern void open_softirq(int nr, void (*action)(struct softirq_action*), void *data); -+//extern void open_softirq(int nr, void (*action)(struct softirq_action*), void *data); - extern void softirq_init(void); - #define __raise_softirq_irqoff(nr) do { local_softirq_pending() |= 1UL << (nr); } while (0) - extern void FASTCALL(raise_softirq_irqoff(unsigned int nr)); diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/io.h --- a/xen/arch/ia64/patch/linux-2.6.11/io.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,14 +0,0 @@ ---- /home/adsharma/disk2/xen-ia64/xeno-unstable-rebase.bk/xen/../../linux-2.6.11/include/asm-ia64/io.h 2005-03-01 23:38:34.000000000 -0800 -+++ /home/adsharma/disk2/xen-ia64/xeno-unstable-rebase.bk/xen/include/asm-ia64/io.h 2005-05-18 12:40:50.000000000 -0700 -@@ -23,7 +23,11 @@ - #define __SLOW_DOWN_IO do { } while (0) - #define SLOW_DOWN_IO do { } while (0) - -+#ifdef XEN -+#define __IA64_UNCACHED_OFFSET 0xe800000000000000UL -+#else - #define __IA64_UNCACHED_OFFSET 0xc000000000000000UL /* region 6 */ -+#endif - - /* - * The legacy I/O space defined by the ia64 architecture supports only 65536 ports, but diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/irq_ia64.c --- a/xen/arch/ia64/patch/linux-2.6.11/irq_ia64.c Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,126 +0,0 @@ ---- /home/adsharma/disk2/xen-ia64/xeno-unstable-rebase.bk/xen/../../linux-2.6.11/arch/ia64/kernel/irq_ia64.c 2005-03-01 23:38:07.000000000 -0800 -+++ /home/adsharma/disk2/xen-ia64/xeno-unstable-rebase.bk/xen/arch/ia64/irq_ia64.c 2005-05-18 12:40:51.000000000 -0700 -@@ -106,6 +106,9 @@ - unsigned long saved_tpr; - - #if IRQ_DEBUG -+#ifdef XEN -+ xen_debug_irq(vector, regs); -+#endif - { - unsigned long bsp, sp; - -@@ -148,6 +151,9 @@ - ia64_setreg(_IA64_REG_CR_TPR, vector); - ia64_srlz_d(); - -+#ifdef XEN -+ if (!xen_do_IRQ(vector)) -+#endif - __do_IRQ(local_vector_to_irq(vector), regs); - - /* -@@ -167,6 +173,103 @@ - irq_exit(); - } - -+#ifdef CONFIG_VTI -+#define vmx_irq_enter() \ -+ add_preempt_count(HARDIRQ_OFFSET); -+ -+/* Now softirq will be checked when leaving hypervisor, or else -+ * scheduler irq will be executed too early. -+ */ -+#define vmx_irq_exit(void) \ -+ sub_preempt_count(HARDIRQ_OFFSET); -+/* -+ * That's where the IVT branches when we get an external -+ * interrupt. This branches to the correct hardware IRQ handler via -+ * function ptr. 
-+ */ -+void -+vmx_ia64_handle_irq (ia64_vector vector, struct pt_regs *regs) -+{ -+ unsigned long saved_tpr; -+ int wake_dom0 = 0; -+ -+ -+#if IRQ_DEBUG -+ { -+ unsigned long bsp, sp; -+ -+ /* -+ * Note: if the interrupt happened while executing in -+ * the context switch routine (ia64_switch_to), we may -+ * get a spurious stack overflow here. This is -+ * because the register and the memory stack are not -+ * switched atomically. -+ */ -+ bsp = ia64_getreg(_IA64_REG_AR_BSP); -+ sp = ia64_getreg(_IA64_REG_AR_SP); -+ -+ if ((sp - bsp) < 1024) { -+ static unsigned char count; -+ static long last_time; -+ -+ if (jiffies - last_time > 5*HZ) -+ count = 0; -+ if (++count < 5) { -+ last_time = jiffies; -+ printk("ia64_handle_irq: DANGER: less than " -+ "1KB of free stack space!!\n" -+ "(bsp=0x%lx, sp=%lx)\n", bsp, sp); -+ } -+ } -+ } -+#endif /* IRQ_DEBUG */ -+ -+ /* -+ * Always set TPR to limit maximum interrupt nesting depth to -+ * 16 (without this, it would be ~240, which could easily lead -+ * to kernel stack overflows). -+ */ -+ vmx_irq_enter(); -+ saved_tpr = ia64_getreg(_IA64_REG_CR_TPR); -+ ia64_srlz_d(); -+ while (vector != IA64_SPURIOUS_INT_VECTOR) { -+ if (!IS_RESCHEDULE(vector)) { -+ ia64_setreg(_IA64_REG_CR_TPR, vector); -+ ia64_srlz_d(); -+ -+ if (vector != IA64_TIMER_VECTOR) { -+ /* FIXME: Leave IRQ re-route later */ -+ vmx_vcpu_pend_interrupt(dom0->vcpu[0],vector); -+ wake_dom0 = 1; -+ } -+ else { // FIXME: Handle Timer only now -+ __do_IRQ(local_vector_to_irq(vector), regs); -+ } -+ -+ /* -+ * Disable interrupts and send EOI: -+ */ -+ local_irq_disable(); -+ ia64_setreg(_IA64_REG_CR_TPR, saved_tpr); -+ } -+ else { -+ printf("Oops: RESCHEDULE IPI absorbed by HV\n"); -+ } -+ ia64_eoi(); -+ vector = ia64_get_ivr(); -+ } -+ /* -+ * This must be done *after* the ia64_eoi(). For example, the keyboard softirq -+ * handler needs to be able to wait for further keyboard interrupts, which can't -+ * come through until ia64_eoi() has been done. 
-+ */ -+ vmx_irq_exit(); -+ if ( wake_dom0 && current != dom0 ) -+ domain_wake(dom0->vcpu[0]); -+} -+#endif -+ -+ - #ifdef CONFIG_HOTPLUG_CPU - /* - * This function emulates a interrupt processing when a cpu is about to be diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/kregs.h --- a/xen/arch/ia64/patch/linux-2.6.11/kregs.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,66 +0,0 @@ ---- /home/adsharma/disk2/xen-ia64/xeno-unstable-rebase.bk/xen/../../linux-2.6.11/include/asm-ia64/kregs.h 2005-03-01 23:37:49.000000000 -0800 -+++ /home/adsharma/disk2/xen-ia64/xeno-unstable-rebase.bk/xen/include/asm-ia64/kregs.h 2005-05-18 12:40:50.000000000 -0700 -@@ -29,8 +29,21 @@ - */ - #define IA64_TR_KERNEL 0 /* itr0, dtr0: maps kernel image (code & data) */ - #define IA64_TR_PALCODE 1 /* itr1: maps PALcode as required by EFI */ -+#ifdef CONFIG_VTI -+#define IA64_TR_XEN_IN_DOM 6 /* itr6, dtr6: Double mapping for xen image in domain space */ -+#endif // CONFIG_VTI - #define IA64_TR_PERCPU_DATA 1 /* dtr1: percpu data */ - #define IA64_TR_CURRENT_STACK 2 /* dtr2: maps kernel's memory- & register-stacks */ -+#ifdef XEN -+#define IA64_TR_SHARED_INFO 3 /* dtr3: page shared with domain */ -+#define IA64_TR_VHPT 4 /* dtr4: vhpt */ -+#define IA64_TR_ARCH_INFO 5 -+#ifdef CONFIG_VTI -+#define IA64_TR_VHPT_IN_DOM 5 /* dtr5: Double mapping for vhpt table in domain space */ -+#define IA64_TR_RR7_SWITCH_STUB 7 /* dtr7: mapping for rr7 switch stub */ -+#define IA64_TEMP_PHYSICAL 8 /* itr8, dtr8: temp mapping for guest physical memory 256M */ -+#endif // CONFIG_VTI -+#endif - - /* Processor status register bits: */ - #define IA64_PSR_BE_BIT 1 -@@ -66,6 +78,9 @@ - #define IA64_PSR_ED_BIT 43 - #define IA64_PSR_BN_BIT 44 - #define IA64_PSR_IA_BIT 45 -+#ifdef CONFIG_VTI -+#define IA64_PSR_VM_BIT 46 -+#endif // CONFIG_VTI - - /* A mask of PSR bits that we generally don't want to inherit across a clone2() or an - execve(). 
Only list flags here that need to be cleared/set for BOTH clone2() and -@@ -107,6 +122,9 @@ - #define IA64_PSR_ED (__IA64_UL(1) << IA64_PSR_ED_BIT) - #define IA64_PSR_BN (__IA64_UL(1) << IA64_PSR_BN_BIT) - #define IA64_PSR_IA (__IA64_UL(1) << IA64_PSR_IA_BIT) -+#ifdef CONFIG_VTI -+#define IA64_PSR_VM (__IA64_UL(1) << IA64_PSR_VM_BIT) -+#endif // CONFIG_VTI - - /* User mask bits: */ - #define IA64_PSR_UM (IA64_PSR_BE | IA64_PSR_UP | IA64_PSR_AC | IA64_PSR_MFL | IA64_PSR_MFH) -@@ -160,4 +178,21 @@ - #define IA64_ISR_CODE_LFETCH 4 - #define IA64_ISR_CODE_PROBEF 5 - -+#ifdef XEN -+/* Interruption Function State */ -+#define IA64_IFS_V_BIT 63 -+#define IA64_IFS_V (__IA64_UL(1) << IA64_IFS_V_BIT) -+ -+/* Page Table Address */ -+#define IA64_PTA_VE_BIT 0 -+#define IA64_PTA_SIZE_BIT 2 -+#define IA64_PTA_VF_BIT 8 -+#define IA64_PTA_BASE_BIT 15 -+ -+#define IA64_PTA_VE (__IA64_UL(1) << IA64_PTA_VE_BIT) -+#define IA64_PTA_SIZE (__IA64_UL(0x3f) << IA64_PTA_SIZE_BIT) -+#define IA64_PTA_VF (__IA64_UL(1) << IA64_PTA_VF_BIT) -+#define IA64_PTA_BASE (__IA64_UL(0) - ((__IA64_UL(1) << IA64_PTA_BASE_BIT))) -+#endif -+ - #endif /* _ASM_IA64_kREGS_H */ diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/mca_asm.h --- a/xen/arch/ia64/patch/linux-2.6.11/mca_asm.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,32 +0,0 @@ ---- /home/adsharma/disk2/xen-ia64/xeno-unstable-rebase.bk/xen/../../linux-2.6.11/include/asm-ia64/mca_asm.h 2005-03-01 23:38:38.000000000 -0800 -+++ /home/adsharma/disk2/xen-ia64/xeno-unstable-rebase.bk/xen/include/asm-ia64/mca_asm.h 2005-05-18 12:40:19.000000000 -0700 -@@ -26,8 +26,13 @@ - * direct mapped to physical addresses. - * 1. Lop off bits 61 thru 63 in the virtual address - */ -+#ifdef XEN -+#define INST_VA_TO_PA(addr) \ -+ dep addr = 0, addr, 60, 4 -+#else // XEN - #define INST_VA_TO_PA(addr) \ - dep addr = 0, addr, 61, 3 -+#endif // XEN - /* - * This macro converts a data virtual address to a physical address - * Right now for simulation purposes the virtual addresses are -@@ -42,9 +47,15 @@ - * direct mapped to physical addresses. - * 1. Put 0x7 in bits 61 thru 63. 
- */ -+#ifdef XEN -+#define DATA_PA_TO_VA(addr,temp) \ -+ mov temp = 0xf ;; \ -+ dep addr = temp, addr, 60, 4 -+#else // XEN - #define DATA_PA_TO_VA(addr,temp) \ - mov temp = 0x7 ;; \ - dep addr = temp, addr, 61, 3 -+#endif // XEN - - #define GET_THIS_PADDR(reg, var) \ - mov reg = IA64_KR(PER_CPU_DATA);; \ diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/minstate.h --- a/xen/arch/ia64/patch/linux-2.6.11/minstate.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,25 +0,0 @@ - minstate.h | 4 ++-- - 1 files changed, 2 insertions(+), 2 deletions(-) - -Index: linux-2.6.11-xendiffs/arch/ia64/kernel/minstate.h -=================================================================== ---- linux-2.6.11-xendiffs.orig/arch/ia64/kernel/minstate.h 2005-04-06 22:51:31.170261541 -0500 -+++ linux-2.6.11-xendiffs/arch/ia64/kernel/minstate.h 2005-04-06 22:54:03.210575034 -0500 -@@ -48,7 +48,7 @@ - (pUStk) mov r24=ar.rnat; \ - (pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base of memory stack */ \ - (pUStk) mov r23=ar.bspstore; /* save ar.bspstore */ \ --(pUStk) dep r22=-1,r22,61,3; /* compute kernel virtual addr of RBS */ \ -+(pUStk) dep r22=-1,r22,60,4; /* compute kernel virtual addr of RBS */ \ - ;; \ - (pKStk) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode, use sp (r12) */ \ - (pUStk) mov ar.bspstore=r22; /* switch to kernel RBS */ \ -@@ -57,7 +57,7 @@ - (pUStk) mov ar.rsc=0x3; /* set eager mode, pl 0, little-endian, loadrs=0 */ \ - - #define MINSTATE_END_SAVE_MIN_PHYS \ -- dep r12=-1,r12,61,3; /* make sp a kernel virtual address */ \ -+ dep r12=-1,r12,60,4; /* make sp a kernel virtual address */ \ - ;; - - #ifdef MINSTATE_VIRT diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/mm_contig.c --- a/xen/arch/ia64/patch/linux-2.6.11/mm_contig.c Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,47 +0,0 @@ ---- ../../linux-2.6.11/arch/ia64/mm/contig.c 2005-03-02 00:37:55.000000000 -0700 -+++ arch/ia64/mm_contig.c 2005-04-28 16:13:52.000000000 -0600 -@@ -35,6 +35,7 @@ - * - * Just walks the pages in the system and describes where they're allocated. - */ -+#ifndef XEN - void - show_mem (void) - { -@@ -63,6 +64,7 @@ - printk("%d pages swap cached\n", cached); - printk("%ld pages in page table cache\n", pgtable_cache_size); - } -+#endif - - /* physical address where the bootmem map is located */ - unsigned long bootmap_start; -@@ -140,6 +142,7 @@ - * Walk the EFI memory map and find usable memory for the system, taking - * into account reserved areas. - */ -+#ifndef XEN - void - find_memory (void) - { -@@ -168,6 +171,7 @@ - - find_initrd(); - } -+#endif - - #ifdef CONFIG_SMP - /** -@@ -225,6 +229,7 @@ - * Set up the page tables. - */ - -+#ifndef XEN - void - paging_init (void) - { -@@ -297,3 +302,4 @@ - #endif /* !CONFIG_VIRTUAL_MEM_MAP */ - zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); - } -+#endif /* !CONFIG_XEN */ diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/page.h --- a/xen/arch/ia64/patch/linux-2.6.11/page.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,74 +0,0 @@ ---- /home/adsharma/xeno-unstable-ia64-staging.bk/xen/../../linux-2.6.11/include/asm-ia64/page.h 2005-03-01 23:37:48.000000000 -0800 -+++ /home/adsharma/xeno-unstable-ia64-staging.bk/xen/include/asm-ia64/page.h 2005-05-20 09:36:02.000000000 -0700 -@@ -32,6 +32,7 @@ - #define PAGE_ALIGN(addr) (((addr) + PAGE_SIZE - 1) & PAGE_MASK) - - #define PERCPU_PAGE_SHIFT 16 /* log2() of max. 
size of per-CPU area */ -+ - #define PERCPU_PAGE_SIZE (__IA64_UL_CONST(1) << PERCPU_PAGE_SHIFT) - - #define RGN_MAP_LIMIT ((1UL << (4*PAGE_SHIFT - 12)) - PAGE_SIZE) /* per region addr limit */ -@@ -95,9 +96,15 @@ - #endif - - #ifndef CONFIG_DISCONTIGMEM -+#ifdef XEN -+# define pfn_valid(pfn) (0) -+# define page_to_pfn(_page) ((unsigned long)((_page) - frame_table)) -+# define pfn_to_page(_pfn) (frame_table + (_pfn)) -+#else - # define pfn_valid(pfn) (((pfn) < max_mapnr) && ia64_pfn_valid(pfn)) - # define page_to_pfn(page) ((unsigned long) (page - mem_map)) - # define pfn_to_page(pfn) (mem_map + (pfn)) -+#endif - #else - extern struct page *vmem_map; - extern unsigned long max_low_pfn; -@@ -109,6 +116,11 @@ - #define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) - #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) - -+#ifdef XEN -+#define page_to_virt(_page) phys_to_virt(page_to_phys(_page)) -+#define phys_to_page(kaddr) pfn_to_page(((kaddr) >> PAGE_SHIFT)) -+#endif -+ - typedef union ia64_va { - struct { - unsigned long off : 61; /* intra-region offset */ -@@ -124,8 +136,23 @@ - * expressed in this way to ensure they result in a single "dep" - * instruction. - */ -+#ifdef XEN -+typedef union xen_va { -+ struct { -+ unsigned long off : 60; -+ unsigned long reg : 4; -+ } f; -+ unsigned long l; -+ void *p; -+} xen_va; -+ -+// xen/drivers/console.c uses __va in a declaration (should be fixed!) -+#define __pa(x) ({xen_va _v; _v.l = (long) (x); _v.f.reg = 0; _v.l;}) -+#define __va(x) ({xen_va _v; _v.l = (long) (x); _v.f.reg = -1; _v.p;}) -+#else - #define __pa(x) ({ia64_va _v; _v.l = (long) (x); _v.f.reg = 0; _v.l;}) - #define __va(x) ({ia64_va _v; _v.l = (long) (x); _v.f.reg = -1; _v.p;}) -+#endif - - #define REGION_NUMBER(x) ({ia64_va _v; _v.l = (long) (x); _v.f.reg;}) - #define REGION_OFFSET(x) ({ia64_va _v; _v.l = (long) (x); _v.f.off;}) -@@ -197,7 +224,11 @@ - # define __pgprot(x) (x) - #endif /* !STRICT_MM_TYPECHECKS */ - -+#ifdef XEN -+#define PAGE_OFFSET __IA64_UL_CONST(0xf000000000000000) -+#else - #define PAGE_OFFSET __IA64_UL_CONST(0xe000000000000000) -+#endif - - #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | \ diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/pal.S --- a/xen/arch/ia64/patch/linux-2.6.11/pal.S Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,26 +0,0 @@ ---- /home/adsharma/disk2/xen-ia64/xeno-unstable-rebase.bk/xen/../../linux-2.6.11/arch/ia64/kernel/pal.S 2005-03-01 23:38:33.000000000 -0800 -+++ /home/adsharma/disk2/xen-ia64/xeno-unstable-rebase.bk/xen/arch/ia64/pal.S 2005-05-18 12:40:19.000000000 -0700 -@@ -166,7 +166,11 @@ - adds r8 = 1f-1b,r8 // calculate return address for call - ;; - mov loc4=ar.rsc // save RSE configuration -+#ifdef XEN -+ dep.z loc2=loc2,0,60 // convert pal entry point to physical -+#else // XEN - dep.z loc2=loc2,0,61 // convert pal entry point to physical -+#endif // XEN - tpa r8=r8 // convert rp to physical - ;; - mov b7 = loc2 // install target to branch reg -@@ -225,7 +229,11 @@ - mov loc3 = psr // save psr - ;; - mov loc4=ar.rsc // save RSE configuration -+#ifdef XEN -+ dep.z loc2=loc2,0,60 // convert pal entry point to physical -+#else // XEN - dep.z loc2=loc2,0,61 // convert pal entry point to physical -+#endif // XEN - ;; - mov ar.rsc=0 // put RSE in enforced lazy, LE mode - movl r16=PAL_PSR_BITS_TO_CLEAR diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/pal.h --- a/xen/arch/ia64/patch/linux-2.6.11/pal.h 
Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,12 +0,0 @@ ---- /home/adsharma/disk2/xen-ia64/test3.bk/xen/../../linux-2.6.11/include/asm-ia64/pal.h 2005-03-01 23:38:13.000000000 -0800 -+++ /home/adsharma/disk2/xen-ia64/test3.bk/xen/include/asm-ia64/pal.h 2005-05-18 14:00:53.000000000 -0700 -@@ -1559,6 +1559,9 @@ - return iprv.status; - } - -+#ifdef CONFIG_VTI -+#include <asm/vmx_pal.h> -+#endif // CONFIG_VTI - #endif /* __ASSEMBLY__ */ - - #endif /* _ASM_IA64_PAL_H */ diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/pgalloc.h --- a/xen/arch/ia64/patch/linux-2.6.11/pgalloc.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,76 +0,0 @@ ---- ../../linux-2.6.11/include/asm-ia64/pgalloc.h 2005-03-02 00:37:31.000000000 -0700 -+++ include/asm-ia64/pgalloc.h 2005-06-09 13:40:48.000000000 -0600 -@@ -61,7 +61,12 @@ - pgd_t *pgd = pgd_alloc_one_fast(mm); - - if (unlikely(pgd == NULL)) { -+#ifdef XEN -+ pgd = (pgd_t *)alloc_xenheap_page(); -+ memset(pgd,0,PAGE_SIZE); -+#else - pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO); -+#endif - } - return pgd; - } -@@ -104,7 +109,12 @@ - static inline pmd_t* - pmd_alloc_one (struct mm_struct *mm, unsigned long addr) - { -+#ifdef XEN -+ pmd_t *pmd = (pmd_t *)alloc_xenheap_page(); -+ memset(pmd,0,PAGE_SIZE); -+#else - pmd_t *pmd = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); -+#endif - - return pmd; - } -@@ -136,7 +146,12 @@ - static inline struct page * - pte_alloc_one (struct mm_struct *mm, unsigned long addr) - { -+#ifdef XEN -+ struct page *pte = alloc_xenheap_page(); -+ memset(pte,0,PAGE_SIZE); -+#else - struct page *pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); -+#endif - - return pte; - } -@@ -144,7 +159,12 @@ - static inline pte_t * - pte_alloc_one_kernel (struct mm_struct *mm, unsigned long addr) - { -+#ifdef XEN -+ pte_t *pte = (pte_t *)alloc_xenheap_page(); -+ memset(pte,0,PAGE_SIZE); -+#else - pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); -+#endif - - return pte; - } -@@ -152,13 +172,21 @@ - static inline void - pte_free (struct page *pte) - { -+#ifdef XEN -+ free_xenheap_page(pte); -+#else - __free_page(pte); -+#endif - } - - static inline void - pte_free_kernel (pte_t *pte) - { -+#ifdef XEN -+ free_xenheap_page((unsigned long) pte); -+#else - free_page((unsigned long) pte); -+#endif - } - - #define __pte_free_tlb(tlb, pte) tlb_remove_page((tlb), (pte)) diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/processor.h --- a/xen/arch/ia64/patch/linux-2.6.11/processor.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,37 +0,0 @@ ---- /home/adsharma/xeno-unstable-ia64-staging.bk/xen/../../linux-2.6.11/include/asm-ia64/processor.h 2005-03-01 23:37:58.000000000 -0800 -+++ /home/adsharma/xeno-unstable-ia64-staging.bk/xen/include/asm-ia64/processor.h 2005-05-20 09:36:02.000000000 -0700 -@@ -94,7 +94,11 @@ - #ifdef CONFIG_NUMA - #include <asm/nodedata.h> - #endif -+#ifdef XEN -+#include <asm/xenprocessor.h> -+#endif - -+#ifndef XEN - /* like above but expressed as bitfields for more efficient access: */ - struct ia64_psr { - __u64 reserved0 : 1; -@@ -133,6 +137,7 @@ - __u64 bn : 1; - __u64 reserved4 : 19; - }; -+#endif - - /* - * CPU type, hardware bug flags, and per-CPU state. Frequently used -@@ -408,12 +413,14 @@ - */ - - /* Return TRUE if task T owns the fph partition of the CPU we're running on. 
*/ -+#ifndef XEN - #define ia64_is_local_fpu_owner(t) \ - ({ \ - struct task_struct *__ia64_islfo_task = (t); \ - (__ia64_islfo_task->thread.last_fph_cpu == smp_processor_id() \ - && __ia64_islfo_task == (struct task_struct *) ia64_get_kr(IA64_KR_FPU_OWNER)); \ - }) -+#endif - - /* Mark task T as owning the fph partition of the CPU we're running on. */ - #define ia64_set_local_fpu_owner(t) do { \ diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/ptrace.h --- a/xen/arch/ia64/patch/linux-2.6.11/ptrace.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,20 +0,0 @@ ---- /home/adsharma/disk2/xen-ia64/test3.bk/xen/../../linux-2.6.11/include/asm-ia64/ptrace.h 2005-03-01 23:38:38.000000000 -0800 -+++ /home/adsharma/disk2/xen-ia64/test3.bk/xen/include/asm-ia64/ptrace.h 2005-05-18 14:00:53.000000000 -0700 -@@ -95,6 +95,9 @@ - * (because the memory stack pointer MUST ALWAYS be aligned this way) - * - */ -+#ifdef XEN -+#include <public/arch-ia64.h> -+#else - struct pt_regs { - /* The following registers are saved by SAVE_MIN: */ - unsigned long b6; /* scratch */ -@@ -170,6 +173,7 @@ - struct ia64_fpreg f10; /* scratch */ - struct ia64_fpreg f11; /* scratch */ - }; -+#endif - - /* - * This structure contains the addition registers that need to diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/series --- a/xen/arch/ia64/patch/linux-2.6.11/series Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,40 +0,0 @@ -bootmem.h -current.h -efi.c -efi.h -entry.S -gcc_intrin.h -hardirq.h -head.S -hpsim_irq.c -hpsim_ssc.h -hw_irq.h -ide.h -init_task.c -init_task.h -interrupt.h -io.h -irq.h -irq_ia64.c -ivt.S -kregs.h -lds.S -linuxtime.h -minstate.h -mm_bootmem.c -mm_contig.c -mmzone.h -page_alloc.c -page.h -processor.h -sal.h -setup.c -slab.c -slab.h -system.h -time.c -kernel-time.c -tlb.c -types.h -unaligned.c -wait.h diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/setup.c --- a/xen/arch/ia64/patch/linux-2.6.11/setup.c Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,151 +0,0 @@ ---- ../../linux-2.6.11/arch/ia64/kernel/setup.c 2005-03-02 00:37:49.000000000 -0700 -+++ arch/ia64/setup.c 2005-06-03 10:14:24.000000000 -0600 -@@ -51,6 +51,10 @@ - #include <asm/smp.h> - #include <asm/system.h> - #include <asm/unistd.h> -+#ifdef CONFIG_VTI -+#include <asm/vmx.h> -+#endif // CONFIG_VTI -+#include <asm/io.h> - - #if defined(CONFIG_SMP) && (IA64_CPU_SIZE > PAGE_SIZE) - # error "struct cpuinfo_ia64 too big!" 
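
A pattern that ties together the page.h, mca_asm.h, minstate.h and pal.S hunks above: stock Linux/ia64 splits a virtual address into a 61-bit offset and a 3-bit region number, with the identity-mapped kernel in region 7 (PAGE_OFFSET 0xe000000000000000). Xen reserves bit 60, so the split becomes a 60-bit offset plus a 4-bit region field, the identity map moves to region 0xf (the new PAGE_OFFSET, and the 0xf written into bits 60..63 by DATA_PA_TO_VA), and every dep ...,61,3 in the assembly becomes dep ...,60,4. A minimal C sketch mirroring the xen_va union added to page.h:

    #include <stdint.h>

    static inline uint64_t xen_va_to_pa(uint64_t va)
    {
        return va & ((1UL << 60) - 1);   /* drop the 4 region bits, cf. __pa */
    }

    static inline uint64_t xen_pa_to_va(uint64_t pa)
    {
        return pa | (0xFUL << 60);       /* region 0xf identity map, cf. __va */
    }
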
-@@ -127,7 +131,16 @@ - range_end = min(end, rsvd_region[i].start); - - if (range_start < range_end) -+#ifdef XEN -+ { -+ /* init_boot_pages requires "ps, pe" */ -+ printk("Init boot pages: 0x%lx -> 0x%lx.\n", -+ __pa(range_start), __pa(range_end)); -+ (*func)(__pa(range_start), __pa(range_end), 0); -+ } -+#else - call_pernode_memory(__pa(range_start), range_end - range_start, func); -+#endif - - /* nothing more available in this segment */ - if (range_end == end) return 0; -@@ -185,7 +198,12 @@ - n++; - - rsvd_region[n].start = (unsigned long) ia64_imva((void *)KERNEL_START); -+#ifdef XEN -+ /* Reserve xen image/bitmap/xen-heap */ -+ rsvd_region[n].end = rsvd_region[n].start + xenheap_size; -+#else - rsvd_region[n].end = (unsigned long) ia64_imva(_end); -+#endif - n++; - - #ifdef CONFIG_BLK_DEV_INITRD -@@ -299,17 +317,25 @@ - } - - void __init -+#ifdef XEN -+early_setup_arch (char **cmdline_p) -+#else - setup_arch (char **cmdline_p) -+#endif - { - unw_init(); - - ia64_patch_vtop((u64) __start___vtop_patchlist, (u64) __end___vtop_patchlist); - - *cmdline_p = __va(ia64_boot_param->command_line); -+#ifdef XEN -+ efi_init(); -+#else - strlcpy(saved_command_line, *cmdline_p, COMMAND_LINE_SIZE); - - efi_init(); - io_port_init(); -+#endif - - #ifdef CONFIG_IA64_GENERIC - { -@@ -336,6 +362,11 @@ - } - #endif - -+#ifdef XEN -+ early_cmdline_parse(cmdline_p); -+ cmdline_parse(*cmdline_p); -+#undef CONFIG_ACPI_BOOT -+#endif - if (early_console_setup(*cmdline_p) == 0) - mark_bsp_online(); - -@@ -351,8 +382,18 @@ - # endif - #endif /* CONFIG_APCI_BOOT */ - -+#ifndef XEN - find_memory(); -+#else -+ io_port_init(); -+} - -+void __init -+late_setup_arch (char **cmdline_p) -+{ -+#undef CONFIG_ACPI_BOOT -+ acpi_table_init(); -+#endif - /* process SAL system table: */ - ia64_sal_init(efi.sal_systab); - -@@ -360,6 +401,10 @@ - cpu_physical_id(0) = hard_smp_processor_id(); - #endif - -+#ifdef CONFIG_VTI -+ identify_vmx_feature(); -+#endif // CONFIG_VTI -+ - cpu_init(); /* initialize the bootstrap CPU */ - - #ifdef CONFIG_ACPI_BOOT -@@ -492,12 +537,14 @@ - { - } - -+#ifndef XEN - struct seq_operations cpuinfo_op = { - .start = c_start, - .next = c_next, - .stop = c_stop, - .show = show_cpuinfo - }; -+#endif - - void - identify_cpu (struct cpuinfo_ia64 *c) -@@ -551,6 +598,12 @@ - } - c->unimpl_va_mask = ~((7L<<61) | ((1L << (impl_va_msb + 1)) - 1)); - c->unimpl_pa_mask = ~((1L<<63) | ((1L << phys_addr_size) - 1)); -+ -+#ifdef CONFIG_VTI -+ /* If vmx feature is on, do necessary initialization for vmx */ -+ if (vmx_enabled) -+ vmx_init_env(); -+#endif - } - - void -@@ -659,7 +712,11 @@ - | IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC)); - atomic_inc(&init_mm.mm_count); - current->active_mm = &init_mm; -+#ifdef XEN -+ if (current->domain->arch.mm) -+#else - if (current->mm) -+#endif - BUG(); - - ia64_mmu_init(ia64_imva(cpu_data)); diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/sn_sal.h --- a/xen/arch/ia64/patch/linux-2.6.11/sn_sal.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,33 +0,0 @@ ---- /data/lwork/attica1/edwardsg/linux-2.6.11/include/asm-ia64/sn/sn_sal.h 2005-03-02 01:38:33 -06:00 -+++ include/asm-ia64/sn/sn_sal.h 2005-06-01 14:31:47 -05:00 -@@ -123,6 +123,7 @@ - #define SALRET_ERROR (-3) - - -+#ifndef XEN - /** - * sn_sal_rev_major - get the major SGI SAL revision number - * -@@ -226,6 +227,7 @@ ia64_sn_get_klconfig_addr(nasid_t nasid) - } - return ret_stuff.v0 ? __va(ret_stuff.v0) : NULL; - } -+#endif /* !XEN */ - - /* - * Returns the next console character. 
-@@ -304,6 +306,7 @@ ia64_sn_console_putb(const char *buf, in - return (u64)0; - } - -+#ifndef XEN - /* - * Print a platform error record - */ -@@ -987,5 +990,5 @@ ia64_sn_hwperf_op(nasid_t nasid, u64 opc - *v0 = (int) rv.v0; - return (int) rv.status; - } -- -+#endif /* !XEN */ - #endif /* _ASM_IA64_SN_SN_SAL_H */ diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/system.h --- a/xen/arch/ia64/patch/linux-2.6.11/system.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,38 +0,0 @@ ---- /home/adsharma/xeno-unstable-ia64-staging.bk/xen/../../linux-2.6.11/include/asm-ia64/system.h 2005-03-01 23:38:07.000000000 -0800 -+++ /home/adsharma/xeno-unstable-ia64-staging.bk/xen/include/asm-ia64/system.h 2005-05-20 09:36:02.000000000 -0700 -@@ -18,14 +18,19 @@ - #include <asm/page.h> - #include <asm/pal.h> - #include <asm/percpu.h> -+#ifdef XEN -+#include <asm/xensystem.h> -+#endif - - #define GATE_ADDR __IA64_UL_CONST(0xa000000000000000) - /* - * 0xa000000000000000+2*PERCPU_PAGE_SIZE - * - 0xa000000000000000+3*PERCPU_PAGE_SIZE remain unmapped (guard page) - */ -+#ifndef XEN - #define KERNEL_START __IA64_UL_CONST(0xa000000100000000) - #define PERCPU_ADDR (-PERCPU_PAGE_SIZE) -+#endif - - #ifndef __ASSEMBLY__ - -@@ -218,6 +223,7 @@ - # define PERFMON_IS_SYSWIDE() (0) - #endif - -+#ifndef XEN - #define IA64_HAS_EXTRA_STATE(t) \ - ((t)->thread.flags & (IA64_THREAD_DBG_VALID|IA64_THREAD_PM_VALID) \ - || IS_IA32_PROCESS(ia64_task_regs(t)) || PERFMON_IS_SYSWIDE()) -@@ -230,6 +236,7 @@ - ia64_psr(ia64_task_regs(next))->dfh = !ia64_is_local_fpu_owner(next); \ - (last) = ia64_switch_to((next)); \ - } while (0) -+#endif - - #ifdef CONFIG_SMP - /* diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/time.c --- a/xen/arch/ia64/patch/linux-2.6.11/time.c Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,56 +0,0 @@ ---- ../../linux-2.6.11/arch/ia64/kernel/time.c 2005-03-02 00:37:50.000000000 -0700 -+++ arch/ia64/time.c 2005-05-02 11:19:29.000000000 -0600 -@@ -29,6 +29,9 @@ - #include <asm/sal.h> - #include <asm/sections.h> - #include <asm/system.h> -+#ifdef XEN -+#include <linux/jiffies.h> // not included by xen/sched.h -+#endif - - extern unsigned long wall_jiffies; - -@@ -45,6 +48,7 @@ - - #endif - -+#ifndef XEN - static struct time_interpolator itc_interpolator = { - .shift = 16, - .mask = 0xffffffffffffffffLL, -@@ -110,6 +114,7 @@ - } while (time_after_eq(ia64_get_itc(), new_itm)); - return IRQ_HANDLED; - } -+#endif - - /* - * Encapsulate access to the itm structure for SMP. 
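
The time.c hunk above stubs out Linux's time interpolator and timer_interrupt under Xen but keeps the cycle-accounting setup, whose arithmetic also appears in the sched_clock hunk of head.S earlier: nsec_per_cyc is a fixed-point nanoseconds-per-cycle ratio computed once from the ITC frequency (note the + itc_freq/2 rounding in the context above), after which conversion is a multiply and a shift. A hedged C sketch; the shift constant is Linux's IA64_NSEC_PER_CYC_SHIFT, assumed to be 30 here, and the real sched_clock does the multiply at full width via xmpy (the "intermediate precision" comment in head.S) where this sketch can truncate:

    #include <stdint.h>

    #define NSEC_PER_CYC_SHIFT 30  /* assumption: Linux's IA64_NSEC_PER_CYC_SHIFT */

    /* Boot time: rounded fixed-point ratio, (1e9 << shift) / itc_freq. */
    static inline uint64_t nsec_per_cyc(uint64_t itc_freq)
    {
        return ((1000000000ULL << NSEC_PER_CYC_SHIFT) + itc_freq / 2) / itc_freq;
    }

    /* Fast path: cycles to nanoseconds (64-bit multiply; may overflow
     * for large itc values, which the assembly avoids with xmpy). */
    static inline uint64_t cycles_to_ns(uint64_t itc, uint64_t ratio)
    {
        return (itc * ratio) >> NSEC_PER_CYC_SHIFT;
    }
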
-@@ -212,6 +217,7 @@ - + itc_freq/2)/itc_freq; - - if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) { -+#ifndef XEN - itc_interpolator.frequency = local_cpu_data->itc_freq; - itc_interpolator.drift = itc_drift; - #ifdef CONFIG_SMP -@@ -228,12 +234,14 @@ - if (!nojitter) itc_interpolator.jitter = 1; - #endif - register_time_interpolator(&itc_interpolator); -+#endif - } - - /* Setup the CPU local timer tick */ - ia64_cpu_local_tick(); - } - -+#ifndef XEN - static struct irqaction timer_irqaction = { - .handler = timer_interrupt, - .flags = SA_INTERRUPT, -@@ -253,3 +261,4 @@ - */ - set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); - } -+#endif diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/tlb.c --- a/xen/arch/ia64/patch/linux-2.6.11/tlb.c Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,38 +0,0 @@ ---- ../../linux-2.6.11/arch/ia64/mm/tlb.c 2005-03-02 00:38:38.000000000 -0700 -+++ arch/ia64/tlb.c 2005-05-02 10:23:09.000000000 -0600 -@@ -43,6 +43,9 @@ - void - wrap_mmu_context (struct mm_struct *mm) - { -+#ifdef XEN -+printf("wrap_mmu_context: called, not implemented\n"); -+#else - unsigned long tsk_context, max_ctx = ia64_ctx.max_ctx; - struct task_struct *tsk; - int i; -@@ -83,6 +86,7 @@ - put_cpu(); - } - local_flush_tlb_all(); -+#endif - } - - void -@@ -132,6 +136,9 @@ - void - flush_tlb_range (struct vm_area_struct *vma, unsigned long start, unsigned long end) - { -+#ifdef XEN -+printf("flush_tlb_range: called, not implemented\n"); -+#else - struct mm_struct *mm = vma->vm_mm; - unsigned long size = end - start; - unsigned long nbits; -@@ -163,6 +170,7 @@ - # endif - - ia64_srlz_i(); /* srlz.i implies srlz.d */ -+#endif - } - EXPORT_SYMBOL(flush_tlb_range); - diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/types.h --- a/xen/arch/ia64/patch/linux-2.6.11/types.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,44 +0,0 @@ ---- ../../linux-2.6.11/include/asm-ia64/types.h 2005-03-04 10:26:30.000000000 -0700 -+++ include/asm-ia64/types.h 2005-04-11 15:23:49.000000000 -0600 -@@ -1,5 +1,12 @@ - #ifndef _ASM_IA64_TYPES_H - #define _ASM_IA64_TYPES_H -+#ifdef XEN -+#ifndef __ASSEMBLY__ -+typedef unsigned long ssize_t; -+typedef unsigned long size_t; -+typedef long long loff_t; -+#endif -+#endif - - /* - * This file is never included by application software unless explicitly requested (e.g., -@@ -61,6 +68,28 @@ - typedef __s64 s64; - typedef __u64 u64; - -+#ifdef XEN -+/* -+ * Below are truly Linux-specific types that should never collide with -+ * any application/library that wants linux/types.h. -+ */ -+ -+#ifdef __CHECKER__ -+#define __bitwise __attribute__((bitwise)) -+#else -+#define __bitwise -+#endif -+ -+typedef __u16 __bitwise __le16; -+typedef __u16 __bitwise __be16; -+typedef __u32 __bitwise __le32; -+typedef __u32 __bitwise __be32; -+#if defined(__GNUC__) && !defined(__STRICT_ANSI__) -+typedef __u64 __bitwise __le64; -+typedef __u64 __bitwise __be64; -+#endif -+#endif -+ - #define BITS_PER_LONG 64 - - /* DMA addresses are 64-bits wide, in general. 
*/ diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/uaccess.h --- a/xen/arch/ia64/patch/linux-2.6.11/uaccess.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,41 +0,0 @@ ---- ../../linux-2.6.11/include/asm-ia64/uaccess.h 2005-03-02 00:37:53.000000000 -0700 -+++ include/asm-ia64/uaccess.h 2005-06-21 21:53:20.000000000 -0600 -@@ -32,6 +32,10 @@ - * David Mosberger-Tang <davidm@xxxxxxxxxx> - */ - -+#ifdef CONFIG_VTI -+#include <asm/vmx_uaccess.h> -+#else // CONFIG_VTI -+ - #include <linux/compiler.h> - #include <linux/errno.h> - #include <linux/sched.h> -@@ -60,6 +64,11 @@ - * address TASK_SIZE is never valid. We also need to make sure that the address doesn't - * point inside the virtually mapped linear page table. - */ -+#ifdef XEN -+/* VT-i reserves bit 60 for the VMM; guest addresses have bit 60 = bit 59 */ -+#define IS_VMM_ADDRESS(addr) ((((addr) >> 60) ^ ((addr) >> 59)) & 1) -+#define __access_ok(addr, size, segment) (!IS_VMM_ADDRESS((unsigned long)(addr))) -+#else - #define __access_ok(addr, size, segment) \ - ({ \ - __chk_user_ptr(addr); \ -@@ -67,6 +76,7 @@ - && ((segment).seg == KERNEL_DS.seg \ - || likely(REGION_OFFSET((unsigned long) (addr)) < RGN_MAP_LIMIT))); \ - }) -+#endif - #define access_ok(type, addr, size) __access_ok((addr), (size), get_fs()) - - static inline int -@@ -343,6 +353,7 @@ - __su_ret; \ - }) - -+#endif // CONFIG_VTI - /* Generic code can't deal with the location-relative format that we use for compactness. */ - #define ARCH_HAS_SORT_EXTABLE - #define ARCH_HAS_SEARCH_EXTABLE diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.11/unaligned.c --- a/xen/arch/ia64/patch/linux-2.6.11/unaligned.c Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,227 +0,0 @@ ---- /home/adsharma/disk2/xen-ia64/xeno-unstable-rebase.bk/xen/../../linux-2.6.11/arch/ia64/kernel/unaligned.c 2005-03-01 23:38:25.000000000 -0800 -+++ /home/adsharma/disk2/xen-ia64/xeno-unstable-rebase.bk/xen/arch/ia64/unaligned.c 2005-05-18 12:40:50.000000000 -0700 -@@ -201,7 +201,11 @@ - - RPT(r1), RPT(r2), RPT(r3), - -+#ifdef CONFIG_VTI -+ RPT(r4), RPT(r5), RPT(r6), RPT(r7), -+#else //CONFIG_VTI - RSW(r4), RSW(r5), RSW(r6), RSW(r7), -+#endif //CONFIG_VTI - - RPT(r8), RPT(r9), RPT(r10), RPT(r11), - RPT(r12), RPT(r13), RPT(r14), RPT(r15), -@@ -291,6 +295,121 @@ - return reg; - } - -+#ifdef CONFIG_VTI -+static void -+set_rse_reg (struct pt_regs *regs, unsigned long r1, unsigned long val, unsigned long nat) -+{ -+ struct switch_stack *sw = (struct switch_stack *) regs - 1; -+ unsigned long *bsp, *bspstore, *addr, *rnat_addr, *ubs_end; -+ unsigned long *kbs = (void *) current + IA64_RBS_OFFSET; -+ unsigned long rnats, nat_mask; -+ unsigned long old_rsc,new_rsc; -+ unsigned long on_kbs,rnat; -+ long sof = (regs->cr_ifs) & 0x7f; -+ long sor = 8 * ((regs->cr_ifs >> 14) & 0xf); -+ long rrb_gr = (regs->cr_ifs >> 18) & 0x7f; -+ long ridx = r1 - 32; -+ -+ if (ridx >= sof) { -+ /* this should never happen, as the "rsvd register fault" has higher priority */ -+ DPRINT("ignoring write to r%lu; only %lu registers are allocated!\n", r1, sof); -+ return; -+ } -+ -+ if (ridx < sor) -+ ridx = rotate_reg(sor, rrb_gr, ridx); -+ -+ old_rsc=ia64_get_rsc(); -+ new_rsc=old_rsc&(~0x3); -+ ia64_set_rsc(new_rsc); -+ -+ bspstore = ia64_get_bspstore(); -+ bsp =kbs + (regs->loadrs >> 19);//16+3 -+ -+ addr = ia64_rse_skip_regs(bsp, -sof + ridx); -+ nat_mask = 1UL << ia64_rse_slot_num(addr); -+ rnat_addr = ia64_rse_rnat_addr(addr); -+ -+ if(addr >= bspstore){ -+ -+ 
ia64_flushrs (); -+ ia64_mf (); -+ *addr = val; -+ bspstore = ia64_get_bspstore(); -+ rnat = ia64_get_rnat (); -+ if(bspstore < rnat_addr){ -+ rnat=rnat&(~nat_mask); -+ }else{ -+ *rnat_addr = (*rnat_addr)&(~nat_mask); -+ } -+ ia64_mf(); -+ ia64_loadrs(); -+ ia64_set_rnat(rnat); -+ }else{ -+ -+ rnat = ia64_get_rnat (); -+ *addr = val; -+ if(bspstore < rnat_addr){ -+ rnat=rnat&(~nat_mask); -+ }else{ -+ *rnat_addr = (*rnat_addr)&(~nat_mask); -+ } -+ ia64_set_bspstore (bspstore); -+ ia64_set_rnat(rnat); -+ } -+ ia64_set_rsc(old_rsc); -+} -+ -+ -+static void -+get_rse_reg (struct pt_regs *regs, unsigned long r1, unsigned long *val, unsigned long *nat) -+{ -+ struct switch_stack *sw = (struct switch_stack *) regs - 1; -+ unsigned long *bsp, *addr, *rnat_addr, *ubs_end, *bspstore; -+ unsigned long *kbs = (void *) current + IA64_RBS_OFFSET; -+ unsigned long rnats, nat_mask; -+ unsigned long on_kbs; -+ unsigned long old_rsc, new_rsc; -+ long sof = (regs->cr_ifs) & 0x7f; -+ long sor = 8 * ((regs->cr_ifs >> 14) & 0xf); -+ long rrb_gr = (regs->cr_ifs >> 18) & 0x7f; -+ long ridx = r1 - 32; -+ -+ if (ridx >= sof) { -+ /* read of out-of-frame register returns an undefined value; 0 in our case. */ -+ DPRINT("ignoring read from r%lu; only %lu registers are allocated!\n", r1, sof); -+ panic("wrong stack register number"); -+ } -+ -+ if (ridx < sor) -+ ridx = rotate_reg(sor, rrb_gr, ridx); -+ -+ old_rsc=ia64_get_rsc(); -+ new_rsc=old_rsc&(~(0x3)); -+ ia64_set_rsc(new_rsc); -+ -+ bspstore = ia64_get_bspstore(); -+ bsp =kbs + (regs->loadrs >> 19); //16+3; -+ -+ addr = ia64_rse_skip_regs(bsp, -sof + ridx); -+ nat_mask = 1UL << ia64_rse_slot_num(addr); -+ rnat_addr = ia64_rse_rnat_addr(addr); -+ -+ if(addr >= bspstore){ -+ -+ ia64_flushrs (); -+ ia64_mf (); -+ bspstore = ia64_get_bspstore(); -+ } -+ *val=*addr; -+ if(bspstore < rnat_addr){ -+ *nat=!!(ia64_get_rnat()&nat_mask); -+ }else{ -+ *nat = !!((*rnat_addr)&nat_mask); -+ } -+ ia64_set_rsc(old_rsc); -+} -+#else // CONFIG_VTI - static void - set_rse_reg (struct pt_regs *regs, unsigned long r1, unsigned long val, int nat) - { -@@ -435,9 +554,14 @@ - *nat = 0; - return; - } -+#endif // CONFIG_VTI - - -+#ifdef XEN -+void -+#else - static void -+#endif - setreg (unsigned long regnum, unsigned long val, int nat, struct pt_regs *regs) - { - struct switch_stack *sw = (struct switch_stack *) regs - 1; -@@ -466,7 +590,11 @@ - unat = &sw->ar_unat; - } else { - addr = (unsigned long)regs; -+#ifdef CONFIG_VTI -+ unat = ®s->eml_unat; -+#else //CONFIG_VTI - unat = &sw->caller_unat; -+#endif //CONFIG_VTI - } - DPRINT("tmp_base=%lx switch_stack=%s offset=%d\n", - addr, unat==&sw->ar_unat ? "yes":"no", GR_OFFS(regnum)); -@@ -522,7 +650,11 @@ - */ - if (regnum >= IA64_FIRST_ROTATING_FR) { - ia64_sync_fph(current); -+#ifdef XEN -+ current->arch._thread.fph[fph_index(regs, regnum)] = *fpval; -+#else - current->thread.fph[fph_index(regs, regnum)] = *fpval; -+#endif - } else { - /* - * pt_regs or switch_stack ? -@@ -581,7 +713,11 @@ - */ - if (regnum >= IA64_FIRST_ROTATING_FR) { - ia64_flush_fph(current); -+#ifdef XEN -+ *fpval = current->arch._thread.fph[fph_index(regs, regnum)]; -+#else - *fpval = current->thread.fph[fph_index(regs, regnum)]; -+#endif - } else { - /* - * f0 = 0.0, f1= 1.0. 
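
The CONFIG_VTI set_rse_reg and get_rse_reg bodies above index the register backing store directly and so lean on the standard ia64 RSE helpers: the backing store interleaves one RNAT collection word after every 63 register slots, so a word's slot number is address bits 3..8 and the covering RNAT word sits in the last slot of the same 512-byte block. For reference, a sketch of those helpers in their conventional Linux form (treat the exact bodies as assumptions; they are not part of this patch):

    /* Slot number of a backing-store word: bits 3..8 of its address. */
    static inline unsigned long ia64_rse_slot_num(unsigned long *addr)
    {
        return (((unsigned long)addr) >> 3) & 0x3f;
    }

    /* Address of the RNAT collection covering addr: slot 63 of its block. */
    static inline unsigned long *ia64_rse_rnat_addr(unsigned long *addr)
    {
        return (unsigned long *)((unsigned long)addr | (0x3fUL << 3));
    }

    /* Step num_regs slots forward or back, skipping RNAT words. */
    static inline unsigned long *ia64_rse_skip_regs(unsigned long *addr,
                                                    long num_regs)
    {
        long delta = ia64_rse_slot_num(addr) + num_regs;
        if (num_regs < 0)
            delta -= 0x3e;
        return addr + num_regs + delta / 0x3f;
    }
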
Those registers are constant and are thus -@@ -611,7 +747,11 @@ - } - - -+#ifdef XEN -+void -+#else - static void -+#endif - getreg (unsigned long regnum, unsigned long *val, int *nat, struct pt_regs *regs) - { - struct switch_stack *sw = (struct switch_stack *) regs - 1; -@@ -640,7 +780,11 @@ - unat = &sw->ar_unat; - } else { - addr = (unsigned long)regs; -+#ifdef CONFIG_VTI -+ unat = ®s->eml_unat;; -+#else //CONFIG_VTI - unat = &sw->caller_unat; -+#endif //CONFIG_VTI - } - - DPRINT("addr_base=%lx offset=0x%x\n", addr, GR_OFFS(regnum)); -@@ -1294,6 +1438,9 @@ - void - ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs) - { -+#ifdef XEN -+printk("ia64_handle_unaligned: called, not working yet\n"); -+#else - struct ia64_psr *ipsr = ia64_psr(regs); - mm_segment_t old_fs = get_fs(); - unsigned long bundle[2]; -@@ -1502,4 +1649,5 @@ - si.si_imm = 0; - force_sig_info(SIGBUS, &si, current); - goto done; -+#endif - } diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/bootmem.h --- a/xen/arch/ia64/patch/linux-2.6.7/bootmem.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,12 +0,0 @@ ---- /home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/include/linux/bootmem.h 2004-06-15 23:19:52.000000000 -0600 -+++ /home/djm/src/xen/xeno-ia64.bk/xen/include/asm-ia64/linux/bootmem.h 2004-08-25 19:28:13.000000000 -0600 -@@ -41,7 +41,9 @@ - extern void __init free_bootmem (unsigned long addr, unsigned long size); - extern void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal); - #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE -+#ifndef XEN - extern void __init reserve_bootmem (unsigned long addr, unsigned long size); -+#endif - #define alloc_bootmem(x) \ - __alloc_bootmem((x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) - #define alloc_bootmem_low(x) \ diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/current.h --- a/xen/arch/ia64/patch/linux-2.6.7/current.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,17 +0,0 @@ ---- /home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/include/asm-ia64/current.h 2004-06-15 23:19:52.000000000 -0600 -+++ /home/djm/src/xen/xeno-ia64.bk/xen/include/asm-ia64/current.h 2004-08-25 19:28:12.000000000 -0600 -@@ -12,6 +12,14 @@ - * In kernel mode, thread pointer (r13) is used to point to the current task - * structure. 
- */ -+#ifdef XEN -+struct domain; -+#define get_current() ((struct vcpu *) ia64_getreg(_IA64_REG_TP)) -+#define current get_current() -+//#define set_current(d) ia64_setreg(_IA64_REG_TP,(void *)d); -+#define set_current(d) (ia64_r13 = (void *)d) -+#else - #define current ((struct task_struct *) ia64_getreg(_IA64_REG_TP)) -+#endif - - #endif /* _ASM_IA64_CURRENT_H */ diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/efi.c --- a/xen/arch/ia64/patch/linux-2.6.7/efi.c Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,85 +0,0 @@ ---- /home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/arch/ia64/kernel/efi.c 2004-06-15 23:18:55.000000000 -0600 -+++ /home/djm/src/xen/xeno-ia64.bk/xen/arch/ia64/efi.c 2004-12-17 13:47:03.000000000 -0700 -@@ -25,6 +25,9 @@ - #include <linux/types.h> - #include <linux/time.h> - #include <linux/efi.h> -+#ifdef XEN -+#include <xen/sched.h> -+#endif - - #include <asm/io.h> - #include <asm/kregs.h> -@@ -49,7 +52,10 @@ - { \ - struct ia64_fpreg fr[6]; \ - efi_status_t ret; \ -+ efi_time_cap_t *atc = NULL; \ - \ -+ if (tc) \ -+ atc = adjust_arg(tc); \ - ia64_save_scratch_fpregs(fr); \ - ret = efi_call_##prefix((efi_get_time_t *) __va(runtime->get_time), adjust_arg(tm), \ - adjust_arg(tc)); \ -@@ -201,6 +207,7 @@ - if ((*efi.get_time)(&tm, 0) != EFI_SUCCESS) - return; - -+ dummy(); - ts->tv_sec = mktime(tm.year, tm.month, tm.day, tm.hour, tm.minute, tm.second); - ts->tv_nsec = tm.nanosecond; - } -@@ -303,6 +310,10 @@ - if (!(md->attribute & EFI_MEMORY_WB)) - continue; - -+#ifdef XEN -+// this is a temporary hack to avoid CONFIG_VIRTUAL_MEM_MAP -+ if (md->phys_addr >= 0x100000000) continue; -+#endif - /* - * granule_addr is the base of md's first granule. - * [granule_addr - first_non_wb_addr) is guaranteed to -@@ -456,9 +467,11 @@ - - cpu = smp_processor_id(); - -+#ifndef XEN - /* insert this TR into our list for MCA recovery purposes */ - ia64_mca_tlb_list[cpu].pal_base = vaddr & mask; - ia64_mca_tlb_list[cpu].pal_paddr = pte_val(mk_pte_phys(md->phys_addr, PAGE_KERNEL)); -+#endif - } - } - -@@ -680,6 +693,30 @@ - return 0; - } - -+#ifdef XEN -+// variation of efi_get_iobase which returns entire memory descriptor -+efi_memory_desc_t * -+efi_get_io_md (void) -+{ -+ void *efi_map_start, *efi_map_end, *p; -+ efi_memory_desc_t *md; -+ u64 efi_desc_size; -+ -+ efi_map_start = __va(ia64_boot_param->efi_memmap); -+ efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; -+ efi_desc_size = ia64_boot_param->efi_memdesc_size; -+ -+ for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { -+ md = p; -+ if (md->type == EFI_MEMORY_MAPPED_IO_PORT_SPACE) { -+ if (md->attribute & EFI_MEMORY_UC) -+ return md; -+ } -+ } -+ return 0; -+} -+#endif -+ - u32 - efi_mem_type (unsigned long phys_addr) - { diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/efi.h --- a/xen/arch/ia64/patch/linux-2.6.7/efi.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,13 +0,0 @@ ---- /home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/include/linux/efi.h 2004-06-15 23:20:03.000000000 -0600 -+++ /home/djm/src/xen/xeno-ia64.bk/xen/include/asm-ia64/linux/efi.h 2004-08-25 19:28:13.000000000 -0600 -@@ -15,8 +15,10 @@ - #include <linux/string.h> - #include <linux/time.h> - #include <linux/types.h> -+#ifndef XEN - #include <linux/proc_fs.h> - #include <linux/rtc.h> -+#endif - #include <linux/ioport.h> - - #include <asm/page.h> diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/entry.S --- 
a/xen/arch/ia64/patch/linux-2.6.7/entry.S Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,195 +0,0 @@ ---- ../../linux-2.6.7/arch/ia64/kernel/entry.S 2005-03-24 19:39:56.000000000 -0700 -+++ arch/ia64/entry.S 2005-04-01 12:56:01.000000000 -0700 -@@ -35,7 +35,9 @@ - - #include <asm/asmmacro.h> - #include <asm/cache.h> -+#ifndef XEN - #include <asm/errno.h> -+#endif - #include <asm/kregs.h> - #include <asm/offsets.h> - #include <asm/pgtable.h> -@@ -46,6 +48,23 @@ - - #include "minstate.h" - -+#ifdef XEN -+#define sys_execve 0 -+#define do_fork 0 -+#define syscall_trace 0 -+#define schedule 0 -+#define do_notify_resume_user 0 -+#define ia64_rt_sigsuspend 0 -+#define ia64_rt_sigreturn 0 -+#define ia64_handle_unaligned 0 -+#define errno 0 -+#define sys_ni_syscall 0 -+#define unw_init_frame_info 0 -+#define sys_call_table 0 -+#endif -+ -+ /* -+ - /* - * execve() is special because in case of success, we need to - * setup a null register window frame. -@@ -178,11 +197,14 @@ - DO_SAVE_SWITCH_STACK - .body - -+#ifdef XEN -+//#undef IA64_TASK_THREAD_KSP_OFFSET -+//#define IA64_TASK_THREAD_KSP_OFFSET 0x38 - adds r22=IA64_TASK_THREAD_KSP_OFFSET,r13 - movl r25=init_task - mov r27=IA64_KR(CURRENT_STACK) - adds r21=IA64_TASK_THREAD_KSP_OFFSET,in0 -- dep r20=0,in0,61,3 // physical address of "current" -+ dep r20=0,in0,60,4 // physical address of "current" - ;; - st8 [r22]=sp // save kernel stack pointer of old task - shr.u r26=r20,IA64_GRANULE_SHIFT -@@ -194,6 +216,22 @@ - (p6) cmp.eq p7,p6=r26,r27 - (p6) br.cond.dpnt .map - ;; -+#else -+ adds r22=IA64_TASK_THREAD_KSP_OFFSET,r13 -+ mov r27=IA64_KR(CURRENT_STACK) -+ dep r20=0,in0,61,3 // physical address of "current" -+ ;; -+ st8 [r22]=sp // save kernel stack pointer of old task -+ shr.u r26=r20,IA64_GRANULE_SHIFT -+ adds r21=IA64_TASK_THREAD_KSP_OFFSET,in0 -+ ;; -+ /* -+ * If we've already mapped this task's page, we can skip doing it again. -+ */ -+ cmp.eq p7,p6=r26,r27 -+(p6) br.cond.dpnt .map -+ ;; -+#endif - .done: - (p6) ssm psr.ic // if we we had to map, renable the psr.ic bit FIRST!!! - ;; -@@ -211,6 +249,16 @@ - br.ret.sptk.many rp // boogie on out in new context - - .map: -+#ifdef XEN -+ // avoid overlapping with kernel TR -+ movl r25=KERNEL_START -+ dep r23=0,in0,0,KERNEL_TR_PAGE_SHIFT -+ ;; -+ cmp.eq p7,p0=r25,r23 -+ ;; -+(p7) mov IA64_KR(CURRENT_STACK)=r26 // remember last page we mapped... 
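
The .map stanza added above makes the 2.6.7 context-switch path skip inserting a new translation register when the incoming task's stack already lies in the granule pinned by the kernel's own TR; installing an overlapping TR can raise a machine check. A simplified C rendition of the predicate, with both constants as illustrative assumptions (Xen's actual KERNEL_START comes from xensystem.h, which this patch does not show):

    #include <stdint.h>

    #define KERNEL_TR_PAGE_SHIFT 26            /* assumed: 64MB kernel TR page */
    #define KERNEL_START 0xf000000004000000UL  /* illustrative value only */

    /* Mirrors: dep r23=0,in0,0,KERNEL_TR_PAGE_SHIFT; cmp.eq p7,p0=r25,r23 */
    static inline int stack_covered_by_kernel_tr(uint64_t next_task)
    {
        uint64_t mask = ~((1UL << KERNEL_TR_PAGE_SHIFT) - 1);
        return (next_task & mask) == KERNEL_START;  /* same pinned granule? */
    }
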
-+(p7) br.cond.sptk .done -+#endif - rsm psr.ic // interrupts (psr.i) are already disabled here - movl r25=PAGE_KERNEL - ;; -@@ -367,7 +415,11 @@ - * - b7 holds address to return to - * - must not touch r8-r11 - */ -+#ifdef XEN -+GLOBAL_ENTRY(load_switch_stack) -+#else - ENTRY(load_switch_stack) -+#endif - .prologue - .altrp b7 - -@@ -595,6 +647,11 @@ - */ - br.call.sptk.many rp=ia64_invoke_schedule_tail - } -+#ifdef XEN -+ // new domains are cloned but not exec'ed so switch to user mode here -+ cmp.ne pKStk,pUStk=r0,r0 -+ br.cond.spnt ia64_leave_kernel -+#else - .ret8: - adds r2=TI_FLAGS+IA64_TASK_SIZE,r13 - ;; -@@ -603,6 +660,7 @@ - mov r8=0 - tbit.nz p6,p0=r2,TIF_SYSCALL_TRACE - (p6) br.cond.spnt .strace_check_retval -+#endif - ;; // added stop bits to prevent r8 dependency - END(ia64_ret_from_clone) - // fall through -@@ -684,9 +742,14 @@ - #endif /* CONFIG_PREEMPT */ - adds r16=PT(LOADRS)+16,r12 - adds r17=PT(AR_BSPSTORE)+16,r12 -+#ifdef XEN -+ mov r31=r0 -+ ;; -+#else - adds r18=TI_FLAGS+IA64_TASK_SIZE,r13 - ;; - (p6) ld4 r31=[r18] // load current_thread_info()->flags -+#endif - ld8 r19=[r16],PT(B6)-PT(LOADRS) // load ar.rsc value for "loadrs" - nop.i 0 - ;; -@@ -745,7 +808,11 @@ - mov b7=r0 // clear b7 - ;; - (pUStk) st1 [r14]=r3 -+#ifdef XEN -+ movl r17=THIS_CPU(ia64_phys_stacked_size_p8) -+#else - addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0 -+#endif - ;; - mov r16=ar.bsp // get existing backing store pointer - srlz.i // ensure interruption collection is off -@@ -796,9 +863,18 @@ - ;; - (p6) cmp.eq.unc p6,p0=r21,r0 // p6 <- p6 && (r21 == 0) - #endif /* CONFIG_PREEMPT */ -+#ifdef XEN -+ alloc loc0=ar.pfs,0,1,1,0 -+ adds out0=16,r12 -+ ;; -+(p6) br.call.sptk.many b0=deliver_pending_interrupt -+ mov ar.pfs=loc0 -+ mov r31=r0 -+#else - adds r17=TI_FLAGS+IA64_TASK_SIZE,r13 - ;; - (p6) ld4 r31=[r17] // load current_thread_info()->flags -+#endif - adds r21=PT(PR)+16,r12 - ;; - -@@ -912,7 +988,11 @@ - shr.u r18=r19,16 // get byte size of existing "dirty" partition - ;; - mov r16=ar.bsp // get existing backing store pointer -+#ifdef XEN -+ movl r17=THIS_CPU(ia64_phys_stacked_size_p8) -+#else - addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0 -+#endif - ;; - ld4 r17=[r17] // r17 = cpu_data->phys_stacked_size_p8 - (pKStk) br.cond.dpnt skip_rbs_switch -@@ -1264,6 +1344,7 @@ - br.ret.sptk.many rp - END(unw_init_running) - -+#ifndef XEN - .rodata - .align 8 - .globl sys_call_table -@@ -1526,3 +1607,4 @@ - data8 sys_ni_syscall - - .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls -+#endif diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/gcc_intrin.h --- a/xen/arch/ia64/patch/linux-2.6.7/gcc_intrin.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,20 +0,0 @@ ---- /home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/include/asm-ia64/gcc_intrin.h 2005-01-23 13:23:36.000000000 -0700 -+++ /home/djm/src/xen/xeno-ia64.bk/xen/include/asm-ia64/gcc_intrin.h 2004-08-25 19:28:13.000000000 -0600 -@@ -92,6 +92,9 @@ - - #define ia64_hint_pause 0 - -+#ifdef XEN -+#define ia64_hint(mode) 0 -+#else - #define ia64_hint(mode) \ - ({ \ - switch (mode) { \ -@@ -100,6 +103,7 @@ - break; \ - } \ - }) -+#endif - - - /* Integer values for mux1 instruction */ diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/hardirq.h --- a/xen/arch/ia64/patch/linux-2.6.7/hardirq.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,22 +0,0 @@ ---- /home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/include/asm-ia64/hardirq.h 
2004-06-15 23:19:02.000000000 -0600 -+++ /home/djm/src/xen/xeno-ia64.bk/xen/include/asm-ia64/hardirq.h 2004-12-17 13:47:03.000000000 -0700 -@@ -81,10 +81,19 @@ - */ - #define in_irq() (hardirq_count()) - #define in_softirq() (softirq_count()) -+#ifdef XEN - #define in_interrupt() (irq_count()) -+#else -+#define in_interrupt() 0 // FIXME LATER -+#endif - -+#ifdef XEN -+#define hardirq_trylock(cpu) (!in_interrupt()) -+#define hardirq_endlock(cpu) do { } while (0) -+#else - #define hardirq_trylock() (!in_interrupt()) - #define hardirq_endlock() do { } while (0) -+#endif - - #ifdef CONFIG_PREEMPT - # include <linux/smp_lock.h> diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/head.S --- a/xen/arch/ia64/patch/linux-2.6.7/head.S Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,93 +0,0 @@ ---- ../../linux-2.6.7/arch/ia64/kernel/head.S 2005-03-24 19:39:56.000000000 -0700 -+++ arch/ia64/head.S 2005-04-01 12:56:01.000000000 -0700 -@@ -1,3 +1,8 @@ -+#ifdef XEN -+#define console_print printf -+#define kernel_thread_helper 0 -+#define sys_exit 0 -+#endif - /* - * Here is where the ball gets rolling as far as the kernel is concerned. - * When control is transferred to _start, the bootload has already -@@ -166,7 +171,11 @@ - dep r18=0,r3,0,12 - ;; - or r18=r17,r18 -+#ifdef XEN -+ dep r2=-1,r3,60,4 // IMVA of task -+#else - dep r2=-1,r3,61,3 // IMVA of task -+#endif - ;; - mov r17=rr[r2] - ;; -@@ -205,7 +214,11 @@ - ;; - mov ar.rsc=0x3 // place RSE in eager mode - -+#ifdef XEN -+(isBP) dep r28=-1,r28,60,4 // make address virtual -+#else - (isBP) dep r28=-1,r28,61,3 // make address virtual -+#endif - (isBP) movl r2=ia64_boot_param - ;; - (isBP) st8 [r2]=r28 // save the address of the boot param area passed by the bootloader -@@ -238,14 +251,30 @@ - br.call.sptk.many rp=sys_fw_init - .ret1: - #endif -+#ifdef XEN -+ alloc r2=ar.pfs,8,0,2,0 -+ ;; -+#define fake_mbi_magic 0 -+#define MULTIBOOT_INFO_SIZE 1024 -+ .rodata -+fake_mbi: -+ .skip MULTIBOOT_INFO_SIZE -+ .previous -+ movl out0=fake_mbi -+ ;; -+ br.call.sptk.many rp=cmain -+#else - br.call.sptk.many rp=start_kernel -+#endif - .ret2: addl r3=@ltoff(halt_msg),gp - ;; - alloc r2=ar.pfs,8,0,2,0 - ;; - ld8 out0=[r3] - br.call.sptk.many b0=console_print -+ ;; - self: br.sptk.many self // endless loop -+ ;; - END(_start) - - GLOBAL_ENTRY(ia64_save_debug_regs) -@@ -781,8 +810,13 @@ - movl r18=KERNEL_START - dep r3=0,r3,KERNEL_TR_PAGE_SHIFT,64-KERNEL_TR_PAGE_SHIFT - dep r14=0,r14,KERNEL_TR_PAGE_SHIFT,64-KERNEL_TR_PAGE_SHIFT -+#ifdef XEN -+ dep r17=-1,r17,60,4 -+ dep sp=-1,sp,60,4 -+#else - dep r17=-1,r17,61,3 - dep sp=-1,sp,61,3 -+#endif - ;; - or r3=r3,r18 - or r14=r14,r18 -@@ -838,7 +872,12 @@ - * intermediate precision so that we can produce a full 64-bit result. 
- */ - GLOBAL_ENTRY(sched_clock) -+#ifdef XEN -+ break 0;; // FIX IA64_CPUINFO_NSEC_PER_CYC_OFFSET -+ //movl r8=THIS_CPU(cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET -+#else - addl r8=THIS_CPU(cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0 -+#endif - mov.m r9=ar.itc // fetch cycle-counter (35 cyc) - ;; - ldf8 f8=[r8] diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/hpsim_irq.c --- a/xen/arch/ia64/patch/linux-2.6.7/hpsim_irq.c Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,36 +0,0 @@ ---- /home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/arch/ia64/hp/sim/hpsim_irq.c 2004-06-15 23:20:26.000000000 -0600 -+++ /home/djm/src/xen/xeno-ia64.bk/xen/arch/ia64/hpsim_irq.c 2004-11-01 17:54:15.000000000 -0700 -@@ -9,7 +9,17 @@ - #include <linux/kernel.h> - #include <linux/sched.h> - #include <linux/irq.h> -+#ifdef XEN -+#include <asm/hw_irq.h> -+#endif - -+#if 1 -+void __init -+hpsim_irq_init (void) -+{ -+ printf("*** hpsim_irq_init called: NOT NEEDED?!?!?\n"); -+} -+#else - static unsigned int - hpsim_irq_startup (unsigned int irq) - { -@@ -19,6 +29,10 @@ - static void - hpsim_irq_noop (unsigned int irq) - { -+#if 1 -+printf("hpsim_irq_noop: irq=%d\n",irq); -+while(irq); -+#endif - } - - static struct hw_interrupt_type irq_type_hp_sim = { -@@ -44,3 +58,4 @@ - idesc->handler = &irq_type_hp_sim; - } - } -+#endif diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/hpsim_ssc.h --- a/xen/arch/ia64/patch/linux-2.6.7/hpsim_ssc.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,26 +0,0 @@ ---- /home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/arch/ia64/hp/sim/hpsim_ssc.h 2004-06-15 23:19:43.000000000 -0600 -+++ /home/djm/src/xen/xeno-ia64.bk/xen/include/asm-ia64/hpsim_ssc.h 2004-08-29 01:04:23.000000000 -0600 -@@ -33,4 +33,23 @@ - */ - extern long ia64_ssc (long arg0, long arg1, long arg2, long arg3, int nr); - -+#ifdef XEN -+/* Note: These are declared in linux/arch/ia64/hp/sim/simscsi.c but belong -+ * in linux/include/asm-ia64/hpsim_ssc.h, hence their addition here */ -+#define SSC_OPEN 50 -+#define SSC_CLOSE 51 -+#define SSC_READ 52 -+#define SSC_WRITE 53 -+#define SSC_GET_COMPLETION 54 -+#define SSC_WAIT_COMPLETION 55 -+ -+#define SSC_WRITE_ACCESS 2 -+#define SSC_READ_ACCESS 1 -+ -+struct ssc_disk_req { -+ unsigned long addr; -+ unsigned long len; -+}; -+#endif -+ - #endif /* _IA64_PLATFORM_HPSIM_SSC_H */ diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/hw_irq.h --- a/xen/arch/ia64/patch/linux-2.6.7/hw_irq.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,24 +0,0 @@ ---- /home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/include/asm-ia64/hw_irq.h 2004-06-15 23:19:22.000000000 -0600 -+++ /home/djm/src/xen/xeno-ia64.bk/xen/include/asm-ia64/hw_irq.h 2004-08-27 09:07:38.000000000 -0600 -@@ -9,7 +9,9 @@ - #include <linux/interrupt.h> - #include <linux/sched.h> - #include <linux/types.h> -+#ifndef XEN - #include <linux/profile.h> -+#endif - - #include <asm/machvec.h> - #include <asm/ptrace.h> -@@ -96,7 +98,11 @@ - * Default implementations for the irq-descriptor API: - */ - -+#ifdef XEN -+#define _irq_desc irq_desc -+#else - extern irq_desc_t _irq_desc[NR_IRQS]; -+#endif - - #ifndef CONFIG_IA64_GENERIC - static inline irq_desc_t * diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/ide.h --- a/xen/arch/ia64/patch/linux-2.6.7/ide.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,35 +0,0 @@ ---- 
/home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/include/asm-ia64/ide.h 2004-06-15 23:19:36.000000000 -0600 -+++ /home/djm/src/xen/xeno-ia64.bk/xen/include/asm-ia64/ide.h 2004-08-25 19:28:13.000000000 -0600 -@@ -64,6 +64,32 @@ - #define ide_init_default_irq(base) ide_default_irq(base) - #endif - -+#ifdef XEN -+// this is moved to linux/ide.h in newer versions of linux -+typedef union { -+ unsigned all : 8; /* all of the bits together */ -+ struct { -+ unsigned head : 4; /* always zeros here */ -+ unsigned unit : 1; /* drive select number, 0 or 1 */ -+ unsigned bit5 : 1; /* always 1 */ -+ unsigned lba : 1; /* using LBA instead of CHS */ -+ unsigned bit7 : 1; /* always 1 */ -+ } b; -+} select_t; -+ -+typedef union { -+ unsigned all : 8; /* all of the bits together */ -+ struct { -+ unsigned bit0 : 1; -+ unsigned nIEN : 1; /* device INTRQ to host */ -+ unsigned SRST : 1; /* host soft reset bit */ -+ unsigned bit3 : 1; /* ATA-2 thingy */ -+ unsigned reserved456 : 3; -+ unsigned HOB : 1; /* 48-bit address ordering */ -+ } b; -+} control_t; -+#endif -+ - #include <asm-generic/ide_iops.h> - - #endif /* __KERNEL__ */ diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/init_task.c --- a/xen/arch/ia64/patch/linux-2.6.7/init_task.c Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,35 +0,0 @@ ---- /home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/arch/ia64/kernel/init_task.c 2004-06-15 23:20:26.000000000 -0600 -+++ /home/djm/src/xen/xeno-ia64.bk/xen/arch/ia64/init_task.c 2004-08-27 00:06:35.000000000 -0600 -@@ -15,10 +15,12 @@ - #include <asm/uaccess.h> - #include <asm/pgtable.h> - -+#ifndef XEN - static struct fs_struct init_fs = INIT_FS; - static struct files_struct init_files = INIT_FILES; - static struct signal_struct init_signals = INIT_SIGNALS(init_signals); - static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -+#endif - struct mm_struct init_mm = INIT_MM(init_mm); - - EXPORT_SYMBOL(init_mm); -@@ -33,13 +35,19 @@ - - union { - struct { -+#ifdef XEN -+ struct domain task; -+#else - struct task_struct task; - struct thread_info thread_info; -+#endif - } s; - unsigned long stack[KERNEL_STACK_SIZE/sizeof (unsigned long)]; - } init_task_mem asm ("init_task") __attribute__((section(".data.init_task"))) = {{ - .task = INIT_TASK(init_task_mem.s.task), -+#ifndef XEN - .thread_info = INIT_THREAD_INFO(init_task_mem.s.task) -+#endif - }}; - - EXPORT_SYMBOL(init_task); diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/init_task.h --- a/xen/arch/ia64/patch/linux-2.6.7/init_task.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,53 +0,0 @@ ---- /home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/include/linux/init_task.h 2004-06-15 23:18:57.000000000 -0600 -+++ /home/djm/src/xen/xeno-ia64.bk/xen/include/asm-ia64/linux/init_task.h 2004-11-15 17:06:20.000000000 -0700 -@@ -31,6 +31,18 @@ - .max_reqs = ~0U, \ - } - -+#ifdef XEN -+#define INIT_MM(name) \ -+{ \ -+ .mm_rb = RB_ROOT, \ -+ .pgd = swapper_pg_dir, \ -+ .mm_users = ATOMIC_INIT(2), \ -+ .mm_count = ATOMIC_INIT(1), \ -+ .page_table_lock = SPIN_LOCK_UNLOCKED, \ -+ .mmlist = LIST_HEAD_INIT(name.mmlist), \ -+ .cpu_vm_mask = CPU_MASK_ALL, \ -+} -+#else - #define INIT_MM(name) \ - { \ - .mm_rb = RB_ROOT, \ -@@ -43,6 +55,7 @@ - .cpu_vm_mask = CPU_MASK_ALL, \ - .default_kioctx = INIT_KIOCTX(name.default_kioctx, name), \ - } -+#endif - - #define INIT_SIGNALS(sig) { \ - .count = ATOMIC_INIT(1), \ -@@ -64,6 +77,15 @@ - * INIT_TASK is used to set up the first task table, touch at - 
* your own risk!. Base=0, limit=0x1fffff (=2MB) - */ -+#ifdef XEN -+#define INIT_TASK(tsk) \ -+{ \ -+ /*processor: 0,*/ \ -+ /*domain_id: IDLE_DOMAIN_ID,*/ \ -+ /*domain_flags: DOMF_idle_domain,*/ \ -+ refcnt: ATOMIC_INIT(1) \ -+} -+#else - #define INIT_TASK(tsk) \ - { \ - .state = 0, \ -@@ -113,6 +135,7 @@ - .switch_lock = SPIN_LOCK_UNLOCKED, \ - .journal_info = NULL, \ - } -+#endif - - - diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/interrupt.h --- a/xen/arch/ia64/patch/linux-2.6.7/interrupt.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,18 +0,0 @@ ---- /home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/include/linux/interrupt.h 2004-06-15 23:19:29.000000000 -0600 -+++ /home/djm/src/xen/xeno-ia64.bk/xen/include/asm-ia64/linux/interrupt.h 2004-08-25 19:28:13.000000000 -0600 -@@ -32,6 +32,7 @@ - #define IRQ_HANDLED (1) - #define IRQ_RETVAL(x) ((x) != 0) - -+#ifndef XEN - struct irqaction { - irqreturn_t (*handler)(int, void *, struct pt_regs *); - unsigned long flags; -@@ -46,6 +47,7 @@ - irqreturn_t (*handler)(int, void *, struct pt_regs *), - unsigned long, const char *, void *); - extern void free_irq(unsigned int, void *); -+#endif - - /* - * Temporary defines for UP kernels, until all code gets fixed. diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/io.h --- a/xen/arch/ia64/patch/linux-2.6.7/io.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,14 +0,0 @@ ---- /home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/include/asm-ia64/io.h 2004-06-15 23:18:57.000000000 -0600 -+++ /home/djm/src/xen/xeno-ia64.bk/xen/include/asm-ia64/io.h 2004-11-05 16:53:36.000000000 -0700 -@@ -23,7 +23,11 @@ - #define __SLOW_DOWN_IO do { } while (0) - #define SLOW_DOWN_IO do { } while (0) - -+#ifdef XEN -+#define __IA64_UNCACHED_OFFSET 0xdffc000000000000 /* region 6 */ -+#else - #define __IA64_UNCACHED_OFFSET 0xc000000000000000 /* region 6 */ -+#endif - - /* - * The legacy I/O space defined by the ia64 architecture supports only 65536 ports, but diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/irq.h --- a/xen/arch/ia64/patch/linux-2.6.7/irq.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,18 +0,0 @@ ---- /home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/include/asm-ia64/irq.h 2005-01-23 13:23:36.000000000 -0700 -+++ /home/djm/src/xen/xeno-ia64.bk/xen/include/asm-ia64/irq.h 2004-08-25 19:28:13.000000000 -0600 -@@ -30,6 +30,15 @@ - extern void enable_irq (unsigned int); - extern void set_irq_affinity_info (unsigned int irq, int dest, int redir); - -+#ifdef XEN -+// dup'ed from signal.h to avoid changes to includes -+#define SA_NOPROFILE 0x02000000 -+#define SA_SHIRQ 0x04000000 -+#define SA_RESTART 0x10000000 -+#define SA_INTERRUPT 0x20000000 -+#define SA_SAMPLE_RANDOM SA_RESTART -+#endif -+ - #ifdef CONFIG_SMP - extern void move_irq(int irq); - #else diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/irq_ia64.c --- a/xen/arch/ia64/patch/linux-2.6.7/irq_ia64.c Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,82 +0,0 @@ ---- /home/djm/linux-2.6.7/arch/ia64/kernel/irq_ia64.c 2004-06-15 23:19:13.000000000 -0600 -+++ arch/ia64/irq_ia64.c 2005-02-17 13:17:16.000000000 -0700 -@@ -17,18 +17,26 @@ - #include <linux/config.h> - #include <linux/module.h> - -+#ifndef XEN - #include <linux/jiffies.h> -+#endif - #include <linux/errno.h> - #include <linux/init.h> - #include <linux/interrupt.h> - #include <linux/ioport.h> -+#ifndef XEN - #include <linux/kernel_stat.h> 
-+#endif - #include <linux/slab.h> -+#ifndef XEN - #include <linux/ptrace.h> - #include <linux/random.h> /* for rand_initialize_irq() */ - #include <linux/signal.h> -+#endif - #include <linux/smp.h> -+#ifndef XEN - #include <linux/smp_lock.h> -+#endif - #include <linux/threads.h> - - #include <asm/bitops.h> -@@ -101,6 +109,24 @@ - ia64_handle_irq (ia64_vector vector, struct pt_regs *regs) - { - unsigned long saved_tpr; -+#if 0 -+//FIXME: For debug only, can be removed -+ static char firstirq = 1; -+ static char firsttime[256]; -+ static char firstpend[256]; -+ if (firstirq) { -+ int i; -+ for (i=0;i<256;i++) firsttime[i] = 1; -+ for (i=0;i<256;i++) firstpend[i] = 1; -+ firstirq = 0; -+ } -+ if (firsttime[vector]) { -+ printf("**** (entry) First received int on vector=%d,itc=%lx\n", -+ (unsigned long) vector, ia64_get_itc()); -+ firsttime[vector] = 0; -+ } -+#endif -+ - - #if IRQ_DEBUG - { -@@ -145,6 +171,27 @@ - ia64_setreg(_IA64_REG_CR_TPR, vector); - ia64_srlz_d(); - -+#ifdef XEN -+ if (vector != 0xef) { -+ extern void vcpu_pend_interrupt(void *, int); -+#if 0 -+ if (firsttime[vector]) { -+ printf("**** (iterate) First received int on vector=%d,itc=%lx\n", -+ (unsigned long) vector, ia64_get_itc()); -+ firsttime[vector] = 0; -+ } -+ if (firstpend[vector]) { -+ printf("**** First pended int on vector=%d,itc=%lx\n", -+ (unsigned long) vector,ia64_get_itc()); -+ firstpend[vector] = 0; -+ } -+#endif -+ //FIXME: TEMPORARY HACK!!!! -+ vcpu_pend_interrupt(dom0->vcpu[0],vector); -+ domain_wake(dom0->vcpu[0]); -+ } -+ else -+#endif - do_IRQ(local_vector_to_irq(vector), regs); - - /* diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/ivt.S --- a/xen/arch/ia64/patch/linux-2.6.7/ivt.S Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,528 +0,0 @@ ---- ../../linux-2.6.7/arch/ia64/kernel/ivt.S 2004-06-15 23:18:59.000000000 -0600 -+++ arch/ia64/ivt.S 2005-04-01 12:56:01.000000000 -0700 -@@ -1,3 +1,21 @@ -+ -+#ifdef XEN -+//#define CONFIG_DISABLE_VHPT // FIXME: change when VHPT is enabled?? -+// these are all hacked out for now as the entire IVT -+// will eventually be replaced... just want to use it -+// for startup code to handle TLB misses -+//#define ia64_leave_kernel 0 -+//#define ia64_ret_from_syscall 0 -+//#define ia64_handle_irq 0 -+//#define ia64_fault 0 -+#define ia64_illegal_op_fault 0 -+#define ia64_prepare_handle_unaligned 0 -+#define ia64_bad_break 0 -+#define ia64_trace_syscall 0 -+#define sys_call_table 0 -+#define sys_ni_syscall 0 -+#include <asm/vhpt.h> -+#endif - /* - * arch/ia64/kernel/ivt.S - * -@@ -76,6 +94,13 @@ - mov r19=n;; /* prepare to save predicates */ \ - br.sptk.many dispatch_to_fault_handler - -+#ifdef XEN -+#define REFLECT(n) \ -+ mov r31=pr; \ -+ mov r19=n;; /* prepare to save predicates */ \ -+ br.sptk.many dispatch_reflection -+#endif -+ - .section .text.ivt,"ax" - - .align 32768 // align on 32KB boundary -@@ -213,6 +238,13 @@ - // 0x0400 Entry 1 (size 64 bundles) ITLB (21) - ENTRY(itlb_miss) - DBG_FAULT(1) -+#ifdef XEN -+ VHPT_CCHAIN_LOOKUP(itlb_miss,i) -+#ifdef VHPT_GLOBAL -+ br.cond.sptk page_fault -+ ;; -+#endif -+#endif - /* - * The ITLB handler accesses the L3 PTE via the virtually mapped linear - * page table. 
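The ia64_handle_irq hunk above is the heart of the port's temporary interrupt model: every vector except Xen's own (0xef here) is pended to domain 0's first vcpu, which is then woken; only 0xef stays on Xen's native do_IRQ path. A minimal C sketch of that flow, with the surrounding types stubbed out (the patch itself only declares vcpu_pend_interrupt as taking void *):

    struct vcpu;
    struct domain { struct vcpu *vcpu[1]; };
    extern struct domain *dom0;

    extern void vcpu_pend_interrupt(struct vcpu *v, int vector);
    extern void domain_wake(struct vcpu *v);
    extern void xen_do_IRQ(unsigned int vector);  /* stands in for do_IRQ() */

    static void route_vector(unsigned int vector)
    {
        if (vector != 0xef) {                     /* guest-bound interrupt */
            vcpu_pend_interrupt(dom0->vcpu[0], vector);
            domain_wake(dom0->vcpu[0]);           /* make dom0 runnable */
        } else {
            xen_do_IRQ(vector);                   /* Xen's own vector */
        }
    }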
If a nested TLB miss occurs, we switch into physical -@@ -257,6 +289,13 @@ - // 0x0800 Entry 2 (size 64 bundles) DTLB (9,48) - ENTRY(dtlb_miss) - DBG_FAULT(2) -+#ifdef XEN -+ VHPT_CCHAIN_LOOKUP(dtlb_miss,d) -+#ifdef VHPT_GLOBAL -+ br.cond.sptk page_fault -+ ;; -+#endif -+#endif - /* - * The DTLB handler accesses the L3 PTE via the virtually mapped linear - * page table. If a nested TLB miss occurs, we switch into physical -@@ -301,6 +340,13 @@ - // 0x0c00 Entry 3 (size 64 bundles) Alt ITLB (19) - ENTRY(alt_itlb_miss) - DBG_FAULT(3) -+#ifdef XEN -+//#ifdef VHPT_GLOBAL -+// VHPT_CCHAIN_LOOKUP(alt_itlb_miss,i) -+// br.cond.sptk page_fault -+// ;; -+//#endif -+#endif - mov r16=cr.ifa // get address that caused the TLB miss - movl r17=PAGE_KERNEL - mov r21=cr.ipsr -@@ -339,6 +385,13 @@ - // 0x1000 Entry 4 (size 64 bundles) Alt DTLB (7,46) - ENTRY(alt_dtlb_miss) - DBG_FAULT(4) -+#ifdef XEN -+//#ifdef VHPT_GLOBAL -+// VHPT_CCHAIN_LOOKUP(alt_dtlb_miss,d) -+// br.cond.sptk page_fault -+// ;; -+//#endif -+#endif - mov r16=cr.ifa // get address that caused the TLB miss - movl r17=PAGE_KERNEL - mov r20=cr.isr -@@ -368,6 +421,17 @@ - cmp.ne p8,p0=r0,r23 - (p9) cmp.eq.or.andcm p6,p7=IA64_ISR_CODE_LFETCH,r22 // check isr.code field - (p8) br.cond.spnt page_fault -+#ifdef XEN -+ ;; -+ // FIXME: inadequate test, this is where we test for Xen address -+ // note that 0xf000 (cached) and 0xd000 (uncached) addresses -+ // should be OK. (Though no I/O is done in Xen, EFI needs uncached -+ // addresses and some domain EFI calls are passed through) -+ tbit.nz p0,p8=r16,60 -+(p8) br.cond.spnt page_fault -+//(p8) br.cond.spnt 0 -+ ;; -+#endif - - dep r21=-1,r21,IA64_PSR_ED_BIT,1 - or r19=r19,r17 // insert PTE control bits into r19 -@@ -448,6 +512,9 @@ - ///////////////////////////////////////////////////////////////////////////////////////// - // 0x1800 Entry 6 (size 64 bundles) Instruction Key Miss (24) - ENTRY(ikey_miss) -+#ifdef XEN -+ REFLECT(6) -+#endif - DBG_FAULT(6) - FAULT(6) - END(ikey_miss) -@@ -460,9 +527,16 @@ - srlz.i - ;; - SAVE_MIN_WITH_COVER -+#ifdef XEN -+ alloc r15=ar.pfs,0,0,4,0 -+ mov out0=cr.ifa -+ mov out1=cr.isr -+ mov out3=cr.itir -+#else - alloc r15=ar.pfs,0,0,3,0 - mov out0=cr.ifa - mov out1=cr.isr -+#endif - adds r3=8,r2 // set up second base pointer - ;; - ssm psr.ic | PSR_DEFAULT_BITS -@@ -483,6 +557,9 @@ - ///////////////////////////////////////////////////////////////////////////////////////// - // 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51) - ENTRY(dkey_miss) -+#ifdef XEN -+ REFLECT(7) -+#endif - DBG_FAULT(7) - FAULT(7) - END(dkey_miss) -@@ -491,6 +568,9 @@ - ///////////////////////////////////////////////////////////////////////////////////////// - // 0x2000 Entry 8 (size 64 bundles) Dirty-bit (54) - ENTRY(dirty_bit) -+#ifdef XEN -+ REFLECT(8) -+#endif - DBG_FAULT(8) - /* - * What we do here is to simply turn on the dirty bit in the PTE. 
We need to -@@ -553,6 +633,9 @@ - ///////////////////////////////////////////////////////////////////////////////////////// - // 0x2400 Entry 9 (size 64 bundles) Instruction Access-bit (27) - ENTRY(iaccess_bit) -+#ifdef XEN -+ REFLECT(9) -+#endif - DBG_FAULT(9) - // Like Entry 8, except for instruction access - mov r16=cr.ifa // get the address that caused the fault -@@ -618,6 +701,9 @@ - ///////////////////////////////////////////////////////////////////////////////////////// - // 0x2800 Entry 10 (size 64 bundles) Data Access-bit (15,55) - ENTRY(daccess_bit) -+#ifdef XEN -+ REFLECT(10) -+#endif - DBG_FAULT(10) - // Like Entry 8, except for data access - mov r16=cr.ifa // get the address that caused the fault -@@ -686,6 +772,16 @@ - * to prevent leaking bits from kernel to user level. - */ - DBG_FAULT(11) -+#ifdef XEN -+ mov r16=cr.isr -+ mov r17=cr.iim -+ mov r31=pr -+ ;; -+ cmp.eq p7,p0=r0,r17 // is this a psuedo-cover? -+ // FIXME: may also need to check slot==2? -+(p7) br.sptk.many dispatch_privop_fault -+ br.sptk.many dispatch_break_fault -+#endif - mov r16=IA64_KR(CURRENT) // r16 = current task; 12 cycle read lat. - mov r17=cr.iim - mov r18=__IA64_BREAK_SYSCALL -@@ -696,7 +792,9 @@ - mov r27=ar.rsc - mov r26=ar.pfs - mov r28=cr.iip -+#ifndef XEN - mov r31=pr // prepare to save predicates -+#endif - mov r20=r1 - ;; - adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 -@@ -792,6 +890,36 @@ - DBG_FAULT(13) - FAULT(13) - -+#ifdef XEN -+ // There is no particular reason for this code to be here, other than that -+ // there happens to be space here that would go unused otherwise. If this -+ // fault ever gets "unreserved", simply moved the following code to a more -+ // suitable spot... -+ -+ENTRY(dispatch_break_fault) -+ SAVE_MIN_WITH_COVER -+ ;; -+ alloc r14=ar.pfs,0,0,4,0 // now it's safe (must be first in insn group!) -+ mov out0=cr.ifa -+ adds out1=16,sp -+ mov out2=cr.isr // FIXME: pity to make this slow access twice -+ mov out3=cr.iim // FIXME: pity to make this slow access twice -+ -+ ssm psr.ic | PSR_DEFAULT_BITS -+ ;; -+ srlz.i // guarantee that interruption collection is on -+ ;; -+(p15) ssm psr.i // restore psr.i -+ adds r3=8,r2 // set up second base pointer -+ ;; -+ SAVE_REST -+ movl r14=ia64_leave_kernel -+ ;; -+ mov rp=r14 -+ br.sptk.many ia64_prepare_handle_break -+END(dispatch_break_fault) -+#endif -+ - .org ia64_ivt+0x3800 - ///////////////////////////////////////////////////////////////////////////////////////// - // 0x3800 Entry 14 (size 64 bundles) Reserved -@@ -842,9 +970,11 @@ - * - ar.fpsr: set to kernel settings - */ - GLOBAL_ENTRY(ia64_syscall_setup) -+#ifndef XEN - #if PT(B6) != 0 - # error This code assumes that b6 is the first field in pt_regs. - #endif -+#endif - st8 [r1]=r19 // save b6 - add r16=PT(CR_IPSR),r1 // initialize first base pointer - add r17=PT(R11),r1 // initialize second base pointer -@@ -974,6 +1104,37 @@ - DBG_FAULT(16) - FAULT(16) - -+#ifdef XEN -+ // There is no particular reason for this code to be here, other than that -+ // there happens to be space here that would go unused otherwise. If this -+ // fault ever gets "unreserved", simply moved the following code to a more -+ // suitable spot... -+ -+ENTRY(dispatch_privop_fault) -+ SAVE_MIN_WITH_COVER -+ ;; -+ alloc r14=ar.pfs,0,0,4,0 // now it's safe (must be first in insn group!) 
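Two conventions in this ivt.S patch are worth spelling out. First, REFLECT(n) marks faults Xen does not handle itself: the vector offset is loaded into r19 and control branches to dispatch_reflection (added near the end of the file), which packages the fault state for re-injection into the guest. Second, dispatch_break_fault and dispatch_privop_fault are parked in "reserved" IVT slots only because, as the comments note, the space would otherwise go unused. From the out0..out3 setup one can infer roughly what the C handlers must look like; these prototypes are hypothetical, inferred from the register assignments, and are not part of this diff:

    struct pt_regs;
    void ia64_handle_break(unsigned long ifa, struct pt_regs *regs,
                           unsigned long isr, unsigned long iim);
    void ia64_handle_privop(unsigned long ifa, struct pt_regs *regs,
                            unsigned long isr, unsigned long itir);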
-+ mov out0=cr.ifa -+ adds out1=16,sp -+ mov out2=cr.isr // FIXME: pity to make this slow access twice -+ mov out3=cr.itir -+ -+ ssm psr.ic | PSR_DEFAULT_BITS -+ ;; -+ srlz.i // guarantee that interruption collection is on -+ ;; -+(p15) ssm psr.i // restore psr.i -+ adds r3=8,r2 // set up second base pointer -+ ;; -+ SAVE_REST -+ movl r14=ia64_leave_kernel -+ ;; -+ mov rp=r14 -+ br.sptk.many ia64_prepare_handle_privop -+END(dispatch_privop_fault) -+#endif -+ -+ - .org ia64_ivt+0x4400 - ///////////////////////////////////////////////////////////////////////////////////////// - // 0x4400 Entry 17 (size 64 bundles) Reserved -@@ -1090,6 +1251,9 @@ - ///////////////////////////////////////////////////////////////////////////////////////// - // 0x5000 Entry 20 (size 16 bundles) Page Not Present (10,22,49) - ENTRY(page_not_present) -+#ifdef XEN -+ REFLECT(20) -+#endif - DBG_FAULT(20) - mov r16=cr.ifa - rsm psr.dt -@@ -1110,6 +1274,9 @@ - ///////////////////////////////////////////////////////////////////////////////////////// - // 0x5100 Entry 21 (size 16 bundles) Key Permission (13,25,52) - ENTRY(key_permission) -+#ifdef XEN -+ REFLECT(21) -+#endif - DBG_FAULT(21) - mov r16=cr.ifa - rsm psr.dt -@@ -1123,6 +1290,9 @@ - ///////////////////////////////////////////////////////////////////////////////////////// - // 0x5200 Entry 22 (size 16 bundles) Instruction Access Rights (26) - ENTRY(iaccess_rights) -+#ifdef XEN -+ REFLECT(22) -+#endif - DBG_FAULT(22) - mov r16=cr.ifa - rsm psr.dt -@@ -1136,6 +1306,9 @@ - ///////////////////////////////////////////////////////////////////////////////////////// - // 0x5300 Entry 23 (size 16 bundles) Data Access Rights (14,53) - ENTRY(daccess_rights) -+#ifdef XEN -+ REFLECT(23) -+#endif - DBG_FAULT(23) - mov r16=cr.ifa - rsm psr.dt -@@ -1153,8 +1326,13 @@ - mov r16=cr.isr - mov r31=pr - ;; -+#ifdef XEN -+ cmp4.ge p6,p0=0x20,r16 -+(p6) br.sptk.many dispatch_privop_fault -+#else - cmp4.eq p6,p0=0,r16 - (p6) br.sptk.many dispatch_illegal_op_fault -+#endif - ;; - mov r19=24 // fault number - br.sptk.many dispatch_to_fault_handler -@@ -1164,6 +1342,9 @@ - ///////////////////////////////////////////////////////////////////////////////////////// - // 0x5500 Entry 25 (size 16 bundles) Disabled FP-Register (35) - ENTRY(disabled_fp_reg) -+#ifdef XEN -+ REFLECT(25) -+#endif - DBG_FAULT(25) - rsm psr.dfh // ensure we can access fph - ;; -@@ -1177,6 +1358,9 @@ - ///////////////////////////////////////////////////////////////////////////////////////// - // 0x5600 Entry 26 (size 16 bundles) Nat Consumption (11,23,37,50) - ENTRY(nat_consumption) -+#ifdef XEN -+ REFLECT(26) -+#endif - DBG_FAULT(26) - FAULT(26) - END(nat_consumption) -@@ -1185,6 +1369,10 @@ - ///////////////////////////////////////////////////////////////////////////////////////// - // 0x5700 Entry 27 (size 16 bundles) Speculation (40) - ENTRY(speculation_vector) -+#ifdef XEN -+ // this probably need not reflect... 
-+ REFLECT(27) -+#endif - DBG_FAULT(27) - /* - * A [f]chk.[as] instruction needs to take the branch to the recovery code but -@@ -1228,6 +1416,9 @@ - ///////////////////////////////////////////////////////////////////////////////////////// - // 0x5900 Entry 29 (size 16 bundles) Debug (16,28,56) - ENTRY(debug_vector) -+#ifdef XEN -+ REFLECT(29) -+#endif - DBG_FAULT(29) - FAULT(29) - END(debug_vector) -@@ -1236,6 +1427,9 @@ - ///////////////////////////////////////////////////////////////////////////////////////// - // 0x5a00 Entry 30 (size 16 bundles) Unaligned Reference (57) - ENTRY(unaligned_access) -+#ifdef XEN -+ REFLECT(30) -+#endif - DBG_FAULT(30) - mov r16=cr.ipsr - mov r31=pr // prepare to save predicates -@@ -1247,6 +1441,9 @@ - ///////////////////////////////////////////////////////////////////////////////////////// - // 0x5b00 Entry 31 (size 16 bundles) Unsupported Data Reference (57) - ENTRY(unsupported_data_reference) -+#ifdef XEN -+ REFLECT(31) -+#endif - DBG_FAULT(31) - FAULT(31) - END(unsupported_data_reference) -@@ -1255,6 +1452,9 @@ - ///////////////////////////////////////////////////////////////////////////////////////// - // 0x5c00 Entry 32 (size 16 bundles) Floating-Point Fault (64) - ENTRY(floating_point_fault) -+#ifdef XEN -+ REFLECT(32) -+#endif - DBG_FAULT(32) - FAULT(32) - END(floating_point_fault) -@@ -1263,6 +1463,9 @@ - ///////////////////////////////////////////////////////////////////////////////////////// - // 0x5d00 Entry 33 (size 16 bundles) Floating Point Trap (66) - ENTRY(floating_point_trap) -+#ifdef XEN -+ REFLECT(33) -+#endif - DBG_FAULT(33) - FAULT(33) - END(floating_point_trap) -@@ -1271,6 +1474,9 @@ - ///////////////////////////////////////////////////////////////////////////////////////// - // 0x5e00 Entry 34 (size 16 bundles) Lower Privilege Transfer Trap (66) - ENTRY(lower_privilege_trap) -+#ifdef XEN -+ REFLECT(34) -+#endif - DBG_FAULT(34) - FAULT(34) - END(lower_privilege_trap) -@@ -1279,6 +1485,9 @@ - ///////////////////////////////////////////////////////////////////////////////////////// - // 0x5f00 Entry 35 (size 16 bundles) Taken Branch Trap (68) - ENTRY(taken_branch_trap) -+#ifdef XEN -+ REFLECT(35) -+#endif - DBG_FAULT(35) - FAULT(35) - END(taken_branch_trap) -@@ -1287,6 +1496,9 @@ - ///////////////////////////////////////////////////////////////////////////////////////// - // 0x6000 Entry 36 (size 16 bundles) Single Step Trap (69) - ENTRY(single_step_trap) -+#ifdef XEN -+ REFLECT(36) -+#endif - DBG_FAULT(36) - FAULT(36) - END(single_step_trap) -@@ -1343,6 +1555,9 @@ - ///////////////////////////////////////////////////////////////////////////////////////// - // 0x6900 Entry 45 (size 16 bundles) IA-32 Exeception (17,18,29,41,42,43,44,58,60,61,62,72,73,75,76,77) - ENTRY(ia32_exception) -+#ifdef XEN -+ REFLECT(45) -+#endif - DBG_FAULT(45) - FAULT(45) - END(ia32_exception) -@@ -1351,6 +1566,9 @@ - ///////////////////////////////////////////////////////////////////////////////////////// - // 0x6a00 Entry 46 (size 16 bundles) IA-32 Intercept (30,31,59,70,71) - ENTRY(ia32_intercept) -+#ifdef XEN -+ REFLECT(46) -+#endif - DBG_FAULT(46) - #ifdef CONFIG_IA32_SUPPORT - mov r31=pr -@@ -1381,6 +1599,9 @@ - ///////////////////////////////////////////////////////////////////////////////////////// - // 0x6b00 Entry 47 (size 16 bundles) IA-32 Interrupt (74) - ENTRY(ia32_interrupt) -+#ifdef XEN -+ REFLECT(47) -+#endif - DBG_FAULT(47) - #ifdef CONFIG_IA32_SUPPORT - mov r31=pr -@@ -1510,6 +1731,39 @@ - DBG_FAULT(67) - FAULT(67) - -+#ifdef XEN -+ .org 
ia64_ivt+0x8000 -+ENTRY(dispatch_reflection) -+ /* -+ * Input: -+ * psr.ic: off -+ * r19: intr type (offset into ivt, see ia64_int.h) -+ * r31: contains saved predicates (pr) -+ */ -+ SAVE_MIN_WITH_COVER_R19 -+ alloc r14=ar.pfs,0,0,5,0 -+ mov out4=r15 -+ mov out0=cr.ifa -+ adds out1=16,sp -+ mov out2=cr.isr -+ mov out3=cr.iim -+// mov out3=cr.itir -+ -+ ssm psr.ic | PSR_DEFAULT_BITS -+ ;; -+ srlz.i // guarantee that interruption collection is on -+ ;; -+(p15) ssm psr.i // restore psr.i -+ adds r3=8,r2 // set up second base pointer -+ ;; -+ SAVE_REST -+ movl r14=ia64_leave_kernel -+ ;; -+ mov rp=r14 -+ br.sptk.many ia64_prepare_handle_reflection -+END(dispatch_reflection) -+#endif -+ - #ifdef CONFIG_IA32_SUPPORT - - /* diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/kregs.h --- a/xen/arch/ia64/patch/linux-2.6.7/kregs.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,13 +0,0 @@ ---- /home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/include/asm-ia64/kregs.h 2004-06-15 23:19:01.000000000 -0600 -+++ /home/djm/src/xen/xeno-ia64.bk/xen/include/asm-ia64/kregs.h 2004-09-17 18:27:22.000000000 -0600 -@@ -30,6 +30,10 @@ - #define IA64_TR_PALCODE 1 /* itr1: maps PALcode as required by EFI */ - #define IA64_TR_PERCPU_DATA 1 /* dtr1: percpu data */ - #define IA64_TR_CURRENT_STACK 2 /* dtr2: maps kernel's memory- & register-stacks */ -+#ifdef XEN -+#define IA64_TR_SHARED_INFO 3 /* dtr3: page shared with domain */ -+#define IA64_TR_VHPT 4 /* dtr4: vhpt */ -+#endif - - /* Processor status register bits: */ - #define IA64_PSR_BE_BIT 1 diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/lds.S --- a/xen/arch/ia64/patch/linux-2.6.7/lds.S Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,17 +0,0 @@ ---- /home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/arch/ia64/kernel/vmlinux.lds.S 2004-06-15 23:19:52.000000000 -0600 -+++ /home/djm/src/xen/xeno-ia64.bk/xen/arch/ia64/xen.lds.S 2004-08-25 19:28:12.000000000 -0600 -@@ -11,12 +11,14 @@ - OUTPUT_FORMAT("elf64-ia64-little") - OUTPUT_ARCH(ia64) - ENTRY(phys_start) -+#ifndef XEN - jiffies = jiffies_64; - PHDRS { - code PT_LOAD; - percpu PT_LOAD; - data PT_LOAD; - } -+#endif - SECTIONS - { - /* Sections to be discarded */ diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/linuxtime.h --- a/xen/arch/ia64/patch/linux-2.6.7/linuxtime.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,34 +0,0 @@ ---- /home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/include/linux/time.h 2004-06-15 23:19:37.000000000 -0600 -+++ /home/djm/src/xen/xeno-ia64.bk/xen/include/xen/linuxtime.h 2004-11-15 17:42:04.000000000 -0700 -@@ -1,6 +1,11 @@ - #ifndef _LINUX_TIME_H - #define _LINUX_TIME_H - -+#ifdef XEN -+typedef s64 time_t; -+typedef s64 suseconds_t; -+#endif -+ - #include <asm/param.h> - #include <linux/types.h> - -@@ -25,7 +30,9 @@ - #ifdef __KERNEL__ - - #include <linux/spinlock.h> -+#ifndef XEN - #include <linux/seqlock.h> -+#endif - #include <linux/timex.h> - #include <asm/div64.h> - #ifndef div_long_long_rem -@@ -322,7 +329,9 @@ - - extern struct timespec xtime; - extern struct timespec wall_to_monotonic; -+#ifndef XEN - extern seqlock_t xtime_lock; -+#endif - - static inline unsigned long get_seconds(void) - { diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/mca_asm.h --- a/xen/arch/ia64/patch/linux-2.6.7/mca_asm.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,32 +0,0 @@ ---- ../../linux-2.6.7/include/asm-ia64/mca_asm.h 2004-06-15 
23:20:03.000000000 -0600 -+++ include/asm-ia64/mca_asm.h 2005-04-01 12:56:37.000000000 -0700 -@@ -26,8 +26,13 @@ - * direct mapped to physical addresses. - * 1. Lop off bits 61 thru 63 in the virtual address - */ -+#ifdef XEN -+#define INST_VA_TO_PA(addr) \ -+ dep addr = 0, addr, 60, 4 -+#else // XEN - #define INST_VA_TO_PA(addr) \ - dep addr = 0, addr, 61, 3 -+#endif // XEN - /* - * This macro converts a data virtual address to a physical address - * Right now for simulation purposes the virtual addresses are -@@ -42,9 +47,15 @@ - * direct mapped to physical addresses. - * 1. Put 0x7 in bits 61 thru 63. - */ -+#ifdef XEN -+#define DATA_PA_TO_VA(addr,temp) \ -+ mov temp = 0xf ;; \ -+ dep addr = temp, addr, 60, 4 -+#else // XEN - #define DATA_PA_TO_VA(addr,temp) \ - mov temp = 0x7 ;; \ - dep addr = temp, addr, 61, 3 -+#endif // XEN - - /* - * This macro jumps to the instruction at the given virtual address diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/minstate.h --- a/xen/arch/ia64/patch/linux-2.6.7/minstate.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,29 +0,0 @@ ---- ../../linux-2.6.7/arch/ia64/kernel/minstate.h 2004-06-15 23:19:52.000000000 -0600 -+++ arch/ia64/minstate.h 2005-04-01 12:56:01.000000000 -0700 -@@ -45,7 +45,7 @@ - (pKStk) tpa r1=sp; /* compute physical addr of sp */ \ - (pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1; /* compute base of memory stack */ \ - (pUStk) mov r23=ar.bspstore; /* save ar.bspstore */ \ --(pUStk) dep r22=-1,r22,61,3; /* compute kernel virtual addr of RBS */ \ -+(pUStk) dep r22=-1,r22,60,4; /* compute kernel virtual addr of RBS */ \ - ;; \ - (pKStk) addl r1=-IA64_PT_REGS_SIZE,r1; /* if in kernel mode, use sp (r12) */ \ - (pUStk) mov ar.bspstore=r22; /* switch to kernel RBS */ \ -@@ -65,7 +65,7 @@ - #endif - - #ifdef MINSTATE_PHYS --# define MINSTATE_GET_CURRENT(reg) mov reg=IA64_KR(CURRENT);; dep reg=0,reg,61,3 -+# define MINSTATE_GET_CURRENT(reg) mov reg=IA64_KR(CURRENT);; dep reg=0,reg,60,4 - # define MINSTATE_START_SAVE_MIN MINSTATE_START_SAVE_MIN_PHYS - # define MINSTATE_END_SAVE_MIN MINSTATE_END_SAVE_MIN_PHYS - #endif -@@ -172,7 +172,7 @@ - ;; \ - .mem.offset 0,0; st8.spill [r16]=r15,16; \ - .mem.offset 8,0; st8.spill [r17]=r14,16; \ -- dep r14=-1,r0,61,3; \ -+ dep r14=-1,r0,60,4; \ - ;; \ - .mem.offset 0,0; st8.spill [r16]=r2,16; \ - .mem.offset 8,0; st8.spill [r17]=r3,16; \ diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/mm_bootmem.c --- a/xen/arch/ia64/patch/linux-2.6.7/mm_bootmem.c Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,92 +0,0 @@ ---- /home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/mm/bootmem.c 2004-06-15 23:19:09.000000000 -0600 -+++ /home/djm/src/xen/xeno-ia64.bk/xen/arch/ia64/mm_bootmem.c 2004-12-17 13:47:03.000000000 -0700 -@@ -10,7 +10,9 @@ - */ - - #include <linux/mm.h> -+#ifndef XEN - #include <linux/kernel_stat.h> -+#endif - #include <linux/swap.h> - #include <linux/interrupt.h> - #include <linux/init.h> -@@ -55,6 +57,9 @@ - bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); - bdata->node_boot_start = (start << PAGE_SHIFT); - bdata->node_low_pfn = end; -+#ifdef XEN -+//printk("init_bootmem_core: mapstart=%lx,start=%lx,end=%lx,bdata->node_bootmem_map=%lx,bdata->node_boot_start=%lx,bdata->node_low_pfn=%lx\n",mapstart,start,end,bdata->node_bootmem_map,bdata->node_boot_start,bdata->node_low_pfn); -+#endif - - /* - * Initially all pages are reserved - setup_arch() has to -@@ -146,6 +151,9 @@ - unsigned long i, start = 0, 
incr, eidx; - void *ret; - -+#ifdef XEN -+//printf("__alloc_bootmem_core(%lx,%lx,%lx,%lx) called\n",bdata,size,align,goal); -+#endif - if(!size) { - printk("__alloc_bootmem_core(): zero-sized request\n"); - BUG(); -@@ -153,6 +161,9 @@ - BUG_ON(align & (align-1)); - - eidx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); -+#ifdef XEN -+//printf("__alloc_bootmem_core: eidx=%lx\n",eidx); -+#endif - offset = 0; - if (align && - (bdata->node_boot_start & (align - 1UL)) != 0) -@@ -182,6 +193,9 @@ - unsigned long j; - i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i); - i = ALIGN(i, incr); -+#ifdef XEN -+//if (i >= eidx) goto fail_block; -+#endif - if (test_bit(i, bdata->node_bootmem_map)) - continue; - for (j = i + 1; j < i + areasize; ++j) { -@@ -203,6 +217,9 @@ - return NULL; - - found: -+#ifdef XEN -+//printf("__alloc_bootmem_core: start=%lx\n",start); -+#endif - bdata->last_success = start << PAGE_SHIFT; - BUG_ON(start >= eidx); - -@@ -262,6 +279,9 @@ - page = virt_to_page(phys_to_virt(bdata->node_boot_start)); - idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); - map = bdata->node_bootmem_map; -+#ifdef XEN -+//printk("free_all_bootmem_core: bdata=%lx, bdata->node_boot_start=%lx, bdata->node_low_pfn=%lx, bdata->node_bootmem_map=%lx\n",bdata,bdata->node_boot_start,bdata->node_low_pfn,bdata->node_bootmem_map); -+#endif - for (i = 0; i < idx; ) { - unsigned long v = ~map[i / BITS_PER_LONG]; - if (v) { -@@ -285,6 +305,9 @@ - * Now free the allocator bitmap itself, it's not - * needed anymore: - */ -+#ifdef XEN -+//printk("About to free the allocator bitmap itself\n"); -+#endif - page = virt_to_page(bdata->node_bootmem_map); - count = 0; - for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { -@@ -327,6 +350,9 @@ - return(init_bootmem_core(&contig_page_data, start, 0, pages)); - } - -+#ifdef XEN -+#undef reserve_bootmem -+#endif - #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE - void __init reserve_bootmem (unsigned long addr, unsigned long size) - { diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/mm_contig.c --- a/xen/arch/ia64/patch/linux-2.6.7/mm_contig.c Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,216 +0,0 @@ ---- ../../linux-2.6.7/arch/ia64/mm/contig.c 2004-06-15 23:19:12.000000000 -0600 -+++ arch/ia64/mm_contig.c 2005-03-23 14:54:06.000000000 -0700 -@@ -15,11 +15,21 @@ - * memory. - */ - #include <linux/config.h> -+#ifdef XEN -+#include <xen/sched.h> -+#endif - #include <linux/bootmem.h> - #include <linux/efi.h> - #include <linux/mm.h> - #include <linux/swap.h> - -+#ifdef XEN -+#undef reserve_bootmem -+extern struct page *zero_page_memmap_ptr; -+struct page *mem_map; -+#define MAX_DMA_ADDRESS ~0UL // FIXME??? 
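The __alloc_bootmem_core hunks above only add tracing and one bounds check; the underlying algorithm is a first-fit scan over a bitmap with one bit per page frame (bit set = reserved). A simplified sketch of that scan, ignoring the alignment and `goal` handling of the real code:

    /* returns the index of the first run of npages clear bits,
     * or ~0UL if none exists; one bit per page, set = reserved */
    static unsigned long find_free_run(const unsigned long *map,
                                       unsigned long eidx,
                                       unsigned long npages)
    {
        const unsigned long bpl = 8 * sizeof(unsigned long);
        for (unsigned long i = 0; i + npages <= eidx; i++) {
            unsigned long j;
            for (j = 0; j < npages; j++)
                if (map[(i + j) / bpl] & (1UL << ((i + j) % bpl)))
                    break;              /* page i+j is reserved */
            if (j == npages)
                return i;               /* found a big-enough run */
            i += j;                     /* skip past the reserved bit */
        }
        return ~0UL;
    }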
-+#endif -+ - #include <asm/meminit.h> - #include <asm/pgalloc.h> - #include <asm/pgtable.h> -@@ -37,30 +47,7 @@ - void - show_mem (void) - { -- int i, total = 0, reserved = 0; -- int shared = 0, cached = 0; -- -- printk("Mem-info:\n"); -- show_free_areas(); -- -- printk("Free swap: %6dkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); -- i = max_mapnr; -- while (i-- > 0) { -- if (!pfn_valid(i)) -- continue; -- total++; -- if (PageReserved(mem_map+i)) -- reserved++; -- else if (PageSwapCache(mem_map+i)) -- cached++; -- else if (page_count(mem_map + i)) -- shared += page_count(mem_map + i) - 1; -- } -- printk("%d pages of RAM\n", total); -- printk("%d reserved pages\n", reserved); -- printk("%d pages shared\n", shared); -- printk("%d pages swap cached\n", cached); -- printk("%ld pages in page table cache\n", pgtable_cache_size); -+ printk("Dummy show_mem\n"); - } - - /* physical address where the bootmem map is located */ -@@ -80,6 +67,9 @@ - { - unsigned long *max_pfnp = arg, pfn; - -+#ifdef XEN -+//printf("find_max_pfn: start=%lx, end=%lx, *arg=%lx\n",start,end,*(unsigned long *)arg); -+#endif - pfn = (PAGE_ALIGN(end - 1) - PAGE_OFFSET) >> PAGE_SHIFT; - if (pfn > *max_pfnp) - *max_pfnp = pfn; -@@ -133,41 +123,6 @@ - return 0; - } - --/** -- * find_memory - setup memory map -- * -- * Walk the EFI memory map and find usable memory for the system, taking -- * into account reserved areas. -- */ --void --find_memory (void) --{ -- unsigned long bootmap_size; -- -- reserve_memory(); -- -- /* first find highest page frame number */ -- max_pfn = 0; -- efi_memmap_walk(find_max_pfn, &max_pfn); -- -- /* how many bytes to cover all the pages */ -- bootmap_size = bootmem_bootmap_pages(max_pfn) << PAGE_SHIFT; -- -- /* look for a location to hold the bootmap */ -- bootmap_start = ~0UL; -- efi_memmap_walk(find_bootmap_location, &bootmap_size); -- if (bootmap_start == ~0UL) -- panic("Cannot find %ld bytes for bootmap\n", bootmap_size); -- -- bootmap_size = init_bootmem(bootmap_start >> PAGE_SHIFT, max_pfn); -- -- /* Free all available memory, then mark bootmem-map as being in use. 
*/ -- efi_memmap_walk(filter_rsvd_memory, free_bootmem); -- reserve_bootmem(bootmap_start, bootmap_size); -- -- find_initrd(); --} -- - #ifdef CONFIG_SMP - /** - * per_cpu_init - setup per-cpu variables -@@ -227,73 +182,42 @@ - void - paging_init (void) - { -- unsigned long max_dma; -- unsigned long zones_size[MAX_NR_ZONES]; --#ifdef CONFIG_VIRTUAL_MEM_MAP -- unsigned long zholes_size[MAX_NR_ZONES]; -- unsigned long max_gap; --#endif -- -- /* initialize mem_map[] */ -+ struct pfn_info *pg; -+ /* Allocate and map the machine-to-phys table */ -+ if ((pg = alloc_domheap_pages(NULL, 10)) == NULL) -+ panic("Not enough memory to bootstrap Xen.\n"); -+ memset(page_to_virt(pg), 0x55, 16UL << 20); - -- memset(zones_size, 0, sizeof(zones_size)); -+ /* Other mapping setup */ - -- num_physpages = 0; -- efi_memmap_walk(count_pages, &num_physpages); - -- max_dma = virt_to_phys((void *) MAX_DMA_ADDRESS) >> PAGE_SHIFT; -- --#ifdef CONFIG_VIRTUAL_MEM_MAP -- memset(zholes_size, 0, sizeof(zholes_size)); -- -- num_dma_physpages = 0; -- efi_memmap_walk(count_dma_pages, &num_dma_physpages); -- -- if (max_low_pfn < max_dma) { -- zones_size[ZONE_DMA] = max_low_pfn; -- zholes_size[ZONE_DMA] = max_low_pfn - num_dma_physpages; -- } else { -- zones_size[ZONE_DMA] = max_dma; -- zholes_size[ZONE_DMA] = max_dma - num_dma_physpages; -- if (num_physpages > num_dma_physpages) { -- zones_size[ZONE_NORMAL] = max_low_pfn - max_dma; -- zholes_size[ZONE_NORMAL] = -- ((max_low_pfn - max_dma) - -- (num_physpages - num_dma_physpages)); -- } -- } -- -- max_gap = 0; -- efi_memmap_walk(find_largest_hole, (u64 *)&max_gap); -- if (max_gap < LARGE_GAP) { -- vmem_map = (struct page *) 0; -- free_area_init_node(0, &contig_page_data, NULL, zones_size, 0, -- zholes_size); -- mem_map = contig_page_data.node_mem_map; -- } else { -- unsigned long map_size; -- -- /* allocate virtual_mem_map */ -- -- map_size = PAGE_ALIGN(max_low_pfn * sizeof(struct page)); -- vmalloc_end -= map_size; -- vmem_map = (struct page *) vmalloc_end; -- efi_memmap_walk(create_mem_map_page_table, 0); -- -- free_area_init_node(0, &contig_page_data, vmem_map, zones_size, -- 0, zholes_size); -- -- mem_map = contig_page_data.node_mem_map; -- printk("Virtual mem_map starts at 0x%p\n", mem_map); -- } --#else /* !CONFIG_VIRTUAL_MEM_MAP */ -- if (max_low_pfn < max_dma) -- zones_size[ZONE_DMA] = max_low_pfn; -- else { -- zones_size[ZONE_DMA] = max_dma; -- zones_size[ZONE_NORMAL] = max_low_pfn - max_dma; -- } -- free_area_init(zones_size); --#endif /* !CONFIG_VIRTUAL_MEM_MAP */ - zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); - } -+ -+struct pfn_info *frame_table; -+unsigned long frame_table_size; -+unsigned long max_page; -+ -+/* FIXME: postpone support to machines with big holes between physical memorys. -+ * Current hack allows only efi memdesc upto 4G place. 
(See efi.c) -+ */ -+#ifndef CONFIG_VIRTUAL_MEM_MAP -+#define FT_ALIGN_SIZE (16UL << 20) -+void __init init_frametable(void) -+{ -+ unsigned long i, p; -+ frame_table_size = max_page * sizeof(struct pfn_info); -+ frame_table_size = (frame_table_size + PAGE_SIZE - 1) & PAGE_MASK; -+ -+ /* Request continuous trunk from boot allocator, since HV -+ * address is identity mapped */ -+ p = alloc_boot_pages(frame_table_size>>PAGE_SHIFT, FT_ALIGN_SIZE>>PAGE_SHIFT) << PAGE_SHIFT; -+ if (p == 0) -+ panic("Not enough memory for frame table.\n"); -+ -+ frame_table = __va(p); -+ memset(frame_table, 0, frame_table_size); -+ printk("size of frame_table: %lukB\n", -+ frame_table_size >> 10); -+} -+#endif diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/mmzone.h --- a/xen/arch/ia64/patch/linux-2.6.7/mmzone.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,14 +0,0 @@ ---- /home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/include/linux/mmzone.h 2004-06-15 23:19:36.000000000 -0600 -+++ /home/djm/src/xen/xeno-ia64.bk/xen/include/asm-ia64/linux/mmzone.h 2004-08-25 19:28:13.000000000 -0600 -@@ -185,7 +185,11 @@ - char *name; - unsigned long spanned_pages; /* total size, including holes */ - unsigned long present_pages; /* amount of memory (excluding holes) */ -+#ifdef XEN -+}; -+#else - } ____cacheline_maxaligned_in_smp; -+#endif - - - /* diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/page.h --- a/xen/arch/ia64/patch/linux-2.6.7/page.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,84 +0,0 @@ ---- ../../linux-2.6.7/include/asm-ia64/page.h 2004-06-15 23:18:58.000000000 -0600 -+++ include/asm-ia64/page.h 2005-04-01 12:56:37.000000000 -0700 -@@ -12,6 +12,9 @@ - #include <asm/intrinsics.h> - #include <asm/types.h> - -+#ifndef __ASSEMBLY__ -+#include <asm/flushtlb.h> -+#endif - /* - * PAGE_SHIFT determines the actual kernel page size. - */ -@@ -84,14 +87,22 @@ - #endif - - #ifndef CONFIG_DISCONTIGMEM -+#ifdef XEN -+#define pfn_valid(pfn) (0) -+#else - #define pfn_valid(pfn) (((pfn) < max_mapnr) && ia64_pfn_valid(pfn)) --#define page_to_pfn(page) ((unsigned long) (page - mem_map)) --#define pfn_to_page(pfn) (mem_map + (pfn)) -+#endif - #endif /* CONFIG_DISCONTIGMEM */ - --#define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) -+#define page_to_pfn(_page) ((unsigned long)((_page) - frame_table)) -+#define page_to_virt(_page) phys_to_virt(page_to_phys(_page)) -+ -+#define page_to_phys(_page) (page_to_pfn(_page) << PAGE_SHIFT) - #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) - -+#define pfn_to_page(_pfn) (frame_table + (_pfn)) -+#define phys_to_page(kaddr) pfn_to_page(((kaddr) >> PAGE_SHIFT)) -+ - typedef union ia64_va { - struct { - unsigned long off : 61; /* intra-region offset */ -@@ -107,8 +118,25 @@ - * expressed in this way to ensure they result in a single "dep" - * instruction. - */ -+#ifdef XEN -+typedef union xen_va { -+ struct { -+ unsigned long off : 60; -+ unsigned long reg : 4; -+ } f; -+ unsigned long l; -+ void *p; -+} xen_va; -+ -+// xen/drivers/console.c uses __va in a declaration (should be fixed!) 
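The xen_va union introduced just above (used by the __pa/__va redefinitions that follow) is the C-side twin of the 60/4 assembly masking discussed earlier: __pa() zeroes the 4-bit region field, __va() sets it to all ones, i.e. region 0xf. A small usage sketch; the round trip should be the identity for any region-0xf kernel pointer:

    typedef union xen_va {
        struct {
            unsigned long off : 60;   /* intra-region offset */
            unsigned long reg : 4;    /* region number */
        } f;
        unsigned long l;
        void *p;
    } xen_va;

    static unsigned long xen_pa(void *x)
    {
        xen_va v; v.p = x; v.f.reg = 0; return v.l;
    }

    static void *xen_virt(unsigned long x)
    {
        xen_va v; v.l = x; v.f.reg = -1; return v.p;  /* reg <- 0xf */
    }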
-+#define __pa(x) ({xen_va _v; _v.l = (long) (x); _v.f.reg = 0; _v.l;}) -+#define __va(x) ({xen_va _v; _v.l = (long) (x); _v.f.reg = -1; _v.p;}) -+//# define __pa(x) ((unsigned long)(((unsigned long)x) - PAGE_OFFSET)) -+//# define __va(x) ((void *)((char *)(x) + PAGE_OFFSET)) -+#else - #define __pa(x) ({ia64_va _v; _v.l = (long) (x); _v.f.reg = 0; _v.l;}) - #define __va(x) ({ia64_va _v; _v.l = (long) (x); _v.f.reg = -1; _v.p;}) -+#endif - - #define REGION_NUMBER(x) ({ia64_va _v; _v.l = (long) (x); _v.f.reg;}) - #define REGION_OFFSET(x) ({ia64_va _v; _v.l = (long) (x); _v.f.off;}) -@@ -180,11 +208,19 @@ - # define __pgprot(x) (x) - #endif /* !STRICT_MM_TYPECHECKS */ - -+#ifdef XEN -+#define PAGE_OFFSET 0xf000000000000000 -+#else - #define PAGE_OFFSET 0xe000000000000000 -+#endif - - #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \ - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | \ - (((current->thread.flags & IA64_THREAD_XSTACK) != 0) \ - ? VM_EXEC : 0)) - -+#ifdef XEN -+#define __flush_tlb() do {} while(0); -+#endif -+ - #endif /* _ASM_IA64_PAGE_H */ diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/page_alloc.c --- a/xen/arch/ia64/patch/linux-2.6.7/page_alloc.c Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,305 +0,0 @@ ---- /home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/mm/page_alloc.c 2004-06-15 23:18:57.000000000 -0600 -+++ /home/djm/src/xen/xeno-ia64.bk/xen/arch/ia64/page_alloc.c 2004-12-17 13:47:03.000000000 -0700 -@@ -19,20 +19,28 @@ - #include <linux/mm.h> - #include <linux/swap.h> - #include <linux/interrupt.h> -+#ifndef XEN - #include <linux/pagemap.h> -+#endif - #include <linux/bootmem.h> - #include <linux/compiler.h> - #include <linux/module.h> -+#ifndef XEN - #include <linux/suspend.h> - #include <linux/pagevec.h> - #include <linux/blkdev.h> -+#endif - #include <linux/slab.h> -+#ifndef XEN - #include <linux/notifier.h> -+#endif - #include <linux/topology.h> -+#ifndef XEN - #include <linux/sysctl.h> - #include <linux/cpu.h> - - #include <asm/tlbflush.h> -+#endif - - DECLARE_BITMAP(node_online_map, MAX_NUMNODES); - struct pglist_data *pgdat_list; -@@ -71,6 +79,9 @@ - - static void bad_page(const char *function, struct page *page) - { -+#ifdef XEN -+printk("bad_page: called but disabled\n"); -+#else - printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", - function, current->comm, page); - printk(KERN_EMERG "flags:0x%08lx mapping:%p mapcount:%d count:%d\n", -@@ -91,6 +102,7 @@ - set_page_count(page, 0); - page->mapping = NULL; - page->mapcount = 0; -+#endif - } - - #ifndef CONFIG_HUGETLB_PAGE -@@ -218,6 +230,7 @@ - - static inline void free_pages_check(const char *function, struct page *page) - { -+#ifndef XEN - if ( page_mapped(page) || - page->mapping != NULL || - page_count(page) != 0 || -@@ -233,6 +246,7 @@ - 1 << PG_swapcache | - 1 << PG_writeback ))) - bad_page(function, page); -+#endif - if (PageDirty(page)) - ClearPageDirty(page); - } -@@ -276,6 +290,9 @@ - - void __free_pages_ok(struct page *page, unsigned int order) - { -+#ifdef XEN -+printk("__free_pages_ok: called but disabled\n"); -+#else - LIST_HEAD(list); - int i; - -@@ -285,6 +302,7 @@ - list_add(&page->lru, &list); - kernel_map_pages(page, 1<<order, 0); - free_pages_bulk(page_zone(page), 1, &list, order); -+#endif - } - - #define MARK_USED(index, order, area) \ -@@ -330,6 +348,7 @@ - */ - static void prep_new_page(struct page *page, int order) - { -+#ifndef XEN - if (page->mapping || page_mapped(page) || - (page->flags & ( - 1 << PG_private | -@@ -343,11 
+362,14 @@ - 1 << PG_swapcache | - 1 << PG_writeback ))) - bad_page(__FUNCTION__, page); -+#endif - - page->flags &= ~(1 << PG_uptodate | 1 << PG_error | - 1 << PG_referenced | 1 << PG_arch_1 | - 1 << PG_checked | 1 << PG_mappedtodisk); -+#ifndef XEN - page->private = 0; -+#endif - set_page_refs(page, order); - } - -@@ -590,13 +612,17 @@ - unsigned long min; - struct zone **zones; - struct page *page; -+#ifndef XEN - struct reclaim_state reclaim_state; -+#endif - struct task_struct *p = current; - int i; - int alloc_type; - int do_retry; - -+#ifndef XEN - might_sleep_if(wait); -+#endif - - zones = zonelist->zones; /* the list of zones suitable for gfp_mask */ - if (zones[0] == NULL) /* no zones in the zonelist */ -@@ -610,12 +636,14 @@ - - min = (1<<order) + z->protection[alloc_type]; - -+#ifndef XEN - /* - * We let real-time tasks dip their real-time paws a little - * deeper into reserves. - */ - if (rt_task(p)) - min -= z->pages_low >> 1; -+#endif - - if (z->free_pages >= min || - (!wait && z->free_pages >= z->pages_high)) { -@@ -627,9 +655,11 @@ - } - } - -+#ifndef XEN - /* we're somewhat low on memory, failed to find what we needed */ - for (i = 0; zones[i] != NULL; i++) - wakeup_kswapd(zones[i]); -+#endif - - /* Go through the zonelist again, taking __GFP_HIGH into account */ - for (i = 0; zones[i] != NULL; i++) { -@@ -639,8 +669,10 @@ - - if (gfp_mask & __GFP_HIGH) - min -= z->pages_low >> 2; -+#ifndef XEN - if (rt_task(p)) - min -= z->pages_low >> 1; -+#endif - - if (z->free_pages >= min || - (!wait && z->free_pages >= z->pages_high)) { -@@ -654,6 +686,7 @@ - - /* here we're in the low on memory slow path */ - -+#ifndef XEN - rebalance: - if ((p->flags & (PF_MEMALLOC | PF_MEMDIE)) && !in_interrupt()) { - /* go through the zonelist yet again, ignoring mins */ -@@ -681,6 +714,7 @@ - - p->reclaim_state = NULL; - p->flags &= ~PF_MEMALLOC; -+#endif - - /* go through the zonelist yet one more time */ - for (i = 0; zones[i] != NULL; i++) { -@@ -698,6 +732,11 @@ - } - } - -+#ifdef XEN -+printk(KERN_WARNING "%s: page allocation failure." -+ " order:%d, mode:0x%x\n", -+ "(xen tasks have no comm)", order, gfp_mask); -+#else - /* - * Don't let big-order allocations loop unless the caller explicitly - * requests that. Wait for some write requests to complete then retry. 
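Under XEN the __alloc_pages slow path above loses its teeth: there is no kswapd to wake, no PF_MEMALLOC/reclaim rebalancing, and no real-time watermark bonus, so allocation reduces to scanning the zonelist against static watermarks and failing outright when nothing qualifies. A condensed sketch of what remains, with the zone type stubbed down to the fields used here:

    struct zone { unsigned long free_pages, pages_high, protection[4]; };

    /* returns the first zone that can satisfy a 2^order allocation,
     * or NULL; `wait` and `alloc_type` as in the original code */
    static struct zone *pick_zone(struct zone **zones, unsigned int order,
                                  int alloc_type, int wait)
    {
        for (int i = 0; zones[i] != NULL; i++) {
            struct zone *z = zones[i];
            unsigned long min = (1UL << order) + z->protection[alloc_type];
            if (z->free_pages >= min ||
                (!wait && z->free_pages >= z->pages_high))
                return z;            /* enough headroom in this zone */
        }
        return NULL;                 /* XEN: no reclaim to fall back on */
    }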
-@@ -724,6 +763,7 @@ - p->comm, order, gfp_mask); - dump_stack(); - } -+#endif - return NULL; - got_pg: - kernel_map_pages(page, 1 << order, 1); -@@ -808,6 +848,7 @@ - - EXPORT_SYMBOL(get_zeroed_page); - -+#ifndef XEN - void __pagevec_free(struct pagevec *pvec) - { - int i = pagevec_count(pvec); -@@ -815,10 +856,15 @@ - while (--i >= 0) - free_hot_cold_page(pvec->pages[i], pvec->cold); - } -+#endif - - fastcall void __free_pages(struct page *page, unsigned int order) - { -+#ifdef XEN -+ if (!PageReserved(page)) { -+#else - if (!PageReserved(page) && put_page_testzero(page)) { -+#endif - if (order == 0) - free_hot_page(page); - else -@@ -914,6 +960,13 @@ - return nr_free_zone_pages(GFP_HIGHUSER & GFP_ZONEMASK); - } - -+#ifdef XEN -+unsigned int nr_free_highpages (void) -+{ -+printf("nr_free_highpages: called but not implemented\n"); -+} -+#endif -+ - #ifdef CONFIG_HIGHMEM - unsigned int nr_free_highpages (void) - { -@@ -1022,6 +1075,7 @@ - - void si_meminfo(struct sysinfo *val) - { -+#ifndef XEN - val->totalram = totalram_pages; - val->sharedram = 0; - val->freeram = nr_free_pages(); -@@ -1034,6 +1088,7 @@ - val->freehigh = 0; - #endif - val->mem_unit = PAGE_SIZE; -+#endif - } - - EXPORT_SYMBOL(si_meminfo); -@@ -1165,7 +1220,9 @@ - printk("= %lukB\n", K(total)); - } - -+#ifndef XEN - show_swap_cache_info(); -+#endif - } - - /* -@@ -1530,6 +1587,9 @@ - zone->wait_table_size = wait_table_size(size); - zone->wait_table_bits = - wait_table_bits(zone->wait_table_size); -+#ifdef XEN -+//printf("free_area_init_core-1: calling alloc_bootmem_node(%lx,%lx)\n",pgdat,zone->wait_table_size * sizeof(wait_queue_head_t)); -+#endif - zone->wait_table = (wait_queue_head_t *) - alloc_bootmem_node(pgdat, zone->wait_table_size - * sizeof(wait_queue_head_t)); -@@ -1584,6 +1644,9 @@ - */ - bitmap_size = (size-1) >> (i+4); - bitmap_size = LONG_ALIGN(bitmap_size+1); -+#ifdef XEN -+//printf("free_area_init_core-2: calling alloc_bootmem_node(%lx,%lx)\n",pgdat, bitmap_size); -+#endif - zone->free_area[i].map = - (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size); - } -@@ -1601,6 +1664,9 @@ - calculate_zone_totalpages(pgdat, zones_size, zholes_size); - if (!node_mem_map) { - size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); -+#ifdef XEN -+//printf("free_area_init_node: calling alloc_bootmem_node(%lx,%lx)\n",pgdat,size); -+#endif - node_mem_map = alloc_bootmem_node(pgdat, size); - } - pgdat->node_mem_map = node_mem_map; -@@ -1784,6 +1850,7 @@ - - #endif /* CONFIG_PROC_FS */ - -+#ifndef XEN - #ifdef CONFIG_HOTPLUG_CPU - static int page_alloc_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -@@ -2011,3 +2078,4 @@ - setup_per_zone_protection(); - return 0; - } -+#endif diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/pal.S --- a/xen/arch/ia64/patch/linux-2.6.7/pal.S Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,26 +0,0 @@ ---- ../../linux-2.6.7/arch/ia64/kernel/pal.S 2004-06-15 23:20:25.000000000 -0600 -+++ arch/ia64/pal.S 2005-04-01 12:56:01.000000000 -0700 -@@ -163,7 +163,11 @@ - adds r8 = 1f-1b,r8 // calculate return address for call - ;; - mov loc4=ar.rsc // save RSE configuration -+#ifdef XEN -+ dep.z loc2=loc2,0,60 // convert pal entry point to physical -+#else // XEN - dep.z loc2=loc2,0,61 // convert pal entry point to physical -+#endif // XEN - tpa r8=r8 // convert rp to physical - ;; - mov b7 = loc2 // install target to branch reg -@@ -218,7 +222,11 @@ - mov loc3 = psr // save psr - ;; - mov loc4=ar.rsc // save RSE 
configuration -+#ifdef XEN -+ dep.z loc2=loc2,0,60 // convert pal entry point to physical -+#else // XEN - dep.z loc2=loc2,0,61 // convert pal entry point to physical -+#endif // XEN - ;; - mov ar.rsc=0 // put RSE in enforced lazy, LE mode - movl r16=PAL_PSR_BITS_TO_CLEAR diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/pgalloc.h --- a/xen/arch/ia64/patch/linux-2.6.7/pgalloc.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,65 +0,0 @@ ---- ../../linux-2.6.7/include/asm-ia64/pgalloc.h 2004-06-15 23:18:54.000000000 -0600 -+++ include/asm-ia64/pgalloc.h 2005-03-23 14:54:11.000000000 -0700 -@@ -34,6 +34,10 @@ - #define pmd_quicklist (local_cpu_data->pmd_quick) - #define pgtable_cache_size (local_cpu_data->pgtable_cache_sz) - -+/* FIXME: Later 3 level page table should be over, to create -+ * new interface upon xen memory allocator. To simplify first -+ * effort moving to xen allocator, use xenheap pages temporarily. -+ */ - static inline pgd_t* - pgd_alloc_one_fast (struct mm_struct *mm) - { -@@ -55,7 +59,7 @@ - pgd_t *pgd = pgd_alloc_one_fast(mm); - - if (unlikely(pgd == NULL)) { -- pgd = (pgd_t *)__get_free_page(GFP_KERNEL); -+ pgd = (pgd_t *)alloc_xenheap_page(); - if (likely(pgd != NULL)) - clear_page(pgd); - } -@@ -93,7 +97,7 @@ - static inline pmd_t* - pmd_alloc_one (struct mm_struct *mm, unsigned long addr) - { -- pmd_t *pmd = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); -+ pmd_t *pmd = (pmd_t *)alloc_xenheap_page(); - - if (likely(pmd != NULL)) - clear_page(pmd); -@@ -125,7 +129,7 @@ - static inline struct page * - pte_alloc_one (struct mm_struct *mm, unsigned long addr) - { -- struct page *pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0); -+ struct page *pte = alloc_xenheap_page(); - - if (likely(pte != NULL)) - clear_page(page_address(pte)); -@@ -135,7 +139,7 @@ - static inline pte_t * - pte_alloc_one_kernel (struct mm_struct *mm, unsigned long addr) - { -- pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); -+ pte_t *pte = (pte_t *)alloc_xenheap_page(); - - if (likely(pte != NULL)) - clear_page(pte); -@@ -145,13 +149,13 @@ - static inline void - pte_free (struct page *pte) - { -- __free_page(pte); -+ free_xenheap_page(pte); - } - - static inline void - pte_free_kernel (pte_t *pte) - { -- free_page((unsigned long) pte); -+ free_xenheap_page((unsigned long) pte); - } - - #define __pte_free_tlb(tlb, pte) tlb_remove_page((tlb), (pte)) diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/processor.h --- a/xen/arch/ia64/patch/linux-2.6.7/processor.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,19 +0,0 @@ ---- /home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/include/asm-ia64/processor.h 2005-01-23 13:23:36.000000000 -0700 -+++ /home/djm/src/xen/xeno-ia64.bk/xen/include/asm-ia64/processor.h 2004-08-25 19:28:13.000000000 -0600 -@@ -406,12 +406,16 @@ - */ - - /* Return TRUE if task T owns the fph partition of the CPU we're running on. */ -+#ifdef XEN -+#define ia64_is_local_fpu_owner(t) 0 -+#else - #define ia64_is_local_fpu_owner(t) \ - ({ \ - struct task_struct *__ia64_islfo_task = (t); \ - (__ia64_islfo_task->thread.last_fph_cpu == smp_processor_id() \ - && __ia64_islfo_task == (struct task_struct *) ia64_get_kr(IA64_KR_FPU_OWNER)); \ - }) -+#endif - - /* Mark task T as owning the fph partition of the CPU we're running on. 
*/ - #define ia64_set_local_fpu_owner(t) do { \ diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/sal.h --- a/xen/arch/ia64/patch/linux-2.6.7/sal.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,26 +0,0 @@ ---- /home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/include/asm-ia64/sal.h 2004-06-15 23:20:04.000000000 -0600 -+++ /home/djm/src/xen/xeno-ia64.bk/xen/include/asm-ia64/sal.h 2004-10-27 13:55:23.000000000 -0600 -@@ -646,7 +646,23 @@ - { - struct ia64_sal_retval isrv; - -+//#ifdef XEN -+#if 0 -+ unsigned long *x = (unsigned long *)ia64_sal; -+ unsigned long *inst = (unsigned long *)*x; -+ unsigned long __ia64_sc_flags; -+ struct ia64_fpreg __ia64_sc_fr[6]; -+printf("ia64_sal_freq_base: about to save_scratch_fpregs\n"); -+ ia64_save_scratch_fpregs(__ia64_sc_fr); -+ spin_lock_irqsave(&sal_lock, __ia64_sc_flags); -+printf("ia64_sal_freq_base: about to call, ia64_sal=%p, ia64_sal[0]=%p, ia64_sal[1]=%p\n",x,x[0],x[1]); -+printf("first inst=%p,%p\n",inst[0],inst[1]); -+ isrv = (*ia64_sal)(SAL_FREQ_BASE, which, 0, 0, 0, 0, 0, 0); -+ spin_unlock_irqrestore(&sal_lock, __ia64_sc_flags); -+ ia64_load_scratch_fpregs(__ia64_sc_fr); -+#else - SAL_CALL(isrv, SAL_FREQ_BASE, which, 0, 0, 0, 0, 0, 0); -+#endif - *ticks_per_second = isrv.v0; - *drift_info = isrv.v1; - return isrv.status; diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/setup.c --- a/xen/arch/ia64/patch/linux-2.6.7/setup.c Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,203 +0,0 @@ ---- ../../linux-2.6.7/arch/ia64/kernel/setup.c 2004-06-15 23:18:58.000000000 -0600 -+++ arch/ia64/setup.c 2005-04-04 22:31:09.000000000 -0600 -@@ -21,6 +21,9 @@ - #include <linux/init.h> - - #include <linux/acpi.h> -+#ifdef XEN -+#include <xen/sched.h> -+#endif - #include <linux/bootmem.h> - #include <linux/console.h> - #include <linux/delay.h> -@@ -30,13 +33,17 @@ - #include <linux/seq_file.h> - #include <linux/string.h> - #include <linux/threads.h> -+#ifndef XEN - #include <linux/tty.h> - #include <linux/serial.h> - #include <linux/serial_core.h> -+#endif - #include <linux/efi.h> - #include <linux/initrd.h> - -+#ifndef XEN - #include <asm/ia32.h> -+#endif - #include <asm/machvec.h> - #include <asm/mca.h> - #include <asm/meminit.h> -@@ -50,6 +57,11 @@ - #include <asm/smp.h> - #include <asm/system.h> - #include <asm/unistd.h> -+#ifdef XEN -+#include <linux/mm.h> -+#include <asm/mmu_context.h> -+extern unsigned long loops_per_jiffy; // from linux/init/main.c -+#endif - - #if defined(CONFIG_SMP) && (IA64_CPU_SIZE > PAGE_SIZE) - # error "struct cpuinfo_ia64 too big!" -@@ -65,7 +77,9 @@ - DEFINE_PER_CPU(unsigned long, ia64_phys_stacked_size_p8); - unsigned long ia64_cycles_per_usec; - struct ia64_boot_param *ia64_boot_param; -+#ifndef XEN - struct screen_info screen_info; -+#endif - - unsigned long ia64_max_cacheline_size; - unsigned long ia64_iobase; /* virtual address for I/O accesses */ -@@ -98,7 +112,6 @@ - struct rsvd_region rsvd_region[IA64_MAX_RSVD_REGIONS + 1]; - int num_rsvd_regions; - -- - /* - * Filter incoming memory segments based on the primitive map created from the boot - * parameters. Segments contained in the map are removed from the memory ranges. 
A -@@ -128,9 +141,12 @@ - for (i = 0; i < num_rsvd_regions; ++i) { - range_start = max(start, prev_start); - range_end = min(end, rsvd_region[i].start); -- -- if (range_start < range_end) -- call_pernode_memory(__pa(range_start), range_end - range_start, func); -+ /* init_boot_pages requires "ps, pe" */ -+ if (range_start < range_end) { -+ printk("Init boot pages: 0x%lx -> 0x%lx.\n", -+ __pa(range_start), __pa(range_end)); -+ (*func)(__pa(range_start), __pa(range_end), 0); -+ } - - /* nothing more available in this segment */ - if (range_end == end) return 0; -@@ -187,17 +203,17 @@ - + strlen(__va(ia64_boot_param->command_line)) + 1); - n++; - -+ /* Reserve xen image/bitmap/xen-heap */ - rsvd_region[n].start = (unsigned long) ia64_imva((void *)KERNEL_START); -- rsvd_region[n].end = (unsigned long) ia64_imva(_end); -+ rsvd_region[n].end = rsvd_region[n].start + xenheap_size; - n++; - --#ifdef CONFIG_BLK_DEV_INITRD -+ /* This is actually dom0 image */ - if (ia64_boot_param->initrd_start) { - rsvd_region[n].start = (unsigned long)__va(ia64_boot_param->initrd_start); - rsvd_region[n].end = rsvd_region[n].start + ia64_boot_param->initrd_size; - n++; - } --#endif - - /* end of memory marker */ - rsvd_region[n].start = ~0UL; -@@ -207,6 +223,16 @@ - num_rsvd_regions = n; - - sort_regions(rsvd_region, num_rsvd_regions); -+ -+ { -+ int i; -+ printk("Reserved regions: \n"); -+ for (i = 0; i < num_rsvd_regions; i++) -+ printk(" [%d] -> [0x%lx, 0x%lx]\n", -+ i, -+ rsvd_region[i].start, -+ rsvd_region[i].end); -+ } - } - - /** -@@ -280,23 +306,26 @@ - } - #endif - -+#ifdef XEN - void __init --setup_arch (char **cmdline_p) -+early_setup_arch(char **cmdline_p) - { - unw_init(); -- -- ia64_patch_vtop((u64) __start___vtop_patchlist, (u64) __end___vtop_patchlist); -- -+ - *cmdline_p = __va(ia64_boot_param->command_line); - strlcpy(saved_command_line, *cmdline_p, sizeof(saved_command_line)); -- -+ cmdline_parse(*cmdline_p); -+ - efi_init(); -- io_port_init(); -- -+ - #ifdef CONFIG_IA64_GENERIC - machvec_init(acpi_get_sysname()); - #endif - -+#ifdef XEN -+#undef CONFIG_ACPI_BOOT -+#endif -+ - #ifdef CONFIG_ACPI_BOOT - /* Initialize the ACPI boot-time table parser */ - acpi_table_init(); -@@ -308,9 +337,13 @@ - smp_build_cpu_map(); /* happens, e.g., with the Ski simulator */ - # endif - #endif /* CONFIG_APCI_BOOT */ -+ io_port_init(); -+} -+#endif - -- find_memory(); -- -+void __init -+setup_arch (void) -+{ - /* process SAL system table: */ - ia64_sal_init(efi.sal_systab); - -@@ -353,7 +386,6 @@ - /* enable IA-64 Machine Check Abort Handling */ - ia64_mca_init(); - -- platform_setup(cmdline_p); - paging_init(); - } - -@@ -413,6 +445,9 @@ - sprintf(cp, " 0x%lx", mask); - } - -+#ifdef XEN -+#define seq_printf(a,b...) 
printf(b) -+#endif - seq_printf(m, - "processor : %d\n" - "vendor : %s\n" -@@ -616,7 +651,11 @@ - | IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC)); - atomic_inc(&init_mm.mm_count); - current->active_mm = &init_mm; -+#ifdef XEN -+ if (current->domain->arch.mm) -+#else - if (current->mm) -+#endif - BUG(); - - ia64_mmu_init(ia64_imva(cpu_data)); -@@ -667,6 +706,8 @@ - void - check_bugs (void) - { -+#ifndef XEN - ia64_patch_mckinley_e9((unsigned long) __start___mckinley_e9_bundles, - (unsigned long) __end___mckinley_e9_bundles); -+#endif - } diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/slab.c --- a/xen/arch/ia64/patch/linux-2.6.7/slab.c Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,139 +0,0 @@ ---- /home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/mm/slab.c 2004-06-15 23:19:44.000000000 -0600 -+++ /home/djm/src/xen/xeno-ia64.bk/xen/arch/ia64/slab.c 2004-12-17 13:47:03.000000000 -0700 -@@ -86,15 +86,30 @@ - #include <linux/init.h> - #include <linux/compiler.h> - #include <linux/seq_file.h> -+#ifndef XEN - #include <linux/notifier.h> - #include <linux/kallsyms.h> - #include <linux/cpu.h> - #include <linux/sysctl.h> - #include <linux/module.h> -+#endif - - #include <asm/uaccess.h> - #include <asm/cacheflush.h> -+#ifndef XEN - #include <asm/tlbflush.h> -+#endif -+ -+#ifdef XEN -+#define lock_cpu_hotplug() do { } while (0) -+#define unlock_cpu_hotplug() do { } while (0) -+#define might_sleep_if(x) do { } while (0) -+#define dump_stack() do { } while (0) -+#define start_cpu_timer(cpu) do { } while (0) -+static inline void __down(struct semaphore *sem) { } -+static inline void __up(struct semaphore *sem) { } -+static inline void might_sleep(void) { } -+#endif - - /* - * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL, -@@ -530,7 +545,9 @@ - FULL - } g_cpucache_up; - -+#ifndef XEN - static DEFINE_PER_CPU(struct timer_list, reap_timers); -+#endif - - static void reap_timer_fnc(unsigned long data); - static void free_block(kmem_cache_t* cachep, void** objpp, int len); -@@ -588,6 +605,7 @@ - * Add the CPU number into the expiry time to minimize the possibility of the - * CPUs getting into lockstep and contending for the global cache chain lock. - */ -+#ifndef XEN - static void __devinit start_cpu_timer(int cpu) - { - struct timer_list *rt = &per_cpu(reap_timers, cpu); -@@ -600,6 +618,7 @@ - add_timer_on(rt, cpu); - } - } -+#endif - - #ifdef CONFIG_HOTPLUG_CPU - static void stop_cpu_timer(int cpu) -@@ -634,6 +653,7 @@ - return nc; - } - -+#ifndef XEN - static int __devinit cpuup_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -@@ -693,6 +713,7 @@ - } - - static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; -+#endif - - /* Initialisation. - * Called after the gfp() functions have been enabled, and before smp_init(). -@@ -805,10 +826,14 @@ - /* Done! 
*/ - g_cpucache_up = FULL; - -+#ifdef XEN -+printk("kmem_cache_init: some parts commented out, ignored\n"); -+#else - /* Register a cpu startup notifier callback - * that initializes ac_data for all new cpus - */ - register_cpu_notifier(&cpucache_notifier); -+#endif - - - /* The reap timers are started later, with a module init call: -@@ -886,8 +911,10 @@ - page++; - } - sub_page_state(nr_slab, nr_freed); -+#ifndef XEN - if (current->reclaim_state) - current->reclaim_state->reclaimed_slab += nr_freed; -+#endif - free_pages((unsigned long)addr, cachep->gfporder); - if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - atomic_sub(1<<cachep->gfporder, &slab_reclaim_pages); -@@ -1363,8 +1390,10 @@ - + cachep->num; - } - -+#ifndef XEN - cachep->lists.next_reap = jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep)%REAPTIMEOUT_LIST3; -+#endif - - /* Need the semaphore to access the chain. */ - down(&cache_chain_sem); -@@ -2237,8 +2266,10 @@ - - if (unlikely(addr < min_addr)) - goto out; -+#ifndef XEN - if (unlikely(addr > (unsigned long)high_memory - size)) - goto out; -+#endif - if (unlikely(addr & align_mask)) - goto out; - if (unlikely(!kern_addr_valid(addr))) -@@ -2769,6 +2800,7 @@ - */ - static void reap_timer_fnc(unsigned long cpu) - { -+#ifndef XEN - struct timer_list *rt = &__get_cpu_var(reap_timers); - - /* CPU hotplug can drag us off cpu: don't run on wrong CPU */ -@@ -2776,6 +2808,7 @@ - cache_reap(); - mod_timer(rt, jiffies + REAPTIMEOUT_CPUC + cpu); - } -+#endif - } - - #ifdef CONFIG_PROC_FS diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/slab.h --- a/xen/arch/ia64/patch/linux-2.6.7/slab.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,14 +0,0 @@ ---- /home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/include/linux/slab.h 2004-06-15 23:20:26.000000000 -0600 -+++ /home/djm/src/xen/xeno-ia64.bk/xen/include/asm-ia64/slab.h 2004-08-25 19:28:13.000000000 -0600 -@@ -83,7 +83,11 @@ - goto found; \ - else \ - i++; -+#ifdef XEN -+#include <linux/kmalloc_sizes.h> -+#else - #include "kmalloc_sizes.h" -+#endif - #undef CACHE - { - extern void __you_cannot_kmalloc_that_much(void); diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/swiotlb.c --- a/xen/arch/ia64/patch/linux-2.6.7/swiotlb.c Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,47 +0,0 @@ ---- ../../linux-2.6.7/arch/ia64/lib/swiotlb.c 2004-06-15 23:19:43.000000000 -0600 -+++ arch/ia64/lib/swiotlb.c 2005-03-23 14:54:05.000000000 -0700 -@@ -100,7 +100,11 @@ - /* - * Get IO TLB memory from the low pages - */ -- io_tlb_start = alloc_bootmem_low_pages(io_tlb_nslabs * (1 << IO_TLB_SHIFT)); -+ /* FIXME: Do we really need swiotlb in HV? If all memory trunks -+ * presented to guest as <4G, are actually <4G in machine range, -+ * no DMA intevention from HV... -+ */ -+ io_tlb_start = alloc_xenheap_pages(get_order(io_tlb_nslabs * (1 << IO_TLB_SHIFT))); - if (!io_tlb_start) - BUG(); - io_tlb_end = io_tlb_start + io_tlb_nslabs * (1 << IO_TLB_SHIFT); -@@ -110,11 +114,11 @@ - * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE - * between io_tlb_start and io_tlb_end. 
- */ -- io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int)); -+ io_tlb_list = alloc_xenheap_pages(get_order(io_tlb_nslabs * sizeof(int))); - for (i = 0; i < io_tlb_nslabs; i++) - io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); - io_tlb_index = 0; -- io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(char *)); -+ io_tlb_orig_addr = alloc_xenheap_pages(get_order(io_tlb_nslabs * sizeof(char *))); - - printk(KERN_INFO "Placing software IO TLB between 0x%p - 0x%p\n", - (void *) io_tlb_start, (void *) io_tlb_end); -@@ -279,7 +283,7 @@ - /* XXX fix me: the DMA API should pass us an explicit DMA mask instead: */ - flags |= GFP_DMA; - -- ret = (void *)__get_free_pages(flags, get_order(size)); -+ ret = (void *)alloc_xenheap_pages(get_order(size)); - if (!ret) - return NULL; - -@@ -294,7 +298,7 @@ - void - swiotlb_free_coherent (struct device *hwdev, size_t size, void *vaddr, dma_addr_t dma_handle) - { -- free_pages((unsigned long) vaddr, get_order(size)); -+ free_xenheap_pages((unsigned long) vaddr, get_order(size)); - } - - /* diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/system.h --- a/xen/arch/ia64/patch/linux-2.6.7/system.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,43 +0,0 @@ ---- ../../linux-2.6.7/include/asm-ia64/system.h 2005-03-24 19:39:56.000000000 -0700 -+++ include/asm-ia64/system.h 2005-04-01 12:56:37.000000000 -0700 -@@ -24,8 +24,16 @@ - * 0xa000000000000000+2*PERCPU_PAGE_SIZE - * - 0xa000000000000000+3*PERCPU_PAGE_SIZE remain unmapped (guard page) - */ -+#ifdef XEN -+//#define KERNEL_START 0xf000000100000000 -+#define KERNEL_START 0xf000000004000000 -+#define PERCPU_ADDR 0xf100000000000000-PERCPU_PAGE_SIZE -+#define SHAREDINFO_ADDR 0xf100000000000000 -+#define VHPT_ADDR 0xf200000000000000 -+#else - #define KERNEL_START 0xa000000100000000 - #define PERCPU_ADDR (-PERCPU_PAGE_SIZE) -+#endif - - #ifndef __ASSEMBLY__ - -@@ -218,9 +226,13 @@ - # define PERFMON_IS_SYSWIDE() (0) - #endif - -+#ifdef XEN -+#define IA64_HAS_EXTRA_STATE(t) 0 -+#else - #define IA64_HAS_EXTRA_STATE(t) \ - ((t)->thread.flags & (IA64_THREAD_DBG_VALID|IA64_THREAD_PM_VALID) \ - || IS_IA32_PROCESS(ia64_task_regs(t)) || PERFMON_IS_SYSWIDE()) -+#endif - - #define __switch_to(prev,next,last) do { \ - if (IA64_HAS_EXTRA_STATE(prev)) \ -@@ -249,6 +261,9 @@ - #else - # define switch_to(prev,next,last) __switch_to(prev, next, last) - #endif -+//#ifdef XEN -+//#undef switch_to -+//#endif - - /* - * On IA-64, we don't want to hold the runqueue's lock during the low-level context-switch, diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/time.c --- a/xen/arch/ia64/patch/linux-2.6.7/time.c Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,338 +0,0 @@ ---- ../../linux-2.6.7/arch/ia64/kernel/time.c 2004-06-15 23:19:01.000000000 -0600 -+++ arch/ia64/time.c 2005-03-14 17:27:11.000000000 -0700 -@@ -10,16 +10,22 @@ - */ - #include <linux/config.h> - -+#ifndef XEN - #include <linux/cpu.h> -+#endif - #include <linux/init.h> - #include <linux/kernel.h> - #include <linux/module.h> -+#ifndef XEN - #include <linux/profile.h> -+#endif - #include <linux/sched.h> - #include <linux/time.h> - #include <linux/interrupt.h> - #include <linux/efi.h> -+#ifndef XEN - #include <linux/profile.h> -+#endif - #include <linux/timex.h> - - #include <asm/machvec.h> -@@ -29,6 +35,9 @@ - #include <asm/sal.h> - #include <asm/sections.h> - #include <asm/system.h> -+#ifdef XEN -+#include <asm/ia64_int.h> -+#endif - - extern unsigned long wall_jiffies; - -@@ 
-45,6 +54,59 @@ - - #endif - -+#ifdef XEN -+volatile unsigned long last_nsec_offset; -+extern rwlock_t xtime_lock; -+unsigned long cpu_khz; /* Detected as we calibrate the TSC */ -+static s_time_t stime_irq; /* System time at last 'time update' */ -+ -+static inline u64 get_time_delta(void) -+{ -+ return ia64_get_itc(); -+} -+ -+s_time_t get_s_time(void) -+{ -+ s_time_t now; -+ unsigned long flags; -+ -+ read_lock_irqsave(&xtime_lock, flags); -+ -+ now = stime_irq + get_time_delta(); -+ -+ /* Ensure that the returned system time is monotonically increasing. */ -+ { -+ static s_time_t prev_now = 0; -+ if ( unlikely(now < prev_now) ) -+ now = prev_now; -+ prev_now = now; -+ } -+ -+ read_unlock_irqrestore(&xtime_lock, flags); -+ -+ return now; -+} -+ -+void update_dom_time(struct vcpu *v) -+{ -+// FIXME: implement this? -+// printf("update_dom_time: called, not implemented, skipping\n"); -+ return; -+} -+ -+/* Set clock to <secs,usecs> after 00:00:00 UTC, 1 January, 1970. */ -+void do_settime(unsigned long secs, unsigned long usecs, u64 system_time_base) -+{ -+// FIXME: Should this be do_settimeofday (from linux)??? -+ printf("do_settime: called, not implemented, stopping\n"); -+ dummy(); -+} -+#endif -+ -+#if 0 /* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! */ -+#endif /* !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! */ -+ -+#ifndef XEN - static void - itc_reset (void) - { -@@ -80,12 +142,15 @@ - return (elapsed_cycles*local_cpu_data->nsec_per_cyc) >> IA64_NSEC_PER_CYC_SHIFT; - } - -+#ifndef XEN - static struct time_interpolator itc_interpolator = { - .get_offset = itc_get_offset, - .update = itc_update, - .reset = itc_reset - }; -+#endif - -+#ifndef XEN - int - do_settimeofday (struct timespec *tv) - { -@@ -95,7 +160,9 @@ - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - -+#ifdef TURN_ME_OFF_FOR_NOW_IA64_XEN - write_seqlock_irq(&xtime_lock); -+#endif - { - /* - * This is revolting. We need to set "xtime" correctly. However, the value -@@ -117,12 +184,15 @@ - time_esterror = NTP_PHASE_LIMIT; - time_interpolator_reset(); - } -+#ifdef TURN_ME_OFF_FOR_NOW_IA64_XEN - write_sequnlock_irq(&xtime_lock); -+#endif - clock_was_set(); - return 0; - } - - EXPORT_SYMBOL(do_settimeofday); -+#endif - - void - do_gettimeofday (struct timeval *tv) -@@ -185,6 +255,7 @@ - } - - EXPORT_SYMBOL(do_gettimeofday); -+#endif - - /* - * The profiling function is SMP safe. (nothing can mess -@@ -195,6 +266,9 @@ - static inline void - ia64_do_profile (struct pt_regs * regs) - { -+#ifdef XEN -+} -+#else - unsigned long ip, slot; - extern cpumask_t prof_cpu_mask; - -@@ -231,24 +305,89 @@ - ip = prof_len-1; - atomic_inc((atomic_t *)&prof_buffer[ip]); - } -+#endif -+ -+#ifdef XEN -+unsigned long domain0_ready = 0; // FIXME (see below) -+#define typecheck(a,b) 1 -+/* FROM linux/include/linux/jiffies.h */ -+/* -+ * These inlines deal with timer wrapping correctly. You are -+ * strongly encouraged to use them -+ * 1. Because people otherwise forget -+ * 2. Because if the timer wrap changes in future you won't have to -+ * alter your driver code. -+ * -+ * time_after(a,b) returns true if the time a is after time b. -+ * -+ * Do this with "<0" and ">=0" to only test the sign of the result. A -+ * good compiler would generate better code (and a really good compiler -+ * wouldn't care). Gcc is currently neither. 
-+ */ -+#define time_after(a,b) \ -+ (typecheck(unsigned long, a) && \ -+ typecheck(unsigned long, b) && \ -+ ((long)(b) - (long)(a) < 0)) -+#define time_before(a,b) time_after(b,a) -+ -+#define time_after_eq(a,b) \ -+ (typecheck(unsigned long, a) && \ -+ typecheck(unsigned long, b) && \ -+ ((long)(a) - (long)(b) >= 0)) -+#define time_before_eq(a,b) time_after_eq(b,a) -+#endif - - static irqreturn_t - timer_interrupt (int irq, void *dev_id, struct pt_regs *regs) - { - unsigned long new_itm; - -+#ifndef XEN - if (unlikely(cpu_is_offline(smp_processor_id()))) { - return IRQ_HANDLED; - } -+#endif -+#ifdef XEN -+ if (current->domain == dom0) { -+ // FIXME: there's gotta be a better way of doing this... -+ // We have to ensure that domain0 is launched before we -+ // call vcpu_timer_expired on it -+ //domain0_ready = 1; // moved to xensetup.c -+ } -+ if (domain0_ready && vcpu_timer_expired(dom0->vcpu[0])) { -+ vcpu_pend_timer(dom0->vcpu[0]); -+ //vcpu_set_next_timer(dom0->vcpu[0]); -+ domain_wake(dom0->vcpu[0]); -+ } -+ if (!is_idle_task(current->domain) && current->domain != dom0) { -+ if (vcpu_timer_expired(current)) { -+ vcpu_pend_timer(current); -+ // ensure another timer interrupt happens even if domain doesn't -+ vcpu_set_next_timer(current); -+ domain_wake(current); -+ } -+ } -+ raise_actimer_softirq(); -+#endif - -+#ifndef XEN - platform_timer_interrupt(irq, dev_id, regs); -+#endif - - new_itm = local_cpu_data->itm_next; - - if (!time_after(ia64_get_itc(), new_itm)) -+#ifdef XEN -+ return; -+#else - printk(KERN_ERR "Oops: timer tick before it's due (itc=%lx,itm=%lx)\n", - ia64_get_itc(), new_itm); -+#endif - -+#ifdef XEN -+// printf("GOT TO HERE!!!!!!!!!!!\n"); -+ //while(1); -+#endif - ia64_do_profile(regs); - - while (1) { -@@ -269,10 +408,16 @@ - * another CPU. We need to avoid to SMP race by acquiring the - * xtime_lock. - */ -+#ifdef TURN_ME_OFF_FOR_NOW_IA64_XEN - write_seqlock(&xtime_lock); -+#endif -+#ifdef TURN_ME_OFF_FOR_NOW_IA64_XEN - do_timer(regs); -+#endif - local_cpu_data->itm_next = new_itm; -+#ifdef TURN_ME_OFF_FOR_NOW_IA64_XEN - write_sequnlock(&xtime_lock); -+#endif - } else - local_cpu_data->itm_next = new_itm; - -@@ -292,7 +437,12 @@ - */ - while (!time_after(new_itm, ia64_get_itc() + local_cpu_data->itm_delta/2)) - new_itm += local_cpu_data->itm_delta; -+//#ifdef XEN -+// vcpu_set_next_timer(current); -+//#else -+//printf("***** timer_interrupt: Setting itm to %lx\n",new_itm); - ia64_set_itm(new_itm); -+//#endif - /* double check, in case we got hit by a (slow) PMI: */ - } while (time_after_eq(ia64_get_itc(), new_itm)); - return IRQ_HANDLED; -@@ -307,6 +457,7 @@ - int cpu = smp_processor_id(); - unsigned long shift = 0, delta; - -+printf("ia64_cpu_local_tick: about to call ia64_set_itv\n"); - /* arrange for the cycle counter to generate a timer interrupt: */ - ia64_set_itv(IA64_TIMER_VECTOR); - -@@ -320,6 +471,7 @@ - shift = (2*(cpu - hi) + 1) * delta/hi/2; - } - local_cpu_data->itm_next = ia64_get_itc() + delta + shift; -+printf("***** ia64_cpu_local_tick: Setting itm to %lx\n",local_cpu_data->itm_next); - ia64_set_itm(local_cpu_data->itm_next); - } - -@@ -335,6 +487,7 @@ - * frequency and then a PAL call to determine the frequency ratio between the ITC - * and the base frequency. 
- */ -+ - status = ia64_sal_freq_base(SAL_FREQ_BASE_PLATFORM, - &platform_base_freq, &platform_base_drift); - if (status != 0) { -@@ -384,9 +537,11 @@ - + itc_freq/2)/itc_freq; - - if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) { -+#ifndef XEN - itc_interpolator.frequency = local_cpu_data->itc_freq; - itc_interpolator.drift = itc_drift; - register_time_interpolator(&itc_interpolator); -+#endif - } - - /* Setup the CPU local timer tick */ -@@ -395,7 +550,9 @@ - - static struct irqaction timer_irqaction = { - .handler = timer_interrupt, -+#ifndef XEN - .flags = SA_INTERRUPT, -+#endif - .name = "timer" - }; - -@@ -403,12 +560,16 @@ - time_init (void) - { - register_percpu_irq(IA64_TIMER_VECTOR, &timer_irqaction); -+#ifndef XEN - efi_gettimeofday(&xtime); -+#endif - ia64_init_itm(); - -+#ifndef XEN - /* - * Initialize wall_to_monotonic such that adding it to xtime will yield zero, the - * tv_nsec field must be normalized (i.e., 0 <= nsec < NSEC_PER_SEC). - */ - set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); -+#endif - } diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/tlb.c --- a/xen/arch/ia64/patch/linux-2.6.7/tlb.c Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,48 +0,0 @@ ---- /home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/arch/ia64/mm/tlb.c 2004-06-15 23:19:43.000000000 -0600 -+++ /home/djm/src/xen/xeno-ia64.bk/xen/arch/ia64/tlb.c 2004-08-25 19:28:12.000000000 -0600 -@@ -21,7 +21,9 @@ - #include <asm/mmu_context.h> - #include <asm/pgalloc.h> - #include <asm/pal.h> -+#ifndef XEN - #include <asm/tlbflush.h> -+#endif - - static struct { - unsigned long mask; /* mask of supported purge page-sizes */ -@@ -43,6 +45,9 @@ - void - wrap_mmu_context (struct mm_struct *mm) - { -+#ifdef XEN -+printf("wrap_mmu_context: called, not implemented\n"); -+#else - unsigned long tsk_context, max_ctx = ia64_ctx.max_ctx; - struct task_struct *tsk; - int i; -@@ -83,6 +88,7 @@ - put_cpu(); - } - local_flush_tlb_all(); -+#endif - } - - void -@@ -132,6 +138,9 @@ - void - flush_tlb_range (struct vm_area_struct *vma, unsigned long start, unsigned long end) - { -+#ifdef XEN -+printf("flush_tlb_range: called, not implemented\n"); -+#else - struct mm_struct *mm = vma->vm_mm; - unsigned long size = end - start; - unsigned long nbits; -@@ -163,6 +172,7 @@ - # endif - - ia64_srlz_i(); /* srlz.i implies srlz.d */ -+#endif - } - EXPORT_SYMBOL(flush_tlb_range); - diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/types.h --- a/xen/arch/ia64/patch/linux-2.6.7/types.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,15 +0,0 @@ ---- /home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/include/asm-ia64/types.h 2004-06-15 23:19:01.000000000 -0600 -+++ /home/djm/src/xen/xeno-ia64.bk/xen/include/asm-ia64/types.h 2004-11-11 17:08:30.000000000 -0700 -@@ -1,5 +1,12 @@ - #ifndef _ASM_IA64_TYPES_H - #define _ASM_IA64_TYPES_H -+#ifdef XEN -+#ifndef __ASSEMBLY__ -+typedef unsigned long ssize_t; -+typedef unsigned long size_t; -+typedef long long loff_t; -+#endif -+#endif - - /* - * This file is never included by application software unless explicitly requested (e.g., diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/unaligned.c --- a/xen/arch/ia64/patch/linux-2.6.7/unaligned.c Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,97 +0,0 @@ ---- /home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/arch/ia64/kernel/unaligned.c 2004-06-15 23:20:03.000000000 -0600 -+++ 
/home/djm/src/xen/xeno-ia64.bk/xen/arch/ia64/unaligned.c 2004-08-25 19:28:12.000000000 -0600 -@@ -15,8 +15,10 @@ - */ - #include <linux/kernel.h> - #include <linux/sched.h> -+#ifndef XEN - #include <linux/smp_lock.h> - #include <linux/tty.h> -+#endif - - #include <asm/intrinsics.h> - #include <asm/processor.h> -@@ -24,7 +26,16 @@ - #include <asm/uaccess.h> - #include <asm/unaligned.h> - -+#ifdef XEN -+#define ia64_peek(x...) printk("ia64_peek: called, not implemented\n") -+#define ia64_poke(x...) printk("ia64_poke: called, not implemented\n") -+#define ia64_sync_fph(x...) printk("ia64_sync_fph: called, not implemented\n") -+#define ia64_flush_fph(x...) printk("ia64_flush_fph: called, not implemented\n") -+#define die_if_kernel(x...) printk("die_if_kernel: called, not implemented\n") -+#define jiffies 0 -+#else - extern void die_if_kernel(char *str, struct pt_regs *regs, long err) __attribute__ ((noreturn)); -+#endif - - #undef DEBUG_UNALIGNED_TRAP - -@@ -437,7 +448,11 @@ - } - - -+#ifdef XEN -+void -+#else - static void -+#endif - setreg (unsigned long regnum, unsigned long val, int nat, struct pt_regs *regs) - { - struct switch_stack *sw = (struct switch_stack *) regs - 1; -@@ -611,7 +626,11 @@ - } - - -+#ifdef XEN -+void -+#else - static void -+#endif - getreg (unsigned long regnum, unsigned long *val, int *nat, struct pt_regs *regs) - { - struct switch_stack *sw = (struct switch_stack *) regs - 1; -@@ -1298,7 +1317,9 @@ - mm_segment_t old_fs = get_fs(); - unsigned long bundle[2]; - unsigned long opcode; -+#ifndef XEN - struct siginfo si; -+#endif - const struct exception_table_entry *eh = NULL; - union { - unsigned long l; -@@ -1317,6 +1338,9 @@ - * user-level unaligned accesses. Otherwise, a clever program could trick this - * handler into reading an arbitrary kernel addresses... 
- */ -+#ifdef XEN -+printk("ia64_handle_unaligned: called, not working yet\n"); -+#else - if (!user_mode(regs)) - eh = search_exception_tables(regs->cr_iip + ia64_psr(regs)->ri); - if (user_mode(regs) || eh) { -@@ -1353,6 +1377,7 @@ - - if (__copy_from_user(bundle, (void *) regs->cr_iip, 16)) - goto failure; -+#endif - - /* - * extract the instruction from the bundle given the slot number -@@ -1493,6 +1518,7 @@ - /* NOT_REACHED */ - } - force_sigbus: -+#ifndef XEN - si.si_signo = SIGBUS; - si.si_errno = 0; - si.si_code = BUS_ADRALN; -@@ -1501,5 +1527,6 @@ - si.si_isr = 0; - si.si_imm = 0; - force_sig_info(SIGBUS, &si, current); -+#endif - goto done; - } diff -r de3576a1c62c -r dfaf788ab18c xen/arch/ia64/patch/linux-2.6.7/wait.h --- a/xen/arch/ia64/patch/linux-2.6.7/wait.h Thu Aug 25 20:52:38 2005 +++ /dev/null Fri Aug 26 20:47:16 2005 @@ -1,26 +0,0 @@ ---- /home/djm/src/xen/xeno-ia64.bk/xen/linux-2.6.7/include/linux/wait.h 2004-06-15 23:19:31.000000000 -0600 -+++ /home/djm/src/xen/xeno-ia64.bk/xen/include/asm-ia64/linux/wait.h 2004-08-25 19:28:13.000000000 -0600 -@@ -104,10 +104,15 @@ - list_del(&old->task_list); - } - -+#ifdef XEN -+void FASTCALL(__wake_up(struct task_struct *p)); -+#else - void FASTCALL(__wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key)); -+#endif - extern void FASTCALL(__wake_up_locked(wait_queue_head_t *q, unsigned int mode)); - extern void FASTCALL(__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr)); - -+#ifndef XEN - #define wake_up(x) __wake_up(x, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1, NULL) - #define wake_up_nr(x, nr) __wake_up(x, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr, NULL) - #define wake_up_all(x) __wake_up(x, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0, NULL) -@@ -117,6 +122,7 @@ - #define wake_up_interruptible_all(x) __wake_up(x, TASK_INTERRUPTIBLE, 0, NULL) - #define wake_up_locked(x) __wake_up_locked((x), TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE) - #define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1) -+#endif - - #define __wait_event(wq, condition) \ - do { \ _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-changelog
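
[Editor's note, not part of the changeset above.] The XEN block added to the time.c hunk inlines the wrap-safe time_after()/time_before() comparisons from linux/include/linux/jiffies.h, with typecheck() stubbed out to 1. As a minimal standalone C sketch of why the signed-difference form keeps working across counter wrap-around (the before_wrap/after_wrap values are invented purely for illustration):

/*
 * Sketch only: the wrap-safe comparisons copied into the XEN block
 * above.  Casting the difference to a signed long keeps the result
 * correct when the tick counter wraps, provided the two timestamps
 * are less than LONG_MAX ticks apart.
 */
#include <assert.h>
#include <limits.h>

#define time_after(a, b)   ((long)(b) - (long)(a) < 0)
#define time_before(a, b)  time_after(b, a)

int main(void)
{
    unsigned long before_wrap = ULONG_MAX - 5;  /* counter just before wrapping */
    unsigned long after_wrap  = 10;             /* counter just after wrapping  */

    /* A plain unsigned comparison gets the ordering wrong here... */
    assert(!(after_wrap > before_wrap));

    /* ...but the signed-difference form sees through the wrap. */
    assert(time_after(after_wrap, before_wrap));
    assert(time_before(before_wrap, after_wrap));
    return 0;
}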