[Xen-changelog] The patch attached enables x86_64 xenlinux with "late pin, early unpin"
# HG changeset patch
# User kaf24@xxxxxxxxxxxxxxxxxxxx
# Node ID 5978be010beca73a6b88ae68d2e120c531bb0edd
# Parent  edeee85c90b1fe1431437338cb1645acb176b0bd

The patch attached enables x86_64 xenlinux with "late pin, early unpin",
which is already implemented for x86_32. Since we now pin only the root
page table rather than every level, overall performance is better,
especially for workloads that perform heavy memory-management operations.

On an 8-way x86_64 xenlinux (dom0), kernel build time improved by about
10% (using make -j32). Even on a small setup such as a UP HT system, I
see about a 3% gain in kernel build time (make -j4). Lmbench also shows
improvements in fork/exec/sh:

Processor, Processes - times in microseconds - smaller is better
--------------------------------------------------------------------------
Host      OS            Mhz null null      open slct  sig  sig fork exec   sh
                            call  I/O stat clos  TCP inst hndl proc proc proc
--------- ------------- ---- ---- ---- ---- ---- ---- ---- ---- ---- ---- ----
          Linux 2.6.12- 3786 1.13 1.36 3.93 6.04 10.5 1.43 4.33 536. 1446 3614
          Linux 2.6.12- 3786 1.13 1.36 3.91 6.03 10.4 1.44 4.38 346. 1050 2831

Signed-off-by: Jun Nakajima <jun.nakajima@xxxxxxxxx>
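To summarize the mechanism before the diff: under "late pin", an mm's page
tables stay ordinary writable memory until the mm is first switched to; only
then are all levels write-protected and the root registered (pinned) with the
hypervisor. Under "early unpin", the pin is dropped in exit_mmap, before the
page tables are torn down, so teardown proceeds with ordinary writes. The
user-space sketch below is illustrative only: mm_t, the hand-rolled list, and
the printf tracing stand in for struct mm_struct, the kernel list API, and
the Xen pin/unpin hypercalls; none of it is a real kernel interface.

#include <stdio.h>

typedef struct mm {
	int pinned;              /* models mm->context.pinned        */
	struct mm *prev, *next;  /* models mm->context.unpinned list */
	const char *name;
} mm_t;

/* models the global mm_unpinned list (locking omitted in this model) */
static mm_t unpinned_head = { 0, &unpinned_head, &unpinned_head, "head" };

static void list_add(mm_t *mm)           /* put mm on the unpinned list */
{
	mm->next = unpinned_head.next;
	mm->prev = &unpinned_head;
	unpinned_head.next->prev = mm;
	unpinned_head.next = mm;
}

static void list_del(mm_t *mm)           /* take mm off the unpinned list */
{
	mm->prev->next = mm->next;
	mm->next->prev = mm->prev;
}

static void mm_pin(mm_t *mm)    /* all levels -> RO, root validated by Xen */
{
	printf("pin   %s (page tables -> RO, xen_pgd_pin)\n", mm->name);
	mm->pinned = 1;
	list_del(mm);
}

static void mm_unpin(mm_t *mm)  /* root released, all levels -> RW again */
{
	printf("unpin %s (xen_pgd_unpin, page tables -> RW)\n", mm->name);
	mm->pinned = 0;
	list_add(mm);
}

static void mm_pin_all(void)    /* drain the unpinned list, as the patch's
				   mm_pin_all() does */
{
	while (unpinned_head.next != &unpinned_head)
		mm_pin(unpinned_head.next);
}

static void switch_mm(mm_t *next)  /* late pin: pin on first activation */
{
	if (!next->pinned)
		mm_pin(next);
	printf("run   %s\n", next->name);
}

static void exit_mmap(mm_t *mm)  /* early unpin: unpin before teardown, so
				    destruction writes are plain writes */
{
	if (mm->pinned)
		mm_unpin(mm);
	printf("tear down %s page tables with writable ptes\n", mm->name);
}

int main(void)
{
	mm_t a = { 0, NULL, NULL, "mm_a" };
	mm_t b = { 0, NULL, NULL, "mm_b" };

	list_add(&a);    /* init_new_context(): new mms start unpinned */
	list_add(&b);

	switch_mm(&a);   /* first activation pins mm_a                 */
	mm_pin_all();    /* pins everything still unpinned (mm_b)      */
	exit_mmap(&a);   /* exit unpins before the page tables go away */
	return 0;
}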
diff -r edeee85c90b1 -r 5978be010bec linux-2.6-xen-sparse/arch/xen/x86_64/kernel/ldt.c
--- a/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/ldt.c Fri Aug 26 11:00:14 2005
+++ b/linux-2.6-xen-sparse/arch/xen/x86_64/kernel/ldt.c Fri Aug 26 11:02:14 2005
@@ -105,13 +105,18 @@
 	struct mm_struct * old_mm;
 	int retval = 0;
 
+	memset(&mm->context, 0, sizeof(mm->context));
 	init_MUTEX(&mm->context.sem);
-	mm->context.size = 0;
 	old_mm = current->mm;
 	if (old_mm && old_mm->context.size > 0) {
 		down(&old_mm->context.sem);
 		retval = copy_ldt(&mm->context, &old_mm->context);
 		up(&old_mm->context.sem);
+	}
+	if (retval == 0) {
+		spin_lock(&mm_unpinned_lock);
+		list_add(&mm->context.unpinned, &mm_unpinned);
+		spin_unlock(&mm_unpinned_lock);
 	}
 	return retval;
 }
@@ -133,6 +138,11 @@
 		else
 			kfree(mm->context.ldt);
 		mm->context.size = 0;
+	}
+	if (!mm->context.pinned) {
+		spin_lock(&mm_unpinned_lock);
+		list_del(&mm->context.unpinned);
+		spin_unlock(&mm_unpinned_lock);
 	}
 }
diff -r edeee85c90b1 -r 5978be010bec linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c
--- a/linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c    Fri Aug 26 11:00:14 2005
+++ b/linux-2.6-xen-sparse/arch/xen/x86_64/mm/init.c    Fri Aug 26 11:02:14 2005
@@ -712,6 +712,7 @@
 	HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
 
 	memset(empty_zero_page, 0, sizeof(empty_zero_page));
+	init_mm.context.pinned = 1;
 
 #ifdef CONFIG_XEN_PHYSDEV_ACCESS
 	{
diff -r edeee85c90b1 -r 5978be010bec linux-2.6-xen-sparse/arch/xen/x86_64/mm/pageattr.c
--- a/linux-2.6-xen-sparse/arch/xen/x86_64/mm/pageattr.c        Fri Aug 26 11:00:14 2005
+++ b/linux-2.6-xen-sparse/arch/xen/x86_64/mm/pageattr.c        Fri Aug 26 11:02:14 2005
@@ -12,19 +12,145 @@
 #include <asm/uaccess.h>
 #include <asm/processor.h>
 #include <asm/tlbflush.h>
+#include <asm/io.h>
+
+#ifdef CONFIG_XEN
 #include <asm/pgalloc.h>
-#include <asm/io.h>
+#include <asm/mmu_context.h>
+
+LIST_HEAD(mm_unpinned);
+DEFINE_SPINLOCK(mm_unpinned_lock);
+
+static inline void mm_walk_set_prot(void *pt, pgprot_t flags)
+{
+	struct page *page = virt_to_page(pt);
+	unsigned long pfn = page_to_pfn(page);
+
+	BUG_ON(HYPERVISOR_update_va_mapping(
+		(unsigned long)__va(pfn << PAGE_SHIFT),
+		pfn_pte(pfn, flags), 0));
+}
+
+static void mm_walk(struct mm_struct *mm, pgprot_t flags)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	int g,u,m;
+
+	pgd = mm->pgd;
+	for (g = 0; g <= USER_PTRS_PER_PGD; g++, pgd++) {
+		if (pgd_none(*pgd))
+			continue;
+		pud = pud_offset(pgd, 0);
+		if (PTRS_PER_PUD > 1) /* not folded */
+			mm_walk_set_prot(pud,flags);
+		for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+			if (pud_none(*pud))
+				continue;
+			pmd = pmd_offset(pud, 0);
+			if (PTRS_PER_PMD > 1) /* not folded */
+				mm_walk_set_prot(pmd,flags);
+			for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+				if (pmd_none(*pmd))
+					continue;
+				pte = pte_offset_kernel(pmd,0);
+				mm_walk_set_prot(pte,flags);
+			}
+		}
+	}
+}
+
+void mm_pin(struct mm_struct *mm)
+{
+	spin_lock(&mm->page_table_lock);
+
+	mm_walk(mm, PAGE_KERNEL_RO);
+	BUG_ON(HYPERVISOR_update_va_mapping(
+		(unsigned long)mm->pgd,
+		pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO),
+		UVMF_TLB_FLUSH));
+	BUG_ON(HYPERVISOR_update_va_mapping(
+		(unsigned long)__user_pgd(mm->pgd),
+		pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, PAGE_KERNEL_RO),
+		UVMF_TLB_FLUSH));
+	xen_pgd_pin(__pa(mm->pgd)); /* kernel */
+	xen_pgd_pin(__pa(__user_pgd(mm->pgd))); /* user */
+	mm->context.pinned = 1;
+	spin_lock(&mm_unpinned_lock);
+	list_del(&mm->context.unpinned);
+	spin_unlock(&mm_unpinned_lock);
+
+	spin_unlock(&mm->page_table_lock);
+}
+
+void mm_unpin(struct mm_struct *mm)
+{
+	spin_lock(&mm->page_table_lock);
+
+	xen_pgd_unpin(__pa(mm->pgd));
+	xen_pgd_unpin(__pa(__user_pgd(mm->pgd)));
+	BUG_ON(HYPERVISOR_update_va_mapping(
+		(unsigned long)mm->pgd,
+		pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0));
+	BUG_ON(HYPERVISOR_update_va_mapping(
+		(unsigned long)__user_pgd(mm->pgd),
+		pfn_pte(virt_to_phys(__user_pgd(mm->pgd))>>PAGE_SHIFT, PAGE_KERNEL), 0));
+	mm_walk(mm, PAGE_KERNEL);
+	xen_tlb_flush();
+	mm->context.pinned = 0;
+	spin_lock(&mm_unpinned_lock);
+	list_add(&mm->context.unpinned, &mm_unpinned);
+	spin_unlock(&mm_unpinned_lock);
+
+	spin_unlock(&mm->page_table_lock);
+}
+
+void mm_pin_all(void)
+{
+	while (!list_empty(&mm_unpinned))
+		mm_pin(list_entry(mm_unpinned.next, struct mm_struct,
+				  context.unpinned));
+}
+
+void _arch_exit_mmap(struct mm_struct *mm)
+{
+	struct task_struct *tsk = current;
+
+	task_lock(tsk);
+
+	/*
+	 * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
+	 * *much* faster this way, as no tlb flushes means bigger wrpt batches.
+	 */
+	if ( tsk->active_mm == mm )
+	{
+		tsk->active_mm = &init_mm;
+		atomic_inc(&init_mm.mm_count);
+
+		switch_mm(mm, &init_mm, tsk);
+
+		atomic_dec(&mm->mm_count);
+		BUG_ON(atomic_read(&mm->mm_count) == 0);
+	}
+
+	task_unlock(tsk);
+
+	if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) )
+		mm_unpin(mm);
+}
 
 void pte_free(struct page *pte)
 {
-	pte_t *ptep;
-
-	ptep = pfn_to_kaddr(page_to_pfn(pte));
-
-	xen_pte_unpin(__pa(ptep));
-	make_page_writable(ptep);
-	__free_page(pte);
-}
+	unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
+
+	if (!pte_write(*virt_to_ptep(va)))
+		BUG_ON(HYPERVISOR_update_va_mapping(
+			va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0));
+	__free_page(pte);
+}
+#endif /* CONFIG_XEN */
 
 static inline pte_t *lookup_address(unsigned long address)
 {
@@ -78,7 +204,7 @@
 	} else
 		asm volatile("wbinvd":::"memory");
 	if (address)
-		__flush_tlb_one((unsigned long) address);
+		__flush_tlb_one(address);
 	else
 		__flush_tlb_all();
 }
@@ -166,14 +292,17 @@
 		BUG();
 
 	/* on x86-64 the direct mapping set at boot is not using 4k pages */
-//	BUG_ON(PageReserved(kpte_page));
 	/*
 	 * ..., but the XEN guest kernels (currently) do:
 	 * If the pte was reserved, it means it was created at boot
 	 * time (not via split_large_page) and in turn we must not
 	 * replace it with a large page.
 	 */
-	if (!PageReserved(kpte_page)) {
+#ifndef CONFIG_XEN
+	BUG_ON(PageReserved(kpte_page));
+#else
+	if (!PageReserved(kpte_page))
+#endif
 		switch (page_count(kpte_page)) {
 		case 1:
 			save_page(address, kpte_page);
@@ -182,7 +311,6 @@
 		case 0:
 			BUG(); /* memleak and failed 2M page regeneration */
 		}
-	}
 
 	return 0;
 }
diff -r edeee85c90b1 -r 5978be010bec linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu_context.h
--- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu_context.h     Fri Aug 26 11:00:14 2005
+++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu_context.h     Fri Aug 26 11:02:14 2005
@@ -58,6 +58,9 @@
 	}
 }
 
+extern void mm_pin(struct mm_struct *mm);
+extern void mm_unpin(struct mm_struct *mm);
+void mm_pin_all(void);
 
 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 			     struct task_struct *tsk)
@@ -66,6 +69,9 @@
 	struct mmuext_op _op[3], *op = _op;
 
 	if (likely(prev != next)) {
+		if (!next->context.pinned)
+			mm_pin(next);
+
 		/* stop flush ipis for the previous mm */
 		clear_bit(cpu, &prev->cpu_vm_mask);
 #if 0 /* XEN: no lazy tlb */
diff -r edeee85c90b1 -r 5978be010bec linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgalloc.h
--- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgalloc.h Fri Aug 26 11:00:14 2005
+++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/pgalloc.h Fri Aug 26 11:02:14 2005
@@ -21,12 +21,27 @@
 static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
 {
-	set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
+	if (unlikely((mm)->context.pinned)) {
+		BUG_ON(HYPERVISOR_update_va_mapping(
+			(unsigned long)__va(page_to_pfn(pte) << PAGE_SHIFT),
+			pfn_pte(page_to_pfn(pte), PAGE_KERNEL_RO), 0));
+		set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
+	} else {
+		*(pmd) = __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT));
+	}
 }
 
 static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
 {
-	set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
+	if (unlikely((mm)->context.pinned)) {
+		BUG_ON(HYPERVISOR_update_va_mapping(
+			(unsigned long)pmd,
+			pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT,
+				PAGE_KERNEL_RO), 0));
+		set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
+	} else {
+		*(pud) = __pud(_PAGE_TABLE | __pa(pmd));
+	}
 }
 
 /*
@@ -35,53 +50,54 @@
  */
 static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
 {
-	set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
-	set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
-}
-
-extern __inline__ pmd_t *get_pmd(void)
-{
-	pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
-	if (!pmd)
-		return NULL;
-	make_page_readonly(pmd);
-	xen_pmd_pin(__pa(pmd));
-	return pmd;
+	if (unlikely((mm)->context.pinned)) {
+		BUG_ON(HYPERVISOR_update_va_mapping(
+			(unsigned long)pud,
+			pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT,
+				PAGE_KERNEL_RO), 0));
+		set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
+		set_pgd(__user_pgd(pgd), __pgd(_PAGE_TABLE | __pa(pud)));
+	} else {
+		*(pgd) = __pgd(_PAGE_TABLE | __pa(pud));
+		*(__user_pgd(pgd)) = *(pgd);
+	}
 }
 
 extern __inline__ void pmd_free(pmd_t *pmd)
 {
-	BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
-	xen_pmd_unpin(__pa(pmd));
-	make_page_writable(pmd);
+	pte_t *ptep = virt_to_ptep(pmd);
+
+	if (!pte_write(*ptep)) {
+		BUG_ON(HYPERVISOR_update_va_mapping(
+			(unsigned long)pmd,
+			pfn_pte(virt_to_phys(pmd)>>PAGE_SHIFT, PAGE_KERNEL),
+			0));
+	}
 	free_page((unsigned long)pmd);
 }
 
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
 	pmd_t *pmd = (pmd_t *) get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
-	if (!pmd)
-		return NULL;
-	make_page_readonly(pmd);
-	xen_pmd_pin(__pa(pmd));
 	return pmd;
 }
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
 	pud_t *pud = (pud_t *) get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
-	if (!pud)
-		return NULL;
-	make_page_readonly(pud);
-	xen_pud_pin(__pa(pud));
 	return pud;
 }
 
 static inline void pud_free(pud_t *pud)
 {
-	BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
-	xen_pud_unpin(__pa(pud));
-	make_page_writable(pud);
+	pte_t *ptep = virt_to_ptep(pud);
+
+	if (!pte_write(*ptep)) {
+		BUG_ON(HYPERVISOR_update_va_mapping(
+			(unsigned long)pud,
+			pfn_pte(virt_to_phys(pud)>>PAGE_SHIFT, PAGE_KERNEL),
+			0));
+	}
 	free_page((unsigned long)pud);
 }
 
@@ -107,10 +123,6 @@
 	       (PTRS_PER_PGD - boundary) * sizeof(pgd_t));
 
 	memset(__user_pgd(pgd), 0, PAGE_SIZE); /* clean up user pgd */
-	make_pages_readonly(pgd, 2);
-
-	xen_pgd_pin(__pa(pgd)); /* kernel */
-	xen_pgd_pin(__pa(__user_pgd(pgd))); /* user */
 	/*
 	 * Set level3_user_pgt for vsyscall area
 	 */
@@ -121,31 +133,45 @@
 
 static inline void pgd_free(pgd_t *pgd)
 {
-	BUG_ON((unsigned long)pgd & (PAGE_SIZE-1));
-	xen_pgd_unpin(__pa(pgd));
-	xen_pgd_unpin(__pa(__user_pgd(pgd)));
-	make_pages_writable(pgd, 2);
+	pte_t *ptep = virt_to_ptep(pgd);
+
+	if (!pte_write(*ptep)) {
+		xen_pgd_unpin(__pa(pgd));
+		BUG_ON(HYPERVISOR_update_va_mapping(
+			(unsigned long)pgd,
+			pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL),
+			0));
+	}
+
+	ptep = virt_to_ptep(__user_pgd(pgd));
+
+	if (!pte_write(*ptep)) {
+		xen_pgd_unpin(__pa(__user_pgd(pgd)));
+		BUG_ON(HYPERVISOR_update_va_mapping(
+			(unsigned long)__user_pgd(pgd),
+			pfn_pte(virt_to_phys(__user_pgd(pgd))>>PAGE_SHIFT,
+				PAGE_KERNEL),
+			0));
+	}
+
 	free_pages((unsigned long)pgd, 1);
 }
 
 static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 {
 	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
-	if (!pte)
-		return NULL;
-	make_page_readonly(pte);
-	xen_pte_pin(__pa(pte));
+	if (pte)
+		make_page_readonly(pte);
+
 	return pte;
 }
 
 static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 {
-	pte_t *pte = (void *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
-	if (!pte)
-		return NULL;
-	make_page_readonly(pte);
-	xen_pte_pin(__pa(pte));
-	return virt_to_page((unsigned long)pte);
+	struct page *pte;
+
+	pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+	return pte;
 }
 
 /* Should really implement gc for free page table pages. This could be
diff -r edeee85c90b1 -r 5978be010bec linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h
--- a/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h        Fri Aug 26 11:00:14 2005
+++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/tlbflush.h        Fri Aug 26 11:02:14 2005
@@ -18,7 +18,7 @@
 
 #define __flush_tlb_all() __flush_tlb_global()
 
-#define __flush_tlb_one(addr) xen_invlpg(addr)
+#define __flush_tlb_one(addr) xen_invlpg((unsigned long)addr)
 
 /*
diff -r edeee85c90b1 -r 5978be010bec linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu.h
--- /dev/null   Fri Aug 26 11:00:14 2005
+++ b/linux-2.6-xen-sparse/include/asm-xen/asm-x86_64/mmu.h     Fri Aug 26 11:02:14 2005
@@ -0,0 +1,33 @@
+#ifndef __x86_64_MMU_H
+#define __x86_64_MMU_H
+
+#include <linux/spinlock.h>
+#include <asm/semaphore.h>
+
+/*
+ * The x86_64 doesn't have a mmu context, but
+ * we put the segment information here.
+ *
+ * cpu_vm_mask is used to optimize ldt flushing.
+ */
+typedef struct {
+	void *ldt;
+	rwlock_t ldtlock;
+	int size;
+	struct semaphore sem;
+#ifdef CONFIG_XEN
+	unsigned pinned:1;
+	struct list_head unpinned;
+#endif
+} mm_context_t;
+
+#ifdef CONFIG_XEN
+extern struct list_head mm_unpinned;
+extern spinlock_t mm_unpinned_lock;
+
+/* mm/memory.c:exit_mmap hook */
+extern void _arch_exit_mmap(struct mm_struct *mm);
+#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm)
+#endif
+
+#endif
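One pattern worth calling out from the pgalloc.h hunks above: once an mm is
pinned, every new page-table page hooked under it must be made read-only
before the parent entry can reference it, while an unpinned mm takes the fast
path of a plain store and defers validation to pin time. The user-space
sketch below models just that branch; the struct names and the readonly flag
are illustrative stand-ins, not kernel APIs.

#include <assert.h>
#include <stdio.h>

struct pt_page { int readonly; };               /* models a pmd/pte page */
struct mm      { int pinned; struct pt_page *slot; };

static void pmd_populate(struct mm *mm, struct pt_page *new_pt)
{
	if (mm->pinned) {
		/* pinned: the hypervisor validates this mm, so the new
		 * page must be mapped read-only before it is hooked in */
		new_pt->readonly = 1;
		printf("pinned mm: page -> RO, then hook it in\n");
	} else {
		/* unpinned: plain store; the page is validated later,
		 * when the whole mm is pinned in one batch */
		printf("unpinned mm: plain pointer store\n");
	}
	mm->slot = new_pt;
}

int main(void)
{
	struct pt_page pt1 = { 0 }, pt2 = { 0 };
	struct mm fresh  = { 0, NULL };  /* fork-time mm, not yet pinned */
	struct mm active = { 1, NULL };  /* currently running, pinned    */

	pmd_populate(&fresh, &pt1);      /* fast path */
	pmd_populate(&active, &pt2);     /* slow path */
	assert(pt1.readonly == 0 && pt2.readonly == 1);
	return 0;
}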
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog