[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] Re: [RFC, PATCH 7/24] i386 Vmi memory hole



>> The complications in my patch come 
>> from the fact that the vsyscall page has to be relocated dynamically, 
>> requiring, basically run time linking on the page and some tweaks to get 
>> sysenter to work.  If you don't use vsyscall (say, non-TLS glibc), then 
>> you don't need that complexity.  But I think it might be needed now, 
>> even for Xen.
> 
> I believe both Xen and execshield move vsyscall out of fixmap, and then
> map into userspace as normal vma.

Yep, my patch (attached below for reference) moves the vsyscall page
into user address space, just below PAGE_OFFSET.  It works basically the
same way the vsyscall page is mapped in the ia32 emulation of the x86_64
architecture.  The address stays fixed, thus the relocation magic isn't needed.

Once the vsyscall page is moved out of fixmap it's easy to make fixmap
movable and thus have a runtime-resizable address space hole at the top
of address space.  Patch is attached too, although that one is more
proof-of-concept, it doesn't make much sense as-is.  It has a kernel
command line option to specify the top of address space so you can play
around with it ...

Both patches are against 2.6.16-rc3 and most likely still apply just fine;
I haven't tested that though.

cheers,

  Gerd

-- 
Gerd 'just married' Hoffmann <kraxel@xxxxxxx>
I'm the hacker formerly known as Gerd Knorr.
http://www.suse.de/~kraxel/just-married.jpeg
Index: vanilla-2.6.16-rc3/arch/i386/kernel/asm-offsets.c
===================================================================
--- vanilla-2.6.16-rc3.orig/arch/i386/kernel/asm-offsets.c      2006-01-03 04:21:10.000000000 +0100
+++ vanilla-2.6.16-rc3/arch/i386/kernel/asm-offsets.c   2006-02-15 10:59:41.000000000 +0100
@@ -68,5 +68,5 @@ void foo(void)
                 sizeof(struct tss_struct));
 
        DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
-       DEFINE(VSYSCALL_BASE, __fix_to_virt(FIX_VSYSCALL));
+       DEFINE(VSYSCALL_BASE, (PAGE_OFFSET - 2*PAGE_SIZE));
 }
Index: vanilla-2.6.16-rc3/arch/i386/kernel/sysenter.c
===================================================================
--- vanilla-2.6.16-rc3.orig/arch/i386/kernel/sysenter.c 2006-01-03 04:21:10.000000000 +0100
+++ vanilla-2.6.16-rc3/arch/i386/kernel/sysenter.c      2006-02-13 09:57:36.000000000 +0100
@@ -13,6 +13,7 @@
 #include <linux/gfp.h>
 #include <linux/string.h>
 #include <linux/elf.h>
+#include <linux/mm.h>
 
 #include <asm/cpufeature.h>
 #include <asm/msr.h>
@@ -45,23 +46,88 @@ void enable_sep_cpu(void)
  */
 extern const char vsyscall_int80_start, vsyscall_int80_end;
 extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
+static void *syscall_page;
 
 int __init sysenter_setup(void)
 {
-       void *page = (void *)get_zeroed_page(GFP_ATOMIC);
-
-       __set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_READONLY_EXEC);
+       syscall_page = (void *)get_zeroed_page(GFP_ATOMIC);
 
        if (!boot_cpu_has(X86_FEATURE_SEP)) {
-               memcpy(page,
+               memcpy(syscall_page,
                       &vsyscall_int80_start,
                       &vsyscall_int80_end - &vsyscall_int80_start);
                return 0;
        }
 
-       memcpy(page,
+       memcpy(syscall_page,
               &vsyscall_sysenter_start,
               &vsyscall_sysenter_end - &vsyscall_sysenter_start);
 
        return 0;
 }
+
+static struct page*
+syscall_nopage(struct vm_area_struct *vma, unsigned long adr, int *type)
+{
+       struct page *p = virt_to_page(adr - vma->vm_start + syscall_page);
+       get_page(p);
+       return p;
+}
+
+/* Prevent VMA merging */
+static void syscall_vma_close(struct vm_area_struct *vma)
+{
+}
+
+static struct vm_operations_struct syscall_vm_ops = {
+       .close = syscall_vma_close,
+       .nopage = syscall_nopage,
+};
+
+/* Setup a VMA at program startup for the vsyscall page */
+int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
+{
+       struct vm_area_struct *vma;
+       struct mm_struct *mm = current->mm;
+       int ret;
+
+       vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+       if (!vma)
+               return -ENOMEM;
+
+       memset(vma, 0, sizeof(struct vm_area_struct));
+       /* Could randomize here */
+       vma->vm_start = VSYSCALL_BASE;
+       vma->vm_end = VSYSCALL_BASE + PAGE_SIZE;
+       /* MAYWRITE to allow gdb to COW and set breakpoints */
+       vma->vm_flags = VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC|VM_MAYWRITE;
+       vma->vm_flags |= mm->def_flags;
+       vma->vm_page_prot = protection_map[vma->vm_flags & 7];
+       vma->vm_ops = &syscall_vm_ops;
+       vma->vm_mm = mm;
+
+       down_write(&mm->mmap_sem);
+       if ((ret = insert_vm_struct(mm, vma))) {
+               up_write(&mm->mmap_sem);
+               kmem_cache_free(vm_area_cachep, vma);
+               return ret;
+       }
+       mm->total_vm++;
+       up_write(&mm->mmap_sem);
+       return 0;
+}
+
+struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+{
+       return NULL;
+}
+
+int in_gate_area(struct task_struct *task, unsigned long addr)
+{
+       return 0;
+}
+
+int in_gate_area_no_task(unsigned long addr)
+{
+       return 0;
+}
Index: vanilla-2.6.16-rc3/include/asm-i386/a.out.h
===================================================================
--- vanilla-2.6.16-rc3.orig/include/asm-i386/a.out.h    2006-01-03 04:21:10.000000000 +0100
+++ vanilla-2.6.16-rc3/include/asm-i386/a.out.h 2006-02-13 09:57:36.000000000 +0100
@@ -19,7 +19,7 @@ struct exec
 
 #ifdef __KERNEL__
 
-#define STACK_TOP      TASK_SIZE
+#define STACK_TOP      (TASK_SIZE - 3*PAGE_SIZE)
 
 #endif
 
Index: vanilla-2.6.16-rc3/include/asm-i386/elf.h
===================================================================
--- vanilla-2.6.16-rc3.orig/include/asm-i386/elf.h      2006-01-03 04:21:10.000000000 +0100
+++ vanilla-2.6.16-rc3/include/asm-i386/elf.h   2006-02-13 09:57:36.000000000 +0100
@@ -129,11 +129,16 @@ extern int dump_task_extended_fpu (struc
 #define ELF_CORE_COPY_FPREGS(tsk, elf_fpregs) dump_task_fpu(tsk, elf_fpregs)
 #define ELF_CORE_COPY_XFPREGS(tsk, elf_xfpregs) dump_task_extended_fpu(tsk, elf_xfpregs)
 
-#define VSYSCALL_BASE  (__fix_to_virt(FIX_VSYSCALL))
+#define VSYSCALL_BASE  (PAGE_OFFSET - 2*PAGE_SIZE)
 #define VSYSCALL_EHDR  ((const struct elfhdr *) VSYSCALL_BASE)
 #define VSYSCALL_ENTRY ((unsigned long) &__kernel_vsyscall)
 extern void __kernel_vsyscall;
 
+#define ARCH_HAS_SETUP_ADDITIONAL_PAGES
+struct linux_binprm;
+extern int arch_setup_additional_pages(struct linux_binprm *bprm,
+                                       int executable_stack);
+
 #define ARCH_DLINFO                                            \
 do {                                                           \
                NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY);        \
Index: vanilla-2.6.16-rc3/include/asm-i386/fixmap.h
===================================================================
--- vanilla-2.6.16-rc3.orig/include/asm-i386/fixmap.h   2006-01-03 04:21:10.000000000 +0100
+++ vanilla-2.6.16-rc3/include/asm-i386/fixmap.h        2006-02-14 14:40:15.000000000 +0100
@@ -52,7 +52,6 @@
  */
 enum fixed_addresses {
        FIX_HOLE,
-       FIX_VSYSCALL,
 #ifdef CONFIG_X86_LOCAL_APIC
        FIX_APIC_BASE,  /* local (CPU) APIC) -- required for SMP or not */
 #endif
@@ -116,14 +115,6 @@ extern void __set_fixmap (enum fixed_add
 #define __fix_to_virt(x)       (FIXADDR_TOP - ((x) << PAGE_SHIFT))
 #define __virt_to_fix(x)       ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
 
-/*
- * This is the range that is readable by user mode, and things
- * acting like user mode such as get_user_pages.
- */
-#define FIXADDR_USER_START     (__fix_to_virt(FIX_VSYSCALL))
-#define FIXADDR_USER_END       (FIXADDR_USER_START + PAGE_SIZE)
-
-
 extern void __this_fixmap_does_not_exist(void);
 
 /*
Index: vanilla-2.6.16-rc3/include/asm-i386/page.h
===================================================================
--- vanilla-2.6.16-rc3.orig/include/asm-i386/page.h     2006-02-13 09:42:02.000000000 +0100
+++ vanilla-2.6.16-rc3/include/asm-i386/page.h  2006-02-14 14:40:15.000000000 +0100
@@ -139,6 +139,8 @@ extern int page_is_ram(unsigned long pag
        ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
                 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
 
+#define __HAVE_ARCH_GATE_AREA 1
+
 #endif /* __KERNEL__ */
 
 #include <asm-generic/page.h>
Index: vanilla-2.6.16-rc3/include/asm-i386/processor.h
===================================================================
--- vanilla-2.6.16-rc3.orig/include/asm-i386/processor.h        2006-02-13 09:42:02.000000000 +0100
+++ vanilla-2.6.16-rc3/include/asm-i386/processor.h     2006-02-14 14:43:25.000000000 +0100
@@ -318,7 +318,7 @@ extern int bootloader_type;
 /*
  * User space process size: 3GB (default).
  */
-#define TASK_SIZE      (PAGE_OFFSET)
+#define TASK_SIZE      (PAGE_OFFSET - 3*PAGE_SIZE)
 
 /* This decides where the kernel will search for a free chunk of vm
  * space during mmap's.
Index: vanilla-2.6.16-rc3/arch/i386/kernel/setup.c
===================================================================
--- vanilla-2.6.16-rc3.orig/arch/i386/kernel/setup.c    2006-02-13 09:39:33.000000000 +0100
+++ vanilla-2.6.16-rc3/arch/i386/kernel/setup.c 2006-02-13 09:57:36.000000000 +0100
@@ -922,6 +922,12 @@ static void __init parse_cmdline_early (
                else if (!memcmp(from, "vmalloc=", 8))
                        __VMALLOC_RESERVE = memparse(from+8, &from);
 
+               /*
+                * fixmap=addr
+                */
+               else if (!memcmp(from, "fixmap=", 7))
+                       set_fixaddr_top(simple_strtoul(from+7, NULL, 16));
+
        next_char:
                c = *(from++);
                if (!c)
Index: vanilla-2.6.16-rc3/arch/i386/mm/init.c
===================================================================
--- vanilla-2.6.16-rc3.orig/arch/i386/mm/init.c 2006-02-13 09:39:33.000000000 +0100
+++ vanilla-2.6.16-rc3/arch/i386/mm/init.c      2006-02-13 14:33:40.000000000 +0100
@@ -628,6 +628,42 @@ void __init mem_init(void)
                (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
               );
 
+#if 1 /* double-sanity-check paranoia */
+       printk("virtual kernel memory layout:\n"
+              "    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+              "    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+              "    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"
+              "    lowmem  : 0x%08lx - 0x%08lx   (%4ld MB)\n"
+              "      .init : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+              "      .data : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+              "      .text : 0x%08lx - 0x%08lx   (%4ld kB)\n",
+              FIXADDR_START, FIXADDR_TOP,
+              (FIXADDR_TOP - FIXADDR_START) >> 10,
+
+              PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
+              (LAST_PKMAP*PAGE_SIZE) >> 10,
+
+              VMALLOC_START, VMALLOC_END,
+              (VMALLOC_END - VMALLOC_START) >> 20,
+
+              (unsigned long)__va(0), (unsigned long)high_memory,
+              ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
+
+              (unsigned long)&__init_begin, (unsigned long)&__init_end,
+              ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10,
+
+              (unsigned long)&_etext, (unsigned long)&_edata,
+              ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
+
+              (unsigned long)&_text, (unsigned long)&_etext,
+              ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
+
+       BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
+       BUG_ON(VMALLOC_END                     > PKMAP_BASE);
+       BUG_ON(VMALLOC_START                   > VMALLOC_END);
+       BUG_ON((unsigned long)high_memory      > VMALLOC_START);
+#endif /* double-sanity-check paranoia */
+
 #ifdef CONFIG_X86_PAE
        if (!cpu_has_pae)
                panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
Index: vanilla-2.6.16-rc3/arch/i386/mm/pgtable.c
===================================================================
--- vanilla-2.6.16-rc3.orig/arch/i386/mm/pgtable.c      2006-01-03 04:21:10.000000000 +0100
+++ vanilla-2.6.16-rc3/arch/i386/mm/pgtable.c   2006-02-13 09:57:36.000000000 +0100
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/spinlock.h>
+#include <linux/module.h>
 
 #include <asm/system.h>
 #include <asm/pgtable.h>
@@ -138,6 +139,10 @@ void set_pmd_pfn(unsigned long vaddr, un
        __flush_tlb_one(vaddr);
 }
 
+static int fixmaps = 0;
+unsigned long __FIXADDR_TOP = 0xfffff000;
+EXPORT_SYMBOL(__FIXADDR_TOP);
+
 void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
 {
        unsigned long address = __fix_to_virt(idx);
@@ -147,6 +152,14 @@ void __set_fixmap (enum fixed_addresses 
                return;
        }
        set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
+       fixmaps++;
+}
+
+void set_fixaddr_top(unsigned long top)
+{
+       BUG_ON(fixmaps > 0);
+       printk("%s: addr=0x%lx\n", __FUNCTION__, top);
+       __FIXADDR_TOP = top - PAGE_SIZE;
 }
 
 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
Index: vanilla-2.6.16-rc3/include/asm-i386/fixmap.h
===================================================================
--- vanilla-2.6.16-rc3.orig/include/asm-i386/fixmap.h   2006-02-13 09:57:36.000000000 +0100
+++ vanilla-2.6.16-rc3/include/asm-i386/fixmap.h        2006-02-13 09:57:36.000000000 +0100
@@ -20,7 +20,7 @@
  * Leave one empty page between vmalloc'ed areas and
  * the start of the fixmap.
  */
-#define __FIXADDR_TOP  0xfffff000
+extern unsigned long __FIXADDR_TOP;
 
 #ifndef __ASSEMBLY__
 #include <linux/kernel.h>
@@ -93,6 +93,7 @@ enum fixed_addresses {
 
 extern void __set_fixmap (enum fixed_addresses idx,
                                        unsigned long phys, pgprot_t flags);
+extern void set_fixaddr_top(unsigned long top);
 
 #define set_fixmap(idx, phys) \
                __set_fixmap(idx, phys, PAGE_KERNEL)
Index: vanilla-2.6.16-rc3/include/asm-i386/page.h
===================================================================
--- vanilla-2.6.16-rc3.orig/include/asm-i386/page.h     2006-02-13 09:57:36.000000000 +0100
+++ vanilla-2.6.16-rc3/include/asm-i386/page.h  2006-02-13 14:21:36.000000000 +0100
@@ -121,7 +121,7 @@ extern int page_is_ram(unsigned long pag
 
 #define PAGE_OFFSET            ((unsigned long)__PAGE_OFFSET)
 #define VMALLOC_RESERVE                ((unsigned long)__VMALLOC_RESERVE)
-#define MAXMEM                 (-__PAGE_OFFSET-__VMALLOC_RESERVE)
+#define MAXMEM                 (__FIXADDR_TOP-__PAGE_OFFSET-__VMALLOC_RESERVE)
 #define __pa(x)                        ((unsigned long)(x)-PAGE_OFFSET)
 #define __va(x)                        ((void *)((unsigned long)(x)+PAGE_OFFSET))
 #define pfn_to_kaddr(pfn)      __va((pfn) << PAGE_SHIFT)
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.