[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH, experimental] i386 Allow the fixmap to be relocated at boot time



This most curious patch allows the fixmap on i386 to be unfixed. The result is that we can create a dynamically sizable hole at the top of kernel linear address space. I know at least some virtualization developers are interested in this as a way to achieve run-time sizing of a hole in which a hypervisor can live, or at least to test out the performance characteristics of differently sized holes.

I have not run any performance numbers yet to see how much the cost of making this dynamic affects native performance, but again I would stress this is a highly experimental patch and I am looking for feedback and any performance data from other systems that people are kind enough to share. I'm not advocating that this get pushed into the mainline Linux tree at this point by any means!

I believe at least the Xen folks would be interested in playing around with this for experimenting with different MPT and frame table sizes for PAE support in a way that doesn't require recompiling the Linux guest each time - if the performance impact proves to be negligible, this gives a lot of flexibility to any virtual machine which runs a hypervisor-aware kernel.

Although I did as much as possible to make the vsyscall relocation appear clean to userspace, I can't guarantee this patch won't set fire to your chair and electrocute your cat. Please move all pets to a safe location before attempting to use this.

Zachary Amsden <zach@xxxxxxxxxx>
Allow creation of a compile-time hole at the top of linear address space.

Extended to allow a dynamic hole in linear address space, 7/2005.  This
required some serious hacking to get everything perfect, but the end result
appears to function quite nicely.  Everyone can now share the appreciation
of pseudo-undocumented ELF OS fields, which means core dumps, debuggers
and even broken or obsolete linkers may continue to work.

Signed-off-by: Zachary Amsden <zach@xxxxxxxxxx>
Index: linux-2.6.13/arch/i386/Kconfig
===================================================================
--- linux-2.6.13.orig/arch/i386/Kconfig 2005-08-04 14:14:24.000000000 -0700
+++ linux-2.6.13/arch/i386/Kconfig      2005-08-05 15:28:42.000000000 -0700
@@ -127,6 +127,20 @@
 
 endchoice
 
+config RELOCATABLE_FIXMAP
+       bool "Allow the fixmap to be placed dynamically at runtime"
+       depends on EXPERIMENTAL
+       help
+         Crazy hackers only.
+
+config MEMORY_HOLE
+       int "Create hole at top of memory (0-512 MB)"
+       range 0 512
+       default "0"
+       help
+         Useful for creating a hole in the top of memory when running
+         inside of a virtual machine monitor.
+
 config ACPI_SRAT
        bool
        default y
Index: linux-2.6.13/arch/i386/kernel/sysenter.c
===================================================================
--- linux-2.6.13.orig/arch/i386/kernel/sysenter.c       2005-08-02 17:04:12.000000000 -0700
+++ linux-2.6.13/arch/i386/kernel/sysenter.c    2005-08-05 15:47:53.000000000 -0700
@@ -46,22 +46,90 @@
 extern const char vsyscall_int80_start, vsyscall_int80_end;
 extern const char vsyscall_sysenter_start, vsyscall_sysenter_end;
 
+#ifdef CONFIG_RELOCATABLE_FIXMAP
+extern const char SYSENTER_RETURN;
+const char *SYSENTER_RETURN_ADDR;
+
+static void fixup_vsyscall_elf(char *page)
+{
+       Elf32_Ehdr *hdr;
+       Elf32_Shdr *sechdrs;
+       Elf32_Phdr *phdr;
+       char *secstrings;
+       int i, j, n;
+
+       hdr = (Elf32_Ehdr *)page;
+
+       /* Sanity checks against insmoding binaries or wrong arch,
+           weird elf version */
+       if (memcmp(hdr->e_ident, ELFMAG, 4) != 0 ||
+               !elf_check_arch(hdr) ||
+               hdr->e_type != ET_DYN)
+               panic("Bogus ELF in vsyscall DSO\n");
+
+       hdr->e_entry += VSYSCALL_RELOCATION;
+
+       sechdrs = (void *)hdr + hdr->e_shoff;
+       secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+
+       for (i = 1; i < hdr->e_shnum; i++) {
+               if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+                       continue;
+
+               sechdrs[i].sh_addr += VSYSCALL_RELOCATION;
+               if (strcmp(secstrings+sechdrs[i].sh_name, ".dynsym") == 0) {
+                       Elf32_Sym  *sym =  (void *)hdr + sechdrs[i].sh_offset;
+                       n = sechdrs[i].sh_size / sizeof(*sym);
+                       for (j = 1; j < n;  j++) {
+                               int ndx = sym[j].st_shndx;
+                               if (ndx == SHN_UNDEF || ndx == SHN_ABS)
+                                       continue;
+                               sym[j].st_value += VSYSCALL_RELOCATION;
+                       }
+               } else if (strcmp(secstrings+sechdrs[i].sh_name, ".dynamic") == 
0) {
+                       Elf32_Dyn *dyn = (void *)hdr + sechdrs[i].sh_offset;
+                       int tag;
+                       while ((tag = (++dyn)->d_tag) != DT_NULL) {
+                               if (tag == DT_PLTGOT || tag == DT_HASH ||
+                                   tag == DT_STRTAB || tag == DT_SYMTAB ||
+                                   tag == DT_RELA || tag == DT_INIT ||
+                                   tag == DT_FINI || tag == DT_REL ||
+                                   tag == DT_JMPREL || tag == DT_VERSYM ||
+                                   tag == DT_VERDEF || tag == DT_VERNEED)
+                                       dyn->d_un.d_val += VSYSCALL_RELOCATION;
+                       }
+               } else if (strcmp(secstrings+sechdrs[i].sh_name, ".useless") == 
0) {
+                       uint32_t *got = (void *)hdr + sechdrs[i].sh_offset;
+                       *got += VSYSCALL_RELOCATION;
+               }
+       }
+       phdr = (void *)hdr + hdr->e_phoff;
+       for (i = 0; i < hdr->e_phnum; i++) {
+               phdr[i].p_vaddr += VSYSCALL_RELOCATION;
+               phdr[i].p_paddr += VSYSCALL_RELOCATION;
+       }
+       SYSENTER_RETURN_ADDR = (char *)&SYSENTER_RETURN + VSYSCALL_RELOCATION;
+}
+#endif
+
 int __init sysenter_setup(void)
 {
        void *page = (void *)get_zeroed_page(GFP_ATOMIC);
 
-       __set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_READONLY_EXEC);
-
-       if (!boot_cpu_has(X86_FEATURE_SEP)) {
+       if (!boot_cpu_has(X86_FEATURE_SEP))
                memcpy(page,
                       &vsyscall_int80_start,
                       &vsyscall_int80_end - &vsyscall_int80_start);
-               return 0;
-       }
+       else
+               memcpy(page,
+                       &vsyscall_sysenter_start,
+                       &vsyscall_sysenter_end - &vsyscall_sysenter_start);
 
-       memcpy(page,
-              &vsyscall_sysenter_start,
-              &vsyscall_sysenter_end - &vsyscall_sysenter_start);
+#ifdef CONFIG_RELOCATABLE_FIXMAP
+       fixup_vsyscall_elf((char *)page);
+#endif
+
+       __set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_READONLY_EXEC);
 
        return 0;
 }
Index: linux-2.6.13/arch/i386/kernel/asm-offsets.c
===================================================================
--- linux-2.6.13.orig/arch/i386/kernel/asm-offsets.c    2005-08-04 14:28:35.000000000 -0700
+++ linux-2.6.13/arch/i386/kernel/asm-offsets.c 2005-08-05 15:11:45.000000000 -0700
@@ -68,5 +68,9 @@
                 sizeof(struct tss_struct));
 
        DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
+#ifdef CONFIG_RELOCATABLE_FIXMAP
+       DEFINE(VSYSCALL_BASE, 0);
+#else
        DEFINE(VSYSCALL_BASE, __fix_to_virt(FIX_VSYSCALL));
+#endif
 }
Index: linux-2.6.13/arch/i386/kernel/signal.c
===================================================================
--- linux-2.6.13.orig/arch/i386/kernel/signal.c 2005-08-03 23:36:46.000000000 -0700
+++ linux-2.6.13/arch/i386/kernel/signal.c      2005-08-05 15:11:33.000000000 -0700
@@ -345,6 +345,8 @@
    See vsyscall-sigreturn.S.  */
 extern void __user __kernel_sigreturn;
 extern void __user __kernel_rt_sigreturn;
+#define kernel_sigreturn  (VSYSCALL_RELOCATION + (void __user 
*)&__kernel_sigreturn)
+#define kernel_rt_sigreturn  (VSYSCALL_RELOCATION + (void __user 
*)&__kernel_rt_sigreturn)
 
 static int setup_frame(int sig, struct k_sigaction *ka,
                       sigset_t *set, struct pt_regs * regs)
@@ -380,7 +382,7 @@
                        goto give_sigsegv;
        }
 
-       restorer = &__kernel_sigreturn;
+       restorer = kernel_sigreturn;
        if (ka->sa.sa_flags & SA_RESTORER)
                restorer = ka->sa.sa_restorer;
 
@@ -476,7 +478,7 @@
                goto give_sigsegv;
 
        /* Set up to return from userspace.  */
-       restorer = &__kernel_rt_sigreturn;
+       restorer = kernel_rt_sigreturn;
        if (ka->sa.sa_flags & SA_RESTORER)
                restorer = ka->sa.sa_restorer;
        err |= __put_user(restorer, &frame->pretcode);
Index: linux-2.6.13/arch/i386/kernel/entry.S
===================================================================
--- linux-2.6.13.orig/arch/i386/kernel/entry.S  2005-08-04 14:17:15.000000000 -0700
+++ linux-2.6.13/arch/i386/kernel/entry.S       2005-08-05 14:09:15.000000000 -0700
@@ -200,7 +200,11 @@
        pushl %ebp
        pushfl
        pushl $(__USER_CS)
+#ifdef CONFIG_RELOCATABLE_FIXMAP
+       pushl %ss:SYSENTER_RETURN_ADDR
+#else
        pushl $SYSENTER_RETURN
+#endif
 
 /*
  * Load the potential sixth argument from user stack.
Index: linux-2.6.13/arch/i386/mm/init.c
===================================================================
--- linux-2.6.13.orig/arch/i386/mm/init.c       2005-08-04 14:39:17.000000000 -0700
+++ linux-2.6.13/arch/i386/mm/init.c    2005-08-05 15:20:04.000000000 -0700
@@ -42,6 +42,10 @@
 
 unsigned int __VMALLOC_RESERVE = 128 << 20;
 
+#ifdef CONFIG_RELOCATABLE_FIXMAP
+unsigned long __FIXADDR_TOP = 0;
+#endif
+
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 unsigned long highstart_pfn, highend_pfn;
 
@@ -478,6 +482,12 @@
                printk("NX (Execute Disable) protection: active\n");
 #endif
 
+#ifdef CONFIG_RELOCATABLE_FIXMAP
+       if (!__FIXADDR_TOP) 
+               __FIXADDR_TOP =  0xfffff000UL-(CONFIG_MEMORY_HOLE << 20);
+       printk(KERN_INFO "Fixmap top relocated to %lxh\n", __FIXADDR_TOP);
+#endif
+
        pagetable_init();
 
        load_cr3(swapper_pg_dir);
Index: linux-2.6.13/include/asm-i386/fixmap.h
===================================================================
--- linux-2.6.13.orig/include/asm-i386/fixmap.h 2005-08-04 14:14:24.000000000 -0700
+++ linux-2.6.13/include/asm-i386/fixmap.h      2005-08-05 15:36:13.000000000 -0700
@@ -20,7 +20,13 @@
  * Leave one empty page between vmalloc'ed areas and
  * the start of the fixmap.
  */
-#define __FIXADDR_TOP  0xfffff000
+#ifdef CONFIG_RELOCATABLE_FIXMAP
+extern unsigned long __FIXADDR_TOP;
+#define VSYSCALL_RELOCATION __fix_to_virt(FIX_VSYSCALL)
+#else
+#define __FIXADDR_TOP  (0xfffff000-(CONFIG_MEMORY_HOLE << 20))
+#define VSYSCALL_RELOCATION 0
+#endif
 
 #ifndef __ASSEMBLY__
 #include <linux/kernel.h>
Index: linux-2.6.13/include/asm-i386/elf.h
===================================================================
--- linux-2.6.13.orig/include/asm-i386/elf.h    2005-08-02 17:06:23.000000000 -0700
+++ linux-2.6.13/include/asm-i386/elf.h 2005-08-05 15:31:32.000000000 -0700
@@ -129,7 +129,7 @@
 
 #define VSYSCALL_BASE  (__fix_to_virt(FIX_VSYSCALL))
 #define VSYSCALL_EHDR  ((const struct elfhdr *) VSYSCALL_BASE)
-#define VSYSCALL_ENTRY ((unsigned long) &__kernel_vsyscall)
+#define VSYSCALL_ENTRY ((unsigned long) 
(VSYSCALL_RELOCATION+&__kernel_vsyscall))
 extern void __kernel_vsyscall;
 
 #define ARCH_DLINFO                                            \
Index: linux-2.6.13/include/linux/elf.h
===================================================================
--- linux-2.6.13.orig/include/linux/elf.h       2005-08-02 17:06:24.000000000 
-0700
+++ linux-2.6.13/include/linux/elf.h    2005-08-05 12:06:17.000000000 -0700
@@ -138,6 +138,9 @@
 #define DT_DEBUG       21
 #define DT_TEXTREL     22
 #define DT_JMPREL      23
+#define DT_VERSYM      0x6ffffff0
+#define DT_VERDEF      0x6ffffffc
+#define DT_VERNEED     0x6ffffffe
 #define DT_LOPROC      0x70000000
 #define DT_HIPROC      0x7fffffff
 
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.