[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [PATCH v7] RFC: x86/pvh: Make Xen PVH entrypoint PIC for x86-64
The Xen PVH entrypoint is 32bit non-PIC code running at a default load address of 0x1000000 (16MB) (CONFIG_PHYSICAL_START). Xen loads the kernel at that physical address inside the PVH container. When running a PVH Dom0, the system reserved addresses are mapped 1-1 into the PVH container. There exist system firmwares (Coreboot/EDK2) with reserved memory at 16MB. This creates a conflict where the PVH kernel cannot be loaded at that address. Modify the PVH entrypoint to be position-independent to allow flexibility in load address. Only the 64bit entry path is converted. A 32bit kernel is not PIC, so calling into other parts of the kernel, like xen_prepare_pvh() and mk_pgtable_32(), doesn't work properly when relocated. Initial PVH entry runs at the physical addresses and then transitions to the identity mapped address. While executing xen_prepare_pvh(), calls through pv_ops function pointers transition to the high mapped addresses. Additionally, __va() is called on some hvm_start_info physical addresses, so the directmap address range is used as well. So we need to run page tables with all of those ranges mapped. Modifying init_top_pgt tables ran into issues since startup_64/__startup_64() will modify those page tables again. Create a dedicated set of page tables - pvh_init_top_pgt - for the PVH entry to avoid unwanted interactions. To avoid statically allocating space, the .bss is used. It is unused on entry and only cleared later. This saves 32kb that would otherwise be added to vmlinux. A minimum of 20kb would be needed, which would only map the lower 1GB of RAM. In xen_pvh_init(), __pa() is called to find the physical address of the hypercall page. Set phys_base temporarily before calling into xen_prepare_pvh(), which calls xen_pvh_init(), and clear it afterwards. __startup_64() assumes phys_base is zero and adds load_delta to it. If phys_base is already set, the calculation results in an incorrect cr3. 
TODO: Sync elfnote.h from xen.git commit xxxxxxxxxx when it is committed. Signed-off-by: Jason Andryuk <jason.andryuk@xxxxxxx> --- Put this out as an example for the Xen modifications. UNWIND_HINT_END_OF_STACK after lret is to silence: vmlinux.o: warning: objtool: pvh_start_xen+0x168: unreachable instruction. Instead of setting and clearing phys_base, add a dedicated variable? Clearing phys_base is a little weird, but it leaves the kernel more consistent when running non-entry code. Make __startup_64() exit if phys_base is already set to allow calling multiple times, and use that and init_top_pgt instead of adding additional page tables? That won't work. __startup_64 is 64bit code, and pvh needs to create page tables in 32bit code before it can transition to 64bit long mode. Hence it can't be re-used to relocate page tables. --- arch/x86/platform/pvh/head.S | 168 +++++++++++++++++++++++++++++--- include/xen/interface/elfnote.h | 20 +++- 2 files changed, 175 insertions(+), 13 deletions(-) diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S index f7235ef87bc3..48480fc8d786 100644 --- a/arch/x86/platform/pvh/head.S +++ b/arch/x86/platform/pvh/head.S @@ -50,11 +50,32 @@ #define PVH_CS_SEL (PVH_GDT_ENTRY_CS * 8) #define PVH_DS_SEL (PVH_GDT_ENTRY_DS * 8) +#define rva(x) ((x) - pvh_start_xen) + SYM_CODE_START_LOCAL(pvh_start_xen) UNWIND_HINT_END_OF_STACK cld - lgdt (_pa(gdt)) + /* + * See the comment for startup_32 for more details. We need to + * execute a call to get the execution address to be position + * independent, but we don't have a stack. Save and restore the + * magic field of start_info in ebx, and use that as the stack. 
+ */ + mov (%ebx), %eax + leal 4(%ebx), %esp + ANNOTATE_INTRA_FUNCTION_CALL + call 1f +1: popl %ebp + mov %eax, (%ebx) + subl $ rva(1b), %ebp + movl $0, %esp + + leal rva(gdt)(%ebp), %eax + movl %eax, %ecx + leal rva(gdt_start)(%ebp), %ecx + movl %ecx, 2(%eax) + lgdt (%eax) mov $PVH_DS_SEL,%eax mov %eax,%ds @@ -62,14 +83,14 @@ SYM_CODE_START_LOCAL(pvh_start_xen) mov %eax,%ss /* Stash hvm_start_info. */ - mov $_pa(pvh_start_info), %edi + leal rva(pvh_start_info)(%ebp), %edi mov %ebx, %esi - mov _pa(pvh_start_info_sz), %ecx + movl rva(pvh_start_info_sz)(%ebp), %ecx shr $2,%ecx rep movsl - mov $_pa(early_stack_end), %esp + leal rva(early_stack_end)(%ebp), %esp /* Enable PAE mode. */ mov %cr4, %eax @@ -83,29 +104,145 @@ SYM_CODE_START_LOCAL(pvh_start_xen) btsl $_EFER_LME, %eax wrmsr - /* Enable pre-constructed page tables. */ - mov $_pa(init_top_pgt), %eax +/* + * Xen PVH needs a set of identity mapped and kernel high mapping + * page tables. pvh_start_xen starts running on the identity mapped + * page tables, but xen_prepare_pvh calls into the high mapping. Inside, + * __va is used, so the L4_PAGE_OFFSET needs to be populated. + * + * Instead of allocating memory memory in .init.data, we just use + * space in the BSS. This will only be used until startup_64 switches + * to the main page tables. Later, the bss will be cleared. 
+ */ +#define pvh_init_top_pgt rva(__bss_start) +#define pvh_level3_ident_pgt (pvh_init_top_pgt + PAGE_SIZE) +#define pvh_level2_ident_pgt (pvh_level3_ident_pgt + PAGE_SIZE) +#define pvh_level3_kernel_pgt (pvh_level2_ident_pgt + 4 * PAGE_SIZE) +#define pvh_level2_kernel_pgt (pvh_level3_kernel_pgt + PAGE_SIZE) + +#define l4_index(x) (((x) >> 39) & 511) +#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) + +L4_PAGE_OFFSET = l4_index(__PAGE_OFFSET_BASE_L4) +L4_START_KERNEL = l4_index(__START_KERNEL_map) +L3_START_KERNEL = pud_index(__START_KERNEL_map) + + /* Clear pvh_init_top_pgt */ + leal pvh_init_top_pgt(%ebp), %edi + movl $(PAGE_SIZE / 4), %ecx + xorl %eax, %eax + rep stosl + + /* pvh_init_top_pgt[0] = pvh_level3_ident_pgt */ + leal pvh_init_top_pgt(%ebp), %edi + leal pvh_level3_ident_pgt(%ebp), %esi + movl %esi, 0x00(%edi) + addl $_KERNPG_TABLE_NOENC, 0x00(%edi) + + /* pvh_init_top_pgt[L4_PAGE_OFFSET] = pvh_level3_ident_pgt */ + movl %esi, (L4_PAGE_OFFSET * 8)(%edi) + addl $_KERNPG_TABLE_NOENC, (L4_PAGE_OFFSET * 8)(%edi) + + /* pvh_init_top_pgt[L4_START_KERNEL] = pvh_level3_kernel_pgt */ + leal pvh_level3_kernel_pgt(%ebp), %esi + movl %esi, (L4_START_KERNEL * 8)(%edi) + addl $_PAGE_TABLE_NOENC, (L4_START_KERNEL * 8)(%edi) + + /* Clear pvh_level3_ident_pgt */ + leal pvh_level3_ident_pgt(%ebp), %edi + movl $(PAGE_SIZE / 4), %ecx + xorl %eax, %eax + rep stosl + + /* Set pvh_level3_ident_pgt[0] = pvh_level2_ident_pgt */ + leal pvh_level3_ident_pgt(%ebp), %edi + leal pvh_level2_ident_pgt(%ebp), %esi + addl $_KERNPG_TABLE_NOENC, %esi + movl %esi, 0x00(%edi) + addl $PAGE_SIZE, %esi + /* ... pvh_level3_ident_pgt[1] = pvh_level2_ident_pgt+0x1000 */ + movl %esi, 0x08(%edi) + addl $PAGE_SIZE, %esi + /* ... pvh_level3_ident_pgt[2] = pvh_level2_ident_pgt+0x2000 */ + movl %esi, 0x10(%edi) + addl $PAGE_SIZE, %esi + /* ... pvh_level3_ident_pgt[3] = pvh_level2_ident_pgt+0x3000 */ + movl %esi, 0x18(%edi) + + /* Fill pvh_level2_ident_pgt with large ident pages. 
*/ + leal pvh_level2_ident_pgt(%ebp), %edi + movl $__PAGE_KERNEL_IDENT_LARGE_EXEC, %esi + movl $(PTRS_PER_PMD*4), %ecx +1: movl %esi, 0(%edi) + addl $(1 << PMD_SHIFT), %esi + addl $8, %edi + decl %ecx + jnz 1b + + /* Clear pvh_level3_kernel_pgt */ + leal pvh_level3_kernel_pgt(%ebp), %edi + movl $(PAGE_SIZE / 4), %ecx + xorl %eax, %eax + rep stosl + + /* pvh_level3_kernel_pgt[L3_START_KERNEL] = pvh_level2_kernel_pgt */ + leal pvh_level3_kernel_pgt(%ebp), %edi + leal pvh_level2_kernel_pgt(%ebp), %esi + movl %esi, (L3_START_KERNEL * 8)(%edi) + addl $_KERNPG_TABLE_NOENC, (L3_START_KERNEL * 8)(%edi) + + mov %ebp, %ebx + subl $LOAD_PHYSICAL_ADDR, %ebx /* offset */ + + /* Fill pvh_level2_kernel_pgt with large pages. */ + leal pvh_level2_kernel_pgt(%ebp), %edi + movl $__PAGE_KERNEL_LARGE_EXEC, %esi + addl %ebx, %esi + movl $(KERNEL_IMAGE_SIZE / PMD_SIZE), %ecx +1: movl %esi, 0(%edi) + addl $(1 << PMD_SHIFT), %esi + addl $8, %edi + decl %ecx + jnz 1b + + /* Switch to page tables. */ + leal pvh_init_top_pgt(%ebp), %eax mov %eax, %cr3 mov $(X86_CR0_PG | X86_CR0_PE), %eax mov %eax, %cr0 /* Jump to 64-bit mode. */ - ljmp $PVH_CS_SEL, $_pa(1f) + pushl $PVH_CS_SEL + leal rva(1f)(%ebp), %eax + pushl %eax + lretl /* 64-bit entry point. */ .code64 1: + UNWIND_HINT_END_OF_STACK + /* Set base address in stack canary descriptor. */ mov $MSR_GS_BASE,%ecx - mov $_pa(canary), %eax + leal rva(canary)(%ebp), %eax xor %edx, %edx wrmsr + /* Calculate load offset from LOAD_PHYSICAL_ADDR and store in + * phys_base. __pa() needs phys_base set to calculate the the + * hypercall page in xen_pvh_init(). */ + movq %rbp, %rbx + subq $LOAD_PHYSICAL_ADDR, %rbx + movq %rbx, phys_base(%rip) call xen_prepare_pvh + /* Clear phys_base. startup_64/__startup_64 will *add* to its value, + so start from 0. */ + xor %rbx, %rbx + movq %rbx, phys_base(%rip) /* startup_64 expects boot_params in %rsi. 
*/ - mov $_pa(pvh_bootparams), %rsi - mov $_pa(startup_64), %rax + lea rva(pvh_bootparams)(%ebp), %rsi + lea rva(startup_64)(%ebp), %rax ANNOTATE_RETPOLINE_SAFE jmp *%rax @@ -137,13 +274,14 @@ SYM_CODE_START_LOCAL(pvh_start_xen) ljmp $PVH_CS_SEL, $_pa(startup_32) #endif + SYM_CODE_END(pvh_start_xen) .section ".init.data","aw" .balign 8 SYM_DATA_START_LOCAL(gdt) - .word gdt_end - gdt_start - .long _pa(gdt_start) + .word gdt_end - gdt_start - 1 + .long _pa(gdt_start) /* x86-64 will overwrite if relocated. */ .word 0 SYM_DATA_END(gdt) SYM_DATA_START_LOCAL(gdt_start) @@ -163,5 +301,11 @@ SYM_DATA_START_LOCAL(early_stack) .fill BOOT_STACK_SIZE, 1, 0 SYM_DATA_END_LABEL(early_stack, SYM_L_LOCAL, early_stack_end) +#ifdef CONFIG_X86_64 + ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_RELOC, + .long CONFIG_PHYSICAL_ALIGN; + .long LOAD_PHYSICAL_ADDR; + .long 0xffffffff) +#endif ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_ENTRY, _ASM_PTR (pvh_start_xen - __START_KERNEL_map)) diff --git a/include/xen/interface/elfnote.h b/include/xen/interface/elfnote.h index 38deb1214613..4deb63ca7633 100644 --- a/include/xen/interface/elfnote.h +++ b/include/xen/interface/elfnote.h @@ -185,9 +185,27 @@ */ #define XEN_ELFNOTE_PHYS32_ENTRY 18 +/* + * Physical loading constraints for PVH kernels + * + * The presence of this note indicates the kernel supports relocating itself. + * + * The note may include up to three 32bit values to place constraints on the + * guest physical loading addresses and alignment for a PVH kernel. Values + * are read in the following order: + * - a required start alignment (default 0x200000) + * - a minimum address for the start of the image (default 0) + * - a maximum address for the last byte of the image (default 0xffffffff) + * + * When this note specifies an alignment value, it is used. Otherwise the + * maximum p_align value from loadable ELF Program Headers is used, if it is + * greater than or equal to 4k (0x1000). Otherwise, the default is used. 
+ */ +#define XEN_ELFNOTE_PHYS32_RELOC 19 + /* * The number of the highest elfnote defined. */ -#define XEN_ELFNOTE_MAX XEN_ELFNOTE_PHYS32_ENTRY +#define XEN_ELFNOTE_MAX XEN_ELFNOTE_PHYS32_RELOC #endif /* __XEN_PUBLIC_ELFNOTE_H__ */ -- 2.44.0
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |