[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH v2 4/9] xen/pvh: Bootstrap PVH guest



On 26/01/17 20:41, Boris Ostrovsky wrote:
> Start PVH guest at XEN_ELFNOTE_PHYS32_ENTRY address. Setup hypercall
> page, initialize boot_params, enable early page tables.
> 
> Since this stub is executed before kernel entry point we cannot use
> variables in .bss which is cleared by kernel. We explicitly place
> variables that are initialized here into .data.
> 
> Signed-off-by: Boris Ostrovsky <boris.ostrovsky@xxxxxxxxxx>
> ---
> Changes in v2:
> * Assembly cleanup
> * Check for e820 size in init_pvh_bootparams()
> * Check XEN_HVM_START_MAGIC_VALUE in start_info
> 
> 
>  arch/x86/xen/Kconfig     |   2 +-
>  arch/x86/xen/Makefile    |   1 +
>  arch/x86/xen/enlighten.c |  98 ++++++++++++++++++++++++++++++++-
>  arch/x86/xen/xen-pvh.S   | 137 
> +++++++++++++++++++++++++++++++++++++++++++++++
>  include/xen/xen.h        |   5 ++
>  5 files changed, 241 insertions(+), 2 deletions(-)
>  create mode 100644 arch/x86/xen/xen-pvh.S
> 
> diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
> index c7b15f3..76b6dbd 100644
> --- a/arch/x86/xen/Kconfig
> +++ b/arch/x86/xen/Kconfig
> @@ -53,5 +53,5 @@ config XEN_DEBUG_FS
>  
>  config XEN_PVH
>       bool "Support for running as a PVH guest"
> -     depends on X86_64 && XEN && XEN_PVHVM
> +     depends on XEN && XEN_PVHVM && ACPI
>       def_bool n
> diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
> index e47e527..cb0164a 100644
> --- a/arch/x86/xen/Makefile
> +++ b/arch/x86/xen/Makefile
> @@ -23,3 +23,4 @@ obj-$(CONFIG_XEN_DEBUG_FS)  += debugfs.o
>  obj-$(CONFIG_XEN_DOM0)               += vga.o
>  obj-$(CONFIG_SWIOTLB_XEN)    += pci-swiotlb-xen.o
>  obj-$(CONFIG_XEN_EFI)                += efi.o
> +obj-$(CONFIG_XEN_PVH)                += xen-pvh.o
> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
> index 828f1b2..c82fe14 100644
> --- a/arch/x86/xen/enlighten.c
> +++ b/arch/x86/xen/enlighten.c
> @@ -45,6 +45,7 @@
>  #include <xen/interface/memory.h>
>  #include <xen/interface/nmi.h>
>  #include <xen/interface/xen-mca.h>
> +#include <xen/interface/hvm/start_info.h>
>  #include <xen/features.h>
>  #include <xen/page.h>
>  #include <xen/hvm.h>
> @@ -121,7 +122,8 @@
>  DEFINE_PER_CPU(uint32_t, xen_vcpu_id);
>  EXPORT_PER_CPU_SYMBOL(xen_vcpu_id);
>  
> -enum xen_domain_type xen_domain_type = XEN_NATIVE;
> +enum xen_domain_type xen_domain_type
> +     __attribute__((section(".data"))) = XEN_NATIVE;
>  EXPORT_SYMBOL_GPL(xen_domain_type);
>  
>  unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
> @@ -176,6 +178,17 @@ struct tls_descs {
>   */
>  static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
>  
> +#ifdef CONFIG_XEN_PVH
> +/*
> + * PVH variables. These need to live in data segment since they are
> + * initialized before startup_{32|64}, which clear .bss, are invoked.
> + */
> +bool xen_pvh __attribute__((section(".data"))) = 0;
> +struct hvm_start_info pvh_start_info __attribute__((section(".data")));
> +unsigned int pvh_start_info_sz = sizeof(pvh_start_info);

While I believe this can live in .bss, as it isn't used after .bss is
cleared, there should either be a comment explaining why this is safe
or you should attribute it as .data, too.

> +struct boot_params pvh_bootparams __attribute__((section(".data")));
> +#endif
> +
>  static void clamp_max_cpus(void)
>  {
>  #ifdef CONFIG_SMP
> @@ -1656,6 +1669,89 @@ asmlinkage __visible void __init xen_start_kernel(void)
>  #endif
>  }
>  
> +#ifdef CONFIG_XEN_PVH
> +static void __init init_pvh_bootparams(void)
> +{
> +     struct xen_memory_map memmap;
> +     unsigned int i;
> +     int rc;
> +
> +     memset(&pvh_bootparams, 0, sizeof(pvh_bootparams));
> +
> +     memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_map);
> +     set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_map);
> +     rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
> +     if (rc) {
> +             xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc);
> +             BUG();
> +     }
> +
> +     if (memmap.nr_entries < E820MAX) {

Shouldn't this be E820MAX - 1?
What happens if memmap.nr_entries is already
ARRAY_SIZE(pvh_bootparams.e820_map) ?

> +             pvh_bootparams.e820_map[memmap.nr_entries].addr =
> +                     ISA_START_ADDRESS;
> +             pvh_bootparams.e820_map[memmap.nr_entries].size =
> +                     ISA_END_ADDRESS - ISA_START_ADDRESS;
> +             pvh_bootparams.e820_map[memmap.nr_entries++].type =
> +                     E820_RESERVED;

I'd rather split out the '++' to a separate statement.

> +     } else
> +             xen_raw_printk("Warning: Can fit ISA range into e820\n");
> +
> +     sanitize_e820_map(pvh_bootparams.e820_map,
> +                       ARRAY_SIZE(pvh_bootparams.e820_map),
> +                       &memmap.nr_entries);
> +
> +     pvh_bootparams.e820_entries = memmap.nr_entries;
> +     for (i = 0; i < pvh_bootparams.e820_entries; i++)
> +             e820_add_region(pvh_bootparams.e820_map[i].addr,
> +                             pvh_bootparams.e820_map[i].size,
> +                             pvh_bootparams.e820_map[i].type);
> +
> +     pvh_bootparams.hdr.cmd_line_ptr =
> +             pvh_start_info.cmdline_paddr;
> +
> +     /* The first module is always ramdisk. */
> +     if (pvh_start_info.nr_modules) {
> +             struct hvm_modlist_entry *modaddr =
> +                     __va(pvh_start_info.modlist_paddr);
> +             pvh_bootparams.hdr.ramdisk_image = modaddr->paddr;
> +             pvh_bootparams.hdr.ramdisk_size = modaddr->size;
> +     }
> +
> +     /*
> +      * See Documentation/x86/boot.txt.
> +      *
> +      * Version 2.12 supports Xen entry point but we will use default x86/PC
> +      * environment (i.e. hardware_subarch 0).
> +      */
> +     pvh_bootparams.hdr.version = 0x212;
> +     pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */
> +}
> +
> +/*
> + * This routine (and those that it might call) should not use
> + * anything that lives in .bss since that segment will be cleared later.
> + */
> +void __init xen_prepare_pvh(void)
> +{
> +     u32 msr;
> +     u64 pfn;
> +
> +     if (pvh_start_info.magic != XEN_HVM_START_MAGIC_VALUE) {
> +             xen_raw_printk("Error: Unexpected magic value (0x%08x)\n",
> +                             pvh_start_info.magic);
> +             BUG();
> +     }
> +
> +     xen_pvh = 1;
> +
> +     msr = cpuid_ebx(xen_cpuid_base() + 2);
> +     pfn = __pa(hypercall_page);
> +     wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
> +
> +     init_pvh_bootparams();
> +}
> +#endif
> +
>  void __ref xen_hvm_init_shared_info(void)
>  {
>       int cpu;
> diff --git a/arch/x86/xen/xen-pvh.S b/arch/x86/xen/xen-pvh.S
> new file mode 100644
> index 0000000..410036a
> --- /dev/null
> +++ b/arch/x86/xen/xen-pvh.S
> @@ -0,0 +1,137 @@
> +/*
> + * Copyright C 2016, Oracle and/or its affiliates. All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License along
> + * with this program.  If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +     .code32
> +     .text
> +#define _pa(x)          ((x) - __START_KERNEL_map)
> +
> +#include <linux/elfnote.h>
> +#include <linux/init.h>
> +#include <linux/linkage.h>
> +#include <asm/segment.h>
> +#include <asm/asm.h>
> +#include <asm/boot.h>
> +#include <asm/processor-flags.h>
> +#include <asm/msr.h>
> +#include <xen/interface/elfnote.h>
> +
> +     __HEAD
> +
> +/* Entry point for PVH guests. */

Could you add some comments about register contents at entry?

> +ENTRY(pvh_start_xen)
> +     cld
> +
> +     lgdt (_pa(gdt))
> +
> +     mov $(__BOOT_DS),%eax
> +     mov %eax,%ds
> +     mov %eax,%es
> +     mov %eax,%ss
> +
> +     /* Stash hvm_start_info. */
> +     mov $_pa(pvh_start_info), %edi
> +     mov %ebx, %esi
> +     mov _pa(pvh_start_info_sz), %ecx
> +     shr $2,%ecx
> +     rep
> +     movsl
> +
> +     mov $_pa(early_stack_end), %esp
> +
> +     /* Enable PAE mode. */
> +     mov %cr4, %eax
> +     orl $X86_CR4_PAE, %eax
> +     mov %eax, %cr4
> +
> +#ifdef CONFIG_X86_64
> +     /* Enable Long mode. */
> +     mov $MSR_EFER, %ecx
> +     rdmsr
> +     btsl $_EFER_LME, %eax
> +     wrmsr
> +
> +     /* Enable pre-constructed page tables. */
> +     mov $_pa(init_level4_pgt), %eax
> +     mov %eax, %cr3
> +     mov $(X86_CR0_PG | X86_CR0_PE), %eax
> +     mov %eax, %cr0
> +
> +     /* Jump to 64-bit mode. */
> +        ljmp $__KERNEL_CS, $_pa(1f)

Indentation

> +
> +     /* 64-bit entry point. */
> +     .code64
> +1:
> +     call xen_prepare_pvh
> +
> +     /* startup_64 expects boot_params in %rsi. */
> +     mov $_pa(pvh_bootparams), %rsi
> +     mov $_pa(startup_64), %rax
> +     jmp *%rax
> +
> +#else /* CONFIG_X86_64 */
> +
> +     call mk_early_pgtbl_32
> +
> +     mov $_pa(initial_page_table), %eax
> +     mov %eax, %cr3
> +
> +     mov %cr0, %eax
> +     or $(X86_CR0_PG | X86_CR0_PE), %eax
> +     mov %eax, %cr0
> +
> +     ljmp $__BOOT_CS, $1f
> +1:
> +     call xen_prepare_pvh
> +     mov $_pa(pvh_bootparams), %esi
> +
> +     /* startup_32 doesn't expect paging and PAE to be on. */
> +     ljmp $__BOOT_CS, $_pa(2f)
> +2:
> +     mov %cr0, %eax
> +     and $~X86_CR0_PG, %eax
> +     mov %eax, %cr0
> +     mov %cr4, %eax
> +     and $~X86_CR4_PAE, %eax
> +     mov %eax, %cr4
> +
> +     ljmp    $0x10, $_pa(startup_32)

Any reason to use 0x10 instead of __BOOT_CS?

> +#endif
> +ENDPROC(pvh_start_xen)
> +
> +     .data

Alignment?

> +gdt:
> +     .word   gdt_end - gdt
> +     .long   _pa(gdt)

This is a rather strange construct: the NULL descriptor of the
GDT being used as space for lgdt operand.

> +     .word   0
> +     .quad   0x0000000000000000 /* NULL descriptor */

And this comment is wrong: the NULL descriptor is at "gdt:".

> +#ifdef CONFIG_X86_64
> +     .quad   0x00af9a000000ffff /* __KERNEL_CS */

Mind adding comments about the semantics of those constants?
Or use GDT_ENTRY() macro?

> +#else
> +     .quad   0x00cf9a000000ffff /* __KERNEL_CS */
> +#endif
> +     .quad   0x00cf92000000ffff /* __KERNEL_DS */
> +gdt_end:
> +
> +     .bss
> +     .balign 4
> +early_stack:
> +     .fill 16, 1, 0

Is the stack size large enough? With a hypercall being executed in
xen_prepare_pvh() I doubt this will be okay.

> +early_stack_end:
> +
> +     ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_ENTRY,
> +                  _ASM_PTR (pvh_start_xen - __START_KERNEL_map))
> diff --git a/include/xen/xen.h b/include/xen/xen.h
> index d0f9684..6e8b7fc 100644
> --- a/include/xen/xen.h
> +++ b/include/xen/xen.h
> @@ -29,6 +29,11 @@ enum xen_domain_type {
>  #define xen_initial_domain() (0)
>  #endif       /* CONFIG_XEN_DOM0 */
>  
> +#ifdef CONFIG_XEN_PVH
> +extern bool xen_pvh;
> +#define xen_pvh_domain()     (xen_hvm_domain() && xen_pvh)
> +#else
>  #define xen_pvh_domain()     (0)
> +#endif
>  
>  #endif       /* _XEN_XEN_H */
> 


Juergen

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
https://lists.xen.org/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.