
Re: [PATCH v3 1/3] xen/riscv: introduce setup_initial_pages


  • To: Oleksii Kurochko <oleksii.kurochko@xxxxxxxxx>
  • From: Jan Beulich <jbeulich@xxxxxxxx>
  • Date: Tue, 28 Mar 2023 17:34:43 +0200
  • Cc: Julien Grall <julien@xxxxxxx>, Andrew Cooper <andrew.cooper3@xxxxxxxxxx>, Stefano Stabellini <sstabellini@xxxxxxxxxx>, Gianluca Guida <gianluca@xxxxxxxxxxxx>, Bob Eshleman <bobbyeshleman@xxxxxxxxx>, Alistair Francis <alistair.francis@xxxxxxx>, Connor Davis <connojdavis@xxxxxxxxx>, xen-devel@xxxxxxxxxxxxxxxxxxxx
  • Delivery-date: Tue, 28 Mar 2023 15:35:00 +0000
  • List-id: Xen developer discussion <xen-devel.lists.xenproject.org>

On 27.03.2023 19:17, Oleksii Kurochko wrote:
> --- a/xen/arch/riscv/include/asm/config.h
> +++ b/xen/arch/riscv/include/asm/config.h
> @@ -39,12 +39,25 @@
>    name:
>  #endif
>  
> -#define XEN_VIRT_START  _AT(UL, 0x80200000)
> +#define ADDRESS_SPACE_END (_AC(-1, UL))
> +#define SZ_1G             0x40000000

No need for this - please use GB(1) (see xen/config.h) in its stead.

> +#ifdef CONFIG_RISCV_64
> +#define XEN_VIRT_START    (ADDRESS_SPACE_END - SZ_1G + 1)

I first thought the " + 1" would render the address misaligned. May I
suggest (ADDRESS_SPACE_END + 1 - SZ_1G) to help avoid this impression?
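
I.e., combining this with the GB(1) remark above (a sketch):

#define XEN_VIRT_START    (ADDRESS_SPACE_END + 1 - GB(1))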

> +#else
> +#error "RV32 isn't supported"asm

Stray "asm" at the end of the line.

> --- /dev/null
> +++ b/xen/arch/riscv/include/asm/page.h
> @@ -0,0 +1,62 @@
> +#ifndef _ASM_RISCV_PAGE_H
> +#define _ASM_RISCV_PAGE_H
> +
> +#include <xen/const.h>
> +#include <xen/types.h>
> +
> +#define VPN_MASK                    ((unsigned long)(PAGETABLE_ENTRIES - 1))
> +
> +#define XEN_PT_LEVEL_ORDER(lvl)     ((lvl) * PAGETABLE_ORDER)
> +#define XEN_PT_LEVEL_SHIFT(lvl)     (XEN_PT_LEVEL_ORDER(lvl) + PAGE_SHIFT)
> +#define XEN_PT_LEVEL_SIZE(lvl)      (_AT(paddr_t, 1) << XEN_PT_LEVEL_SHIFT(lvl))
> +#define XEN_PT_LEVEL_MAP_MASK(lvl)  (~(XEN_PT_LEVEL_SIZE(lvl) - 1))
> +#define XEN_PT_LEVEL_MASK(lvl)      (VPN_MASK << XEN_PT_LEVEL_SHIFT(lvl))
> +
> +#define PTE_VALID                   BIT(0, UL)
> +#define PTE_READABLE                BIT(1, UL)
> +#define PTE_WRITABLE                BIT(2, UL)
> +#define PTE_EXECUTABLE              BIT(3, UL)
> +#define PTE_USER                    BIT(4, UL)
> +#define PTE_GLOBAL                  BIT(5, UL)
> +#define PTE_ACCESSED                BIT(6, UL)
> +#define PTE_DIRTY                   BIT(7, UL)
> +#define PTE_RSW                     (BIT(8, UL) | BIT(9, UL))
> +
> +#define PTE_LEAF_DEFAULT            (PTE_VALID | PTE_READABLE | PTE_WRITABLE)
> +#define PTE_TABLE                   (PTE_VALID)
> +
> +/* Calculate the offsets into the pagetables for a given VA */
> +#define pt_linear_offset(lvl, va)   ((va) >> XEN_PT_LEVEL_SHIFT(lvl))
> +
> +#define pt_index(lvl, va) pt_linear_offset(lvl, (va) & XEN_PT_LEVEL_MASK(lvl))
> +
> +#ifdef CONFIG_RISCV_64
> +typedef uint64_t pte_t;
> +#else
> +typedef uint32_t pte_t;
> +#endif

Are you sure you don't want to encapsulate these in a struct, for type
safety purposes?
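
E.g. (a sketch of the struct-wrapped variant):

typedef struct {
#ifdef CONFIG_RISCV_64
    uint64_t pte;
#else
    uint32_t pte;
#endif
} pte_t;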

> +#define addr_to_pte(x) (((x) >> PTE_PPN_SHIFT) << PAGE_SHIFT)

This looking like the inverse of ...

> +/* Shift the VPN[x] or PPN[x] fields of a virtual or physical address
> + * to become the shifted PPN[x] fields of a page table entry */
> +#define addr_to_ppn(x) (((x) >> PAGE_SHIFT) << PTE_PPN_SHIFT)

... this, does that one want to be called "ppn_to_addr()"? Otherwise I
can't see how name and operation fit together.

> +static inline pte_t paddr_to_pte(const unsigned long paddr,

paddr_t?

> +                                 const unsigned long permissions)
> +{
> +    pte_t tmp = (pte_t)addr_to_ppn(paddr);
> +    return tmp | permissions;
> +}
> +
> +static inline pte_t pte_to_paddr(const unsigned long paddr)

A function of this name wants to take pte_t as parameter and return
paddr_t (note the type safety remark higher up, as it becomes relevant
right here). It also can hardly be implemented ...

> +{
> +    return (pte_t)addr_to_pte(paddr);

... in terms of a function/macro with effectively the opposite name.
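
For comparison, a minimal sketch of what the name suggests (assuming the
struct-wrapped pte_t from above):

static inline paddr_t pte_to_paddr(pte_t pte)
{
    return (paddr_t)(pte.pte >> PTE_PPN_SHIFT) << PAGE_SHIFT;
}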

> --- /dev/null
> +++ b/xen/arch/riscv/mm.c
> @@ -0,0 +1,277 @@
> +#include <xen/init.h>
> +#include <xen/kernel.h>
> +
> +#include <asm/early_printk.h>
> +#include <asm/config.h>
> +#include <asm/csr.h>
> +#include <asm/mm.h>
> +#include <asm/page.h>
> +#include <asm/processor.h>
> +
> +#define PGTBL_INITIAL_COUNT 8

What does this magic number derive from? (A comment may help.)

> +#define PGTBL_ENTRY_AMOUNT  (PAGE_SIZE/sizeof(pte_t))

Nit: Style (blanks around binary operators).
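
I.e.:

#define PGTBL_ENTRY_AMOUNT  (PAGE_SIZE / sizeof(pte_t))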

> +struct mmu_desc {
> +    unsigned long num_levels;
> +    uint32_t pgtbl_count;

Nit: Style (as before, please avoid fixed-width types when their use
isn't really warranted; afaics unsigned int will do fine here and
elsewhere below).

> +    pte_t *next_pgtbl;
> +    pte_t *pgtbl_base;
> +};
> +
> +extern unsigned long __bss_start[];
> +extern unsigned long __bss_end[];

Why are these being added here? They aren't being used afaics.

> +extern unsigned char cpu0_boot_stack[STACK_SIZE];
> +
> +#define PHYS_OFFSET ((unsigned long)_start - XEN_VIRT_START)
> +#define LOAD_TO_LINK(addr) ((addr) - PHYS_OFFSET)
> +#define LINK_TO_LOAD(addr) ((addr) + PHYS_OFFSET)
> +
> +pte_t __section(".bss.page_aligned") __aligned(PAGE_SIZE)
> +stage1_pgtbl_root[PGTBL_ENTRY_AMOUNT];
> +
> +pte_t __section(".bss.page_aligned") __aligned(PAGE_SIZE)
> +stage1_pgtbl_nonroot[PGTBL_INITIAL_COUNT * PGTBL_ENTRY_AMOUNT];
> +
> +#define HANDLE_PGTBL()                                                      \
> +    index =  pt_index(curr_lvl_num, page_addr);                             \

Nit: Style (stray double blanks).

> +    if ( pte_is_valid(pgtbl[index]) ) {                                     \

Nit: Style (brace placement throughout the macro - in really small
macros the way you have it may be okay as an exception, but this macro
isn't small).
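
I.e. (a sketch of the expected shape):

    if ( pte_is_valid(pgtbl[index]) )                                       \
    {                                                                       \
        /* Find L{ 0-3 } table */                                           \
        pgtbl = (pte_t *)pte_to_paddr(pgtbl[index]);                        \
    }                                                                       \
    else                                                                    \
    {                                                                       \
        ...                                                                 \
    }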

> +        /* Find L{ 0-3 } table */                                           \
> +        pgtbl = (pte_t *)pte_to_paddr(pgtbl[index]);                        \
> +    } else {                                                                \
> +        /* Allocate new L{0-3} page table */                                \
> +        if ( mmu_desc->pgtbl_count == PGTBL_INITIAL_COUNT ) {               \
> +            early_printk("(XEN) No initial table available\n");             \
> +            /* panic(), BUG() or ASSERT() aren't ready now. */              \
> +            die();                                                          \
> +        }                                                                   \
> +        mmu_desc->pgtbl_count++;                                            \
> +        pgtbl[index] = paddr_to_pte((unsigned long)mmu_desc->next_pgtbl,    \
> +                                    PTE_VALID);                             \
> +        pgtbl = mmu_desc->next_pgtbl;                                       \
> +        mmu_desc->next_pgtbl += PGTBL_ENTRY_AMOUNT;                         \
> +    }
> +
> +static void __init setup_initial_mapping(struct mmu_desc *mmu_desc,
> +                                  unsigned long map_start,
> +                                  unsigned long map_end,
> +                                  unsigned long pa_start,
> +                                  unsigned long permission)

Nit: Style (indentation, and it feels like I commented on this before).

> +{
> +    uint32_t index;
> +    pte_t *pgtbl;
> +    unsigned long page_addr;
> +    unsigned int curr_lvl_num;
> +    pte_t pte_to_be_written;
> +    unsigned long paddr;
> +
> +    if ( (unsigned long)_start % XEN_PT_LEVEL_SIZE(0) ) {

Nit: Style (brace placement).

> +        early_printk("(XEN) Xen should be loaded at 4k boundary\n");
> +        die();
> +    }
> +
> +    if ( map_start & ~XEN_PT_LEVEL_MAP_MASK(0) ||
> +        pa_start & ~XEN_PT_LEVEL_MAP_MASK(0) ) {

Nit: Style (indentation and brace placement; more below).

> +        early_printk("(XEN) map and pa start addresses should be aligned\n");
> +        /* panic(), BUG() or ASSERT() aren't ready now. */
> +        die();
> +    }
> +
> +    page_addr = map_start;
> +    while ( page_addr < map_end ) {
> +        pgtbl = mmu_desc->pgtbl_base;
> +
> +        switch (mmu_desc->num_levels)
> +        {
> +        case 4: /* Level 3 */
> +            curr_lvl_num = 3;
> +            HANDLE_PGTBL();
> +        case 3: /* Level 2 */
> +            curr_lvl_num = 2;
> +            HANDLE_PGTBL();
> +        case 2: /* Level 1 */
> +            curr_lvl_num = 1;
> +            HANDLE_PGTBL();

Am I getting it right that curr_lvl_num solely exists to pass a parameter
to the macro? If so, why doesn't the macro take an appropriate parameter
instead?
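
I.e. something along these lines (a sketch, with the rest of the macro
body unchanged):

#define HANDLE_PGTBL(curr_lvl_num)                                          \
    index = pt_index(curr_lvl_num, page_addr);                              \
    ...

        case 4: /* Level 3 */
            HANDLE_PGTBL(3);
        case 3: /* Level 2 */
            HANDLE_PGTBL(2);
        case 2: /* Level 1 */
            HANDLE_PGTBL(1);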

> +        case 1: /* Level 0 */
> +            index = pt_index(0, page_addr);
> +            paddr = (page_addr - map_start) + pa_start;
> +            pte_to_be_written = paddr_to_pte(paddr, permission);
> +
> +            if ( !pte_is_valid(pgtbl[index]) )
> +                pgtbl[index] = pte_to_be_written;
> +            else {
> +                /*
> +                 * get an adresses of current pte and that one to
> +                 * be written without permission flags
> +                 */
> +                unsigned long curr_pte =
> +                    (unsigned long)&pgtbl[index] & (PTE_PPN_SHIFT - 1);
> +                pte_to_be_written &= (PTE_PPN_SHIFT - 1);
> +
> +                if ( curr_pte != pte_to_be_written ) {
> +                    early_printk("PTE that is intended to write isn't the \
> +                                 same that the once are overwriting\n");

Iirc there are compilers which warn about line continuation characters
inside a string literal. Since adjacent string literals are concatenated
by the compiler anyway, you'll find elsewhere we have the equivalent of

                    early_printk("PTE that is intended to write isn't the"
                                 " same that the once are overwriting\n");

This will also avoid an excessive number of blanks in the middle of the
string.

> +                    /* panic(), <asm/bug.h> aren't ready now. */
> +                    die();
> +                }
> +            }
> +        }
> +
> +        /* Point to next page */
> +        page_addr += XEN_PT_LEVEL_SIZE(0);
> +    }
> +}
> +
> +static void __init calc_pgtbl_lvls_num(struct  mmu_desc *mmu_desc)
> +{
> +    unsigned long satp_mode = RV_STAGE1_MODE;
> +
> +    /* Number of page table levels */
> +    switch (satp_mode) {

Nit: Style (missing blanks and brace placement).
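
I.e.:

    switch ( satp_mode )
    {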

> +    case SATP_MODE_SV32:
> +        mmu_desc->num_levels = 2;
> +        break;
> +    case SATP_MODE_SV39:
> +        mmu_desc->num_levels = 3;
> +        break;
> +    case SATP_MODE_SV48:
> +        mmu_desc->num_levels = 4;
> +        break;
> +    default:
> +        early_printk("(XEN) Unsupported SATP_MODE\n");
> +        while (1);

die() (until you have panic()) like above?

> +    }
> +}
> +
> +static void __init setup_ptes_permission(struct mmu_desc *mmu_desc)
> +{
> +    pte_t *pgtbl;
> +    uint32_t index;
> +    uint32_t permissions = PTE_VALID;
> +    unsigned long start = LOAD_TO_LINK((unsigned long)_start);
> +    unsigned long end   = LOAD_TO_LINK((unsigned long)_end);
> +
> +    while ( start < end )
> +    {
> +        pgtbl = (pte_t *) mmu_desc->pgtbl_base;

Nit: Style (extra blank after closing parenthesis). But afaict you don't
need a cast here in the first place; as said before, please avoid casts
whenever possible.
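
I.e. simply (pgtbl_base already is pte_t *):

        pgtbl = mmu_desc->pgtbl_base;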

> +        switch (mmu_desc->num_levels)

Nit: Style (missing blanks).

> +        {
> +        case 4: /* find Level 3 page table*/
> +            index = pt_index(3, start);
> +            pgtbl = (pte_t *)pte_to_paddr(pgtbl[index]);
> +        case 3: /* find level 2 page table */
> +            index = pt_index(2, start);
> +            pgtbl = (pte_t *)pte_to_paddr(pgtbl[index]);
> +        case 2: /* find level 1 page table */
> +            index = pt_index(1, start);
> +            pgtbl = (pte_t *)pte_to_paddr(pgtbl[index]);
> +        case 1: /* find level 0 page table */
> +            index = pt_index(0, start);
> +
> +            if ( is_kernel_text(LINK_TO_LOAD(start)) ||
> +                 is_kernel_inittext(LINK_TO_LOAD(start)) )
> +                permissions |= PTE_EXECUTABLE | PTE_READABLE;
> +
> +            if ( is_kernel_rodata(LINK_TO_LOAD(start)) )
> +                permissions |= PTE_READABLE;
> +
> +            pgtbl[index] |= permissions;

setup_initial_mapping() has installed R/W mappings. Hence ORing in
PTE_READABLE is meaningless here, and neither .text nor .rodata
will end up write-protected. The question is ...

> +        }
> +
> +        start += PAGE_SIZE;
> +    }
> +}
> +
> +/*
> + * setup_initial_pagetables:
> + *
> + * Build the page tables for Xen that map the following:
> + *  1. Calculate page table's level numbers.
> + *  2. Init mmu description structure.
> + *  3. Check that linker addresses range doesn't overlap
> + *     with load addresses range
> + *  4. Map all linker addresses and load addresses ( it shouldn't
> + *     be 1:1 mapped and will be 1:1 mapped only in case if
> + *     linker address is equal to load address ) with
> + *     RW permissions by default.
> + *  5. Setup proper PTE permissions for each section.
> + */
> +void __init setup_initial_pagetables(void)
> +{
> +    struct mmu_desc mmu_desc = { 0, 0, NULL, 0 };
> +
> +    /*
> +     * Access to _{stard, end } is always PC-relative
> +     * thereby when access them we will get load adresses
> +     * of start and end of Xen
> +     * To get linker addresses LOAD_TO_LINK() is required
> +     * to use
> +     */
> +    unsigned long load_start    = (unsigned long)_start;
> +    unsigned long load_end      = (unsigned long)_end;
> +    unsigned long linker_start  = LOAD_TO_LINK(load_start);
> +    unsigned long linker_end    = LOAD_TO_LINK(load_end);
> +
> +    if ( (linker_start != load_start) &&
> +         (linker_start <= load_end) && (load_start <= linker_end) ) {
> +        early_printk("(XEN) linker and load address ranges overlap\n");
> +        die();
> +    }
> +
> +    calc_pgtbl_lvls_num(&mmu_desc);
> +
> +    mmu_desc.pgtbl_base = stage1_pgtbl_root;
> +    mmu_desc.next_pgtbl = stage1_pgtbl_nonroot;
> +
> +    setup_initial_mapping(&mmu_desc,
> +                          linker_start,
> +                          linker_end,
> +                          load_start,
> +                          PTE_LEAF_DEFAULT);
> +
> +    setup_ptes_permission(&mmu_desc);

...: Why does this require a 2nd pass / function in the first place?
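
E.g. the per-section permissions could be handed to the first pass
directly (a sketch; text_start/text_end etc. are hypothetical section
bounds):

    setup_initial_mapping(&mmu_desc, text_start, text_end,
                          LINK_TO_LOAD(text_start),
                          PTE_VALID | PTE_READABLE | PTE_EXECUTABLE);
    setup_initial_mapping(&mmu_desc, rodata_start, rodata_end,
                          LINK_TO_LOAD(rodata_start),
                          PTE_VALID | PTE_READABLE);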

> +}
> +
> +void __init enable_mmu()
> +{
> +    /*
> +     * Calculate a linker time address of the mmu_is_enabled
> +     * label and update CSR_STVEC with it.
> +     * MMU is configured in a way where linker addresses are mapped
> +     * on load addresses so in a case when linker addresses are not equal
> +     * to load addresses, after MMU is enabled, it will cause
> +     * an exception and jump to linker time addresses.
> +     * Otherwise if load addresses are equal to linker addresses the code
> +     * after mmu_is_enabled label will be executed without exception.
> +     */
> +    csr_write(CSR_STVEC, LOAD_TO_LINK((unsigned long)&&mmu_is_enabled));

That looks like a pretty unusual way of moving from physical to virtual
addresses. But if it works ...

> +    /* Ensure page table writes precede loading the SATP */
> +    asm volatile("sfence.vma");
> +
> +    /* Enable the MMU and load the new pagetable for Xen */
> +    csr_write(CSR_SATP,
> +              ((unsigned long)stage1_pgtbl_root >> PAGE_SHIFT) | RV_STAGE1_MODE << SATP_MODE_SHIFT);

Nit: Style (excessively long line).

> +    asm volatile(".align 2");
> +mmu_is_enabled:

Nit: Style (labels want indenting by at least one blank).

> @@ -26,3 +27,13 @@ void __init noreturn start_xen(unsigned long bootcpu_id,
>  
>      unreachable();
>  }
> +
> +void __init noreturn cont_after_mmu_is_enabled(void)

To prevent undue link-time optimization such a function also wants to
be noinline.
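
I.e.:

void __init noreturn noinline cont_after_mmu_is_enabled(void)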

Jan
