
Re: [PATCH v3 9/9] xen/ppc: mm-radix: Allocate all paging structures at runtime



Hi Jan,

On 3/25/24 10:39 AM, Jan Beulich wrote:
> On 14.03.2024 23:15, Shawn Anastasio wrote:
>> --- a/xen/arch/ppc/mm-radix.c
>> +++ b/xen/arch/ppc/mm-radix.c
>> @@ -21,69 +21,101 @@ void enable_mmu(void);
>>  #define radix_dprintk(...)
>>  #endif
>>
>> -#define INITIAL_LVL1_PD_COUNT      1
>> -#define INITIAL_LVL2_LVL3_PD_COUNT 2
>> -#define INITIAL_LVL4_PT_COUNT      256
>> -
>> -static size_t __initdata initial_lvl1_pd_pool_used;
>> -static struct lvl1_pd initial_lvl1_pd_pool[INITIAL_LVL1_PD_COUNT];
>> -
>> -static size_t __initdata initial_lvl2_lvl3_pd_pool_used;
>> -static struct lvl2_pd initial_lvl2_lvl3_pd_pool[INITIAL_LVL2_LVL3_PD_COUNT];
>> -
>> -static size_t __initdata initial_lvl4_pt_pool_used;
>> -static struct lvl4_pt initial_lvl4_pt_pool[INITIAL_LVL4_PT_COUNT];
>> -
>> -/* Only reserve minimum Partition and Process tables  */
>>  #define PATB_SIZE_LOG2 16 /* Only supported partition table size on POWER9 */
>>  #define PATB_SIZE      (1UL << PATB_SIZE_LOG2)
>> -#define PRTB_SIZE_LOG2 12
>> +#define PRTB_SIZE_LOG2 24 /* Maximum process table size on POWER9 */
>>  #define PRTB_SIZE      (1UL << PRTB_SIZE_LOG2)
>>
>> -static struct patb_entry
>> -    __aligned(PATB_SIZE) initial_patb[PATB_SIZE / sizeof(struct patb_entry)];
>> +static struct patb_entry *initial_patb;
>> +static struct prtb_entry *initial_prtb;
>>
>> -static struct prtb_entry
>> -    __aligned(PRTB_SIZE) initial_prtb[PRTB_SIZE / sizeof(struct prtb_entry)];
>> +static mfn_t __initdata min_alloc_mfn = {-1};
>> +static mfn_t __initdata max_alloc_mfn = {0};
>>
>> -static __init struct lvl1_pd *lvl1_pd_pool_alloc(void)
>> +/*
>> + * A thin wrapper for alloc_boot_pages that keeps track of the maximum and
>> + * minimum mfns that have been allocated. This information is used by
>> + * setup_initial_mapping to include the allocated pages in the initial
>> + * page mapping.
>> + */
>> +static mfn_t __init initial_page_alloc(unsigned long nr_pfns,
>> +                                       unsigned long pfn_align)
>>  {
>> -    if ( initial_lvl1_pd_pool_used >= INITIAL_LVL1_PD_COUNT )
>> -    {
>> -        early_printk("Ran out of space for LVL1 PD!\n");
>> -        die();
>> -    }
>> +    mfn_t mfn_first, mfn_last;
>>
>> -    return &initial_lvl1_pd_pool[initial_lvl1_pd_pool_used++];
>> -}
>> +    mfn_first = alloc_boot_pages(nr_pfns, pfn_align);
>> +    mfn_last = _mfn(mfn_x(mfn_first) + nr_pfns - 1);
> 
> Please can you use mfn_add() here?
> 

Good catch, will do.
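
For v4 I'd change that line to something along these lines (untested
sketch):

    mfn_first = alloc_boot_pages(nr_pfns, pfn_align);
    /* Use mfn_add() rather than open-coding the mfn arithmetic. */
    mfn_last = mfn_add(mfn_first, nr_pfns - 1);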

>> -static __init struct lvl2_pd *lvl2_pd_pool_alloc(void)
>> -{
>> -    if ( initial_lvl2_lvl3_pd_pool_used >= INITIAL_LVL2_LVL3_PD_COUNT )
>> -    {
>> -        early_printk("Ran out of space for LVL2/3 PD!\n");
>> -        die();
>> -    }
>> +    min_alloc_mfn = _mfn(min(mfn_x(min_alloc_mfn), mfn_x(mfn_first)));
>> +    max_alloc_mfn = _mfn(max(mfn_x(max_alloc_mfn), mfn_x(mfn_last)));
> 
> Together with the comment ahead of the function - is there some kind of
> assumption here that this range won't span almost all of system memory?
> E.g. expecting allocations to be almost contiguous? If so, I wonder how
> reliable this is, and whether using a rangeset wouldn't be better here.
> 

You're right that this is only sane (i.e. not mapping almost all of
system memory) when the assumption that alloc_boot_pages returns
mostly-contiguous regions holds. I'm not super happy with this either,
but I struggled to come up with a better solution that doesn't involve
re-inventing a rangeset-like data structure.

Looking into your suggestion of using xen/common's rangeset, it looks
like that won't work here, since it relies on xmalloc, which is not yet
set up at this point in boot. I suspect there is a chicken-and-egg
problem that would preclude xmalloc from working sanely this early in
boot, but I might be wrong about that.

I could reinvent a basic statically-allocated rangeset data structure
for this purpose if you think that's the best path forward.
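
Roughly what I have in mind, in case it helps the discussion. This is
only a sketch; MAX_INITIAL_RANGES and all of the names below are
placeholders, not a final design:

    /*
     * Minimal statically-allocated range tracker, usable before xmalloc
     * is available. Ranges are inclusive and get merged into an existing
     * entry when they touch or overlap it (no re-coalescing of entries
     * that only later become adjacent).
     */
    #define MAX_INITIAL_RANGES 16

    static struct {
        mfn_t start, end; /* inclusive */
    } __initdata initial_alloc_ranges[MAX_INITIAL_RANGES];
    static unsigned int __initdata nr_initial_alloc_ranges;

    static void __init track_initial_alloc(mfn_t first, mfn_t last)
    {
        unsigned int i;

        for ( i = 0; i < nr_initial_alloc_ranges; i++ )
        {
            if ( mfn_x(first) <= mfn_x(initial_alloc_ranges[i].end) + 1 &&
                 mfn_x(last) + 1 >= mfn_x(initial_alloc_ranges[i].start) )
            {
                if ( mfn_x(first) < mfn_x(initial_alloc_ranges[i].start) )
                    initial_alloc_ranges[i].start = first;
                if ( mfn_x(last) > mfn_x(initial_alloc_ranges[i].end) )
                    initial_alloc_ranges[i].end = last;
                return;
            }
        }

        if ( nr_initial_alloc_ranges >= MAX_INITIAL_RANGES )
            panic("Too many early page allocation ranges\n");

        initial_alloc_ranges[nr_initial_alloc_ranges].start = first;
        initial_alloc_ranges[nr_initial_alloc_ranges].end = last;
        nr_initial_alloc_ranges++;
    }

initial_page_alloc() would then call track_initial_alloc(mfn_first,
mfn_last) instead of updating min_alloc_mfn/max_alloc_mfn.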

>> @@ -105,81 +138,47 @@ static void __init setup_initial_mapping(struct lvl1_pd *lvl1,
>>          die();
>>      }
>>
>> +    /* Identity map Xen itself */
>>      for ( page_addr = map_start; page_addr < map_end; page_addr += PAGE_SIZE )
>>      {
>> -        struct lvl2_pd *lvl2;
>> -        struct lvl3_pd *lvl3;
>> -        struct lvl4_pt *lvl4;
>> -        pde_t *pde;
>> -        pte_t *pte;
>> -
>> -        /* Allocate LVL 2 PD if necessary */
>> -        pde = pt_entry(lvl1, page_addr);
>> -        if ( !pde_is_valid(*pde) )
>> -        {
>> -            lvl2 = lvl2_pd_pool_alloc();
>> -            *pde = paddr_to_pde(__pa(lvl2), PDE_VALID,
>> -                                XEN_PT_ENTRIES_LOG2_LVL_2);
>> -        }
>> -        else
>> -            lvl2 = __va(pde_to_paddr(*pde));
>> +        unsigned long flags;
>>
>> -        /* Allocate LVL 3 PD if necessary */
>> -        pde = pt_entry(lvl2, page_addr);
>> -        if ( !pde_is_valid(*pde) )
>> +        if ( is_kernel_text(page_addr) || is_kernel_inittext(page_addr) )
>>          {
>> -            lvl3 = lvl3_pd_pool_alloc();
>> -            *pde = paddr_to_pde(__pa(lvl3), PDE_VALID,
>> -                                XEN_PT_ENTRIES_LOG2_LVL_3);
>> +            radix_dprintk("%016lx being marked as TEXT (RX)\n", page_addr);
>> +            flags = PTE_XEN_RX;
>>          }
>> -        else
>> -            lvl3 = __va(pde_to_paddr(*pde));
>> -
>> -        /* Allocate LVL 4 PT if necessary */
>> -        pde = pt_entry(lvl3, page_addr);
>> -        if ( !pde_is_valid(*pde) )
>> +        else if ( is_kernel_rodata(page_addr) )
>>          {
>> -            lvl4 = lvl4_pt_pool_alloc();
>> -            *pde = paddr_to_pde(__pa(lvl4), PDE_VALID,
>> -                                XEN_PT_ENTRIES_LOG2_LVL_4);
>> +            radix_dprintk("%016lx being marked as RODATA (RO)\n", 
>> page_addr);
>> +            flags = PTE_XEN_RO;
>>          }
>>          else
>> -            lvl4 = __va(pde_to_paddr(*pde));
>> -
>> -        /* Finally, create PTE in LVL 4 PT */
>> -        pte = pt_entry(lvl4, page_addr);
>> -        if ( !pte_is_valid(*pte) )
>>          {
>> -            unsigned long paddr = (page_addr - map_start) + phys_base;
>> -            unsigned long flags;
>> -
>> -            radix_dprintk("%016lx being mapped to %016lx\n", paddr, 
>> page_addr);
>> -            if ( is_kernel_text(page_addr) || is_kernel_inittext(page_addr) )
>> -            {
>> -                radix_dprintk("%016lx being marked as TEXT (RX)\n", 
>> page_addr);
>> -                flags = PTE_XEN_RX;
>> -            }
>> -            else if ( is_kernel_rodata(page_addr) )
>> -            {
>> -                radix_dprintk("%016lx being marked as RODATA (RO)\n", 
>> page_addr);
>> -                flags = PTE_XEN_RO;
>> -            }
>> -            else
>> -            {
>> -                radix_dprintk("%016lx being marked as DEFAULT (RW)\n", 
>> page_addr);
>> -                flags = PTE_XEN_RW;
>> -            }
>> -
>> -            *pte = paddr_to_pte(paddr, flags);
>> -            radix_dprintk("%016lx is the result of PTE map\n",
>> -                paddr_to_pte(paddr, flags).pte);
>> -        }
>> -        else
>> -        {
>> -            early_printk("BUG: Tried to create PTE for already-mapped 
>> page!");
>> -            die();
>> +            radix_dprintk("%016lx being marked as DEFAULT (RW)\n", 
>> page_addr);
>> +            flags = PTE_XEN_RW;
>>          }
>> +
>> +        map_page_initial(lvl1, page_addr, (page_addr - map_start) + phys_base, flags);
>> +    }
>> +
>> +    previous_max_alloc_mfn = max_alloc_mfn;
>> +
>> +    /*
>> +     * Identity map all pages we've allocated for paging structures. This act
>> +     * itself will allocate more pages, so continue until we've mapped from
>> +     * `max_alloc_mfn` down to `min_alloc_mfn`. This assumes that the heap grows
>> +     * downwards, which matches the behavior of alloc_boot_pages.
>> +     */
>> +    for ( page_addr = (vaddr_t)__va(mfn_to_maddr(max_alloc_mfn));
>> +          mfn_to_maddr(min_alloc_mfn) <= __pa(page_addr);
>> +          page_addr -= PAGE_SIZE)
>> +    {
>> +        map_page_initial(lvl1, page_addr, __pa(page_addr), PTE_XEN_RW);
>>      }
>> +
>> +    if ( mfn_x(previous_max_alloc_mfn) != mfn_x(max_alloc_mfn) )
>> +        panic("Early page heap unexpectedly grew upwards\n");
>>  }
> 
> Oh, yet another assumption on allocator behavior.

Agreed, it's not ideal, but the assumption is explicitly checked, so the
worst case is a panic/failure to boot if it is ever violated.

Also, related to your previous comment: if we go forward with a
rangeset (or rangeset-like) data structure, this assumption could be
eliminated entirely.
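
With that in place, the second loop in setup_initial_mapping() could walk
the recorded ranges instead of assuming a contiguous, downward-growing
span. Again only a sketch, and it assumes map_page_initial() tolerates
(or skips) pages that are already mapped, since mapping may itself
allocate more pages and grow the recorded ranges:

    unsigned int i, prev_nr;

    /*
     * Map every recorded allocation range; repeat because mapping can
     * allocate further pages and hence add new ranges. A real version
     * would also need to re-run when an existing range is merely
     * extended, e.g. by bumping a generation counter in
     * track_initial_alloc().
     */
    do {
        prev_nr = nr_initial_alloc_ranges;

        for ( i = 0; i < nr_initial_alloc_ranges; i++ )
        {
            unsigned long pfn;

            for ( pfn = mfn_x(initial_alloc_ranges[i].start);
                  pfn <= mfn_x(initial_alloc_ranges[i].end); pfn++ )
            {
                paddr_t paddr = mfn_to_maddr(_mfn(pfn));

                map_page_initial(lvl1, (vaddr_t)__va(paddr), paddr,
                                 PTE_XEN_RW);
            }
        }
    } while ( prev_nr != nr_initial_alloc_ranges );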

> 
> Jan

Thanks,
Shawn
