Re: [PATCH v3 9/9] xen/ppc: mm-radix: Allocate all paging structures at runtime
Hi Jan,

On 3/25/24 10:39 AM, Jan Beulich wrote:
> On 14.03.2024 23:15, Shawn Anastasio wrote:
>> --- a/xen/arch/ppc/mm-radix.c
>> +++ b/xen/arch/ppc/mm-radix.c
>> @@ -21,69 +21,101 @@ void enable_mmu(void);
>>  #define radix_dprintk(...)
>>  #endif
>>  
>> -#define INITIAL_LVL1_PD_COUNT      1
>> -#define INITIAL_LVL2_LVL3_PD_COUNT 2
>> -#define INITIAL_LVL4_PT_COUNT      256
>> -
>> -static size_t __initdata initial_lvl1_pd_pool_used;
>> -static struct lvl1_pd initial_lvl1_pd_pool[INITIAL_LVL1_PD_COUNT];
>> -
>> -static size_t __initdata initial_lvl2_lvl3_pd_pool_used;
>> -static struct lvl2_pd initial_lvl2_lvl3_pd_pool[INITIAL_LVL2_LVL3_PD_COUNT];
>> -
>> -static size_t __initdata initial_lvl4_pt_pool_used;
>> -static struct lvl4_pt initial_lvl4_pt_pool[INITIAL_LVL4_PT_COUNT];
>> -
>> -/* Only reserve minimum Partition and Process tables */
>>  #define PATB_SIZE_LOG2 16 /* Only supported partition table size on POWER9 */
>>  #define PATB_SIZE (1UL << PATB_SIZE_LOG2)
>> -#define PRTB_SIZE_LOG2 12
>> +#define PRTB_SIZE_LOG2 24 /* Maximum process table size on POWER9 */
>>  #define PRTB_SIZE (1UL << PRTB_SIZE_LOG2)
>>  
>> -static struct patb_entry
>> -    __aligned(PATB_SIZE) initial_patb[PATB_SIZE / sizeof(struct patb_entry)];
>> +static struct patb_entry *initial_patb;
>> +static struct prtb_entry *initial_prtb;
>>  
>> -static struct prtb_entry
>> -    __aligned(PRTB_SIZE) initial_prtb[PRTB_SIZE / sizeof(struct prtb_entry)];
>> +static mfn_t __initdata min_alloc_mfn = {-1};
>> +static mfn_t __initdata max_alloc_mfn = {0};
>>  
>> -static __init struct lvl1_pd *lvl1_pd_pool_alloc(void)
>> +/*
>> + * A thin wrapper for alloc_boot_pages that keeps track of the maximum and
>> + * minimum mfns that have been allocated. This information is used by
>> + * setup_initial_mapping to include the allocated pages in the initial
>> + * page mapping.
>> + */
>> +static mfn_t __init initial_page_alloc(unsigned long nr_pfns,
>> +                                       unsigned long pfn_align)
>>  {
>> -    if ( initial_lvl1_pd_pool_used >= INITIAL_LVL1_PD_COUNT )
>> -    {
>> -        early_printk("Ran out of space for LVL1 PD!\n");
>> -        die();
>> -    }
>> +    mfn_t mfn_first, mfn_last;
>>  
>> -    return &initial_lvl1_pd_pool[initial_lvl1_pd_pool_used++];
>> -}
>> +    mfn_first = alloc_boot_pages(nr_pfns, pfn_align);
>> +    mfn_last = _mfn(mfn_x(mfn_first) + nr_pfns - 1);
>
> Please can you use mfn_add() here?
>

Good catch, will do.

>> -static __init struct lvl2_pd *lvl2_pd_pool_alloc(void)
>> -{
>> -    if ( initial_lvl2_lvl3_pd_pool_used >= INITIAL_LVL2_LVL3_PD_COUNT )
>> -    {
>> -        early_printk("Ran out of space for LVL2/3 PD!\n");
>> -        die();
>> -    }
>> +    min_alloc_mfn = _mfn(min(mfn_x(min_alloc_mfn), mfn_x(mfn_first)));
>> +    max_alloc_mfn = _mfn(max(mfn_x(max_alloc_mfn), mfn_x(mfn_last)));
>
> Together with the comment ahead of the function - is there some kind of
> assumption here that this range won't span almost all of system memory?
> E.g. expecting allocations to be almost contiguous? If so, I wonder how
> reliable this is, and whether using a rangeset wouldn't be better here.
>

You're right that this is only sane (i.e. not mapping almost all of
system memory) when the assumption that alloc_boot_pages returns
mostly-contiguous regions holds. I'm not super happy with this either,
but I struggled to come up with a better solution that doesn't involve
re-inventing a rangeset-like data structure.

Looking into your suggestion of using xen/common's rangeset, it looks
like that won't work since it relies on xmalloc which is not yet set up.
I suspect there is a chicken-and-egg problem here that would preclude
xmalloc from sanely working this early on in the boot, but I might be
wrong about that.

I could reinvent a basic statically-allocated rangeset data structure
for this purpose if you think that's the best path forward.
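To make that concrete, what I have in mind is a small fixed-capacity
range tracker along these lines. This is only a rough sketch to
illustrate the idea, not code from the patch, and every name in it
(boot_alloc_range, record_boot_alloc, MAX_BOOT_ALLOC_RANGES) is made up:

/*
 * Illustrative sketch only: a statically-allocated stand-in for a
 * rangeset, usable before xmalloc is available. All names hypothetical.
 */
#define MAX_BOOT_ALLOC_RANGES 16

struct boot_alloc_range {
    unsigned long first_mfn, last_mfn; /* inclusive */
};

static unsigned int __initdata nr_boot_alloc_ranges;
static struct boot_alloc_range __initdata
    boot_alloc_ranges[MAX_BOOT_ALLOC_RANGES];

static void __init record_boot_alloc(mfn_t first, mfn_t last)
{
    unsigned int i;

    /* Extend an existing range if the new one overlaps or is adjacent. */
    for ( i = 0; i < nr_boot_alloc_ranges; i++ )
    {
        struct boot_alloc_range *r = &boot_alloc_ranges[i];

        if ( mfn_x(last) + 1 >= r->first_mfn &&
             mfn_x(first) <= r->last_mfn + 1 )
        {
            r->first_mfn = min(r->first_mfn, mfn_x(first));
            r->last_mfn = max(r->last_mfn, mfn_x(last));
            return;
        }
    }

    /* Otherwise start a new range, or give up if the static pool is full. */
    if ( nr_boot_alloc_ranges >= MAX_BOOT_ALLOC_RANGES )
        panic("Out of statically-allocated boot allocation ranges\n");

    boot_alloc_ranges[nr_boot_alloc_ranges].first_mfn = mfn_x(first);
    boot_alloc_ranges[nr_boot_alloc_ranges].last_mfn = mfn_x(last);
    nr_boot_alloc_ranges++;
}

initial_page_alloc() would then call record_boot_alloc() instead of
updating min_alloc_mfn/max_alloc_mfn. (A sketch like this doesn't merge
ranges that only become adjacent after a later extension, but that would
just cost an extra entry, not correctness.)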
>> @@ -105,81 +138,47 @@ static void __init setup_initial_mapping(struct lvl1_pd *lvl1,
>>          die();
>>      }
>>  
>> +    /* Identity map Xen itself */
>>      for ( page_addr = map_start; page_addr < map_end; page_addr += PAGE_SIZE )
>>      {
>> -        struct lvl2_pd *lvl2;
>> -        struct lvl3_pd *lvl3;
>> -        struct lvl4_pt *lvl4;
>> -        pde_t *pde;
>> -        pte_t *pte;
>> -
>> -        /* Allocate LVL 2 PD if necessary */
>> -        pde = pt_entry(lvl1, page_addr);
>> -        if ( !pde_is_valid(*pde) )
>> -        {
>> -            lvl2 = lvl2_pd_pool_alloc();
>> -            *pde = paddr_to_pde(__pa(lvl2), PDE_VALID,
>> -                                XEN_PT_ENTRIES_LOG2_LVL_2);
>> -        }
>> -        else
>> -            lvl2 = __va(pde_to_paddr(*pde));
>> +        unsigned long flags;
>>  
>> -        /* Allocate LVL 3 PD if necessary */
>> -        pde = pt_entry(lvl2, page_addr);
>> -        if ( !pde_is_valid(*pde) )
>> +        if ( is_kernel_text(page_addr) || is_kernel_inittext(page_addr) )
>>          {
>> -            lvl3 = lvl3_pd_pool_alloc();
>> -            *pde = paddr_to_pde(__pa(lvl3), PDE_VALID,
>> -                                XEN_PT_ENTRIES_LOG2_LVL_3);
>> +            radix_dprintk("%016lx being marked as TEXT (RX)\n", page_addr);
>> +            flags = PTE_XEN_RX;
>>          }
>> -        else
>> -            lvl3 = __va(pde_to_paddr(*pde));
>> -
>> -        /* Allocate LVL 4 PT if necessary */
>> -        pde = pt_entry(lvl3, page_addr);
>> -        if ( !pde_is_valid(*pde) )
>> +        else if ( is_kernel_rodata(page_addr) )
>>          {
>> -            lvl4 = lvl4_pt_pool_alloc();
>> -            *pde = paddr_to_pde(__pa(lvl4), PDE_VALID,
>> -                                XEN_PT_ENTRIES_LOG2_LVL_4);
>> +            radix_dprintk("%016lx being marked as RODATA (RO)\n", page_addr);
>> +            flags = PTE_XEN_RO;
>>          }
>>          else
>> -            lvl4 = __va(pde_to_paddr(*pde));
>> -
>> -        /* Finally, create PTE in LVL 4 PT */
>> -        pte = pt_entry(lvl4, page_addr);
>> -        if ( !pte_is_valid(*pte) )
>>          {
>> -            unsigned long paddr = (page_addr - map_start) + phys_base;
>> -            unsigned long flags;
>> -
>> -            radix_dprintk("%016lx being mapped to %016lx\n", paddr, page_addr);
>> -            if ( is_kernel_text(page_addr) || is_kernel_inittext(page_addr) )
>> -            {
>> -                radix_dprintk("%016lx being marked as TEXT (RX)\n", page_addr);
>> -                flags = PTE_XEN_RX;
>> -            }
>> -            else if ( is_kernel_rodata(page_addr) )
>> -            {
>> -                radix_dprintk("%016lx being marked as RODATA (RO)\n", page_addr);
>> -                flags = PTE_XEN_RO;
>> -            }
>> -            else
>> -            {
>> -                radix_dprintk("%016lx being marked as DEFAULT (RW)\n", page_addr);
>> -                flags = PTE_XEN_RW;
>> -            }
>> -
>> -            *pte = paddr_to_pte(paddr, flags);
>> -            radix_dprintk("%016lx is the result of PTE map\n",
>> -                          paddr_to_pte(paddr, flags).pte);
>> -        }
>> -        else
>> -        {
>> -            early_printk("BUG: Tried to create PTE for already-mapped page!");
>> -            die();
>> +            radix_dprintk("%016lx being marked as DEFAULT (RW)\n", page_addr);
>> +            flags = PTE_XEN_RW;
>>          }
>> +
>> +        map_page_initial(lvl1, page_addr, (page_addr - map_start) + phys_base, flags);
>> +    }
>> +
>> +    previous_max_alloc_mfn = max_alloc_mfn;
>> +
>> +    /*
>> +     * Identity map all pages we've allocated for paging structures. This act
>> +     * itself will allocate more pages, so continue until we've mapped from
>> +     * `max_alloc_mfn` down to `min_alloc_mfn`. This assumes that the heap grows
>> +     * downwards, which matches the behavior of alloc_boot_pages.
>> +     */
>> +    for ( page_addr = (vaddr_t)__va(mfn_to_maddr(max_alloc_mfn));
>> +          mfn_to_maddr(min_alloc_mfn) <= __pa(page_addr);
>> +          page_addr -= PAGE_SIZE )
>> +    {
>> +        map_page_initial(lvl1, page_addr, __pa(page_addr), PTE_XEN_RW);
>>      }
>> +
>> +    if ( mfn_x(previous_max_alloc_mfn) != mfn_x(max_alloc_mfn) )
>> +        panic("Early page heap unexpectedly grew upwards\n");
>>  }
>
> Oh, yet another assumption on allocator behavior.

Agreed, it's not ideal, but the assumption is explicitly checked and the
worst case is a panic/failure to boot if it is ever broken. Also, related
to your previous comment, if we go forward with a rangeset/rangeset-like
data structure this assumption could be eliminated.
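With something like the range tracker sketched above in place, the
second loop in setup_initial_mapping could walk the recorded ranges
rather than relying on the heap growing downwards. Again only a rough
sketch building on the hypothetical boot_alloc_ranges/
nr_boot_alloc_ranges names; it glosses over the fact that
map_page_initial itself allocates pages, so in practice the walk would
need to be repeated until no new or grown ranges appear:

/*
 * Sketch: identity map every recorded allocation range. The outer loop
 * condition re-reads nr_boot_alloc_ranges, so ranges appended while
 * mapping are still visited; ranges that merely grew would need an
 * extra pass, which this sketch omits.
 */
unsigned int i;

for ( i = 0; i < nr_boot_alloc_ranges; i++ )
{
    unsigned long mfn;

    for ( mfn = boot_alloc_ranges[i].first_mfn;
          mfn <= boot_alloc_ranges[i].last_mfn; mfn++ )
    {
        paddr_t paddr = mfn_to_maddr(_mfn(mfn));

        map_page_initial(lvl1, (vaddr_t)__va(paddr), paddr, PTE_XEN_RW);
    }
}

That would also let the "heap grows downwards" check and its panic go
away entirely.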
>
> Jan

Thanks,
Shawn