
Re: [Xen-devel] [PATCH 2/2] xen/mm: Introduce PGC_state_uninitialised



> -----Original Message-----
> From: Xen-devel <xen-devel-bounces@xxxxxxxxxxxxxxxxxxxx> On Behalf Of David 
> Woodhouse
> Sent: 19 March 2020 21:22
> To: xen-devel@xxxxxxxxxxxxxxxxxxxx
> Cc: Stefano Stabellini <sstabellini@xxxxxxxxxx>; Julien Grall 
> <julien@xxxxxxx>; Wei Liu <wl@xxxxxxx>;
> Andrew Cooper <andrew.cooper3@xxxxxxxxxx>; Ian Jackson 
> <ian.jackson@xxxxxxxxxxxxx>; George Dunlap
> <george.dunlap@xxxxxxxxxx>; hongyxia@xxxxxxxxxx; Jan Beulich 
> <jbeulich@xxxxxxxx>; Volodymyr Babchuk
> <Volodymyr_Babchuk@xxxxxxxx>; Roger Pau Monné <roger.pau@xxxxxxxxxx>
> Subject: [Xen-devel] [PATCH 2/2] xen/mm: Introduce PGC_state_uninitialised
> 
> From: David Woodhouse <dwmw@xxxxxxxxxxxx>
> 
> It is possible for pages to enter general circulation without ever
> being processed by init_heap_pages().
> 
> For example, pages of the multiboot module containing the initramfs may
> be assigned via assign_pages() to dom0 as it is created. And some code,
> including map_pages_to_xen(), checks 'system_state' to determine
> whether to use the boot or the heap allocator, but it seems impossible
> to prove that pages allocated by the boot allocator are never
> subsequently freed with free_heap_pages().
> 
> This actually works fine in the majority of cases; there are only a few
> esoteric corner cases which init_heap_pages() handles before handing the
> page range off to free_heap_pages():
>  • Excluding MFN #0 to avoid inappropriate cross-zone merging.
>  • Ensuring that the node information structures exist, when the first
>    page(s) of a given node are handled.
>  • High order allocations crossing from one node to another.
> 
> To handle these cases, shift PGC_state_inuse from its current value of
> zero to another value, and use zero, which is the initial state of the
> entire frame table, as PGC_state_uninitialised.
> 
> Fix a couple of assertions which were assuming that PGC_state_inuse is
> zero, and make them cope with the PGC_state_uninitialised case too where
> appropriate.
> 
> Finally, make free_heap_pages() call through to init_heap_pages() when
> given a page range which has not been initialised. This cannot keep
> recursing because init_heap_pages() will set each page state to
> PGC_state_inuse before passing it back to free_heap_pages() for the
> second time.
> 
> Signed-off-by: David Woodhouse <dwmw@xxxxxxxxxxxx>
> ---
>  xen/arch/x86/mm.c        |  3 ++-
>  xen/common/page_alloc.c  | 44 +++++++++++++++++++++++++++++-----------
>  xen/include/asm-arm/mm.h |  3 ++-
>  xen/include/asm-x86/mm.h |  3 ++-
>  4 files changed, 38 insertions(+), 15 deletions(-)
> 
> diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
> index 62507ca651..5f0581c072 100644
> --- a/xen/arch/x86/mm.c
> +++ b/xen/arch/x86/mm.c
> @@ -491,7 +491,8 @@ void share_xen_page_with_guest(struct page_info *page, struct domain *d,
> 
>      page_set_owner(page, d);
>      smp_wmb(); /* install valid domain ptr before updating refcnt. */
> -    ASSERT((page->count_info & ~PGC_xen_heap) == 0);
> +    ASSERT((page->count_info & ~PGC_xen_heap) == PGC_state_inuse ||
> +           (page->count_info & ~PGC_xen_heap) == PGC_state_uninitialised);

Could the page state perhaps be bumped to inuse in this case? It seems odd to
leave the state uninitialised yet still succeed in sharing the page with a
guest.
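Something like this, perhaps (a rough, untested sketch using the macros this
patch introduces):

    /* Promote the page out of the uninitialised state before sharing,
     * so it can never reach free_heap_pages() still uninitialised. */
    if ( page_state_is(page, uninitialised) )
        page->count_info = (page->count_info & ~PGC_state) |
                           PGC_state_inuse;
    ASSERT((page->count_info & ~PGC_xen_heap) == PGC_state_inuse);

That would also let the assertion keep a single expected value.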

> 
>      /* Only add to the allocation list if the domain isn't dying. */
>      if ( !d->is_dying )
> diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
> index 8d72a64f4e..4f7971f2a1 100644
> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -252,6 +252,8 @@ struct bootmem_region {
>  static struct bootmem_region __initdata
>      bootmem_region_list[PAGE_SIZE / sizeof(struct bootmem_region)];
>  static unsigned int __initdata nr_bootmem_regions;
> +static void init_heap_pages(struct page_info *pg, unsigned long nr_pages,
> +                            bool scrub);
> 
>  struct scrub_region {
>      unsigned long offset;
> @@ -1390,6 +1392,17 @@ static void free_heap_pages(
>      ASSERT(order <= MAX_ORDER);
>      ASSERT(node >= 0);
> 
> +    if ( page_state_is(pg, uninitialised) )
> +    {
> +        init_heap_pages(pg, 1 << order, need_scrub);
> +        /*
> +         * init_heap_pages() will call back into free_heap_pages() for
> +         * each page but cannot keep recursing because each page will
> +         * be set to PGC_state_inuse first.
> +         */
> +        return;
> +    }
> +
>      spin_lock(&heap_lock);
> 
>      for ( i = 0; i < (1 << order); i++ )
> @@ -1771,11 +1784,10 @@ int query_page_offline(mfn_t mfn, uint32_t *status)
>   * latter is not on a MAX_ORDER boundary, then we reserve the page by
>   * not freeing it to the buddy allocator.
>   */
> -static void init_heap_pages(
> -    struct page_info *pg, unsigned long nr_pages)
> +static void init_heap_pages(struct page_info *pg, unsigned long nr_pages,
> +                            bool scrub)
>  {
>      unsigned long i;
> -    bool idle_scrub = false;
> 
>      /*
>       * Keep MFN 0 away from the buddy allocator to avoid crossing zone
> @@ -1800,7 +1812,7 @@ static void init_heap_pages(
>      spin_unlock(&heap_lock);
> 
>      if ( system_state < SYS_STATE_active && opt_bootscrub == BOOTSCRUB_IDLE )
> -        idle_scrub = true;
> +        scrub = true;
> 
>      for ( i = 0; i < nr_pages; i++ )
>      {
> @@ -1828,7 +1840,8 @@ static void init_heap_pages(
>              nr_pages -= n;
>          }
> 
> -        free_heap_pages(pg + i, 0, scrub_debug || idle_scrub);

Would it be worth an ASSERT(!pg[i].count_info) here in case something is added 
which erroneously modifies the page count info before this is done?
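Something like this, mirroring the lines below (sketch only):

    /* Nothing should have touched count_info while the page was still
     * PGC_state_uninitialised, i.e. it should still be all zeroes. */
    ASSERT(!pg[i].count_info);
    pg[i].count_info = PGC_state_inuse;
    free_heap_pages(pg + i, 0, scrub_debug || scrub);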

> +        pg[i].count_info = PGC_state_inuse;
> +        free_heap_pages(pg + i, 0, scrub_debug || scrub);
>      }
>  }
> 
> @@ -1864,7 +1877,7 @@ void __init end_boot_allocator(void)
>          if ( (r->s < r->e) &&
>               (phys_to_nid(pfn_to_paddr(r->s)) == cpu_to_node(0)) )
>          {
> -            init_heap_pages(mfn_to_page(_mfn(r->s)), r->e - r->s);
> +            init_heap_pages(mfn_to_page(_mfn(r->s)), r->e - r->s, false);
>              r->e = r->s;
>              break;
>          }
> @@ -1873,7 +1886,7 @@ void __init end_boot_allocator(void)
>      {
>          struct bootmem_region *r = &bootmem_region_list[i];
>          if ( r->s < r->e )
> -            init_heap_pages(mfn_to_page(_mfn(r->s)), r->e - r->s);
> +            init_heap_pages(mfn_to_page(_mfn(r->s)), r->e - r->s, false);
>      }
>      nr_bootmem_regions = 0;
> 
> @@ -2142,7 +2155,7 @@ void init_xenheap_pages(paddr_t ps, paddr_t pe)
> 
>      memguard_guard_range(maddr_to_virt(ps), pe - ps);
> 
> -    init_heap_pages(maddr_to_page(ps), (pe - ps) >> PAGE_SHIFT);
> +    init_heap_pages(maddr_to_page(ps), (pe - ps) >> PAGE_SHIFT, false);
>  }
> 
> 
> @@ -2251,7 +2264,7 @@ void init_domheap_pages(paddr_t ps, paddr_t pe)
>      if ( mfn_x(emfn) <= mfn_x(smfn) )
>          return;
> 
> -    init_heap_pages(mfn_to_page(smfn), mfn_x(emfn) - mfn_x(smfn));
> +    init_heap_pages(mfn_to_page(smfn), mfn_x(emfn) - mfn_x(smfn), false);
>  }
> 
> 
> @@ -2280,7 +2293,8 @@ int assign_pages(
> 
>          for ( i = 0; i < (1ul << order); i++ )
>          {
> -            ASSERT(!(pg[i].count_info & ~PGC_extra));
> +            ASSERT((pg[i].count_info & ~PGC_extra) == PGC_state_inuse ||
> +                   (pg[i].count_info & ~PGC_extra) == PGC_state_uninitialised);

Again, perhaps bump the state to inuse if it is uninitialised (sketch below)...

>              if ( pg[i].count_info & PGC_extra )
>                  extra_pages++;
>          }
> @@ -2316,10 +2330,16 @@ int assign_pages(
>      for ( i = 0; i < (1 << order); i++ )
>      {
>          ASSERT(page_get_owner(&pg[i]) == NULL);
> +        /*
> +         * Note: Not using page_state_is() here. The ASSERT requires that
> +         * all other bits in count_info are zero, in addition to PGC_state
> +         * being appropriate.
> +         */
> +        ASSERT((pg[i].count_info & ~PGC_extra) == PGC_state_inuse ||
> +               (pg[i].count_info & ~PGC_extra) == PGC_state_uninitialised);

...then this ASSERT can be tightened.
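E.g. (a rough, untested sketch; the promotion would go in the first loop
above, which then lets the assertion here insist on inuse only):

    /* First loop: promote any uninitialised page before counting it. */
    if ( page_state_is(&pg[i], uninitialised) )
        pg[i].count_info = (pg[i].count_info & ~PGC_state) |
                           PGC_state_inuse;

    /* Second loop: the tightened assertion. */
    ASSERT((pg[i].count_info & ~PGC_extra) == PGC_state_inuse);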

>          page_set_owner(&pg[i], d);
>          smp_wmb(); /* Domain pointer must be visible before updating refcnt. */
> -        pg[i].count_info =
> -            (pg[i].count_info & PGC_extra) | PGC_allocated | 1;
> +        pg[i].count_info = (pg[i].count_info & PGC_state) | PGC_allocated | 1;

The PGC_extra flag seems to have vapourised here.
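Presumably it still needs to be preserved alongside the state bits, i.e.
something like:

    pg[i].count_info = (pg[i].count_info & (PGC_state | PGC_extra)) |
                       PGC_allocated | 1;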

  Paul

>          page_list_add_tail(&pg[i], &d->page_list);
>      }
> 
> diff --git a/xen/include/asm-arm/mm.h b/xen/include/asm-arm/mm.h
> index a877791d1c..49663fa98a 100644
> --- a/xen/include/asm-arm/mm.h
> +++ b/xen/include/asm-arm/mm.h
> @@ -113,12 +113,13 @@ struct page_info
>    * { inuse, offlining, offlined, free, broken_offlining, broken }
>    */
>  #define PGC_state                  PG_mask(7, 9)
> -#define PGC_state_inuse            PG_mask(0, 9)
> +#define PGC_state_uninitialised    PG_mask(0, 9)
>  #define PGC_state_offlining        PG_mask(1, 9)
>  #define PGC_state_offlined         PG_mask(2, 9)
>  #define PGC_state_free             PG_mask(3, 9)
>  #define PGC_state_broken_offlining PG_mask(4, 9) /* Broken and offlining */
>  #define PGC_state_broken           PG_mask(5, 9) /* Broken and offlined */
> +#define PGC_state_inuse            PG_mask(6, 9)
> 
>  #define pgc_is(pgc, st)            (((pgc) & PGC_state) == PGC_state_##st)
>  #define page_state_is(pg, st)       pgc_is((pg)->count_info, st)
> diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
> index 1203f1b179..5fbbca5f05 100644
> --- a/xen/include/asm-x86/mm.h
> +++ b/xen/include/asm-x86/mm.h
> @@ -72,12 +72,13 @@
>    * { inuse, offlining, offlined, free, broken_offlining, broken }
>    */
>  #define PGC_state                  PG_mask(7, 9)
> -#define PGC_state_inuse            PG_mask(0, 9)
> +#define PGC_state_uninitialised    PG_mask(0, 9)
>  #define PGC_state_offlining        PG_mask(1, 9)
>  #define PGC_state_offlined         PG_mask(2, 9)
>  #define PGC_state_free             PG_mask(3, 9)
>  #define PGC_state_broken_offlining PG_mask(4, 9) /* Broken and offlining */
>  #define PGC_state_broken           PG_mask(5, 9) /* Broken and offlined */
> +#define PGC_state_inuse            PG_mask(6, 9)
> 
>  #define pgc_is(pgc, st)            (((pgc) & PGC_state) == PGC_state_##st)
>  #define page_state_is(pg, st)       pgc_is((pg)->count_info, st)
> --
> 2.21.0


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 

