
Re: [Xen-devel] [PATCH v4 1/8] mm: Place unscrubbed pages at the end of pagelist



>>> On 19.05.17 at 17:50, <boris.ostrovsky@xxxxxxxxxx> wrote:
> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -383,6 +383,8 @@ typedef struct page_list_head heap_by_zone_and_order_t[NR_ZONES][MAX_ORDER+1];
>  static heap_by_zone_and_order_t *_heap[MAX_NUMNODES];
>  #define heap(node, zone, order) ((*_heap[node])[zone][order])
>  
> +static unsigned long node_need_scrub[MAX_NUMNODES];

Just as a remark - I think it is inefficient to store per-node data
this way (the same applies to _heap[]); it would be better placed in
per-node memory. Since we don't have (and likely also don't
want to have) this_node() / per_node(), perhaps we could
(ab)use per-CPU data for this.
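
Purely to illustrate the direction I mean - a rough sketch, not a
drop-in change, and first_cpu_on_node() below is just a made-up
placeholder for whatever node-to-CPU lookup we'd actually use:

    static DEFINE_PER_CPU(unsigned long, node_need_scrub);

    static unsigned long *node_scrub_counter(unsigned int node)
    {
        /*
         * Per-CPU areas are allocated from memory local to the CPU, so
         * going through (an arbitrary) CPU of the node yields node-local
         * storage without introducing this_node() / per_node().
         */
        unsigned int cpu = first_cpu_on_node(node); /* hypothetical helper */

        return &per_cpu(node_need_scrub, cpu);
    }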

> @@ -798,11 +814,18 @@ static struct page_info *alloc_heap_pages(
>      return NULL;
>  
>   found: 
> +    need_scrub = (pg->u.free.first_dirty != INVALID_DIRTY_IDX);
> +
>      /* We may have to halve the chunk a number of times. */
>      while ( j != order )
>      {
> -        PFN_ORDER(pg) = --j;
> -        page_list_add_tail(pg, &heap(node, zone, j));
> +        /*
> +         * Some of the sub-chunks may be clean but we will mark them
> +         * as dirty (if need_scrub is set) to avoid traversing the
> +         * list here.
> +         */
> +        page_list_add_scrub(pg, node, zone, --j,
> +                            need_scrub ? 0 : INVALID_DIRTY_IDX);

I suppose this is going to be improved in subsequent patches,
and hence the comment will be gone by the end of the series?

> @@ -851,11 +874,20 @@ static int reserve_offlined_page(struct page_info *head)
>      int zone = page_to_zone(head), i, head_order = PFN_ORDER(head), count = 0;
>      struct page_info *cur_head;
>      int cur_order;
> +    bool need_scrub;
>  
>      ASSERT(spin_is_locked(&heap_lock));
>  
>      cur_head = head;
>  
> +    /*
> +     * We may break the buddy so let's mark the head as clean. Then, when
> +     * merging chunks back into the heap, we will see whether the chunk has
> +     * unscrubbed pages and set its first_dirty properly.
> +     */
> +    need_scrub = (head->u.free.first_dirty != INVALID_DIRTY_IDX);
> +    head->u.free.first_dirty = INVALID_DIRTY_IDX;
> +
>      page_list_del(head, &heap(node, zone, head_order));
>  
>      while ( cur_head < (head + (1 << head_order)) )
> @@ -873,6 +905,8 @@ static int reserve_offlined_page(struct page_info *head)
>  
>          while ( cur_order < head_order )
>          {
> +            unsigned int first_dirty = INVALID_DIRTY_IDX;
> +
>              next_order = cur_order + 1;
>  
>              if ( (cur_head + (1 << next_order)) >= (head + ( 1 << head_order)) )
> @@ -892,8 +926,20 @@ static int reserve_offlined_page(struct page_info *head)
>              {
>              merge:
>                  /* We don't consider merging outside the head_order. */
> -                page_list_add_tail(cur_head, &heap(node, zone, cur_order));
> -                PFN_ORDER(cur_head) = cur_order;
> +
> +                /* See if any of the pages indeed need scrubbing. */
> +                if ( need_scrub )
> +                {
> +                    for ( i = 0; i < (1 << cur_order); i++ )
> +                        if ( test_bit(_PGC_need_scrub,
> +                                      &cur_head[i].count_info) )
> +                        {
> +                            first_dirty = i;
> +                            break;
> +                        }
> +                }

This is inefficient - at the point you set need_scrub you could
instead latch the first page needing to be scrubbed, and start
the loop only there (and in some cases you would get away
without entering the loop altogether).
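
Roughly what I have in mind - only a sketch against the quoted code,
assuming head->u.free.first_dirty gets latched into a local (say
head_first_dirty) at the point where need_scrub is currently set, before
the field is reset:

    if ( head_first_dirty != INVALID_DIRTY_IDX )
    {
        unsigned long offset = cur_head - head;

        /*
         * Everything below head_first_dirty is known to be clean, so the
         * scan can start there, or be skipped altogether when the whole
         * chunk lies below it.
         */
        if ( offset + (1UL << cur_order) > head_first_dirty )
            for ( i = (head_first_dirty > offset)
                      ? head_first_dirty - offset : 0;
                  i < (1 << cur_order); i++ )
                if ( test_bit(_PGC_need_scrub, &cur_head[i].count_info) )
                {
                    first_dirty = i;
                    break;
                }
    }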

> @@ -919,9 +965,52 @@ static int reserve_offlined_page(struct page_info *head)
>      return count;
>  }
>  
> +static void scrub_free_pages(unsigned int node)
> +{
> +    struct page_info *pg;
> +    unsigned int zone;
> +
> +    ASSERT(spin_is_locked(&heap_lock));
> +
> +    if ( !node_need_scrub[node] )
> +        return;
> +
> +    for ( zone = 0; zone < NR_ZONES; zone++ )
> +    {
> +        unsigned int order = MAX_ORDER;
> +        do {

Blank line between declaration(s) and statement(s) please.

Also, considering that this entire function runs with the heap lock
held, I think we really need to split that lock into per-node ones.
But of course, as with the NUMA locality remark further up, this isn't
necessarily a request to do so in this series.
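
Just to make the direction explicit (a minimal sketch only, with the
naming picked arbitrarily):

    /* One lock per node rather than the single global heap_lock. */
    static spinlock_t node_heap_lock[MAX_NUMNODES];

scrub_free_pages() - and the allocation/free paths touching only that
node - would then take node_heap_lock[node] instead of serializing
everything on the global lock.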

> @@ -977,35 +1076,54 @@ static void free_heap_pages(
>  
>          if ( (page_to_mfn(pg) & mask) )
>          {
> +            struct page_info *predecessor = pg - mask;
> +
>              /* Merge with predecessor block? */
> -            if ( !mfn_valid(_mfn(page_to_mfn(pg-mask))) ||
> -                 !page_state_is(pg-mask, free) ||
> -                 (PFN_ORDER(pg-mask) != order) ||
> -                 (phys_to_nid(page_to_maddr(pg-mask)) != node) )
> +            if ( !mfn_valid(_mfn(page_to_mfn(predecessor))) ||
> +                 !page_state_is(predecessor, free) ||
> +                 (PFN_ORDER(predecessor) != order) ||
> +                 (phys_to_nid(page_to_maddr(predecessor)) != node) )
>                  break;
> -            pg -= mask;
> -            page_list_del(pg, &heap(node, zone, order));
> +
> +            page_list_del(predecessor, &heap(node, zone, order));
> +
> +            if ( predecessor->u.free.first_dirty != INVALID_DIRTY_IDX )
> +                need_scrub = true;
> +                /* ... and keep predecessor's first_dirty. */
> +            else if ( pg->u.free.first_dirty != INVALID_DIRTY_IDX )
> +                predecessor->u.free.first_dirty = (1U << order) +
> +                                                  pg->u.free.first_dirty;
> +
> +            pg->u.free.first_dirty = INVALID_DIRTY_IDX;

Is this really needed?

> +            pg = predecessor;
>          }
>          else
>          {
> +            struct page_info *successor = pg + mask;
> +
>              /* Merge with successor block? */
> -            if ( !mfn_valid(_mfn(page_to_mfn(pg+mask))) ||
> -                 !page_state_is(pg+mask, free) ||
> -                 (PFN_ORDER(pg+mask) != order) ||
> -                 (phys_to_nid(page_to_maddr(pg+mask)) != node) )
> +            if ( !mfn_valid(_mfn(page_to_mfn(successor))) ||
> +                 !page_state_is(successor, free) ||
> +                 (PFN_ORDER(successor) != order) ||
> +                 (phys_to_nid(page_to_maddr(successor)) != node) )
>                  break;
> -            page_list_del(pg + mask, &heap(node, zone, order));
> +            page_list_del(successor, &heap(node, zone, order));
> +
> +            need_scrub |= (successor->u.free.first_dirty != INVALID_DIRTY_IDX);

|= on a boolean is slightly odd - perhaps better use if(), just like you
do on the other path?
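
I.e. simply

    if ( successor->u.free.first_dirty != INVALID_DIRTY_IDX )
        need_scrub = true;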

> +            successor->u.free.first_dirty = INVALID_DIRTY_IDX;

Same here - is this really needed?

> --- a/xen/include/asm-x86/mm.h
> +++ b/xen/include/asm-x86/mm.h
> @@ -87,6 +87,9 @@ struct page_info
>  
>          /* Page is on a free list: ((count_info & PGC_count_mask) == 0). */
>          struct {
> +            /* Index of the first *possibly* unscrubbed page in the buddy. */
> +#define INVALID_DIRTY_IDX -1U

This needs parentheses, and the value would be better expressed as either ~0U or UINT_MAX.
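
E.g.

    #define INVALID_DIRTY_IDX (~0U)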

> +            unsigned int first_dirty;

On x86 this change is fine at present, albeit not optimal. Its ARM
equivalent, however, grows struct page_info in the 32-bit case,
which I don't think is wanted or needed. You really only need
MAX_ORDER+1 bits here, so I'd suggest making this a bit field
(also on x86, to at once make obvious how many bits remain
unused), and perhaps making need_tlbflush a single bit at once
(unless its address is being taken somewhere).
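
For illustration only, something along these lines (types and placement
just indicative; the sentinel then of course has to fit the narrower
field, so plain ~0U / UINT_MAX wouldn't work anymore):

        struct {
            /* Do TLBs need flushing for safety before next page use? */
            bool need_tlbflush:1;

            /* Index of the first *possibly* unscrubbed page in the buddy. */
#define INVALID_DIRTY_IDX ((1U << (MAX_ORDER + 1)) - 1)
            unsigned int first_dirty:MAX_ORDER + 1;
        } free;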

Jan
