[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] Re: [PATCH R3 7/7] xen/balloon: Memory hotplug support for Xen balloon driver



On Thu, Feb 03, 2011 at 05:30:33PM +0100, Daniel Kiper wrote:
> Features and fixes:
>   - new version of memory hotplug patch which supports
>     among others memory allocation policies during errors
>     (try until success or stop at first error),
>   - this version of patch was tested with tmem
>     (selfballooning and frontswap) and works
>     very well with it,
>   - some other minor fixes.
> 
> Signed-off-by: Daniel Kiper <dkiper@xxxxxxxxxxxx>
> ---
>  drivers/xen/Kconfig   |   10 ++
>  drivers/xen/balloon.c |  231 
> ++++++++++++++++++++++++++++++++++++++++++++++---
>  2 files changed, 230 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
> index 07bec09..8f880aa 100644
> --- a/drivers/xen/Kconfig
> +++ b/drivers/xen/Kconfig
> @@ -9,6 +9,16 @@ config XEN_BALLOON
>         the system to expand the domain's memory allocation, or alternatively
>         return unneeded memory to the system.
>  
> +config XEN_BALLOON_MEMORY_HOTPLUG
> +     bool "Memory hotplug support for Xen balloon driver"
> +     default n
> +     depends on XEN_BALLOON && MEMORY_HOTPLUG
> +     help
> +       Memory hotplug support for Xen balloon driver allows expanding memory
> +       available for the system above limit declared at system startup.
> +       It is very useful on critical systems which require long
> +       run without rebooting.
> +
>  config XEN_SCRUB_PAGES
>       bool "Scrub pages before returning them to system"
>       depends on XEN_BALLOON
> diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
> index b1e199c..e43e928 100644
> --- a/drivers/xen/balloon.c
> +++ b/drivers/xen/balloon.c
> @@ -6,6 +6,12 @@
>   * Copyright (c) 2003, B Dragovic
>   * Copyright (c) 2003-2004, M Williamson, K Fraser
>   * Copyright (c) 2005 Dan M. Smith, IBM Corporation
> + * Copyright (c) 2010 Daniel Kiper
> + *
> + * Memory hotplug support was written by Daniel Kiper. Work on
> + * it was sponsored by Google under Google Summer of Code 2010
> + * program. Jeremy Fitzhardinge from Xen.org was the mentor for
> + * this project.
>   *
>   * This program is free software; you can redistribute it and/or
>   * modify it under the terms of the GNU General Public License version 2
> @@ -44,6 +50,7 @@
>  #include <linux/list.h>
>  #include <linux/sysdev.h>
>  #include <linux/gfp.h>
> +#include <linux/memory.h>
>  
>  #include <asm/page.h>
>  #include <asm/pgalloc.h>
> @@ -80,6 +87,9 @@ enum bp_state {
>       BP_HUNGRY
>  };
>  
> +#define MH_POLICY_TRY_UNTIL_SUCCESS  0
> +#define MH_POLICY_STOP_AT_FIRST_ERROR        1

Cool.

> +
>  struct balloon_stats {
>       /* We aim for 'current allocation' == 'target allocation'. */
>       unsigned long current_pages;
> @@ -89,6 +99,10 @@ struct balloon_stats {
>       unsigned long balloon_high;
>       unsigned long schedule_delay;
>       unsigned long max_schedule_delay;
> +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
> +     unsigned long boot_max_pfn;
> +     unsigned long mh_policy;
> +#endif
>  };
>  
>  static DEFINE_MUTEX(balloon_mutex);
> @@ -206,18 +220,199 @@ static void update_schedule_delay(enum bp_state state)
>       balloon_stats.schedule_delay = new_schedule_delay;
>  }
>  
> -static unsigned long current_target(void)
> +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
> +static struct resource *allocate_memory_resource(unsigned long nr_pages)
> +{
> +     resource_size_t r_min, r_size;
> +     struct resource *r;
> +
> +     /*
> +      * Look for first unused memory region starting at page
> +      * boundary. Skip last memory section created at boot time
> +      * because it may contains unused memory pages with PG_reserved
> +      * bit not set (online_pages() require PG_reserved bit set).
> +      */
> +
> +     r = kzalloc(sizeof(struct resource), GFP_KERNEL);
> +
> +     if (!r)
> +             return NULL;
> +
> +     r->name = "System RAM";
> +     r->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
> +     r_min = 
> PFN_PHYS(section_nr_to_pfn(pfn_to_section_nr(balloon_stats.boot_max_pfn) + 
> 1));
> +     r_size = nr_pages << PAGE_SHIFT;
> +
> +     if (allocate_resource(&iomem_resource, r, r_size, r_min,
> +                                     ULONG_MAX, PAGE_SIZE, NULL, NULL) < 0) {
> +             kfree(r);
> +             return NULL;
> +     }
> +
> +     return r;
> +}
> +
> +static struct resource *adjust_memory_resource(struct resource *r, unsigned 
> long nr_pages)
> +{
> +     int rc;
> +
> +     if (r->end + 1 - (nr_pages << PAGE_SHIFT) == r->start) {

Will this actually occur? Say I called 'allocate_additional_memory' with 512
and got -ENOMEM (so the hypercall failed completely). The mh->policy is
MH_POLICY_STOP_AT_FIRST_ERROR. So we end up here. Assume the r_min is
0x100000000, then r->start is 0x100000000 and r->end is 0x100200000.

So:
  0x100200001 - (0x200000) == 0x100000000 ?


> +             rc = release_resource(r);
> +             BUG_ON(rc < 0);
> +             kfree(r);
> +             return NULL;
> +     }
> +
> +     rc = adjust_resource(r, r->start, r->end + 1 - r->start -
> +                             (nr_pages << PAGE_SHIFT));

If we wanted 512 pages, and only got 256 and want to adjust the region, we
would want it to be:
0x100000000 -> 0x100100000 right?

So with the third argument that comes out to be:

0x100200000 + 1 - 0x100000000 - (0x100000) = 0x100001

which is just one byte above what we requested?

> +
> +     BUG_ON(rc < 0);

Can we just do WARN_ON, and return NULL instead (and also release the resource)?
> +
> +     return r;
> +}
> +
> +static int allocate_additional_memory(struct resource *r, unsigned long 
> nr_pages)
> +{
> +     int rc;
> +     struct xen_memory_reservation reservation = {
> +             .address_bits = 0,
> +             .extent_order = 0,
> +             .domid        = DOMID_SELF
> +     };
> +     unsigned long flags, i, pfn, pfn_start;
> +
> +     if (!nr_pages)
> +             return 0;
> +
> +     pfn_start = PFN_UP(r->end) - nr_pages;
> +
> +     if (nr_pages > ARRAY_SIZE(frame_list))
> +             nr_pages = ARRAY_SIZE(frame_list);
> +
> +     for (i = 0, pfn = pfn_start; i < nr_pages; ++i, ++pfn)
> +             frame_list[i] = pfn;
> +
> +     set_xen_guest_handle(reservation.extent_start, frame_list);
> +     reservation.nr_extents = nr_pages;
> +
> +     spin_lock_irqsave(&xen_reservation_lock, flags);
> +
> +     rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
> +
> +     if (rc <= 0)
> +             return (rc < 0) ? rc : -ENOMEM;
> +

So if we populated some of them (say we wanted 512, but only did 64),
don't we want to do the loop below?  Also you look to be forgetting to
do a spin_unlock_irqrestore if you quit here.

> +     for (i = 0, pfn = pfn_start; i < rc; ++i, ++pfn) {
> +             BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) &&
> +                    phys_to_machine_mapping_valid(pfn));
> +             set_phys_to_machine(pfn, frame_list[i]);
> +     }
> +
> +     spin_unlock_irqrestore(&xen_reservation_lock, flags);
> +
> +     return rc;
> +}
> +
> +static void hotplug_allocated_memory(struct resource *r)
>  {
> -     unsigned long target = balloon_stats.target_pages;
> +     int nid, rc;
> +     resource_size_t r_size;
> +     struct memory_block *mem;
> +     unsigned long pfn;
> +
> +     r_size = r->end + 1 - r->start;

Why bump it by one byte?

> +     nid = memory_add_physaddr_to_nid(r->start);
> +
> +     rc = add_registered_memory(nid, r->start, r_size);
> +
> +     if (rc) {
> +             pr_err("%s: add_registered_memory: Memory hotplug failed: %i\n",
> +                     __func__, rc);
> +             balloon_stats.target_pages = balloon_stats.current_pages;
> +             return;
> +     }
> +
> +     if (xen_pv_domain())
> +             for (pfn = PFN_DOWN(r->start); pfn < PFN_UP(r->end); ++pfn)

I think you want the r->start to be PFN_UP'd just in case the r->start is not
page aligned. Though I am not sure it would even happen anymore, as M A Young
found the culprit that made it possible for us to set up memory regions
non-aligned and that is fixed now (in 2.6.38).

> +                     if (!PageHighMem(pfn_to_page(pfn))) {
> +                             rc = HYPERVISOR_update_va_mapping(
> +                                     (unsigned long)__va(pfn << PAGE_SHIFT),
> +                                     mfn_pte(pfn_to_mfn(pfn), PAGE_KERNEL), 
> 0);
> +                             BUG_ON(rc);
> +                     }
>  
> -     target = min(target,
> -                  balloon_stats.current_pages +
> -                  balloon_stats.balloon_low +
> -                  balloon_stats.balloon_high);
> +     rc = online_pages(PFN_DOWN(r->start), r_size >> PAGE_SHIFT);
>  
> -     return target;
> +     if (rc) {
> +             pr_err("%s: online_pages: Failed: %i\n", __func__, rc);
> +             balloon_stats.target_pages = balloon_stats.current_pages;
> +             return;
> +     }
> +
> +     for (pfn = PFN_DOWN(r->start); pfn < PFN_UP(r->end); pfn += 
> PAGES_PER_SECTION) {

Ditto. Can you do PFN_UP(r->start)?

> +             mem = find_memory_block(__pfn_to_section(pfn));
> +             BUG_ON(!mem);
> +             BUG_ON(!present_section_nr(mem->phys_index));
> +             mutex_lock(&mem->state_mutex);
> +             mem->state = MEM_ONLINE;
> +             mutex_unlock(&mem->state_mutex);
> +     }
> +
> +     balloon_stats.current_pages += r_size >> PAGE_SHIFT;
>  }
>  
> +static enum bp_state request_additional_memory(long credit)
> +{
> +     int rc;
> +     static struct resource *r;
> +     static unsigned long pages_left;
> +
> +     if ((credit <= 0 || balloon_stats.balloon_low ||
> +                             balloon_stats.balloon_high) && !r)
> +             return BP_DONE;
> +
> +     if (!r) {
> +             r = allocate_memory_resource(credit);
> +
> +             if (!r)
> +                     return BP_ERROR;
> +
> +             pages_left = credit;
> +     }
> +
> +     rc = allocate_additional_memory(r, pages_left);
> +
> +     if (rc < 0) {
> +             if (balloon_stats.mh_policy == MH_POLICY_TRY_UNTIL_SUCCESS)
> +                     return BP_ERROR;
> +
> +             r = adjust_memory_resource(r, pages_left);
> +
> +             if (!r)
> +                     return BP_ERROR;

Say we failed the hypercall completely and got -ENOMEM from
'allocate_additional_memory'.
I presume the adjust_memory_resource at this point would have deleted 'r',
which means
that (!r) and we return BP_ERROR.

But that means that we aren't following the MH_POLICY_STOP_AT_FIRST_ERROR as
the balloon_process will retry again and again, and again??

> +     } else {
> +             pages_left -= rc;
> +

So say I request 512 pages (mh_policy is MH_POLICY_STOP_AT_FIRST_ERROR),
but only got 256. I adjust the pages_left to be 256 and then
> +             if (pages_left)
> +                     return BP_HUNGRY;

we return BP_HUNGRY. That makes 'balloon_process' retry with 512 pages, and we
keep on trying and call "allocate_additional_memory", which fails once more
(returns 256), and we end up returning BP_HUNGRY, and retry... and so on.

Would it make sense to have a check here for the
MH_POLICY_STOP_AT_FIRST_ERROR
and if so call adjust_memory_resource as well?

> +     }
> +
> +     hotplug_allocated_memory(r);
> +
> +     r = NULL;
> +
> +     return BP_DONE;
> +}
> +#else
> +static enum bp_state request_additional_memory(long credit)
> +{
> +     if (balloon_stats.balloon_low && balloon_stats.balloon_high &&
> +                     balloon_stats.target_pages > 
> balloon_stats.current_pages)
> +             balloon_stats.target_pages = balloon_stats.current_pages;
> +     return BP_DONE;
> +}
> +#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */
> +
>  static enum bp_state increase_reservation(unsigned long nr_pages)
>  {
>       enum bp_state state = BP_DONE;
> @@ -352,15 +547,17 @@ static enum bp_state decrease_reservation(unsigned long 
> nr_pages)
>   */
>  static void balloon_process(struct work_struct *work)
>  {
> -     enum bp_state rc, state = BP_DONE;
> +     enum bp_state rc, state;
>       long credit;
>  
>       mutex_lock(&balloon_mutex);
>  
>       do {
> -             credit = current_target() - balloon_stats.current_pages;
> +             credit = balloon_stats.target_pages - 
> balloon_stats.current_pages;
>  
> -             if (credit > 0) {
> +             state = request_additional_memory(credit);
> +
> +             if (credit > 0 && state == BP_DONE) {
>                       rc = increase_reservation(credit);
>                       state = (rc == BP_ERROR) ? BP_ERROR : state;
>               }
> @@ -450,6 +647,11 @@ static int __init balloon_init(void)
>       balloon_stats.schedule_delay = 1;
>       balloon_stats.max_schedule_delay = 32;
>  
> +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
> +     balloon_stats.boot_max_pfn = max_pfn;
> +     balloon_stats.mh_policy = MH_POLICY_STOP_AT_FIRST_ERROR;
> +#endif
> +
>       register_balloon(&balloon_sysdev);
>  
>       /*
> @@ -506,6 +708,10 @@ BALLOON_SHOW(high_kb, "%lu\n", 
> PAGES2KB(balloon_stats.balloon_high));
>  static SYSDEV_ULONG_ATTR(schedule_delay, 0644, balloon_stats.schedule_delay);
>  static SYSDEV_ULONG_ATTR(max_schedule_delay, 0644, 
> balloon_stats.max_schedule_delay);
>  
> +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
> +static SYSDEV_ULONG_ATTR(memory_hotplug_policy, 0644, 
> balloon_stats.mh_policy);
> +#endif
> +
>  static ssize_t show_target_kb(struct sys_device *dev, struct 
> sysdev_attribute *attr,
>                             char *buf)
>  {
> @@ -568,7 +774,10 @@ static struct sysdev_attribute *balloon_attrs[] = {
>       &attr_target_kb,
>       &attr_target,
>       &attr_schedule_delay.attr,
> -     &attr_max_schedule_delay.attr
> +     &attr_max_schedule_delay.attr,
> +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
> +     &attr_memory_hotplug_policy.attr
> +#endif
>  };
>  
>  static struct attribute *balloon_info_attrs[] = {
> -- 
> 1.5.6.5
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.