[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest



On 27/08/13 09:52, Elena Ufimtseva wrote:
> Uses subop hypercall to request XEN about vnuma topology.
> Sets the memory blocks (aligned by XEN), cpus, distance table
> on boot. NUMA support should be compiled in kernel.
> 
> NUMA support should be compiled in kernel.
> 
> Memory blocks are aligned by xen. Xen is aware of guest initial
> RAM layout.
> If the memory blocks are incorrect, call for default linux numa
> dummy init.
> 
> Requires XEN with vNUMA support.
> 
> Applies to 3.10 git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen.git
> latest commit: 8bb495e3f02401ee6f76d1b1d77f3ac9f079e376
> 
> TODO:
> Change subop from XENMEM to SYSCTL.

XENMEM subop seems right here.

> Signed-off-by: Elena Ufimtseva <ufimtseva@xxxxxxxxx>
> ---
>  arch/x86/include/asm/xen/vnuma-xen.h |   36 +++++++++++
>  arch/x86/mm/numa.c                   |    8 +++
>  arch/x86/xen/enlighten.c             |  115 
> ++++++++++++++++++++++++++++++++++
>  include/xen/interface/memory.h       |    1 +
>  4 files changed, 160 insertions(+)
>  create mode 100644 arch/x86/include/asm/xen/vnuma-xen.h
> 
> diff --git a/arch/x86/include/asm/xen/vnuma-xen.h 
> b/arch/x86/include/asm/xen/vnuma-xen.h
> new file mode 100644
> index 0000000..73c1bde
> --- /dev/null
> +++ b/arch/x86/include/asm/xen/vnuma-xen.h
> @@ -0,0 +1,36 @@
> +#ifndef _ASM_X86_VNUMA_XEN_H
> +#define _ASM_X86_VNUMA_XEN_H
> +
> +#ifdef CONFIG_XEN
> +int __initdata xen_numa_init(void);
> +
> +struct vnuma_memblk {
> +               uint64_t start, end;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(vnuma_memblk);
> +
> +struct vnuma_topology_info {
> +     domid_t domid;
> +     uint16_t nr_nodes;
> +     GUEST_HANDLE(vnuma_memblk) vmemblk;
> +     GUEST_HANDLE(int) vdistance;
> +     GUEST_HANDLE(int) cpu_to_node;
> +     GUEST_HANDLE(int) vnode_to_pnode;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(vnuma_topology_info);
> +
> +struct xen_domctl {
> +     uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */
> +     domid_t  domain;
> +     union {
> +             struct vnuma_topology_info vnuma;
> +             } u;
> +};
> +typedef struct xen_domctl xen_domctl_t;
> +DEFINE_GUEST_HANDLE_STRUCT(xen_domctl_t);

Most of this is the public interface provided by Xen, this should be in
include/xen/interface/memory.h. And only the xen_numa_init() prototype
in this header.

> +
> +#else
> +int __init xen_numa_init(void) {}
> +#endif
> +
> +#endif /* _ASM_X86_VNUMA_XEN_H */
> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> index 8bf93ba..3ec7855 100644
> --- a/arch/x86/mm/numa.c
> +++ b/arch/x86/mm/numa.c
> @@ -20,6 +20,10 @@

#include <asm/xen/vnuma.h>

here.

>  #include "numa_internal.h"
>  
> +#ifdef CONFIG_XEN
> +#include "asm/xen/vnuma-xen.h"
> +#endif

Instead of this.

> +
>  int __initdata numa_off;
>  nodemask_t numa_nodes_parsed __initdata;
>  
> @@ -621,6 +625,10 @@ static int __init dummy_numa_init(void)
>  void __init x86_numa_init(void)
>  {
>       if (!numa_off) {
> +#ifdef CONFIG_XEN
> +             if (xen_pv_domain() && !numa_init(xen_numa_init))
> +                     return;
> +#endif

Move the test for xen_pv_domain() into xen_numa_init.

>  #ifdef CONFIG_X86_NUMAQ
>               if (!numa_init(numaq_numa_init))
>                       return;
> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
> index 193097e..4658e9d 100644
> --- a/arch/x86/xen/enlighten.c
> +++ b/arch/x86/xen/enlighten.c
> @@ -33,6 +33,7 @@
>  #include <linux/memblock.h>
>  #include <linux/edd.h>
>  
> +#include <asm/numa.h>
>  #include <xen/xen.h>
>  #include <xen/events.h>
>  #include <xen/interface/xen.h>
> @@ -69,6 +70,7 @@
>  #include <asm/mwait.h>
>  #include <asm/pci_x86.h>
>  #include <asm/pat.h>
> +#include <asm/xen/vnuma-xen.h>
>  
>  #ifdef CONFIG_ACPI
>  #include <linux/acpi.h>
> @@ -1750,4 +1752,117 @@ const struct hypervisor_x86 x86_hyper_xen_hvm 
> __refconst = {
>       .x2apic_available       = xen_x2apic_para_available,
>  };
>  EXPORT_SYMBOL(x86_hyper_xen_hvm);
> +
> +/* Xen PV NUMA topology initialization */
> +int __initdata xen_numa_init(void)

This function needs to more clearly separate the different parts.  This
could be by splitting it into several functions or by improving the
comments and spacing.

I'd also move it to a new file: arch/x86/xen/vnuma.c.

> +{
> +     struct vnuma_memblk *vblk;
> +     struct vnuma_topology_info numa_topo;
> +     uint64_t size, start, end;
> +     int i, j, cpu, rc;
> +     u64 phys, physd, physc;
> +     u64 new_end_pfn;
> +     size_t mem_size;
> +
> +     int *vdistance, *cpu_to_node;
> +     unsigned long dist_size, cpu_to_node_size;
> +     numa_topo.domid = DOMID_SELF;
> +     rc = -EINVAL;
> +     /* block mem reservation for memblks */
> +     mem_size = num_possible_cpus() * sizeof(struct vnuma_memblk);
> +     phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), mem_size, 
> PAGE_SIZE);
> +     if (!phys) {
> +             pr_info("vNUMA: can't allocate memory for membloks size %zu\n", 
> mem_size);
> +             rc = -ENOMEM;
> +             goto errout;
> +     }
> +     if (memblock_reserve(phys, mem_size) < 0) {
> +             pr_info("vNUMA: Cannot reserver mem for blocks of size %zu \n", 
> mem_size);
> +             rc = -ENOMEM;
> +             goto errout;
> +     }

Does mem_block_alloc() work here instead of the find_in_range/reserve
combo?  Similarly for the other allocations.

> +     vblk = __va(phys);
> +     set_xen_guest_handle(numa_topo.vmemblk, vblk);

Please group all the memory allocations followed by grouping the
initialization of the hypercall arguments.

> +     if (HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &numa_topo) < 0)
> +             goto errout;

ret = HYPERVISOR_memory_op(...);
if (ret < 0)
     goto out;

> +     if (numa_topo.nr_nodes == 0)
> +             /* Pass to DUMMY numa */
> +             goto errout;
> +     if (numa_topo.nr_nodes > num_possible_cpus()) {
> +             pr_info("vNUMA: Node without cpu is not supported, nodes = 
> %d\n", numa_topo.nr_nodes);
> +             goto errout;
> +     }

Is this a constraint of the Linux NUMA support?

> +     new_end_pfn = 0;
> +     /* We have sizes in pages received from hypervisor, no holes and 
> ordered */

How does this play with the relocate of frames done for dom0 during
initial setup?

> +     for (i = 0; i < numa_topo.nr_nodes; i++) {
> +             start = vblk[i].start;
> +             end = vblk[i].end;
> +             size = end - start;
> +             pr_info("VNMA blk[%d] size is %#010Lx start = %#010Lx end = 
> %#010Lx\n",
> +                     i, size, start, end);
> +             if (numa_add_memblk(i, start, end)) {
> +                     pr_info("vNUMA: Could not add memblk[%d]\n", i);
> +                     rc = -EAGAIN;
> +                     goto errout;
> +             }
> +             node_set(i, numa_nodes_parsed);
> +     }
> +     setup_nr_node_ids();
> +     /* Setting the cpu, apicid to node */
> +     for_each_cpu(cpu, cpu_possible_mask) {
> +             pr_info("Setting up vcpu[%d] to node[%d]\n", cpu, 
> cpu_to_node[cpu]);
> +             set_apicid_to_node(cpu, cpu_to_node[cpu]);
> +             numa_set_node(cpu, cpu_to_node[cpu]);
> +             __apicid_to_node[cpu] = cpu_to_node[cpu];
> +             cpumask_set_cpu(cpu, node_to_cpumask_map[cpu_to_node[cpu]]);
> +     }
> +     setup_nr_node_ids();
> +     rc = 0;
> +     for (i = 0; i < numa_topo.nr_nodes; i++)

Style: {} around this multi-line body.

> +             for (j = 0; j < numa_topo.nr_nodes; j++) {
> +                     numa_set_distance(i, j, *(vdistance + 
> j*numa_topo.nr_nodes + i));
> +                     pr_info("distance %d <-> %d = %d\n", i, j, *(vdistance 
> + j*numa_topo.nr_nodes + i));
> +             }
> +errout:

This isn't (always) an error path, name this label "out".

> +     if (phys)
> +             memblock_free(__pa(phys), mem_size);
> +     if (physd)
> +             memblock_free(__pa(physd), dist_size);
> +     if (physc)
> +             memblock_free(__pa(physc), cpu_to_node_size);
> +     if (rc)
> +             pr_debug("XEN: hypercall failed to get vNUMA topology.\n");

Remove this debug print, you already have prints for specific failures.

> +     return rc;
> +}
> +
>  #endif

David

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.