
Re: [Xen-devel] [PATCH RFC v2 1/2] linux/vnuma: vNUMA for PV domu guest



On 17/09/13 09:34, Elena Ufimtseva wrote:
> Requests NUMA topology info from Xen by issuing a subop
> hypercall. Initializes NUMA nodes, sets the number of CPUs,
> the distance table and the NUMA node memory ranges during boot.
> The vNUMA topology is defined by the user in the VM config file.
> Memory ranges are represented by the structure vnuma_topology_info,
> where the start and end of each memory area are given as guest
> pfn numbers, constructed and aligned according to the
> e820 domain map.
> If the received structure has errors, fall back to
> dummy numa init.
> Requires Xen with the patches from the vnuma patchset applied;
> 
> Changes since v1:
> - moved the test for xen_pv_domain() into xen_numa_init;
> - replaced memory block search/allocation by single memblock_alloc;
> - moved xen_numa_init to vnuma.c from enlighten.c;
> - moved memblock structure to public interface memory.h;
> - specified signedness of vnuma topology structure members;
> - removed excessive debug output;
> 
> TODO:
> - consider common interface for Dom0, HVM and PV guests to provide
> vNUMA topology;
> - dynamic numa balancing at the time of this patch (kernel 3.11,
> commit 6e4664525b1db28f8c4e1130957f70a94c19213e, with boot parameter
> numa_balancing=true, which is the default) crashes a numa-enabled
> guest. Investigate further.

> --- a/arch/x86/mm/numa.c
> +++ b/arch/x86/mm/numa.c
> @@ -19,6 +19,7 @@
>  #include <asm/amd_nb.h>

#include <asm/xen/vnuma.h> here...

>  #include "numa_internal.h"
> +#include "asm/xen/vnuma.h"

... not here.

> --- /dev/null
> +++ b/arch/x86/xen/vnuma.c
> @@ -0,0 +1,92 @@
> +#include <linux/err.h>
> +#include <linux/memblock.h>
> +#include <xen/interface/xen.h>
> +#include <xen/interface/memory.h>
> +#include <asm/xen/interface.h>
> +#include <asm/xen/hypercall.h>
> +#include <asm/xen/vnuma.h>
> +#ifdef CONFIG_NUMA
> +/* Xen PV NUMA topology initialization */
> +static unsigned int xen_vnuma_init = 0;
> +int xen_vnuma_support()
> +{
> +     return xen_vnuma_init;
> +}

I'm not sure how this and the usage in the next patch actually work.
xen_vnuma_init is only set inside xen_numa_init(), which runs after the
test of numa_off, so anything that checks it before that point will
still see 0.
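To make the ordering concrete (the call site shown here is hypothetical,
not quoted from patch 2/2):

	check xen_vnuma_support()	/* xen_vnuma_init is still 0 here */
	...
	x86_numa_init()
	  -> xen_numa_init()		/* xen_vnuma_init only becomes 1 here */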

> +int __init xen_numa_init(void)
> +{
> +     int rc;
> +     unsigned int i, j, cpu, idx, pcpus;
> +     u64 phys, physd, physc;
> +     unsigned int *vdistance, *cpu_to_node;
> +     unsigned long mem_size, dist_size, cpu_to_node_size;
> +     struct vnuma_memarea *varea;
> +
> +     struct vnuma_topology_info numa_topo = {
> +             .domid = DOMID_SELF
> +     };
> +     rc = -EINVAL;
> +     if (!xen_pv_domain())
> +             return rc;
> +     pcpus = num_possible_cpus();
> +     mem_size =  pcpus * sizeof(struct vnuma_memarea);
> +     dist_size = pcpus * pcpus * sizeof(*numa_topo.vdistance);
> +     cpu_to_node_size = pcpus * sizeof(*numa_topo.cpu_to_node);
> +     phys = memblock_alloc(mem_size, PAGE_SIZE);
> +     physd = memblock_alloc(dist_size, PAGE_SIZE);
> +     physc = memblock_alloc(cpu_to_node_size, PAGE_SIZE);
> +     if (!phys || !physc || !physd)
> +             goto vnumaout;
> +     varea = __va(phys);
> +     vdistance  = __va(physd);
> +     cpu_to_node  = __va(physc);
> +     set_xen_guest_handle(numa_topo.vmemarea, varea);
> +     set_xen_guest_handle(numa_topo.vdistance, vdistance);
> +     set_xen_guest_handle(numa_topo.cpu_to_node, cpu_to_node);
> +     rc = HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &numa_topo);
> +     if (rc < 0)
> +             goto vnumaout;
> +     rc = -EINVAL;
> +     if (numa_topo.nr_nodes == 0) {
> +             /* will pass to dummy_numa_init */
> +             goto vnumaout;
> +     }
> +     if (numa_topo.nr_nodes > num_possible_cpus()) {
> +             pr_debug("vNUMA: Node without cpu is not supported in this version.\n");
> +             goto vnumaout;
> +     }
> +     /*
> +      * NUMA nodes memory ranges are in pfns, constructed and
> +      * aligned based on e820 ram domain map
> +     */
> +     for (i = 0; i < numa_topo.nr_nodes; i++) {
> +             if (numa_add_memblk(i, varea[i].start, varea[i].end))
> +                     /* pass to numa_dummy_init */
> +                     goto vnumaout;

If there's a failure here, numa may be partially set up.  Do you need to
undo any of the bits that have already been set up?
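A minimal sketch of such an undo (illustrative only; this only rolls back
the node_set() calls from earlier iterations, a complete undo might also
need to drop what numa_add_memblk() has already recorded):

	for (i = 0; i < numa_topo.nr_nodes; i++) {
		if (numa_add_memblk(i, varea[i].start, varea[i].end)) {
			/* forget any nodes recorded in earlier iterations
			 * before falling back to the dummy init */
			nodes_clear(numa_nodes_parsed);
			goto out;
		}
		node_set(i, numa_nodes_parsed);
	}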

> +             node_set(i, numa_nodes_parsed);
> +     }
> +     setup_nr_node_ids();
> +     /* Setting the cpu, apicid to node */
> +     for_each_cpu(cpu, cpu_possible_mask) {
> +             set_apicid_to_node(cpu, cpu_to_node[cpu]);
> +             numa_set_node(cpu, cpu_to_node[cpu]);
> +             __apicid_to_node[cpu] = cpu_to_node[cpu];
> +             cpumask_set_cpu(cpu, node_to_cpumask_map[cpu_to_node[cpu]]);
> +     }
> +     for (i = 0; i < numa_topo.nr_nodes; i++) {
> +             for (j = 0; j < numa_topo.nr_nodes; j++) {
> +                     idx = (j * numa_topo.nr_nodes) + i;
> +                     numa_set_distance(i, j, *(vdistance + idx));
> +             }
> +     }
> +     rc = 0;
> +     xen_vnuma_init = 1;
> +vnumaout:

Call this label "out".

> +     if (phys)
> +             memblock_free(__pa(phys), mem_size);
> +     if (physd)
> +             memblock_free(__pa(physd), dist_size);
> +     if (physc)
> +             memblock_free(__pa(physc), cpu_to_node_size);
> +     return rc;

If you return an error, x86_numa_init() will go on and try the other
NUMA setup methods.  Consider calling dummy_numa_init() directly instead
and then returning success.
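A rough sketch of what that could look like, assuming the dummy init is
reachable from this file (dummy_numa_init() is static in
arch/x86/mm/numa.c at the moment):

	out:
		/* free the temporary buffers as above ... */
		if (rc)
			/* fall back explicitly so x86_numa_init() does not
			 * go on to try the other NUMA setup methods */
			return dummy_numa_init();
		return 0;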

> +}

Please use blank lines to space the logical bits of this function out
some more.

> +#endif
> diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
> index 2ecfe4f..4237f51 100644
> --- a/include/xen/interface/memory.h
> +++ b/include/xen/interface/memory.h
> @@ -263,4 +263,31 @@ struct xen_remove_from_physmap {
>  };
>  DEFINE_GUEST_HANDLE_STRUCT(xen_remove_from_physmap);
>  
> +/* vNUMA structures */
> +struct vnuma_memarea {
> +     uint64_t start, end;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(vnuma_memarea);
> +
> +struct vnuma_topology_info {
> +     /* IN */
> +     domid_t domid;
> +     /* OUT */
> +     uint16_t nr_nodes; /* number of virtual numa nodes */
> +     uint32_t _pad;

Is this _pad intended to make this structure uniform across 32-bit and
64-bit guests?  The following GUEST_HANDLES() have different sizes on
32- and 64-bit guests.
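For illustration only, one way to make the layout independent of the
guest's pointer size would be to force each handle into a 64-bit slot;
this is just a sketch, not something the patch proposes:

	struct vnuma_topology_info {
		/* IN */
		domid_t domid;
		/* OUT */
		uint16_t nr_nodes;
		uint32_t _pad;
		/* each handle padded to 64 bits so the structure has the
		 * same size and field offsets for 32-bit and 64-bit guests */
		union { GUEST_HANDLE(uint) h; uint64_t pad64; } vdistance;
		union { GUEST_HANDLE(uint) h; uint64_t pad64; } cpu_to_node;
		union { GUEST_HANDLE(vnuma_memarea) h; uint64_t pad64; } vmemarea;
	};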

> +     /* distance table */
> +     GUEST_HANDLE(uint) vdistance;
> +     /* cpu mapping to vnodes */
> +     GUEST_HANDLE(uint) cpu_to_node;
> +     /*
> +     * array of numa memory areas constructed by Xen
> +     * where start and end are pfn numbers of the area
> +     * Xen takes into account domains e820 map
> +     */
> +     GUEST_HANDLE(vnuma_memarea) vmemarea;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(vnuma_topology_info);
> +
> +#define XENMEM_get_vnuma_info        25
> +
>  #endif /* __XEN_PUBLIC_MEMORY_H__ */

David

