[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH] xl: add memory allocation logic for numa platform



On Tue, 2011-08-09 at 05:55 +0100, Zhang, Yang Z wrote:
> For numa platform, we need to allocate memory for guest on which guest cpu 
> reside. This patch add
> this feature for xl. Just use the simple algorithm to select the best node.
> 
> diff -r 0f36c2eec2e1 tools/libxl/libxl.c
> --- a/tools/libxl/libxl.c       Thu Jul 28 15:40:54 2011 +0100
> +++ b/tools/libxl/libxl.c       Tue Aug 09 12:46:18 2011 +0800
> @@ -2236,6 +2236,109 @@
>      return ERROR_FAIL;
>  }
> 
> +static inline void set_bit(int nr, volatile void *addr)
> +{
> +    asm volatile (
> +      "btsl %1,%0"
> +       : "=m" (ADDR)
> +       : "Ir" (nr), "m" (ADDR) : "memory");
> +}

Please use C code for this. There is no way this code is performance
critical need to hand code asm for it.

I'm any case if you need something like this you should define a proper
API for dealing with the libxl_nodeinfo data type and not a void * based
thing. Take a look at the libxl_cpumap_* methods for example.

Actually, looking closer, the "cpumap" member of your nodeinfo should
actually _be_ a cpumap and use the correct interfaces, having it as a
string makes no sense at all AFAICT.

> +
> +int libxl_get_numainfo(libxl_ctx *ctx, libxl_numainfo_t *numainfo)
> +{
> +    xc_numainfo_t ninfo = { 0 };
> +    libxl_physinfo physinfo = { 0 };
> +    libxl_topologyinfo topoinfo;
> +    int i,  max_nodes, max_cpus, node;
> +    libxl_nodeinfo_t *nodeinfo;
> +    DECLARE_HYPERCALL_BUFFER(xc_node_to_memsize_t, node_memsize);
> +    DECLARE_HYPERCALL_BUFFER(xc_node_to_memfree_t, node_memfree);
> +
> +    if (libxl_get_physinfo(ctx, &physinfo))
> +        goto out;
> +
> +    max_cpus = physinfo.max_cpu_id + 1;
> +    max_nodes = NUMA_NO_NODE + 1;
> +    numainfo->max_cpus = max_cpus;
> +
> +    numainfo->cpu_to_node = calloc(max_cpus, sizeof (unsigned long));
> +    if (numainfo->cpu_to_node == NULL)
> +        goto out;
> +
> +    numainfo->nodeinfo = (char *)calloc(max_nodes, sizeof(libxl_nodeinfo_t));
> +    if (numainfo->nodeinfo == NULL)
> +        goto out;
> +
> +    nodeinfo = (libxl_nodeinfo_t *)numainfo->nodeinfo;

Why declare it as a string if you want it to be a libxl_nodeinfo? Please
just use the correct types in the IDL.

> +    node_memsize = xc_hypercall_buffer_alloc(ctx->xch, node_memsize, 
> sizeof(*node_memsize) * max_nodes);
> +    if ( node_memsize == NULL )
> +        goto out;
> +    node_memfree = xc_hypercall_buffer_alloc(ctx->xch, node_memfree, 
> sizeof(*node_memfree) * max_nodes);
> +    if ( node_memfree == NULL )
> +        goto out;
> +
> +    set_xen_guest_handle(ninfo.node_to_memsize, node_memsize);
> +    set_xen_guest_handle(ninfo.node_to_memfree, node_memfree);
> +    ninfo.max_node_index = max_nodes - 1;
> +
> +    if ( xc_numainfo(ctx->xch, &ninfo) != 0 )
> +        goto out;
> +
> +    max_nodes = ninfo.max_node_index + 1;
> +    numainfo->max_nodes = max_nodes;
> +
> +    if (libxl_get_topologyinfo(ctx, &topoinfo))
> +        goto out;
> +
> +    for ( i = 0; i <= max_nodes; i++ ) {
> +        if (node_memsize[i] != INVALID_MEM_NODE) {
> +            nodeinfo[i].online = 1;
> +            nodeinfo[i].cpumap = malloc(BITS_TO_LONGS(max_cpus) * sizeof 
> (unsigned long));
> +            bzero(nodeinfo[i].cpumap, BITS_TO_LONGS(max_cpus) * sizeof 
> (unsigned long));
> +
> +            /* Total Memory */
> +            nodeinfo[i].total_memkb = node_memsize[i] >> 10; /* KB */
> +
> +            /* Free Memory */
> +            nodeinfo[i].free_memkb = node_memfree[i] >> 10; /* KB */
> +        } else
> +            nodeinfo[i].online = 0;
> +    }
> +
> +    for (i = 0; i < max_cpus; i++)
> +        if (topoinfo.coremap.array[i] != LIBXL_CPUARRAY_INVALID_ENTRY) {
> +            node = topoinfo.nodemap.array[i];
> +            set_bit(i, nodeinfo[node].cpumap);
> +            numainfo->cpu_to_node[i] = node;
> +        }
> +    libxl_topologyinfo_destroy(&topoinfo);
> +
> +    xc_hypercall_buffer_free(ctx->xch, node_memsize);
> +    xc_hypercall_buffer_free(ctx->xch, node_memfree);
> +    return 0;
> +
> +out:
> +    if (numainfo->cpu_to_node)
> +        free(numainfo->cpu_to_node);
> +    if (numainfo->nodeinfo);
> +        free(numainfo->nodeinfo);
> +    xc_hypercall_buffer_free(ctx->xch, node_memsize);
> +    xc_hypercall_buffer_free(ctx->xch, node_memfree);
> +    return ERROR_FAIL;
> +}
> +
> +void libxl_free_numainfo(libxl_numainfo_t *numainfo)
> +{
> +    int i;
> +    libxl_nodeinfo_t *nodeinfo = (libxl_nodeinfo_t *)numainfo->nodeinfo;
> +
> +    for(i = 0; i < numainfo->max_nodes; i++)
> +        if(nodeinfo[i].cpumap)
> +            free(nodeinfo[i].cpumap);
> +    free(numainfo->cpu_to_node);
> +    free(numainfo->nodeinfo);

The IDL will define you a libxl_numainfo_destroy which does pretty much
exactly this, if you use the right types in the IDL of course.

> +}
> +
>  const libxl_version_info* libxl_get_version_info(libxl_ctx *ctx)
>  {
>      union {
> diff -r 0f36c2eec2e1 tools/libxl/libxl.h
> --- a/tools/libxl/libxl.h       Thu Jul 28 15:40:54 2011 +0100
> +++ b/tools/libxl/libxl.h       Tue Aug 09 12:46:18 2011 +0800
> @@ -459,6 +459,16 @@
> 
>  int libxl_get_physinfo(libxl_ctx *ctx, libxl_physinfo *physinfo);
>  int libxl_get_topologyinfo(libxl_ctx *ctx, libxl_topologyinfo *info);
> +
> +#define NUMA_NO_NODE    0xFF
> +#define INVALID_MEM_NODE     0ul
> +#define BITS_PER_LONG (sizeof(unsigned long) * 8)
> +#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
> +#define ADDR (*(volatile long *) addr)

These are polluting the user visible interface with non-namespaced
headers. Since you should be using the libxl_cpumap type for your
cpumaps I expect a bunch of them should just go away. Others should be
moved to the relevant internal header and IMHO that ADDR macro has no
business existing in the first place.

> +
> +int libxl_get_numainfo(libxl_ctx *ctx, libxl_numainfo_t *numainfo);
> +void libxl_free_numainfo(libxl_numainfo_t *numainfo);
> +
>  libxl_vcpuinfo *libxl_list_vcpu(libxl_ctx *ctx, uint32_t domid,
>                                         int *nb_vcpu, int *nrcpus);
>  int libxl_set_vcpuaffinity(libxl_ctx *ctx, uint32_t domid, uint32_t vcpuid,
> diff -r 0f36c2eec2e1 tools/libxl/libxl.idl
> --- a/tools/libxl/libxl.idl     Thu Jul 28 15:40:54 2011 +0100
> +++ b/tools/libxl/libxl.idl     Tue Aug 09 12:46:18 2011 +0800
> @@ -366,6 +366,19 @@
>      ("socketmap", libxl_cpuarray, False, "cpu to socket map"),
>      ("nodemap", libxl_cpuarray,   False, "cpu to node map"),
>      ])
> +libxl_nodeinfo = Struct("nodeinfo_t", [
> +    ("free_memkb", uint32),
> +    ("total_memkb", uint32),
> +    ("candidate", uint32),
> +    ("online", uint32),
> +    ("cpumap", string),
> +    ])
> +libxl_numainfo = Struct("numainfo_t", [

libxl does not use the _t suffic on data types.

How do these two new datastructures relate to the existing topology
info?

I think you also need to add comments to many of these fields (you can
write them as the third entry in the tuple). What is a candidate for
example? It seems to relate to some internal detail of the algorithm
used by libxl internally rather than being a field which is useful to
the external user of libxl. It would be preferable to separate out such
internal and external data and only expose the external stuff via the
API.

> +    ("nodeinfo", string),
> +    ("max_nodes", uint32),
> +    ("cpu_to_node", string),
> +    ("max_cpus", uint32),
> +    ])
> 
>  libxl_sched_credit = Struct("sched_credit", [
>      ("weight", integer),
> diff -r 0f36c2eec2e1 tools/libxl/libxl_create.c
> --- a/tools/libxl/libxl_create.c        Thu Jul 28 15:40:54 2011 +0100
> +++ b/tools/libxl/libxl_create.c        Tue Aug 09 12:46:18 2011 +0800
> @@ -159,6 +159,86 @@
>      return 0;
>  }
> 
> +static int find_best_node(libxl_ctx *ctx, libxl_numainfo_t *numainfo)
> +{
> +    int nr_doms, i, j, nr_vcpu, nrcpus, best_node, pcpu, node_id;
> +    unsigned long max_nodes = numainfo->max_nodes;
> +    unsigned long *nodeload;
> +    libxl_dominfo *dominfo;
> +    libxl_vcpuinfo *vcpuinfo;
> +    libxl_nodeinfo_t *nodeinfo = (libxl_nodeinfo_t *)numainfo->nodeinfo;
> +
> +    nodeload = malloc(max_nodes * sizeof(*nodeload));
> +    bzero(nodeload, max_nodes * sizeof(*nodeload));
> +
> +    if (!(dominfo = libxl_list_domain(ctx, &nr_doms)))
> +        goto out;
> +
> +    for (i = 0; i < nr_doms; i++) {
> +        vcpuinfo = libxl_list_vcpu(ctx, dominfo[i].domid, &nr_vcpu, &nrcpus);
> +        if (!vcpuinfo)
> +            goto out;
> +        for (j = 0; j < nr_vcpu; j++) {
> +            if (!vcpuinfo[j].online)
> +                continue;
> +            pcpu = vcpuinfo[j].cpu;
> +            node_id = numainfo->cpu_to_node[pcpu];
> +            if (nodeinfo[node_id].candidate)
> +                nodeload[node_id]++;
> +            else
> +                nodeload[node_id] += 8;
> +        }
> +        free(vcpuinfo);
> +    }
> +    best_node = 0;
> +    for (i = 1; i < max_nodes; i++)
> +        if(nodeinfo[i].candidate && nodeinfo[i].online
> +            && nodeload[i] < nodeload[best_node])
> +            best_node = i;
> +
> +    return best_node;
> +out:
> +    if (dominfo)
> +        free(dominfo);
> +    return -1;
> +}
> +
> +static int libxl_node_select(libxl_ctx *ctx, libxl_domain_build_info 
> *b_info, uint32_t domid)
> +{
> +    unsigned long i, best_node;
> +    unsigned long needmem = b_info->max_memkb;
> +    libxl_numainfo_t numainfo ={ 0 };
> +    libxl_nodeinfo_t *nodeinfo;
> +
> +    if (libxl_get_numainfo(ctx, &numainfo)) {
> +        fprintf(stderr, "libxl_get_topologyinfo failed.\n");
> +        return -1;
> +    }
> +
> +    if (numainfo.max_nodes < 2) {
> +        printf("max_nodes = %d\n", numainfo.max_nodes);
> +        return 0;
> +        }
> +
> +    nodeinfo = (libxl_nodeinfo_t *)numainfo.nodeinfo;
> +
> +    for (i = 0; i < numainfo.max_nodes; i++)
> +        if (nodeinfo[i].free_memkb > needmem)
> +            nodeinfo[i].candidate = 1;
> +
> +    best_node = find_best_node(ctx, &numainfo);
> +    if (best_node == -1) {
> +        libxl_numainfo_t_destroy(&numainfo);
> +        return -1;
> +    }
> +
> +    for (i = 0; i < b_info->max_vcpus; i++)
> +        xc_vcpu_setaffinity(ctx->xch, domid, i, (uint8_t 
> *)(nodeinfo[best_node].cpumap));
> +
> +    libxl_numainfo_t_destroy(&numainfo);
> +    return 0;
> +}
> +
>  int libxl__domain_build(libxl__gc *gc,
>                          libxl_domain_build_info *info,
>                          libxl_device_model_info *dm_info,
> @@ -168,11 +248,15 @@
>      char **vments = NULL, **localents = NULL;
>      struct timeval start_time;
>      int i, ret;
> +    libxl_ctx *ctx = libxl__gc_owner(gc);
> 
>      ret = libxl__build_pre(gc, domid, info, state);
>      if (ret)
>          goto out;
> 
> +   if (libxl_node_select(ctx, info, domid))
> +       printf("Cannot find best node, using defaul algorithm\n");

Something is up with your whitespacing here (and a few other places).
Please read tools/libxl/CODING_STYLE

Also please use the correct LIBXL__LOG functions and do not print to
stdout. This message should probably be a debug level if inedeed it
needs to exist at all. The same goes for all the other printf calls
which you have added.

Is libxl_node_select intended to be called by external users of the
library? If not then it should be libxl__node_select and take a gc not a
ctx.

>      gettimeofday(&start_time, NULL);
> 
>      switch (info->type) {
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@xxxxxxxxxxxxxxxxxxx
> http://lists.xensource.com/xen-devel



_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.