
Re: [Xen-devel] [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest



Hi Konrad

Thank you for the comments, I will resend the fixes in a new patch.
Regarding memblock construction: it is done taking into account the e820
map for this domain.
I will add some comments in the code as you mentioned.

Elena

On Tue, Aug 27, 2013 at 10:33 AM, Konrad Rzeszutek Wilk
<konrad.wilk@xxxxxxxxxx> wrote:
> On Tue, Aug 27, 2013 at 04:52:59AM -0400, Elena Ufimtseva wrote:
>> Uses subop hypercall to request XEN about vnuma topology.
>> Sets the memory blocks (aligned by XEN), cpus, distance table
>> on boot. NUMA support should be compiled in kernel.
>
> Needs a bit more detail (which I am sure you will add in the
> next postings). Mostly:
>  - How E820 and NUMA information mesh
>  - What the hypercall provides
>  - What changeset in the Xen provides this hypercall
>  -
>>
>> NUMA support should be compiled in kernel.
>>
>> Memory blocks are aligned by xen. Xen is aware of guest initial
>> RAM layout.
>> If the memory blocks are incorrect, call for default linux numa
>> dummy init.
>
> Unfortunately that won't happen if you boot this under dom0. On some AMD
> machines it will find the AMD Northbridge, try to scan that, and blow up.
>
> If you look at the git commit that did the 'numa = 0' you will see
> the backstory.
>
> I think part of the enablement of 'numa = 1' is to wrap it with
>
>         if (xen_initial_domain() && xen_supports_this_hypercall())
>                 numa = 1;
>
> in the #2 patch.
>
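Understood. Something along these lines is what I have in mind for patch #2
(xen_vnuma_supported() is only an illustrative name here, for a helper that
issues the new subop once and treats -ENOSYS as "not implemented"):

        /* Sketch only: probe whether the hypervisor knows the vNUMA subop. */
        static bool __init xen_vnuma_supported(void)
        {
                struct vnuma_topology_info numa_topo = {
                        .domid = DOMID_SELF,
                };

                /* -ENOSYS means the subop does not exist on this hypervisor. */
                return HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &numa_topo) != -ENOSYS;
        }

        /* ...and at the spot that currently forces numa = 0: */
        if (xen_initial_domain() && xen_vnuma_supported())
                numa = 1;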
>>
>> Requires XEN with vNUMA support.
>>
>> Applies to 3.10 git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen.git
>> latest commit: 8bb495e3f02401ee6f76d1b1d77f3ac9f079e376
>>
>> TODO:
>> Change subop from XENMEM to SYSCTL.
>>
>> Signed-off-by: Elena Ufimtseva <ufimtseva@xxxxxxxxx>
>> ---
>>  arch/x86/include/asm/xen/vnuma-xen.h |   36 +++++++++++
>>  arch/x86/mm/numa.c                   |    8 +++
>>  arch/x86/xen/enlighten.c             |  115 ++++++++++++++++++++++++++++++++++
>>  include/xen/interface/memory.h       |    1 +
>>  4 files changed, 160 insertions(+)
>>  create mode 100644 arch/x86/include/asm/xen/vnuma-xen.h
>>
>> diff --git a/arch/x86/include/asm/xen/vnuma-xen.h b/arch/x86/include/asm/xen/vnuma-xen.h
>> new file mode 100644
>> index 0000000..73c1bde
>> --- /dev/null
>> +++ b/arch/x86/include/asm/xen/vnuma-xen.h
>> @@ -0,0 +1,36 @@
>> +#ifndef _ASM_X86_VNUMA_XEN_H
>> +#define _ASM_X86_VNUMA_XEN_H
>> +
>> +#ifdef CONFIG_XEN
>> +int __initdata xen_numa_init(void);
>> +
>> +struct vnuma_memblk {
>> +               uint64_t start, end;
>> +};
>> +DEFINE_GUEST_HANDLE_STRUCT(vnuma_memblk);
>> +
>> +struct vnuma_topology_info {
>> +     domid_t domid;
>> +     uint16_t nr_nodes;
>> +     GUEST_HANDLE(vnuma_memblk) vmemblk;
>> +     GUEST_HANDLE(int) vdistance;
>> +     GUEST_HANDLE(int) cpu_to_node;
>> +     GUEST_HANDLE(int) vnode_to_pnode;
>
> A comment explaining what those values are meant to have would be good.
> Perhaps with a simple example.
>
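Sure, I will document them in the header. Something along these lines, with
purely illustrative values for a guest with 2 vnodes and 4 vcpus:

        /*
         * Example for a guest with 2 vnodes and 4 vcpus:
         *   nr_nodes       = 2
         *   vmemblk        = { {0, 1G}, {1G, 2G} }  - memory range of each vnode
         *   vdistance      = { 10, 20,              - nr_nodes * nr_nodes
         *                      20, 10 }               SLIT-style distance matrix
         *   cpu_to_node    = { 0, 0, 1, 1 }         - vcpu -> vnode map
         *   vnode_to_pnode = { 0, 1 }               - vnode -> physical node map
         */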
>> +};
>
> The structure is not very 32-bit friendly. Would it be possible
> to add some padding so that the size of this structure is the
> same on 32-bit and 64-bit please?
>
> Perhaps:
>         domid_t domid;
>         uint16_t nr_nodes;
>         uint32_t _pad;
>         GUEST_HANDLE(vnuma_memblk) vmemblk;
>         GUEST_HANDLE(int) vdistance;
>         GUEST_HANDLE(int) cpu_to_node;
>         GUEST_HANDLE(int) vnode_to_pnode;
>
> That should make the offsets be the same on both 32 and 64-bit
> I think.
>
>> +DEFINE_GUEST_HANDLE_STRUCT(vnuma_topology_info);
>> +
>> +struct xen_domctl {
>> +     uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */
>> +     domid_t  domain;
>> +     union {
>> +             struct vnuma_topology_info vnuma;
>> +             } u;
>
> Move that 'u' one tab back please.
>
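Yes, will fix, i.e.:

        struct xen_domctl {
                uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */
                domid_t  domain;
                union {
                        struct vnuma_topology_info vnuma;
                } u;
        };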
>> +};
>> +typedef struct xen_domctl xen_domctl_t;
>
> Ewww. No typedefs in the Linux code please.
>
>> +DEFINE_GUEST_HANDLE_STRUCT(xen_domctl_t);
>> +
>> +#else
>> +int __init xen_numa_init(void) {}
>> +#endif
>> +
>> +#endif /* _ASM_X86_VNUMA_XEN_H */
>> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
>> index 8bf93ba..3ec7855 100644
>> --- a/arch/x86/mm/numa.c
>> +++ b/arch/x86/mm/numa.c
>> @@ -20,6 +20,10 @@
>>
>>  #include "numa_internal.h"
>>
>> +#ifdef CONFIG_XEN
>> +#include "asm/xen/vnuma-xen.h"
>> +#endif
>> +
>>  int __initdata numa_off;
>>  nodemask_t numa_nodes_parsed __initdata;
>>
>> @@ -621,6 +625,10 @@ static int __init dummy_numa_init(void)
>>  void __init x86_numa_init(void)
>>  {
>>       if (!numa_off) {
>> +#ifdef CONFIG_XEN
>> +             if (xen_pv_domain() && !numa_init(xen_numa_init))
>
> I would remove the xen_pv_domain() check and add that in enlighten.c code.
>
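Ok, will move it. xen_numa_init() itself would then start with the guard,
something like:

        /* early in xen_numa_init(), instead of the check in numa.c: */
        if (!xen_pv_domain())
                return -EINVAL;  /* x86_numa_init() then tries the next init path */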
>> +                     return;
>> +#endif
>>  #ifdef CONFIG_X86_NUMAQ
>>               if (!numa_init(numaq_numa_init))
>>                       return;
>> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
>> index 193097e..4658e9d 100644
>> --- a/arch/x86/xen/enlighten.c
>> +++ b/arch/x86/xen/enlighten.c
>> @@ -33,6 +33,7 @@
>>  #include <linux/memblock.h>
>>  #include <linux/edd.h>
>>
>> +#include <asm/numa.h>
>>  #include <xen/xen.h>
>>  #include <xen/events.h>
>>  #include <xen/interface/xen.h>
>> @@ -69,6 +70,7 @@
>>  #include <asm/mwait.h>
>>  #include <asm/pci_x86.h>
>>  #include <asm/pat.h>
>> +#include <asm/xen/vnuma-xen.h>
>>
>>  #ifdef CONFIG_ACPI
>>  #include <linux/acpi.h>
>> @@ -1750,4 +1752,117 @@ const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = {
>>       .x2apic_available       = xen_x2apic_para_available,
>>  };
>>  EXPORT_SYMBOL(x86_hyper_xen_hvm);
>> +
>> +/* Xen PV NUMA topology initialization */
>> +int __initdata xen_numa_init(void)
>> +{
>> +     struct vnuma_memblk *vblk;
>> +     struct vnuma_topology_info numa_topo;
>> +     uint64_t size, start, end;
>> +     int i, j, cpu, rc;
>> +     u64 phys, physd, physc;
>> +     u64 new_end_pfn;
>> +     size_t mem_size;
>> +
>> +     int *vdistance, *cpu_to_node;
>> +     unsigned long dist_size, cpu_to_node_size;
>
> You need an extra line here;
>
>> +     numa_topo.domid = DOMID_SELF;
>> +     rc = -EINVAL;
>
> Both of those can be part of the declaration of variables above. Like so:
>         int rc = -EINVAL;
>         struct vnuma_topology_info numa_topo = {
>                 .domid = DOMID_SELF,
>         };
>
>> +     /* block mem reservation for memblks */
>> +     mem_size = num_possible_cpus() * sizeof(struct vnuma_memblk);
>> +     phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), mem_size, PAGE_SIZE);
>> +     if (!phys) {
>> +             pr_info("vNUMA: can't allocate memory for membloks size %zu\n", mem_size);
>> +             rc = -ENOMEM;
>> +             goto errout;
>> +     }
>> +     if (memblock_reserve(phys, mem_size) < 0) {
>> +             pr_info("vNUMA: Cannot reserver mem for blocks of size %zu \n", mem_size);
>> +             rc = -ENOMEM;
>> +             goto errout;
>> +     }
>> +     vblk = __va(phys);
>> +     set_xen_guest_handle(numa_topo.vmemblk, vblk);
>> +
>> +     dist_size = num_possible_cpus() * num_possible_cpus() * sizeof(*numa_topo.vdistance);
>> +     physd = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), dist_size, PAGE_SIZE);
>> +     if (!physd) {
>> +             pr_info("vNUMA: can't allocate memory for distance size %zu\n", dist_size);
>> +             rc = -ENOMEM;
>> +             goto errout;
>> +     }
>> +     if (memblock_reserve(physd, dist_size) < 0) {
>> +             pr_info("vNUMA: Cannot reserver mem for blocks of size %zu \n", dist_size);
>> +             rc = -ENOMEM;
>> +             goto errout;
>> +     }
>> +     vdistance  = __va(physd);
>> +     set_xen_guest_handle(numa_topo.vdistance, (int *)vdistance);
>> +
>> +     /* allocate memblock for vcpu to node mask of max size */
>> +     cpu_to_node_size = num_possible_cpus() * sizeof(*numa_topo.cpu_to_node);
>> +     physc = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), cpu_to_node_size, PAGE_SIZE);
>> +     if (!physc) {
>> +             pr_info("vNUMA: can't allocate memory for distance size %zu\n", cpu_to_node_size);
>> +             rc = -ENOMEM;
>> +             goto errout;
>> +     }
>> +     if (memblock_reserve(physc, cpu_to_node_size) < 0) {
>> +             pr_info("vNUMA: Cannot reserver mem for blocks of size %zu\n", cpu_to_node_size);
>> +             goto errout;
>> +     }
>> +     cpu_to_node  = __va(physc);
>> +     set_xen_guest_handle(numa_topo.cpu_to_node, (int *)cpu_to_node);
>> +     if (HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &numa_topo) < 0)
>
> I think you need to do rc = HYPERVISOR...
>
> and then if you get rc = -ENOSYS (so not implemented) change the rc to zero.
>
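Will do. If I read you correctly, something like:

        rc = HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &numa_topo);
        if (rc < 0) {
                if (rc == -ENOSYS)
                        rc = 0;  /* subop not implemented, fall back quietly */
                goto errout;
        }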
>> +             goto errout;
>> +     if (numa_topo.nr_nodes == 0)
>> +             /* Pass to DUMMY numa */
>
> Also change rc to zero.
>
>> +             goto errout;
>> +     if (numa_topo.nr_nodes > num_possible_cpus()) {
>> +             pr_info("vNUMA: Node without cpu is not supported, nodes = %d\n", numa_topo.nr_nodes);
>> +             goto errout;
>> +     }
>> +     new_end_pfn = 0;
>> +     /* We have sizes in pages received from hypervisor, no holes and ordered */
>
> From the toolstack, you mean. It is the one setting it up. And I hope it sets it up
> based on the E820 it has constructed as well? Otherwise it would be a bit
> awkward if those two were different.
>
>> +     for (i = 0; i < numa_topo.nr_nodes; i++) {
>> +             start = vblk[i].start;
>> +             end = vblk[i].end;
>> +             size = end - start;
>> +             pr_info("VNMA blk[%d] size is %#010Lx start = %#010Lx end = %#010Lx\n",
>> +                     i, size, start, end);
>
> pr_debug perhaps?
>
>> +             if (numa_add_memblk(i, start, end)) {
>> +                     pr_info("vNUMA: Could not add memblk[%d]\n", i);
>> +                     rc = -EAGAIN;
>
> Can you unroll the NUMA topology if this fails? Or is that not a problem?
>
>> +                     goto errout;
>> +             }
>> +             node_set(i, numa_nodes_parsed);
>> +     }
>> +     setup_nr_node_ids();
>> +     /* Setting the cpu, apicid to node */
>> +     for_each_cpu(cpu, cpu_possible_mask) {
>> +             pr_info("Setting up vcpu[%d] to node[%d]\n", cpu, cpu_to_node[cpu]);
>
> pr_debug
>> +             set_apicid_to_node(cpu, cpu_to_node[cpu]);
>> +             numa_set_node(cpu, cpu_to_node[cpu]);
>> +             __apicid_to_node[cpu] = cpu_to_node[cpu];
>> +             cpumask_set_cpu(cpu, node_to_cpumask_map[cpu_to_node[cpu]]);
>> +     }
>> +     setup_nr_node_ids();
>
> How come this is being called twice? It should have a comment
> explaining this extra call.
>
>> +     rc = 0;
>> +     for (i = 0; i < numa_topo.nr_nodes; i++)
>> +             for (j = 0; j < numa_topo.nr_nodes; j++) {
>> +                     numa_set_distance(i, j, *(vdistance + j*numa_topo.nr_nodes + i));
>
> You could simplify this by just doing:
>
>                         int idx = (j * numa_topo.nr_nodes) + i;
>
>                         ... *(vdistance + idx));
>
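Yes, that reads better. Together with the pr_info -> pr_debug change the loop
would then look like:

        for (i = 0; i < numa_topo.nr_nodes; i++)
                for (j = 0; j < numa_topo.nr_nodes; j++) {
                        int idx = (j * numa_topo.nr_nodes) + i;

                        numa_set_distance(i, j, *(vdistance + idx));
                        pr_debug("distance %d <-> %d = %d\n", i, j, *(vdistance + idx));
                }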
>> +                     pr_info("distance %d <-> %d = %d\n", i, j, *(vdistance + j*numa_topo.nr_nodes + i));
>
> pr_debug
>
>> +             }
>> +errout:
>> +     if (phys)
>> +             memblock_free(__pa(phys), mem_size);
>> +     if (physd)
>> +             memblock_free(__pa(physd), dist_size);
>> +     if (physc)
>> +             memblock_free(__pa(physc), cpu_to_node_size);
>> +     if (rc)
>> +             pr_debug("XEN: hypercall failed to get vNUMA topology.\n");
>
> And include the 'rc' value in the output please.
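Sure, e.g.:

        pr_debug("XEN: hypercall failed to get vNUMA topology, rc = %d\n", rc);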
>> +     return rc;
>> +}
>> +
>>  #endif
>> diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
>> index 2ecfe4f..16b8d87 100644
>> --- a/include/xen/interface/memory.h
>> +++ b/include/xen/interface/memory.h
>> @@ -263,4 +263,5 @@ struct xen_remove_from_physmap {
>>  };
>>  DEFINE_GUEST_HANDLE_STRUCT(xen_remove_from_physmap);
>>
>> +#define XENMEM_get_vnuma_info        25
>>  #endif /* __XEN_PUBLIC_MEMORY_H__ */
>> --
>> 1.7.10.4
>>



-- 
Elena

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 

