[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH] Fix XEN_SYSCTL_numainfo[node].memsize for memory holes



Fix a long-standing issue (known at since 2014) with the numainfo call.

The Hypercall `XEN_SYSCTL_numainfo` returns `memsize` for each NUMA node:

xl info -n:
node:    memsize    memfree    distances
   0:  -> 67584 <-   60672      10,21  <- memsize is off by 2048 MB
   1:     65536      60958      21,10

So far, `memsize` is calculated from `NODE_DATA->node_spanned_pages`.
It includes memory holes, leading to wrong memsize on x86.

This patch gets the sum of E820_RAM entries for each NUMA node on boot,
stores it in NODE_DATA->node_present_pages and uses it for `memsize`.

It also increases it like `total_pages` on memory_add() for memory hotplug.

The new NODE_DATA->node_present_pages can be slighly lower than the
physical node's RAM due to reserved memory for some of the NUMA nodes.

For example, on this example system, NODE_DATA->node_present_pages
reports 63.5GB of usable memory on the 1st NUMA node with this patch.

This patch uses `arch_get_ram_range()` which is an architecture-provided
call that all NUMA architectures already need to provide for iterating
over the usable RAM of the system.

- Tested on 2-socket and a 4-socket x86 systems
- Memory hot-add not tested, but is identical to bumping total_pages.

Signed-off-by: Bernhard Kaindl <bernhard.kaindl@xxxxxxxxx>
---
 xen/arch/x86/x86_64/mm.c |  2 ++
 xen/common/numa.c        | 14 +++++++++++++-
 xen/common/sysctl.c      |  2 +-
 xen/include/xen/numa.h   |  3 +++
 4 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index b2a280fba3..a22aa45060 100644
--- a/xen/arch/x86/x86_64/mm.c
+++ b/xen/arch/x86/x86_64/mm.c
@@ -1333,6 +1333,8 @@ int memory_add(unsigned long spfn, unsigned long epfn, 
unsigned int pxm)
     /* We can't revert any more */
     share_hotadd_m2p_table(&info);
     transfer_pages_to_heap(&info);
+    /* Update the node's present pages (like the total_pages of the system) */
+    NODE_DATA(node)->node_present_pages += epfn - spfn;
 
     return 0;
 
diff --git a/xen/common/numa.c b/xen/common/numa.c
index 28a09766fa..d68cbea44c 100644
--- a/xen/common/numa.c
+++ b/xen/common/numa.c
@@ -504,10 +504,22 @@ void __init setup_node_bootmem(nodeid_t nodeid, paddr_t 
start, paddr_t end)
 {
     unsigned long start_pfn = paddr_to_pfn(start);
     unsigned long end_pfn = paddr_to_pfn(end);
+    paddr_t map_start, map_end;
+    int i = 0, err;
 
     NODE_DATA(nodeid)->node_start_pfn = start_pfn;
     NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
 
+    /* Add RAM pages within the node to get the present pages for memsize 
infos */
+    NODE_DATA(nodeid)->node_present_pages = 0;
+    while ( (err = arch_get_ram_range(i++, &map_start, &map_end)) != -ENOENT ) 
{
+        if ( err || map_start >= end || map_end <= start )
+            continue;  /* Skip non-RAM and maps outside of the node's memory 
range */
+        /* Add memory that is in the node's memory range (within start and 
end): */
+        map_start = max(map_start, start);
+        map_end = min(map_end, end);
+        NODE_DATA(nodeid)->node_present_pages += (map_end - map_start) >> 
PAGE_SHIFT;
+    }
     node_set_online(nodeid);
 }
 
@@ -675,7 +687,7 @@ static void cf_check dump_numa(unsigned char key)
         mfn_t mfn = _mfn(node_start_pfn(i) + 1);
 
         printk("NODE%u start->%lu size->%lu free->%lu\n",
-               i, node_start_pfn(i), node_spanned_pages(i),
+               i, node_start_pfn(i), node_present_pages(i),
                avail_node_heap_pages(i));
         /* Sanity check mfn_to_nid() */
         if ( node_spanned_pages(i) > 1 && mfn_to_nid(mfn) != i )
diff --git a/xen/common/sysctl.c b/xen/common/sysctl.c
index d02f44fe3a..cba6d3cfea 100644
--- a/xen/common/sysctl.c
+++ b/xen/common/sysctl.c
@@ -316,7 +316,7 @@ long do_sysctl(XEN_GUEST_HANDLE_PARAM(xen_sysctl_t) 
u_sysctl)
                 {
                     if ( node_online(i) )
                     {
-                        meminfo.memsize = node_spanned_pages(i) << PAGE_SHIFT;
+                        meminfo.memsize = node_present_pages(i) << PAGE_SHIFT;
                         meminfo.memfree = avail_node_heap_pages(i) << 
PAGE_SHIFT;
                     }
                     else
diff --git a/xen/include/xen/numa.h b/xen/include/xen/numa.h
index fd1511a6fb..c860f3ad1c 100644
--- a/xen/include/xen/numa.h
+++ b/xen/include/xen/numa.h
@@ -71,6 +71,7 @@ extern nodeid_t *memnodemap;
 struct node_data {
     unsigned long node_start_pfn;
     unsigned long node_spanned_pages;
+    unsigned long node_present_pages;
 };
 
 extern struct node_data node_data[];
@@ -91,6 +92,7 @@ static inline nodeid_t mfn_to_nid(mfn_t mfn)
 
 #define node_start_pfn(nid)     (NODE_DATA(nid)->node_start_pfn)
 #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages)
+#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
 #define node_end_pfn(nid)       (NODE_DATA(nid)->node_start_pfn + \
                                  NODE_DATA(nid)->node_spanned_pages)
 
@@ -123,6 +125,7 @@ extern void numa_set_processor_nodes_parsed(nodeid_t node);
 extern mfn_t first_valid_mfn;
 
 #define node_spanned_pages(nid) (max_page - mfn_x(first_valid_mfn))
+#define node_present_pages(nid) total_pages
 #define node_start_pfn(nid) mfn_x(first_valid_mfn)
 #define __node_distance(a, b) 20
 
-- 
2.43.0




 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.