|
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [PATCH 19/21] xen/arm: Balance Dom0 memory allocation across allowed NUMA nodes
Allocate memory for Domain-0 exclusively from the permitted NUMA nodes.
When multiple NUMA nodes are available, distribute the allocation in a
balanced manner across each of these nodes.
---
xen/arch/arm/domain_build.c | 275 ++++++++++++++++++++----------------
1 file changed, 150 insertions(+), 125 deletions(-)
diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c
index 2bf4b37f89..7960dcd33a 100644
--- a/xen/arch/arm/domain_build.c
+++ b/xen/arch/arm/domain_build.c
@@ -180,33 +180,19 @@ unsigned int __init dom0_max_vcpus(void)
static bool __init insert_11_bank(struct domain *d,
struct kernel_info *kinfo,
struct page_info *pg,
- unsigned int order)
+ unsigned int order,
+ nodeid_t node)
{
struct membanks *mem = kernel_info_get_mem(kinfo);
unsigned int i;
int res;
mfn_t smfn;
paddr_t start, size;
- nodeid_t node = 0U;
smfn = page_to_mfn(pg);
start = mfn_to_maddr(smfn);
size = pfn_to_paddr(1UL << order);
- /* This code is temporal */
- {
- struct membanks *mem = bootinfo_get_mem();
- for ( i = 0; i < mem->nr_banks; i++ )
- {
- if ( start >= mem->bank[i].start &&
- start < (mem->bank[i].start + mem->bank[i].size) )
- {
- node = get_numa_nodeid(&mem->bank[i]);
- break;
- }
- }
- }
-
D11PRINT("Allocated %#"PRIpaddr"-%#"PRIpaddr" (%ldMB/%ldMB, order %d)\n",
start, start + size,
1UL << (order + PAGE_SHIFT - 20),
@@ -293,7 +279,13 @@ fail:
}
/*
- * This is all pretty horrible.
+ * Allocate NUMA-aware memory for Dom0 with 1:1 mapping.
+ *
+ * This function distributes the requested Dom0 memory across the allowed
+ * physical NUMA nodes in a balanced manner. It implements a multi-pass
+ * scavenging loop to allow nodes to dynamically back up each other if a
+ * particular node runs out of memory, maintaining a balanced distribution
+ * while ensuring the maximum amount of requested memory is satisfied.
*
* Requirements:
*
@@ -308,155 +300,187 @@ fail:
* below 4GB, so that it can be used by non-LPAE enabled kernels (32-bit).
* 4. Some devices assigned to dom0 can only do 32-bit DMA access or
* even be more restricted. We want to allocate as much of the RAM
- * as we reasonably can that can be accessed from all the devices..
+ * as we reasonably can that can be accessed from all the devices.
* 5. For 32-bit dom0 the kernel must be located below 4GB.
- * 6. We want to have a few largers banks rather than many smaller ones.
+ * 6. We want to have a few larger banks rather than many smaller ones.
*
* For the first two requirements we need to make sure that the lowest
- * bank is sufficiently large.
- *
- * For convenience we also sort the banks by physical address.
- *
- * The memory allocator does not really give us the flexibility to
- * meet these requirements directly. So instead of proceed as follows:
- *
- * We first allocate the largest allocation we can as low as we
- * can. This then becomes the first bank. This bank must be at least
- * 128MB (or memory size requested for domain if that is smaller).
+ * bank (Bank 0) is sufficiently large to hold all boot modules.
*
- * Then we start allocating more memory, trying to allocate the
- * largest possible size and trying smaller sizes until we
- * successfully allocate something.
+ * The memory allocator does not really give us the flexibility to meet
+ * these requirements directly under NUMA topologies. So instead we proceed
+ * as follows:
*
- * We then try and insert this memory in to the list of banks. If it
- * can be merged into an existing bank then this is trivial.
+ * We first calculate the total size required for the kernel, ramdisk, and
+ * DTB to establish a safe minimum size constraint for the first bank (Bank 0).
*
- * If the new memory is before the first bank (and cannot be merged into it)
- * and is at least 128M then we allow it, otherwise we give up. Since the
- * allocator prefers to allocate high addresses first and the first bank has
- * already been allocated to be as low as possible this likely means we
- * wouldn't have been able to allocate much more memory anyway.
+ * We then enter a multi-pass outer loop that runs until the full memory
+ * request is met. In each pass, we dynamically calculate the target allocation
+ * amount for each remaining active node to ensure a balanced distribution.
*
- * Otherwise we insert a new bank. If we've reached MAX_NR_BANKS then
- * we give up.
- *
- * For 32-bit domain we require that the initial allocation for the
- * first bank is part of the low mem. For 64-bit, the first bank is preferred
- * to be allocated in the low mem. Then for subsequent allocation, we
- * initially allocate memory only from low mem. Once that runs out out
- * (as described above) we allow higher allocations and continue until
- * that runs out (or we have allocated sufficient dom0 memory).
+ * For the initial chunk (Bank 0), we try to allocate the largest possible size
+ * as low as possible, honoring the 32-bit lowmem/DMA constraints. If it fails
+ * to find lowmem space and the domain is 64-bit, it falls back to highmem
+ * without violating the minimum size needed for the boot modules.
*/
static void __init allocate_memory_11(struct domain *d,
struct kernel_info *kinfo)
{
- const unsigned int min_low_order =
- get_order_from_bytes(min_t(paddr_t, kinfo->unassigned_mem, MB(128)));
- const unsigned int min_order = get_order_from_bytes(MB(4));
+ paddr_t todo = kinfo->unassigned_mem;
+ nodeid_t node;
+
+ unsigned int max_chunk_order = get_order_from_bytes(MB(128));
+ unsigned int min_bank0_order;
+ unsigned int lowmem_bitsize = arch_get_dma_bitsize();
+ bool is_bank0 = true;
+
+ struct boot_module *kernel_mod = boot_module_find_by_kind(BOOTMOD_KERNEL);
+ struct boot_module *ramdisk_mod = boot_module_find_by_kind(BOOTMOD_RAMDISK);
+ struct boot_module *dtb_mod = boot_module_find_by_kind(BOOTMOD_FDT);
+ paddr_t required_size = 0;
+
+ nodemask_t exhausted_nodes;
+ nodemask_t valid_nodes;
+
struct membanks *mem = kernel_info_get_mem(kinfo);
- struct page_info *pg;
- unsigned int order = get_allocation_size(kinfo->unassigned_mem);
unsigned int i;
- bool lowmem = true;
- unsigned int lowmem_bitsize = min(32U, arch_get_dma_bitsize());
- unsigned int bits;
-
/*
* TODO: Implement memory bank allocation when DOM0 is not direct
* mapped
*/
BUG_ON(!is_domain_direct_mapped(d));
- printk("Allocating 1:1 mappings totalling %ldMB for %pd:\n",
+ printk("Allocating 1:1 mappings totalling %ldMB for dom0:\n",
/* Don't want format this as PRIpaddr (16 digit hex) */
- (unsigned long)(kinfo->unassigned_mem >> 20), d);
+ (unsigned long)(kinfo->unassigned_mem >> 20));
mem->nr_banks = 0;
/*
- * First try and allocate the largest thing we can as low as
- * possible to be bank 0.
+ * Calculate the absolute minimum size required to fit the kernel,
+ * initrd, and DTB inside Bank 0
*/
- while ( order >= min_low_order )
- {
- for ( bits = order ; bits <= lowmem_bitsize; bits++ )
- {
- pg = alloc_domheap_pages(d, order, MEMF_bits(bits));
- if ( pg != NULL )
- {
- if ( !insert_11_bank(d, kinfo, pg, order) )
- BUG(); /* Cannot fail for first bank */
+ if ( kernel_mod )
+ required_size += kernel_mod->size;
+ if ( ramdisk_mod )
+ required_size += ramdisk_mod->size;
+ if ( dtb_mod )
+ required_size += dtb_mod->size;
- goto got_bank0;
- }
- }
- order--;
- }
-
- /* Failed to allocate bank0 in the lowmem region. */
- if ( is_32bit_domain(d) )
- panic("Unable to allocate first memory bank\n");
+ min_bank0_order = get_order_from_bytes(required_size);
- /* Try to allocate memory from above the lowmem region */
- printk(XENLOG_INFO "No bank has been allocated below %u-bit.\n",
- lowmem_bitsize);
- lowmem = false;
+ nodes_clear(exhausted_nodes);
+ nodes_and(valid_nodes, d->node_affinity, node_online_map);
- got_bank0:
+ BUG_ON(nodes_empty(valid_nodes));
- /*
- * If we failed to allocate bank0 in the lowmem region,
- * continue allocating from above the lowmem and fill in banks.
- */
- order = get_allocation_size(kinfo->unassigned_mem);
- while ( kinfo->unassigned_mem && mem->nr_banks < mem->max_banks )
+ while ( todo > 0 )
{
- pg = alloc_domheap_pages(d, order,
- lowmem ? MEMF_bits(lowmem_bitsize) : 0);
- if ( !pg )
- {
- order --;
+ paddr_t last_todo = todo;
+ nodemask_t active_nodes;
+ unsigned int active_nodes_count;
+ unsigned int nodes_left;
- if ( lowmem && order < min_low_order)
- {
- D11PRINT("Failed at min_low_order, allow high allocations\n");
- order = get_allocation_size(kinfo->unassigned_mem);
- lowmem = false;
- continue;
- }
- if ( order >= min_order )
- continue;
+ /* Filter out exhausted nodes to find active candidates */
+ nodes_andnot(active_nodes, valid_nodes, exhausted_nodes);
+ active_nodes_count = nodes_weight(active_nodes);
- /* No more we can do */
+ if ( active_nodes_count == 0 )
+ {
+ printk(XENLOG_WARNING "Dom0 NUMA: All specified nodes are completely exhausted.\n");
break;
}
- if ( !insert_11_bank(d, kinfo, pg, order) )
+ nodes_left = active_nodes_count;
+
+ for_each_node_mask(node, active_nodes)
{
- if ( mem->nr_banks == mem->max_banks )
- /* Nothing more we can do. */
- break;
+ paddr_t target_per_node;
+ paddr_t node_todo;
- if ( lowmem )
- {
- D11PRINT("Allocation below bank 0, allow high allocations\n");
- order = get_allocation_size(kinfo->unassigned_mem);
- lowmem = false;
- continue;
- }
- else
+ /* Target chunk size per node */
+ target_per_node = DIV_ROUND_UP(todo, nodes_left);
+ target_per_node = DIV_ROUND_UP(target_per_node, MB(128)) * MB(128);
+
+ node_todo = min(todo, target_per_node);
+
+ while ( node_todo > 0 )
{
- D11PRINT("Allocation below bank 0\n");
- break;
+ struct page_info *pg = NULL;
+ unsigned int max_order = get_allocation_size(node_todo);
+ unsigned int order;
+ paddr_t bank_size;
+
+ /*
+ * Enforce a maximum chunk cap of 128MB for all allocations
+ * except Bank 0
+ */
+ if ( !is_bank0 && max_order > max_chunk_order )
+ max_order = max_chunk_order;
+
+ for ( order = max_order; ; order-- )
+ {
+ unsigned int memflags = MEMF_node(node);
+ if ( !dom0_affinity_relaxed )
+ memflags |= MEMF_exact_node;
+
+ if ( is_bank0 )
+ {
+ unsigned int bits;
+ for ( bits = order; bits <= lowmem_bitsize; bits++ )
+ {
+ pg = alloc_domheap_pages(d, order, memflags | MEMF_bits(bits));
+ if ( pg != NULL )
+ break;
+ }
+
+ if ( !pg && order <= min_bank0_order )
+ {
+ if ( is_32bit_domain(d) )
+ panic("Unable to allocate first memory bank below %u-bit\n", lowmem_bitsize);
+
+ pg = alloc_domheap_pages(d, order, memflags);
+ }
+ }
+ else
+ {
+ pg = alloc_domheap_pages(d, order, memflags);
+ }
+
+ if ( pg || order == 0 )
+ break;
+ }
+
+ if ( !pg )
+ {
+ node_set(node, exhausted_nodes);
+ break;
+ }
+
+ if ( is_bank0 )
+ is_bank0 = false;
+
+ if ( !insert_11_bank(d, kinfo, pg, order, node) )
+ break;
+
+ bank_size = 1ULL << (PAGE_SHIFT + order);
+ node_todo -= bank_size;
+ todo -= bank_size;
+
+ if ( todo == 0 ) break;
}
+
+ nodes_left--;
+ if ( todo == 0 ) break;
}
/*
- * Success, next time around try again to get the largest order
- * allocation possible.
+ * Prevent infinite loop if a full pass across all active nodes
+ * yields zero progress
*/
- order = get_allocation_size(kinfo->unassigned_mem);
+ if ( todo == last_todo )
+ break;
}
if ( kinfo->unassigned_mem )
@@ -464,14 +488,15 @@ static void __init allocate_memory_11(struct domain *d,
panic("Failed to allocate requested dom0 memory. %ldMB unallocated\n",
(unsigned long)kinfo->unassigned_mem >> 20);
- for( i = 0; i < mem->nr_banks; i++ )
+ for ( i = 0; i < mem->nr_banks; i++ )
{
- printk("BANK[%d] %#"PRIpaddr"-%#"PRIpaddr" (%ldMB)\n",
+ printk("BANK[%d] %#"PRIpaddr"-%#"PRIpaddr" (%ldMB) NODE:%u\n",
i,
mem->bank[i].start,
mem->bank[i].start + mem->bank[i].size,
/* Don't want format this as PRIpaddr (16 digit hex) */
- (unsigned long)(mem->bank[i].size >> 20));
+ (unsigned long)(mem->bank[i].size >> 20),
+ get_numa_nodeid(&mem->bank[i]));
}
}
--
2.43.0
|
![]() |
Lists.xenproject.org is hosted with RackSpace, monitoring our |