Xen project Mailing List

[Xen-devel] [PATCH v3] Xen: Spread boot time page scrubbing across all available CPU's

From: Malcolm Crossley <malcolm.crossley@xxxxxxxxxx> The page scrubbing is done in 128MB chunks in lockstep across all the CPU's. This allows for the boot CPU to hold the heap_lock whilst each chunk is being scrubbed and then release the heap_lock when all CPU's are finished scrubbing their individual chunk. This allows for the heap_lock to not be held continuously and for pending softirqs to be serviced periodically across all CPU's. The page scrub memory chunks are allocated to the CPU's in a NUMA aware fashion to reduce socket interconnect overhead and improve performance. Specifically in the first phase we scrub at the same time on all the NUMA nodes that have CPUs - we also weed out the SMT threads so that we only use cores (that gives a 50% boost). The second phase is to use all of the CPUs for the NUMA nodes that have no CPUs. This patch reduces the boot page scrub time on a 128GB 64 core AMD Opteron 6386 machine from 49 seconds to 3 seconds. On an IvyBridge-EX 8 socket box with 1.5TB it cuts it down from 15 minutes to 117 seconds. v2: - Reduced default chunk size to 128MB - Added code to scrub NUMA nodes with no active CPU linked to them - Be robust to boot CPU not being linked to a NUMA node v3: - Don't use SMT threads - Take care of remainder if the number of CPUs (or memory) is odd - Restructure the worker thread - s/u64/unsigned long/ Signed-off-by: Malcolm Crossley <malcolm.crossley@xxxxxxxxxx> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@xxxxxxxxxx> --- docs/misc/xen-command-line.markdown | 10 ++ xen/common/page_alloc.c | 177 +++++++++++++++++++++++++++++++---- 2 files changed, 167 insertions(+), 20 deletions(-) diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown index 87de2dc..a7da227 100644 --- a/docs/misc/xen-command-line.markdown +++ b/docs/misc/xen-command-line.markdown @@ -197,6 +197,16 @@ Scrub free RAM during boot. 
This is a safety feature to prevent accidentally leaking sensitive VM data into other VMs if Xen crashes and reboots. +### bootscrub_chunk_ +> `= <size>` + +> Default: `128MiB` + +Maximum RAM block size chunks to be scrubbed whilst holding the page heap lock +and not running softirqs. Reduce this if softirqs are not being run frequently +enough. Setting this to a high value may cause boot failure, particularly +if the NMI watchdog is also enabled. + ### cachesize > `= <size>` diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c index 601319c..3ad6e1d 100644 --- a/xen/common/page_alloc.c +++ b/xen/common/page_alloc.c @@ -65,6 +65,12 @@ static bool_t opt_bootscrub __initdata = 1; boolean_param("bootscrub", opt_bootscrub); /* + * bootscrub_blocksize -> Size (bytes) of mem block to scrub with heaplock held + */ +static unsigned int __initdata opt_bootscrub_chunk = 128 * 1024 * 1024; +size_param("bootscrub_chunk", opt_bootscrub_chunk); + +/* * Bit width of the DMA heap -- used to override NUMA-node-first. * allocation strategy, which can otherwise exhaust low memory. 
*/ @@ -90,6 +96,16 @@ static struct bootmem_region { } *__initdata bootmem_region_list; static unsigned int __initdata nr_bootmem_regions; +struct scrub_region { + unsigned long offset; + unsigned long start; + unsigned long per_cpu_sz; + unsigned long rem; + cpumask_t cpu; +}; +static struct scrub_region __initdata region[MAX_NUMNODES]; +static unsigned long __initdata chunk_size; + static void __init boot_bug(int line) { panic("Boot BUG at %s:%d", __FILE__, line); @@ -1256,45 +1272,166 @@ void __init end_boot_allocator(void) printk("\n"); } +void __init smp_scrub_heap_pages(void *data) +{ + unsigned long mfn, start, end; + struct page_info *pg; + struct scrub_region *r; + unsigned int temp_cpu, node, cpu_idx = 0; + unsigned int cpu = smp_processor_id(); + + if ( data ) + r = data; + else { + node = cpu_to_node(cpu); + if ( node == NUMA_NO_NODE ) + return; + r = &region[node]; + } + ASSERT(r != NULL); + + /* Determine the current CPU's index into CPU's linked to this node*/ + for_each_cpu ( temp_cpu, &r->cpu ) + { + if ( cpu == temp_cpu ) + break; + cpu_idx++; + } + + /* Calculate the starting mfn for this CPU's memory block */ + start = r->start + (r->per_cpu_sz * cpu_idx) + r->offset; + + /* Calculate the end mfn into this CPU's memory block for this iteration */ + if ( r->offset + chunk_size > r->per_cpu_sz ) { + end = r->start + (r->per_cpu_sz * cpu_idx) + r->per_cpu_sz; + if ( r->rem && ((cpumask_weight(&r->cpu) - 1 == cpu_idx )) ) + end += r->rem; + } + else + end = start + chunk_size; + + for ( mfn = start; mfn < end; mfn++ ) + { + pg = mfn_to_page(mfn); + + /* Check the mfn is valid and page is free. */ + if ( !mfn_valid(mfn) || !page_state_is(pg, free) ) + continue; + + scrub_one_page(pg); + } + wmb(); +} + /* - * Scrub all unallocated pages in all heap zones. This function is more - * convoluted than appears necessary because we do not want to continuously - * hold the lock while scrubbing very large memory areas. 
+ * Scrub all unallocated pages in all heap zones. This function uses all + * online cpu's to scrub the memory in parallel. */ void __init scrub_heap_pages(void) { - unsigned long mfn; - struct page_info *pg; + cpumask_t node_cpus, temp_cpus, all_worker_cpus = {{ 0 }}; + unsigned int i, j, cpu, sibling; + unsigned long offset, max_per_cpu_sz = 0; + unsigned long start, end; + unsigned long rem = 0; if ( !opt_bootscrub ) return; - printk("Scrubbing Free RAM: "); + /* Scrub block size */ + chunk_size = opt_bootscrub_chunk >> PAGE_SHIFT; + if ( chunk_size == 0 ) + chunk_size = 1; - for ( mfn = first_valid_mfn; mfn < max_page; mfn++ ) + /* Round #0 - figure out amounts and which CPUs to use */ + for_each_online_node ( i ) { + /* Calculate Node memory start and end address */ + start = max(node_start_pfn(i), first_valid_mfn); + end = min(node_start_pfn(i) + node_spanned_pages(i), max_page); + /* CPUs that are online and on this node (if none, that it is OK */ + cpumask_and(&node_cpus, &node_to_cpumask(i), &cpu_online_map); + cpumask_copy(&temp_cpus, &node_cpus); + /* Rip out threads. */ + for_each_cpu ( j, &temp_cpus ) + { + cpu = 0; + for_each_cpu(sibling, per_cpu(cpu_sibling_mask, j)) { + if (cpu++ == 0) /* Skip 1st CPU - the core */ + continue; + cpumask_clear_cpu(sibling, &node_cpus); + } + } + cpumask_or(&all_worker_cpus, &all_worker_cpus, &node_cpus); + if ( cpumask_empty(&node_cpus) ) { /* No CPUs on this node. 
*/ + rem = 0; + region[i].per_cpu_sz = (end - start); + } else { + rem = (end - start) % cpumask_weight(&node_cpus); + region[i].per_cpu_sz = (end - start) / cpumask_weight(&node_cpus); + if ( region[i].per_cpu_sz > max_per_cpu_sz ) + max_per_cpu_sz = region[i].per_cpu_sz; + } + region[i].start = start; + region[i].rem = rem; + cpumask_copy(&region[i].cpu, &node_cpus); + + } + cpu = smp_processor_id(); + /* Round default chunk size down if required */ + if ( max_per_cpu_sz && chunk_size > max_per_cpu_sz ) + chunk_size = max_per_cpu_sz; + + printk("Scrubbing Free RAM on %u nodes using %u CPUs: ", num_online_nodes(), + cpumask_weight(&all_worker_cpus)); + + /* Round: #1 - do NUMA nodes with CPUs */ + for ( offset = 0; offset < max_per_cpu_sz; offset += chunk_size ) + { + for_each_online_node ( i ) + region[i].offset = offset; + process_pending_softirqs(); - pg = mfn_to_page(mfn); + spin_lock(&heap_lock); + on_selected_cpus(&all_worker_cpus, smp_scrub_heap_pages, NULL, 1); + spin_unlock(&heap_lock); - /* Quick lock-free check. */ - if ( !mfn_valid(mfn) || !page_state_is(pg, free) ) + printk("."); + } + + /* Round #2: NUMA nodes with no CPUs get scrubbed with all CPUs. */ + for_each_online_node ( i ) + { + node_cpus = node_to_cpumask(i); + + if ( !cpumask_empty(&node_cpus) ) continue; - /* Every 100MB, print a progress dot. */ - if ( (mfn % ((100*1024*1024)/PAGE_SIZE)) == 0 ) - printk("."); + /* We already have the node information from round #0 */ + end = region[i].start + region[i].per_cpu_sz; + rem = region[i].per_cpu_sz % cpumask_weight(&all_worker_cpus); - spin_lock(&heap_lock); + region[i].rem = rem; + region[i].per_cpu_sz /= cpumask_weight(&all_worker_cpus); + max_per_cpu_sz = region[i].per_cpu_sz; + if ( max_per_cpu_sz && chunk_size > max_per_cpu_sz ) + chunk_size = max_per_cpu_sz; + cpumask_copy(&region[i].cpu, &all_worker_cpus); - /* Re-check page status with lock held. 
*/ - if ( page_state_is(pg, free) ) - scrub_one_page(pg); + for ( offset = 0; offset < max_per_cpu_sz; offset += chunk_size ) + { + region[i].offset = offset; - spin_unlock(&heap_lock); - } + process_pending_softirqs(); + + spin_lock(&heap_lock); + on_selected_cpus(&all_worker_cpus, smp_scrub_heap_pages, &region[i], 1); + spin_unlock(&heap_lock); - printk("done.\n"); + printk("."); + } + } /* Now that the heap is initialized, run checks and set bounds * for the low mem virq algorithm. */ -- 1.7.7.6 _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxx http://lists.xen.org/xen-devel

©2013 Xen Project, A Linux Foundation Collaborative Project. All Rights Reserved.
Linux Foundation is a registered trademark of The Linux Foundation.
Xen Project is a trademark of The Linux Foundation.