x86: introduce and use scratch CPU mask __get_page_type(), so far using an on-stack CPU mask variable, is involved in the recursion when e.g. pinning page tables. This means there may be up to five instances of the function active at a time, implying five instances of the (up to 512 bytes large) CPU mask variable. With an IRQ happening at the deepest point of the stack, and with send_guest_pirq() being called from there (leading to vcpu_kick() -> ... -> csched_vcpu_wake() -> __runq_tickle() -> cpumask_raise_softirq(), the last two of which also have CPU mask variables on their stacks), this has been observed to cause a stack overflow with a 4095-pCPU build. Introduce a per-CPU variable instead, which can then be used by any code never running in IRQ context. The mask can then also be used by other MMU code as well as by msi_compose_msg() (and quite likely we'll find further uses down the road). Signed-off-by: Jan Beulich --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -2477,6 +2477,7 @@ static int __get_page_type(struct page_i int rc = 0, iommu_ret = 0; ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2))); + ASSERT(!in_irq()); for ( ; ; ) { @@ -2509,20 +2510,21 @@ static int __get_page_type(struct page_i * may be unnecessary (e.g., page was GDT/LDT) but those * circumstances should be very rare. */ - cpumask_t mask; + cpumask_t *mask = this_cpu(scratch_cpumask); - cpumask_copy(&mask, d->domain_dirty_cpumask); + BUG_ON(in_irq()); + cpumask_copy(mask, d->domain_dirty_cpumask); /* Don't flush if the timestamp is old enough */ - tlbflush_filter(&mask, page->tlbflush_timestamp); + tlbflush_filter(mask, page->tlbflush_timestamp); - if ( unlikely(!cpumask_empty(&mask)) && + if ( unlikely(!cpumask_empty(mask)) && /* Shadow mode: track only writable pages. 
*/ (!shadow_mode_enabled(page_get_owner(page)) || ((nx & PGT_type_mask) == PGT_writable_page)) ) { perfc_incr(need_flush_tlb_flush); - flush_tlb_mask(&mask); + flush_tlb_mask(mask); } /* We lose existing type and validity. */ @@ -3404,22 +3406,22 @@ long do_mmuext_op( case MMUEXT_TLB_FLUSH_MULTI: case MMUEXT_INVLPG_MULTI: { - cpumask_t pmask; + cpumask_t *mask = this_cpu(scratch_cpumask); if ( unlikely(d != pg_owner) ) rc = -EPERM; else if ( unlikely(vcpumask_to_pcpumask(d, guest_handle_to_param(op.arg2.vcpumask, const_void), - &pmask)) ) + mask)) ) rc = -EINVAL; if ( unlikely(rc) ) break; if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI ) - flush_tlb_mask(&pmask); + flush_tlb_mask(mask); else if ( __addr_ok(op.arg1.linear_addr) ) - flush_tlb_one_mask(&pmask, op.arg1.linear_addr); + flush_tlb_one_mask(mask, op.arg1.linear_addr); break; } @@ -3457,14 +3459,14 @@ long do_mmuext_op( else if ( likely(cache_flush_permitted(d)) ) { unsigned int cpu; - cpumask_t mask; + cpumask_t *mask = this_cpu(scratch_cpumask); - cpumask_clear(&mask); + cpumask_clear(mask); for_each_online_cpu(cpu) - if ( !cpumask_intersects(&mask, + if ( !cpumask_intersects(mask, per_cpu(cpu_sibling_mask, cpu)) ) - __cpumask_set_cpu(cpu, &mask); - flush_mask(&mask, FLUSH_CACHE); + __cpumask_set_cpu(cpu, mask); + flush_mask(mask, FLUSH_CACHE); } else { @@ -4460,7 +4462,7 @@ static int __do_update_va_mapping( struct page_info *gl1pg; l1_pgentry_t *pl1e; unsigned long bmap_ptr, gl1mfn; - cpumask_t pmask; + cpumask_t *mask = NULL; int rc; perfc_incr(calls_to_update_va); @@ -4506,15 +4508,17 @@ static int __do_update_va_mapping( flush_tlb_local(); break; case UVMF_ALL: - flush_tlb_mask(d->domain_dirty_cpumask); + mask = d->domain_dirty_cpumask; break; default: + mask = this_cpu(scratch_cpumask); rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr, void), - &pmask); - flush_tlb_mask(&pmask); + mask); break; } + if ( mask ) + flush_tlb_mask(mask); break; case UVMF_INVLPG: @@ -4524,15 +4528,17 @@ static 
int __do_update_va_mapping( paging_invlpg(v, va); break; case UVMF_ALL: - flush_tlb_one_mask(d->domain_dirty_cpumask, va); + mask = d->domain_dirty_cpumask; break; default: + mask = this_cpu(scratch_cpumask); rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr, void), - &pmask); - flush_tlb_one_mask(&pmask, va); + mask); break; } + if ( mask ) + flush_tlb_one_mask(mask, va); break; } --- a/xen/arch/x86/msi.c +++ b/xen/arch/x86/msi.c @@ -40,7 +40,6 @@ static void __pci_disable_msix(struct ms /* bitmap indicate which fixed map is free */ static DEFINE_SPINLOCK(msix_fixmap_lock); static DECLARE_BITMAP(msix_fixmap_pages, FIX_MSIX_MAX_PAGES); -static DEFINE_PER_CPU(cpumask_var_t, scratch_mask); static int msix_fixmap_alloc(void) { @@ -167,7 +166,7 @@ void msi_compose_msg(unsigned vector, co if ( cpu_mask ) { - cpumask_t *mask = this_cpu(scratch_mask); + cpumask_t *mask = this_cpu(scratch_cpumask); if ( !cpumask_intersects(cpu_mask, &cpu_online_map) ) return; @@ -1458,43 +1457,12 @@ int pci_restore_msi_state(struct pci_dev return 0; } -static int msi_cpu_callback( - struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - - switch ( action ) - { - case CPU_UP_PREPARE: - if ( !alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) ) - return notifier_from_errno(ENOMEM); - break; - case CPU_UP_CANCELED: - case CPU_DEAD: - free_cpumask_var(per_cpu(scratch_mask, cpu)); - break; - default: - break; - } - - return NOTIFY_DONE; -} - -static struct notifier_block msi_cpu_nfb = { - .notifier_call = msi_cpu_callback -}; - void __init early_msi_init(void) { if ( use_msi < 0 ) use_msi = !(acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_MSI); if ( !use_msi ) return; - - register_cpu_notifier(&msi_cpu_nfb); - if ( msi_cpu_callback(&msi_cpu_nfb, CPU_UP_PREPARE, NULL) & - NOTIFY_STOP_MASK ) - BUG(); } static void dump_msi(unsigned char key) --- a/xen/arch/x86/smpboot.c +++ b/xen/arch/x86/smpboot.c @@ -56,6 +56,8 @@ 
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t /* representing HT and core siblings of each logical CPU */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_mask); +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, scratch_cpumask); + cpumask_t cpu_online_map __read_mostly; EXPORT_SYMBOL(cpu_online_map); @@ -646,6 +648,7 @@ static void cpu_smpboot_free(unsigned in free_cpumask_var(per_cpu(cpu_sibling_mask, cpu)); free_cpumask_var(per_cpu(cpu_core_mask, cpu)); + free_cpumask_var(per_cpu(scratch_cpumask, cpu)); if ( per_cpu(stubs.addr, cpu) ) { @@ -734,7 +737,8 @@ static int cpu_smpboot_alloc(unsigned in goto oom; if ( zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) && - zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) ) + zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) && + alloc_cpumask_var(&per_cpu(scratch_cpumask, cpu)) ) return 0; oom: @@ -791,7 +795,8 @@ void __init smp_prepare_cpus(unsigned in panic("No memory for socket CPU siblings map"); if ( !zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, 0)) || - !zalloc_cpumask_var(&per_cpu(cpu_core_mask, 0)) ) + !zalloc_cpumask_var(&per_cpu(cpu_core_mask, 0)) || + !alloc_cpumask_var(&per_cpu(scratch_cpumask, 0)) ) panic("No memory for boot CPU sibling/core maps"); set_cpu_sibling_map(0); --- a/xen/include/asm-x86/smp.h +++ b/xen/include/asm-x86/smp.h @@ -25,6 +25,7 @@ */ DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_mask); DECLARE_PER_CPU(cpumask_var_t, cpu_core_mask); +DECLARE_PER_CPU(cpumask_var_t, scratch_cpumask); void smp_send_nmi_allbutself(void);