[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH 3/3] x86: introduce and use scratch CPU mask

To: "xen-devel" <xen-devel@xxxxxxxxxxxxxxxxxxxx>
From: "Jan Beulich" <JBeulich@xxxxxxxx>
Date: Thu, 08 Dec 2016 09:02:04 -0700
Cc: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
Delivery-date: Thu, 08 Dec 2016 16:02:12 +0000
List-id: Xen developer discussion <xen-devel.lists.xen.org>

__get_page_type(), so far using an on-stack CPU mask variable, is
involved in the recursion when e.g. pinning page tables. This means
there may be up to five instances of the function active at a time,
implying five instances of the (up to 512 bytes large) CPU mask
variable. With an IRQ happening at the deepest point of the stack, and
with send_guest_pirq() being called from there (leading to vcpu_kick()
-> ... -> csched_vcpu_wake() -> __runq_tickle() ->
cpumask_raise_softirq(), the last two of which also have CPU mask
variables on their stacks), this has been observed to cause a stack
overflow with a 4095-pCPU build.

Introduce a per-CPU variable instead, which can then be used by any
code never running in IRQ context.

The mask can then also be used by other MMU code as well as by
msi_compose_msg() (and quite likely we'll find further uses down the
road).

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>

--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -2477,6 +2477,7 @@ static int __get_page_type(struct page_i
     int rc = 0, iommu_ret = 0;
 
     ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
+    ASSERT(!in_irq());
 
     for ( ; ; )
     {
@@ -2509,20 +2510,21 @@ static int __get_page_type(struct page_i
                  * may be unnecessary (e.g., page was GDT/LDT) but those 
                  * circumstances should be very rare.
                  */
-                cpumask_t mask;
+                cpumask_t *mask = this_cpu(scratch_cpumask);
 
-                cpumask_copy(&mask, d->domain_dirty_cpumask);
+                BUG_ON(in_irq());
+                cpumask_copy(mask, d->domain_dirty_cpumask);
 
                 /* Don't flush if the timestamp is old enough */
-                tlbflush_filter(&mask, page->tlbflush_timestamp);
+                tlbflush_filter(mask, page->tlbflush_timestamp);
 
-                if ( unlikely(!cpumask_empty(&mask)) &&
+                if ( unlikely(!cpumask_empty(mask)) &&
                      /* Shadow mode: track only writable pages. */
                      (!shadow_mode_enabled(page_get_owner(page)) ||
                       ((nx & PGT_type_mask) == PGT_writable_page)) )
                 {
                     perfc_incr(need_flush_tlb_flush);
-                    flush_tlb_mask(&mask);
+                    flush_tlb_mask(mask);
                 }
 
                 /* We lose existing type and validity. */
@@ -3404,22 +3406,22 @@ long do_mmuext_op(
         case MMUEXT_TLB_FLUSH_MULTI:
         case MMUEXT_INVLPG_MULTI:
         {
-            cpumask_t pmask;
+            cpumask_t *mask = this_cpu(scratch_cpumask);
 
             if ( unlikely(d != pg_owner) )
                 rc = -EPERM;
             else if ( unlikely(vcpumask_to_pcpumask(d,
                                    guest_handle_to_param(op.arg2.vcpumask,
                                                          const_void),
-                                   &pmask)) )
+                                   mask)) )
                 rc = -EINVAL;
             if ( unlikely(rc) )
                 break;
 
             if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
-                flush_tlb_mask(&pmask);
+                flush_tlb_mask(mask);
             else if ( __addr_ok(op.arg1.linear_addr) )
-                flush_tlb_one_mask(&pmask, op.arg1.linear_addr);
+                flush_tlb_one_mask(mask, op.arg1.linear_addr);
             break;
         }
 
@@ -3457,14 +3459,14 @@ long do_mmuext_op(
             else if ( likely(cache_flush_permitted(d)) )
             {
                 unsigned int cpu;
-                cpumask_t mask;
+                cpumask_t *mask = this_cpu(scratch_cpumask);
 
-                cpumask_clear(&mask);
+                cpumask_clear(mask);
                 for_each_online_cpu(cpu)
-                    if ( !cpumask_intersects(&mask,
+                    if ( !cpumask_intersects(mask,
                                              per_cpu(cpu_sibling_mask, cpu)) )
-                        __cpumask_set_cpu(cpu, &mask);
-                flush_mask(&mask, FLUSH_CACHE);
+                        __cpumask_set_cpu(cpu, mask);
+                flush_mask(mask, FLUSH_CACHE);
             }
             else
             {
@@ -4460,7 +4462,7 @@ static int __do_update_va_mapping(
     struct page_info *gl1pg;
     l1_pgentry_t  *pl1e;
     unsigned long  bmap_ptr, gl1mfn;
-    cpumask_t      pmask;
+    cpumask_t     *mask = NULL;
     int            rc;
 
     perfc_incr(calls_to_update_va);
@@ -4506,15 +4508,17 @@ static int __do_update_va_mapping(
             flush_tlb_local();
             break;
         case UVMF_ALL:
-            flush_tlb_mask(d->domain_dirty_cpumask);
+            mask = d->domain_dirty_cpumask;
             break;
         default:
+            mask = this_cpu(scratch_cpumask);
             rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
                                                                      void),
-                                      &pmask);
-            flush_tlb_mask(&pmask);
+                                      mask);
             break;
         }
+        if ( mask )
+            flush_tlb_mask(mask);
         break;
 
     case UVMF_INVLPG:
@@ -4524,15 +4528,17 @@ static int __do_update_va_mapping(
             paging_invlpg(v, va);
             break;
         case UVMF_ALL:
-            flush_tlb_one_mask(d->domain_dirty_cpumask, va);
+            mask = d->domain_dirty_cpumask;
             break;
         default:
+            mask = this_cpu(scratch_cpumask);
             rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
                                                                      void),
-                                      &pmask);
-            flush_tlb_one_mask(&pmask, va);
+                                      mask);
             break;
         }
+        if ( mask )
+            flush_tlb_one_mask(mask, va);
         break;
     }
 
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -40,7 +40,6 @@ static void __pci_disable_msix(struct ms
 /* bitmap indicate which fixed map is free */
 static DEFINE_SPINLOCK(msix_fixmap_lock);
 static DECLARE_BITMAP(msix_fixmap_pages, FIX_MSIX_MAX_PAGES);
-static DEFINE_PER_CPU(cpumask_var_t, scratch_mask);
 
 static int msix_fixmap_alloc(void)
 {
@@ -167,7 +166,7 @@ void msi_compose_msg(unsigned vector, co
 
     if ( cpu_mask )
     {
-        cpumask_t *mask = this_cpu(scratch_mask);
+        cpumask_t *mask = this_cpu(scratch_cpumask);
 
         if ( !cpumask_intersects(cpu_mask, &cpu_online_map) )
             return;
@@ -1458,43 +1457,12 @@ int pci_restore_msi_state(struct pci_dev
     return 0;
 }
 
-static int msi_cpu_callback(
-    struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
-    unsigned int cpu = (unsigned long)hcpu;
-
-    switch ( action )
-    {
-    case CPU_UP_PREPARE:
-        if ( !alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) )
-            return notifier_from_errno(ENOMEM);
-        break;
-    case CPU_UP_CANCELED:
-    case CPU_DEAD:
-        free_cpumask_var(per_cpu(scratch_mask, cpu));
-        break;
-    default:
-        break;
-    }
-
-    return NOTIFY_DONE;
-}
-
-static struct notifier_block msi_cpu_nfb = {
-    .notifier_call = msi_cpu_callback
-};
-
 void __init early_msi_init(void)
 {
     if ( use_msi < 0 )
         use_msi = !(acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_MSI);
     if ( !use_msi )
         return;
-
-    register_cpu_notifier(&msi_cpu_nfb);
-    if ( msi_cpu_callback(&msi_cpu_nfb, CPU_UP_PREPARE, NULL) &
-         NOTIFY_STOP_MASK )
-        BUG();
 }
 
 static void dump_msi(unsigned char key)
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -56,6 +56,8 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t
 /* representing HT and core siblings of each logical CPU */
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_mask);
 
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, scratch_cpumask);
+
 cpumask_t cpu_online_map __read_mostly;
 EXPORT_SYMBOL(cpu_online_map);
 
@@ -646,6 +648,7 @@ static void cpu_smpboot_free(unsigned in
 
     free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
     free_cpumask_var(per_cpu(cpu_core_mask, cpu));
+    free_cpumask_var(per_cpu(scratch_cpumask, cpu));
 
     if ( per_cpu(stubs.addr, cpu) )
     {
@@ -734,7 +737,8 @@ static int cpu_smpboot_alloc(unsigned in
         goto oom;
 
     if ( zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) &&
-         zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) )
+         zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) &&
+         alloc_cpumask_var(&per_cpu(scratch_cpumask, cpu)) )
         return 0;
 
  oom:
@@ -791,7 +795,8 @@ void __init smp_prepare_cpus(unsigned in
         panic("No memory for socket CPU siblings map");
 
     if ( !zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, 0)) ||
-         !zalloc_cpumask_var(&per_cpu(cpu_core_mask, 0)) )
+         !zalloc_cpumask_var(&per_cpu(cpu_core_mask, 0)) ||
+         !alloc_cpumask_var(&per_cpu(scratch_cpumask, 0)) )
         panic("No memory for boot CPU sibling/core maps");
 
     set_cpu_sibling_map(0);
--- a/xen/include/asm-x86/smp.h
+++ b/xen/include/asm-x86/smp.h
@@ -25,6 +25,7 @@
  */
 DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_mask);
 DECLARE_PER_CPU(cpumask_var_t, cpu_core_mask);
+DECLARE_PER_CPU(cpumask_var_t, scratch_cpumask);
 
 void smp_send_nmi_allbutself(void);

Attachment: x86-scratch-cpumask.patch
Description: Text document

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
https://lists.xen.org/xen-devel

Follow-Ups:
- Re: [Xen-devel] [PATCH 3/3] x86: introduce and use scratch CPU mask
  - From: Andrew Cooper

References:
- [Xen-devel] [PATCH 0/3] misc cpumask adjustments
  - From: Jan Beulich

Prev by Date: Re: [Xen-devel] Xen 4.9 Development Update
Next by Date: Re: [Xen-devel] [PATCH] libelf: Fix div0 issues in elf_{shdr, phdr}_count()
Previous by thread: Re: [Xen-devel] Ping: Re: [PATCH 2/3] VT-d: correct dma_msi_set_affinity()
Next by thread: Re: [Xen-devel] [PATCH 3/3] x86: introduce and use scratch CPU mask
Index(es):
- Date
- Thread

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.