[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH RFC] x86: enable RCU based table free when PARAVIRT



On x86, software page-table walkers depend on the fact that a remote TLB flush
sends an IPI: the walk is performed locklessly but with interrupts disabled, so
if a page table is being freed, the freeing CPU is blocked until the walker
re-enables interrupts, because the required remote TLB flush cannot complete
before then. On other architectures, which don't require an IPI to do a
remote TLB flush, we have an RCU-based mechanism (see
include/asm-generic/tlb.h for more details).

In virtualized environments we may want to override the .flush_tlb_others hook
in pv_mmu_ops and use a hypercall to ask the hypervisor to do the remote TLB
flush for us. This breaks the assumption about the IPI. Xen PV has been doing
this for years, and the upcoming remote TLB flush for Hyper-V will do it too.
This is not safe: software page-table walkers may step on an already freed page.

Solve the issue by enabling the RCU-based table free mechanism when PARAVIRT
is selected in the config (on SMP builds). Testing with kernbench doesn't show
any notable performance impact:

6-CPU host:

Average Half load -j 3 Run (std deviation):
CURRENT                                 HAVE_RCU_TABLE_FREE
=======                                 ===================
Elapsed Time 400.498 (0.179679)         Elapsed Time 399.909 (0.162853)
User Time 1098.72 (0.278536)            User Time 1097.59 (0.283894)
System Time 100.301 (0.201629)          System Time 99.736 (0.196254)
Percent CPU 299 (0)                     Percent CPU 299 (0)
Context Switches 5774.1 (69.2121)       Context Switches 5744.4 (79.4162)
Sleeps 87621.2 (78.1093)                Sleeps 87586.1 (99.7079)

Average Optimal load -j 24 Run (std deviation):
CURRENT                                 HAVE_RCU_TABLE_FREE
=======                                 ===================
Elapsed Time 219.03 (0.652534)          Elapsed Time 218.959 (0.598674)
User Time 1119.51 (21.3284)             User Time 1118.81 (21.7793)
System Time 100.499 (0.389308)          System Time 99.8335 (0.251423)
Percent CPU 432.5 (136.974)             Percent CPU 432.45 (136.922)
Context Switches 81827.4 (78029.5)      Context Switches 81818.5 (78051)
Sleeps 97124.8 (9822.4)                 Sleeps 97207.9 (9955.04)

16-CPU host:

Average Half load -j 8 Run (std deviation):
CURRENT                                 HAVE_RCU_TABLE_FREE
=======                                 ===================
Elapsed Time 213.538 (3.7891)           Elapsed Time 212.5 (3.10939)
User Time 1306.4 (1.83399)              User Time 1307.65 (1.01364)
System Time 194.59 (0.864378)           System Time 195.478 (0.794588)
Percent CPU 702.6 (13.5388)             Percent CPU 707 (11.1131)
Context Switches 21189.2 (1199.4)       Context Switches 21288.2 (552.388)
Sleeps 89390.2 (482.325)                Sleeps 89677 (277.06)

Average Optimal load -j 64 Run (std deviation):
CURRENT                                 HAVE_RCU_TABLE_FREE
=======                                 ===================
Elapsed Time 137.866 (0.787928)         Elapsed Time 138.438 (0.218792)
User Time 1488.92 (192.399)             User Time 1489.92 (192.135)
System Time 234.981 (42.5806)           System Time 236.09 (42.8138)
Percent CPU 1057.1 (373.826)            Percent CPU 1057.1 (369.114)
Context Switches 187514 (175324)        Context Switches 187358 (175060)
Sleeps 112633 (24535.5)                 Sleeps 111743 (23297.6)

Suggested-by: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Signed-off-by: Vitaly Kuznetsov <vkuznets@xxxxxxxxxx>
---
 arch/x86/Kconfig           |  1 +
 arch/x86/include/asm/tlb.h |  7 +++++++
 arch/x86/mm/pgtable.c      | 15 +++++++++++----
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 781521b7cf9e..9c1666ea04c9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -167,6 +167,7 @@ config X86
        select HAVE_PERF_REGS
        select HAVE_PERF_USER_STACK_DUMP
        select HAVE_REGS_AND_STACK_ACCESS_API
+       select HAVE_RCU_TABLE_FREE              if SMP && PARAVIRT
        select HAVE_RELIABLE_STACKTRACE         if X86_64 && FRAME_POINTER && STACK_VALIDATION
        select HAVE_STACK_VALIDATION            if X86_64
        select HAVE_SYSCALL_TRACEPOINTS
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index c7797307fc2b..1d074c560a48 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -15,4 +15,11 @@
 
 #include <asm-generic/tlb.h>
 
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+static inline void __tlb_remove_table(void *table)
+{
+       free_page_and_swap_cache(table);
+}
+#endif
+
 #endif /* _ASM_X86_TLB_H */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 508a708eb9a6..f9a3cdb9b574 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -52,11 +52,18 @@ static int __init setup_userpte(char *arg)
 }
 early_param("userpte", setup_userpte);
 
+#ifndef CONFIG_HAVE_RCU_TABLE_FREE
+static inline void tlb_remove_table(struct mmu_gather *tlb, void *table)
+{
+       return tlb_remove_page(tlb, table);
+}
+#endif
+
 void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
 {
        pgtable_page_dtor(pte);
        paravirt_release_pte(page_to_pfn(pte));
-       tlb_remove_page(tlb, pte);
+       tlb_remove_table(tlb, pte);
 }
 
 #if CONFIG_PGTABLE_LEVELS > 2
@@ -72,21 +79,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
        tlb->need_flush_all = 1;
 #endif
        pgtable_pmd_page_dtor(page);
-       tlb_remove_page(tlb, page);
+       tlb_remove_table(tlb, page);
 }
 
 #if CONFIG_PGTABLE_LEVELS > 3
 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
 {
        paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
-       tlb_remove_page(tlb, virt_to_page(pud));
+       tlb_remove_table(tlb, virt_to_page(pud));
 }
 
 #if CONFIG_PGTABLE_LEVELS > 4
 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
 {
        paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
-       tlb_remove_page(tlb, virt_to_page(p4d));
+       tlb_remove_table(tlb, virt_to_page(p4d));
 }
 #endif /* CONFIG_PGTABLE_LEVELS > 4 */
 #endif /* CONFIG_PGTABLE_LEVELS > 3 */
-- 
2.13.5


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
https://lists.xen.org/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.