
[Xen-devel] scaling problem with writable pagetables



We started testing a Linux dom0 at up to 16-way (4 sockets, dual core, with HT) and began to see some serious scaling issues compared to scaling bare-metal Linux to 16-way. We took some profiles and saw that functions in xen/arch/x86/mm.c were using disproportionately more CPU time as we scaled up the number of CPUs. Taking a quick look at those functions (for example, do_update_va_mapping and do_mmu_update), it became fairly obvious that the locking probably does not scale: it appears we take a domain-wide lock in many of these functions. This, IMO, causes a serious problem when running an SMP domain which happens to page fault a lot.
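To illustrate what I mean (a simplified sketch of the rev8830 structure, not a verbatim quote; the real function bodies are elided):

int do_mmu_update(...)   /* same pattern in do_mmuext_op and do_update_va_mapping */
{
    struct domain *d = current->domain;
    ...
    LOCK_BIGLOCK(d);                  /* one big lock for the whole domain */
    cleanup_writable_pagetable(d);    /* flush pending writable-PT updates */
    /* ... the entire update loop: walking, validating and applying
     *     pagetable entries, all while holding the domain lock ... */
    UNLOCK_BIGLOCK(d);                /* held across the whole hypercall */
    return rc;
}

With 16 vcpus faulting and updating pagetables concurrently, every one of these hypercalls serializes against all the others in the same domain.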

So, I got to thinking: just how much protection do we really need in these functions? The guest OS should already provide quite a bit of protection for its own page table writes. Is Xen imposing additional, and possibly unnecessary, protection here?
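For example (a minimal guest-side sketch, assuming a 2.6-era XenLinux guest; simplified and hypothetical, not the actual code path), a PTE update that reaches do_mmu_update is typically already issued under the guest's own pagetable lock:

/* Guest (Linux) side, simplified: */
mmu_update_t req;

spin_lock(&mm->page_table_lock);     /* guest serializes its own PT writes */
req.ptr = virt_to_machine(ptep);     /* machine address of the PTE */
req.val = pte_val(new_pte);          /* new PTE value */
HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF);
spin_unlock(&mm->page_table_lock);

So, for a well-behaved guest, two vcpus should not be racing on the same pagetable entry anyway; the question is how much of Xen's locking protects Xen's own invariants versus merely duplicating the guest's.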

So I made some changes to when we lock/unlock in most of these functions in mm.c (patch attached). Warning: I am pretty much taking a shot in the dark here. I do not know this code nearly well enough to say this is the right thing to do. However, I can say without a doubt that the changes make a significant difference in performance.
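In essence, for do_mmu_update, do_mmuext_op and do_update_va_mapping the patch just moves the unlock up so the big lock covers only the writable-pagetable cleanup, and for do_set_gdt, do_update_descriptor and the arch_memory_op path it drops the lock entirely (simplified sketch):

LOCK_BIGLOCK(d);
cleanup_writable_pagetable(d);    /* still done under the lock */
UNLOCK_BIGLOCK(d);                /* moved up: dropped before the real work */
/* ... the update loop now runs without the domain-wide lock ... */

With those changes, throughput improves as follows: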

benchmark      throughput increase with lock reduction

SDET           19%
reaim_shared   65%
reaim_fserver  16%

Below are per-function ratios of CPU time, rev8830 / rev8830-lock-reduction (derived from oprofile diffs). A ratio above 1 means the function consumed that many times more CPU time without the lock reduction:

SDET:

9.84/1   restore_all_guest
1.45/1   mod_l1_entry
2.59/1   do_softirq
1.63/1   test_guest_events
1.09/1   syscall_enter
1.35/1   propagate_page_fault
1.18/1   process_guest_except
1.13/1   timer_softirq_action
1.04/1   alloc_page_type
1.05/1   revalidate_l1
1.08/1   do_set_segment_base
1.10/1   get_s_time
1.19/1   __context_switch
1.09/1   switch_to_kernel
1.11/1   FLT4
1.62/1   xen_l3_entry_update
1.27/1   xen_invlpg_mask


reaim_shared:

1.43/1   do_update_va_mapping
1.44/1   do_page_fault
1.47/1   do_mmu_update
6.75/1   restore_all_guest
1.43/1   do_mmuext_op
1.37/1   sedf_do_schedule
1.20/1   mod_l1_entry
2.46/1   do_softirq
1.27/1   t_timer_fn
1.34/1   do_set_segment_base
1.20/1   timer_softirq_action
1.24/1   process_guest_except
1.12/1   timer_interrupt
1.14/1   evtchn_send


reaim_fserver:

1.16/1   do_update_va_mapping
1.13/1   do_page_fault
8.41/1   restore_all_guest
1.17/1   do_mmu_update
1.56/1   mod_l1_entry
2.48/1   do_softirq
1.02/1   do_mmuext_op
1.14/1   sedf_do_schedule
1.12/1   t_timer_fn
1.23/1   do_set_segment_base
1.11/1   device_not_available
1.11/1   timer_softirq_action
1.13/1   process_guest_except
1.20/1   timer_interrupt
1.15/1   copy_from_user
1.11/1   propagate_page_fault


Any comments greatly appreciated.

-Andrew

Signed-off-by: habanero@xxxxxxxxxx
diff -Naurp xen-unstable.hg-8830/xen/arch/x86/mm.c xen-unstable.hg-8830-lockfix/xen/arch/x86/mm.c
--- xen-unstable.hg-8830/xen/arch/x86/mm.c      2006-02-15 16:30:31.000000000 -0600
+++ xen-unstable.hg-8830-lockfix/xen/arch/x86/mm.c      2006-02-15 16:32:48.000000000 -0600
@@ -1729,6 +1729,8 @@ int do_mmuext_op(
 
     cleanup_writable_pagetable(d);
 
+    UNLOCK_BIGLOCK(d);
+
     if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
     {
         count &= ~MMU_UPDATE_PREEMPTED;
@@ -1957,7 +1959,6 @@ int do_mmuext_op(
     if ( unlikely(pdone != NULL) )
         __put_user(done + i, pdone);
 
-    UNLOCK_BIGLOCK(d);
     return rc;
 }
 
@@ -1982,6 +1983,8 @@ int do_mmu_update(
 
     cleanup_writable_pagetable(d);
 
+    UNLOCK_BIGLOCK(d);
+
     if ( unlikely(shadow_mode_enabled(d)) )
         check_pagetable(v, "pre-mmu"); /* debug */
 
@@ -2206,7 +2209,6 @@ int do_mmu_update(
     if ( unlikely(shadow_mode_enabled(d)) )
         check_pagetable(v, "post-mmu"); /* debug */
 
-    UNLOCK_BIGLOCK(d);
     return rc;
 }
 
@@ -2503,6 +2505,8 @@ int do_update_va_mapping(unsigned long v
 
     cleanup_writable_pagetable(d);
 
+    UNLOCK_BIGLOCK(d);
+
     if ( unlikely(shadow_mode_enabled(d)) )
         check_pagetable(v, "pre-va"); /* debug */
 
@@ -2574,8 +2578,6 @@ int do_update_va_mapping(unsigned long v
 
     process_deferred_ops(cpu);
     
-    UNLOCK_BIGLOCK(d);
-
     return rc;
 }
 
@@ -2675,13 +2677,9 @@ long do_set_gdt(unsigned long *frame_lis
     if ( copy_from_user(frames, frame_list, nr_pages * sizeof(unsigned long)) )
         return -EFAULT;
 
-    LOCK_BIGLOCK(current->domain);
-
     if ( (ret = set_gdt(current, frames, entries)) == 0 )
         local_flush_tlb();
 
-    UNLOCK_BIGLOCK(current->domain);
-
     return ret;
 }
 
@@ -2700,21 +2698,18 @@ long do_update_descriptor(u64 pa, u64 de
 
     *(u64 *)&d = desc;
 
-    LOCK_BIGLOCK(dom);
 
     if ( !VALID_MFN(mfn = gmfn_to_mfn(dom, gmfn)) ||
          (((unsigned int)pa % sizeof(struct desc_struct)) != 0) ||
          !mfn_valid(mfn) ||
          !check_descriptor(&d) )
     {
-        UNLOCK_BIGLOCK(dom);
         return -EINVAL;
     }
 
     page = mfn_to_page(mfn);
     if ( unlikely(!get_page(page, dom)) )
     {
-        UNLOCK_BIGLOCK(dom);
         return -EINVAL;
     }
 
@@ -2760,8 +2755,6 @@ long do_update_descriptor(u64 pa, u64 de
  out:
     put_page(page);
 
-    UNLOCK_BIGLOCK(dom);
-
     return ret;
 }
 
@@ -2793,7 +2786,6 @@ long arch_memory_op(int op, void *arg)
             return -ESRCH;
         }
 
-        LOCK_BIGLOCK(d);
         if ( d->arch.first_reserved_pfn == 0 )
         {
             d->arch.first_reserved_pfn = pfn = d->max_pages;
@@ -2803,7 +2795,6 @@ long arch_memory_op(int op, void *arg)
                 guest_physmap_add_page(
                     d, pfn + 1 + i, gnttab_shared_mfn(d, d->grant_table, i));
         }
-        UNLOCK_BIGLOCK(d);
 
         xrpa.first_gpfn = d->arch.first_reserved_pfn;
         xrpa.nr_gpfns   = 32;