
[Xen-devel] [PATCH v2 3/3] xen: use idle vcpus to scrub pages

To cope with heavy contention on the heap lock, use two per-cpu lists
(a rough standalone model follows this list):
 - Delist a batch of pages from the _heap[] free lists onto a per-cpu list.
 - Scrub the pages on that per-cpu list and move them to a second per-cpu
   free list.
 - Return those clean pages to _heap[], merging with adjacent chunks where
   possible.
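
To make the intended flow concrete, here is a rough standalone userspace
model of the three steps. It is an illustration only, not part of the patch
or of Xen: a plain singly-linked list stands in for page_list_head, a
pthread mutex for heap_lock, memset() for scrub_one_page(), __thread
variables for the per-cpu lists, and BATCH for (1 << SCRUB_BATCH_ORDER).
The real code in the diff below additionally handles buddy orders, NUMA
nodes, softirq preemption and sibling hyperthreads.

/*
 * Standalone model of the batched-scrub flow -- NOT Xen code.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MODEL_PAGE_SIZE 4096
#define BATCH           8       /* pages moved per heap_lock acquisition */

struct page {                   /* stands in for struct page_info */
    struct page *next;
    unsigned char data[MODEL_PAGE_SIZE];
};

static struct page *heap_dirty;     /* global free pages, still dirty   */
static struct page *heap_clean;     /* global free pages, ready to use  */
static pthread_mutex_t heap_lock = PTHREAD_MUTEX_INITIALIZER;

/* Per-"CPU" (here: per-thread) intermediate lists. */
static __thread struct page *scrub_list;    /* delisted, not yet scrubbed */
static __thread struct page *free_list;     /* scrubbed, not yet returned */

static void scrub_free_pages_model(void)
{
    struct page *pg;
    unsigned int n = 0;

    /* 1. Delist a batch of dirty pages under the lock. */
    pthread_mutex_lock(&heap_lock);
    while ( heap_dirty != NULL && n++ < BATCH )
    {
        pg = heap_dirty;
        heap_dirty = pg->next;
        pg->next = scrub_list;
        scrub_list = pg;
    }
    pthread_mutex_unlock(&heap_lock);

    /* 2. Scrub the private list without holding the lock. */
    while ( (pg = scrub_list) != NULL )
    {
        scrub_list = pg->next;
        memset(pg->data, 0, MODEL_PAGE_SIZE);   /* the actual "scrub" */
        pg->next = free_list;
        free_list = pg;
    }

    /* 3. Return the clean pages in one short critical section. */
    pthread_mutex_lock(&heap_lock);
    while ( (pg = free_list) != NULL )
    {
        free_list = pg->next;
        pg->next = heap_clean;
        heap_clean = pg;
    }
    pthread_mutex_unlock(&heap_lock);
}

int main(void)
{
    unsigned int i, clean = 0;
    struct page *pg;

    /* Populate the dirty heap with a few pages and run one pass. */
    for ( i = 0; i < BATCH; i++ )
    {
        pg = calloc(1, sizeof(*pg));
        if ( pg == NULL )
            return 1;
        pg->next = heap_dirty;
        heap_dirty = pg;
    }

    scrub_free_pages_model();

    for ( pg = heap_clean; pg != NULL; pg = pg->next )
        clean++;
    printf("clean pages after one pass: %u\n", clean);
    return 0;
}

The point of the two per-cpu lists is that the expensive scrubbing happens
entirely outside the lock; the heap lock is only held to move batches of
pages on and off the global lists.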

v2:
 * Avoid having two hyperthreads of the same core scrubbing at the same time
 * Limit each pass to (1<<SCRUB_BATCH_ORDER) pages on the per-cpu list
   (see the splitting sketch after this list)
 * Don't spin on the heap lock when there is nothing to scrub
 * Partially NUMA-aware
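
The batch limit relies on splitting over-sized free chunks. The following is
a hypothetical standalone demo (again not part of the patch, not Xen code) of
just that arithmetic: when a free chunk has order > SCRUB_BATCH_ORDER, the
loop puts the leading 2^(order-1), 2^(order-2), ... buddies back on the heap
and keeps only the trailing 2^SCRUB_BATCH_ORDER pages for scrubbing.

/* Standalone demo of the chunk-splitting arithmetic -- NOT Xen code. */
#include <stdio.h>

#define SCRUB_BATCH_ORDER 12

int main(void)
{
    unsigned int order = 15;     /* example: a 2^15-page free chunk   */
    unsigned long pfn = 0;       /* pretend the chunk starts at pfn 0 */
    unsigned int i = order;

    while ( i != SCRUB_BATCH_ORDER )
    {
        i--;                     /* split off the leading 2^i pages   */
        printf("put back on heap: pfn %lu..%lu (order %u)\n",
               pfn, pfn + (1UL << i) - 1, i);
        pfn += 1UL << i;         /* continue with the trailing part   */
    }
    printf("keep for scrubbing: pfn %lu..%lu (order %u)\n",
           pfn, pfn + (1UL << i) - 1, i);
    return 0;
}

For order 15 this puts back chunks of order 14, 13 and 12 and keeps the last
2^12 pages, mirroring the "while ( i != SCRUB_BATCH_ORDER )" loop in the
patch below.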

Signed-off-by: Bob Liu <bob.liu@xxxxxxxxxx>
---
 xen/arch/arm/domain.c   |    1 +
 xen/arch/x86/domain.c   |    1 +
 xen/common/page_alloc.c |  130 +++++++++++++++++++++++++++++++++++++++++++++++
 xen/include/xen/mm.h    |    1 +
 4 files changed, 133 insertions(+)

diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index 04d0cd0..b6bc3ac 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -44,6 +44,7 @@ void idle_loop(void)
         if ( cpu_is_offline(smp_processor_id()) )
             stop_cpu();
 
+        scrub_free_pages();
         local_irq_disable();
         if ( cpu_is_haltable(smp_processor_id()) )
         {
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index e896210..e8d4fe7 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -116,6 +116,7 @@ static void idle_loop(void)
     {
         if ( cpu_is_offline(smp_processor_id()) )
             play_dead();
+        scrub_free_pages();
         (*pm_idle)();
         do_tasklet();
         do_softirq();
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index ab293c8..6ab1d1d 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -86,6 +86,12 @@ PAGE_LIST_HEAD(page_offlined_list);
 /* Broken page list, protected by heap_lock. */
 PAGE_LIST_HEAD(page_broken_list);
 
+/* A rough flag to indicate whether a node has pages that need scrubbing */
+static bool_t node_need_scrub[MAX_NUMNODES];
+static DEFINE_PER_CPU(bool_t, is_scrubbing);
+static DEFINE_PER_CPU(struct page_list_head, scrub_list_cpu);
+static DEFINE_PER_CPU(struct page_list_head, free_list_cpu);
+
 /*************************
  * BOOT-TIME ALLOCATOR
  */
@@ -948,6 +954,7 @@ static void free_heap_pages(
     {
         if ( !tainted )
         {
+            node_need_scrub[node] = 1;
             for ( i = 0; i < (1 << order); i++ )
                 pg[i].count_info |= PGC_need_scrub;
         }
@@ -1525,7 +1532,130 @@ void __init scrub_heap_pages(void)
     setup_low_mem_virq();
 }
 
+#define SCRUB_BATCH_ORDER 12
+static void __scrub_free_pages(unsigned int node, unsigned int cpu)
+{
+    struct page_info *pg, *tmp;
+    unsigned int i;
+    int order;
+    struct page_list_head *local_scrub_list = &this_cpu(scrub_list_cpu);
+    struct page_list_head *local_free_list = &this_cpu(free_list_cpu);
+
+    /* Scrub percpu list */
+    while ( !page_list_empty(local_scrub_list) )
+    {
+        pg = page_list_remove_head(local_scrub_list);
+        ASSERT( pg && PFN_ORDER(pg) <= SCRUB_BATCH_ORDER );
+        order = PFN_ORDER(pg);
+        for ( i = 0; i < (1 << order); i++ )
+        {
+            ASSERT( test_bit(_PGC_need_scrub, &pg[i].count_info) );
+            scrub_one_page(&pg[i]);
+        }
+        page_list_add_tail(pg, local_free_list);
+        if ( softirq_pending(cpu) )
+            return;
+    }
+
+    /* Free pages on the per-cpu free list back to the heap */
+    if ( !page_list_empty(local_free_list) )
+    {
+        spin_lock(&heap_lock);
+        page_list_for_each_safe( pg, tmp, local_free_list )
+        {
+            order = PFN_ORDER(pg);
+            page_list_del(pg, local_free_list);
+            for ( i = 0; i < (1 << order); i++ )
+            {
+                pg[i].count_info |= PGC_state_free;
+                pg[i].count_info &= ~PGC_need_scrub;
+            }
+            merge_free_trunks(pg, order, node, page_to_zone(pg), 0);
+        }
+        spin_unlock(&heap_lock);
+    }
+}
+
+void scrub_free_pages(void)
+{
+    int order;
+    struct page_info *pg, *tmp;
+    unsigned int i, zone, nr_delisted = 0;
+    unsigned int cpu = smp_processor_id();
+    unsigned int node = cpu_to_node(cpu);
+    struct page_list_head *local_scrub_list = &this_cpu(scrub_list_cpu);
+
+    /* Return if our sibling already started scrubbing */
+    for_each_cpu( i, per_cpu(cpu_sibling_mask, cpu) )
+        if ( per_cpu(is_scrubbing, i) )
+            return;
+    this_cpu(is_scrubbing) = 1;
+
+    while ( !softirq_pending(cpu) )
+    {
+        if ( !node_need_scrub[node] )
+        {
+            /* Free local per cpu list before we exit */
+            __scrub_free_pages(node, cpu);
+            goto out;
+        }
+
+        /* Delist a batch of dirty pages from the heap free lists */
+        if ( page_list_empty(local_scrub_list) )
+        {
+            spin_lock(&heap_lock);
+            for ( zone = 0; zone < NR_ZONES; zone++ )
+            {
+                for ( order = MAX_ORDER; order >= 0; order-- )
+                {
+                    page_list_for_each_safe( pg, tmp, &heap(node, zone, order) )
+                    {
+                        if ( !test_bit(_PGC_need_scrub, &(pg->count_info)) )
+                            continue;
+
+                        page_list_del( pg, &heap(node, zone, order) );
+                        if ( order > SCRUB_BATCH_ORDER )
+                        {
+                            /* Put back the extra leading sub-chunks */
+                            i = order;
+                            while ( i != SCRUB_BATCH_ORDER )
+                            {
+                                PFN_ORDER(pg) = --i;
+                                page_list_add_tail(pg, &heap(node, zone, i));
+                                pg += 1 << i;
+                            }
+                            PFN_ORDER(pg) = SCRUB_BATCH_ORDER;
+                        }
+
+                        for ( i = 0; i < (1 << PFN_ORDER(pg)); i++ )
+                        {
+                            ASSERT( test_bit(_PGC_need_scrub, &pg[i].count_info) );
+                            ASSERT( !test_bit(_PGC_broken, &pg[i].count_info) );
+                            mark_page_offline(&pg[i], 0);
+                        }
+                        page_list_add_tail(pg, local_scrub_list);
+                        nr_delisted += ( 1 << PFN_ORDER(pg) );
+                        if ( nr_delisted >= (1 << SCRUB_BATCH_ORDER) )
+                        {
+                            nr_delisted = 0;
+                            spin_unlock(&heap_lock);
+                            goto start_scrub;
+                        }
+                    }
+                }
+            }
+
+            node_need_scrub[node] = 0;
+            spin_unlock(&heap_lock);
+        }
 
+ start_scrub:
+        __scrub_free_pages(node, cpu);
+    }
+
+ out:
+    this_cpu(is_scrubbing) = 0;
+}
 
 /*************************
  * XEN-HEAP SUB-ALLOCATOR
diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h
index b183189..1fa8c3d 100644
--- a/xen/include/xen/mm.h
+++ b/xen/include/xen/mm.h
@@ -78,6 +78,7 @@ int query_page_offline(unsigned long mfn, uint32_t *status);
 unsigned long total_free_pages(void);
 
 void scrub_heap_pages(void);
+void scrub_free_pages(void);
 
 int assign_pages(
     struct domain *d,
-- 
1.7.10.4