
[PATCH v2 1/2] xen/mm: move adjustment of claimed pages counters on allocation


  • To: xen-devel@xxxxxxxxxxxxxxxxxxxx
  • From: Roger Pau Monne <roger.pau@xxxxxxxxxx>
  • Date: Wed, 24 Dec 2025 20:40:34 +0100
  • Cc: Roger Pau Monne <roger.pau@xxxxxxxxxx>, Andrew Cooper <andrew.cooper3@xxxxxxxxxx>, Anthony PERARD <anthony.perard@xxxxxxxxxx>, Michal Orzel <michal.orzel@xxxxxxx>, Jan Beulich <jbeulich@xxxxxxxx>, Julien Grall <julien@xxxxxxx>, Stefano Stabellini <sstabellini@xxxxxxxxxx>
  • Delivery-date: Wed, 24 Dec 2025 19:40:58 +0000
  • List-id: Xen developer discussion <xen-devel.lists.xenproject.org>

The current logic splits the update of the amount of available memory in
the system (total_avail_pages) and of the pending claims
(outstanding_claims) into two separately locked regions.  This leaves a
window between the two counter adjustments where
total_avail_pages - outstanding_claims doesn't reflect the real amount of
free memory, and can even yield a negative value because
total_avail_pages is updated ahead of outstanding_claims.
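
To make the window concrete, below is a stand-alone sketch of the
pre-patch ordering.  This is plain C with a pthread mutex standing in for
the heap_lock; the counter names mirror page_alloc.c, while the function
and parameter names are purely illustrative, not the actual Xen code:

#include <pthread.h>

static pthread_mutex_t heap_lock = PTHREAD_MUTEX_INITIALIZER;
static long total_avail_pages;
static unsigned long outstanding_claims;

static void alloc_side_pre_patch(unsigned long request,
                                 unsigned long *dom_outstanding)
{
    pthread_mutex_lock(&heap_lock);
    total_avail_pages -= request;           /* first locked region */
    pthread_mutex_unlock(&heap_lock);

    /*
     * Window: a concurrent reader computing
     *   free = total_avail_pages - outstanding_claims
     * here sees the allocation already subtracted, but the matching claim
     * not yet consumed, so "free" is under-counted and can go negative.
     */

    pthread_mutex_lock(&heap_lock);
    if ( *dom_outstanding )                 /* second locked region */
    {
        unsigned long consume = request < *dom_outstanding
                                ? request : *dom_outstanding;

        outstanding_claims -= consume;
        *dom_outstanding -= consume;
    }
    pthread_mutex_unlock(&heap_lock);
}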

Fix by adjusting outstanding_claims and d->outstanding_pages in the same
place where total_avail_pages is updated.  Note that accesses to
d->outstanding_pages are protected by the global heap_lock, just like
total_avail_pages or outstanding_claims.  Add a comment to the field
declaration, and also adjust the comment at the top of
domain_set_outstanding_pages() to be clearer in that regard.

Finally, since the claim is consumed ahead of the pages actually being
assigned to the domain, add logic to regain the claim in case
assign_page() fails.  Otherwise the pages would be freed and the claimed
amount lost.
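
For contrast, the post-patch ordering in the same stand-alone style,
reusing the declarations from the sketch above (again illustrative only;
in the real code the combined adjustment lives in alloc_heap_pages() and
the claim is handed back through free_heap_pages(), as in the hunks
below):

static void alloc_side_post_patch(unsigned long request,
                                  unsigned long *dom_outstanding,
                                  int assign_failed)
{
    unsigned long consumed = 0;

    pthread_mutex_lock(&heap_lock);
    total_avail_pages -= request;           /* single locked region: both */
    if ( *dom_outstanding )                 /* counters move together     */
    {
        consumed = request < *dom_outstanding
                   ? request : *dom_outstanding;
        outstanding_claims -= consumed;
        *dom_outstanding -= consumed;
    }
    pthread_mutex_unlock(&heap_lock);

    if ( assign_failed )
    {
        /* Hand back both the pages and the consumed claim on failure. */
        pthread_mutex_lock(&heap_lock);
        total_avail_pages += request;
        outstanding_claims += consumed;
        *dom_outstanding += consumed;
        pthread_mutex_unlock(&heap_lock);
    }
}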

Fixes: 65c9792df600 ("mmu: Introduce XENMEM_claim_pages (subop of memory ops)")
Signed-off-by: Roger Pau Monné <roger.pau@xxxxxxxxxx>
---
Changes since v1:
 - Regain the claim if allocated page cannot be assigned to the domain.
 - Adjust comments regarding d->outstanding_pages being protected by the
   heap_lock (instead of the d->page_alloc_lock).
---
 xen/common/page_alloc.c | 88 ++++++++++++++++++++++-------------------
 xen/include/xen/sched.h |  3 +-
 2 files changed, 50 insertions(+), 41 deletions(-)

diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 1f67b88a8933..b73f6e264514 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -515,30 +515,6 @@ unsigned long domain_adjust_tot_pages(struct domain *d, long pages)
     ASSERT(rspin_is_locked(&d->page_alloc_lock));
     d->tot_pages += pages;
 
-    /*
-     * can test d->outstanding_pages race-free because it can only change
-     * if d->page_alloc_lock and heap_lock are both held, see also
-     * domain_set_outstanding_pages below
-     */
-    if ( !d->outstanding_pages || pages <= 0 )
-        goto out;
-
-    spin_lock(&heap_lock);
-    BUG_ON(outstanding_claims < d->outstanding_pages);
-    if ( d->outstanding_pages < pages )
-    {
-        /* `pages` exceeds the domain's outstanding count. Zero it out. */
-        outstanding_claims -= d->outstanding_pages;
-        d->outstanding_pages = 0;
-    }
-    else
-    {
-        outstanding_claims -= pages;
-        d->outstanding_pages -= pages;
-    }
-    spin_unlock(&heap_lock);
-
-out:
     return d->tot_pages;
 }
 
@@ -548,9 +524,10 @@ int domain_set_outstanding_pages(struct domain *d, unsigned long pages)
     unsigned long claim, avail_pages;
 
     /*
-     * take the domain's page_alloc_lock, else all d->tot_page adjustments
-     * must always take the global heap_lock rather than only in the much
-     * rarer case that d->outstanding_pages is non-zero
+     * Two locks are needed here:
+     *  - d->page_alloc_lock: protects accesses to d->{tot,max,extra}_pages.
+     *  - heap_lock: protects accesses to d->outstanding_pages, total_avail_pages
+     *    and outstanding_claims.
      */
     nrspin_lock(&d->page_alloc_lock);
     spin_lock(&heap_lock);
@@ -995,11 +972,11 @@ static void init_free_page_fields(struct page_info *pg)
 static struct page_info *alloc_heap_pages(
     unsigned int zone_lo, unsigned int zone_hi,
     unsigned int order, unsigned int memflags,
-    struct domain *d)
+    struct domain *d, unsigned int *claimed)
 {
     nodeid_t node;
     unsigned int i, buddy_order, zone, first_dirty;
-    unsigned long request = 1UL << order;
+    unsigned int request = 1UL << order;
     struct page_info *pg;
     bool need_tlbflush = false;
     uint32_t tlbflush_timestamp = 0;
@@ -1012,6 +989,7 @@ static struct page_info *alloc_heap_pages(
     ASSERT(zone_lo <= zone_hi);
     ASSERT(zone_hi < NR_ZONES);
 
+    BUILD_BUG_ON(MAX_ORDER >= sizeof(request) * 8);
     if ( unlikely(order > MAX_ORDER) )
         return NULL;
 
@@ -1071,6 +1049,28 @@ static struct page_info *alloc_heap_pages(
     total_avail_pages -= request;
     ASSERT(total_avail_pages >= 0);
 
+    if ( d && d->outstanding_pages && !(memflags & MEMF_no_refcount) )
+    {
+        /*
+         * Adjust claims in the same locked region where total_avail_pages is
+         * adjusted; not doing so would lead to a window where the amount of
+         * free memory (avail - claimed) would be incorrect.
+         *
+         * Note that by adjusting the claimed amount here it's possible for
+         * pages to fail to be assigned to the claiming domain while already
+         * having been subtracted from d->outstanding_pages.  Such claimed
+         * amount is then lost, as the pages that fail to be assigned to the
+         * domain are freed without replenishing the claim.
+         */
+        unsigned int outstanding = min(d->outstanding_pages, request);
+
+        BUG_ON(outstanding > outstanding_claims);
+        outstanding_claims -= outstanding;
+        d->outstanding_pages -= outstanding;
+        ASSERT(claimed);
+        *claimed = outstanding;
+    }
+
     check_low_mem_virq();
 
     if ( d != NULL )
@@ -1518,7 +1518,8 @@ static void free_color_heap_page(struct page_info *pg, bool need_scrub);
 
 /* Free 2^@order set of pages. */
 static void free_heap_pages(
-    struct page_info *pg, unsigned int order, bool need_scrub)
+    struct domain *d, struct page_info *pg, unsigned int order, bool need_scrub,
+    unsigned int claim)
 {
     unsigned long mask;
     mfn_t mfn = page_to_mfn(pg);
@@ -1553,6 +1554,12 @@ static void free_heap_pages(
 
     avail[node][zone] += 1 << order;
     total_avail_pages += 1 << order;
+    if ( d && claim )
+    {
+        outstanding_claims += claim;
+        d->outstanding_pages += claim;
+    }
+
     if ( need_scrub )
     {
         node_need_scrub[node] += 1 << order;
@@ -1842,7 +1849,7 @@ int online_page(mfn_t mfn, uint32_t *status)
     spin_unlock(&heap_lock);
 
     if ( (y & PGC_state) == PGC_state_offlined )
-        free_heap_pages(pg, 0, false);
+        free_heap_pages(NULL, pg, 0, false, 0);
 
     return ret;
 }
@@ -1918,7 +1925,7 @@ static void _init_heap_pages(const struct page_info *pg,
 
         if ( s )
             inc_order = min(inc_order, ffsl(s) - 1U);
-        free_heap_pages(mfn_to_page(_mfn(s)), inc_order, need_scrub);
+        free_heap_pages(NULL, mfn_to_page(_mfn(s)), inc_order, need_scrub, 0);
         s += (1UL << inc_order);
     }
 }
@@ -2452,7 +2459,7 @@ void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
     ASSERT_ALLOC_CONTEXT();
 
     pg = alloc_heap_pages(MEMZONE_XEN, MEMZONE_XEN,
-                          order, memflags | MEMF_no_scrub, NULL);
+                          order, memflags | MEMF_no_scrub, NULL, NULL);
     if ( unlikely(pg == NULL) )
         return NULL;
 
@@ -2467,7 +2474,7 @@ void free_xenheap_pages(void *v, unsigned int order)
     if ( v == NULL )
         return;
 
-    free_heap_pages(virt_to_page(v), order, false);
+    free_heap_pages(NULL, virt_to_page(v), order, false, 0);
 }
 
 #else  /* !CONFIG_SEPARATE_XENHEAP */
@@ -2523,7 +2530,7 @@ void free_xenheap_pages(void *v, unsigned int order)
     for ( i = 0; i < (1u << order); i++ )
         pg[i].count_info &= ~PGC_xen_heap;
 
-    free_heap_pages(pg, order, true);
+    free_heap_pages(NULL, pg, order, true, 0);
 }
 
 #endif  /* CONFIG_SEPARATE_XENHEAP */
@@ -2656,7 +2663,7 @@ struct page_info *alloc_domheap_pages(
 {
     struct page_info *pg = NULL;
     unsigned int bits = memflags >> _MEMF_bits, zone_hi = NR_ZONES - 1;
-    unsigned int dma_zone;
+    unsigned int dma_zone, claimed = 0;
 
     ASSERT_ALLOC_CONTEXT();
 
@@ -2679,12 +2686,13 @@ struct page_info *alloc_domheap_pages(
     else if ( !dma_bitsize )
         memflags &= ~MEMF_no_dma;
     else if ( (dma_zone = bits_to_zone(dma_bitsize)) < zone_hi )
-        pg = alloc_heap_pages(dma_zone + 1, zone_hi, order, memflags, d);
+        pg = alloc_heap_pages(dma_zone + 1, zone_hi, order, memflags, d,
+                              &claimed);
 
     if ( (pg == NULL) &&
          ((memflags & MEMF_no_dma) ||
           ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi, order,
-                                  memflags, d)) == NULL)) )
+                                  memflags, d, &claimed)) == NULL)) )
          return NULL;
 
     if ( d && !(memflags & MEMF_no_owner) )
@@ -2701,7 +2709,7 @@ struct page_info *alloc_domheap_pages(
         }
         if ( assign_page(pg, order, d, memflags) )
         {
-            free_heap_pages(pg, order, memflags & MEMF_no_scrub);
+            free_heap_pages(d, pg, order, memflags & MEMF_no_scrub, claimed);
             return NULL;
         }
     }
@@ -2786,7 +2794,7 @@ void free_domheap_pages(struct page_info *pg, unsigned int order)
             scrub = 1;
         }
 
-        free_heap_pages(pg, order, scrub);
+        free_heap_pages(NULL, pg, order, scrub, 0);
     }
 
     if ( drop_dom_ref )
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 1f77e0869b5d..d922c908c29f 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -413,7 +413,8 @@ struct domain
     unsigned int     tot_pages;
 
     unsigned int     xenheap_pages;     /* pages allocated from Xen heap */
-    unsigned int     outstanding_pages; /* pages claimed but not possessed */
+    /* Pages claimed but not possessed, protected by global heap_lock. */
+    unsigned int     outstanding_pages;
     unsigned int     max_pages;         /* maximum value for domain_tot_pages() */
     unsigned int     extra_pages;       /* pages not included in domain_tot_pages() */
 
-- 
2.51.0