[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH] x86: Get rid of p2m_host array allocation for HVM guests



When allocating the guest memory for an HVM domain, libxc keeps the P2M
mapping for the entirety of the guest memory around for the time of the
launch as xc_dom_image->p2m_host. For guests that have a large memory
(3904 GiB), the p2m_host allocation takes more than 7.5 GiB of space, and
leaves xl susceptible to getting OOM-killed on guest creation.

Convert the p2m_host table lookups to an arch-specific function that
returns the mapping on-the-fly for x86 HVM guests to avoid this
allocation, bringing down xl's memory usage from > 8 GiB to < 70 MiB for
such launches.

Signed-off-by: Varad Gautam <vrd@xxxxxxxxx>

---

Applies to stable-4.11+.

 tools/libxc/include/xc_dom.h |  11 +++-
 tools/libxc/xc_dom_arm.c     |   2 +
 tools/libxc/xc_dom_core.c    |   4 +-
 tools/libxc/xc_dom_x86.c     | 126 ++++++++++++++++++++++++++++---------------
 4 files changed, 99 insertions(+), 44 deletions(-)

diff --git a/tools/libxc/include/xc_dom.h b/tools/libxc/include/xc_dom.h
index 8a66889..43abc0d 100644
--- a/tools/libxc/include/xc_dom.h
+++ b/tools/libxc/include/xc_dom.h
@@ -131,6 +131,9 @@ struct xc_dom_image {
      * a hybrid guest this means that it maps GPFNs to GPFNS.
      *
      * Note that the input is offset by rambase.
+     *
+     * This is not populated for guests that provide an arch-specific
+     * lookup hook in arch_hooks.
      */
     xen_pfn_t *p2m_host;
     void *p2m_guest;
@@ -274,6 +277,10 @@ struct xc_dom_arch {
     int arch_private_size;
 
     struct xc_dom_arch *next;
+
+    /* arch-specific p2m table lookup to get rid of the p2m_host array stored 
in
+     * xc_dom_image. */
+    xen_pfn_t (*p2m_host) (struct xc_dom_image *dom, unsigned long idx);
 };
 void xc_dom_register_arch_hooks(struct xc_dom_arch *hooks);
 
@@ -437,7 +444,9 @@ static inline xen_pfn_t xc_dom_p2m(struct xc_dom_image 
*dom, xen_pfn_t pfn)
         return pfn;
     if (pfn < dom->rambase_pfn || pfn >= dom->rambase_pfn + dom->total_pages)
         return INVALID_MFN;
-    return dom->p2m_host[pfn - dom->rambase_pfn];
+    return dom->arch_hooks->p2m_host ?
+            dom->arch_hooks->p2m_host(dom, pfn - dom->rambase_pfn)
+            : dom->p2m_host[pfn - dom->rambase_pfn];
 }
 
 #endif /* _XC_DOM_H */
diff --git a/tools/libxc/xc_dom_arm.c b/tools/libxc/xc_dom_arm.c
index 5b9eca6..b15c6d2 100644
--- a/tools/libxc/xc_dom_arm.c
+++ b/tools/libxc/xc_dom_arm.c
@@ -547,6 +547,7 @@ static struct xc_dom_arch xc_dom_32 = {
     .meminit = meminit,
     .bootearly = bootearly,
     .bootlate = bootlate,
+    .p2m_host = NULL,
 };
 
 static struct xc_dom_arch xc_dom_64 = {
@@ -563,6 +564,7 @@ static struct xc_dom_arch xc_dom_64 = {
     .meminit = meminit,
     .bootearly = bootearly,
     .bootlate = bootlate,
+    .p2m_host = NULL,
 };
 
 static void __init register_arch_hooks(void)
diff --git a/tools/libxc/xc_dom_core.c b/tools/libxc/xc_dom_core.c
index 9bd04cb..f3eaae3 100644
--- a/tools/libxc/xc_dom_core.c
+++ b/tools/libxc/xc_dom_core.c
@@ -985,7 +985,9 @@ int xc_dom_update_guest_p2m(struct xc_dom_image *dom)
                   __FUNCTION__, dom->p2m_size);
         p2m_32 = dom->p2m_guest;
         for ( i = 0; i < dom->p2m_size; i++ )
-            if ( dom->p2m_host[i] != INVALID_PFN )
+            if ( dom->arch_hooks->p2m_host )
+                p2m_32[i] = dom->arch_hooks->p2m_host(dom, i);
+            else if ( dom->p2m_host[i] != INVALID_PFN )
                 p2m_32[i] = dom->p2m_host[i];
             else
                 p2m_32[i] = (uint32_t) - 1;
diff --git a/tools/libxc/xc_dom_x86.c b/tools/libxc/xc_dom_x86.c
index 3ab918c..58f9894 100644
--- a/tools/libxc/xc_dom_x86.c
+++ b/tools/libxc/xc_dom_x86.c
@@ -101,6 +101,10 @@ struct xc_dom_image_x86 {
 #define MAPPING_MAX 2
     struct xc_dom_x86_mapping maps[MAPPING_MAX];
     struct xc_dom_params *params;
+
+    /* Used to fake vmemrange information in case vNUMA information was not 
provided. */
+    xen_vmemrange_t dummy_vmemrange[2];
+    unsigned int nr_dummy_vmemranges;
 };
 
 /* get guest IO ABI protocol */
@@ -1252,13 +1256,13 @@ static int meminit_hvm(struct xc_dom_image *dom)
     unsigned int memflags = 0;
     int claim_enabled = dom->claim_enabled;
     uint64_t total_pages;
-    xen_vmemrange_t dummy_vmemrange[2];
     unsigned int dummy_vnode_to_pnode[1];
     xen_vmemrange_t *vmemranges;
     unsigned int *vnode_to_pnode;
     unsigned int nr_vmemranges, nr_vnodes;
     xc_interface *xch = dom->xch;
     uint32_t domid = dom->guest_domid;
+    struct xc_dom_image_x86 *domx86 = dom->arch_private;
 
     if ( nr_pages > target_pages )
         memflags |= XENMEMF_populate_on_demand;
@@ -1274,25 +1278,26 @@ static int meminit_hvm(struct xc_dom_image *dom)
          * has no effect on the actual result.
          */
 
-        dummy_vmemrange[0].start = 0;
-        dummy_vmemrange[0].end   = dom->lowmem_end;
-        dummy_vmemrange[0].flags = 0;
-        dummy_vmemrange[0].nid   = 0;
-        nr_vmemranges = 1;
+        domx86->dummy_vmemrange[0].start = 0;
+        domx86->dummy_vmemrange[0].end   = dom->lowmem_end;
+        domx86->dummy_vmemrange[0].flags = 0;
+        domx86->dummy_vmemrange[0].nid   = 0;
+        domx86->nr_dummy_vmemranges = 1;
 
         if ( dom->highmem_end > (1ULL << 32) )
         {
-            dummy_vmemrange[1].start = 1ULL << 32;
-            dummy_vmemrange[1].end   = dom->highmem_end;
-            dummy_vmemrange[1].flags = 0;
-            dummy_vmemrange[1].nid   = 0;
+            domx86->dummy_vmemrange[1].start = 1ULL << 32;
+            domx86->dummy_vmemrange[1].end   = dom->highmem_end;
+            domx86->dummy_vmemrange[1].flags = 0;
+            domx86->dummy_vmemrange[1].nid   = 0;
 
-            nr_vmemranges++;
+            domx86->nr_dummy_vmemranges++;
         }
 
         dummy_vnode_to_pnode[0] = XC_NUMA_NO_NODE;
         nr_vnodes = 1;
-        vmemranges = dummy_vmemrange;
+        vmemranges = domx86->dummy_vmemrange;
+        nr_vmemranges = domx86->nr_dummy_vmemranges;
         vnode_to_pnode = dummy_vnode_to_pnode;
     }
     else
@@ -1329,25 +1334,6 @@ static int meminit_hvm(struct xc_dom_image *dom)
     }
 
     dom->p2m_size = p2m_size;
-    dom->p2m_host = xc_dom_malloc(dom, sizeof(xen_pfn_t) *
-                                      dom->p2m_size);
-    if ( dom->p2m_host == NULL )
-    {
-        DOMPRINTF("Could not allocate p2m");
-        goto error_out;
-    }
-
-    for ( i = 0; i < p2m_size; i++ )
-        dom->p2m_host[i] = ((xen_pfn_t)-1);
-    for ( vmemid = 0; vmemid < nr_vmemranges; vmemid++ )
-    {
-        uint64_t pfn;
-
-        for ( pfn = vmemranges[vmemid].start >> PAGE_SHIFT;
-              pfn < vmemranges[vmemid].end >> PAGE_SHIFT;
-              pfn++ )
-            dom->p2m_host[pfn] = pfn;
-    }
 
     /*
      * Try to claim pages for early warning of insufficient memory available.
@@ -1395,8 +1381,12 @@ static int meminit_hvm(struct xc_dom_image *dom)
      */
     if ( dom->device_model )
     {
+        xen_pfn_t pfn_batch[0xa0];
+        for ( i = 0; i < 0xa0; i++ )
+            pfn_batch[i] = dom->arch_hooks->p2m_host(dom, i);
+
         rc = xc_domain_populate_physmap_exact(
-            xch, domid, 0xa0, 0, memflags, &dom->p2m_host[0x00]);
+            xch, domid, 0xa0, 0, memflags, &pfn_batch[0x00]);
         if ( rc != 0 )
         {
             DOMPRINTF("Could not populate low memory (< 0xA0).\n");
@@ -1439,7 +1429,7 @@ static int meminit_hvm(struct xc_dom_image *dom)
             if ( count > max_pages )
                 count = max_pages;
 
-            cur_pfn = dom->p2m_host[cur_pages];
+            cur_pfn = dom->arch_hooks->p2m_host(dom, cur_pages);
 
             /* Take care the corner cases of super page tails */
             if ( ((cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1)) != 0) &&
@@ -1465,8 +1455,7 @@ static int meminit_hvm(struct xc_dom_image *dom)
                 xen_pfn_t sp_extents[nr_extents];
 
                 for ( i = 0; i < nr_extents; i++ )
-                    sp_extents[i] =
-                        dom->p2m_host[cur_pages+(i<<SUPERPAGE_1GB_SHIFT)];
+                    sp_extents[i] = dom->arch_hooks->p2m_host(dom, 
cur_pages+(i<<SUPERPAGE_1GB_SHIFT));
 
                 done = xc_domain_populate_physmap(xch, domid, nr_extents,
                                                   SUPERPAGE_1GB_SHIFT,
@@ -1505,8 +1494,7 @@ static int meminit_hvm(struct xc_dom_image *dom)
                     xen_pfn_t sp_extents[nr_extents];
 
                     for ( i = 0; i < nr_extents; i++ )
-                        sp_extents[i] =
-                            dom->p2m_host[cur_pages+(i<<SUPERPAGE_2MB_SHIFT)];
+                        sp_extents[i] = dom->arch_hooks->p2m_host(dom, 
cur_pages+(i<<SUPERPAGE_2MB_SHIFT));
 
                     done = xc_domain_populate_physmap(xch, domid, nr_extents,
                                                       SUPERPAGE_2MB_SHIFT,
@@ -1521,14 +1509,39 @@ static int meminit_hvm(struct xc_dom_image *dom)
                     }
                 }
             }
-
             /* Fall back to 4kB extents. */
             if ( count != 0 )
             {
-                rc = xc_domain_populate_physmap_exact(
-                    xch, domid, count, 0, new_memflags, 
&dom->p2m_host[cur_pages]);
-                cur_pages += count;
-                stat_normal_pages += count;
+                unsigned long nr_extents;
+                xen_pfn_t *pfn_batch;
+
+                pfn_batch = calloc(SUPERPAGE_1GB_NR_PFNS, sizeof(*pfn_batch));
+                if  ( !pfn_batch ) {
+                    DOMPRINTF("Could not allocate memory to construct physmap 
batch.");
+                    rc = -1;
+                    goto error_out;
+                }
+
+                while ( count > 0 ) {
+                    for ( i = 0; i < count && i < SUPERPAGE_1GB_NR_PFNS; i++)
+                        pfn_batch[i] = dom->arch_hooks->p2m_host(dom, 
cur_pages+i);
+
+                    nr_extents = count > SUPERPAGE_1GB_NR_PFNS ? 
SUPERPAGE_1GB_NR_PFNS : count;
+                    rc = xc_domain_populate_physmap_exact(xch, domid, 
nr_extents,
+                                                      0, new_memflags, 
&pfn_batch[0]);
+                    if ( rc != 0 ) {
+                        DOMPRINTF("Could not populate physmap batch.");
+                        free(pfn_batch);
+                        rc = -1;
+                        goto error_out;
+                    }
+
+                    stat_normal_pages += nr_extents;
+                    cur_pages += nr_extents;
+                    count -= nr_extents;
+                }
+
+                free(pfn_batch);
             }
         }
 
@@ -1780,6 +1793,31 @@ static int bootlate_hvm(struct xc_dom_image *dom)
     return 0;
 }
 
+static xen_pfn_t p2m_host_hvm(struct xc_dom_image *dom, unsigned long idx)
+{
+    struct xc_dom_image_x86 *domx86 = dom->arch_private;
+    xen_vmemrange_t *vmemranges;
+    unsigned int nr_vmemranges;
+    int vmemid;
+
+    if ( dom->nr_vmemranges ) {
+        vmemranges = dom->vmemranges;
+        nr_vmemranges = dom->nr_vmemranges;
+    } else {
+        vmemranges = domx86->dummy_vmemrange;
+        nr_vmemranges = domx86->nr_dummy_vmemranges;
+    }
+
+    for ( vmemid = 0; vmemid < nr_vmemranges ; vmemid++ ) {
+        if ( idx >= (vmemranges[vmemid].start >> XC_DOM_PAGE_SHIFT(dom))
+             && idx < (vmemranges[vmemid].end >> XC_DOM_PAGE_SHIFT(dom)) ) {
+            return idx;
+        }
+    }
+
+    return ((xen_pfn_t)-1);
+}
+
 bool xc_dom_translated(const struct xc_dom_image *dom)
 {
     /* HVM guests are translated.  PV guests are not. */
@@ -1805,6 +1843,7 @@ static struct xc_dom_arch xc_dom_32_pae = {
     .meminit = meminit_pv,
     .bootearly = bootearly,
     .bootlate = bootlate_pv,
+    .p2m_host = NULL,
 };
 
 static struct xc_dom_arch xc_dom_64 = {
@@ -1824,6 +1863,7 @@ static struct xc_dom_arch xc_dom_64 = {
     .meminit = meminit_pv,
     .bootearly = bootearly,
     .bootlate = bootlate_pv,
+    .p2m_host = NULL,
 };
 
 static struct xc_dom_arch xc_hvm_32 = {
@@ -1831,6 +1871,7 @@ static struct xc_dom_arch xc_hvm_32 = {
     .native_protocol = XEN_IO_PROTO_ABI_X86_32,
     .page_shift = PAGE_SHIFT_X86,
     .sizeof_pfn = 4,
+    .arch_private_size = sizeof(struct xc_dom_image_x86),
     .alloc_magic_pages = alloc_magic_pages_hvm,
     .alloc_pgtables = alloc_pgtables_hvm,
     .setup_pgtables = NULL,
@@ -1840,6 +1881,7 @@ static struct xc_dom_arch xc_hvm_32 = {
     .meminit = meminit_hvm,
     .bootearly = bootearly,
     .bootlate = bootlate_hvm,
+    .p2m_host = p2m_host_hvm,
 };
 
 static void __init register_arch_hooks(void)
-- 
2.7.4




Amazon Development Center Germany GmbH
Krausenstr. 38
10117 Berlin
Geschaeftsfuehrung: Christian Schlaeger, Ralf Herbrich
Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B
Sitz: Berlin
Ust-ID: DE 289 237 879




_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.