
[Xen-devel] [RFC/PATCH] Improve speed of mapping guest memory into Dom0



Whilst investigating why the "guest not pingable" time during migration is too long, I found that the rate at which guest memory is copied on a localhost migration is quite slow. I tracked down that a large portion of the time copying the guest memory was actually spent mapping guest pages into Dom0 memory space, and then found that there are two hypervisor calls for every page: one to map the page in, and one to flush the TLB. So with some work, I managed to get it to map a larger number of pages at once (the complication here is that Xen doesn't allocate machine physical pages "in order", which the original remap function expects).
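
As a rough sketch (simplified variable names, not the literal privcmd code), the pre-patch path boils down to the loop below, paying the map hypercall plus TLB flush once per page:

    for (i = 0; i < nr; i++) {
        /* One page per call: each call issues its own mmu_update
         * hypercall and then flushes the TLB. */
        rc = xen_remap_domain_mfn_range(vma, va, mfn[i], 1,
                                        vma->vm_page_prot, domid);
        if (rc < 0)
            err[i] = rc;
        va += PAGE_SIZE;
    }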

I originally prototyped this on a 2.6.32 kernel, and then ported it to the 3.7-rc kernel.

When microbenchmarking only the map/unmap of guest memory into Dom0, this gives a speed-up of around 10x for the mapping itself compared to the original 2.6 kernel, and a little less, around 7-8x, compared to unmodified 3.7. In clock cycles, mapping a block of 1024 pages with the new code takes around 650k cycles, where the 3.7 kernel takes about 5M and the 2.6 kernel around 7.6M.

Comparing total localhost migration time, this gives a 15% improvement on a lightly loaded (idle) guest with 1GB of memory, and more than 20% on a heavily loaded guest (aggressively dirtying 800MB of the 1GB of guest memory). These numbers compare "original" 3.7 against 3.7 with the patch below; 3.7 is already a little better than the 2.6 kernel in my benchmarks.

The basic principle of the change is to pass a "list" of mfns (a pointer to a sequence of mfns) to a new xen_remap_domain_mfn_list function, instead of using the existing xen_remap_domain_mfn_range.
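
With the new entry point, the same work becomes one call per block of mfns (again only a sketch with simplified names; per-page error codes come back through err_ptr):

    /* Map a whole block of (non-contiguous) mfns in one go; the PTE
     * updates are batched into much larger hypercalls and the TLB is
     * flushed once at the end. */
    rc = xen_remap_domain_mfn_list(vma, va, mfn, nr, err,
                                   vma->vm_page_prot, domid);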

This change should only affect the Xen parts of the kernel.

I have also found that the munmap() call used to unmap the guest memory from Dom0 is about 35% slower in the 3.7 kernel than in the 2.6 kernel (3.8M cycles vs 2.8M cycles). I think this could be made quicker by writing zero directly to the page-table entries rather than using the compare-exchange operation that is currently used (which traps into Xen to perform the compare and exchange). The compare-exchange is needed when the mapping is part of a mapped file whose dirty pages must be written back to backing storage, but for "we've mapped guest memory into Dom0" there is no need to atomically check whether a page is dirty: it shouldn't be dirty in the first place, as we only read from it. Unfortunately, this is generic kernel code, so I fear changes there will be hard to get approved. I have a feeling, however, that if the memory is not a writable memory-mapped file, writing zero to the page-table entries would be faster even in native code.
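
To make the idea concrete, here is a rough sketch (illustrative only, not a tested change) of the difference between the two ways of tearing down such a PTE:

    /* Roughly what the generic unmap path does today: atomically fetch
     * and clear the PTE so that a concurrently set dirty bit is not
     * lost; as described above, this traps into Xen on a PV domain. */
    old_pte = ptep_get_and_clear(vma->vm_mm, addr, ptep);

    /* For a read-only foreign mapping with no backing file to write
     * out, a plain clear (writing zero to the entry) should suffice: */
    pte_clear(vma->vm_mm, addr, ptep);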

--
Mats

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index dcf5f2d..b8c022c 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -2482,6 +2482,12 @@ struct remap_data {
     struct mmu_update *mmu_update;
 };

+struct remap_list_data {
+    unsigned long *mfn;
+    pgprot_t prot;
+    struct mmu_update *mmu_update;
+};
+
 static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
                  unsigned long addr, void *data)
 {
@@ -2495,6 +2501,19 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
     return 0;
 }

+static int remap_area_mfn_list_pte_fn(pte_t *ptep, pgtable_t token,
+                      unsigned long addr, void *data)
+{
+    struct remap_list_data *rmd = data;
+    pte_t pte = pte_mkspecial(pfn_pte(*rmd->mfn++, rmd->prot));
+
+    rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
+    rmd->mmu_update->val = pte_val_ma(pte);
+    rmd->mmu_update++;
+
+    return 0;
+}
+
 int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
                    unsigned long addr,
                    unsigned long mfn, int nr,
@@ -2542,3 +2561,77 @@ out:
     return err;
 }
 EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
+
+/* Like xen_remap_domain_mfn_range, but takes a list of mfns rather than
+ * a range of consecutive frames, since machine frames under Xen are not
+ * allocated contiguously. */
+int xen_remap_domain_mfn_list(struct vm_area_struct *vma,
+                  unsigned long addr,
+                  unsigned long *mfn, int nr,
+                  int *err_ptr,
+                  pgprot_t prot, unsigned domid)
+{
+    struct remap_list_data rmd;
+    struct mmu_update mmu_update[REMAP_BATCH_SIZE];
+    int batch;
+    int done;
+    unsigned long range;
+    int err = 0;
+
+    if (xen_feature(XENFEAT_auto_translated_physmap))
+        return -EINVAL;
+
+    prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
+
+    BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
+
+    rmd.mfn = mfn;
+    rmd.prot = prot;
+
+    while (nr) {
+        batch = min(REMAP_BATCH_SIZE, nr);
+        range = (unsigned long)batch << PAGE_SHIFT;
+
+        rmd.mmu_update = mmu_update;
+        err = apply_to_page_range(vma->vm_mm, addr, range,
+                      remap_area_mfn_list_pte_fn, &rmd);
+        if (err)
+        {
+ printk("xen_remap_domain_mfn_list: apply_to_range: err=%d\n", err);
+            goto out;
+        }
+
+        err = HYPERVISOR_mmu_update(mmu_update, batch, &done, domid);
+        if (err < 0)
+        {
+            int i;
+            /* TODO: We should remove this printk later */
+ printk("xen_remap_domain_mfn_list: mmu_update: err=%d, done=%d, batch=%d\n", err, done, batch);
+            err_ptr[done] = err;
+
+            /* now do the remaining part of this batch */
+            for(i = done+1; i < batch; i++)
+            {
+                int tmp_err = HYPERVISOR_mmu_update(&mmu_update[i], 1, NULL, domid);
+                if (tmp_err < 0)
+                {
+                    err_ptr[i] = tmp_err;
+                }
+            }
+
+            goto out;
+        }
+
+        nr -= batch;
+        addr += range;
+        err_ptr += batch;
+    }
+
+    err = 0;
+out:
+
+    xen_flush_tlb_all();
+
+    return err;
+}
+EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_list);
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index 8adb9cc..b39a7b7 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -151,6 +151,41 @@ static int traverse_pages(unsigned nelem, size_t size,
     return ret;
 }

+/*
+ * Similar to traverse_pages, but use each page as a "block" of
+ * data to be processed as one unit.
+ */
+static int traverse_pages_block(unsigned nelem, size_t size,
+                struct list_head *pos,
+                int (*fn)(void *data, int nr, void *state),
+                void *state)
+{
+    void *pagedata;
+    unsigned pageidx;
+    int ret = 0;
+
+    BUG_ON(size > PAGE_SIZE);
+
+    pageidx = PAGE_SIZE;
+    pagedata = NULL;    /* hush, gcc */
+
+    while (nelem) {
+        int nr = (PAGE_SIZE/size);
+        struct page *page;
+        if (nr > nelem)
+            nr = nelem;
+        pos = pos->next;
+        page = list_entry(pos, struct page, lru);
+        pagedata = page_address(page);
+        ret = (*fn)(pagedata, nr, state);
+        if (ret)
+            break;
+        nelem -= nr;
+    }
+
+    return ret;
+}
+
 struct mmap_mfn_state {
     unsigned long va;
     struct vm_area_struct *vma;
@@ -250,7 +285,7 @@ struct mmap_batch_state {
      *      0 for no errors
      *      1 if at least one error has happened (and no
      *          -ENOENT errors have happened)
-     *      -ENOENT if at least 1 -ENOENT has happened.
+     *      -ENOENT if at least one -ENOENT has happened.
      */
     int global_error;
     /* An array for individual errors */
@@ -260,17 +295,17 @@ struct mmap_batch_state {
     xen_pfn_t __user *user_mfn;
 };

-static int mmap_batch_fn(void *data, void *state)
+static int mmap_batch_fn(void *data, int nr, void *state)
 {
     xen_pfn_t *mfnp = data;
+
     struct mmap_batch_state *st = state;
     int ret;

-    ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1,
-                     st->vma->vm_page_prot, st->domain);
+    BUG_ON(nr < 0);

-    /* Store error code for second pass. */
-    *(st->err++) = ret;
+    ret = xen_remap_domain_mfn_list(st->vma, st->va & PAGE_MASK, mfnp, nr, st->err,
+                    st->vma->vm_page_prot, st->domain);

     /* And see if it affects the global_error. */
     if (ret < 0) {
@@ -282,7 +317,7 @@ static int mmap_batch_fn(void *data, void *state)
                 st->global_error = 1;
         }
     }
-    st->va += PAGE_SIZE;
+    st->va += PAGE_SIZE * nr;

     return 0;
 }
@@ -303,6 +338,7 @@ static int mmap_return_errors_v1(void *data, void *state)
     return __put_user(*mfnp, st->user_mfn++);
 }

+
 static struct vm_operations_struct privcmd_vm_ops;

 static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
@@ -319,6 +355,7 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
     if (!xen_initial_domain())
         return -EPERM;

+
     switch (version) {
     case 1:
         if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch)))
@@ -378,8 +415,8 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
     state.err           = err_array;

     /* mmap_batch_fn guarantees ret == 0 */
-    BUG_ON(traverse_pages(m.num, sizeof(xen_pfn_t),
-                 &pagelist, mmap_batch_fn, &state));
+    BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t),
+                    &pagelist, mmap_batch_fn, &state));

     up_write(&mm->mmap_sem);

diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 6a198e4..15ae4f7 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -29,4 +29,10 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
                    unsigned long mfn, int nr,
                    pgprot_t prot, unsigned domid);

+int xen_remap_domain_mfn_list(struct vm_area_struct *vma,
+                  unsigned long addr,
+                  unsigned long *mfn, int nr,
+                  int *err_ptr,
+                  pgprot_t prot, unsigned domid);
+
 #endif /* INCLUDE_XEN_OPS_H */

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel