Re: [Xen-devel] [PATCHv6 3/3] xen/privcmd: improve performance of MMAPBATCH_V2
On Fri, 6 Mar 2015, David Vrabel wrote:
> Make the IOCTL_PRIVCMD_MMAPBATCH_V2 (and older V1 version) map
> multiple frames at a time rather than one at a time, despite the pages
> being non-consecutive GFNs.
>
> xen_remap_foreign_mfn_array() is added which maps an array of GFNs
> (instead of a consecutive range of GFNs).
>
> Migrate times are significantly improved (when using a PV toolstack
> domain).  For example, for an idle 12 GiB PV guest:
>
>          Before     After
>   real   0m38.179s  0m26.868s
>   user   0m15.096s  0m13.652s
>   sys    0m28.988s  0m18.732s
>
> Signed-off-by: David Vrabel <david.vrabel@xxxxxxxxxx>

Nice!

>  arch/arm/xen/enlighten.c |   20 ++++--
>  arch/x86/xen/mmu.c       |  102 ++++++++++++++++++++++++++++++++--------
>  drivers/xen/privcmd.c    |  117 ++++++++++++++++++++++++++++++++-------------
>  drivers/xen/xlate_mmu.c  |   50 ++++++++++++--------
>  include/xen/xen-ops.h    |   47 +++++++++++++++++--
>  5 files changed, 253 insertions(+), 83 deletions(-)
>
> diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c
> index 5c04389..9929cb6 100644
> --- a/arch/arm/xen/enlighten.c
> +++ b/arch/arm/xen/enlighten.c
> @@ -53,15 +53,27 @@ EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
>
>  static __read_mostly int xen_events_irq = -1;
>
> -int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
> +int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
>  			       unsigned long addr,
> -			       xen_pfn_t mfn, int nr,
> -			       pgprot_t prot, unsigned domid,
> +			       xen_pfn_t *mfn, int nr,
> +			       int *err_ptr, pgprot_t prot,
> +			       unsigned domid,
>  			       struct page **pages)
>  {
> -	return xen_xlate_remap_gfn_range(vma, addr, mfn, nr,
> +	return xen_xlate_remap_gfn_array(vma, addr, mfn, nr, err_ptr,
>  					 prot, domid, pages);
>  }
> +EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array);
> +
> +/* Not used in ARM. Use xen_remap_domain_mfn_array(). */
> +int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
> +			       unsigned long addr,
> +			       xen_pfn_t mfn, int nr,
> +			       pgprot_t prot, unsigned domid,
> +			       struct page **pages)
> +{
> +	return -ENOSYS;
> +}
>  EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);

I understand that it is not used, but since you have already introduced
xen_remap_domain_mfn_array, you might as well implement
xen_remap_domain_mfn_range by calling xen_remap_domain_mfn_array or
xen_xlate_remap_gfn_array.
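
Something along these lines would do it -- just an untested sketch to
illustrate what I mean: expand the consecutive range into a temporary GFN
array (plus an error array, since the array variant wants an err_ptr) and
reuse the new helper.  Whether the per-call allocation is acceptable for
the callers is a separate question.

int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
			       unsigned long addr,
			       xen_pfn_t mfn, int nr,
			       pgprot_t prot, unsigned domid,
			       struct page **pages)
{
	xen_pfn_t *gfns;
	int *errs;
	int i, ret;

	gfns = kcalloc(nr, sizeof(*gfns), GFP_KERNEL);
	errs = kcalloc(nr, sizeof(*errs), GFP_KERNEL);
	if (!gfns || !errs) {
		ret = -ENOMEM;
		goto out;
	}

	/* The range is consecutive, so expand it into an array. */
	for (i = 0; i < nr; i++)
		gfns[i] = mfn + i;

	ret = xen_remap_domain_mfn_array(vma, addr, gfns, nr, errs,
					 prot, domid, pages);
out:
	kfree(gfns);
	kfree(errs);
	return ret;
}
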
>  int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
> index 3d536a5..78d352f 100644
> --- a/arch/x86/xen/mmu.c
> +++ b/arch/x86/xen/mmu.c
> @@ -2439,7 +2439,8 @@ void __init xen_hvm_init_mmu_ops(void)
>  #define REMAP_BATCH_SIZE 16
>
>  struct remap_data {
> -	unsigned long mfn;
> +	xen_pfn_t *mfn;
> +	bool contiguous;
>  	pgprot_t prot;
>  	struct mmu_update *mmu_update;
>  };
> @@ -2448,7 +2449,14 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
>  				 unsigned long addr, void *data)
>  {
>  	struct remap_data *rmd = data;
> -	pte_t pte = pte_mkspecial(mfn_pte(rmd->mfn++, rmd->prot));
> +	pte_t pte = pte_mkspecial(mfn_pte(*rmd->mfn, rmd->prot));
> +
> +	/* If we have a contigious range, just update the mfn itself,
> +	   else update pointer to be "next mfn". */
> +	if (rmd->contiguous)
> +		(*rmd->mfn)++;
> +	else
> +		rmd->mfn++;
>
>  	rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
>  	rmd->mmu_update->val = pte_val_ma(pte);
> @@ -2457,26 +2465,26 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
>  	return 0;
>  }
>
> -int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
> -			       unsigned long addr,
> -			       xen_pfn_t mfn, int nr,
> -			       pgprot_t prot, unsigned domid,
> -			       struct page **pages)
> -
> +static int do_remap_mfn(struct vm_area_struct *vma,
> +			unsigned long addr,
> +			xen_pfn_t *mfn, int nr,
> +			int *err_ptr, pgprot_t prot,
> +			unsigned domid,
> +			struct page **pages)
>  {
> +	int err = 0;
>  	struct remap_data rmd;
>  	struct mmu_update mmu_update[REMAP_BATCH_SIZE];
> -	int batch;
>  	unsigned long range;
> -	int err = 0;
> +	int mapped = 0;
>
>  	BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
>
>  	if (xen_feature(XENFEAT_auto_translated_physmap)) {
>  #ifdef CONFIG_XEN_PVH
>  		/* We need to update the local page tables and the xen HAP */
> -		return xen_xlate_remap_gfn_range(vma, addr, mfn, nr, prot,
> -						 domid, pages);
> +		return xen_xlate_remap_gfn_array(vma, addr, mfn, nr, err_ptr,
> +						 prot, domid, pages);
>  #else
>  		return -EINVAL;
>  #endif
> @@ -2484,9 +2492,15 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
>
>  	rmd.mfn = mfn;
>  	rmd.prot = prot;
> +	/* We use the err_ptr to indicate if there we are doing a contigious
> +	 * mapping or a discontigious mapping. */
> +	rmd.contiguous = !err_ptr;
>
>  	while (nr) {
> -		batch = min(REMAP_BATCH_SIZE, nr);
> +		int index = 0;
> +		int done = 0;
> +		int batch = min(REMAP_BATCH_SIZE, nr);
> +		int batch_left = batch;
>  		range = (unsigned long)batch << PAGE_SHIFT;
>
>  		rmd.mmu_update = mmu_update;
> @@ -2495,23 +2509,72 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
>  		if (err)
>  			goto out;
>
> -		err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid);
> -		if (err < 0)
> -			goto out;
> +		/* We record the error for each page that gives an error, but
> +		 * continue mapping until the whole set is done */
> +		do {
> +			int i;
> +
> +			err = HYPERVISOR_mmu_update(&mmu_update[index],
> +						    batch_left, &done, domid);
> +
> +			/*
> +			 * @err_ptr may be the same buffer as @mfn, so
> +			 * only clear it after each chunk of @mfn is
> +			 * used.
> +			 */
> +			if (err_ptr) {
> +				for (i = index; i < index + done; i++)
> +					err_ptr[i] = 0;
> +			}
> +			if (err < 0) {
> +				if (!err_ptr)
> +					goto out;
> +				err_ptr[i] = err;
> +				done++; /* Skip failed frame. */
> +			} else
> +				mapped += done;
> +			batch_left -= done;
> +			index += done;
> +		} while (batch_left);
>
>  		nr -= batch;
>  		addr += range;
> +		if (err_ptr)
> +			err_ptr += batch;
>  	}
> -
> -	err = 0;
>  out:
>
>  	xen_flush_tlb_all();
>
> -	return err;
> +	return err < 0 ? err : mapped;
> +}
> +
> +int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
> +			       unsigned long addr,
> +			       xen_pfn_t mfn, int nr,
> +			       pgprot_t prot, unsigned domid,
> +			       struct page **pages)
> +{
> +	return do_remap_mfn(vma, addr, &mfn, nr, NULL, prot, domid, pages);
>  }
>  EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
>
> +int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
> +			       unsigned long addr,
> +			       xen_pfn_t *mfn, int nr,
> +			       int *err_ptr, pgprot_t prot,
> +			       unsigned domid, struct page **pages)
> +{
> +	/* We BUG_ON because it's a programmer error to pass a NULL err_ptr,
> +	 * and the consequences later is quite hard to detect what the actual
> +	 * cause of "wrong memory was mapped in".
> +	 */
> +	BUG_ON(err_ptr == NULL);
> +	return do_remap_mfn(vma, addr, mfn, nr, err_ptr, prot, domid, pages);
> +}
> +EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array);
> +
> +
>  /* Returns: 0 success */
>  int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
>  			       int numpgs, struct page **pages)
> @@ -2526,3 +2589,4 @@ int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
>  #endif
>  }
>  EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range);
> +
> diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
> index 59ac71c..b667c99 100644
> --- a/drivers/xen/privcmd.c
> +++ b/drivers/xen/privcmd.c
> @@ -159,6 +159,40 @@ static int traverse_pages(unsigned nelem, size_t size,
>  	return ret;
>  }
>
> +/*
> + * Similar to traverse_pages, but use each page as a "block" of
> + * data to be processed as one unit.
> + */
> +static int traverse_pages_block(unsigned nelem, size_t size,
> +				struct list_head *pos,
> +				int (*fn)(void *data, int nr, void *state),
> +				void *state)
> +{
> +	void *pagedata;
> +	unsigned pageidx;
> +	int ret = 0;
> +
> +	BUG_ON(size > PAGE_SIZE);

It looks like PAGE_SIZE needs to be a multiple of size.  Maybe we can
add a BUG_ON for that too.
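
E.g. (just a sketch of the extra check I mean):

	BUG_ON(size > PAGE_SIZE);
	BUG_ON(PAGE_SIZE % size != 0);
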
> +	pageidx = PAGE_SIZE;
> +
> +	while (nelem) {
> +		int nr = (PAGE_SIZE/size);
> +		struct page *page;
> +		if (nr > nelem)
> +			nr = nelem;
> +		pos = pos->next;
> +		page = list_entry(pos, struct page, lru);
> +		pagedata = page_address(page);
> +		ret = (*fn)(pagedata, nr, state);
> +		if (ret)
> +			break;
> +		nelem -= nr;
> +	}
> +
> +	return ret;
> +}
> +
>  struct mmap_mfn_state {
>  	unsigned long va;
>  	struct vm_area_struct *vma;
> @@ -274,39 +308,25 @@ struct mmap_batch_state {
>  /* auto translated dom0 note: if domU being created is PV, then mfn is
>   * mfn(addr on bus). If it's auto xlated, then mfn is pfn (input to HAP).
>   */
> -static int mmap_batch_fn(void *data, void *state)
> +static int mmap_batch_fn(void *data, int nr, void *state)
>  {
>  	xen_pfn_t *mfnp = data;
>  	struct mmap_batch_state *st = state;
>  	struct vm_area_struct *vma = st->vma;
>  	struct page **pages = vma->vm_private_data;
> -	struct page *cur_page = NULL;
> +	struct page **cur_pages = NULL;
>  	int ret;
>
>  	if (xen_feature(XENFEAT_auto_translated_physmap))
> -		cur_page = pages[st->index++];
> +		cur_pages = &pages[st->index];
>
> -	ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1,
> -					 st->vma->vm_page_prot, st->domain,
> -					 &cur_page);
> +	BUG_ON(nr < 0);
> +	ret = xen_remap_domain_mfn_array(st->vma, st->va & PAGE_MASK, mfnp, nr,
> +					 (int *)mfnp, st->vma->vm_page_prot,
> +					 st->domain, cur_pages);
>
> -	/* Store error code for second pass. */
> -	if (st->version == 1) {
> -		if (ret < 0) {
> -			/*
> -			 * V1 encodes the error codes in the 32bit top nibble of the
> -			 * mfn (with its known limitations vis-a-vis 64 bit callers).
> -			 */
> -			*mfnp |= (ret == -ENOENT) ?
> -				PRIVCMD_MMAPBATCH_PAGED_ERROR :
> -				PRIVCMD_MMAPBATCH_MFN_ERROR;
> -		}
> -	} else { /* st->version == 2 */
> -		*((int *) mfnp) = ret;
> -	}
> -
> -	/* And see if it affects the global_error. */
> -	if (ret < 0) {
> +	/* Adjust the global_error? */
> +	if (ret != nr) {
>  		if (ret == -ENOENT)
>  			st->global_error = -ENOENT;
>  		else {
> @@ -315,23 +335,35 @@ static int mmap_batch_fn(void *data, void *state)
>  			st->global_error = 1;
>  		}
>  	}
> -	st->va += PAGE_SIZE;
> +	st->va += PAGE_SIZE * nr;
> +	st->index += nr;
>
>  	return 0;
>  }
>
> -static int mmap_return_errors(void *data, void *state)
> +static int mmap_return_error(int err, struct mmap_batch_state *st)
>  {
> -	struct mmap_batch_state *st = state;
> +	int ret;
>
>  	if (st->version == 1) {
> -		xen_pfn_t mfnp = *((xen_pfn_t *) data);
> -		if (mfnp & PRIVCMD_MMAPBATCH_MFN_ERROR)
> -			return __put_user(mfnp, st->user_mfn++);
> -		else
> +		if (err) {
> +			xen_pfn_t mfn;
> +
> +			ret = __get_user(mfn, st->user_mfn);
> +			if (ret < 0)
> +				return ret;
> +			/*
> +			 * V1 encodes the error codes in the 32bit top
> +			 * nibble of the mfn (with its known
> +			 * limitations vis-a-vis 64 bit callers).
> +			 */
> +			mfn |= (ret == -ENOENT) ?
> +				PRIVCMD_MMAPBATCH_PAGED_ERROR :
> +				PRIVCMD_MMAPBATCH_MFN_ERROR;
> +			return __put_user(mfn, st->user_mfn++);
> +		} else
>  			st->user_mfn++;

Instead of having to resort to __get_user, couldn't you examine err and
base the choice between PRIVCMD_MMAPBATCH_MFN_ERROR and
PRIVCMD_MMAPBATCH_PAGED_ERROR on it, similarly to what we did before?

Also I think you should spend a few more words in the commit message
regarding the changes you are making to the error reporting path.
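
Roughly what I have in mind (untested sketch; the __get_user may well
still be needed to recover the original mfn, given that the in-kernel
copy now holds the error codes, but the PAGED/MFN choice would be based
on err rather than on the __get_user return value):

	if (st->version == 1) {
		if (err) {
			xen_pfn_t mfn;

			if (__get_user(mfn, st->user_mfn))
				return -EFAULT;
			/* Base the V1 top-nibble encoding on err itself. */
			mfn |= (err == -ENOENT) ?
				PRIVCMD_MMAPBATCH_PAGED_ERROR :
				PRIVCMD_MMAPBATCH_MFN_ERROR;
			return __put_user(mfn, st->user_mfn++);
		}
		st->user_mfn++;
	}
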
>  	} else { /* st->version == 2 */
> -		int err = *((int *) data);
>  		if (err)
>  			return __put_user(err, st->user_err++);
>  		else
> @@ -341,6 +373,21 @@ static int mmap_return_errors(void *data, void *state)
>  	return 0;
>  }
>
> +static int mmap_return_errors(void *data, int nr, void *state)
> +{
> +	struct mmap_batch_state *st = state;
> +	int *errs = data;
> +	int i;
> +	int ret;
> +
> +	for (i = 0; i < nr; i++) {
> +		ret = mmap_return_error(errs[i], st);
> +		if (ret < 0)
> +			return ret;
> +	}
> +	return 0;
> +}
>
>  /* Allocate pfns that are then mapped with gmfns from foreign domid. Update
>   * the vma with the page info to use later.
>   * Returns: 0 if success, otherwise -errno
> @@ -472,8 +519,8 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
>  	state.version = version;
>
>  	/* mmap_batch_fn guarantees ret == 0 */
> -	BUG_ON(traverse_pages(m.num, sizeof(xen_pfn_t),
> -			      &pagelist, mmap_batch_fn, &state));
> +	BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t),
> +				    &pagelist, mmap_batch_fn, &state));
>
>  	up_write(&mm->mmap_sem);
>
> @@ -481,8 +528,8 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
>  		/* Write back errors in second pass. */
>  		state.user_mfn = (xen_pfn_t *)m.arr;
>  		state.user_err = m.err;
> -		ret = traverse_pages(m.num, sizeof(xen_pfn_t),
> -				     &pagelist, mmap_return_errors, &state);
> +		ret = traverse_pages_block(m.num, sizeof(xen_pfn_t),
> +					   &pagelist, mmap_return_errors, &state);
>  	} else
>  		ret = 0;
>
> diff --git a/drivers/xen/xlate_mmu.c b/drivers/xen/xlate_mmu.c
> index 7724d90..581db0f 100644
> --- a/drivers/xen/xlate_mmu.c
> +++ b/drivers/xen/xlate_mmu.c
> @@ -62,13 +62,15 @@ static int map_foreign_page(unsigned long lpfn, unsigned long fgmfn,
>  }
>
>  struct remap_data {
> -	xen_pfn_t fgmfn; /* foreign domain's gmfn */
> +	xen_pfn_t *fgmfn; /* foreign domain's gmfn */
>  	pgprot_t prot;
>  	domid_t domid;
>  	struct vm_area_struct *vma;
>  	int index;
>  	struct page **pages;
>  	struct xen_remap_mfn_info *info;
> +	int *err_ptr;
> +	int mapped;
>  };
>
>  static int remap_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
> @@ -80,38 +82,46 @@ static int remap_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
>  	pte_t pte = pte_mkspecial(pfn_pte(pfn, info->prot));
>  	int rc;
>
> -	rc = map_foreign_page(pfn, info->fgmfn, info->domid);
> -	if (rc < 0)
> -		return rc;
> -	set_pte_at(info->vma->vm_mm, addr, ptep, pte);
> +	rc = map_foreign_page(pfn, *info->fgmfn, info->domid);
> +	*info->err_ptr++ = rc;
> +	if (!rc) {
> +		set_pte_at(info->vma->vm_mm, addr, ptep, pte);
> +		info->mapped++;
> +	}
> +	info->fgmfn++;
>
>  	return 0;
>  }
>
> -int xen_xlate_remap_gfn_range(struct vm_area_struct *vma,
> +int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
>  			      unsigned long addr,
> -			      xen_pfn_t gfn, int nr,
> -			      pgprot_t prot, unsigned domid,
> +			      xen_pfn_t *mfn, int nr,
> +			      int *err_ptr, pgprot_t prot,
> +			      unsigned domid,
>  			      struct page **pages)
>  {
>  	int err;
>  	struct remap_data data;
> -
> -	/* TBD: Batching, current sole caller only does page at a time */
> -	if (nr > 1)
> -		return -EINVAL;
> -
> -	data.fgmfn = gfn;
> -	data.prot = prot;
> +	unsigned long range = nr << PAGE_SHIFT;
> +
> +	/* Kept here for the purpose of making sure code doesn't break
> +	   x86 PVOPS */
> +	BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
> +
> +	data.fgmfn = mfn;
> +	data.prot = prot;
>  	data.domid = domid;
> -	data.vma = vma;
> -	data.index = 0;
> +	data.vma = vma;
>  	data.pages = pages;
> -	err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT,
> +	data.index = 0;
> +	data.err_ptr = err_ptr;
> +	data.mapped = 0;
> +
> +	err = apply_to_page_range(vma->vm_mm, addr, range,
>  				  remap_pte_fn, &data);
> -	return err;
> +	return err < 0 ? err : data.mapped;
>  }
> -EXPORT_SYMBOL_GPL(xen_xlate_remap_gfn_range);
> +EXPORT_SYMBOL_GPL(xen_xlate_remap_gfn_array);
>
>  int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma,
>  			      int nr, struct page **pages)
> diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
> index ba7d232..225a2ff 100644
> --- a/include/xen/xen-ops.h
> +++ b/include/xen/xen-ops.h
> @@ -27,19 +27,56 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
>  void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order);
>
>  struct vm_area_struct;
> +
> +/*
> + * xen_remap_domain_mfn_array() - map an array of foreign frames
> + * @vma: VMA to map the pages into
> + * @addr: Address at which to map the pages
> + * @gfn: Array of GFNs to map
> + * @nr: Number entries in the GFN array
> + * @err_ptr: Returns per-GFN error status.
> + * @prot: page protection mask
> + * @domid: Domain owning the pages
> + * @pages: Array of pages if this domain has an auto-translated physmap
> + *
> + * @gfn and @err_ptr may point to the same buffer, the GFNs will be
> + * overwritten by the error codes after they are mapped.
> + *
> + * Returns the number of successfully mapped frames, or a -ve error
> + * code.
> + */
> +int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
> +			       unsigned long addr,
> +			       xen_pfn_t *gfn, int nr,
> +			       int *err_ptr, pgprot_t prot,
> +			       unsigned domid,
> +			       struct page **pages);
> +
> +/* xen_remap_domain_mfn_range() - map a range of foreign frames
> + * @vma: VMA to map the pages into
> + * @addr: Address at which to map the pages
> + * @gfn: First GFN to map.
> + * @nr: Number frames to map
> + * @prot: page protection mask
> + * @domid: Domain owning the pages
> + * @pages: Array of pages if this domain has an auto-translated physmap
> + *
> + * Returns the number of successfully mapped frames, or a -ve error
> + * code.
> + */
>  int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
>  			       unsigned long addr,
> -			       xen_pfn_t mfn, int nr,
> +			       xen_pfn_t gfn, int nr,
>  			       pgprot_t prot, unsigned domid,
>  			       struct page **pages);
>  int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
>  			       int numpgs, struct page **pages);
>  #ifdef CONFIG_XEN_AUTO_XLATE
> -int xen_xlate_remap_gfn_range(struct vm_area_struct *vma,
> +int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
>  			      unsigned long addr,
> -			      xen_pfn_t gfn, int nr,
> -			      pgprot_t prot,
> -			      unsigned domid,
> +			      xen_pfn_t *gfn, int nr,
> +			      int *err_ptr, pgprot_t prot,
> +			      unsigned domid,
>  			      struct page **pages);
>  int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma,
>  			      int nr, struct page **pages);

Since we are introducing a new internal interface, shouldn't we try to
aim for something more generic, maybe like a scatter-gather list?  We had
one consecutive range and now we also have a list of individual frames.
It makes sense to jump straight to a list of ranges, right?
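
Just to illustrate the shape of interface I have in mind (hypothetical,
names made up):

	struct xen_remap_gfn_range {
		xen_pfn_t first_gfn;	/* first foreign frame of this range */
		int nr;			/* number of consecutive frames */
	};

	int xen_remap_domain_gfn_sg(struct vm_area_struct *vma,
				    unsigned long addr,
				    const struct xen_remap_gfn_range *ranges,
				    int nr_ranges,
				    int *err_ptr, pgprot_t prot,
				    unsigned domid,
				    struct page **pages);

The current xen_remap_domain_mfn_range() callers would pass a single
range, and the privcmd array case would pass nr_ranges entries with
nr == 1 each, so both existing uses fall out of it naturally.
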
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel