[Xen-changelog] [xen-unstable] hvm: make dirty logging stop requiring physical pages of order > 0
# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1195238004 0
# Node ID 68c911f7733a0158056d10e9e7997a6acfe47eb1
# Parent  2052364cb456170a70ad5c8bfb876c95f7a9fe4a
hvm: make dirty logging stop requiring physical pages of order > 0

This patch re-implements the (x86) hypervisor dirty page log with a
simple four-level radix tree whose nodes are all single pages, thus
making migration require only order-0 pages (where before it required
at least an order-5 page).  Unlike the p2m radix tree implementation,
the interior nodes of this tree are NOT page table nodes.  I chose a
lazy-allocation and -mapping approach because most pages are not
marked dirty while dirty-logging is enabled.  There are doubtless
situations (the 'stream' benchmark, for example) where a more complex
p2m-like approach is faster, but I'm not sure they're worth the
effort.

Signed-off-by: Dave Lively <dlively@xxxxxxxxxxxxxxx>
---
 xen/arch/x86/mm/paging.c         |  251 +++++++++++++++++++++++++++------------
 xen/arch/x86/mm/shadow/private.h |   43 +++++-
 xen/include/asm-x86/domain.h     |    7 -
 xen/include/asm-x86/paging.h     |   22 +++
 4 files changed, 241 insertions(+), 82 deletions(-)

diff -r 2052364cb456 -r 68c911f7733a xen/arch/x86/mm/paging.c
--- a/xen/arch/x86/mm/paging.c  Fri Nov 16 17:59:34 2007 +0000
+++ b/xen/arch/x86/mm/paging.c  Fri Nov 16 18:33:24 2007 +0000
@@ -96,36 +96,97 @@
     spin_unlock(&(_d)->arch.paging.log_dirty.lock);  \
 } while (0)
 
+static mfn_t paging_new_log_dirty_page(struct domain *d, void **mapping_p)
+{
+    mfn_t mfn;
+    struct page_info *page = alloc_domheap_page(NULL);
+
+    if ( unlikely(page == NULL) ) {
+        d->arch.paging.log_dirty.failed_allocs++;
+        return _mfn(INVALID_MFN);
+    }
+    d->arch.paging.log_dirty.allocs++;
+    mfn = page_to_mfn(page);
+    *mapping_p = map_domain_page(mfn_x(mfn));
+    return mfn;
+}
+
+
+static mfn_t paging_new_log_dirty_leaf(struct domain *d, uint8_t **leaf_p)
+{
+    mfn_t mfn = paging_new_log_dirty_page(d, (void **)leaf_p);
+    clear_page(*leaf_p);
+    return mfn;
+}
+
+
+static mfn_t paging_new_log_dirty_node(struct domain *d, mfn_t **node_p)
+{
+    int i;
+    mfn_t mfn = paging_new_log_dirty_page(d, (void **)node_p);
+    for (i = 0; i < LOGDIRTY_NODE_ENTRIES; i++)
+        (*node_p)[i] = _mfn(INVALID_MFN);
+    return mfn;
+}
+
+
 /* allocate bitmap resources for log dirty */
 int paging_alloc_log_dirty_bitmap(struct domain *d)
 {
-    if ( d->arch.paging.log_dirty.bitmap != NULL )
+    mfn_t *mapping;
+
+    if ( mfn_valid(d->arch.paging.log_dirty.top) )
         return 0;
 
-    d->arch.paging.log_dirty.bitmap_size =
-        (domain_get_maximum_gpfn(d) + BITS_PER_LONG) & ~(BITS_PER_LONG - 1);
-    d->arch.paging.log_dirty.bitmap =
-        xmalloc_array(unsigned long,
-                      d->arch.paging.log_dirty.bitmap_size / BITS_PER_LONG);
-    if ( d->arch.paging.log_dirty.bitmap == NULL )
-    {
-        d->arch.paging.log_dirty.bitmap_size = 0;
+    d->arch.paging.log_dirty.top = paging_new_log_dirty_node(d, &mapping);
+    if ( unlikely(!mfn_valid(d->arch.paging.log_dirty.top)) ) {
+        /* Clear error indicator since we're reporting this one */
+        d->arch.paging.log_dirty.failed_allocs = 0;
         return -ENOMEM;
     }
-    memset(d->arch.paging.log_dirty.bitmap, 0,
-           d->arch.paging.log_dirty.bitmap_size/8);
+    unmap_domain_page(mapping);
 
     return 0;
 }
+
+
+static void paging_free_log_dirty_page(struct domain *d, mfn_t mfn)
+{
+    d->arch.paging.log_dirty.allocs--;
+    free_domheap_page(mfn_to_page(mfn));
+}
 
 /* free bitmap resources */
 void paging_free_log_dirty_bitmap(struct domain *d)
 {
-    d->arch.paging.log_dirty.bitmap_size = 0;
-    if ( d->arch.paging.log_dirty.bitmap )
-    {
-        xfree(d->arch.paging.log_dirty.bitmap);
-        d->arch.paging.log_dirty.bitmap = NULL;
+    int i4, i3, i2;
+
+    if (mfn_valid(d->arch.paging.log_dirty.top)) {
+        mfn_t *l4 = map_domain_page(mfn_x(d->arch.paging.log_dirty.top));
+        printk("%s: used %d pages for domain %d dirty logging\n",
+               __FUNCTION__, d->arch.paging.log_dirty.allocs, d->domain_id);
+        for (i4 = 0; i4 < LOGDIRTY_NODE_ENTRIES; i4++) {
+            if (mfn_valid(l4[i4])) {
+                mfn_t *l3 = map_domain_page(mfn_x(l4[i4]));
+                for (i3 = 0; i3 < LOGDIRTY_NODE_ENTRIES; i3++) {
+                    if (mfn_valid(l3[i3])) {
+                        mfn_t *l2 = map_domain_page(mfn_x(l3[i3]));
+                        for (i2 = 0; i2 < LOGDIRTY_NODE_ENTRIES; i2++)
+                            if (mfn_valid(l2[i2]))
+                                paging_free_log_dirty_page(d, l2[i2]);
+                        unmap_domain_page(l2);
+                        paging_free_log_dirty_page(d, l3[i3]);
+                    }
+                }
+                unmap_domain_page(l3);
+                paging_free_log_dirty_page(d, l4[i4]);
+            }
+        }
+        unmap_domain_page(l4);
+        paging_free_log_dirty_page(d, d->arch.paging.log_dirty.top);
+        d->arch.paging.log_dirty.top = _mfn(INVALID_MFN);
+        ASSERT(d->arch.paging.log_dirty.allocs == 0);
+        d->arch.paging.log_dirty.failed_allocs = 0;
     }
 }
 
@@ -187,15 +248,19 @@ void paging_mark_dirty(struct domain *d,
 {
     unsigned long pfn;
     mfn_t gmfn;
+    int changed;
+    mfn_t mfn, *l4, *l3, *l2;
+    uint8_t *l1;
+    int i1, i2, i3, i4;
 
     gmfn = _mfn(guest_mfn);
+
+    ASSERT(mfn_valid(d->arch.paging.log_dirty.top));
 
     if ( !paging_mode_log_dirty(d) || !mfn_valid(gmfn) )
         return;
 
     log_dirty_lock(d);
-
-    ASSERT(d->arch.paging.log_dirty.bitmap != NULL);
 
     /* We /really/ mean PFN here, even for non-translated guests. */
     pfn = get_gpfn_from_mfn(mfn_x(gmfn));
@@ -206,37 +271,52 @@ void paging_mark_dirty(struct domain *d,
      * Nothing to do here...
      */
     if ( unlikely(!VALID_M2P(pfn)) )
-    {
-        log_dirty_unlock(d);
-        return;
-    }
-
-    if ( likely(pfn < d->arch.paging.log_dirty.bitmap_size) )
-    {
-        if ( !__test_and_set_bit(pfn, d->arch.paging.log_dirty.bitmap) )
-        {
-            PAGING_DEBUG(LOGDIRTY,
-                         "marked mfn %" PRI_mfn " (pfn=%lx), dom %d\n",
-                         mfn_x(gmfn), pfn, d->domain_id);
-            d->arch.paging.log_dirty.dirty_count++;
-        }
-    }
-    else
-    {
-        PAGING_PRINTK("mark_dirty OOR! "
-                      "mfn=%" PRI_mfn " pfn=%lx max=%x (dom %d)\n"
-                      "owner=%d c=%08x t=%" PRtype_info "\n",
-                      mfn_x(gmfn),
-                      pfn,
-                      d->arch.paging.log_dirty.bitmap_size,
-                      d->domain_id,
-                      (page_get_owner(mfn_to_page(gmfn))
-                       ? page_get_owner(mfn_to_page(gmfn))->domain_id
-                       : -1),
-                      mfn_to_page(gmfn)->count_info,
-                      mfn_to_page(gmfn)->u.inuse.type_info);
-    }
-
+        goto out;
+
+    i1 = L1_LOGDIRTY_IDX(pfn);
+    i2 = L2_LOGDIRTY_IDX(pfn);
+    i3 = L3_LOGDIRTY_IDX(pfn);
+    i4 = L4_LOGDIRTY_IDX(pfn);
+
+    l4 = map_domain_page(mfn_x(d->arch.paging.log_dirty.top));
+    mfn = l4[i4];
+    if ( !mfn_valid(mfn) )
+        mfn = l4[i4] = paging_new_log_dirty_node(d, &l3);
+    else
+        l3 = map_domain_page(mfn_x(mfn));
+    unmap_domain_page(l4);
+    if ( unlikely(!mfn_valid(mfn)) )
+        goto out;
+
+    mfn = l3[i3];
+    if ( !mfn_valid(mfn) )
+        mfn = l3[i3] = paging_new_log_dirty_node(d, &l2);
+    else
+        l2 = map_domain_page(mfn_x(mfn));
+    unmap_domain_page(l3);
+    if ( unlikely(!mfn_valid(mfn)) )
+        goto out;
+
+    mfn = l2[i2];
+    if ( !mfn_valid(mfn) )
+        mfn = l2[i2] = paging_new_log_dirty_leaf(d, &l1);
+    else
+        l1 = map_domain_page(mfn_x(mfn));
+    unmap_domain_page(l2);
+    if ( unlikely(!mfn_valid(mfn)) )
+        goto out;
+
+    changed = !__test_and_set_bit(i1, l1);
+    unmap_domain_page(l1);
+    if ( changed )
+    {
+        PAGING_DEBUG(LOGDIRTY,
+                     "marked mfn %" PRI_mfn " (pfn=%lx), dom %d\n",
+                     mfn_x(gmfn), pfn, d->domain_id);
+        d->arch.paging.log_dirty.dirty_count++;
+    }
+
+ out:
     log_dirty_unlock(d);
 }
 
@@ -244,7 +324,11 @@ void paging_mark_dirty(struct domain *d,
  * clear the bitmap and stats as well. */
 int paging_log_dirty_op(struct domain *d, struct xen_domctl_shadow_op *sc)
 {
-    int i, rv = 0, clean = 0, peek = 1;
+    int rv = 0, clean = 0, peek = 1;
+    unsigned long pages = 0;
+    mfn_t *l4, *l3, *l2;
+    uint8_t *l1;
+    int i4, i3, i2;
 
     domain_pause(d);
     log_dirty_lock(d);
@@ -270,37 +354,55 @@ int paging_log_dirty_op(struct domain *d
         /* caller may have wanted just to clean the state or access stats. */
         peek = 0;
 
-    if ( (peek || clean) && (d->arch.paging.log_dirty.bitmap == NULL) )
+    if ( (peek || clean) && !mfn_valid(d->arch.paging.log_dirty.top) )
     {
         rv = -EINVAL; /* perhaps should be ENOMEM? */
         goto out;
     }
 
-    if ( sc->pages > d->arch.paging.log_dirty.bitmap_size )
-        sc->pages = d->arch.paging.log_dirty.bitmap_size;
-
-#define CHUNK (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */
-    for ( i = 0; i < sc->pages; i += CHUNK )
-    {
-        int bytes = ((((sc->pages - i) > CHUNK)
-                      ? CHUNK
-                      : (sc->pages - i)) + 7) / 8;
-
-        if ( likely(peek) )
-        {
-            if ( copy_to_guest_offset(
-                sc->dirty_bitmap, i/8,
-                (uint8_t *)d->arch.paging.log_dirty.bitmap + (i/8), bytes) )
-            {
-                rv = -EFAULT;
-                goto out;
+    if ( unlikely(d->arch.paging.log_dirty.failed_allocs) ) {
+        printk("%s: %d failed page allocs while logging dirty pages\n",
+               __FUNCTION__, d->arch.paging.log_dirty.failed_allocs);
+        rv = -ENOMEM;
+        goto out;
+    }
+
+    pages = 0;
+    l4 = map_domain_page(mfn_x(d->arch.paging.log_dirty.top));
+    for ( i4 = 0; pages < sc->pages && i4 < LOGDIRTY_NODE_ENTRIES; i4++ ) {
+        l3 = mfn_valid(l4[i4]) ? map_domain_page(mfn_x(l4[i4])) : NULL;
+        for ( i3 = 0; pages < sc->pages && i3 < LOGDIRTY_NODE_ENTRIES; i3++ ) {
+            l2 = l3 && mfn_valid(l3[i3]) ? map_domain_page(mfn_x(l3[i3])) : NULL;
+            for ( i2 = 0; pages < sc->pages && i2 < LOGDIRTY_NODE_ENTRIES; i2++ ) {
+                static uint8_t zeroes[PAGE_SIZE];
+                unsigned int bytes = PAGE_SIZE;
+                l1 = l2 && mfn_valid(l2[i2]) ?
+                    map_domain_page(mfn_x(l2[i2])) : zeroes;
+                if ( unlikely(((sc->pages - pages + 7) >> 3) < bytes) )
+                    bytes = (unsigned int)((sc->pages - pages + 7) >> 3);
+                if ( likely(peek) ) {
+                    if ( copy_to_guest_offset(sc->dirty_bitmap, pages >> 3, l1, bytes) != 0 ) {
+                        rv = -EFAULT;
+                        goto out;
+                    }
+                }
+
+                if ( clean && l1 != zeroes )
+                    clear_page(l1);
+
+                pages += bytes << 3;
+                if (l1 != zeroes)
+                    unmap_domain_page(l1);
             }
+            if (l2)
+                unmap_domain_page(l2);
         }
-
-        if ( clean )
-            memset((uint8_t *)d->arch.paging.log_dirty.bitmap + (i/8), 0, bytes);
-    }
-#undef CHUNK
+        if (l3)
+            unmap_domain_page(l3);
+    }
+    unmap_domain_page(l4);
+
+    if (pages < sc->pages)
+        sc->pages = pages;
 
     log_dirty_unlock(d);
 
@@ -338,6 +440,7 @@ void paging_log_dirty_init(struct domain
     d->arch.paging.log_dirty.enable_log_dirty = enable_log_dirty;
     d->arch.paging.log_dirty.disable_log_dirty = disable_log_dirty;
    d->arch.paging.log_dirty.clean_dirty_bitmap = clean_dirty_bitmap;
+    d->arch.paging.log_dirty.top = _mfn(INVALID_MFN);
 }
 
 /* This function fress log dirty bitmap resources. */
diff -r 2052364cb456 -r 68c911f7733a xen/arch/x86/mm/shadow/private.h
--- a/xen/arch/x86/mm/shadow/private.h  Fri Nov 16 17:59:34 2007 +0000
+++ b/xen/arch/x86/mm/shadow/private.h  Fri Nov 16 18:33:24 2007 +0000
@@ -491,17 +491,50 @@ sh_mfn_is_dirty(struct domain *d, mfn_t
 /* Is this guest page dirty?  Call only in log-dirty mode. */
 {
     unsigned long pfn;
+    mfn_t mfn, *l4, *l3, *l2;
+    uint8_t *l1;
+    int rv;
+
     ASSERT(shadow_mode_log_dirty(d));
-    ASSERT(d->arch.paging.log_dirty.bitmap != NULL);
+    ASSERT(mfn_valid(d->arch.paging.log_dirty.top));
 
     /* We /really/ mean PFN here, even for non-translated guests. */
     pfn = get_gpfn_from_mfn(mfn_x(gmfn));
-    if ( likely(VALID_M2P(pfn))
-         && likely(pfn < d->arch.paging.log_dirty.bitmap_size)
-         && test_bit(pfn, d->arch.paging.log_dirty.bitmap) )
+    if ( unlikely(!VALID_M2P(pfn)) )
+        return 0;
+
+    if (d->arch.paging.log_dirty.failed_allocs > 0)
+        /* If we have any failed allocations our dirty log is bogus.
+         * Since we can't signal an error here, be conservative and
+         * report "dirty" in this case.  (The only current caller,
+         * _sh_propagate, leaves known-dirty pages writable, preventing
+         * subsequent dirty-logging faults from them.)
+         */
         return 1;
 
-    return 0;
+    l4 = map_domain_page(mfn_x(d->arch.paging.log_dirty.top));
+    mfn = l4[L4_LOGDIRTY_IDX(pfn)];
+    unmap_domain_page(l4);
+    if (!mfn_valid(mfn))
+        return 0;
+
+    l3 = map_domain_page(mfn_x(mfn));
+    mfn = l3[L3_LOGDIRTY_IDX(pfn)];
+    unmap_domain_page(l3);
+    if (!mfn_valid(mfn))
+        return 0;
+
+    l2 = map_domain_page(mfn_x(mfn));
+    mfn = l2[L2_LOGDIRTY_IDX(pfn)];
+    unmap_domain_page(l2);
+    if (!mfn_valid(mfn))
+        return 0;
+
+    l1 = map_domain_page(mfn_x(mfn));
+    rv = test_bit(L1_LOGDIRTY_IDX(pfn), l1);
+    unmap_domain_page(l1);
+
+    return rv;
 }
diff -r 2052364cb456 -r 68c911f7733a xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h      Fri Nov 16 17:59:34 2007 +0000
+++ b/xen/include/asm-x86/domain.h      Fri Nov 16 18:33:24 2007 +0000
@@ -158,9 +158,10 @@ struct log_dirty_domain {
     int            locker; /* processor that holds the lock */
     const char    *locker_function; /* func that took it */
 
-    /* log-dirty bitmap to record dirty pages */
-    unsigned long *bitmap;
-    unsigned int   bitmap_size;  /* in pages, bit per page */
+    /* log-dirty radix tree to record dirty pages */
+    mfn_t          top;
+    unsigned int   allocs;
+    unsigned int   failed_allocs;
 
     /* log-dirty mode stats */
     unsigned int   fault_count;
diff -r 2052364cb456 -r 68c911f7733a xen/include/asm-x86/paging.h
--- a/xen/include/asm-x86/paging.h      Fri Nov 16 17:59:34 2007 +0000
+++ b/xen/include/asm-x86/paging.h      Fri Nov 16 18:33:24 2007 +0000
@@ -152,6 +152,28 @@ void paging_log_dirty_init(struct domain
 
 /* mark a page as dirty */
 void paging_mark_dirty(struct domain *d, unsigned long guest_mfn);
+/*
+ * Log-dirty radix tree indexing:
+ *   All tree nodes are PAGE_SIZE bytes, mapped on-demand.
+ *   Leaf nodes are simple bitmaps; 1 bit per guest pfn.
+ *   Interior nodes are arrays of LOGDIRTY_NODE_ENTRIES mfns.
+ * TODO: Dynamic radix tree height. Most guests will only need 2 levels.
+ *       The fourth level is basically unusable on 32-bit Xen.
+ * TODO2: Abstract out the radix-tree mechanics?
+ */
+#define LOGDIRTY_NODE_ENTRIES (1 << PAGETABLE_ORDER)
+#define L1_LOGDIRTY_IDX(pfn) ((pfn) & ((1 << (PAGE_SHIFT+3)) - 1))
+#define L2_LOGDIRTY_IDX(pfn) (((pfn) >> (PAGE_SHIFT+3)) & \
+                              (LOGDIRTY_NODE_ENTRIES-1))
+#define L3_LOGDIRTY_IDX(pfn) (((pfn) >> (PAGE_SHIFT+3+PAGETABLE_ORDER)) & \
+                              (LOGDIRTY_NODE_ENTRIES-1))
+#if BITS_PER_LONG == 64
+#define L4_LOGDIRTY_IDX(pfn) (((pfn) >> (PAGE_SHIFT+3+PAGETABLE_ORDER*2)) & \
+                              (LOGDIRTY_NODE_ENTRIES-1))
+#else
+#define L4_LOGDIRTY_IDX(pfn) 0
+#endif
+
 /*****************************************************************************
  * Entry points into the paging-assistance code */

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
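For readers who want to poke at the data structure outside the hypervisor, below is a minimal userspace sketch of the four-level log-dirty radix tree described above. It is not Xen code: the helper names (new_page, mark_dirty, NODE_ENTRIES, L1_IDX..L4_IDX) are made up for illustration, calloc() stands in for alloc_domheap_page()/map_domain_page(), ordinary pointers replace mfn_t entries, and the constants assume 4 KiB pages with 512-entry interior nodes (PAGETABLE_ORDER == 9), as on 64-bit x86. The index derivation mirrors the L?_LOGDIRTY_IDX macros added to xen/include/asm-x86/paging.h.

/*
 * Minimal userspace sketch of the four-level log-dirty radix tree
 * (NOT Xen code).  calloc() stands in for alloc_domheap_page() +
 * map_domain_page(); constants assume 4 KiB pages and 512-entry
 * interior nodes (PAGETABLE_ORDER == 9), as on 64-bit x86.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define PAGETABLE_ORDER 9
#define NODE_ENTRIES    (1UL << PAGETABLE_ORDER)

/* Same index derivation as the L?_LOGDIRTY_IDX macros in the patch:
 * a leaf page holds PAGE_SIZE*8 bits of bitmap; each interior level
 * consumes another PAGETABLE_ORDER bits of the pfn. */
#define L1_IDX(pfn) ((pfn) & ((1UL << (PAGE_SHIFT + 3)) - 1))
#define L2_IDX(pfn) (((pfn) >> (PAGE_SHIFT + 3)) & (NODE_ENTRIES - 1))
#define L3_IDX(pfn) (((pfn) >> (PAGE_SHIFT + 3 + PAGETABLE_ORDER)) & (NODE_ENTRIES - 1))
#define L4_IDX(pfn) (((pfn) >> (PAGE_SHIFT + 3 + 2 * PAGETABLE_ORDER)) & (NODE_ENTRIES - 1))

/* One zeroed "page": used either as an interior node (array of child
 * pointers) or as a leaf bitmap, depending on the level. */
static void *new_page(void)
{
    return calloc(1, PAGE_SIZE);
}

/* Lazily grow the tree and set the bit for pfn.
 * Returns 1 if newly set, 0 if already set, -1 on allocation failure. */
static int mark_dirty(void ***top, unsigned long pfn)
{
    void **l4, **l3, **l2;
    uint8_t *l1;
    unsigned long bit = L1_IDX(pfn);

    if ( *top == NULL && (*top = new_page()) == NULL )
        return -1;
    l4 = *top;

    if ( l4[L4_IDX(pfn)] == NULL && (l4[L4_IDX(pfn)] = new_page()) == NULL )
        return -1;
    l3 = l4[L4_IDX(pfn)];

    if ( l3[L3_IDX(pfn)] == NULL && (l3[L3_IDX(pfn)] = new_page()) == NULL )
        return -1;
    l2 = l3[L3_IDX(pfn)];

    if ( l2[L2_IDX(pfn)] == NULL && (l2[L2_IDX(pfn)] = new_page()) == NULL )
        return -1;
    l1 = l2[L2_IDX(pfn)];

    if ( l1[bit >> 3] & (1u << (bit & 7)) )
        return 0;                    /* bit was already set */
    l1[bit >> 3] |= 1u << (bit & 7);
    return 1;                        /* newly marked dirty */
}

int main(void)
{
    void **top = NULL;
    unsigned long pfn = 0x12345678UL;

    printf("pfn %#lx -> l4=%lu l3=%lu l2=%lu l1 bit=%lu\n",
           pfn, L4_IDX(pfn), L3_IDX(pfn), L2_IDX(pfn), L1_IDX(pfn));
    printf("first mark:  %d\n", mark_dirty(&top, pfn));   /* prints 1 */
    printf("second mark: %d\n", mark_dirty(&top, pfn));   /* prints 0 */
    return 0;
}

As in paging_mark_dirty() above, nodes are created lazily: an interior node or leaf is only allocated the first time a pfn in its range is dirtied, so the tree never needs anything larger than a single order-0 page, which is the point of the patch.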