[Xen-changelog] [xen master] x86/EPT: don't walk entire page tables when globally changing types
commit 90ac32559bfbd08127638ba13f99b5ed565cfc2b
Author:     Jan Beulich <jbeulich@xxxxxxxx>
AuthorDate: Fri May 2 11:50:43 2014 +0200
Commit:     Jan Beulich <jbeulich@xxxxxxxx>
CommitDate: Fri May 2 11:50:43 2014 +0200

    x86/EPT: don't walk entire page tables when globally changing types

    Instead leverage the EPT_MISCONFIG VM exit by marking just the top
    level entries as needing recalculation of their type, propagating the
    recalculation state down as necessary such that the actual
    recalculation gets done upon access.

    For this to work, we have to
    - restrict the types between which conversions can be done (right now
      only the two types involved in log dirty tracking need to be taken
      care of)
    - remember the ranges that log dirty tracking was requested for as
      well as whether global log dirty tracking is in effect

    Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
    Reviewed-by: Tim Deegan <tim@xxxxxxx>
---
 xen/arch/x86/mm/hap/hap.c         |  12 +++
 xen/arch/x86/mm/p2m-ept.c         | 178 +++++++++++++++++++++++--------------
 xen/arch/x86/mm/p2m.c             |  66 ++++++++++++--
 xen/include/asm-x86/hvm/vmx/vmx.h |   2 +-
 xen/include/asm-x86/p2m.h         |  13 +++
 5 files changed, 196 insertions(+), 75 deletions(-)

diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
index a7593e7..38c01d6 100644
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -110,11 +110,18 @@ int hap_track_dirty_vram(struct domain *d,
         if ( begin_pfn != dirty_vram->begin_pfn ||
              begin_pfn + nr != dirty_vram->end_pfn )
         {
+            unsigned long ostart = dirty_vram->begin_pfn;
+            unsigned long oend = dirty_vram->end_pfn;
+
             dirty_vram->begin_pfn = begin_pfn;
             dirty_vram->end_pfn = begin_pfn + nr;
 
             paging_unlock(d);
 
+            if ( oend > ostart )
+                p2m_change_type_range(d, ostart, oend,
+                                      p2m_ram_logdirty, p2m_ram_rw);
+
             /* set l1e entries of range within P2M table to be read-only. */
             p2m_change_type_range(d, begin_pfn, begin_pfn + nr,
                                   p2m_ram_rw, p2m_ram_logdirty);
@@ -150,11 +157,16 @@ int hap_track_dirty_vram(struct domain *d,
              * If zero pages specified while tracking dirty vram
              * then stop tracking
              */
+            begin_pfn = dirty_vram->begin_pfn;
+            nr = dirty_vram->end_pfn - dirty_vram->begin_pfn;
             xfree(dirty_vram);
             d->arch.hvm_domain.dirty_vram = NULL;
         }
 
         paging_unlock(d);
+        if ( nr )
+            p2m_change_type_range(d, begin_pfn, begin_pfn + nr,
+                                  p2m_ram_logdirty, p2m_ram_rw);
     }
 out:
     if ( dirty_bitmap )
diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index 5d19965..ce12758 100644
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -188,7 +188,6 @@ static int ept_split_super_page(struct p2m_domain *p2m, ept_entry_t *ept_entry,
         epte->mfn += i * trunk;
         epte->snp = (iommu_enabled && iommu_snoop);
         ASSERT(!epte->rsvd1);
-        ASSERT(!epte->avail1);
         ASSERT(!epte->avail3);
 
         ept_p2m_type_to_flags(epte, epte->sa_p2mt, epte->access);
@@ -271,7 +270,12 @@ static int ept_next_level(struct p2m_domain *p2m, bool_t read_only,
         return GUEST_TABLE_NORMAL_PAGE;
 }
 
-static bool_t ept_invalidate_emt(mfn_t mfn)
+/*
+ * Invalidate (via setting the EMT field to an invalid value) all valid
+ * present entries in the given page table, optionally marking the entries
+ * also for their subtrees needing P2M type re-calculation.
+ */
+static bool_t ept_invalidate_emt(mfn_t mfn, bool_t recalc)
 {
     ept_entry_t *epte = map_domain_page(mfn_x(mfn));
     unsigned int i;
@@ -282,10 +286,12 @@ static bool_t ept_invalidate_emt(mfn_t mfn)
         ept_entry_t e = atomic_read_ept_entry(&epte[i]);
 
         if ( !is_epte_valid(&e) || !is_epte_present(&e) ||
-             e.emt == MTRR_NUM_TYPES )
+             (e.emt == MTRR_NUM_TYPES && (e.recalc || !recalc)) )
             continue;
 
         e.emt = MTRR_NUM_TYPES;
+        if ( recalc )
+            e.recalc = 1;
         atomic_write_ept_entry(&epte[i], e);
         changed = 1;
     }
@@ -295,23 +301,29 @@ static bool_t ept_invalidate_emt(mfn_t mfn)
     return changed;
 }
 
-bool_t ept_handle_misconfig(uint64_t gpa)
+/*
+ * Resolve deliberately mis-configured (EMT field set to an invalid value)
+ * entries in the page table hierarchy for the given GFN:
+ * - calculate the correct value for the EMT field,
+ * - if marked so, re-calculate the P2M type,
+ * - propagate EMT and re-calculation flag down to the next page table level
+ *   for entries not involved in the translation of the given GFN.
+ * Returns:
+ * - negative errno values in error,
+ * - zero if no adjustment was done,
+ * - a positive value if at least one adjustment was done.
+ */
+static int resolve_misconfig(struct p2m_domain *p2m, unsigned long gfn)
 {
-    struct vcpu *curr = current;
-    struct p2m_domain *p2m = p2m_get_hostp2m(curr->domain);
     struct ept_data *ept = &p2m->ept;
     unsigned int level = ept_get_wl(ept);
-    unsigned long gfn = PFN_DOWN(gpa);
     unsigned long mfn = ept_get_asr(ept);
     ept_entry_t *epte;
-    int okay;
+    int rc = 0;
 
     if ( !mfn )
         return 0;
 
-    p2m_lock(p2m);
-
-    okay = -curr->arch.hvm_vmx.ept_spurious_misconfig;
     for ( ; ; --level )
     {
         ept_entry_t e;
@@ -341,6 +353,13 @@ bool_t ept_handle_misconfig(uint64_t gpa)
                                                _mfn(e.mfn), 0, &ipat,
                                                e.sa_p2mt == p2m_mmio_direct);
                     e.ipat = ipat;
+                    if ( e.recalc && p2m_is_changeable(e.sa_p2mt) )
+                    {
+                         e.sa_p2mt = p2m_is_logdirty_range(p2m, gfn + i, gfn + i)
+                                     ? p2m_ram_logdirty : p2m_ram_rw;
+                         ept_p2m_type_to_flags(&e, e.sa_p2mt, e.access);
+                    }
+                    e.recalc = 0;
                     atomic_write_ept_entry(&epte[i], e);
                 }
             }
@@ -349,6 +368,28 @@ bool_t ept_handle_misconfig(uint64_t gpa)
             int emt = epte_get_entry_emt(p2m->domain, gfn, _mfn(e.mfn),
                                          level * EPT_TABLE_ORDER, &ipat,
                                          e.sa_p2mt == p2m_mmio_direct);
+            bool_t recalc = e.recalc;
+
+            if ( recalc && p2m_is_changeable(e.sa_p2mt) )
+            {
+                 unsigned long mask = ~0UL << (level * EPT_TABLE_ORDER);
+
+                 switch ( p2m_is_logdirty_range(p2m, gfn & mask,
+                                                gfn | ~mask) )
+                 {
+                 case 0:
+                     e.sa_p2mt = p2m_ram_rw;
+                     e.recalc = 0;
+                     break;
+                 case 1:
+                     e.sa_p2mt = p2m_ram_logdirty;
+                     e.recalc = 0;
+                     break;
+                 default: /* Force split. */
+                     emt = -1;
+                     break;
+                 }
+            }
             if ( unlikely(emt < 0) )
             {
                 if ( ept_split_super_page(p2m, &e, level, level - 1) )
@@ -359,27 +400,31 @@ bool_t ept_handle_misconfig(uint64_t gpa)
                     continue;
                 }
                 ept_free_entry(p2m, &e, level);
-                okay = 0;
+                rc = -ENOMEM;
                 break;
             }
             e.emt = emt;
             e.ipat = ipat;
+            e.recalc = 0;
+            if ( recalc && p2m_is_changeable(e.sa_p2mt) )
+                ept_p2m_type_to_flags(&e, e.sa_p2mt, e.access);
             atomic_write_ept_entry(&epte[i], e);
-            okay = 1;
+            rc = 1;
             break;
         }
 
         if ( e.emt == MTRR_NUM_TYPES )
         {
             ASSERT(is_epte_present(&e));
-            ept_invalidate_emt(_mfn(e.mfn));
+            ept_invalidate_emt(_mfn(e.mfn), e.recalc);
             smp_wmb();
             e.emt = 0;
+            e.recalc = 0;
             atomic_write_ept_entry(&epte[i], e);
             unmap_domain_page(epte);
-            okay = 1;
+            rc = 1;
         }
         else if ( is_epte_present(&e) && !e.emt )
             unmap_domain_page(epte);
@@ -390,18 +435,34 @@ bool_t ept_handle_misconfig(uint64_t gpa)
     }
     unmap_domain_page(epte);
 
-    if ( okay > 0 )
+    if ( rc )
     {
         struct vcpu *v;
 
-        for_each_vcpu ( curr->domain, v )
+        for_each_vcpu ( p2m->domain, v )
             v->arch.hvm_vmx.ept_spurious_misconfig = 1;
     }
+
+    return rc;
+}
+
+bool_t ept_handle_misconfig(uint64_t gpa)
+{
+    struct vcpu *curr = current;
+    struct p2m_domain *p2m = p2m_get_hostp2m(curr->domain);
+    bool_t spurious;
+    int rc;
+
+    p2m_lock(p2m);
+
+    spurious = curr->arch.hvm_vmx.ept_spurious_misconfig;
+    rc = resolve_misconfig(p2m, PFN_DOWN(gpa));
     curr->arch.hvm_vmx.ept_spurious_misconfig = 0;
 
     ept_sync_domain(p2m);
+
     p2m_unlock(p2m);
 
-    return !!okay;
+    return spurious ? (rc >= 0) : (rc > 0);
 }
 
 /*
@@ -417,13 +478,12 @@ ept_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
     ept_entry_t *table, *ept_entry = NULL;
     unsigned long gfn_remainder = gfn;
     int i, target = order / EPT_TABLE_ORDER;
-    int rc = 0;
-    int ret = 0;
+    int ret, rc = 0;
    bool_t direct_mmio = (p2mt == p2m_mmio_direct);
     uint8_t ipat = 0;
     int need_modify_vtd_table = 1;
     int vtd_pte_present = 0;
-    int needs_sync = 1;
+    enum { sync_off, sync_on, sync_check } needs_sync = sync_check;
     ept_entry_t old_entry = { .epte = 0 };
     ept_entry_t new_entry = { .epte = 0 };
     struct ept_data *ept = &p2m->ept;
@@ -441,12 +501,23 @@ ept_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
          (order % EPT_TABLE_ORDER) )
         return -EINVAL;
 
+    /* Carry out any eventually pending earlier changes first. */
+    ret = resolve_misconfig(p2m, gfn);
+    if ( ret < 0 )
+    {
+        ept_sync_domain(p2m);
+        return ret;
+    }
+    if ( ret > 0 )
+        needs_sync = sync_on;
+
     ASSERT((target == 2 && hvm_hap_has_1gb()) ||
            (target == 1 && hvm_hap_has_2mb()) ||
            (target == 0));
 
     table = map_domain_page(pagetable_get_pfn(p2m_get_pagetable(p2m)));
 
+    ret = GUEST_TABLE_MAP_FAILED;
     for ( i = ept_get_wl(ept); i > target; i-- )
     {
         ret = ept_next_level(p2m, 0, &table, &gfn_remainder, i);
@@ -480,8 +551,8 @@ ept_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
     /* We reached the target level. */
 
     /* No need to flush if the old entry wasn't valid */
-    if ( !is_epte_present(ept_entry) )
-        needs_sync = 0;
+    if ( needs_sync == sync_check && !is_epte_present(ept_entry) )
+        needs_sync = sync_off;
 
     /* If we're replacing a non-leaf entry with a leaf entry (1GiB or 2MiB),
      * the intermediate tables will be freed below after the ept flush
@@ -556,7 +627,7 @@ ept_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
 out:
     unmap_domain_page(table);
 
-    if ( needs_sync )
+    if ( needs_sync != sync_off )
         ept_sync_domain(p2m);
 
     /* For non-nested p2m, may need to change VT-d page table.*/
@@ -598,6 +669,7 @@ static mfn_t ept_get_entry(struct p2m_domain *p2m,
     u32 index;
     int i;
     int ret = 0;
+    bool_t recalc = 0;
     mfn_t mfn = _mfn(INVALID_MFN);
     struct ept_data *ept = &p2m->ept;
 
@@ -613,6 +685,8 @@ static mfn_t ept_get_entry(struct p2m_domain *p2m,
     for ( i = ept_get_wl(ept); i > 0; i-- )
     {
     retry:
+        if ( table[gfn_remainder >> (i * EPT_TABLE_ORDER)].recalc )
+            recalc = 1;
         ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
         if ( !ret )
             goto out;
@@ -659,7 +733,12 @@ static mfn_t ept_get_entry(struct p2m_domain *p2m,
 
     if ( is_epte_valid(ept_entry) )
     {
-        *t = ept_entry->sa_p2mt;
+        if ( (recalc || ept_entry->recalc) &&
+             p2m_is_changeable(ept_entry->sa_p2mt) )
+            *t = p2m_is_logdirty_range(p2m, gfn, gfn) ? p2m_ram_logdirty
+                                                      : p2m_ram_rw;
+        else
+            *t = ept_entry->sa_p2mt;
         *a = ept_entry->access;
 
         mfn = _mfn(ept_entry->mfn);
@@ -735,53 +814,18 @@ out:
     return;
 }
 
-/*
- * Walk the whole p2m table, changing any entries of the old type
- * to the new type. This is used in hardware-assisted paging to
- * quickly enable or diable log-dirty tracking
- */
-static void ept_change_entry_type_page(mfn_t ept_page_mfn, int ept_page_level,
-                                       p2m_type_t ot, p2m_type_t nt)
-{
-    ept_entry_t e, *epte = map_domain_page(mfn_x(ept_page_mfn));
-
-    for ( int i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
-    {
-        if ( !is_epte_valid(epte + i) )
-            continue;
-
-        if ( (ept_page_level > 0) && !is_epte_superpage(epte + i) )
-            ept_change_entry_type_page(_mfn(epte[i].mfn),
-                                       ept_page_level - 1, ot, nt);
-        else
-        {
-            e = atomic_read_ept_entry(&epte[i]);
-            if ( e.sa_p2mt != ot )
-                continue;
-
-            e.sa_p2mt = nt;
-            ept_p2m_type_to_flags(&e, nt, e.access);
-            atomic_write_ept_entry(&epte[i], e);
-        }
-    }
-
-    unmap_domain_page(epte);
-}
-
 static void ept_change_entry_type_global(struct p2m_domain *p2m,
                                          p2m_type_t ot, p2m_type_t nt)
 {
-    struct ept_data *ept = &p2m->ept;
-    if ( ept_get_asr(ept) == 0 )
-        return;
+    unsigned long mfn = ept_get_asr(&p2m->ept);
 
-    BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
-    BUG_ON(p2m_is_mmio(ot) || p2m_is_mmio(nt));
+    if ( !mfn || ot == nt )
+        return;
 
-    ept_change_entry_type_page(_mfn(ept_get_asr(ept)),
-                               ept_get_wl(ept), ot, nt);
+    BUG_ON(!p2m_is_changeable(ot) || !p2m_is_changeable(nt));
 
-    ept_sync_domain(p2m);
+    if ( ept_invalidate_emt(_mfn(mfn), 1) )
+        ept_sync_domain(p2m);
 }
 
 static void ept_memory_type_changed(struct p2m_domain *p2m)
@@ -791,7 +835,7 @@ static void ept_memory_type_changed(struct p2m_domain *p2m)
     if ( !mfn )
         return;
 
-    if ( ept_invalidate_emt(_mfn(mfn)) )
+    if ( ept_invalidate_emt(_mfn(mfn), 0) )
         ept_sync_domain(p2m);
 }
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index 1d0528b..80bcb33 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -116,8 +116,14 @@ static int p2m_init_hostp2m(struct domain *d)
 
     if ( p2m )
     {
-        d->arch.p2m = p2m;
-        return 0;
+        p2m->logdirty_ranges = rangeset_new(d, "log-dirty",
+                                            RANGESETF_prettyprint_hex);
+        if ( p2m->logdirty_ranges )
+        {
+            d->arch.p2m = p2m;
+            return 0;
+        }
+        p2m_free_one(p2m);
     }
     return -ENOMEM;
 }
@@ -129,6 +135,7 @@ static void p2m_teardown_hostp2m(struct domain *d)
 
     if ( p2m )
     {
+        rangeset_destroy(p2m->logdirty_ranges);
        p2m_free_one(p2m);
         d->arch.p2m = NULL;
     }
@@ -191,12 +198,25 @@ int p2m_init(struct domain *d)
     return rc;
 }
 
+int p2m_is_logdirty_range(struct p2m_domain *p2m, unsigned long start,
+                          unsigned long end)
+{
+    ASSERT(!p2m_is_nestedp2m(p2m));
+    if ( p2m->global_logdirty ||
+         rangeset_contains_range(p2m->logdirty_ranges, start, end) )
+        return 1;
+    if ( rangeset_overlaps_range(p2m->logdirty_ranges, start, end) )
+        return -1;
+    return 0;
+}
+
 void p2m_change_entry_type_global(struct domain *d,
                                   p2m_type_t ot, p2m_type_t nt)
 {
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
 
     p2m_lock(p2m);
     p2m->change_entry_type_global(p2m, ot, nt);
+    p2m->global_logdirty = (nt == p2m_ram_logdirty);
     p2m_unlock(p2m);
 }
 
@@ -713,6 +733,7 @@ void p2m_change_type_range(struct domain *d,
     unsigned long gfn;
     mfn_t mfn;
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
+    int rc = 0;
 
     BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
 
@@ -726,11 +747,22 @@ void p2m_change_type_range(struct domain *d,
         mfn = p2m->get_entry(p2m, gfn, &pt, &a, 0, &order);
         while ( order > PAGE_ORDER_4K )
         {
-            if ( pt != ot )
-                break;
-            if ( !(gfn & ((1UL << order) - 1)) &&
-                 end > (gfn | ((1UL << order) - 1)) )
-                break;
+            unsigned long mask = ~0UL << order;
+
+            /*
+             * Log-dirty ranges starting/ending in the middle of a super page
+             * (with a page split still pending) can't have a consistent type
+             * reported for the full range and hence need the split to be
+             * enforced here.
+             */
+            if ( !p2m_is_changeable(pt) ||
+                 p2m_is_logdirty_range(p2m, gfn & mask, gfn | ~mask) >= 0 )
+            {
+                if ( pt != ot )
+                    break;
+                if ( !(gfn & ~mask) && end > (gfn | ~mask) )
+                    break;
+            }
             if ( order == PAGE_ORDER_1G )
                 order = PAGE_ORDER_2M;
             else
@@ -744,6 +776,26 @@ void p2m_change_type_range(struct domain *d,
             break;
     }
 
+    switch ( nt )
+    {
+    case p2m_ram_rw:
+        if ( ot == p2m_ram_logdirty )
+            rc = rangeset_remove_range(p2m->logdirty_ranges, start, end - 1);
+        break;
+    case p2m_ram_logdirty:
+        if ( ot == p2m_ram_rw )
+            rc = rangeset_add_range(p2m->logdirty_ranges, start, end - 1);
+        break;
+    default:
+        break;
+    }
+    if ( rc )
+    {
+        printk(XENLOG_G_ERR "Error %d manipulating Dom%d's log-dirty ranges\n",
+               rc, d->domain_id);
+        domain_crash(d);
+    }
+
     p2m->defer_nested_flush = 0;
     if ( nestedhvm_enabled(d) )
         p2m_flush_nestedp2m(d);
diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h
index 2e8cd70..c8bb548 100644
--- a/xen/include/asm-x86/hvm/vmx/vmx.h
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h
@@ -38,7 +38,7 @@ typedef union {
         ipat        :   1,  /* bit 6 - Ignore PAT memory type */
         sp          :   1,  /* bit 7 - Is this a superpage? */
         rsvd1       :   2,  /* bits 9:8 - Reserved for future use */
-        avail1      :   1,  /* bit 10 - Software available 1 */
+        recalc      :   1,  /* bit 10 - Software available 1 */
         snp         :   1,  /* bit 11 - VT-d snoop control in shared
                                EPT/VT-d usage */
         mfn         :   40, /* bits 51:12 - Machine physical frame number */
diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h
index 86847e9..06d5bad 100644
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -140,6 +140,10 @@ typedef unsigned int p2m_query_t;
                        | p2m_to_mask(p2m_grant_map_ro)   \
                        | p2m_to_mask(p2m_ram_shared) )
 
+/* Types that can be subject to bulk transitions. */
+#define P2M_CHANGEABLE_TYPES (p2m_to_mask(p2m_ram_rw) \
+                              | p2m_to_mask(p2m_ram_logdirty) )
+
 #define P2M_POD_TYPES (p2m_to_mask(p2m_populate_on_demand))
 
 /* Pageable types */
@@ -168,6 +172,7 @@ typedef unsigned int p2m_query_t;
 #define p2m_is_hole(_t) (p2m_to_mask(_t) & P2M_HOLE_TYPES)
 #define p2m_is_mmio(_t) (p2m_to_mask(_t) & P2M_MMIO_TYPES)
 #define p2m_is_readonly(_t) (p2m_to_mask(_t) & P2M_RO_TYPES)
+#define p2m_is_changeable(_t) (p2m_to_mask(_t) & P2M_CHANGEABLE_TYPES)
 #define p2m_is_pod(_t) (p2m_to_mask(_t) & P2M_POD_TYPES)
 #define p2m_is_grant(_t) (p2m_to_mask(_t) & P2M_GRANT_TYPES)
 /* Grant types are *not* considered valid, because they can be
@@ -211,6 +216,11 @@ struct p2m_domain {
      * threaded on in LRU order. */
     struct list_head   np2m_list;
 
+    /* Host p2m: Log-dirty ranges registered for the domain. */
+    struct rangeset   *logdirty_ranges;
+
+    /* Host p2m: Global log-dirty mode enabled for the domain. */
+    bool_t             global_logdirty;
     /* Host p2m: when this flag is set, don't flush all the nested-p2m
      * tables on every host-p2m change. The setter of this flag
      * is responsible for performing the full flush before releasing the
@@ -511,6 +521,9 @@ p2m_type_t p2m_change_type(struct domain *d, unsigned long gfn,
 /* Report a change affecting memory types. */
 void p2m_memory_type_changed(struct domain *d);
 
+int p2m_is_logdirty_range(struct p2m_domain *, unsigned long start,
+                          unsigned long end);
+
 /* Set mmio addresses in the p2m table (for pass-through) */
 int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn);
 int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn);
--
generated by git-patchbot for /home/xen/git/xen.git#master
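
The mechanism above is worth restating: a global p2m_ram_rw <-> p2m_ram_logdirty
conversion no longer touches every leaf entry. It only poisons the EMT field of
the (at most 512) top-level entries and sets the new "recalc" bit; the next
guest access through such an entry takes an EPT_MISCONFIG VM exit, where
resolve_misconfig() repairs the entry, recomputes its type from the log-dirty
rangeset, and pushes the poisoning one level further down. Below is a minimal
sketch of the invalidation step only; the entry layout, the EMT_INVALID
sentinel, and the function names are simplifying assumptions for illustration
(Xen itself uses MTRR_NUM_TYPES as the sentinel, in ept_invalidate_emt() as
shown in the patch):

    /* Illustrative sketch only, not part of the patch above. */
    #include <stdbool.h>
    #include <stdint.h>

    #define ENTRIES_PER_TABLE 512
    #define EMT_INVALID       7   /* assumed: not a real EPT memory type */

    typedef struct {
        uint64_t emt    : 3;      /* memory type; EMT_INVALID => VM exit */
        uint64_t recalc : 1;      /* subtree's p2m type needs recomputing */
        /* remaining EPT entry fields elided */
    } epte_t;

    /* O(512) instead of a full-tree walk: poison only the root table. */
    static bool invalidate_root(epte_t *root)
    {
        bool changed = false;
        unsigned int i;

        for ( i = 0; i < ENTRIES_PER_TABLE; i++ )
        {
            if ( root[i].emt == EMT_INVALID && root[i].recalc )
                continue;              /* recalculation already pending */
            root[i].emt = EMT_INVALID; /* next access through this entry
                                          faults with EPT_MISCONFIG */
            root[i].recalc = 1;        /* fixup must also recompute type */
            changed = true;
        }
        return changed;                /* caller flushes EPT TLBs if set */
    }

Splitting the work into this cheap invalidation pass and the on-demand
resolution in the misconfiguration handler is what turns the former
whole-table walk into constant-time work at conversion time, at the cost of
one extra VM exit per top-level subtree touched afterwards.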