[Xen-devel] [PATCH 1/4] Out-of-sync L1 shadows: OOS base
This patch implements the basic mechanisms to get pagetables out of
sync and back in sync again.

Signed-off-by: Gianluca Guida <gianluca.guida@xxxxxxxxxxxxx>
Signed-off-by: Tim Deegan <tim.deegan@xxxxxxxxxxxxx>
Signed-off-by: George Dunlap <george.dunlap@xxxxxxxxxxxxx>

diff -r 26ecd1f9e128 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/arch/x86/mm.c Fri Jun 20 15:10:08 2008 +0100
@@ -1933,9 +1933,15 @@ int get_page_type(struct page_info *page
         {
             struct domain *d = page_get_owner(page);
 
-            /* Never allow a shadowed frame to go from type count 0 to 1 */
-            if ( d && shadow_mode_enabled(d) )
-                shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
+            /* Normally we should never let a page go from type count 0
+             * to type count 1 when it is shadowed. One exception:
+             * out-of-sync shadowed pages are allowed to become
+             * writeable. */
+            if ( d && shadow_mode_enabled(d)
+                 && (page->count_info & PGC_page_table)
+                 && !((page->shadow_flags & (1u<<29))
+                      && type == PGT_writable_page) )
+                shadow_remove_all_shadows(d->vcpu[0], _mfn(page_to_mfn(page)));
 
             ASSERT(!(x & PGT_pae_xen_l2));
             if ( (x & PGT_type_mask) != type )
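
(The hunk above open-codes (1u<<29); that literal matches SHF_oos_may_write, which this patch defines in shadow/private.h further down, presumably spelled out here because mm.c does not see the shadow-private header. A minimal standalone sketch, not part of the patch, of the new test's logic; every constant except the oos_may_write bit position is a stand-in for illustration:

#include <stdint.h>

/* Stand-in constants for illustration only; real values live in Xen headers. */
#define PGC_page_table    (1u << 4)
#define SHF_oos_may_write (1u << 29)   /* matches shadow/private.h below */
#define PGT_writable_page 0x1u

struct page_like { uint32_t count_info, shadow_flags; };

/* Non-zero if taking a writable type reference must first remove all
 * shadows: the page is shadowed, and it is not an L1 that has been
 * allowed to go out of sync and become writeable. */
static int must_remove_shadows(const struct page_like *pg, uint32_t type)
{
    return (pg->count_info & PGC_page_table)
        && !((pg->shadow_flags & SHF_oos_may_write)
             && type == PGT_writable_page);
}

int main(void)
{
    struct page_like pg = { PGC_page_table, SHF_oos_may_write };
    return must_remove_shadows(&pg, PGT_writable_page); /* 0: OOS L1 may go writable */
}

End of sketch.)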
diff -r 26ecd1f9e128 xen/arch/x86/mm/shadow/common.c
--- a/xen/arch/x86/mm/shadow/common.c Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/arch/x86/mm/shadow/common.c Fri Jun 20 15:10:08 2008 +0100
@@ -54,6 +54,10 @@ void shadow_domain_init(struct domain *d
     /* Use shadow pagetables for log-dirty support */
     paging_log_dirty_init(d, shadow_enable_log_dirty,
                           shadow_disable_log_dirty, shadow_clean_dirty_bitmap);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    d->arch.paging.shadow.oos_active = 0;
+#endif
 }
 
 /* Setup the shadow-specfic parts of a vcpu struct. Note: The most important
@@ -64,6 +68,13 @@ void shadow_domain_init(struct domain *d
  */
 void shadow_vcpu_init(struct vcpu *v)
 {
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    int i;
+
+    for ( i = 0; i < SHADOW_OOS_PAGES; i++ )
+        v->arch.paging.shadow.oos[i] = _mfn(INVALID_MFN);
+#endif
+
     v->arch.paging.mode = &SHADOW_INTERNAL_NAME(sh_paging_mode, 3);
 }
 
@@ -427,6 +438,404 @@ void shadow_continue_emulation(struct sh
         }
     }
 }
+
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+/**************************************************************************/
+/* Out-of-sync shadows. */
+
+/* From time to time, we let a shadowed pagetable page go out of sync
+ * with its shadow: the guest is allowed to write directly to the page,
+ * and those writes are not synchronously reflected in the shadow.
+ * This lets us avoid many emulations if the guest is writing a lot to a
+ * pagetable, but it relaxes a pretty important invariant in the shadow
+ * pagetable design. Therefore, some rules:
+ *
+ * 1. Only L1 pagetables may go out of sync: any page that is shadowed
+ *    at a higher level must be synchronously updated. This makes
+ *    using linear shadow pagetables much less dangerous.
+ *    That means that: (a) unsyncing code needs to check for higher-level
+ *    shadows, and (b) promotion code needs to resync.
+ *
+ * 2. All shadow operations on a guest page require the page to be brought
+ *    back into sync before proceeding. This must be done under the
+ *    shadow lock so that the page is guaranteed to remain synced until
+ *    the operation completes.
+ *
+ *    Exceptions to this rule: the pagefault and invlpg handlers may
+ *    update only one entry on an out-of-sync page without resyncing it.
+ *
+ * 3. Operations on shadows that do not start from a guest page need to
+ *    be aware that they may be handling an out-of-sync shadow.
+ *
+ * 4. Operations that do not normally take the shadow lock (fast-path
+ *    #PF handler, INVLPG) must fall back to a locking, syncing version
+ *    if they see an out-of-sync table.
+ *
+ * 5. Operations corresponding to guest TLB flushes (MOV CR3, INVLPG)
+ *    must explicitly resync all relevant pages or update their
+ *    shadows.
+ *
+ * Currently out-of-sync pages are listed in a simple open-addressed
+ * hash table with a second chance (must resist temptation to radically
+ * over-engineer hash tables...) The virtual address of the access
+ * which caused us to unsync the page is also kept in the hash table, as
+ * a hint for finding the writable mappings later.
+ *
+ * We keep a hash per vcpu, because we want as much as possible to do
+ * the re-sync on the same vcpu we did the unsync on, so the VA hint
+ * will be valid.
+ */
+
+
+#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL
+static void sh_oos_audit(struct domain *d)
+{
+    int idx, expected_idx, expected_idx_alt;
+    struct page_info *pg;
+    struct vcpu *v;
+
+    for_each_vcpu(d, v)
+    {
+        for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
+        {
+            mfn_t *oos = v->arch.paging.shadow.oos;
+            if ( !mfn_valid(oos[idx]) )
+                continue;
+
+            expected_idx = mfn_x(oos[idx]) % SHADOW_OOS_PAGES;
+            expected_idx_alt = ((expected_idx + 1) % SHADOW_OOS_PAGES);
+            if ( idx != expected_idx && idx != expected_idx_alt )
+            {
+                printk("%s: idx %d contains gmfn %lx, expected at %d or %d.\n",
+                       __func__, idx, mfn_x(oos[idx]),
+                       expected_idx, expected_idx_alt);
+                BUG();
+            }
+            pg = mfn_to_page(oos[idx]);
+            if ( !(pg->count_info & PGC_page_table) )
+            {
+                printk("%s: idx %x gmfn %lx not a pt (count %"PRIx32")\n",
+                       __func__, idx, mfn_x(oos[idx]), pg->count_info);
+                BUG();
+            }
+            if ( !(pg->shadow_flags & SHF_out_of_sync) )
+            {
+                printk("%s: idx %x gmfn %lx not marked oos (flags %lx)\n",
+                       __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
+                BUG();
+            }
+            if ( (pg->shadow_flags & SHF_page_type_mask & ~SHF_L1_ANY) )
+            {
+                printk("%s: idx %x gmfn %lx shadowed as non-l1 (flags %lx)\n",
+                       __func__, idx, mfn_x(oos[idx]), pg->shadow_flags);
+                BUG();
+            }
+        }
+    }
+}
+#endif
+
+#if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
+void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn)
+{
+    int idx;
+    struct vcpu *v;
+    mfn_t *oos;
+
+    ASSERT(mfn_is_out_of_sync(gmfn));
+
+    for_each_vcpu(d, v)
+    {
+        oos = v->arch.paging.shadow.oos;
+        idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
+            idx = (idx + 1) % SHADOW_OOS_PAGES;
+
+        if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
+            return;
+    }
+
+    SHADOW_ERROR("gmfn %lx marked OOS but not in hash table\n", mfn_x(gmfn));
+    BUG();
+}
+#endif
+
+/* Update the shadow, but keep the page out of sync. */
+static inline void _sh_resync_l1(struct vcpu *v, mfn_t gmfn)
+{
+    struct page_info *pg = mfn_to_page(gmfn);
+
+    ASSERT(mfn_valid(gmfn));
+    ASSERT(page_is_out_of_sync(pg));
+
+    /* Call out to the appropriate per-mode resyncing function */
+    if ( pg->shadow_flags & SHF_L1_32 )
+        SHADOW_INTERNAL_NAME(sh_resync_l1, 2)(v, gmfn);
+    else if ( pg->shadow_flags & SHF_L1_PAE )
+        SHADOW_INTERNAL_NAME(sh_resync_l1, 3)(v, gmfn);
+#if CONFIG_PAGING_LEVELS >= 4
+    else if ( pg->shadow_flags & SHF_L1_64 )
+        SHADOW_INTERNAL_NAME(sh_resync_l1, 4)(v, gmfn);
+#endif
+}
+
+/* Pull all the entries on an out-of-sync page back into sync. */
+static void _sh_resync(struct vcpu *v, mfn_t gmfn, unsigned long va)
+{
+    struct page_info *pg = mfn_to_page(gmfn);
+
+    ASSERT(shadow_locked_by_me(v->domain));
+    ASSERT(mfn_is_out_of_sync(gmfn));
+    /* Guest page must be shadowed *only* as L1 when out of sync. */
+    ASSERT(!(mfn_to_page(gmfn)->shadow_flags & SHF_page_type_mask
+             & ~SHF_L1_ANY));
+    ASSERT(!sh_page_has_multiple_shadows(mfn_to_page(gmfn)));
+
+    SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx, va=%lx\n",
+                  v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va);
+
+    /* Need to pull write access so the page *stays* in sync.
+     * This might be rather slow but we hope that in the common case
+     * we're handling this pagetable after a guest walk has pulled
+     * write access the fast way. */
+    switch ( sh_remove_write_access(v, gmfn, 0, va) )
+    {
+    default:
+    case 0:
+        break;
+
+    case 1:
+        flush_tlb_mask(v->domain->domain_dirty_cpumask);
+        break;
+
+    case -1:
+        /* An unfindable writeable typecount has appeared, probably via a
+         * grant table entry: can't shoot the mapping, so try to unshadow
+         * the page. If that doesn't work either, the guest is granting
+         * his pagetables and must be killed after all. */
+        sh_remove_shadows(v, gmfn, 0 /* Be thorough */, 1 /* Must succeed */);
+        return;
+    }
+
+    /* No more writable mappings of this page, please */
+    pg->shadow_flags &= ~SHF_oos_may_write;
+
+    /* Update the shadows with current guest entries. */
+    _sh_resync_l1(v, gmfn);
+
+    /* Now we know all the entries are synced, and will stay that way */
+    pg->shadow_flags &= ~SHF_out_of_sync;
+    perfc_incr(shadow_resync);
+}
+
+
+/* Add an MFN to the list of out-of-sync guest pagetables */
+static void oos_hash_add(struct vcpu *v, mfn_t gmfn, unsigned long va)
+{
+    int idx;
+    mfn_t *oos = v->arch.paging.shadow.oos;
+    unsigned long *oos_va = v->arch.paging.shadow.oos_va;
+
+    idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+    if ( mfn_valid(oos[idx])
+         && (mfn_x(oos[idx]) % SHADOW_OOS_PAGES) == idx )
+    {
+        /* Punt the current occupant into the next slot */
+        SWAP(oos[idx], gmfn);
+        SWAP(oos_va[idx], va);
+        idx = (idx + 1) % SHADOW_OOS_PAGES;
+    }
+    if ( mfn_valid(oos[idx]) )
+    {
+        /* Crush the current occupant. */
+        _sh_resync(v, oos[idx], oos_va[idx]);
+        perfc_incr(shadow_unsync_evict);
+    }
+    oos[idx] = gmfn;
+    oos_va[idx] = va;
+}
+
+/* Remove an MFN from the list of out-of-sync guest pagetables */
+static void oos_hash_remove(struct vcpu *v, mfn_t gmfn)
+{
+    int idx;
+    mfn_t *oos;
+    struct domain *d = v->domain;
+
+    SHADOW_PRINTK("D%dV%d gmfn %lx\n",
+                  v->domain->domain_id, v->vcpu_id, mfn_x(gmfn));
+
+    for_each_vcpu(d, v)
+    {
+        oos = v->arch.paging.shadow.oos;
+        idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
+            idx = (idx + 1) % SHADOW_OOS_PAGES;
+        if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
+        {
+            oos[idx] = _mfn(INVALID_MFN);
+            return;
+        }
+    }
+
+    SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
+    BUG();
+}
+
+/* Pull a single guest page back into sync */
+void sh_resync(struct vcpu *v, mfn_t gmfn)
+{
+    int idx;
+    mfn_t *oos;
+    unsigned long *oos_va;
+    struct domain *d = v->domain;
+
+    for_each_vcpu(d, v)
+    {
+        oos = v->arch.paging.shadow.oos;
+        oos_va = v->arch.paging.shadow.oos_va;
+        idx = mfn_x(gmfn) % SHADOW_OOS_PAGES;
+        if ( mfn_x(oos[idx]) != mfn_x(gmfn) )
+            idx = (idx + 1) % SHADOW_OOS_PAGES;
+
+        if ( mfn_x(oos[idx]) == mfn_x(gmfn) )
+        {
+            _sh_resync(v, gmfn, oos_va[idx]);
+            oos[idx] = _mfn(INVALID_MFN);
+            return;
+        }
+    }
+
+    SHADOW_ERROR("gmfn %lx was OOS but not in hash table\n", mfn_x(gmfn));
+    BUG();
+}
+
+/* Figure out whether it's definitely safe not to sync this l1 table,
+ * by making a call out to the mode in which that shadow was made. */
+static int sh_skip_sync(struct vcpu *v, mfn_t gl1mfn)
+{
+    struct page_info *pg = mfn_to_page(gl1mfn);
+    if ( pg->shadow_flags & SHF_L1_32 )
+        return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 2)(v, gl1mfn);
+    else if ( pg->shadow_flags & SHF_L1_PAE )
+        return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 3)(v, gl1mfn);
+#if CONFIG_PAGING_LEVELS >= 4
+    else if ( pg->shadow_flags & SHF_L1_64 )
+        return SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, 4)(v, gl1mfn);
+#endif
+    SHADOW_ERROR("gmfn 0x%lx was OOS but not shadowed as an l1.\n",
+                 mfn_x(gl1mfn));
+    BUG();
+    return 0; /* BUG() is no longer __attribute__((noreturn)). */
+}
+
+
+/* Pull all out-of-sync pages back into sync. Pages brought out of sync
+ * on other vcpus are allowed to remain out of sync, but their contents
+ * will be made safe (TLB flush semantics); pages unsynced by this vcpu
+ * are brought back into sync and write-protected. If skip != 0, we try
+ * to avoid resyncing at all if we think we can get away with it. */
+void sh_resync_all(struct vcpu *v, int skip, int this, int others, int do_locking)
+{
+    int idx;
+    struct vcpu *other;
+    mfn_t *oos = v->arch.paging.shadow.oos;
+    unsigned long *oos_va = v->arch.paging.shadow.oos_va;
+
+    SHADOW_PRINTK("d=%d, v=%d\n", v->domain->domain_id, v->vcpu_id);
+
+    ASSERT(do_locking || shadow_locked_by_me(v->domain));
+
+    if ( !this )
+        goto resync_others;
+
+    if ( do_locking )
+        shadow_lock(v->domain);
+
+    /* First: resync all of this vcpu's oos pages */
+    for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
+        if ( mfn_valid(oos[idx]) )
+        {
+            /* Write-protect and sync contents */
+            _sh_resync(v, oos[idx], oos_va[idx]);
+            oos[idx] = _mfn(INVALID_MFN);
+        }
+
+    if ( do_locking )
+        shadow_unlock(v->domain);
+
+ resync_others:
+    if ( !others )
+        return;
+
+    /* Second: make all *other* vcpus' oos pages safe. */
+    for_each_vcpu(v->domain, other)
+    {
+        if ( v == other )
+            continue;
+
+        if ( do_locking )
+            shadow_lock(v->domain);
+
+        oos = other->arch.paging.shadow.oos;
+        oos_va = other->arch.paging.shadow.oos_va;
+
+        for ( idx = 0; idx < SHADOW_OOS_PAGES; idx++ )
+        {
+            if ( !mfn_valid(oos[idx]) )
+                continue;
+
+            if ( skip )
+            {
+                /* Update the shadows and leave the page OOS. */
+                if ( sh_skip_sync(v, oos[idx]) )
+                    continue;
+                _sh_resync_l1(other, oos[idx]);
+            }
+            else
+            {
+                /* Write-protect and sync contents */
+                _sh_resync(other, oos[idx], oos_va[idx]);
+                oos[idx] = _mfn(INVALID_MFN);
+            }
+        }
+
+        if ( do_locking )
+            shadow_unlock(v->domain);
+    }
+}
+
+/* Allow a shadowed page to go out of sync */
+int sh_unsync(struct vcpu *v, mfn_t gmfn, unsigned long va)
+{
+    struct page_info *pg;
+
+    ASSERT(shadow_locked_by_me(v->domain));
+
+    SHADOW_PRINTK("d=%d, v=%d, gmfn=%05lx va %lx\n",
+                  v->domain->domain_id, v->vcpu_id, mfn_x(gmfn), va);
+
+    pg = mfn_to_page(gmfn);
+
+    /* Guest page must be shadowed *only* as L1 and *only* once when out
+     * of sync. Also, get out now if it's already out of sync.
+     * Also, can't safely unsync if some vcpus have paging disabled.*/
+    if ( pg->shadow_flags &
+         ((SHF_page_type_mask & ~SHF_L1_ANY) | SHF_out_of_sync)
+         || sh_page_has_multiple_shadows(pg)
+         || !is_hvm_domain(v->domain)
+         || !v->domain->arch.paging.shadow.oos_active )
+        return 0;
+
+    pg->shadow_flags |= SHF_out_of_sync|SHF_oos_may_write;
+    oos_hash_add(v, gmfn, va);
+    perfc_incr(shadow_unsync);
+    return 1;
+}
+
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
+
 
 /**************************************************************************/
 /* Code for "promoting" a guest page to the point where the shadow code is
@@ -439,6 +848,12 @@ void shadow_promote(struct vcpu *v, mfn_
     struct page_info *page = mfn_to_page(gmfn);
 
     ASSERT(mfn_valid(gmfn));
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Is the page already shadowed and out of sync? */
+    if ( page_is_out_of_sync(page) )
+        sh_resync(v, gmfn);
+#endif
 
     /* We should never try to promote a gmfn that has writeable mappings */
     ASSERT((page->u.inuse.type_info & PGT_type_mask) != PGT_writable_page
@@ -463,7 +878,14 @@ void shadow_demote(struct vcpu *v, mfn_t
     clear_bit(type, &page->shadow_flags);
 
     if ( (page->shadow_flags & SHF_page_type_mask) == 0 )
+    {
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+        /* Was the page out of sync? */
+        if ( page_is_out_of_sync(page) )
+            oos_hash_remove(v, gmfn);
+#endif
         clear_bit(_PGC_page_table, &page->count_info);
+    }
 }
 
 /**************************************************************************/
@@ -1297,6 +1719,27 @@ static void sh_hash_audit_bucket(struct
             /* Bad shadow flags on guest page? */
             BUG_ON( !(gpg->shadow_flags & (1<<sp->type)) );
             /* Bad type count on guest page? */
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+            if ( sp->type == SH_type_l1_32_shadow
+                 || sp->type == SH_type_l1_pae_shadow
+                 || sp->type == SH_type_l1_64_shadow )
+            {
+                if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
+                     && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
+                {
+                    if ( !page_is_out_of_sync(gpg) )
+                    {
+                        SHADOW_ERROR("MFN %#lx shadowed (by %#"PRI_mfn")"
+                                     " and not OOS but has typecount %#lx\n",
+                                     sp->backpointer,
+                                     mfn_x(shadow_page_to_mfn(sp)),
+                                     gpg->u.inuse.type_info);
+                        BUG();
+                    }
+                }
+            }
+            else /* Not an l1 */
+#endif
             if ( (gpg->u.inuse.type_info & PGT_type_mask) == PGT_writable_page
                  && (gpg->u.inuse.type_info & PGT_count_mask) != 0 )
             {
@@ -1608,7 +2051,8 @@ void sh_destroy_shadow(struct vcpu *v, m
 /* Remove all writeable mappings of a guest frame from the shadow tables
  * Returns non-zero if we need to flush TLBs.
  * level and fault_addr desribe how we found this to be a pagetable;
- * level==0 means we have some other reason for revoking write access.*/
+ * level==0 means we have some other reason for revoking write access.
+ * If level==0 we are allowed to fail, returning -1. */
 
 int sh_remove_write_access(struct vcpu *v, mfn_t gmfn,
                            unsigned int level,
@@ -1659,7 +2103,12 @@ int sh_remove_write_access(struct vcpu *
         return 0;
 
     /* Early exit if it's already a pagetable, or otherwise not writeable */
-    if ( sh_mfn_is_a_page_table(gmfn)
+    if ( (sh_mfn_is_a_page_table(gmfn)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+         /* Unless they've been allowed to go out of sync with their shadows */
+          && !mfn_oos_may_write(gmfn)
+#endif
+          )
          || (pg->u.inuse.type_info & PGT_count_mask) == 0 )
         return 0;
 
@@ -1676,7 +2125,7 @@ int sh_remove_write_access(struct vcpu *
     }
 
 #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC
-    if ( v == current && level != 0 )
+    if ( v == current )
     {
         unsigned long gfn;
         /* Heuristic: there is likely to be only one writeable mapping,
@@ -1690,6 +2139,8 @@ int sh_remove_write_access(struct vcpu *
             return 1;                                                     \
         } while (0)
 
+        if ( level == 0 && fault_addr )
+            GUESS(fault_addr, 6);
 
         if ( v->arch.paging.mode->guest_levels == 2 )
         {
@@ -1780,6 +2231,9 @@ int sh_remove_write_access(struct vcpu *
      * mapping -- ioreq page, grant mapping, &c. */
     if ( (mfn_to_page(gmfn)->u.inuse.type_info & PGT_count_mask) != 0 )
     {
+        if ( level == 0 )
+            return -1;
+
         SHADOW_ERROR("can't remove write access to mfn %lx: guest has "
                      "%lu special-use mappings of it\n", mfn_x(gmfn),
                      (mfn_to_page(gmfn)->u.inuse.type_info&PGT_count_mask));
@@ -2159,6 +2613,13 @@ static void sh_update_paging_modes(struc
         ASSERT(shadow_mode_translate(d));
         ASSERT(shadow_mode_external(d));
 
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+        /* Need to resync all our pages now, because if a page goes out
+         * of sync with paging enabled and is resynced with paging
+         * disabled, the resync will go wrong. */
+        shadow_resync_all(v, 0);
+#endif /* OOS */
+
         if ( !hvm_paging_enabled(v) )
         {
             /* When the guest has CR0.PG clear, we provide a 32-bit, non-PAE
@@ -2253,6 +2714,27 @@ static void sh_update_paging_modes(struc
             //        different values for CR4.PSE and CR4.PGE at the same time.
             //        This *does* happen, at least for CR4.PGE...
         }
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+        /* We need to check that all the vcpus have paging enabled to
+         * unsync PTs. */
+        if ( is_hvm_domain(d) )
+        {
+            int pe = 1;
+            struct vcpu *vptr;
+
+            for_each_vcpu(d, vptr)
+            {
+                if ( !hvm_paging_enabled(vptr) )
+                {
+                    pe = 0;
+                    break;
+                }
+            }
+
+            d->arch.paging.shadow.oos_active = pe;
+        }
+#endif /* OOS */
 
         v->arch.paging.mode->update_cr3(v, 0);
     }
@@ -3044,7 +3526,11 @@ void shadow_audit_tables(struct vcpu *v)
 
     if ( !(SHADOW_AUDIT_ENABLE) )
         return;
-
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    sh_oos_audit(v->domain);
+#endif
+
     if ( SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES_FULL )
         mask = ~1; /* Audit every table in the system */
     else
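
(The "simple open-addressed hash table with a second chance" described in the comment block above is compact enough to show standalone. A sketch under stated assumptions, not part of the patch: plain unsigned longs stand in for mfn_t, 0 stands in for INVALID_MFN, and resync() is a hypothetical stand-in for _sh_resync(); SHADOW_OOS_PAGES matches the value the patch adds to asm-x86/mm.h:

#include <stdio.h>

#define SHADOW_OOS_PAGES 7   /* same (prime) size as the real per-vcpu table */
#define INVALID 0UL          /* stand-in for INVALID_MFN */

static unsigned long oos[SHADOW_OOS_PAGES];    /* gmfn in each slot */
static unsigned long oos_va[SHADOW_OOS_PAGES]; /* VA hint for each slot */

static void resync(unsigned long gmfn, unsigned long va)
{
    printf("resync gmfn %#lx (va hint %#lx)\n", gmfn, va);
}

/* Mirrors oos_hash_add(): an entry lives in its home slot or the next one.
 * A newcomer bumps a home-slot occupant into the next slot (the "second
 * chance"); if that slot is taken too, its occupant is resynced away. */
static void hash_add(unsigned long gmfn, unsigned long va)
{
    unsigned long t;
    int idx = gmfn % SHADOW_OOS_PAGES;

    if ( oos[idx] != INVALID && (oos[idx] % SHADOW_OOS_PAGES) == idx )
    {
        /* Punt the current occupant into the next slot */
        t = oos[idx];    oos[idx] = gmfn;  gmfn = t;
        t = oos_va[idx]; oos_va[idx] = va; va = t;
        idx = (idx + 1) % SHADOW_OOS_PAGES;
    }
    if ( oos[idx] != INVALID )
        resync(oos[idx], oos_va[idx]); /* crush the current occupant */
    oos[idx] = gmfn;
    oos_va[idx] = va;
}

/* Mirrors the lookup in sh_resync()/oos_hash_remove(): at most two probes. */
static int hash_lookup(unsigned long gmfn)
{
    int idx = gmfn % SHADOW_OOS_PAGES;
    if ( oos[idx] != gmfn )
        idx = (idx + 1) % SHADOW_OOS_PAGES;
    return (oos[idx] == gmfn) ? idx : -1;
}

int main(void)
{
    hash_add(0x1000, 0xb0000000UL);  /* home slot: 0x1000 % 7 == 1 */
    hash_add(0x1007, 0xb0001000UL);  /* same home slot: older entry bumped */
    printf("0x1000 -> slot %d, 0x1007 -> slot %d\n",
           hash_lookup(0x1000), hash_lookup(0x1007));
    return 0;
}

With only seven slots and at most two probes per lookup the table stays trivially cheap; a full table simply forces the oldest conflicting entry back into sync, which is always a safe fallback. End of sketch.)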
diff -r 26ecd1f9e128 xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/arch/x86/mm/shadow/multi.c Fri Jun 20 15:10:08 2008 +0100
@@ -305,22 +305,54 @@ shadow_check_gwalk(struct vcpu *v, unsig
 }
 
 /* Remove write access permissions from a gwalk_t in a batch, and
- * return OR-ed result for TLB flush hint
- */
+ * return OR-ed result for TLB flush hint and the need to rewalk the guest
+ * pages.
+ *
+ * Syncing pages will remove write access to that page; but it may
+ * also give write access to other pages in the path. If we resync any
+ * pages, re-walk from the beginning.
+ */
+#define GW_RMWR_FLUSHTLB 1
+#define GW_RMWR_REWALK   2
+
 static inline uint32_t
 gw_remove_write_accesses(struct vcpu *v, unsigned long va, walk_t *gw)
 {
-    int rc = 0;
+    uint32_t rc = 0;
 
 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
-    rc = sh_remove_write_access(v, gw->l3mfn, 3, va);
-#endif
-    rc |= sh_remove_write_access(v, gw->l2mfn, 2, va);
-#endif
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    if ( mfn_is_out_of_sync(gw->l3mfn) )
+    {
+        sh_resync(v, gw->l3mfn);
+        rc = GW_RMWR_REWALK;
+    }
+    else
+#endif /* OOS */
+    if ( sh_remove_write_access(v, gw->l3mfn, 3, va) )
+        rc = GW_RMWR_FLUSHTLB;
+#endif /* GUEST_PAGING_LEVELS >= 4 */
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    if ( mfn_is_out_of_sync(gw->l2mfn) )
+    {
+        sh_resync(v, gw->l2mfn);
+        rc |= GW_RMWR_REWALK;
+    }
+    else
+#endif /* OOS */
+    if ( sh_remove_write_access(v, gw->l2mfn, 2, va) )
+        rc |= GW_RMWR_FLUSHTLB;
+#endif /* GUEST_PAGING_LEVELS >= 3 */
+
     if ( !(guest_supports_superpages(v) &&
-           (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)) )
-        rc |= sh_remove_write_access(v, gw->l1mfn, 1, va);
+           (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE))
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+         && !mfn_is_out_of_sync(gw->l1mfn)
+#endif /* OOS */
+         && sh_remove_write_access(v, gw->l1mfn, 1, va) )
+        rc |= GW_RMWR_FLUSHTLB;
 
     return rc;
 }
@@ -882,7 +914,12 @@ _sh_propagate(struct vcpu *v,
     // protect guest page tables
     //
-    if ( unlikely((level == 1) && sh_mfn_is_a_page_table(target_mfn)) )
+    if ( unlikely((level == 1)
+                  && sh_mfn_is_a_page_table(target_mfn)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
+                  && !mfn_oos_may_write(target_mfn)
+#endif /* OOS */
+                  ) )
     {
         if ( shadow_mode_trap_reads(d) )
         {
@@ -1125,6 +1162,9 @@ static int shadow_set_l4e(struct vcpu *v
             domain_crash(v->domain);
             return SHADOW_SET_ERROR;
         }
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
+        shadow_resync_all(v, 0);
+#endif
     }
 
     /* Write the new entry */
@@ -1163,12 +1203,17 @@ static int shadow_set_l3e(struct vcpu *v
                 | (((unsigned long)sl3e) & ~PAGE_MASK));
 
     if ( shadow_l3e_get_flags(new_sl3e) & _PAGE_PRESENT )
+    {
         /* About to install a new reference */
         if ( !sh_get_ref(v, shadow_l3e_get_mfn(new_sl3e), paddr) )
         {
             domain_crash(v->domain);
             return SHADOW_SET_ERROR;
-        }
+        }
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC )
+        shadow_resync_all(v, 0);
+#endif
+    }
 
     /* Write the new entry */
     shadow_write_entries(sl3e, &new_sl3e, 1, sl3mfn);
@@ -1219,12 +1264,29 @@ static int shadow_set_l2e(struct vcpu *v
                 | (((unsigned long)sl2e) & ~PAGE_MASK));
 
     if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT )
+    {
+        mfn_t sl1mfn = shadow_l2e_get_mfn(new_sl2e);
+
         /* About to install a new reference */
-        if ( !sh_get_ref(v, shadow_l2e_get_mfn(new_sl2e), paddr) )
+        if ( !sh_get_ref(v, sl1mfn, paddr) )
         {
             domain_crash(v->domain);
             return SHADOW_SET_ERROR;
-        }
+        }
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+        {
+            struct shadow_page_info *sp = mfn_to_shadow_page(sl1mfn);
+            mfn_t gl1mfn = _mfn(sp->backpointer);
+
+            /* If the shadow is a fl1 then the backpointer contains
+               the GFN instead of the GMFN, and it's definitely not
+               OOS. */
+            if ( (sp->type != SH_type_fl1_shadow) && mfn_valid(gl1mfn)
+                 && mfn_is_out_of_sync(gl1mfn) )
+                sh_resync(v, gl1mfn);
+        }
+#endif
+    }
 
     /* Write the new entry */
 #if GUEST_PAGING_LEVELS == 2
@@ -2544,6 +2606,97 @@ static int validate_gl1e(struct vcpu *v,
     return result;
 }
 
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+/**************************************************************************/
+/* Special validation function for re-syncing out-of-sync shadows.
+ * Walks the *shadow* page, and for every entry that it finds,
+ * revalidates the guest entry that corresponds to it.
+ * N.B. This function is called with the vcpu that unsynced the page,
+ *      *not* the one that is causing it to be resynced. */
+void sh_resync_l1(struct vcpu *v, mfn_t gmfn)
+{
+    mfn_t sl1mfn;
+    shadow_l1e_t *sl1p;
+    guest_l1e_t *gl1p, *gp;
+    int rc = 0;
+
+    sl1mfn = get_shadow_status(v, gmfn, SH_type_l1_shadow);
+    ASSERT(mfn_valid(sl1mfn)); /* Otherwise we would not have been called */
+
+    gp = sh_map_domain_page(gmfn);
+    gl1p = gp;
+
+    SHADOW_FOREACH_L1E(sl1mfn, sl1p, &gl1p, 0, {
+        rc |= validate_gl1e(v, gl1p, sl1mfn, sl1p);
+    });
+
+    sh_unmap_domain_page(gp);
+
+    /* Setting shadow L1 entries should never need us to flush the TLB */
+    ASSERT(!(rc & SHADOW_SET_FLUSH));
+}
+
+/* Figure out whether it's definitely safe not to sync this l1 table.
+ * That is: if we can tell that it's only used once, and that the
+ * toplevel shadow responsible is not one of ours.
+ * N.B. This function is called with the vcpu that required the resync,
+ *      *not* the one that originally unsynced the page, but it is
+ *      called in the *mode* of the vcpu that unsynced it. Clear? Good. */
+int sh_safe_not_to_sync(struct vcpu *v, mfn_t gl1mfn)
+{
+    struct shadow_page_info *sp;
+    mfn_t smfn;
+
+    smfn = get_shadow_status(v, gl1mfn, SH_type_l1_shadow);
+    ASSERT(mfn_valid(smfn)); /* Otherwise we would not have been called */
+
+    /* Up to l2 */
+    sp = mfn_to_shadow_page(smfn);
+    if ( sp->count != 1 || !sp->up )
+        return 0;
+    smfn = _mfn(sp->up >> PAGE_SHIFT);
+    ASSERT(mfn_valid(smfn));
+
+#if (SHADOW_PAGING_LEVELS == 4)
+    /* up to l3 */
+    sp = mfn_to_shadow_page(smfn);
+    if ( sp->count != 1 || !sp->up )
+        return 0;
+    smfn = _mfn(sp->up >> PAGE_SHIFT);
+    ASSERT(mfn_valid(smfn));
+
+    /* up to l4 */
+    sp = mfn_to_shadow_page(smfn);
+    if ( sp->count != 1
+         || sh_type_is_pinnable(v, SH_type_l3_64_shadow) || !sp->up )
+        return 0;
+    smfn = _mfn(sp->up >> PAGE_SHIFT);
+    ASSERT(mfn_valid(smfn));
+
+#if (GUEST_PAGING_LEVELS == 2)
+    /* In 2-on-3 shadow mode the up pointer contains the link to the
+     * shadow page, but the shadow_table contains only the first of the
+     * four pages that make up the PAE top shadow tables. */
+    smfn = _mfn(mfn_x(smfn) & ~0x3UL);
+#endif
+
+#endif
+
+    if ( pagetable_get_pfn(v->arch.shadow_table[0]) == mfn_x(smfn)
+#if (SHADOW_PAGING_LEVELS == 3)
+         || pagetable_get_pfn(v->arch.shadow_table[1]) == mfn_x(smfn)
+         || pagetable_get_pfn(v->arch.shadow_table[2]) == mfn_x(smfn)
+         || pagetable_get_pfn(v->arch.shadow_table[3]) == mfn_x(smfn)
+#endif
+       )
+        return 0;
+
+    /* Only in use in one toplevel shadow, and it's not the one we're
+     * running on */
+    return 1;
+}
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
+
 
 /**************************************************************************/
 /* Functions which translate and install the shadows of arbitrary guest
@@ -2805,6 +2958,7 @@ static int sh_page_fault(struct vcpu *v,
     int r;
     fetch_type_t ft = 0;
     p2m_type_t p2mt;
+    uint32_t rc;
 #if SHADOW_OPTIMIZATIONS & SHOPT_FAST_EMULATION
     int fast_emul = 0;
 #endif
@@ -2830,6 +2984,17 @@ static int sh_page_fault(struct vcpu *v,
         {
             fast_emul = 1;
             gmfn = _mfn(v->arch.paging.shadow.last_emulated_mfn);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+            /* Fall back to the slow path if we're trying to emulate
+               writes to an out of sync page. */
+            if ( mfn_valid(gmfn) && mfn_is_out_of_sync(gmfn) )
+            {
+                v->arch.paging.last_write_emul_ok = 0;
+                goto page_fault_slow_path;
+            }
+#endif /* OOS */
+
             perfc_incr(shadow_fault_fast_emulate);
             goto early_emulation;
         }
@@ -2855,6 +3020,31 @@ static int sh_page_fault(struct vcpu *v,
                               sizeof(sl1e)) == 0)
          && sh_l1e_is_magic(sl1e)) )
     {
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+        /* First, need to check that this isn't an out-of-sync
+         * shadow l1e. If it is, we fall back to the slow path, which
+         * will sync it up again. */
+        {
+            shadow_l2e_t sl2e;
+            mfn_t gl1mfn;
+            if ( (__copy_from_user(&sl2e,
+                                   (sh_linear_l2_table(v)
+                                    + shadow_l2_linear_offset(va)),
+                                   sizeof(sl2e)) != 0)
+                 || !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT)
+                 || !mfn_valid(gl1mfn = _mfn(mfn_to_shadow_page(
+                                  shadow_l2e_get_mfn(sl2e))->backpointer))
+                 || unlikely(mfn_is_out_of_sync(gl1mfn)) )
+            {
+                /* Hit the slow path as if there had been no
+                 * shadow entry at all, and let it tidy up */
+                ASSERT(regs->error_code & PFEC_page_present);
+                regs->error_code ^= (PFEC_reserved_bit|PFEC_page_present);
+                goto page_fault_slow_path;
+            }
+        }
+#endif /* SHOPT_OUT_OF_SYNC */
+
         if ( sh_l1e_is_gnp(sl1e) )
         {
             /* Not-present in a guest PT: pass to the guest as
@@ -2890,6 +3080,10 @@ static int sh_page_fault(struct vcpu *v,
             return EXCRET_fault_fixed;
         }
     }
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+ page_fault_slow_path:
+#endif
 #endif /* SHOPT_FAST_FAULT_PATH */
 
     /* Detect if this page fault happened while we were already in Xen
@@ -2904,7 +3098,21 @@ static int sh_page_fault(struct vcpu *v,
         return 0;
     }
 
-    if ( guest_walk_tables(v, va, &gw, regs->error_code) != 0 )
+ rewalk:
+    rc = guest_walk_tables(v, va, &gw, regs->error_code);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    if ( !(rc & _PAGE_PRESENT) )
+        regs->error_code |= PFEC_page_present;
+    else if ( regs->error_code & PFEC_page_present )
+    {
+        SHADOW_ERROR("OOS paranoia: Something is wrong in guest TLB"
+                     " flushing. Have fun debugging it.\n");
+        regs->error_code &= ~PFEC_page_present;
+    }
+#endif
+
+    if ( rc != 0 )
     {
         perfc_incr(shadow_fault_bail_real_fault);
         SHADOW_PRINTK("not a shadow fault\n");
@@ -2948,7 +3156,10 @@ static int sh_page_fault(struct vcpu *v,
 
     shadow_lock(d);
 
-    if ( gw_remove_write_accesses(v, va, &gw) )
+    rc = gw_remove_write_accesses(v, va, &gw);
+
+    /* First bit set: Removed write access to a page. */
+    if ( rc & GW_RMWR_FLUSHTLB )
     {
         /* Write permission removal is also a hint that other gwalks
         * overlapping with this one may be inconsistent
@@ -2958,11 +3169,20 @@ static int sh_page_fault(struct vcpu *v,
         flush_tlb_mask(d->domain_dirty_cpumask);
     }
 
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Second bit set: Resynced a page. Re-walk needed. */
+    if ( rc & GW_RMWR_REWALK )
+    {
+        shadow_unlock(d);
+        goto rewalk;
+    }
+#endif /* OOS */
+
     if ( !shadow_check_gwalk(v, va, &gw) )
     {
         perfc_incr(shadow_inconsistent_gwalk);
         shadow_unlock(d);
-        return EXCRET_fault_fixed;
+        goto rewalk;
     }
 
     shadow_audit_tables(v);
@@ -3001,7 +3221,12 @@ static int sh_page_fault(struct vcpu *v,
 #endif
 
     /* Need to emulate accesses to page tables */
-    if ( sh_mfn_is_a_page_table(gmfn) )
+    if ( sh_mfn_is_a_page_table(gmfn)
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+         /* Unless they've been allowed to go out of sync with their shadows */
+         && !mfn_is_out_of_sync(gmfn)
+#endif
+         )
     {
         if ( ft == ft_demand_write )
         {
@@ -3215,6 +3440,7 @@ sh_invlpg(struct vcpu *v, unsigned long
  * instruction should be issued on the hardware, or 0 if it's safe not
  * to do so. */
 {
+    mfn_t sl1mfn;
     shadow_l2e_t sl2e;
 
     perfc_incr(shadow_invlpg);
@@ -3278,12 +3504,64 @@ sh_invlpg(struct vcpu *v, unsigned long
     // If so, then we'll need to flush the entire TLB (because that's
     // easier than invalidating all of the individual 4K pages).
     //
-    if ( mfn_to_shadow_page(shadow_l2e_get_mfn(sl2e))->type
+    sl1mfn = shadow_l2e_get_mfn(sl2e);
+    if ( mfn_to_shadow_page(sl1mfn)->type
          == SH_type_fl1_shadow )
     {
         flush_tlb_local();
         return 0;
     }
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Check to see if the SL1 is out of sync. */
+    {
+        mfn_t gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
+        struct page_info *pg = mfn_to_page(gl1mfn);
+        if ( mfn_valid(gl1mfn)
+             && page_is_out_of_sync(pg) )
+        {
+            /* The test above may give false positives, since we don't
+             * hold the shadow lock yet. Check again with the lock held. */
+            shadow_lock(v->domain);
+
+            /* This must still be a copy-from-user because we didn't
+             * have the shadow lock last time we checked, and the
+             * higher-level shadows might have disappeared under our
+             * feet. */
+            if ( __copy_from_user(&sl2e,
+                                  sh_linear_l2_table(v)
+                                  + shadow_l2_linear_offset(va),
+                                  sizeof (sl2e)) != 0 )
+            {
+                perfc_incr(shadow_invlpg_fault);
+                shadow_unlock(v->domain);
+                return 0;
+            }
+
+            if ( !(shadow_l2e_get_flags(sl2e) & _PAGE_PRESENT) )
+            {
+                shadow_unlock(v->domain);
+                return 0;
+            }
+
+            sl1mfn = shadow_l2e_get_mfn(sl2e);
+            gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
+            pg = mfn_to_page(gl1mfn);
+
+            if ( likely(sh_mfn_is_a_page_table(gl1mfn)
+                        && page_is_out_of_sync(pg) ) )
+            {
+                shadow_l1e_t *sl1;
+                sl1 = sh_linear_l1_table(v) + shadow_l1_linear_offset(va);
+                /* Remove the shadow entry that maps this VA */
+                (void) shadow_set_l1e(v, sl1, shadow_l1e_empty(), sl1mfn);
+            }
+            shadow_unlock(v->domain);
+            /* Need the invlpg, to pick up the disappearance of the sl1e */
+            return 1;
+        }
+    }
+#endif
 
     return 1;
 }
@@ -3709,6 +3987,13 @@ sh_update_cr3(struct vcpu *v, int do_loc
         ASSERT(v->arch.cr3 == 0);
         return;
     }
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Need to resync all the shadow entries on a TLB flush. Resync
+     * current vcpu's OOS pages before switching to the new shadow
+     * tables so that the VA hint is still valid. */
+    shadow_resync_current_vcpu(v, do_locking);
+#endif
 
     if ( do_locking ) shadow_lock(v->domain);
@@ -3938,6 +4223,15 @@ sh_update_cr3(struct vcpu *v, int do_loc
     /* Release the lock, if we took it (otherwise it's the caller's problem) */
     if ( do_locking ) shadow_unlock(v->domain);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Need to resync all the shadow entries on a TLB flush. We only
+     * update the shadows, leaving the pages out of sync. Also, we try
+     * to skip synchronization of shadows not mapped in the new
+     * tables. */
+    shadow_sync_other_vcpus(v, do_locking);
+#endif
+
 }
@@ -4437,23 +4731,35 @@ sh_x86_emulate_cmpxchg8b(struct vcpu *v,
 
 #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES
 
-#define AUDIT_FAIL(_level, _fmt, _a...) do {                                \
-    printk("Shadow %u-on-%u audit failed at level %i, index %i\n"          \
-           "gl" #_level "mfn = %" PRI_mfn                                   \
-           " sl" #_level "mfn = %" PRI_mfn                                  \
-           " &gl" #_level "e = %p &sl" #_level "e = %p"                     \
-           " gl" #_level "e = %" SH_PRI_gpte                                \
-           " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n",          \
-           GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS,                       \
-           _level, guest_index(gl ## _level ## e),                          \
-           mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn),          \
-           gl ## _level ## e, sl ## _level ## e,                            \
-           gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level,  \
-           ##_a);                                                           \
-    BUG();                                                                  \
-    done = 1;                                                               \
-} while (0)
-
+#define AUDIT_FAIL(_level, _fmt, _a...) do {                                 \
+    printk("Shadow %u-on-%u audit failed at level %i, index %i\n"           \
+           "gl" #_level "mfn = %" PRI_mfn                                    \
+           " sl" #_level "mfn = %" PRI_mfn                                   \
+           " &gl" #_level "e = %p &sl" #_level "e = %p"                      \
+           " gl" #_level "e = %" SH_PRI_gpte                                 \
+           " sl" #_level "e = %" SH_PRI_pte "\nError: " _fmt "\n",           \
+           GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS,                        \
+           _level, guest_index(gl ## _level ## e),                           \
+           mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn),           \
+           gl ## _level ## e, sl ## _level ## e,                             \
+           gl ## _level ## e->l ## _level, sl ## _level ## e->l ## _level,   \
+           ##_a);                                                            \
+    BUG();                                                                   \
+    done = 1;                                                                \
+} while (0)
+
+#define AUDIT_FAIL_MIN(_level, _fmt, _a...) do {                             \
+    printk("Shadow %u-on-%u audit failed at level %i\n"                      \
+           "gl" #_level "mfn = %" PRI_mfn                                    \
+           " sl" #_level "mfn = %" PRI_mfn                                   \
+           " Error: " _fmt "\n",                                             \
+           GUEST_PAGING_LEVELS, SHADOW_PAGING_LEVELS,                        \
+           _level,                                                           \
+           mfn_x(gl ## _level ## mfn), mfn_x(sl ## _level ## mfn),           \
+           ##_a);                                                            \
+    BUG();                                                                   \
+    done = 1;                                                                \
+} while (0)
 
 static char * sh_audit_flags(struct vcpu *v, int level,
                              int gflags, int sflags)
@@ -4494,6 +4800,16 @@ int sh_audit_l1_table(struct vcpu *v, mf
 
     /* Follow the backpointer */
     gl1mfn = _mfn(mfn_to_shadow_page(sl1mfn)->backpointer);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Out-of-sync l1 shadows can contain anything: just check the OOS hash */
+    if ( page_is_out_of_sync(mfn_to_page(gl1mfn)) )
+    {
+        oos_audit_hash_is_present(v->domain, gl1mfn);
+        return 0;
+    }
+#endif
+
     gl1e = gp = sh_map_domain_page(gl1mfn);
     SHADOW_FOREACH_L1E(sl1mfn, sl1e, &gl1e, done, {
@@ -4574,6 +4890,13 @@ int sh_audit_l2_table(struct vcpu *v, mf
 
     /* Follow the backpointer */
     gl2mfn = _mfn(mfn_to_shadow_page(sl2mfn)->backpointer);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Only L1's may be out of sync. */
+    if ( page_is_out_of_sync(mfn_to_page(gl2mfn)) )
+        AUDIT_FAIL_MIN(2, "gmfn %lx is out of sync", mfn_x(gl2mfn));
+#endif
+
     gl2e = gp = sh_map_domain_page(gl2mfn);
     SHADOW_FOREACH_L2E(sl2mfn, sl2e, &gl2e, done, v->domain, {
@@ -4616,6 +4939,13 @@ int sh_audit_l3_table(struct vcpu *v, mf
 
     /* Follow the backpointer */
     gl3mfn = _mfn(mfn_to_shadow_page(sl3mfn)->backpointer);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Only L1's may be out of sync. */
+    if ( page_is_out_of_sync(mfn_to_page(gl3mfn)) )
+        AUDIT_FAIL_MIN(3, "gmfn %lx is out of sync", mfn_x(gl3mfn));
+#endif
+
     gl3e = gp = sh_map_domain_page(gl3mfn);
     SHADOW_FOREACH_L3E(sl3mfn, sl3e, &gl3e, done, {
@@ -4656,6 +4986,13 @@ int sh_audit_l4_table(struct vcpu *v, mf
 
     /* Follow the backpointer */
     gl4mfn = _mfn(mfn_to_shadow_page(sl4mfn)->backpointer);
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+    /* Only L1's may be out of sync. */
+    if ( page_is_out_of_sync(mfn_to_page(gl4mfn)) )
+        AUDIT_FAIL_MIN(4, "gmfn %lx is out of sync", mfn_x(gl4mfn));
+#endif
+
     gl4e = gp = sh_map_domain_page(gl4mfn);
     SHADOW_FOREACH_L4E(sl4mfn, sl4e, &gl4e, done, v->domain, {

diff -r 26ecd1f9e128 xen/arch/x86/mm/shadow/multi.h
--- a/xen/arch/x86/mm/shadow/multi.h Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/arch/x86/mm/shadow/multi.h Fri Jun 20 15:10:08 2008 +0100
@@ -115,3 +115,13 @@ SHADOW_INTERNAL_NAME(sh_destroy_monitor_
 
 extern struct paging_mode
 SHADOW_INTERNAL_NAME(sh_paging_mode, GUEST_LEVELS);
+
+#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
+extern void
+SHADOW_INTERNAL_NAME(sh_resync_l1, GUEST_LEVELS)
+     (struct vcpu *v, mfn_t gmfn);
+
+extern int
+SHADOW_INTERNAL_NAME(sh_safe_not_to_sync, GUEST_LEVELS)
+     (struct vcpu*v, mfn_t gmfn);
+#endif

diff -r 26ecd1f9e128 xen/arch/x86/mm/shadow/private.h
--- a/xen/arch/x86/mm/shadow/private.h Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/arch/x86/mm/shadow/private.h Fri Jun 20 15:10:08 2008 +0100
@@ -63,8 +63,9 @@ extern int shadow_audit_enable;
 #define SHOPT_SKIP_VERIFY         0x20  /* Skip PTE v'fy when safe to do so */
 #define SHOPT_VIRTUAL_TLB         0x40  /* Cache guest v->p translations */
 #define SHOPT_FAST_EMULATION      0x80  /* Fast write emulation */
+#define SHOPT_OUT_OF_SYNC        0x100  /* Allow guest writes to L1 PTs */
 
-#define SHADOW_OPTIMIZATIONS      0xff
+#define SHADOW_OPTIMIZATIONS     0x1ff
 
 
 /******************************************************************************
@@ -301,6 +302,62 @@ static inline int sh_type_is_pinnable(st
 #define SHF_PAE (SHF_L1_PAE|SHF_FL1_PAE|SHF_L2_PAE|SHF_L2H_PAE)
 #define SHF_64  (SHF_L1_64|SHF_FL1_64|SHF_L2_64|SHF_L2H_64|SHF_L3_64|SHF_L4_64)
 
+#define SHF_L1_ANY  (SHF_L1_32|SHF_L1_PAE|SHF_L1_64)
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+/* Marks a guest L1 page table which is shadowed but not write-protected.
+ * If set, then *only* L1 shadows (SHF_L1_*) are allowed.
+ *
+ * out_of_sync indicates that the shadow tables may not reflect the
+ * guest tables. If it is clear, then the shadow tables *must* reflect
+ * the guest tables.
+ *
+ * oos_may_write indicates that a page may have writable mappings.
+ *
+ * Most of the time the flags are synonymous. There is a short period of
+ * time during resync that oos_may_write is clear but out_of_sync is not.
+ * If a codepath is called during that time and is sensitive to oos issues,
+ * it may need to use the second flag.
+ */
+#define SHF_out_of_sync (1u<<30)
+#define SHF_oos_may_write (1u<<29)
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
+
+static inline int sh_page_has_multiple_shadows(struct page_info *pg)
+{
+    u32 shadows;
+    if ( !(pg->count_info & PGC_page_table) )
+        return 0;
+    shadows = pg->shadow_flags & SHF_page_type_mask;
+    /* More than one type bit set in shadow-flags? */
+    return ( (shadows & ~(1UL << find_first_set_bit(shadows))) != 0 );
+}
+
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+/* The caller must verify this is reasonable to call; i.e., valid mfn,
+ * domain is translated, &c */
+static inline int page_is_out_of_sync(struct page_info *p)
+{
+    return (p->count_info & PGC_page_table)
+        && (p->shadow_flags & SHF_out_of_sync);
+}
+
+static inline int mfn_is_out_of_sync(mfn_t gmfn)
+{
+    return page_is_out_of_sync(mfn_to_page(mfn_x(gmfn)));
+}
+
+static inline int page_oos_may_write(struct page_info *p)
+{
+    return (p->count_info & PGC_page_table)
+        && (p->shadow_flags & SHF_oos_may_write);
+}
+
+static inline int mfn_oos_may_write(mfn_t gmfn)
+{
+    return page_oos_may_write(mfn_to_page(mfn_x(gmfn)));
+}
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
 
 /******************************************************************************
  * Various function declarations
@@ -351,7 +408,50 @@ int shadow_cmpxchg_guest_entry(struct vc
 int shadow_cmpxchg_guest_entry(struct vcpu *v, intpte_t *p,
                                intpte_t *old, intpte_t new, mfn_t gmfn);
 
+#if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
+/* Allow a shadowed page to go out of sync */
+int sh_unsync(struct vcpu *v, mfn_t gmfn, unsigned long va);
+
+/* Pull an out-of-sync page back into sync. */
+void sh_resync(struct vcpu *v, mfn_t gmfn);
+
+/* Pull all out-of-sync shadows back into sync. If skip != 0, we try
+ * to avoid resyncing where we think we can get away with it. */
+
+void sh_resync_all(struct vcpu *v, int skip, int this, int others, int do_locking);
+
+static inline void
+shadow_resync_all(struct vcpu *v, int do_locking)
+{
+    sh_resync_all(v,
+                  0 /* skip */,
+                  1 /* this */,
+                  1 /* others */,
+                  do_locking);
+}
+
+static inline void
+shadow_resync_current_vcpu(struct vcpu *v, int do_locking)
+{
+    sh_resync_all(v,
+                  0 /* skip */,
+                  1 /* this */,
+                  0 /* others */,
+                  do_locking);
+}
+
+static inline void
+shadow_sync_other_vcpus(struct vcpu *v, int do_locking)
+{
+    sh_resync_all(v,
+                  1 /* skip */,
+                  0 /* this */,
+                  1 /* others */,
+                  do_locking);
+}
+
+void oos_audit_hash_is_present(struct domain *d, mfn_t gmfn);
+#endif /* (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) */
 
 /******************************************************************************
  * Flags used in the return value of the shadow_set_lXe() functions...
diff -r 26ecd1f9e128 xen/arch/x86/mm/shadow/types.h
--- a/xen/arch/x86/mm/shadow/types.h Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/arch/x86/mm/shadow/types.h Fri Jun 20 15:10:08 2008 +0100
@@ -438,6 +438,10 @@ struct shadow_walk_t
 #define sh_guess_wrmap             INTERNAL_NAME(sh_guess_wrmap)
 #define sh_clear_shadow_entry      INTERNAL_NAME(sh_clear_shadow_entry)
 
+#if SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC
+#define sh_resync_l1               INTERNAL_NAME(sh_resync_l1)
+#define sh_safe_not_to_sync        INTERNAL_NAME(sh_safe_not_to_sync)
+#endif
 
 /* The sh_guest_(map|get)_* functions depends on Xen's paging levels */
 #define sh_guest_map_l1e \

diff -r 26ecd1f9e128 xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/include/asm-x86/domain.h Fri Jun 20 15:10:08 2008 +0100
@@ -103,6 +103,9 @@ struct shadow_domain {
      * emulation and remove write permission */
     atomic_t          gtable_dirty_version;
+
+    /* OOS */
+    int oos_active;
 };
 
 struct shadow_vcpu {
@@ -122,6 +125,10 @@ struct shadow_vcpu {
     unsigned long last_emulated_frame;
     /* Last MFN that we emulated a write successfully */
     unsigned long last_emulated_mfn;
+
+    /* Shadow out-of-sync: pages that this vcpu has let go out of sync */
+    mfn_t oos[SHADOW_OOS_PAGES];
+    unsigned long oos_va[SHADOW_OOS_PAGES];
 };
 
 /************************************************/

diff -r 26ecd1f9e128 xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/include/asm-x86/mm.h Fri Jun 20 15:10:08 2008 +0100
@@ -130,6 +130,9 @@ static inline u32 pickle_domptr(struct d
 /* The order of the largest allocation unit we use for shadow pages */
 #define SHADOW_MAX_ORDER 2 /* Need up to 16k allocs for 32-bit on PAE/64 */
 
+/* The number of out-of-sync shadows we allow per vcpu (prime, please) */
+#define SHADOW_OOS_PAGES 7
+
 #define page_get_owner(_p)    (unpickle_domptr((_p)->u.inuse._domain))
 #define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d))

diff -r 26ecd1f9e128 xen/include/asm-x86/perfc_defn.h
--- a/xen/include/asm-x86/perfc_defn.h Fri Jun 20 12:26:23 2008 +0100
+++ b/xen/include/asm-x86/perfc_defn.h Fri Jun 20 15:10:08 2008 +0100
@@ -80,6 +80,7 @@ PERFCOUNTER(shadow_writeable_h_3,  "shad
 PERFCOUNTER(shadow_writeable_h_3,  "shadow writeable: 64b w2k3")
 PERFCOUNTER(shadow_writeable_h_4,  "shadow writeable: linux low/solaris")
 PERFCOUNTER(shadow_writeable_h_5,  "shadow writeable: linux high")
+PERFCOUNTER(shadow_writeable_h_6,  "shadow writeable: unsync va")
 PERFCOUNTER(shadow_writeable_bf,   "shadow writeable brute-force")
 PERFCOUNTER(shadow_mappings,       "shadow removes all mappings")
 PERFCOUNTER(shadow_mappings_bf,    "shadow rm-mappings brute-force")
@@ -101,4 +102,8 @@ PERFCOUNTER(shadow_em_ex_non_pt,   "shad
 PERFCOUNTER(shadow_em_ex_non_pt,   "shadow extra non-pt-write op")
 PERFCOUNTER(shadow_em_ex_fail,     "shadow extra emulation failed")
 
+PERFCOUNTER(shadow_unsync,         "shadow OOS unsyncs")
+PERFCOUNTER(shadow_unsync_evict,   "shadow OOS evictions")
+PERFCOUNTER(shadow_resync,         "shadow OOS resyncs")
+
 /*#endif*/ /* __XEN_PERFC_DEFN_H__ */

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel