diff -r a9ffd495b5fc xen/arch/x86/mm/Makefile --- a/xen/arch/x86/mm/Makefile Tue Nov 11 15:22:27 2008 +0000 +++ b/xen/arch/x86/mm/Makefile Wed Nov 12 11:15:29 2008 +0000 @@ -3,3 +3,9 @@ subdir-y += hap obj-y += paging.o obj-y += p2m.o +obj-y += guest_walk_2.o +obj-y += guest_walk_3.o +obj-$(x86_64) += guest_walk_4.o + +guest_walk_%.o: guest_walk.c $(HDRS) Makefile + $(CC) $(CFLAGS) -DGUEST_PAGING_LEVELS=$* -c $< -o $@ diff -r a9ffd495b5fc xen/arch/x86/mm/shadow/multi.c --- a/xen/arch/x86/mm/shadow/multi.c Tue Nov 11 15:22:27 2008 +0000 +++ b/xen/arch/x86/mm/shadow/multi.c Wed Nov 12 14:44:03 2008 +0000 @@ -157,95 +157,23 @@ delete_shadow_status(struct vcpu *v, mfn put_page(mfn_to_page(gmfn)); } -/**************************************************************************/ -/* CPU feature support querying */ - -static inline int -guest_supports_superpages(struct vcpu *v) -{ - /* The _PAGE_PSE bit must be honoured in HVM guests, whenever - * CR4.PSE is set or the guest is in PAE or long mode. - * It's also used in the dummy PT for vcpus with CR4.PG cleared. */ - return (is_hvm_vcpu(v) && - (GUEST_PAGING_LEVELS != 2 - || !hvm_paging_enabled(v) - || (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PSE))); -} - -static inline int -guest_supports_nx(struct vcpu *v) -{ - if ( GUEST_PAGING_LEVELS == 2 || !cpu_has_nx ) - return 0; - if ( !is_hvm_vcpu(v) ) - return cpu_has_nx; - return hvm_nx_enabled(v); -} - /**************************************************************************/ /* Functions for walking the guest page tables */ -/* Flags that are needed in a pagetable entry, with the sense of NX inverted */ -static uint32_t mandatory_flags(struct vcpu *v, uint32_t pfec) -{ - static uint32_t flags[] = { - /* I/F - Usr Wr */ - /* 0 0 0 0 */ _PAGE_PRESENT, - /* 0 0 0 1 */ _PAGE_PRESENT|_PAGE_RW, - /* 0 0 1 0 */ _PAGE_PRESENT|_PAGE_USER, - /* 0 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER, - /* 0 1 0 0 */ _PAGE_PRESENT, - /* 0 1 0 1 */ _PAGE_PRESENT|_PAGE_RW, - /* 0 1 1 0 */ _PAGE_PRESENT|_PAGE_USER, - /* 0 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER, - /* 1 0 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT, - /* 1 0 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT, - /* 1 0 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT, - /* 1 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT, - /* 1 1 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT, - /* 1 1 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT, - /* 1 1 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT, - /* 1 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT, - }; - - /* Don't demand not-NX if the CPU wouldn't enforce it. */ - if ( !guest_supports_nx(v) ) - pfec &= ~PFEC_insn_fetch; - - /* Don't demand R/W if the CPU wouldn't enforce it. */ - if ( is_hvm_vcpu(v) && unlikely(!hvm_wp_enabled(v)) - && !(pfec & PFEC_user_mode) ) - pfec &= ~PFEC_write_access; - - return flags[(pfec & 0x1f) >> 1]; -} - -/* Modify a guest pagetable entry to set the Accessed and Dirty bits. - * Returns non-zero if it actually writes to guest memory. */ -static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty) -{ - guest_intpte_t old, new; - int ret = 0; - - old = *(guest_intpte_t *)walk_p; - new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0); - if ( old != new ) - { - /* Write the new entry into the walk, and try to write it back - * into the guest table as well. If the guest table has changed - * under out feet then leave it alone. 
*/ - *(guest_intpte_t *)walk_p = new; - if( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old ) - ret = 1; - - /* FIXME -- this code is longer than necessary */ - if(set_dirty) - TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SET_AD); - else - TRACE_SHADOW_PATH_FLAG(TRCE_SFLAG_SET_A); - } - return ret; +static inline uint32_t +sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw, + uint32_t pfec) +{ + return guest_walk_tables(v, va, gw, pfec, +#if GUEST_PAGING_LEVELS == 3 /* PAE */ + _mfn(INVALID_MFN), + v->arch.paging.shadow.gl3e +#else /* 32 or 64 */ + pagetable_get_mfn(v->arch.guest_table), + v->arch.paging.shadow.guest_vtable +#endif + ); } /* This validation is called with lock held, and after write permission @@ -364,236 +292,6 @@ gw_remove_write_accesses(struct vcpu *v, return rc; } -/* Walk the guest pagetables, after the manner of a hardware walker. - * - * Inputs: a vcpu, a virtual address, a walk_t to fill, a - * pointer to a pagefault code - * - * We walk the vcpu's guest pagetables, filling the walk_t with what we - * see and adding any Accessed and Dirty bits that are needed in the - * guest entries. Using the pagefault code, we check the permissions as - * we go. For the purposes of reading pagetables we treat all non-RAM - * memory as contining zeroes. - * - * The walk is done in a lock-free style, with some sanity check postponed - * after grabbing shadow lock later. Those delayed checks will make sure - * no inconsistent mapping being translated into shadow page table. - * - * Returns 0 for success, or the set of permission bits that we failed on - * if the walk did not complete. - * N.B. This is different from the old return code but almost no callers - * checked the old return code anyway. - */ -static uint32_t -guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, uint32_t pfec) -{ - struct domain *d = v->domain; - p2m_type_t p2mt; - guest_l1e_t *l1p = NULL; - guest_l2e_t *l2p = NULL; -#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ - guest_l3e_t *l3p = NULL; - guest_l4e_t *l4p; -#endif - uint32_t gflags, mflags, rc = 0; - int pse; - - perfc_incr(shadow_guest_walk); - memset(gw, 0, sizeof(*gw)); - gw->va = va; - - /* Mandatory bits that must be set in every entry. We invert NX, to - * calculate as if there were an "X" bit that allowed access. - * We will accumulate, in rc, the set of flags that are missing. */ - mflags = mandatory_flags(v, pfec); - -#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ -#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ - - /* Get the l4e from the top level table and check its flags*/ - gw->l4mfn = pagetable_get_mfn(v->arch.guest_table); - l4p = ((guest_l4e_t *)v->arch.paging.shadow.guest_vtable); - gw->l4e = l4p[guest_l4_table_offset(va)]; - gflags = guest_l4e_get_flags(gw->l4e) ^ _PAGE_NX_BIT; - rc |= ((gflags & mflags) ^ mflags); - if ( rc & _PAGE_PRESENT ) goto out; - - /* Map the l3 table */ - gw->l3mfn = gfn_to_mfn(d, guest_l4e_get_gfn(gw->l4e), &p2mt); - if ( !p2m_is_ram(p2mt) ) - { - rc |= _PAGE_PRESENT; - goto out; - } - ASSERT(mfn_valid(gw->l3mfn)); - - /* Get the l3e and check its flags*/ - l3p = sh_map_domain_page(gw->l3mfn); - gw->l3e = l3p[guest_l3_table_offset(va)]; - gflags = guest_l3e_get_flags(gw->l3e) ^ _PAGE_NX_BIT; - rc |= ((gflags & mflags) ^ mflags); - if ( rc & _PAGE_PRESENT ) - goto out; - -#else /* PAE only... 
*/ - - /* Get l3e from the cache of the top level table and check its flag */ - gw->l3e = v->arch.paging.shadow.gl3e[guest_l3_table_offset(va)]; - if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) ) - { - rc |= _PAGE_PRESENT; - goto out; - } - -#endif /* PAE or 64... */ - - /* Map the l2 table */ - gw->l2mfn = gfn_to_mfn(d, guest_l3e_get_gfn(gw->l3e), &p2mt); - if ( !p2m_is_ram(p2mt) ) - { - rc |= _PAGE_PRESENT; - goto out; - } - ASSERT(mfn_valid(gw->l2mfn)); - - /* Get the l2e */ - l2p = sh_map_domain_page(gw->l2mfn); - gw->l2e = l2p[guest_l2_table_offset(va)]; - -#else /* 32-bit only... */ - - /* Get l2e from the top level table */ - gw->l2mfn = pagetable_get_mfn(v->arch.guest_table); - l2p = ((guest_l2e_t *)v->arch.paging.shadow.guest_vtable); - gw->l2e = l2p[guest_l2_table_offset(va)]; - -#endif /* All levels... */ - - gflags = guest_l2e_get_flags(gw->l2e) ^ _PAGE_NX_BIT; - rc |= ((gflags & mflags) ^ mflags); - if ( rc & _PAGE_PRESENT ) - goto out; - - pse = (guest_supports_superpages(v) && - (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)); - - if ( pse ) - { - /* Special case: this guest VA is in a PSE superpage, so there's - * no guest l1e. We make one up so that the propagation code - * can generate a shadow l1 table. Start with the gfn of the - * first 4k-page of the superpage. */ - gfn_t start = guest_l2e_get_gfn(gw->l2e); - /* Grant full access in the l1e, since all the guest entry's - * access controls are enforced in the shadow l2e. */ - int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW| - _PAGE_ACCESSED|_PAGE_DIRTY); - /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7 - * of the level 1. */ - if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE_PAT) ) - flags |= _PAGE_PAT; - /* Copy the cache-control bits to the l1 as well, because we - * can't represent PAT in the (non-PSE) shadow l2e. :( - * This could cause problems if a guest ever maps an area of - * memory with superpages using more than one caching mode. */ - flags |= guest_l2e_get_flags(gw->l2e) & (_PAGE_PWT|_PAGE_PCD); - /* Increment the pfn by the right number of 4k pages. - * The ~0x1 is to mask out the PAT bit mentioned above. */ - start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va)); - gw->l1e = guest_l1e_from_gfn(start, flags); - gw->l1mfn = _mfn(INVALID_MFN); - } - else - { - /* Not a superpage: carry on and find the l1e. */ - gw->l1mfn = gfn_to_mfn(d, guest_l2e_get_gfn(gw->l2e), &p2mt); - if ( !p2m_is_ram(p2mt) ) - { - rc |= _PAGE_PRESENT; - goto out; - } - ASSERT(mfn_valid(gw->l1mfn)); - l1p = sh_map_domain_page(gw->l1mfn); - gw->l1e = l1p[guest_l1_table_offset(va)]; - gflags = guest_l1e_get_flags(gw->l1e) ^ _PAGE_NX_BIT; - rc |= ((gflags & mflags) ^ mflags); - } - - /* Go back and set accessed and dirty bits only if the walk was a - * success. Although the PRMs say higher-level _PAGE_ACCESSED bits - * get set whenever a lower-level PT is used, at least some hardware - * walkers behave this way. */ - if ( rc == 0 ) - { -#if GUEST_PAGING_LEVELS == 4 /* 64-bit only... 
*/ - if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) ) - paging_mark_dirty(d, mfn_x(gw->l4mfn)); - if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e, 0) ) - paging_mark_dirty(d, mfn_x(gw->l3mfn)); -#endif - if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e, - (pse && (pfec & PFEC_write_access))) ) - paging_mark_dirty(d, mfn_x(gw->l2mfn)); - if ( !pse ) - { - if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e, - (pfec & PFEC_write_access)) ) - paging_mark_dirty(d, mfn_x(gw->l1mfn)); - } - } - - out: -#if GUEST_PAGING_LEVELS == 4 - if ( l3p ) sh_unmap_domain_page(l3p); -#endif -#if GUEST_PAGING_LEVELS >= 3 - if ( l2p ) sh_unmap_domain_page(l2p); -#endif - if ( l1p ) sh_unmap_domain_page(l1p); - - return rc; -} - -/* Given a walk_t, translate the gw->va into the guest's notion of the - * corresponding frame number. */ -static inline gfn_t -guest_walk_to_gfn(walk_t *gw) -{ - if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) ) - return _gfn(INVALID_GFN); - return guest_l1e_get_gfn(gw->l1e); -} - -/* Given a walk_t, translate the gw->va into the guest's notion of the - * corresponding physical address. */ -static inline paddr_t -guest_walk_to_gpa(walk_t *gw) -{ - if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) ) - return 0; - return guest_l1e_get_paddr(gw->l1e) + (gw->va & ~PAGE_MASK); -} - -#if 0 /* Keep for debugging */ -/* Pretty-print the contents of a guest-walk */ -static inline void print_gw(walk_t *gw) -{ - SHADOW_PRINTK("GUEST WALK TO %#lx:\n", gw->va); -#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ -#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ - SHADOW_PRINTK(" l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn)); - SHADOW_PRINTK(" l4e=%" SH_PRI_gpte "\n", gw->l4e.l4); - SHADOW_PRINTK(" l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn)); -#endif /* PAE or 64... */ - SHADOW_PRINTK(" l3e=%" SH_PRI_gpte "\n", gw->l3e.l3); -#endif /* All levels... */ - SHADOW_PRINTK(" l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn)); - SHADOW_PRINTK(" l2e=%" SH_PRI_gpte "\n", gw->l2e.l2); - SHADOW_PRINTK(" l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn)); - SHADOW_PRINTK(" l1e=%" SH_PRI_gpte "\n", gw->l1e.l1); -} -#endif /* 0 */ - #if SHADOW_AUDIT & SHADOW_AUDIT_ENTRIES /* Lightweight audit: pass all the shadows associated with this guest walk * through the audit mechanisms */ @@ -654,7 +352,7 @@ sh_guest_map_l1e(struct vcpu *v, unsigne // XXX -- this is expensive, but it's easy to cobble together... // FIXME! - if ( guest_walk_tables(v, addr, &gw, PFEC_page_present) == 0 + if ( sh_walk_guest_tables(v, addr, &gw, PFEC_page_present) == 0 && mfn_valid(gw.l1mfn) ) { if ( gl1mfn ) @@ -676,7 +374,7 @@ sh_guest_get_eff_l1e(struct vcpu *v, uns // XXX -- this is expensive, but it's easy to cobble together... // FIXME! - (void) guest_walk_tables(v, addr, &gw, PFEC_page_present); + (void) sh_walk_guest_tables(v, addr, &gw, PFEC_page_present); *(guest_l1e_t *)eff_l1e = gw.l1e; } #endif /* CONFIG == GUEST (== SHADOW) */ @@ -3329,9 +3027,14 @@ static int sh_page_fault(struct vcpu *v, } rewalk: + + /* The walk is done in a lock-free style, with some sanity check + * postponed after grabbing shadow lock later. Those delayed checks + * will make sure no inconsistent mapping being translated into + * shadow page table. 
*/ version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version); rmb(); - rc = guest_walk_tables(v, va, &gw, regs->error_code); + rc = sh_walk_guest_tables(v, va, &gw, regs->error_code); #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC) regs->error_code &= ~PFEC_page_present; @@ -3919,7 +3622,7 @@ sh_gva_to_gfn(struct vcpu *v, unsigned l return vtlb_gfn; #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */ - if ( guest_walk_tables(v, va, &gw, pfec[0]) != 0 ) + if ( sh_walk_guest_tables(v, va, &gw, pfec[0]) != 0 ) { if ( !(guest_l1e_get_flags(gw.l1e) & _PAGE_PRESENT) ) pfec[0] &= ~PFEC_page_present; diff -r a9ffd495b5fc xen/include/asm-x86/guest_pt.h --- a/xen/include/asm-x86/guest_pt.h Tue Nov 11 15:22:27 2008 +0000 +++ b/xen/include/asm-x86/guest_pt.h Wed Nov 12 14:53:40 2008 +0000 @@ -175,6 +175,32 @@ static inline guest_l4e_t guest_l4e_from #endif /* GUEST_PAGING_LEVELS != 2 */ +/* Which pagetable features are supported on this vcpu? */ + +static inline int +guest_supports_superpages(struct vcpu *v) +{ + /* The _PAGE_PSE bit must be honoured in HVM guests, whenever + * CR4.PSE is set or the guest is in PAE or long mode. + * It's also used in the dummy PT for vcpus with CR4.PG cleared. */ + return (is_hvm_vcpu(v) && + (GUEST_PAGING_LEVELS != 2 + || !hvm_paging_enabled(v) + || (v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PSE))); +} + +static inline int +guest_supports_nx(struct vcpu *v) +{ + if ( GUEST_PAGING_LEVELS == 2 || !cpu_has_nx ) + return 0; + if ( !is_hvm_vcpu(v) ) + return cpu_has_nx; + return hvm_nx_enabled(v); +} + + + /* Type used for recording a walk through guest pagetables. It is * filled in by the pagetable walk function, and also used as a cache * for later walks. When we encounter a superpage l2e, we fabricate an @@ -201,4 +227,67 @@ struct guest_pagetable_walk }; +/* Given a walk_t, translate the gw->va into the guest's notion of the + * corresponding frame number. */ +static inline gfn_t +guest_walk_to_gfn(walk_t *gw) +{ + if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) ) + return _gfn(INVALID_GFN); + return guest_l1e_get_gfn(gw->l1e); +} + +/* Given a walk_t, translate the gw->va into the guest's notion of the + * corresponding physical address. */ +static inline paddr_t +guest_walk_to_gpa(walk_t *gw) +{ + if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) ) + return 0; + return guest_l1e_get_paddr(gw->l1e) + (gw->va & ~PAGE_MASK); +} + +/* Walk the guest pagetables, after the manner of a hardware walker. + * + * Inputs: a vcpu, a virtual address, a walk_t to fill, a + * pointer to a pagefault code, the MFN of the guest's + * top-level pagetable, and a mapping of the + * guest's top-level pagetable. + * + * We walk the vcpu's guest pagetables, filling the walk_t with what we + * see and adding any Accessed and Dirty bits that are needed in the + * guest entries. Using the pagefault code, we check the permissions as + * we go. For the purposes of reading pagetables we treat all non-RAM + * memory as contining zeroes. + * + * Returns 0 for success, or the set of permission bits that we failed on + * if the walk did not complete. */ + +/* Macro-fu so you can call guest_walk_tables() and get the right one. 
*/ +#define GPT_RENAME2(_n, _l) _n ## _ ## _l ## _levels +#define GPT_RENAME(_n, _l) GPT_RENAME2(_n, _l) +#define guest_walk_tables GPT_RENAME(guest_walk_tables, GUEST_PAGING_LEVELS) + +extern uint32_t +guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, + uint32_t pfec, mfn_t top_mfn, void *top_map); + +/* Pretty-print the contents of a guest-walk */ +static inline void print_gw(walk_t *gw) +{ + gdprintk(XENLOG_INFO, "GUEST WALK TO %#lx:\n", gw->va); +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ + gdprintk(XENLOG_INFO, " l4mfn=%" PRI_mfn "\n", mfn_x(gw->l4mfn)); + gdprintk(XENLOG_INFO, " l4e=%" PRI_gpte "\n", gw->l4e.l4); + gdprintk(XENLOG_INFO, " l3mfn=%" PRI_mfn "\n", mfn_x(gw->l3mfn)); +#endif /* PAE or 64... */ + gdprintk(XENLOG_INFO, " l3e=%" PRI_gpte "\n", gw->l3e.l3); +#endif /* All levels... */ + gdprintk(XENLOG_INFO, " l2mfn=%" PRI_mfn "\n", mfn_x(gw->l2mfn)); + gdprintk(XENLOG_INFO, " l2e=%" PRI_gpte "\n", gw->l2e.l2); + gdprintk(XENLOG_INFO, " l1mfn=%" PRI_mfn "\n", mfn_x(gw->l1mfn)); + gdprintk(XENLOG_INFO, " l1e=%" PRI_gpte "\n", gw->l1e.l1); +} + #endif /* _XEN_ASM_GUEST_PT_H */ diff -r a9ffd495b5fc xen/include/asm-x86/perfc_defn.h --- a/xen/include/asm-x86/perfc_defn.h Tue Nov 11 15:22:27 2008 +0000 +++ b/xen/include/asm-x86/perfc_defn.h Wed Nov 12 11:15:29 2008 +0000 @@ -33,6 +33,7 @@ PERFCOUNTER(ptwr_emulations, "wri PERFCOUNTER(exception_fixed, "pre-exception fixed") +PERFCOUNTER(guest_walk, "guest pagetable walks") /* Shadow counters */ PERFCOUNTER(shadow_alloc, "calls to shadow_alloc") @@ -92,7 +93,6 @@ PERFCOUNTER(shadow_up_pointer, "shad PERFCOUNTER(shadow_up_pointer, "shadow unshadow by up-pointer") PERFCOUNTER(shadow_unshadow_bf, "shadow unshadow brute-force") PERFCOUNTER(shadow_get_page_fail, "shadow_get_page_from_l1e failed") -PERFCOUNTER(shadow_guest_walk, "shadow walks guest tables") PERFCOUNTER(shadow_check_gwalk, "shadow checks gwalk") PERFCOUNTER(shadow_inconsistent_gwalk, "shadow check inconsistent gwalk") PERFCOUNTER(shadow_rm_write_flush_tlb, diff -r a9ffd495b5fc xen/arch/x86/mm/guest_walk.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/arch/x86/mm/guest_walk.c Wed Nov 12 14:55:40 2008 +0000 @@ -0,0 +1,260 @@ +/****************************************************************************** + * arch/x86/mm/guest_walk.c + * + * Pagetable walker for guest memory accesses. + * + * Parts of this code are Copyright (c) 2006 by XenSource Inc. + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include + + +/* Flags that are needed in a pagetable entry, with the sense of NX inverted */ +static uint32_t mandatory_flags(struct vcpu *v, uint32_t pfec) +{ + static uint32_t flags[] = { + /* I/F - Usr Wr */ + /* 0 0 0 0 */ _PAGE_PRESENT, + /* 0 0 0 1 */ _PAGE_PRESENT|_PAGE_RW, + /* 0 0 1 0 */ _PAGE_PRESENT|_PAGE_USER, + /* 0 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER, + /* 0 1 0 0 */ _PAGE_PRESENT, + /* 0 1 0 1 */ _PAGE_PRESENT|_PAGE_RW, + /* 0 1 1 0 */ _PAGE_PRESENT|_PAGE_USER, + /* 0 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER, + /* 1 0 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT, + /* 1 0 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT, + /* 1 0 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT, + /* 1 0 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT, + /* 1 1 0 0 */ _PAGE_PRESENT|_PAGE_NX_BIT, + /* 1 1 0 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT, + /* 1 1 1 0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT, + /* 1 1 1 1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT, + }; + + /* Don't demand not-NX if the CPU wouldn't enforce it. */ + if ( !guest_supports_nx(v) ) + pfec &= ~PFEC_insn_fetch; + + /* Don't demand R/W if the CPU wouldn't enforce it. */ + if ( is_hvm_vcpu(v) && unlikely(!hvm_wp_enabled(v)) + && !(pfec & PFEC_user_mode) ) + pfec &= ~PFEC_write_access; + + return flags[(pfec & 0x1f) >> 1]; +} + +/* Modify a guest pagetable entry to set the Accessed and Dirty bits. + * Returns non-zero if it actually writes to guest memory. */ +static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty) +{ + guest_intpte_t old, new; + + old = *(guest_intpte_t *)walk_p; + new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0); + if ( old != new ) + { + /* Write the new entry into the walk, and try to write it back + * into the guest table as well. If the guest table has changed + * under out feet then leave it alone. */ + *(guest_intpte_t *)walk_p = new; + if ( cmpxchg(((guest_intpte_t *)guest_p), old, new) == old ) + return 1; + } + return 0; +} + + +/* Walk the guest pagetables, after the manner of a hardware walker. */ +uint32_t +guest_walk_tables(struct vcpu *v, unsigned long va, walk_t *gw, + uint32_t pfec, mfn_t top_mfn, void *top_map) +{ + struct domain *d = v->domain; + p2m_type_t p2mt; + guest_l1e_t *l1p = NULL; + guest_l2e_t *l2p = NULL; +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ + guest_l3e_t *l3p = NULL; + guest_l4e_t *l4p; +#endif + uint32_t gflags, mflags, rc = 0; + int pse; + + perfc_incr(guest_walk); + memset(gw, 0, sizeof(*gw)); + gw->va = va; + + /* Mandatory bits that must be set in every entry. We invert NX, to + * calculate as if there were an "X" bit that allowed access. + * We will accumulate, in rc, the set of flags that are missing. */ + mflags = mandatory_flags(v, pfec); + +#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ +#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... 
*/ + + /* Get the l4e from the top level table and check its flags*/ + gw->l4mfn = top_mfn; + l4p = (guest_l4e_t *) top_map; + gw->l4e = l4p[guest_l4_table_offset(va)]; + gflags = guest_l4e_get_flags(gw->l4e) ^ _PAGE_NX_BIT; + rc |= ((gflags & mflags) ^ mflags); + if ( rc & _PAGE_PRESENT ) goto out; + + /* Map the l3 table */ + gw->l3mfn = gfn_to_mfn(d, guest_l4e_get_gfn(gw->l4e), &p2mt); + if ( !p2m_is_ram(p2mt) ) + { + rc |= _PAGE_PRESENT; + goto out; + } + ASSERT(mfn_valid(mfn_x(gw->l3mfn))); + + /* Get the l3e and check its flags*/ + l3p = map_domain_page(mfn_x(gw->l3mfn)); + gw->l3e = l3p[guest_l3_table_offset(va)]; + gflags = guest_l3e_get_flags(gw->l3e) ^ _PAGE_NX_BIT; + rc |= ((gflags & mflags) ^ mflags); + if ( rc & _PAGE_PRESENT ) + goto out; + +#else /* PAE only... */ + + /* Get the l3e and check its flag */ + gw->l3e = ((guest_l3e_t *) top_map)[guest_l3_table_offset(va)]; + if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) ) + { + rc |= _PAGE_PRESENT; + goto out; + } + +#endif /* PAE or 64... */ + + /* Map the l2 table */ + gw->l2mfn = gfn_to_mfn(d, guest_l3e_get_gfn(gw->l3e), &p2mt); + if ( !p2m_is_ram(p2mt) ) + { + rc |= _PAGE_PRESENT; + goto out; + } + ASSERT(mfn_valid(mfn_x(gw->l2mfn))); + + /* Get the l2e */ + l2p = map_domain_page(mfn_x(gw->l2mfn)); + gw->l2e = l2p[guest_l2_table_offset(va)]; + +#else /* 32-bit only... */ + + /* Get l2e from the top level table */ + gw->l2mfn = top_mfn; + l2p = (guest_l2e_t *) top_map; + gw->l2e = l2p[guest_l2_table_offset(va)]; + +#endif /* All levels... */ + + gflags = guest_l2e_get_flags(gw->l2e) ^ _PAGE_NX_BIT; + rc |= ((gflags & mflags) ^ mflags); + if ( rc & _PAGE_PRESENT ) + goto out; + + pse = (guest_supports_superpages(v) && + (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE)); + + if ( pse ) + { + /* Special case: this guest VA is in a PSE superpage, so there's + * no guest l1e. We make one up so that the propagation code + * can generate a shadow l1 table. Start with the gfn of the + * first 4k-page of the superpage. */ + gfn_t start = guest_l2e_get_gfn(gw->l2e); + /* Grant full access in the l1e, since all the guest entry's + * access controls are enforced in the shadow l2e. */ + int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW| + _PAGE_ACCESSED|_PAGE_DIRTY); + /* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7 + * of the level 1. */ + if ( (guest_l2e_get_flags(gw->l2e) & _PAGE_PSE_PAT) ) + flags |= _PAGE_PAT; + /* Copy the cache-control bits to the l1 as well, because we + * can't represent PAT in the (non-PSE) shadow l2e. :( + * This could cause problems if a guest ever maps an area of + * memory with superpages using more than one caching mode. */ + flags |= guest_l2e_get_flags(gw->l2e) & (_PAGE_PWT|_PAGE_PCD); + /* Increment the pfn by the right number of 4k pages. + * The ~0x1 is to mask out the PAT bit mentioned above. */ + start = _gfn((gfn_x(start) & ~0x1) + guest_l1_table_offset(va)); + gw->l1e = guest_l1e_from_gfn(start, flags); + gw->l1mfn = _mfn(INVALID_MFN); + } + else + { + /* Not a superpage: carry on and find the l1e. */ + gw->l1mfn = gfn_to_mfn(d, guest_l2e_get_gfn(gw->l2e), &p2mt); + if ( !p2m_is_ram(p2mt) ) + { + rc |= _PAGE_PRESENT; + goto out; + } + ASSERT(mfn_valid(mfn_x(gw->l1mfn))); + l1p = map_domain_page(mfn_x(gw->l1mfn)); + gw->l1e = l1p[guest_l1_table_offset(va)]; + gflags = guest_l1e_get_flags(gw->l1e) ^ _PAGE_NX_BIT; + rc |= ((gflags & mflags) ^ mflags); + } + + /* Go back and set accessed and dirty bits only if the walk was a + * success. 
Although the PRMs say higher-level _PAGE_ACCESSED bits + * get set whenever a lower-level PT is used, at least some hardware + * walkers behave this way. */ + if ( rc == 0 ) + { +#if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */ + if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) ) + paging_mark_dirty(d, mfn_x(gw->l4mfn)); + if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e, 0) ) + paging_mark_dirty(d, mfn_x(gw->l3mfn)); +#endif + if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e, + (pse && (pfec & PFEC_write_access))) ) + paging_mark_dirty(d, mfn_x(gw->l2mfn)); + if ( !pse ) + { + if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e, + (pfec & PFEC_write_access)) ) + paging_mark_dirty(d, mfn_x(gw->l1mfn)); + } + } + + out: +#if GUEST_PAGING_LEVELS == 4 + if ( l3p ) unmap_domain_page(l3p); +#endif +#if GUEST_PAGING_LEVELS >= 3 + if ( l2p ) unmap_domain_page(l2p); +#endif + if ( l1p ) unmap_domain_page(l1p); + + return rc; +}
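
Reviewer's note, not part of the patch: the Makefile rule and the GPT_RENAME macro-fu above are what let guest_walk.c be compiled three times (GUEST_PAGING_LEVELS = 2, 3 and 4) and all three objects be linked into the same image. The stand-alone sketch below shows the same token-pasting trick with invented names (demo_walk, LEVELS); it is only meant to illustrate why the extra RENAME2 indirection is needed, so that the level number expands to its value before pasting.

/* Stand-alone illustration of the rename-per-configuration trick
 * used by guest_pt.h; "demo_walk" and "LEVELS" are invented names.
 * Build it several ways, e.g.:
 *   cc -DLEVELS=3 -o demo3 demo.c && ./demo3
 *   cc -DLEVELS=4 -o demo4 demo.c && ./demo4                        */
#include <stdio.h>

#ifndef LEVELS
#define LEVELS 2
#endif

/* The two-step expansion matters: RENAME2 pastes tokens, and the
 * outer RENAME forces LEVELS to expand to its value (2/3/4) first.  */
#define RENAME2(n, l) n ## _ ## l ## _levels
#define RENAME(n, l)  RENAME2(n, l)
#define demo_walk     RENAME(demo_walk, LEVELS)

/* With LEVELS=3 this actually defines demo_walk_3_levels(), so the
 * same source file can be compiled once per value of LEVELS and the
 * resulting objects linked together without symbol clashes.         */
static void demo_walk(void)
{
    printf("I am %s\n", __func__);
}

int main(void)
{
    demo_walk();    /* the call site is renamed by the same macro */
    return 0;
}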
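
Also illustrative only: mandatory_flags() indexes its 16-entry table with (pfec & 0x1f) >> 1, i.e. it drops the "present" bit of the hardware pagefault error code and keeps Write, User, Reserved and Instruction-fetch, which is exactly the "I/F - Usr Wr" column order in the table's comments. The toy program below prints a couple of those index computations; the PFEC_* values follow the architectural error-code bit layout (P=bit 0, W=bit 1, U=bit 2, RSVD=bit 3, I/D=bit 4) rather than being taken from this patch.

/* Toy demonstration of the pagefault-code -> table-index mapping used
 * by mandatory_flags(); not part of the patch.                        */
#include <stdint.h>
#include <stdio.h>

#define PFEC_page_present   (1u << 0)
#define PFEC_write_access   (1u << 1)
#define PFEC_user_mode      (1u << 2)
#define PFEC_reserved_bit   (1u << 3)
#define PFEC_insn_fetch     (1u << 4)

int main(void)
{
    /* A user-mode write fault: row 3, i.e. the walker demands
     * _PAGE_PRESENT|_PAGE_RW|_PAGE_USER in every entry.            */
    uint32_t pfec = PFEC_write_access | PFEC_user_mode;
    printf("user write  -> row %u\n", (pfec & 0x1f) >> 1);   /* 3 */

    /* A supervisor instruction fetch: row 8, i.e. _PAGE_PRESENT
     * plus the inverted-NX bit (the entry must be executable).     */
    pfec = PFEC_insn_fetch;
    printf("kernel exec -> row %u\n", (pfec & 0x1f) >> 1);   /* 8 */

    return 0;
}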
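
Finally, since the lock-free nature of the walk is now only described by the comment moved into sh_page_fault(): set_ad_bits() updates the local walk_t copy unconditionally but publishes the Accessed/Dirty bits to the guest table with a single compare-and-swap, giving up silently if the guest entry changed in the meantime (the stale walk is then caught by the revalidation done under the shadow lock). The sketch below re-expresses that pattern with C11 atomics and invented types and flag values; it is not the Xen implementation.

/* Sketch of the try-once accessed/dirty update, using C11 atomics.
 * pte_t, DEMO_ACCESSED and DEMO_DIRTY are stand-ins, not Xen types.  */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t pte_t;
#define DEMO_ACCESSED (1ull << 5)
#define DEMO_DIRTY    (1ull << 6)

/* Returns 1 if the "guest" entry was actually written. */
static int demo_set_ad_bits(_Atomic pte_t *guest_p, pte_t *walk_p, int dirty)
{
    pte_t old = *walk_p;
    pte_t new = old | DEMO_ACCESSED | (dirty ? DEMO_DIRTY : 0);

    if ( old == new )
        return 0;

    /* Always keep the local walk copy up to date, then try exactly
     * once to write the guest entry back; if it changed under our
     * feet, leave it alone.                                          */
    *walk_p = new;
    return atomic_compare_exchange_strong(guest_p, &old, new);
}

int main(void)
{
    _Atomic pte_t guest = 0x1003;      /* present + some address bits */
    pte_t walk = 0x1003;               /* walker's local copy         */

    int wrote = demo_set_ad_bits(&guest, &walk, /*dirty=*/1);
    printf("wrote back: %d, walk copy now: %#llx\n",
           wrote, (unsigned long long)walk);
    return 0;
}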