[Xen-changelog] [xen-3.0.3-testing] [XEN] Support lightweight shadow-translate PV guests, for paravirt-ops.
# HG changeset patch # User Tim Deegan <tim.deegan@xxxxxxxxxxxxx> # Date 1159459854 -3600 # Node ID b6ee084892dad84750bb8aa3fb89056c3aa21633 # Parent 5f42b4824e455c6350a06c4e3061f663e2d2f39e [XEN] Support lightweight shadow-translate PV guests, for paravirt-ops. This is a modified subset of Michael Fetterman's shadow-translate work. Signed-off-by: Tim Deegan <Tim.Deegan@xxxxxxxxxxxxx> --- xen/arch/x86/domain.c | 11 xen/arch/x86/mm.c | 189 ++++++++-------- xen/arch/x86/mm/shadow/common.c | 161 +++++-------- xen/arch/x86/mm/shadow/multi.c | 426 +++++++++++++++++++++++-------------- xen/arch/x86/mm/shadow/multi.h | 7 xen/arch/x86/mm/shadow/private.h | 49 ---- xen/arch/x86/mm/shadow/types.h | 31 ++ xen/arch/x86/traps.c | 2 xen/include/asm-x86/domain.h | 2 xen/include/asm-x86/guest_access.h | 20 - xen/include/asm-x86/mm.h | 2 xen/include/asm-x86/shadow.h | 82 ++++++- 12 files changed, 573 insertions(+), 409 deletions(-) diff -r 5f42b4824e45 -r b6ee084892da xen/arch/x86/domain.c --- a/xen/arch/x86/domain.c Thu Sep 28 17:09:11 2006 +0100 +++ b/xen/arch/x86/domain.c Thu Sep 28 17:10:54 2006 +0100 @@ -334,8 +334,10 @@ int arch_set_info_guest( } else { - if ( !get_page_and_type(mfn_to_page(cr3_pfn), d, - PGT_base_page_table) ) + if ( shadow_mode_refcounts(d) + ? !get_page(mfn_to_page(cr3_pfn), d) + : !get_page_and_type(mfn_to_page(cr3_pfn), d, + PGT_base_page_table) ) { destroy_gdt(v); return -EINVAL; @@ -952,7 +954,10 @@ void domain_relinquish_resources(struct pfn = pagetable_get_pfn(v->arch.guest_table_user); if ( pfn != 0 ) { - put_page_and_type(mfn_to_page(pfn)); + if ( shadow_mode_refcounts(d) ) + put_page(mfn_to_page(pfn)); + else + put_page_and_type(mfn_to_page(pfn)); v->arch.guest_table_user = pagetable_null(); } #endif diff -r 5f42b4824e45 -r b6ee084892da xen/arch/x86/mm.c --- a/xen/arch/x86/mm.c Thu Sep 28 17:09:11 2006 +0100 +++ b/xen/arch/x86/mm.c Thu Sep 28 17:10:54 2006 +0100 @@ -427,23 +427,11 @@ int map_ldt_shadow_page(unsigned int off unsigned long gmfn, mfn; l1_pgentry_t l1e, nl1e; unsigned long gva = v->arch.guest_context.ldt_base + (off << PAGE_SHIFT); - int res; - -#if defined(__x86_64__) - /* If in user mode, switch to kernel mode just to read LDT mapping. 
*/ - int user_mode = !(v->arch.flags & TF_kernel_mode); -#define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v) -#elif defined(__i386__) -#define TOGGLE_MODE() ((void)0) -#endif + int okay; BUG_ON(unlikely(in_irq())); - TOGGLE_MODE(); - __copy_from_user(&l1e, &linear_pg_table[l1_linear_offset(gva)], - sizeof(l1e)); - TOGGLE_MODE(); - + guest_get_eff_kern_l1e(v, gva, &l1e); if ( unlikely(!(l1e_get_flags(l1e) & _PAGE_PRESENT)) ) return 0; @@ -452,17 +440,17 @@ int map_ldt_shadow_page(unsigned int off if ( unlikely(!VALID_MFN(mfn)) ) return 0; - res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page); - - if ( !res && unlikely(shadow_mode_refcounts(d)) ) + okay = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page); + + if ( !okay && unlikely(shadow_mode_refcounts(d)) ) { shadow_lock(d); shadow_remove_write_access(d->vcpu[0], _mfn(mfn), 0, 0); - res = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page); + okay = get_page_and_type(mfn_to_page(mfn), d, PGT_ldt_page); shadow_unlock(d); } - if ( unlikely(!res) ) + if ( unlikely(!okay) ) return 0; nl1e = l1e_from_pfn(mfn, l1e_get_flags(l1e) | _PAGE_RW); @@ -1233,7 +1221,7 @@ static inline int update_l1e(l1_pgentry_ } } #endif - if ( unlikely(shadow_mode_enabled(v->domain)) ) + if ( unlikely(shadow_mode_enabled(v->domain)) && rv ) { shadow_validate_guest_entry(v, _mfn(gl1mfn), pl1e); shadow_unlock(v->domain); @@ -1251,6 +1239,9 @@ static int mod_l1_entry(l1_pgentry_t *pl if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) ) return 0; + + if ( unlikely(shadow_mode_refcounts(d)) ) + return update_l1e(pl1e, ol1e, nl1e, gl1mfn, current); if ( l1e_get_flags(nl1e) & _PAGE_PRESENT ) { @@ -1871,6 +1862,14 @@ static int set_foreigndom(domid_t domid) } } + if ( unlikely(shadow_mode_translate(d)) ) + { + MEM_LOG("%s: can not mix foreign mappings with translated domains", + __func__); + info->foreign = NULL; + okay = 0; + } + out: return okay; } @@ -1902,7 +1901,7 @@ int do_mmuext_op( { struct mmuext_op op; int rc = 0, i = 0, okay; - unsigned long mfn, type; + unsigned long mfn = 0, gmfn = 0, type; unsigned int done = 0; struct page_info *page; struct vcpu *v = current; @@ -1947,7 +1946,8 @@ int do_mmuext_op( } okay = 1; - mfn = op.arg1.mfn; + gmfn = op.arg1.mfn; + mfn = gmfn_to_mfn(FOREIGNDOM, gmfn); page = mfn_to_page(mfn); switch ( op.cmd ) @@ -2022,7 +2022,6 @@ int do_mmuext_op( break; case MMUEXT_NEW_BASEPTR: - mfn = gmfn_to_mfn(current->domain, mfn); okay = new_guest_cr3(mfn); this_cpu(percpu_mm_info).deferred_ops &= ~DOP_FLUSH_TLB; break; @@ -2031,8 +2030,13 @@ int do_mmuext_op( case MMUEXT_NEW_USER_BASEPTR: okay = 1; if (likely(mfn != 0)) - okay = get_page_and_type_from_pagenr( - mfn, PGT_root_page_table, d); + { + if ( shadow_mode_refcounts(d) ) + okay = get_page_from_pagenr(mfn, d); + else + okay = get_page_and_type_from_pagenr( + mfn, PGT_root_page_table, d); + } if ( unlikely(!okay) ) { MEM_LOG("Error while installing new mfn %lx", mfn); @@ -2043,7 +2047,12 @@ int do_mmuext_op( pagetable_get_pfn(v->arch.guest_table_user); v->arch.guest_table_user = pagetable_from_pfn(mfn); if ( old_mfn != 0 ) - put_page_and_type(mfn_to_page(old_mfn)); + { + if ( shadow_mode_refcounts(d) ) + put_page(mfn_to_page(old_mfn)); + else + put_page_and_type(mfn_to_page(old_mfn)); + } } break; #endif @@ -2504,17 +2513,26 @@ static int create_grant_va_mapping( { l1_pgentry_t *pl1e, ol1e; struct domain *d = v->domain; + unsigned long gl1mfn; + int okay; ASSERT(spin_is_locked(&d->big_lock)); adjust_guest_l1e(nl1e); - pl1e = 
&linear_pg_table[l1_linear_offset(va)]; - - if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) || - !update_l1e(pl1e, ol1e, nl1e, - l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]), v) ) + pl1e = guest_map_l1e(v, va, &gl1mfn); + if ( !pl1e ) + { + MEM_LOG("Could not find L1 PTE for address %lx", va); return GNTST_general_error; + } + ol1e = *pl1e; + okay = update_l1e(pl1e, ol1e, nl1e, gl1mfn, v); + guest_unmap_l1e(v, pl1e); + pl1e = NULL; + + if ( !okay ) + return GNTST_general_error; if ( !shadow_mode_refcounts(d) ) put_page_from_l1e(ol1e, d); @@ -2523,17 +2541,19 @@ static int create_grant_va_mapping( } static int destroy_grant_va_mapping( - unsigned long addr, unsigned long frame, struct domain *d) + unsigned long addr, unsigned long frame, struct vcpu *v) { l1_pgentry_t *pl1e, ol1e; + unsigned long gl1mfn; + int rc = 0; - pl1e = &linear_pg_table[l1_linear_offset(addr)]; - - if ( unlikely(__get_user(ol1e.l1, &pl1e->l1) != 0) ) - { - MEM_LOG("Could not find PTE entry for address %lx", addr); + pl1e = guest_map_l1e(v, addr, &gl1mfn); + if ( !pl1e ) + { + MEM_LOG("Could not find L1 PTE for address %lx", addr); return GNTST_general_error; } + ol1e = *pl1e; /* * Check that the virtual address supplied is actually mapped to @@ -2543,19 +2563,21 @@ static int destroy_grant_va_mapping( { MEM_LOG("PTE entry %lx for address %lx doesn't match frame %lx", l1e_get_pfn(ol1e), addr, frame); - return GNTST_general_error; + rc = GNTST_general_error; + goto out; } /* Delete pagetable entry. */ - if ( unlikely(!update_l1e(pl1e, ol1e, l1e_empty(), - l2e_get_pfn(__linear_l2_table[l2_linear_offset(addr)]), - d->vcpu[0] /* Change for per-vcpu shadows */)) ) + if ( unlikely(!update_l1e(pl1e, ol1e, l1e_empty(), gl1mfn, v)) ) { MEM_LOG("Cannot delete PTE entry at %p", (unsigned long *)pl1e); - return GNTST_general_error; - } - - return 0; + rc = GNTST_general_error; + goto out; // this is redundant & unnecessary, but informative + } + + out: + guest_unmap_l1e(v, pl1e); + return rc; } int create_grant_host_mapping( @@ -2578,7 +2600,7 @@ int destroy_grant_host_mapping( { if ( flags & GNTMAP_contains_pte ) return destroy_grant_pte_mapping(addr, frame, current->domain); - return destroy_grant_va_mapping(addr, frame, current->domain); + return destroy_grant_va_mapping(addr, frame, current); } int steal_page( @@ -2634,7 +2656,8 @@ int do_update_va_mapping(unsigned long v l1_pgentry_t val = l1e_from_intpte(val64); struct vcpu *v = current; struct domain *d = v->domain; - unsigned long vmask, bmap_ptr; + l1_pgentry_t *pl1e; + unsigned long vmask, bmap_ptr, gl1mfn; cpumask_t pmask; int rc = 0; @@ -2643,35 +2666,17 @@ int do_update_va_mapping(unsigned long v if ( unlikely(!__addr_ok(va) && !shadow_mode_external(d)) ) return -EINVAL; - if ( unlikely(shadow_mode_refcounts(d)) ) - { - DPRINTK("Grant op on a shadow-refcounted domain\n"); - return -EINVAL; - } - LOCK_BIGLOCK(d); - if ( likely(rc == 0) && unlikely(shadow_mode_enabled(d)) ) - { - if ( unlikely(this_cpu(percpu_mm_info).foreign && - (shadow_mode_translate(d) || - shadow_mode_translate( - this_cpu(percpu_mm_info).foreign))) ) - { - /* - * The foreign domain's pfn's are in a different namespace. There's - * not enough information in just a gpte to figure out how to - * (re-)shadow this entry. 
- */ - domain_crash(d); - } - } - - if ( unlikely(!mod_l1_entry( - &linear_pg_table[l1_linear_offset(va)], val, - l2e_get_pfn(__linear_l2_table[l2_linear_offset(va)]))) ) + pl1e = guest_map_l1e(v, va, &gl1mfn); + + if ( unlikely(!pl1e || !mod_l1_entry(pl1e, val, gl1mfn)) ) rc = -EINVAL; - + + if ( pl1e ) + guest_unmap_l1e(v, pl1e); + pl1e = NULL; + switch ( flags & UVMF_FLUSHTYPE_MASK ) { case UVMF_TLB_FLUSH: @@ -3033,7 +3038,7 @@ static int ptwr_emulated_update( unsigned int bytes, unsigned int do_cmpxchg) { - unsigned long pfn; + unsigned long gmfn, mfn; struct page_info *page; l1_pgentry_t pte, ol1e, nl1e, *pl1e; struct vcpu *v = current; @@ -3073,15 +3078,17 @@ static int ptwr_emulated_update( } /* Read the PTE that maps the page being updated. */ - if ( __copy_from_user(&pte, &linear_pg_table[l1_linear_offset(addr)], - sizeof(pte)) ) - { - MEM_LOG("ptwr_emulate: Cannot read thru linear_pg_table"); + guest_get_eff_l1e(v, addr, &pte); + if ( unlikely(!(l1e_get_flags(pte) & _PAGE_PRESENT)) ) + { + MEM_LOG("%s: Cannot get L1 PTE for guest address %lx", + __func__, addr); return X86EMUL_UNHANDLEABLE; } - pfn = l1e_get_pfn(pte); - page = mfn_to_page(pfn); + gmfn = l1e_get_pfn(pte); + mfn = gmfn_to_mfn(d, gmfn); + page = mfn_to_page(mfn); /* We are looking only for read-only mappings of p.t. pages. */ ASSERT((l1e_get_flags(pte) & (_PAGE_RW|_PAGE_PRESENT)) == _PAGE_PRESENT); @@ -3091,7 +3098,7 @@ static int ptwr_emulated_update( /* Check the new PTE. */ nl1e = l1e_from_intpte(val); - if ( unlikely(!get_page_from_l1e(nl1e, d)) ) + if ( unlikely(!get_page_from_l1e(gl1e_to_ml1e(d, nl1e), d)) ) { if ( (CONFIG_PAGING_LEVELS == 3) && (bytes == 4) && @@ -3130,13 +3137,13 @@ static int ptwr_emulated_update( if ( shadow_mode_enabled(d) ) shadow_unlock(d); unmap_domain_page(pl1e); - put_page_from_l1e(nl1e, d); + put_page_from_l1e(gl1e_to_ml1e(d, nl1e), d); return X86EMUL_CMPXCHG_FAILED; } - if ( unlikely(shadow_mode_enabled(v->domain)) ) + if ( unlikely(shadow_mode_enabled(d)) ) { shadow_validate_guest_entry(v, _mfn(page_to_mfn(page)), pl1e); - shadow_unlock(v->domain); + shadow_unlock(d); } } else @@ -3149,7 +3156,7 @@ static int ptwr_emulated_update( unmap_domain_page(pl1e); /* Finally, drop the old PTE. */ - put_page_from_l1e(ol1e, d); + put_page_from_l1e(gl1e_to_ml1e(d, ol1e), d); return X86EMUL_CONTINUE; } @@ -3198,13 +3205,13 @@ static struct x86_emulate_ops ptwr_emula }; /* Write page fault handler: check if guest is trying to modify a PTE. */ -int ptwr_do_page_fault(struct domain *d, unsigned long addr, +int ptwr_do_page_fault(struct vcpu *v, unsigned long addr, struct cpu_user_regs *regs) { + struct domain *d = v->domain; unsigned long pfn; struct page_info *page; l1_pgentry_t pte; - l2_pgentry_t *pl2e, l2e; struct x86_emulate_ctxt emul_ctxt; LOCK_BIGLOCK(d); @@ -3213,13 +3220,9 @@ int ptwr_do_page_fault(struct domain *d, * Attempt to read the PTE that maps the VA being accessed. By checking for * PDE validity in the L2 we avoid many expensive fixups in __get_user(). 
*/ - pl2e = &__linear_l2_table[l2_linear_offset(addr)]; - if ( __copy_from_user(&l2e, pl2e, sizeof(l2e)) || - !(l2e_get_flags(l2e) & _PAGE_PRESENT) || - __copy_from_user(&pte, &linear_pg_table[l1_linear_offset(addr)], - sizeof(pte)) ) + guest_get_eff_l1e(v, addr, &pte); + if ( !(l1e_get_flags(pte) & _PAGE_PRESENT) ) goto bail; - pfn = l1e_get_pfn(pte); page = mfn_to_page(pfn); diff -r 5f42b4824e45 -r b6ee084892da xen/arch/x86/mm/shadow/common.c --- a/xen/arch/x86/mm/shadow/common.c Thu Sep 28 17:09:11 2006 +0100 +++ b/xen/arch/x86/mm/shadow/common.c Thu Sep 28 17:10:54 2006 +0100 @@ -75,35 +75,27 @@ sh_x86_emulate_read_std(unsigned long ad unsigned int bytes, struct x86_emulate_ctxt *ctxt) { - struct vcpu *v = current; - if ( hvm_guest(v) ) - { - *val = 0; - // XXX -- this is WRONG. - // It entirely ignores the permissions in the page tables. - // In this case, that is only a user vs supervisor access check. - // - if ( hvm_copy(val, addr, bytes, HVM_COPY_IN) ) - { + *val = 0; + // XXX -- this is WRONG. + // It entirely ignores the permissions in the page tables. + // In this case, that is only a user vs supervisor access check. + // + if ( hvm_copy(val, addr, bytes, HVM_COPY_IN) ) + { #if 0 - SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n", - v->domain->domain_id, v->vcpu_id, - addr, *val, bytes); -#endif - return X86EMUL_CONTINUE; - } - - /* If we got here, there was nothing mapped here, or a bad GFN - * was mapped here. This should never happen: we're here because - * of a write fault at the end of the instruction we're emulating. */ - SHADOW_PRINTK("read failed to va %#lx\n", addr); - return X86EMUL_PROPAGATE_FAULT; - } - else - { - SHADOW_PRINTK("this operation is not emulated yet\n"); - return X86EMUL_UNHANDLEABLE; - } + struct vcpu *v = current; + SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n", + v->domain->domain_id, v->vcpu_id, + addr, *val, bytes); +#endif + return X86EMUL_CONTINUE; + } + + /* If we got here, there was nothing mapped here, or a bad GFN + * was mapped here. This should never happen: we're here because + * of a write fault at the end of the instruction we're emulating. */ + SHADOW_PRINTK("read failed to va %#lx\n", addr); + return X86EMUL_PROPAGATE_FAULT; } static int @@ -112,33 +104,26 @@ sh_x86_emulate_write_std(unsigned long a unsigned int bytes, struct x86_emulate_ctxt *ctxt) { +#if 0 struct vcpu *v = current; -#if 0 SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n", v->domain->domain_id, v->vcpu_id, addr, val, bytes); #endif - if ( hvm_guest(v) ) - { - // XXX -- this is WRONG. - // It entirely ignores the permissions in the page tables. - // In this case, that includes user vs supervisor, and - // write access. - // - if ( hvm_copy(&val, addr, bytes, HVM_COPY_OUT) ) - return X86EMUL_CONTINUE; - - /* If we got here, there was nothing mapped here, or a bad GFN - * was mapped here. This should never happen: we're here because - * of a write fault at the end of the instruction we're emulating, - * which should be handled by sh_x86_emulate_write_emulated. */ - SHADOW_PRINTK("write failed to va %#lx\n", addr); - return X86EMUL_PROPAGATE_FAULT; - } - else - { - SHADOW_PRINTK("this operation is not emulated yet\n"); - return X86EMUL_UNHANDLEABLE; - } + + // XXX -- this is WRONG. + // It entirely ignores the permissions in the page tables. + // In this case, that includes user vs supervisor, and + // write access. 
+ // + if ( hvm_copy(&val, addr, bytes, HVM_COPY_OUT) ) + return X86EMUL_CONTINUE; + + /* If we got here, there was nothing mapped here, or a bad GFN + * was mapped here. This should never happen: we're here because + * of a write fault at the end of the instruction we're emulating, + * which should be handled by sh_x86_emulate_write_emulated. */ + SHADOW_PRINTK("write failed to va %#lx\n", addr); + return X86EMUL_PROPAGATE_FAULT; } static int @@ -152,15 +137,7 @@ sh_x86_emulate_write_emulated(unsigned l SHADOW_PRINTK("d=%u v=%u a=%#lx v=%#lx bytes=%u\n", v->domain->domain_id, v->vcpu_id, addr, val, bytes); #endif - if ( hvm_guest(v) ) - { - return v->arch.shadow.mode->x86_emulate_write(v, addr, &val, bytes, ctxt); - } - else - { - SHADOW_PRINTK("this operation is not emulated yet\n"); - return X86EMUL_UNHANDLEABLE; - } + return v->arch.shadow.mode->x86_emulate_write(v, addr, &val, bytes, ctxt); } static int @@ -175,16 +152,8 @@ sh_x86_emulate_cmpxchg_emulated(unsigned SHADOW_PRINTK("d=%u v=%u a=%#lx o?=%#lx n:=%#lx bytes=%u\n", v->domain->domain_id, v->vcpu_id, addr, old, new, bytes); #endif - if ( hvm_guest(v) ) - { - return v->arch.shadow.mode->x86_emulate_cmpxchg(v, addr, old, new, - bytes, ctxt); - } - else - { - SHADOW_PRINTK("this operation is not emulated yet\n"); - return X86EMUL_UNHANDLEABLE; - } + return v->arch.shadow.mode->x86_emulate_cmpxchg(v, addr, old, new, + bytes, ctxt); } static int @@ -201,16 +170,8 @@ sh_x86_emulate_cmpxchg8b_emulated(unsign v->domain->domain_id, v->vcpu_id, addr, old_hi, old_lo, new_hi, new_lo, ctxt); #endif - if ( hvm_guest(v) ) - { - return v->arch.shadow.mode->x86_emulate_cmpxchg8b(v, addr, old_lo, old_hi, - new_lo, new_hi, ctxt); - } - else - { - SHADOW_PRINTK("this operation is not emulated yet\n"); - return X86EMUL_UNHANDLEABLE; - } + return v->arch.shadow.mode->x86_emulate_cmpxchg8b(v, addr, old_lo, old_hi, + new_lo, new_hi, ctxt); } @@ -267,7 +228,7 @@ void shadow_demote(struct vcpu *v, mfn_t /* Validate a pagetable change from the guest and update the shadows. * Returns a bitmask of SHADOW_SET_* flags. */ -static int +int __shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *entry, u32 size) { @@ -367,7 +328,9 @@ void void shadow_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn, void *entry, u32 size) -/* This is the entry point for emulated writes to pagetables in HVM guests */ +/* This is the entry point for emulated writes to pagetables in HVM guests and + * PV translated guests. + */ { struct domain *d = v->domain; int rc; @@ -806,7 +769,7 @@ void shadow_free(struct domain *d, mfn_t /* Divert some memory from the pool to be used by the p2m mapping. * This action is irreversible: the p2m mapping only ever grows. - * That's OK because the p2m table only exists for external domains, + * That's OK because the p2m table only exists for translated domains, * and those domains can't ever turn off shadow mode. * Also, we only ever allocate a max-order chunk, so as to preserve * the invariant that shadow_prealloc() always works. @@ -830,7 +793,12 @@ shadow_alloc_p2m_pages(struct domain *d) d->arch.shadow.total_pages -= (1<<SHADOW_MAX_ORDER); for (i = 0; i < (1<<SHADOW_MAX_ORDER); i++) { - /* Unlike shadow pages, mark p2m pages as owned by the domain */ + /* Unlike shadow pages, mark p2m pages as owned by the domain. 
+ * Marking the domain as the owner would normally allow the guest to + * create mappings of these pages, but these p2m pages will never be + * in the domain's guest-physical address space, and so that is not + * believed to be a concern. + */ page_set_owner(&pg[i], d); list_add_tail(&pg[i].list, &d->arch.shadow.p2m_freelist); } @@ -2269,7 +2237,7 @@ void sh_update_paging_modes(struct vcpu // if ( !test_bit(_VCPUF_initialised, &v->vcpu_flags) ) { - printk("%s: postponing determination of shadow mode\n", __func__); + SHADOW_PRINTK("%s: postponing determination of shadow mode\n", __func__); return; } @@ -2294,6 +2262,7 @@ void sh_update_paging_modes(struct vcpu #else #error unexpected paging mode #endif + v->arch.shadow.translate_enabled = !!shadow_mode_translate(d); } else { @@ -2303,8 +2272,8 @@ void sh_update_paging_modes(struct vcpu ASSERT(shadow_mode_translate(d)); ASSERT(shadow_mode_external(d)); - v->arch.shadow.hvm_paging_enabled = !!hvm_paging_enabled(v); - if ( !v->arch.shadow.hvm_paging_enabled ) + v->arch.shadow.translate_enabled = !!hvm_paging_enabled(v); + if ( !v->arch.shadow.translate_enabled ) { /* Set v->arch.guest_table to use the p2m map, and choose @@ -2381,13 +2350,14 @@ void sh_update_paging_modes(struct vcpu if ( v->arch.shadow.mode != old_mode ) { - SHADOW_PRINTK("new paging mode: d=%u v=%u g=%u s=%u " - "(was g=%u s=%u)\n", - d->domain_id, v->vcpu_id, - v->arch.shadow.mode->guest_levels, - v->arch.shadow.mode->shadow_levels, - old_mode ? old_mode->guest_levels : 0, - old_mode ? old_mode->shadow_levels : 0); + SHADOW_PRINTK("new paging mode: d=%u v=%u pe=%d g=%u s=%u " + "(was g=%u s=%u)\n", + d->domain_id, v->vcpu_id, + hvm_guest(v) ? !!hvm_paging_enabled(v) : 1, + v->arch.shadow.mode->guest_levels, + v->arch.shadow.mode->shadow_levels, + old_mode ? old_mode->guest_levels : 0, + old_mode ? old_mode->shadow_levels : 0); if ( old_mode && (v->arch.shadow.mode->shadow_levels != old_mode->shadow_levels) ) @@ -2467,6 +2437,7 @@ static int shadow_enable(struct domain * /* Sanity check the arguments */ if ( (d == current->domain) || shadow_mode_enabled(d) || + ((mode & SHM2_translate) && !(mode & SHM2_refcounts)) || ((mode & SHM2_external) && !(mode & SHM2_translate)) ) { rv = -EINVAL; @@ -2522,7 +2493,7 @@ static int shadow_enable(struct domain * out: shadow_unlock(d); domain_unpause(d); - return 0; + return rv; } void shadow_teardown(struct domain *d) diff -r 5f42b4824e45 -r b6ee084892da xen/arch/x86/mm/shadow/multi.c --- a/xen/arch/x86/mm/shadow/multi.c Thu Sep 28 17:09:11 2006 +0100 +++ b/xen/arch/x86/mm/shadow/multi.c Thu Sep 28 17:10:54 2006 +0100 @@ -483,8 +483,7 @@ static u32 guest_set_ad_bits(struct vcpu unsigned int level, fetch_type_t ft) { - u32 flags, shflags, bit; - struct page_info *pg; + u32 flags; int res = 0; ASSERT(valid_mfn(gmfn) @@ -502,11 +501,10 @@ static u32 guest_set_ad_bits(struct vcpu if ( unlikely(GUEST_PAGING_LEVELS == 3 && level == 3) ) return flags; - /* Need the D bit as well for writes, in l1es and 32bit/PAE PSE l2es. */ + /* Need the D bit as well for writes, in L1es and PSE L2es. 
*/ if ( ft == ft_demand_write - && (level == 1 || - (level == 2 && GUEST_PAGING_LEVELS < 4 - && (flags & _PAGE_PSE) && guest_supports_superpages(v))) ) + && (level == 1 || + (level == 2 && (flags & _PAGE_PSE) && guest_supports_superpages(v))) ) { if ( (flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) == (_PAGE_DIRTY | _PAGE_ACCESSED) ) @@ -524,76 +522,69 @@ static u32 guest_set_ad_bits(struct vcpu /* Set the bit(s) */ sh_mark_dirty(v->domain, gmfn); - SHADOW_DEBUG(A_AND_D, "gfn = %"SH_PRI_gfn", " + SHADOW_DEBUG(A_AND_D, "gfn = %" SH_PRI_gfn ", " "old flags = %#x, new flags = %#x\n", - guest_l1e_get_gfn(*ep), guest_l1e_get_flags(*ep), flags); + gfn_x(guest_l1e_get_gfn(*ep)), guest_l1e_get_flags(*ep), flags); *ep = guest_l1e_from_gfn(guest_l1e_get_gfn(*ep), flags); - /* May need to propagate this change forward to other kinds of shadow */ - pg = mfn_to_page(gmfn); - if ( !sh_mfn_is_a_page_table(gmfn) ) - { - /* This guest pagetable is not yet shadowed at all. */ - // MAF: I think this assert is busted... If this gmfn has not yet - // been promoted, then it seems perfectly reasonable for there to be - // outstanding type refs to it... - /* TJD: No. If the gmfn has not been promoted, we must at least - * have recognised that it is a pagetable, and pulled write access. - * The type count should only be non-zero if it is actually a page - * table. The test above was incorrect, though, so I've fixed it. */ - ASSERT((pg->u.inuse.type_info & PGT_count_mask) == 0); - return flags; - } - - shflags = pg->shadow_flags & SHF_page_type_mask; - while ( shflags ) - { - bit = find_first_set_bit(shflags); - ASSERT(shflags & (1u << bit)); - shflags &= ~(1u << bit); - if ( !(pg->shadow_flags & (1u << bit)) ) - continue; - switch ( bit ) - { - case PGC_SH_type_to_index(PGC_SH_l1_shadow): - if (level != 1) - res |= sh_map_and_validate_gl1e(v, gmfn, ep, sizeof (*ep)); - break; - case PGC_SH_type_to_index(PGC_SH_l2_shadow): - if (level != 2) - res |= sh_map_and_validate_gl2e(v, gmfn, ep, sizeof (*ep)); - break; -#if GUEST_PAGING_LEVELS == 3 /* PAE only */ - case PGC_SH_type_to_index(PGC_SH_l2h_shadow): - if (level != 2) - res |= sh_map_and_validate_gl2he(v, gmfn, ep, sizeof (*ep)); - break; -#endif -#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */ - case PGC_SH_type_to_index(PGC_SH_l3_shadow): - if (level != 3) - res |= sh_map_and_validate_gl3e(v, gmfn, ep, sizeof (*ep)); - break; -#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */ - case PGC_SH_type_to_index(PGC_SH_l4_shadow): - if (level != 4) - res |= sh_map_and_validate_gl4e(v, gmfn, ep, sizeof (*ep)); - break; -#endif -#endif - default: - SHADOW_ERROR("mfn %"SH_PRI_mfn" is shadowed in multiple " - "modes: A&D bits may be out of sync (flags=%#x).\n", - mfn_x(gmfn), pg->shadow_flags); - /* XXX Shadows in other modes will not be updated, so will - * have their A and D bits out of sync. */ - } - } - + /* Propagate this change to any existing shadows */ + res = __shadow_validate_guest_entry(v, gmfn, ep, sizeof(*ep)); + /* We should never need to flush the TLB or recopy PAE entries */ - ASSERT( res == 0 || res == SHADOW_SET_CHANGED ); + ASSERT((res == 0) || (res == SHADOW_SET_CHANGED)); + return flags; } + +#if (CONFIG_PAGING_LEVELS == GUEST_PAGING_LEVELS) && (CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS) +void * +sh_guest_map_l1e(struct vcpu *v, unsigned long addr, + unsigned long *gl1mfn) +{ + void *pl1e = NULL; + walk_t gw; + + ASSERT(shadow_mode_translate(v->domain)); + + // XXX -- this is expensive, but it's easy to cobble together... + // FIXME! 
+ + shadow_lock(v->domain); + guest_walk_tables(v, addr, &gw, 1); + + if ( gw.l2e && + (guest_l2e_get_flags(*gw.l2e) & _PAGE_PRESENT) && + !(guest_supports_superpages(v) && (guest_l2e_get_flags(*gw.l2e) & _PAGE_PSE)) ) + { + if ( gl1mfn ) + *gl1mfn = mfn_x(gw.l1mfn); + pl1e = map_domain_page(mfn_x(gw.l1mfn)) + + (guest_l1_table_offset(addr) * sizeof(guest_l1e_t)); + } + + unmap_walk(v, &gw); + shadow_unlock(v->domain); + + return pl1e; +} + +void +sh_guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e) +{ + walk_t gw; + + ASSERT(shadow_mode_translate(v->domain)); + + // XXX -- this is expensive, but it's easy to cobble together... + // FIXME! + + shadow_lock(v->domain); + guest_walk_tables(v, addr, &gw, 1); + *(guest_l1e_t *)eff_l1e = gw.eff_l1e; + unmap_walk(v, &gw); + shadow_unlock(v->domain); +} +#endif /* CONFIG==SHADOW==GUEST */ /**************************************************************************/ /* Functions to compute the correct index into a shadow page, given an @@ -709,17 +700,6 @@ shadow_l4_index(mfn_t *smfn, u32 guest_i * to the _PAGE_DIRTY bit handling), but for L[234], they are grouped together * into the respective demand_fault functions. */ - -#define CHECK(_cond) \ -do { \ - if (unlikely(!(_cond))) \ - { \ - printk("%s %s %d ASSERTION (%s) FAILED\n", \ - __func__, __FILE__, __LINE__, #_cond); \ - return -1; \ - } \ -} while (0); - // The function below tries to capture all of the flag manipulation for the // demand and propagate functions into one place. // @@ -728,6 +708,16 @@ sh_propagate_flags(struct vcpu *v, mfn_t u32 gflags, guest_l1e_t *guest_entry_ptr, mfn_t gmfn, int mmio, int level, fetch_type_t ft) { +#define CHECK(_cond) \ +do { \ + if (unlikely(!(_cond))) \ + { \ + printk("%s %s %d ASSERTION (%s) FAILED\n", \ + __func__, __FILE__, __LINE__, #_cond); \ + domain_crash(d); \ + } \ +} while (0); + struct domain *d = v->domain; u32 pass_thru_flags; u32 sflags; @@ -763,6 +753,10 @@ sh_propagate_flags(struct vcpu *v, mfn_t return 0; } + // Set the A and D bits in the guest entry, if we need to. + if ( guest_entry_ptr && (ft & FETCH_TYPE_DEMAND) ) + gflags = guest_set_ad_bits(v, gmfn, guest_entry_ptr, level, ft); + // PAE does not allow NX, RW, USER, ACCESSED, or DIRTY bits in its L3e's... // if ( (SHADOW_PAGING_LEVELS == 3) && (level == 3) ) @@ -797,17 +791,12 @@ sh_propagate_flags(struct vcpu *v, mfn_t // Higher level entries do not, strictly speaking, have dirty bits, but // since we use shadow linear tables, each of these entries may, at some // point in time, also serve as a shadow L1 entry. - // By setting both the A&D bits in each of these, we eliminate the burden + // By setting both the A&D bits in each of these, we eliminate the burden // on the hardware to update these bits on initial accesses. // if ( (level > 1) && !((SHADOW_PAGING_LEVELS == 3) && (level == 3)) ) sflags |= _PAGE_ACCESSED | _PAGE_DIRTY; - - // Set the A and D bits in the guest entry, if we need to. - if ( guest_entry_ptr && (ft & FETCH_TYPE_DEMAND) ) - gflags = guest_set_ad_bits(v, gmfn, guest_entry_ptr, level, ft); - // If the A or D bit has not yet been set in the guest, then we must // prevent the corresponding kind of access. 
// @@ -815,12 +804,12 @@ sh_propagate_flags(struct vcpu *v, mfn_t !(gflags & _PAGE_ACCESSED)) ) sflags &= ~_PAGE_PRESENT; - /* D bits exist in l1es, and 32bit/PAE PSE l2es, but not 64bit PSE l2es */ - if ( unlikely( ((level == 1) - || ((level == 2) && (GUEST_PAGING_LEVELS < 4) - && guest_supports_superpages(v) && - (gflags & _PAGE_PSE))) - && !(gflags & _PAGE_DIRTY)) ) + /* D bits exist in L1es and PSE L2es */ + if ( unlikely(((level == 1) || + ((level == 2) && + (gflags & _PAGE_PSE) && + guest_supports_superpages(v))) + && !(gflags & _PAGE_DIRTY)) ) sflags &= ~_PAGE_RW; // MMIO caching @@ -869,10 +858,17 @@ sh_propagate_flags(struct vcpu *v, mfn_t } } + // PV guests in 64-bit mode use two different page tables for user vs + // supervisor permissions, making the guest's _PAGE_USER bit irrelevant. + // It is always shadowed as present... + if ( (GUEST_PAGING_LEVELS == 4) && !hvm_guest(v) ) + { + sflags |= _PAGE_USER; + } + return sflags; -} - #undef CHECK +} #if GUEST_PAGING_LEVELS >= 4 static void @@ -1732,10 +1728,20 @@ void sh_install_xen_entries_in_l4(struct __PAGE_HYPERVISOR); /* Linear mapping */ - sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] = - shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR); sl4e[shadow_l4_table_offset(SH_LINEAR_PT_VIRT_START)] = shadow_l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR); + + if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) ) + { + // linear tables may not be used with translated PV guests + sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] = + shadow_l4e_empty(); + } + else + { + sl4e[shadow_l4_table_offset(LINEAR_PT_VIRT_START)] = + shadow_l4e_from_mfn(gl4mfn, __PAGE_HYPERVISOR); + } if ( shadow_mode_translate(v->domain) ) { @@ -1779,7 +1785,15 @@ void sh_install_xen_entries_in_l2h(struc /* We don't set up a linear mapping here because we can't until this * l2h is installed in an l3e. sh_update_linear_entries() handles - * the linear mappings when the l3 is loaded. */ + * the linear mappings when the l3 is loaded. We zero them here, just as + * a safety measure. + */ + for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ ) + sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START) + i] = + shadow_l2e_empty(); + for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ ) + sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START) + i] = + shadow_l2e_empty(); if ( shadow_mode_translate(d) ) { @@ -1817,6 +1831,12 @@ void sh_install_xen_entries_in_l3(struct l2smfn = get_shadow_status(v, l2gmfn, PGC_SH_l2h_shadow); if ( !valid_mfn(l2smfn) ) { + /* must remove write access to this page before shadowing it */ + // XXX -- should check to see whether this is better with level==0 or + // level==2... 
+ if ( shadow_remove_write_access(v, l2gmfn, 2, 0xc0000000ul) != 0 ) + flush_tlb_mask(v->domain->domain_dirty_cpumask); + l2smfn = sh_make_shadow(v, l2gmfn, PGC_SH_l2h_shadow); } l3e_propagate_from_guest(v, &gl3e[3], gl3mfn, l2smfn, &new_sl3e, @@ -1852,10 +1872,20 @@ void sh_install_xen_entries_in_l2(struct __PAGE_HYPERVISOR); /* Linear mapping */ - sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] = - shadow_l2e_from_mfn(gl2mfn, __PAGE_HYPERVISOR); sl2e[shadow_l2_table_offset(SH_LINEAR_PT_VIRT_START)] = shadow_l2e_from_mfn(sl2mfn, __PAGE_HYPERVISOR); + + if ( shadow_mode_translate(v->domain) && !shadow_mode_external(v->domain) ) + { + // linear tables may not be used with translated PV guests + sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] = + shadow_l2e_empty(); + } + else + { + sl2e[shadow_l2_table_offset(LINEAR_PT_VIRT_START)] = + shadow_l2e_from_mfn(gl2mfn, __PAGE_HYPERVISOR); + } if ( shadow_mode_translate(d) ) { @@ -2527,6 +2557,32 @@ static int validate_gl4e(struct vcpu *v, } l4e_propagate_from_guest(v, new_gl4e, _mfn(INVALID_MFN), sl3mfn, &new_sl4e, ft_prefetch); + + // check for updates to xen reserved slots + if ( !shadow_mode_external(v->domain) ) + { + int shadow_index = (((unsigned long)sl4p & ~PAGE_MASK) / + sizeof(shadow_l4e_t)); + int reserved_xen_slot = !is_guest_l4_slot(shadow_index); + + if ( unlikely(reserved_xen_slot) ) + { + // attempt by the guest to write to a xen reserved slot + // + SHADOW_PRINTK("%s out-of-range update " + "sl4mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n", + __func__, mfn_x(sl4mfn), shadow_index, new_sl4e.l4); + if ( shadow_l4e_get_flags(new_sl4e) & _PAGE_PRESENT ) + { + SHADOW_ERROR("out-of-range l4e update\n"); + result |= SHADOW_SET_ERROR; + } + + // do not call shadow_set_l4e... + return result; + } + } + result |= shadow_set_l4e(v, sl4p, new_sl4e, sl4mfn); return result; } @@ -2616,6 +2672,48 @@ static int validate_gl2e(struct vcpu *v, } l2e_propagate_from_guest(v, new_gl2e, _mfn(INVALID_MFN), sl1mfn, &new_sl2e, ft_prefetch); + + // check for updates to xen reserved slots in PV guests... + // XXX -- need to revisit this for PV 3-on-4 guests. + // +#if SHADOW_PAGING_LEVELS < 4 +#if CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS + if ( !shadow_mode_external(v->domain) ) + { + int shadow_index = (((unsigned long)sl2p & ~PAGE_MASK) / + sizeof(shadow_l2e_t)); + int reserved_xen_slot; + +#if SHADOW_PAGING_LEVELS == 3 + reserved_xen_slot = + (((mfn_to_page(sl2mfn)->count_info & PGC_SH_type_mask) + == PGC_SH_l2h_pae_shadow) && + (shadow_index + >= (L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)))); +#else /* SHADOW_PAGING_LEVELS == 2 */ + reserved_xen_slot = (shadow_index >= L2_PAGETABLE_FIRST_XEN_SLOT); +#endif + + if ( unlikely(reserved_xen_slot) ) + { + // attempt by the guest to write to a xen reserved slot + // + SHADOW_PRINTK("%s out-of-range update " + "sl2mfn=%05lx index=0x%x val=%" SH_PRI_pte "\n", + __func__, mfn_x(sl2mfn), shadow_index, new_sl2e.l2); + if ( shadow_l2e_get_flags(new_sl2e) & _PAGE_PRESENT ) + { + SHADOW_ERROR("out-of-range l2e update\n"); + result |= SHADOW_SET_ERROR; + } + + // do not call shadow_set_l2e... + return result; + } + } +#endif /* CONFIG_PAGING_LEVELS == SHADOW_PAGING_LEVELS */ +#endif /* SHADOW_PAGING_LEVELS < 4 */ + result |= shadow_set_l2e(v, sl2p, new_sl2e, sl2mfn); return result; @@ -2897,7 +2995,7 @@ static int sh_page_fault(struct vcpu *v, } // All levels of the guest page table are now known to be present. 
- accumulated_gflags = accumulate_guest_flags(&gw); + accumulated_gflags = accumulate_guest_flags(v, &gw); // Check for attempts to access supervisor-only pages from user mode, // i.e. ring 3. Such errors are not caused or dealt with by the shadow @@ -3348,6 +3446,7 @@ sh_update_linear_entries(struct vcpu *v) l2_pgentry_t *l2e, new_l2e; shadow_l3e_t *guest_l3e = NULL, *shadow_l3e; int i; + int unmap_l2e = 0; #if GUEST_PAGING_LEVELS == 2 /* Shadow l3 tables were built by update_cr3 */ @@ -3365,39 +3464,45 @@ sh_update_linear_entries(struct vcpu *v) #endif /* GUEST_PAGING_LEVELS */ /* Choose where to write the entries, using linear maps if possible */ - if ( v == current && shadow_mode_external(d) ) - { - /* From the monitor tables, it's safe to use linear maps to update - * monitor l2s */ - l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES); - } - else if ( shadow_mode_external(d) ) - { - /* Map the monitor table's high l2 */ - l3_pgentry_t *l3e; - l3e = sh_map_domain_page( - pagetable_get_mfn(v->arch.monitor_table)); - ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT); - l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3]))); - sh_unmap_domain_page(l3e); - } + if ( shadow_mode_external(d) ) + { + if ( v == current ) + { + /* From the monitor tables, it's safe to use linear maps + * to update monitor l2s */ + l2e = __linear_l2_table + (3 * L2_PAGETABLE_ENTRIES); + } + else + { + /* Map the monitor table's high l2 */ + l3_pgentry_t *l3e; + l3e = sh_map_domain_page( + pagetable_get_mfn(v->arch.monitor_table)); + ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT); + l2e = sh_map_domain_page(_mfn(l3e_get_pfn(l3e[3]))); + unmap_l2e = 1; + sh_unmap_domain_page(l3e); + } + } else { /* Map the shadow table's high l2 */ ASSERT(shadow_l3e_get_flags(shadow_l3e[3]) & _PAGE_PRESENT); l2e = sh_map_domain_page(shadow_l3e_get_mfn(shadow_l3e[3])); + unmap_l2e = 1; } - - if ( !shadow_mode_external(d) ) - { - /* Write linear mapping of guest. */ + /* Write linear mapping of guest (only in PV, and only when + * not translated). */ + if ( !shadow_mode_translate(d) ) + { for ( i = 0; i < SHADOW_L3_PAGETABLE_ENTRIES; i++ ) - { - new_l2e = (shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT) - ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])), - __PAGE_HYPERVISOR) - : l2e_empty(); + { + new_l2e = + ((shadow_l3e_get_flags(guest_l3e[i]) & _PAGE_PRESENT) + ? l2e_from_pfn(mfn_x(shadow_l3e_get_mfn(guest_l3e[i])), + __PAGE_HYPERVISOR) + : l2e_empty()); safe_write_entry( &l2e[l2_table_offset(LINEAR_PT_VIRT_START) + i], &new_l2e); @@ -3416,9 +3521,8 @@ sh_update_linear_entries(struct vcpu *v) &new_l2e); } - if ( v != current || !shadow_mode_external(d) ) + if ( unmap_l2e ) sh_unmap_domain_page(l2e); - } #elif CONFIG_PAGING_LEVELS == 2 @@ -3521,16 +3625,24 @@ static void static void sh_detach_old_tables(struct vcpu *v) { + struct domain *d = v->domain; mfn_t smfn; //// //// vcpu->arch.guest_vtable //// - if ( (shadow_mode_external(v->domain) || (GUEST_PAGING_LEVELS == 3)) && - v->arch.guest_vtable ) - { - // Q: why does this need to use (un)map_domain_page_*global* ? 
- sh_unmap_domain_page_global(v->arch.guest_vtable); + if ( v->arch.guest_vtable ) + { +#if GUEST_PAGING_LEVELS == 4 + if ( shadow_mode_external(d) || shadow_mode_translate(d) ) + sh_unmap_domain_page_global(v->arch.guest_vtable); +#elif GUEST_PAGING_LEVELS == 3 + if ( 1 || shadow_mode_external(d) || shadow_mode_translate(d) ) + sh_unmap_domain_page_global(v->arch.guest_vtable); +#elif GUEST_PAGING_LEVELS == 2 + if ( shadow_mode_external(d) || shadow_mode_translate(d) ) + sh_unmap_domain_page_global(v->arch.guest_vtable); +#endif v->arch.guest_vtable = NULL; } @@ -3645,9 +3757,14 @@ sh_update_cr3(struct vcpu *v) //// //// vcpu->arch.guest_vtable //// +#if GUEST_PAGING_LEVELS == 4 + if ( shadow_mode_external(d) || shadow_mode_translate(d) ) + v->arch.guest_vtable = sh_map_domain_page_global(gmfn); + else + v->arch.guest_vtable = __linear_l4_table; +#elif GUEST_PAGING_LEVELS == 3 if ( shadow_mode_external(d) ) { -#if GUEST_PAGING_LEVELS == 3 if ( shadow_vcpu_mode_translate(v) ) /* Paging enabled: find where in the page the l3 table is */ guest_idx = guest_index((void *)hvm_get_guest_ctrl_reg(v, 3)); @@ -3658,25 +3775,21 @@ sh_update_cr3(struct vcpu *v) // Ignore the low 2 bits of guest_idx -- they are really just // cache control. guest_idx &= ~3; + // XXX - why does this need a global map? v->arch.guest_vtable = (guest_l3e_t *)sh_map_domain_page_global(gmfn) + guest_idx; + } + else + v->arch.guest_vtable = sh_map_domain_page_global(gmfn); +#elif GUEST_PAGING_LEVELS == 2 + if ( shadow_mode_external(d) || shadow_mode_translate(d) ) + v->arch.guest_vtable = sh_map_domain_page_global(gmfn); + else + v->arch.guest_vtable = __linear_l2_table; #else - // XXX - why does this need a global map? - v->arch.guest_vtable = sh_map_domain_page_global(gmfn); -#endif - } - else - { -#ifdef __x86_64__ - v->arch.guest_vtable = __linear_l4_table; -#elif GUEST_PAGING_LEVELS == 3 - // XXX - why does this need a global map? - v->arch.guest_vtable = sh_map_domain_page_global(gmfn); -#else - v->arch.guest_vtable = __linear_l2_table; -#endif - } +#error this should never happen +#endif #if 0 printk("%s %s %d gmfn=%05lx guest_vtable=%p\n", @@ -3743,6 +3856,17 @@ sh_update_cr3(struct vcpu *v) v->arch.shadow_vtable = __sh_linear_l2_table; #endif } + +#if (CONFIG_PAGING_LEVELS == 3) && (GUEST_PAGING_LEVELS == 3) + // Now that shadow_vtable is in place, check that the sl3e[3] is properly + // shadowed and installed in PAE PV guests... + if ( !shadow_mode_external(d) && + !(shadow_l3e_get_flags(((shadow_l3e_t *)v->arch.shadow_vtable)[3]) & + _PAGE_PRESENT) ) + { + sh_install_xen_entries_in_l3(v, gmfn, smfn); + } +#endif //// //// Take a ref to the new shadow table, and pin it. 
@@ -4049,7 +4173,7 @@ static inline void * emulate_map_dest(st mfn_t mfn; guest_walk_tables(v, vaddr, &gw, 1); - flags = accumulate_guest_flags(&gw); + flags = accumulate_guest_flags(v, &gw); gfn = guest_l1e_get_gfn(gw.eff_l1e); mfn = vcpu_gfn_to_mfn(v, gfn); sh_audit_gw(v, &gw); @@ -4453,6 +4577,8 @@ struct shadow_paging_mode sh_paging_mode .x86_emulate_cmpxchg8b = sh_x86_emulate_cmpxchg8b, .make_monitor_table = sh_make_monitor_table, .destroy_monitor_table = sh_destroy_monitor_table, + .guest_map_l1e = sh_guest_map_l1e, + .guest_get_eff_l1e = sh_guest_get_eff_l1e, #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC .guess_wrmap = sh_guess_wrmap, #endif diff -r 5f42b4824e45 -r b6ee084892da xen/arch/x86/mm/shadow/multi.h --- a/xen/arch/x86/mm/shadow/multi.h Thu Sep 28 17:09:11 2006 +0100 +++ b/xen/arch/x86/mm/shadow/multi.h Thu Sep 28 17:10:54 2006 +0100 @@ -103,6 +103,13 @@ SHADOW_INTERNAL_NAME(sh_audit_l4_table, (struct vcpu *v, mfn_t sl4mfn, mfn_t x); #endif +extern void * +SHADOW_INTERNAL_NAME(sh_guest_map_l1e, CONFIG_PAGING_LEVELS, CONFIG_PAGING_LEVELS) + (struct vcpu *v, unsigned long va, unsigned long *gl1mfn); +extern void +SHADOW_INTERNAL_NAME(sh_guest_get_eff_l1e, CONFIG_PAGING_LEVELS, CONFIG_PAGING_LEVELS) + (struct vcpu *v, unsigned long va, void *eff_l1e); + #if SHADOW_LEVELS == GUEST_LEVELS extern mfn_t SHADOW_INTERNAL_NAME(sh_make_monitor_table, SHADOW_LEVELS, GUEST_LEVELS) diff -r 5f42b4824e45 -r b6ee084892da xen/arch/x86/mm/shadow/private.h --- a/xen/arch/x86/mm/shadow/private.h Thu Sep 28 17:09:11 2006 +0100 +++ b/xen/arch/x86/mm/shadow/private.h Thu Sep 28 17:10:54 2006 +0100 @@ -532,55 +532,6 @@ static inline void sh_unpin(struct vcpu } } -/**************************************************************************/ -/* Guest physmap (p2m) support */ - -/* Read our own P2M table, checking in the linear pagetables first to be - * sure that we will succeed. Call this function if you expect it to - * fail often, as it avoids page faults. If you expect to succeed, use - * vcpu_gfn_to_mfn, which copy_from_user()s the entry */ -static inline mfn_t -vcpu_gfn_to_mfn_nofault(struct vcpu *v, unsigned long gfn) -{ - unsigned long entry_addr = (unsigned long) &phys_to_machine_mapping[gfn]; -#if CONFIG_PAGING_LEVELS >= 4 - l4_pgentry_t *l4e; - l3_pgentry_t *l3e; -#endif - l2_pgentry_t *l2e; - l1_pgentry_t *l1e; - - ASSERT(current == v); - if ( !shadow_vcpu_mode_translate(v) ) - return _mfn(gfn); - -#if CONFIG_PAGING_LEVELS > 2 - if ( gfn >= (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t) ) - /* This pfn is higher than the p2m map can hold */ - return _mfn(INVALID_MFN); -#endif - - /* Walk the linear pagetables. 
Note that this is *not* the same as - * the walk in sh_gfn_to_mfn_foreign, which is walking the p2m map */ -#if CONFIG_PAGING_LEVELS >= 4 - l4e = __linear_l4_table + l4_linear_offset(entry_addr); - if ( !(l4e_get_flags(*l4e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN); - l3e = __linear_l3_table + l3_linear_offset(entry_addr); - if ( !(l3e_get_flags(*l3e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN); -#endif - l2e = __linear_l2_table + l2_linear_offset(entry_addr); - if ( !(l2e_get_flags(*l2e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN); - l1e = __linear_l1_table + l1_linear_offset(entry_addr); - if ( !(l1e_get_flags(*l1e) & _PAGE_PRESENT) ) return _mfn(INVALID_MFN); - - /* Safe to look at this part of the table */ - if ( l1e_get_flags(phys_to_machine_mapping[gfn]) & _PAGE_PRESENT ) - return _mfn(l1e_get_pfn(phys_to_machine_mapping[gfn])); - - return _mfn(INVALID_MFN); -} - - #endif /* _XEN_SHADOW_PRIVATE_H */ /* diff -r 5f42b4824e45 -r b6ee084892da xen/arch/x86/mm/shadow/types.h --- a/xen/arch/x86/mm/shadow/types.h Thu Sep 28 17:09:11 2006 +0100 +++ b/xen/arch/x86/mm/shadow/types.h Thu Sep 28 17:10:54 2006 +0100 @@ -205,6 +205,9 @@ static inline shadow_l4e_t shadow_l4e_fr __sh_linear_l1_table; \ }) +// XXX -- these should not be conditional on hvm_guest(v), but rather on +// shadow_mode_external(d)... +// #define sh_linear_l2_table(v) ({ \ ASSERT(current == (v)); \ ((shadow_l2e_t *) \ @@ -507,10 +510,22 @@ struct shadow_walk_t #define sh_guess_wrmap INTERNAL_NAME(sh_guess_wrmap) #define sh_clear_shadow_entry INTERNAL_NAME(sh_clear_shadow_entry) +/* The sh_guest_(map|get)_* functions only depends on the number of config + * levels + */ +#define sh_guest_map_l1e \ + SHADOW_INTERNAL_NAME(sh_guest_map_l1e, \ + CONFIG_PAGING_LEVELS, \ + CONFIG_PAGING_LEVELS) +#define sh_guest_get_eff_l1e \ + SHADOW_INTERNAL_NAME(sh_guest_get_eff_l1e, \ + CONFIG_PAGING_LEVELS, \ + CONFIG_PAGING_LEVELS) + /* sh_make_monitor_table only depends on the number of shadow levels */ -#define sh_make_monitor_table \ - SHADOW_INTERNAL_NAME(sh_make_monitor_table, \ - SHADOW_PAGING_LEVELS, \ +#define sh_make_monitor_table \ + SHADOW_INTERNAL_NAME(sh_make_monitor_table, \ + SHADOW_PAGING_LEVELS, \ SHADOW_PAGING_LEVELS) #define sh_destroy_monitor_table \ SHADOW_INTERNAL_NAME(sh_destroy_monitor_table, \ @@ -652,7 +667,7 @@ static inline void sh_unpin_l3_subshadow #endif /* GUEST_PAGING_LEVELS >= 3 */ static inline u32 -accumulate_guest_flags(walk_t *gw) +accumulate_guest_flags(struct vcpu *v, walk_t *gw) { u32 accumulated_flags; @@ -674,8 +689,14 @@ accumulate_guest_flags(walk_t *gw) accumulated_flags &= guest_l4e_get_flags(*gw->l4e) ^ _PAGE_NX_BIT; #endif - // Finally, revert the NX bit back to its original polarity + // Revert the NX bit back to its original polarity accumulated_flags ^= _PAGE_NX_BIT; + + // In 64-bit PV guests, the _PAGE_USER bit is implied in all guest + // entries (since even the guest kernel runs in ring 3). 
+ // + if ( (GUEST_PAGING_LEVELS == 4) && !hvm_guest(v) ) + accumulated_flags |= _PAGE_USER; return accumulated_flags; } diff -r 5f42b4824e45 -r b6ee084892da xen/arch/x86/traps.c --- a/xen/arch/x86/traps.c Thu Sep 28 17:09:11 2006 +0100 +++ b/xen/arch/x86/traps.c Thu Sep 28 17:10:54 2006 +0100 @@ -886,7 +886,7 @@ static int fixup_page_fault(unsigned lon /* Do not check if access-protection fault since the page may legitimately be not present in shadow page tables */ ((regs->error_code & PFEC_write_access) == PFEC_write_access) && - ptwr_do_page_fault(d, addr, regs) ) + ptwr_do_page_fault(v, addr, regs) ) return EXCRET_fault_fixed; if ( shadow_mode_enabled(d) ) diff -r 5f42b4824e45 -r b6ee084892da xen/include/asm-x86/domain.h --- a/xen/include/asm-x86/domain.h Thu Sep 28 17:09:11 2006 +0100 +++ b/xen/include/asm-x86/domain.h Thu Sep 28 17:10:54 2006 +0100 @@ -139,7 +139,7 @@ struct shadow_vcpu { /* Last MFN that we emulated a write to. */ unsigned long last_emulated_mfn; /* HVM guest: paging enabled (CR0.PG)? */ - unsigned int hvm_paging_enabled:1; + unsigned int translate_enabled:1; /* Emulated fault needs to be propagated to guest? */ unsigned int propagate_fault:1; #if CONFIG_PAGING_LEVELS >= 3 diff -r 5f42b4824e45 -r b6ee084892da xen/include/asm-x86/guest_access.h --- a/xen/include/asm-x86/guest_access.h Thu Sep 28 17:09:11 2006 +0100 +++ b/xen/include/asm-x86/guest_access.h Thu Sep 28 17:10:54 2006 +0100 @@ -8,6 +8,7 @@ #define __ASM_X86_GUEST_ACCESS_H__ #include <asm/uaccess.h> +#include <asm/shadow.h> #include <asm/hvm/support.h> #include <asm/hvm/guest_access.h> @@ -33,7 +34,7 @@ #define copy_to_guest_offset(hnd, off, ptr, nr) ({ \ const typeof(ptr) _x = (hnd).p; \ const typeof(ptr) _y = (ptr); \ - hvm_guest(current) ? \ + shadow_mode_translate(current->domain) ? \ copy_to_user_hvm(_x+(off), _y, sizeof(*_x)*(nr)) : \ copy_to_user(_x+(off), _y, sizeof(*_x)*(nr)); \ }) @@ -45,7 +46,7 @@ #define copy_from_guest_offset(ptr, hnd, off, nr) ({ \ const typeof(ptr) _x = (hnd).p; \ const typeof(ptr) _y = (ptr); \ - hvm_guest(current) ? \ + shadow_mode_translate(current->domain) ? \ copy_from_user_hvm(_y, _x+(off), sizeof(*_x)*(nr)) :\ copy_from_user(_y, _x+(off), sizeof(*_x)*(nr)); \ }) @@ -54,7 +55,7 @@ #define copy_field_to_guest(hnd, ptr, field) ({ \ const typeof(&(ptr)->field) _x = &(hnd).p->field; \ const typeof(&(ptr)->field) _y = &(ptr)->field; \ - hvm_guest(current) ? \ + shadow_mode_translate(current->domain) ? \ copy_to_user_hvm(_x, _y, sizeof(*_x)) : \ copy_to_user(_x, _y, sizeof(*_x)); \ }) @@ -63,7 +64,7 @@ #define copy_field_from_guest(ptr, hnd, field) ({ \ const typeof(&(ptr)->field) _x = &(hnd).p->field; \ const typeof(&(ptr)->field) _y = &(ptr)->field; \ - hvm_guest(current) ? \ + shadow_mode_translate(current->domain) ? \ copy_from_user_hvm(_y, _x, sizeof(*_x)) : \ copy_from_user(_y, _x, sizeof(*_x)); \ }) @@ -73,12 +74,13 @@ * Allows use of faster __copy_* functions. */ #define guest_handle_okay(hnd, nr) \ - (hvm_guest(current) || array_access_ok((hnd).p, (nr), sizeof(*(hnd).p))) + (shadow_mode_external(current->domain) || \ + array_access_ok((hnd).p, (nr), sizeof(*(hnd).p))) #define __copy_to_guest_offset(hnd, off, ptr, nr) ({ \ const typeof(ptr) _x = (hnd).p; \ const typeof(ptr) _y = (ptr); \ - hvm_guest(current) ? \ + shadow_mode_translate(current->domain) ? 
\ copy_to_user_hvm(_x+(off), _y, sizeof(*_x)*(nr)) : \ __copy_to_user(_x+(off), _y, sizeof(*_x)*(nr)); \ }) @@ -86,7 +88,7 @@ #define __copy_from_guest_offset(ptr, hnd, off, nr) ({ \ const typeof(ptr) _x = (hnd).p; \ const typeof(ptr) _y = (ptr); \ - hvm_guest(current) ? \ + shadow_mode_translate(current->domain) ? \ copy_from_user_hvm(_y, _x+(off),sizeof(*_x)*(nr)) : \ __copy_from_user(_y, _x+(off), sizeof(*_x)*(nr)); \ }) @@ -94,7 +96,7 @@ #define __copy_field_to_guest(hnd, ptr, field) ({ \ const typeof(&(ptr)->field) _x = &(hnd).p->field; \ const typeof(&(ptr)->field) _y = &(ptr)->field; \ - hvm_guest(current) ? \ + shadow_mode_translate(current->domain) ? \ copy_to_user_hvm(_x, _y, sizeof(*_x)) : \ __copy_to_user(_x, _y, sizeof(*_x)); \ }) @@ -102,7 +104,7 @@ #define __copy_field_from_guest(ptr, hnd, field) ({ \ const typeof(&(ptr)->field) _x = &(hnd).p->field; \ const typeof(&(ptr)->field) _y = &(ptr)->field; \ - hvm_guest(current) ? \ + shadow_mode_translate(current->domain) ? \ copy_from_user_hvm(_x, _y, sizeof(*_x)) : \ __copy_from_user(_y, _x, sizeof(*_x)); \ }) diff -r 5f42b4824e45 -r b6ee084892da xen/include/asm-x86/mm.h --- a/xen/include/asm-x86/mm.h Thu Sep 28 17:09:11 2006 +0100 +++ b/xen/include/asm-x86/mm.h Thu Sep 28 17:10:54 2006 +0100 @@ -348,7 +348,7 @@ void memguard_unguard_range(void *p, uns void memguard_guard_stack(void *p); -int ptwr_do_page_fault(struct domain *, unsigned long, +int ptwr_do_page_fault(struct vcpu *, unsigned long, struct cpu_user_regs *); int audit_adjust_pgtables(struct domain *d, int dir, int noisy); diff -r 5f42b4824e45 -r b6ee084892da xen/include/asm-x86/shadow.h --- a/xen/include/asm-x86/shadow.h Thu Sep 28 17:09:11 2006 +0100 +++ b/xen/include/asm-x86/shadow.h Thu Sep 28 17:10:54 2006 +0100 @@ -26,6 +26,7 @@ #include <public/domctl.h> #include <xen/sched.h> #include <xen/perfc.h> +#include <xen/domain_page.h> #include <asm/flushtlb.h> /* How to make sure a page is not referred to in a shadow PT */ @@ -245,7 +246,9 @@ shadow_vcpu_mode_translate(struct vcpu * // enabled. (HVM vcpu's with paging disabled are using the p2m table as // its paging table, so no translation occurs in this case.) // - return v->arch.shadow.hvm_paging_enabled; + // It is also true for translated PV domains. 
+ // + return v->arch.shadow.translate_enabled; } @@ -287,6 +290,10 @@ struct shadow_paging_mode { struct x86_emulate_ctxt *ctxt); mfn_t (*make_monitor_table )(struct vcpu *v); void (*destroy_monitor_table )(struct vcpu *v, mfn_t mmfn); + void * (*guest_map_l1e )(struct vcpu *v, unsigned long va, + unsigned long *gl1mfn); + void (*guest_get_eff_l1e )(struct vcpu *v, unsigned long va, + void *eff_l1e); #if SHADOW_OPTIMIZATIONS & SHOPT_WRITABLE_HEURISTIC int (*guess_wrmap )(struct vcpu *v, unsigned long vaddr, mfn_t gmfn); @@ -452,9 +459,73 @@ shadow_destroy_monitor_table(struct vcpu v->arch.shadow.mode->destroy_monitor_table(v, mmfn); } +static inline void * +guest_map_l1e(struct vcpu *v, unsigned long addr, unsigned long *gl1mfn) +{ + if ( likely(!shadow_mode_translate(v->domain)) ) + { + l2_pgentry_t l2e; + ASSERT(!shadow_mode_external(v->domain)); + /* Find this l1e and its enclosing l1mfn in the linear map */ + if ( __copy_from_user(&l2e, + &__linear_l2_table[l2_linear_offset(addr)], + sizeof(l2_pgentry_t)) != 0 ) + return NULL; + /* Check flags that it will be safe to read the l1e */ + if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) + != _PAGE_PRESENT ) + return NULL; + *gl1mfn = l2e_get_pfn(l2e); + return &__linear_l1_table[l1_linear_offset(addr)]; + } + + return v->arch.shadow.mode->guest_map_l1e(v, addr, gl1mfn); +} + +static inline void +guest_unmap_l1e(struct vcpu *v, void *p) +{ + if ( unlikely(shadow_mode_translate(v->domain)) ) + unmap_domain_page(p); +} + +static inline void +guest_get_eff_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e) +{ + if ( likely(!shadow_mode_translate(v->domain)) ) + { + ASSERT(!shadow_mode_external(v->domain)); + if ( __copy_from_user(eff_l1e, + &__linear_l1_table[l1_linear_offset(addr)], + sizeof(l1_pgentry_t)) != 0 ) + *(l1_pgentry_t *)eff_l1e = l1e_empty(); + return; + } + + v->arch.shadow.mode->guest_get_eff_l1e(v, addr, eff_l1e); +} + +static inline void +guest_get_eff_kern_l1e(struct vcpu *v, unsigned long addr, void *eff_l1e) +{ +#if defined(__x86_64__) + int user_mode = !(v->arch.flags & TF_kernel_mode); +#define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v) +#else +#define TOGGLE_MODE() ((void)0) +#endif + + TOGGLE_MODE(); + guest_get_eff_l1e(v, addr, eff_l1e); + TOGGLE_MODE(); +} + + /* Validate a pagetable change from the guest and update the shadows. */ extern int shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn, void *new_guest_entry); +extern int __shadow_validate_guest_entry(struct vcpu *v, mfn_t gmfn, + void *entry, u32 size); /* Update the shadows in response to a pagetable write from a HVM guest */ extern void shadow_validate_guest_pt_write(struct vcpu *v, mfn_t gmfn, @@ -629,7 +700,14 @@ sh_mfn_to_gfn(struct domain *d, mfn_t mf return mfn_x(mfn); } - +static inline l1_pgentry_t +gl1e_to_ml1e(struct domain *d, l1_pgentry_t l1e) +{ + if ( unlikely(shadow_mode_translate(d)) ) + l1e = l1e_from_pfn(gmfn_to_mfn(d, l1e_get_pfn(l1e)), + l1e_get_flags(l1e)); + return l1e; +} #endif /* _XEN_SHADOW_H */ _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-changelog
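For readers following the new interface: the core pattern this changeset introduces is to map the guest L1 entry through guest_map_l1e() (the linear map for ordinary PV guests, a guest-pagetable walk for shadow-translate guests), write it through update_l1e() so any shadows are revalidated, and then release the mapping with guest_unmap_l1e(). The sketch below is not part of the changeset; it is modelled on the create_grant_va_mapping() hunk above and assumes the Xen-internal definitions already visible in xen/arch/x86/mm.c (struct vcpu, l1_pgentry_t, and the file-local update_l1e()), and the wrapper function name itself is hypothetical.

/* Illustrative sketch only -- not part of the changeset above.  It shows the
 * calling convention that replaces direct linear_pg_table access in this
 * patch, modelled on the create_grant_va_mapping() hunk.  Assumes the
 * definitions visible inside xen/arch/x86/mm.c; the wrapper name is made up. */
static int example_write_guest_l1e(struct vcpu *v, unsigned long va,
                                   l1_pgentry_t nl1e)
{
    l1_pgentry_t *pl1e, ol1e;
    unsigned long gl1mfn;
    int okay;

    /* Map the guest L1 entry covering va: the linear map for ordinary PV
     * guests, a full guest-pagetable walk for shadow-translate guests. */
    pl1e = guest_map_l1e(v, va, &gl1mfn);
    if ( pl1e == NULL )
        return 0;                /* no L1 table present for this address */

    ol1e = *pl1e;

    /* update_l1e() writes the entry and, when shadows are enabled, calls
     * shadow_validate_guest_entry() under the shadow lock so the shadows
     * stay in sync with the guest table. */
    okay = update_l1e(pl1e, ol1e, nl1e, gl1mfn, v);

    /* Only translated guests actually mapped a domain page here; the
     * helper is a no-op for guests using the linear map. */
    guest_unmap_l1e(v, pl1e);

    return okay;
}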