x86: re-introduce map_domain_page() et al

This is being done mostly in the form previously used on x86-32,
utilizing the second L3 page table slot within the per-domain mapping
area for those mappings. It remains to be determined whether that
concept is really suitable, or whether instead re-implementing at least
the non-global variant from scratch would be better.

Also add the helpers {clear,copy}_domain_page() as well as initial uses
of them.

One question is whether, to exercise the non-trivial code paths, we
shouldn't make the trivial shortcuts conditional upon NDEBUG being
defined. See the debugging patch at the end of the series.

Signed-off-by: Jan Beulich

--- a/xen/arch/x86/Makefile
+++ b/xen/arch/x86/Makefile
@@ -19,6 +19,7 @@ obj-bin-y += dmi_scan.init.o
 obj-y += domctl.o
 obj-y += domain.o
 obj-bin-y += domain_build.init.o
+obj-y += domain_page.o
 obj-y += e820.o
 obj-y += extable.o
 obj-y += flushtlb.o
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -397,10 +397,14 @@ int vcpu_initialise(struct vcpu *v)
             return -ENOMEM;
         clear_page(page_to_virt(pg));
         perdomain_pt_page(d, idx) = pg;
-        d->arch.mm_perdomain_l2[l2_table_offset(PERDOMAIN_VIRT_START)+idx]
+        d->arch.mm_perdomain_l2[0][l2_table_offset(PERDOMAIN_VIRT_START)+idx]
             = l2e_from_page(pg, __PAGE_HYPERVISOR);
     }
 
+    rc = mapcache_vcpu_init(v);
+    if ( rc )
+        return rc;
+
     paging_vcpu_init(v);
 
     v->arch.perdomain_ptes = perdomain_ptes(d, v);
@@ -526,8 +530,8 @@ int arch_domain_create(struct domain *d,
     pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
     if ( pg == NULL )
         goto fail;
-    d->arch.mm_perdomain_l2 = page_to_virt(pg);
-    clear_page(d->arch.mm_perdomain_l2);
+    d->arch.mm_perdomain_l2[0] = page_to_virt(pg);
+    clear_page(d->arch.mm_perdomain_l2[0]);
 
     pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(d)));
     if ( pg == NULL )
@@ -535,8 +539,10 @@ int arch_domain_create(struct domain *d,
     d->arch.mm_perdomain_l3 = page_to_virt(pg);
     clear_page(d->arch.mm_perdomain_l3);
     d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] =
-        l3e_from_page(virt_to_page(d->arch.mm_perdomain_l2),
-                      __PAGE_HYPERVISOR);
+        l3e_from_pfn(virt_to_mfn(d->arch.mm_perdomain_l2[0]),
+                     __PAGE_HYPERVISOR);
+
+    mapcache_domain_init(d);
 
     HYPERVISOR_COMPAT_VIRT_START(d) =
         is_hvm_domain(d) ? ~0u : __HYPERVISOR_COMPAT_VIRT_START;
@@ -609,8 +615,9 @@ int arch_domain_create(struct domain *d,
         free_xenheap_page(d->shared_info);
     if ( paging_initialised )
         paging_final_teardown(d);
-    if ( d->arch.mm_perdomain_l2 )
-        free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
+    mapcache_domain_exit(d);
+    if ( d->arch.mm_perdomain_l2[0] )
+        free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2[0]));
     if ( d->arch.mm_perdomain_l3 )
         free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
     if ( d->arch.mm_perdomain_pt_pages )
@@ -633,13 +640,15 @@ void arch_domain_destroy(struct domain *
 
     paging_final_teardown(d);
 
+    mapcache_domain_exit(d);
+
     for ( i = 0; i < PDPT_L2_ENTRIES; ++i )
     {
         if ( perdomain_pt_page(d, i) )
             free_domheap_page(perdomain_pt_page(d, i));
     }
     free_domheap_page(virt_to_page(d->arch.mm_perdomain_pt_pages));
-    free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2));
+    free_domheap_page(virt_to_page(d->arch.mm_perdomain_l2[0]));
     free_domheap_page(virt_to_page(d->arch.mm_perdomain_l3));
 
     free_xenheap_page(d->shared_info);
--- /dev/null
+++ b/xen/arch/x86/domain_page.c
@@ -0,0 +1,471 @@
+/******************************************************************************
+ * domain_page.h
+ *
+ * Allow temporary mapping of domain pages.
+ *
+ * Copyright (c) 2003-2006, Keir Fraser
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+static inline struct vcpu *mapcache_current_vcpu(void)
+{
+    /* In the common case we use the mapcache of the running VCPU. */
+    struct vcpu *v = current;
+
+    /*
+     * When current isn't properly set up yet, this is equivalent to
+     * running in an idle vCPU (callers must check for NULL).
+     */
+    if ( v == (struct vcpu *)0xfffff000 )
+        return NULL;
+
+    /*
+     * If guest_table is NULL, and we are running a paravirtualised guest,
+     * then it means we are running on the idle domain's page table and must
+     * therefore use its mapcache.
+     */
+    if ( unlikely(pagetable_is_null(v->arch.guest_table)) && !is_hvm_vcpu(v) )
+    {
+        /* If we really are idling, perform lazy context switch now. */
+        if ( (v = idle_vcpu[smp_processor_id()]) == current )
+            sync_local_execstate();
+        /* We must now be running on the idle page table. */
+        ASSERT(read_cr3() == __pa(idle_pg_table));
+    }
+
+    return v;
+}
+
+#define mapcache_l2_entry(e) ((e) >> PAGETABLE_ORDER)
+#define MAPCACHE_L2_ENTRIES (mapcache_l2_entry(MAPCACHE_ENTRIES - 1) + 1)
+#define DCACHE_L1ENT(dc, idx) \
+    ((dc)->l1tab[(idx) >> PAGETABLE_ORDER] \
+                [(idx) & ((1 << PAGETABLE_ORDER) - 1)])
+
+void *map_domain_page(unsigned long mfn)
+{
+    unsigned long flags;
+    unsigned int idx, i;
+    struct vcpu *v;
+    struct mapcache_domain *dcache;
+    struct mapcache_vcpu *vcache;
+    struct vcpu_maphash_entry *hashent;
+
+    if ( mfn <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
+        return mfn_to_virt(mfn);
+
+    v = mapcache_current_vcpu();
+    if ( !v || is_hvm_vcpu(v) )
+        return mfn_to_virt(mfn);
+
+    dcache = &v->domain->arch.pv_domain.mapcache;
+    vcache = &v->arch.pv_vcpu.mapcache;
+    if ( !dcache->l1tab )
+        return mfn_to_virt(mfn);
+
+    perfc_incr(map_domain_page_count);
+
+    local_irq_save(flags);
+
+    hashent = &vcache->hash[MAPHASH_HASHFN(mfn)];
+    if ( hashent->mfn == mfn )
+    {
+        idx = hashent->idx;
+        ASSERT(idx < dcache->entries);
+        hashent->refcnt++;
+        ASSERT(hashent->refcnt);
+        ASSERT(l1e_get_pfn(DCACHE_L1ENT(dcache, idx)) == mfn);
+        goto out;
+    }
+
+    spin_lock(&dcache->lock);
+
+    /* Has some other CPU caused a wrap? We must flush if so. */
+    if ( unlikely(dcache->epoch != vcache->shadow_epoch) )
+    {
+        vcache->shadow_epoch = dcache->epoch;
+        if ( NEED_FLUSH(this_cpu(tlbflush_time), dcache->tlbflush_timestamp) )
+        {
+            perfc_incr(domain_page_tlb_flush);
+            flush_tlb_local();
+        }
+    }
+
+    idx = find_next_zero_bit(dcache->inuse, dcache->entries, dcache->cursor);
+    if ( unlikely(idx >= dcache->entries) )
+    {
+        unsigned long accum = 0;
+
+        /* /First/, clean the garbage map and update the inuse list. */
+        for ( i = 0; i < BITS_TO_LONGS(dcache->entries); i++ )
+        {
+            dcache->inuse[i] &= ~xchg(&dcache->garbage[i], 0);
+            accum |= ~dcache->inuse[i];
+        }
+
+        if ( accum )
+            idx = find_first_zero_bit(dcache->inuse, dcache->entries);
+        else
+        {
+            /* Replace a hash entry instead. */
+            i = MAPHASH_HASHFN(mfn);
+            do {
+                hashent = &vcache->hash[i];
+                if ( hashent->idx != MAPHASHENT_NOTINUSE && !hashent->refcnt )
+                {
+                    idx = hashent->idx;
+                    ASSERT(l1e_get_pfn(DCACHE_L1ENT(dcache, idx)) ==
+                           hashent->mfn);
+                    l1e_write(&DCACHE_L1ENT(dcache, idx), l1e_empty());
+                    hashent->idx = MAPHASHENT_NOTINUSE;
+                    hashent->mfn = ~0UL;
+                    break;
+                }
+                if ( ++i == MAPHASH_ENTRIES )
+                    i = 0;
+            } while ( i != MAPHASH_HASHFN(mfn) );
+        }
+        BUG_ON(idx >= dcache->entries);
+
+        /* /Second/, flush TLBs. */
+        perfc_incr(domain_page_tlb_flush);
+        flush_tlb_local();
+        vcache->shadow_epoch = ++dcache->epoch;
+        dcache->tlbflush_timestamp = tlbflush_current_time();
+    }
+
+    set_bit(idx, dcache->inuse);
+    dcache->cursor = idx + 1;
+
+    spin_unlock(&dcache->lock);
+
+    l1e_write(&DCACHE_L1ENT(dcache, idx),
+              l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
+
+ out:
+    local_irq_restore(flags);
+    return (void *)MAPCACHE_VIRT_START + pfn_to_paddr(idx);
+}
+
+void unmap_domain_page(const void *ptr)
+{
+    unsigned int idx;
+    struct vcpu *v;
+    struct mapcache_domain *dcache;
+    unsigned long va = (unsigned long)ptr, mfn, flags;
+    struct vcpu_maphash_entry *hashent;
+
+    if ( va >= DIRECTMAP_VIRT_START )
+        return;
+
+    ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
+
+    v = mapcache_current_vcpu();
+    ASSERT(v && !is_hvm_vcpu(v));
+
+    dcache = &v->domain->arch.pv_domain.mapcache;
+    ASSERT(dcache->l1tab);
+
+    idx = PFN_DOWN(va - MAPCACHE_VIRT_START);
+    mfn = l1e_get_pfn(DCACHE_L1ENT(dcache, idx));
+    hashent = &v->arch.pv_vcpu.mapcache.hash[MAPHASH_HASHFN(mfn)];
+
+    local_irq_save(flags);
+
+    if ( hashent->idx == idx )
+    {
+        ASSERT(hashent->mfn == mfn);
+        ASSERT(hashent->refcnt);
+        hashent->refcnt--;
+    }
+    else if ( !hashent->refcnt )
+    {
+        if ( hashent->idx != MAPHASHENT_NOTINUSE )
+        {
+            /* /First/, zap the PTE. */
+            ASSERT(l1e_get_pfn(DCACHE_L1ENT(dcache, hashent->idx)) ==
+                   hashent->mfn);
+            l1e_write(&DCACHE_L1ENT(dcache, hashent->idx), l1e_empty());
+            /* /Second/, mark as garbage. */
+            set_bit(hashent->idx, dcache->garbage);
+        }
+
+        /* Add newly-freed mapping to the maphash. */
+        hashent->mfn = mfn;
+        hashent->idx = idx;
+    }
+    else
+    {
+        /* /First/, zap the PTE. */
+        l1e_write(&DCACHE_L1ENT(dcache, idx), l1e_empty());
+        /* /Second/, mark as garbage. */
+        set_bit(idx, dcache->garbage);
+    }
+
+    local_irq_restore(flags);
+}
+
+void clear_domain_page(unsigned long mfn)
+{
+    void *ptr = map_domain_page(mfn);
+
+    clear_page(ptr);
+    unmap_domain_page(ptr);
+}
+
+void copy_domain_page(unsigned long dmfn, unsigned long smfn)
+{
+    const void *src = map_domain_page(smfn);
+    void *dst = map_domain_page(dmfn);
+
+    copy_page(dst, src);
+    unmap_domain_page(dst);
+    unmap_domain_page(src);
+}
+
+int mapcache_domain_init(struct domain *d)
+{
+    struct mapcache_domain *dcache = &d->arch.pv_domain.mapcache;
+    unsigned int i, bitmap_pages, memf = MEMF_node(domain_to_node(d));
+    unsigned long *end;
+
+    if ( is_hvm_domain(d) || is_idle_domain(d) )
+        return 0;
+
+    if ( !mem_hotplug && max_page <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
+        return 0;
+
+    dcache->l1tab = xzalloc_array(l1_pgentry_t *, MAPCACHE_L2_ENTRIES + 1);
+    d->arch.mm_perdomain_l2[MAPCACHE_SLOT] = alloc_xenheap_pages(0, memf);
+    if ( !dcache->l1tab || !d->arch.mm_perdomain_l2[MAPCACHE_SLOT] )
+        return -ENOMEM;
+
+    clear_page(d->arch.mm_perdomain_l2[MAPCACHE_SLOT]);
+    d->arch.mm_perdomain_l3[l3_table_offset(MAPCACHE_VIRT_START)] =
+        l3e_from_paddr(__pa(d->arch.mm_perdomain_l2[MAPCACHE_SLOT]),
+                       __PAGE_HYPERVISOR);
+
+    BUILD_BUG_ON(MAPCACHE_VIRT_END + 3 +
+                 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long)) >
+                 MAPCACHE_VIRT_START + (PERDOMAIN_SLOT_MBYTES << 20));
+    bitmap_pages = PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long));
+    dcache->inuse = (void *)MAPCACHE_VIRT_END + PAGE_SIZE;
+    dcache->garbage = dcache->inuse +
+                      (bitmap_pages + 1) * PAGE_SIZE / sizeof(long);
+    end = dcache->garbage + bitmap_pages * PAGE_SIZE / sizeof(long);
+
+    for ( i = l2_table_offset((unsigned long)dcache->inuse);
+          i <= l2_table_offset((unsigned long)(end - 1)); ++i )
+    {
+        ASSERT(i <= MAPCACHE_L2_ENTRIES);
+        dcache->l1tab[i] = alloc_xenheap_pages(0, memf);
+        if ( !dcache->l1tab[i] )
+            return -ENOMEM;
+        clear_page(dcache->l1tab[i]);
+        d->arch.mm_perdomain_l2[MAPCACHE_SLOT][i] =
+            l2e_from_paddr(__pa(dcache->l1tab[i]), __PAGE_HYPERVISOR);
+    }
+
+    spin_lock_init(&dcache->lock);
+
+    return 0;
+}
+
+void mapcache_domain_exit(struct domain *d)
+{
+    struct mapcache_domain *dcache = &d->arch.pv_domain.mapcache;
+
+    if ( is_hvm_domain(d) )
+        return;
+
+    if ( dcache->l1tab )
+    {
+        unsigned long i;
+
+        for ( i = (unsigned long)dcache->inuse; ; i += PAGE_SIZE )
+        {
+            l1_pgentry_t *pl1e;
+
+            if ( l2_table_offset(i) > MAPCACHE_L2_ENTRIES ||
+                 !dcache->l1tab[l2_table_offset(i)] )
+                break;
+
+            pl1e = &dcache->l1tab[l2_table_offset(i)][l1_table_offset(i)];
+            if ( l1e_get_flags(*pl1e) )
+                free_domheap_page(l1e_get_page(*pl1e));
+        }
+
+        for ( i = 0; i < MAPCACHE_L2_ENTRIES + 1; ++i )
+            free_xenheap_page(dcache->l1tab[i]);
+
+        xfree(dcache->l1tab);
+    }
+
+    free_xenheap_page(d->arch.mm_perdomain_l2[MAPCACHE_SLOT]);
+}
+
+int mapcache_vcpu_init(struct vcpu *v)
+{
+    struct domain *d = v->domain;
+    struct mapcache_domain *dcache = &d->arch.pv_domain.mapcache;
+    unsigned long i;
+    unsigned int memf = MEMF_node(vcpu_to_node(v));
+
+    if ( is_hvm_vcpu(v) || !dcache->l1tab )
+        return 0;
+
+    while ( dcache->entries < d->max_vcpus * MAPCACHE_VCPU_ENTRIES )
+    {
+        unsigned int ents = dcache->entries + MAPCACHE_VCPU_ENTRIES;
+        l1_pgentry_t *pl1e;
+
+        /* Populate page tables. */
+        if ( !dcache->l1tab[i = mapcache_l2_entry(ents - 1)] )
+        {
+            dcache->l1tab[i] = alloc_xenheap_pages(0, memf);
+            if ( !dcache->l1tab[i] )
+                return -ENOMEM;
+            clear_page(dcache->l1tab[i]);
+            d->arch.mm_perdomain_l2[MAPCACHE_SLOT][i] =
+                l2e_from_paddr(__pa(dcache->l1tab[i]), __PAGE_HYPERVISOR);
+        }
+
+        /* Populate bit maps. */
+        i = (unsigned long)(dcache->inuse + BITS_TO_LONGS(ents));
+        pl1e = &dcache->l1tab[l2_table_offset(i)][l1_table_offset(i)];
+        if ( !l1e_get_flags(*pl1e) )
+        {
+            struct page_info *pg = alloc_domheap_page(NULL, memf);
+
+            if ( !pg )
+                return -ENOMEM;
+            clear_domain_page(page_to_mfn(pg));
+            *pl1e = l1e_from_page(pg, __PAGE_HYPERVISOR);
+
+            i = (unsigned long)(dcache->garbage + BITS_TO_LONGS(ents));
+            pl1e = &dcache->l1tab[l2_table_offset(i)][l1_table_offset(i)];
+            ASSERT(!l1e_get_flags(*pl1e));
+
+            pg = alloc_domheap_page(NULL, memf);
+            if ( !pg )
+                return -ENOMEM;
+            clear_domain_page(page_to_mfn(pg));
+            *pl1e = l1e_from_page(pg, __PAGE_HYPERVISOR);
+        }
+
+        dcache->entries = ents;
+    }
+
+    /* Mark all maphash entries as not in use. */
+    BUILD_BUG_ON(MAPHASHENT_NOTINUSE < MAPCACHE_ENTRIES);
+    for ( i = 0; i < MAPHASH_ENTRIES; i++ )
+    {
+        struct vcpu_maphash_entry *hashent = &v->arch.pv_vcpu.mapcache.hash[i];
+
+        hashent->mfn = ~0UL; /* never valid to map */
+        hashent->idx = MAPHASHENT_NOTINUSE;
+    }
+
+    return 0;
+}
+
+#define GLOBALMAP_BITS (GLOBALMAP_GBYTES << (30 - PAGE_SHIFT))
+static unsigned long inuse[BITS_TO_LONGS(GLOBALMAP_BITS)];
+static unsigned long garbage[BITS_TO_LONGS(GLOBALMAP_BITS)];
+static unsigned int inuse_cursor;
+static DEFINE_SPINLOCK(globalmap_lock);
+
+void *map_domain_page_global(unsigned long mfn)
+{
+    l1_pgentry_t *pl1e;
+    unsigned int idx, i;
+    unsigned long va;
+
+    ASSERT(!in_irq() && local_irq_is_enabled());
+
+    if ( mfn <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
+        return mfn_to_virt(mfn);
+
+    spin_lock(&globalmap_lock);
+
+    idx = find_next_zero_bit(inuse, GLOBALMAP_BITS, inuse_cursor);
+    va = GLOBALMAP_VIRT_START + pfn_to_paddr(idx);
+    if ( unlikely(va >= GLOBALMAP_VIRT_END) )
+    {
+        /* /First/, clean the garbage map and update the inuse list. */
+        for ( i = 0; i < ARRAY_SIZE(garbage); i++ )
+            inuse[i] &= ~xchg(&garbage[i], 0);
+
+        /* /Second/, flush all TLBs to get rid of stale garbage mappings. */
+        flush_tlb_all();
+
+        idx = find_first_zero_bit(inuse, GLOBALMAP_BITS);
+        va = GLOBALMAP_VIRT_START + pfn_to_paddr(idx);
+        if ( unlikely(va >= GLOBALMAP_VIRT_END) )
+        {
+            spin_unlock(&globalmap_lock);
+            return NULL;
+        }
+    }
+
+    set_bit(idx, inuse);
+    inuse_cursor = idx + 1;
+
+    spin_unlock(&globalmap_lock);
+
+    pl1e = virt_to_xen_l1e(va);
+    if ( !pl1e )
+        return NULL;
+    l1e_write(pl1e, l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
+
+    return (void *)va;
+}
+
+void unmap_domain_page_global(const void *ptr)
+{
+    unsigned long va = (unsigned long)ptr;
+    l1_pgentry_t *pl1e;
+
+    if ( va >= DIRECTMAP_VIRT_START )
+        return;
+
+    ASSERT(va >= GLOBALMAP_VIRT_START && va < GLOBALMAP_VIRT_END);
+
+    /* /First/, we zap the PTE. */
+    pl1e = virt_to_xen_l1e(va);
+    BUG_ON(!pl1e);
+    l1e_write(pl1e, l1e_empty());
+
+    /* /Second/, we add to the garbage map. */
+    set_bit(PFN_DOWN(va - GLOBALMAP_VIRT_START), garbage);
+}
+
+/* Translate a map-domain-page'd address to the underlying MFN */
+unsigned long domain_page_map_to_mfn(const void *ptr)
+{
+    unsigned long va = (unsigned long)ptr;
+    const l1_pgentry_t *pl1e;
+
+    if ( va >= DIRECTMAP_VIRT_START )
+        return virt_to_mfn(ptr);
+
+    if ( va >= GLOBALMAP_VIRT_START && va < GLOBALMAP_VIRT_END )
+    {
+        pl1e = virt_to_xen_l1e(va);
+        BUG_ON(!pl1e);
+    }
+    else
+    {
+        ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
+        pl1e = &__linear_l1_table[l1_linear_offset(va)];
+    }
+
+    return l1e_get_pfn(*pl1e);
+}
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -2661,9 +2661,6 @@ static inline int vcpumask_to_pcpumask(
         }
     }
 
-#define fixmap_domain_page(mfn) mfn_to_virt(mfn)
-#define fixunmap_domain_page(ptr) ((void)(ptr))
-
 long do_mmuext_op(
     XEN_GUEST_HANDLE_PARAM(mmuext_op_t) uops,
     unsigned int count,
@@ -2983,7 +2980,6 @@ long do_mmuext_op(
 
         case MMUEXT_CLEAR_PAGE: {
             struct page_info *page;
-            unsigned char *ptr;
 
             page = get_page_from_gfn(d, op.arg1.mfn, NULL, P2M_ALLOC);
             if ( !page || !get_page_type(page, PGT_writable_page) )
@@ -2998,9 +2994,7 @@ long do_mmuext_op(
             /* A page is dirtied when it's being cleared. */
             paging_mark_dirty(d, page_to_mfn(page));
 
-            ptr = fixmap_domain_page(page_to_mfn(page));
-            clear_page(ptr);
-            fixunmap_domain_page(ptr);
+            clear_domain_page(page_to_mfn(page));
 
             put_page_and_type(page);
             break;
@@ -3008,8 +3002,6 @@ long do_mmuext_op(
 
         case MMUEXT_COPY_PAGE:
         {
-            const unsigned char *src;
-            unsigned char *dst;
             struct page_info *src_page, *dst_page;
 
             src_page = get_page_from_gfn(d, op.arg2.src_mfn, NULL, P2M_ALLOC);
@@ -3034,11 +3026,7 @@ long do_mmuext_op(
             /* A page is dirtied when it's being copied to. */
             paging_mark_dirty(d, page_to_mfn(dst_page));
 
-            src = __map_domain_page(src_page);
-            dst = fixmap_domain_page(page_to_mfn(dst_page));
-            copy_page(dst, src);
-            fixunmap_domain_page(dst);
-            unmap_domain_page(src);
+            copy_domain_page(page_to_mfn(dst_page), page_to_mfn(src_page));
 
             put_page_and_type(dst_page);
             put_page(src_page);
--- a/xen/include/asm-x86/config.h
+++ b/xen/include/asm-x86/config.h
@@ -27,6 +27,7 @@
 #define CONFIG_DISCONTIGMEM 1
 #define CONFIG_NUMA_EMU 1
 #define CONFIG_PAGEALLOC_MAX_ORDER (2 * PAGETABLE_ORDER)
+#define CONFIG_DOMAIN_PAGE 1
 
 /* Intel P4 currently has largest cache line (L2 line size is 128 bytes). */
 #define CONFIG_X86_L1_CACHE_SHIFT 7
@@ -147,12 +148,14 @@ extern unsigned char boot_edid_info[128]
  * 0xffff82c000000000 - 0xffff82c3ffffffff [16GB, 2^34 bytes, PML4:261]
  *   vmap()/ioremap()/fixmap area.
  * 0xffff82c400000000 - 0xffff82c43fffffff [1GB, 2^30 bytes, PML4:261]
- *   Compatibility machine-to-phys translation table.
+ *   Global domain page map area.
  * 0xffff82c440000000 - 0xffff82c47fffffff [1GB, 2^30 bytes, PML4:261]
- *   High read-only compatibility machine-to-phys translation table.
+ *   Compatibility machine-to-phys translation table.
  * 0xffff82c480000000 - 0xffff82c4bfffffff [1GB, 2^30 bytes, PML4:261]
+ *   High read-only compatibility machine-to-phys translation table.
+ * 0xffff82c4c0000000 - 0xffff82c4ffffffff [1GB, 2^30 bytes, PML4:261]
  *   Xen text, static data, bss.
- * 0xffff82c4c0000000 - 0xffff82dffbffffff [109GB - 64MB, PML4:261]
+ * 0xffff82c500000000 - 0xffff82dffbffffff [108GB - 64MB, PML4:261]
  *   Reserved for future use.
  * 0xffff82dffc000000 - 0xffff82dfffffffff [64MB, 2^26 bytes, PML4:261]
  *   Super-page information array.
@@ -201,18 +204,24 @@ extern unsigned char boot_edid_info[128]
 /* Slot 259: linear page table (shadow table). */
 #define SH_LINEAR_PT_VIRT_START (PML4_ADDR(259))
 #define SH_LINEAR_PT_VIRT_END (SH_LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES)
-/* Slot 260: per-domain mappings. */
+/* Slot 260: per-domain mappings (including map cache). */
 #define PERDOMAIN_VIRT_START (PML4_ADDR(260))
-#define PERDOMAIN_VIRT_END (PERDOMAIN_VIRT_START + (PERDOMAIN_MBYTES<<20))
-#define PERDOMAIN_MBYTES (PML4_ENTRY_BYTES >> (20 + PAGETABLE_ORDER))
+#define PERDOMAIN_SLOT_MBYTES (PML4_ENTRY_BYTES >> (20 + PAGETABLE_ORDER))
+#define PERDOMAIN_SLOTS 2
+#define PERDOMAIN_VIRT_SLOT(s) (PERDOMAIN_VIRT_START + (s) * \
+                                (PERDOMAIN_SLOT_MBYTES << 20))
 /* Slot 261: machine-to-phys conversion table (256GB). */
 #define RDWR_MPT_VIRT_START (PML4_ADDR(261))
 #define RDWR_MPT_VIRT_END (RDWR_MPT_VIRT_START + MPT_VIRT_SIZE)
 /* Slot 261: vmap()/ioremap()/fixmap area (16GB). */
 #define VMAP_VIRT_START RDWR_MPT_VIRT_END
 #define VMAP_VIRT_END (VMAP_VIRT_START + GB(16))
+/* Slot 261: global domain page map area (1GB). */
+#define GLOBALMAP_GBYTES 1
+#define GLOBALMAP_VIRT_START VMAP_VIRT_END
+#define GLOBALMAP_VIRT_END (GLOBALMAP_VIRT_START + (GLOBALMAP_GBYTES<<30))
 /* Slot 261: compatibility machine-to-phys conversion table (1GB). */
-#define RDWR_COMPAT_MPT_VIRT_START VMAP_VIRT_END
+#define RDWR_COMPAT_MPT_VIRT_START GLOBALMAP_VIRT_END
 #define RDWR_COMPAT_MPT_VIRT_END (RDWR_COMPAT_MPT_VIRT_START + GB(1))
 /* Slot 261: high read-only compat machine-to-phys conversion table (1GB). */
 #define HIRO_COMPAT_MPT_VIRT_START RDWR_COMPAT_MPT_VIRT_END
@@ -279,9 +288,9 @@ extern unsigned long xen_phys_start;
 /* GDT/LDT shadow mapping area. The first per-domain-mapping sub-area. */
 #define GDT_LDT_VCPU_SHIFT 5
 #define GDT_LDT_VCPU_VA_SHIFT (GDT_LDT_VCPU_SHIFT + PAGE_SHIFT)
-#define GDT_LDT_MBYTES PERDOMAIN_MBYTES
+#define GDT_LDT_MBYTES PERDOMAIN_SLOT_MBYTES
 #define MAX_VIRT_CPUS (GDT_LDT_MBYTES << (20-GDT_LDT_VCPU_VA_SHIFT))
-#define GDT_LDT_VIRT_START PERDOMAIN_VIRT_START
+#define GDT_LDT_VIRT_START PERDOMAIN_VIRT_SLOT(0)
 #define GDT_LDT_VIRT_END (GDT_LDT_VIRT_START + (GDT_LDT_MBYTES << 20))
 
 /* The address of a particular VCPU's GDT or LDT. */
@@ -290,8 +299,16 @@ extern unsigned long xen_phys_start;
 #define LDT_VIRT_START(v) \
     (GDT_VIRT_START(v) + (64*1024))
 
+/* map_domain_page() map cache. The last per-domain-mapping sub-area. */
+#define MAPCACHE_VCPU_ENTRIES (CONFIG_PAGING_LEVELS * CONFIG_PAGING_LEVELS)
+#define MAPCACHE_ENTRIES (MAX_VIRT_CPUS * MAPCACHE_VCPU_ENTRIES)
+#define MAPCACHE_SLOT (PERDOMAIN_SLOTS - 1)
+#define MAPCACHE_VIRT_START PERDOMAIN_VIRT_SLOT(MAPCACHE_SLOT)
+#define MAPCACHE_VIRT_END (MAPCACHE_VIRT_START + \
+                           MAPCACHE_ENTRIES * PAGE_SIZE)
+
 #define PDPT_L1_ENTRIES \
-    ((PERDOMAIN_VIRT_END - PERDOMAIN_VIRT_START) >> PAGE_SHIFT)
+    ((PERDOMAIN_VIRT_SLOT(PERDOMAIN_SLOTS - 1) - PERDOMAIN_VIRT_START) >> PAGE_SHIFT)
 #define PDPT_L2_ENTRIES \
     ((PDPT_L1_ENTRIES + (1 << PAGETABLE_ORDER) - 1) >> PAGETABLE_ORDER)
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -39,7 +39,7 @@ struct trap_bounce {
 #define MAPHASH_ENTRIES 8
 #define MAPHASH_HASHFN(pfn) ((pfn) & (MAPHASH_ENTRIES-1))
-#define MAPHASHENT_NOTINUSE ((u16)~0U)
+#define MAPHASHENT_NOTINUSE ((u32)~0U)
 
 struct mapcache_vcpu {
     /* Shadow of mapcache_domain.epoch. */
     unsigned int shadow_epoch;
@@ -47,16 +47,15 @@ struct mapcache_vcpu {
     /* Lock-free per-VCPU hash of recently-used mappings. */
     struct vcpu_maphash_entry {
         unsigned long mfn;
-        uint16_t idx;
-        uint16_t refcnt;
+        uint32_t idx;
+        uint32_t refcnt;
     } hash[MAPHASH_ENTRIES];
 };
 
-#define MAPCACHE_ORDER 10
-#define MAPCACHE_ENTRIES (1 << MAPCACHE_ORDER)
 struct mapcache_domain {
     /* The PTEs that provide the mappings, and a cursor into the array. */
-    l1_pgentry_t *l1tab;
+    l1_pgentry_t **l1tab;
+    unsigned int entries;
     unsigned int cursor;
 
     /* Protects map_domain_page(). */
@@ -67,12 +66,13 @@ struct mapcache_domain {
     u32 tlbflush_timestamp;
 
     /* Which mappings are in use, and which are garbage to reap next epoch? */
-    unsigned long inuse[BITS_TO_LONGS(MAPCACHE_ENTRIES)];
-    unsigned long garbage[BITS_TO_LONGS(MAPCACHE_ENTRIES)];
+    unsigned long *inuse;
+    unsigned long *garbage;
 };
 
-void mapcache_domain_init(struct domain *);
-void mapcache_vcpu_init(struct vcpu *);
+int mapcache_domain_init(struct domain *);
+void mapcache_domain_exit(struct domain *);
+int mapcache_vcpu_init(struct vcpu *);
 
 /* x86/64: toggle guest between kernel and user modes. */
 void toggle_guest_mode(struct vcpu *);
@@ -229,6 +229,9 @@ struct pv_domain
      * unmask the event channel */
     bool_t auto_unmask;
 
+    /* map_domain_page() mapping cache. */
+    struct mapcache_domain mapcache;
+
     /* Pseudophysical e820 map (XENMEM_memory_map). */
     spinlock_t e820_lock;
     struct e820entry *e820;
 
@@ -238,7 +241,7 @@ struct pv_domain
 struct arch_domain
 {
     struct page_info **mm_perdomain_pt_pages;
-    l2_pgentry_t *mm_perdomain_l2;
+    l2_pgentry_t *mm_perdomain_l2[PERDOMAIN_SLOTS];
     l3_pgentry_t *mm_perdomain_l3;
 
     unsigned int hv_compat_vstart;
@@ -324,6 +327,9 @@ struct arch_domain
 
 struct pv_vcpu
 {
+    /* map_domain_page() mapping cache. */
+    struct mapcache_vcpu mapcache;
+
     struct trap_info *trap_ctxt;
 
     unsigned long gdt_frames[FIRST_RESERVED_GDT_PAGE];
--- a/xen/include/xen/domain_page.h
+++ b/xen/include/xen/domain_page.h
@@ -25,11 +25,16 @@ void *map_domain_page(unsigned long mfn)
  */
 void unmap_domain_page(const void *va);
 
+/*
+ * Clear a given page frame, or copy between two of them.
+ */
+void clear_domain_page(unsigned long mfn);
+void copy_domain_page(unsigned long dmfn, unsigned long smfn);
+
 /*
  * Given a VA from map_domain_page(), return its underlying MFN.
  */
-unsigned long domain_page_map_to_mfn(void *va);
+unsigned long domain_page_map_to_mfn(const void *va);
 
 /*
  * Similar to the above calls, except the mapping is accessible in all
@@ -107,6 +112,9 @@ domain_mmap_cache_destroy(struct domain_
 #define map_domain_page(mfn)                mfn_to_virt(mfn)
 #define __map_domain_page(pg)               page_to_virt(pg)
 #define unmap_domain_page(va)               ((void)(va))
+#define clear_domain_page(mfn)              clear_page(mfn_to_virt(mfn))
+#define copy_domain_page(dmfn, smfn)        copy_page(mfn_to_virt(dmfn), \
+                                                      mfn_to_virt(smfn))
 #define domain_page_map_to_mfn(va)          virt_to_mfn((unsigned long)(va))
 
 #define map_domain_page_global(mfn)         mfn_to_virt(mfn)
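
[Editor's illustration, not part of the patch: a minimal usage sketch of the re-introduced interface. The wrapper function below is hypothetical and only shows the intended calling pattern of the routines added/declared above. Every map_domain_page() must be balanced by an unmap_domain_page() of the address it returned; clear_domain_page()/copy_domain_page() hide that pairing for the two common whole-page operations; map_domain_page_global()/unmap_domain_page_global() are reserved for the rarer long-lived mappings that have to stay valid in every address space.]

    /* Hypothetical example only -- not part of this patch. */
    static void example_scrub_and_duplicate(unsigned long dst_mfn,
                                            unsigned long src_mfn)
    {
        void *va = map_domain_page(dst_mfn);  /* short-lived, per-vCPU mapping */

        clear_page(va);                       /* same effect as clear_domain_page(dst_mfn) */
        unmap_domain_page(va);                /* must balance the map above */

        copy_domain_page(dst_mfn, src_mfn);   /* maps both frames, copies, unmaps them */
    }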