
[Xen-changelog] [xen-unstable] x86/mm/p2m: break into common, pt-implementation and pod parts.



# HG changeset patch
# User Tim Deegan <Tim.Deegan@xxxxxxxxxx>
# Date 1304676935 -3600
# Node ID 19452acd23045f40c4e18437f0a60f016757e5bd
# Parent  d9982136d8fa5e7ffb0d6ddb23668a7040c71d5b
x86/mm/p2m: break into common, pt-implementation and pod parts.

Start to make a clearer distinction between generic p2m functions and
the implementation of the datastructure as an x86 pagetable.
Also move the EPT datastructure implementation into x86/mm/ to match,
and split the PoD admin code into its own file.

This is just code motion, except for splitting the p2m_initialise
function into a pt-specific part and a common part.

Signed-off-by: Tim Deegan <Tim.Deegan@xxxxxxxxxx>
---
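A rough sketch of the p2m_initialise split described above, for readers who
only have this excerpt (the p2m.c and p2m-pt.c hunks are not quoted below).
This is an illustration of the intended shape, not the patch itself:
p2m_pt_init() and the p2m_pt_* hooks are hypothetical placeholder names, and
the generic helpers (p2m_lock_init, INIT_PAGE_LIST_HEAD) are written from
memory.  Only ept_p2m_init() and the p2m_domain hook fields it fills in
actually appear in the hunks below.

/* Illustrative sketch only -- names marked below are placeholders. */
static void p2m_pt_init(struct p2m_domain *p2m)
{
    /* Pagetable-based implementation: install its entry accessors.
     * The p2m_pt_* names are hypothetical stand-ins for the real
     * functions that now live in p2m-pt.c. */
    p2m->set_entry = p2m_pt_set_entry;
    p2m->get_entry = p2m_pt_get_entry;
    p2m->get_entry_current = p2m_pt_get_entry_current;
    p2m->change_entry_type_global = p2m_pt_change_entry_type_global;
}

static void p2m_initialise(struct domain *d, struct p2m_domain *p2m)
{
    /* Common part: generic state that every implementation needs. */
    memset(p2m, 0, sizeof(*p2m));
    p2m_lock_init(p2m);
    INIT_PAGE_LIST_HEAD(&p2m->pages);

    p2m->domain = d;
    p2m->default_access = p2m_access_rwx;

    /* Implementation-specific part: EPT when HAP is in use on VMX,
     * otherwise the x86 pagetable implementation now in p2m-pt.c. */
    if ( hap_enabled(d) && cpu_has_vmx )
        ept_p2m_init(p2m);
    else
        p2m_pt_init(p2m);
}

The pattern mirrors ept_p2m_init() further down, which fills in the same four
p2m_domain hooks for the EPT implementation.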


diff -r d9982136d8fa -r 19452acd2304 xen/arch/x86/mm/Makefile
--- a/xen/arch/x86/mm/Makefile  Mon May 09 15:00:57 2011 +0100
+++ b/xen/arch/x86/mm/Makefile  Fri May 06 11:15:35 2011 +0100
@@ -2,7 +2,7 @@
 subdir-y += hap
 
 obj-y += paging.o
-obj-y += p2m.o
+obj-y += p2m.o p2m-pt.o p2m-ept.o p2m-pod.o
 obj-y += guest_walk_2.o
 obj-y += guest_walk_3.o
 obj-$(x86_64) += guest_walk_4.o
diff -r d9982136d8fa -r 19452acd2304 xen/arch/x86/mm/hap/Makefile
--- a/xen/arch/x86/mm/hap/Makefile      Mon May 09 15:00:57 2011 +0100
+++ b/xen/arch/x86/mm/hap/Makefile      Fri May 06 11:15:35 2011 +0100
@@ -2,7 +2,6 @@
 obj-y += guest_walk_2level.o
 obj-y += guest_walk_3level.o
 obj-$(x86_64) += guest_walk_4level.o
-obj-y += p2m-ept.o
 obj-y += nested_hap.o
 
 guest_walk_%level.o: guest_walk.c Makefile
diff -r d9982136d8fa -r 19452acd2304 xen/arch/x86/mm/hap/p2m-ept.c
--- a/xen/arch/x86/mm/hap/p2m-ept.c     Mon May 09 15:00:57 2011 +0100
+++ /dev/null   Thu Jan 01 00:00:00 1970 +0000
@@ -1,911 +0,0 @@
-/*
- * ept-p2m.c: use the EPT page table as p2m
- * Copyright (c) 2007, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- */
-
-#include <xen/config.h>
-#include <xen/domain_page.h>
-#include <xen/sched.h>
-#include <asm/current.h>
-#include <asm/paging.h>
-#include <asm/types.h>
-#include <asm/domain.h>
-#include <asm/p2m.h>
-#include <asm/hvm/vmx/vmx.h>
-#include <asm/hvm/vmx/vmcs.h>
-#include <xen/iommu.h>
-#include <asm/mtrr.h>
-#include <asm/hvm/cacheattr.h>
-#include <xen/keyhandler.h>
-#include <xen/softirq.h>
-
-#define atomic_read_ept_entry(__pepte)                              \
-    ( (ept_entry_t) { .epte = atomic_read64(&(__pepte)->epte) } )
-#define atomic_write_ept_entry(__pepte, __epte)                     \
-    atomic_write64(&(__pepte)->epte, (__epte).epte)
-
-#define is_epte_present(ept_entry)      ((ept_entry)->epte & 0x7)
-#define is_epte_superpage(ept_entry)    ((ept_entry)->sp)
-
-/* Non-ept "lock-and-check" wrapper */
-static int ept_pod_check_and_populate(struct p2m_domain *p2m, unsigned long gfn,
-                                      ept_entry_t *entry, int order,
-                                      p2m_query_t q)
-{
-    /* Only take the lock if we don't already have it.  Otherwise it
-     * wouldn't be safe to do p2m lookups with the p2m lock held */
-    int do_locking = !p2m_locked_by_me(p2m);
-    int r;
-
-    if ( do_locking )
-        p2m_lock(p2m);
-
-    /* Check to make sure this is still PoD */
-    if ( entry->sa_p2mt != p2m_populate_on_demand )
-    {
-        if ( do_locking )
-            p2m_unlock(p2m);
-        return 0;
-    }
-
-    r = p2m_pod_demand_populate(p2m, gfn, order, q);
-
-    if ( do_locking )
-        p2m_unlock(p2m);
-
-    return r;
-}
-
-static void ept_p2m_type_to_flags(ept_entry_t *entry, p2m_type_t type, p2m_access_t access)
-{
-    /* First apply type permissions */
-    switch(type)
-    {
-        case p2m_invalid:
-        case p2m_mmio_dm:
-        case p2m_populate_on_demand:
-        case p2m_ram_paging_out:
-        case p2m_ram_paged:
-        case p2m_ram_paging_in:
-        case p2m_ram_paging_in_start:
-        default:
-            entry->r = entry->w = entry->x = 0;
-            break;
-        case p2m_ram_rw:
-            entry->r = entry->w = entry->x = 1;
-            break;
-        case p2m_mmio_direct:
-            entry->r = entry->x = 1;
-            entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
-                                                    entry->mfn);
-            break;
-        case p2m_ram_logdirty:
-        case p2m_ram_ro:
-        case p2m_ram_shared:
-            entry->r = entry->x = 1;
-            entry->w = 0;
-            break;
-        case p2m_grant_map_rw:
-            entry->r = entry->w = 1;
-            entry->x = 0;
-            break;
-        case p2m_grant_map_ro:
-            entry->r = 1;
-            entry->w = entry->x = 0;
-            break;
-    }
-
-
-    /* Then restrict with access permissions */
-    switch (access) 
-    {
-        case p2m_access_n:
-            entry->r = entry->w = entry->x = 0;
-            break;
-        case p2m_access_r:
-            entry->w = entry->x = 0;
-            break;
-        case p2m_access_w:
-            entry->r = entry->x = 0;
-            break;
-        case p2m_access_x:
-            entry->r = entry->w = 0;
-            break;
-        case p2m_access_rx:
-        case p2m_access_rx2rw:
-            entry->w = 0;
-            break;
-        case p2m_access_wx:
-            entry->r = 0;
-            break;
-        case p2m_access_rw:
-            entry->x = 0;
-            break;           
-        case p2m_access_rwx:
-            break;
-    }
-    
-}
-
-#define GUEST_TABLE_MAP_FAILED  0
-#define GUEST_TABLE_NORMAL_PAGE 1
-#define GUEST_TABLE_SUPER_PAGE  2
-#define GUEST_TABLE_POD_PAGE    3
-
-/* Fill in middle levels of ept table */
-static int ept_set_middle_entry(struct p2m_domain *p2m, ept_entry_t *ept_entry)
-{
-    struct page_info *pg;
-
-    pg = p2m_alloc_ptp(p2m, 0);
-    if ( pg == NULL )
-        return 0;
-
-    ept_entry->epte = 0;
-    ept_entry->mfn = page_to_mfn(pg);
-    ept_entry->access = p2m->default_access;
-
-    ept_entry->r = ept_entry->w = ept_entry->x = 1;
-
-    return 1;
-}
-
-/* free ept sub tree behind an entry */
-void ept_free_entry(struct p2m_domain *p2m, ept_entry_t *ept_entry, int level)
-{
-    /* End if the entry is a leaf entry. */
-    if ( level == 0 || !is_epte_present(ept_entry) ||
-         is_epte_superpage(ept_entry) )
-        return;
-
-    if ( level > 1 )
-    {
-        ept_entry_t *epte = map_domain_page(ept_entry->mfn);
-        for ( int i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
-            ept_free_entry(p2m, epte + i, level - 1);
-        unmap_domain_page(epte);
-    }
-    
-    p2m_free_ptp(p2m, mfn_to_page(ept_entry->mfn));
-}
-
-static int ept_split_super_page(struct p2m_domain *p2m, ept_entry_t *ept_entry,
-                                int level, int target)
-{
-    ept_entry_t new_ept, *table;
-    uint64_t trunk;
-    int rv = 1;
-
-    /* End if the entry is a leaf entry or reaches the target level. */
-    if ( level == 0 || level == target )
-        return rv;
-
-    ASSERT(is_epte_superpage(ept_entry));
-
-    if ( !ept_set_middle_entry(p2m, &new_ept) )
-        return 0;
-
-    table = map_domain_page(new_ept.mfn);
-    trunk = 1UL << ((level - 1) * EPT_TABLE_ORDER);
-
-    for ( int i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
-    {
-        ept_entry_t *epte = table + i;
-
-        epte->epte = 0;
-        epte->emt = ept_entry->emt;
-        epte->ipat = ept_entry->ipat;
-        epte->sp = (level > 1) ? 1 : 0;
-        epte->access = ept_entry->access;
-        epte->sa_p2mt = ept_entry->sa_p2mt;
-        epte->mfn = ept_entry->mfn + i * trunk;
-        epte->rsvd2_snp = ( iommu_enabled && iommu_snoop ) ? 1 : 0;
-
-        ept_p2m_type_to_flags(epte, epte->sa_p2mt, epte->access);
-
-        if ( (level - 1) == target )
-            continue;
-
-        ASSERT(is_epte_superpage(epte));
-
-        if ( !(rv = ept_split_super_page(p2m, epte, level - 1, target)) )
-            break;
-    }
-
-    unmap_domain_page(table);
-
-    /* Even failed we should install the newly allocated ept page. */
-    *ept_entry = new_ept;
-
-    return rv;
-}
-
-/* Take the currently mapped table, find the corresponding gfn entry,
- * and map the next table, if available.  If the entry is empty
- * and read_only is set, 
- * Return values:
- *  0: Failed to map.  Either read_only was set and the entry was
- *   empty, or allocating a new page failed.
- *  GUEST_TABLE_NORMAL_PAGE: next level mapped normally
- *  GUEST_TABLE_SUPER_PAGE:
- *   The next entry points to a superpage, and caller indicates
- *   that they are going to the superpage level, or are only doing
- *   a read.
- *  GUEST_TABLE_POD:
- *   The next entry is marked populate-on-demand.
- */
-static int ept_next_level(struct p2m_domain *p2m, bool_t read_only,
-                          ept_entry_t **table, unsigned long *gfn_remainder,
-                          int next_level)
-{
-    unsigned long mfn;
-    ept_entry_t *ept_entry, e;
-    u32 shift, index;
-
-    shift = next_level * EPT_TABLE_ORDER;
-
-    index = *gfn_remainder >> shift;
-
-    /* index must be falling into the page */
-    ASSERT(index < EPT_PAGETABLE_ENTRIES);
-
-    ept_entry = (*table) + index;
-
-    /* ept_next_level() is called (sometimes) without a lock.  Read
-     * the entry once, and act on the "cached" entry after that to
-     * avoid races. */
-    e = atomic_read_ept_entry(ept_entry);
-
-    if ( !is_epte_present(&e) )
-    {
-        if ( e.sa_p2mt == p2m_populate_on_demand )
-            return GUEST_TABLE_POD_PAGE;
-
-        if ( read_only )
-            return GUEST_TABLE_MAP_FAILED;
-
-        if ( !ept_set_middle_entry(p2m, ept_entry) )
-            return GUEST_TABLE_MAP_FAILED;
-        else
-            e = atomic_read_ept_entry(ept_entry); /* Refresh */
-    }
-
-    /* The only time sp would be set here is if we had hit a superpage */
-    if ( is_epte_superpage(&e) )
-        return GUEST_TABLE_SUPER_PAGE;
-
-    mfn = e.mfn;
-    unmap_domain_page(*table);
-    *table = map_domain_page(mfn);
-    *gfn_remainder &= (1UL << shift) - 1;
-    return GUEST_TABLE_NORMAL_PAGE;
-}
-
-/*
- * ept_set_entry() computes 'need_modify_vtd_table' for itself,
- * by observing whether any gfn->mfn translations are modified.
- */
-static int
-ept_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn, 
-              unsigned int order, p2m_type_t p2mt, p2m_access_t p2ma)
-{
-    ept_entry_t *table, *ept_entry = NULL;
-    unsigned long gfn_remainder = gfn;
-    unsigned long offset = 0;
-    u32 index;
-    int i, target = order / EPT_TABLE_ORDER;
-    int rv = 0;
-    int ret = 0;
-    bool_t direct_mmio = (p2mt == p2m_mmio_direct);
-    uint8_t ipat = 0;
-    int need_modify_vtd_table = 1;
-    int vtd_pte_present = 0;
-    int needs_sync = 1;
-    struct domain *d = p2m->domain;
-    ept_entry_t old_entry = { .epte = 0 };
-
-    /*
-     * the caller must make sure:
-     * 1. passing valid gfn and mfn at order boundary.
-     * 2. gfn not exceeding guest physical address width.
-     * 3. passing a valid order.
-     */
-    if ( ((gfn | mfn_x(mfn)) & ((1UL << order) - 1)) ||
-         ((u64)gfn >> ((ept_get_wl(d) + 1) * EPT_TABLE_ORDER)) ||
-         (order % EPT_TABLE_ORDER) )
-        return 0;
-
-    ASSERT((target == 2 && hvm_hap_has_1gb(d)) ||
-           (target == 1 && hvm_hap_has_2mb(d)) ||
-           (target == 0));
-
-    table = map_domain_page(ept_get_asr(d));
-
-    ASSERT(table != NULL);
-
-    for ( i = ept_get_wl(d); i > target; i-- )
-    {
-        ret = ept_next_level(p2m, 0, &table, &gfn_remainder, i);
-        if ( !ret )
-            goto out;
-        else if ( ret != GUEST_TABLE_NORMAL_PAGE )
-            break;
-    }
-
-    ASSERT(ret != GUEST_TABLE_POD_PAGE || i != target);
-
-    index = gfn_remainder >> (i * EPT_TABLE_ORDER);
-    offset = gfn_remainder & ((1UL << (i * EPT_TABLE_ORDER)) - 1);
-
-    ept_entry = table + index;
-
-    /* In case VT-d uses same page table, this flag is needed by VT-d */ 
-    vtd_pte_present = is_epte_present(ept_entry) ? 1 : 0;
-
-    /*
-     * If we're here with i > target, we must be at a leaf node, and
-     * we need to break up the superpage.
-     *
-     * If we're here with i == target and i > 0, we need to check to see
-     * if we're replacing a non-leaf entry (i.e., pointing to an N-1 table)
-     * with a leaf entry (a 1GiB or 2MiB page), and handle things appropriately.
-     */
-
-    if ( i == target )
-    {
-        /* We reached the target level. */
-        ept_entry_t new_entry = { .epte = 0 };
-
-        /* No need to flush if the old entry wasn't valid */
-        if ( !is_epte_present(ept_entry) )
-            needs_sync = 0;
-
-        /* If we're replacing a non-leaf entry with a leaf entry (1GiB or 2MiB),
-         * the intermediate tables will be freed below after the ept flush
-         *
-         * Read-then-write is OK because we hold the p2m lock. */
-        old_entry = *ept_entry;
-
-        if ( mfn_valid(mfn_x(mfn)) || direct_mmio || p2m_is_paged(p2mt) ||
-             (p2mt == p2m_ram_paging_in_start) )
-        {
-            /* Construct the new entry, and then write it once */
-            new_entry.emt = epte_get_entry_emt(p2m->domain, gfn, mfn, &ipat,
-                                                direct_mmio);
-
-            new_entry.ipat = ipat;
-            new_entry.sp = order ? 1 : 0;
-            new_entry.sa_p2mt = p2mt;
-            new_entry.access = p2ma;
-            new_entry.rsvd2_snp = (iommu_enabled && iommu_snoop);
-
-            new_entry.mfn = mfn_x(mfn);
-
-            if ( old_entry.mfn == new_entry.mfn )
-                need_modify_vtd_table = 0;
-
-            ept_p2m_type_to_flags(&new_entry, p2mt, p2ma);
-        }
-
-        atomic_write_ept_entry(ept_entry, new_entry);
-    }
-    else
-    {
-        /* We need to split the original page. */
-        ept_entry_t split_ept_entry;
-        ept_entry_t new_entry = { .epte = 0 };
-
-        ASSERT(is_epte_superpage(ept_entry));
-
-        split_ept_entry = atomic_read_ept_entry(ept_entry);
-
-        if ( !ept_split_super_page(p2m, &split_ept_entry, i, target) )
-        {
-            ept_free_entry(p2m, &split_ept_entry, i);
-            goto out;
-        }
-
-        /* now install the newly split ept sub-tree */
-        /* NB: please make sure domian is paused and no in-fly VT-d DMA. */
-        atomic_write_ept_entry(ept_entry, split_ept_entry);
-
-        /* then move to the level we want to make real changes */
-        for ( ; i > target; i-- )
-            ept_next_level(p2m, 0, &table, &gfn_remainder, i);
-
-        ASSERT(i == target);
-
-        index = gfn_remainder >> (i * EPT_TABLE_ORDER);
-        offset = gfn_remainder & ((1UL << (i * EPT_TABLE_ORDER)) - 1);
-
-        ept_entry = table + index;
-
-        new_entry.emt = epte_get_entry_emt(d, gfn, mfn, &ipat, direct_mmio);
-        new_entry.ipat = ipat;
-        new_entry.sp = i ? 1 : 0;
-        new_entry.sa_p2mt = p2mt;
-        new_entry.access = p2ma;
-        new_entry.rsvd2_snp = (iommu_enabled && iommu_snoop);
-
-        /* the caller should take care of the previous page */
-        new_entry.mfn = mfn_x(mfn);
-
-        /* Safe to read-then-write because we hold the p2m lock */
-        if ( ept_entry->mfn == new_entry.mfn )
-             need_modify_vtd_table = 0;
-
-        ept_p2m_type_to_flags(&new_entry, p2mt, p2ma);
-
-        atomic_write_ept_entry(ept_entry, new_entry);
-    }
-
-    /* Track the highest gfn for which we have ever had a valid mapping */
-    if ( mfn_valid(mfn_x(mfn)) &&
-         (gfn + (1UL << order) - 1 > p2m->max_mapped_pfn) )
-        p2m->max_mapped_pfn = gfn + (1UL << order) - 1;
-
-    /* Success */
-    rv = 1;
-
-out:
-    unmap_domain_page(table);
-
-    if ( needs_sync )
-        ept_sync_domain(p2m->domain);
-
-    if ( rv && iommu_enabled && need_iommu(p2m->domain) && need_modify_vtd_table )
-    {
-        if ( iommu_hap_pt_share )
-            iommu_pte_flush(d, gfn, (u64*)ept_entry, order, vtd_pte_present);
-        else
-        {
-            if ( p2mt == p2m_ram_rw )
-            {
-                if ( order > 0 )
-                {
-                    for ( i = 0; i < (1 << order); i++ )
-                        iommu_map_page(
-                            p2m->domain, gfn - offset + i, mfn_x(mfn) - offset + i,
-                            IOMMUF_readable | IOMMUF_writable);
-                }
-                else if ( !order )
-                    iommu_map_page(
-                        p2m->domain, gfn, mfn_x(mfn), IOMMUF_readable | IOMMUF_writable);
-            }
-            else
-            {
-                if ( order > 0 )
-                {
-                    for ( i = 0; i < (1 << order); i++ )
-                        iommu_unmap_page(p2m->domain, gfn - offset + i);
-                }
-                else if ( !order )
-                    iommu_unmap_page(p2m->domain, gfn);
-            }
-        }
-    }
-
-    /* Release the old intermediate tables, if any.  This has to be the
-       last thing we do, after the ept_sync_domain() and removal
-       from the iommu tables, so as to avoid a potential
-       use-after-free. */
-    if ( is_epte_present(&old_entry) )
-        ept_free_entry(p2m, &old_entry, target);
-
-    return rv;
-}
-
-/* Read ept p2m entries */
-static mfn_t ept_get_entry(struct p2m_domain *p2m,
-                           unsigned long gfn, p2m_type_t *t, p2m_access_t* a,
-                           p2m_query_t q)
-{
-    struct domain *d = p2m->domain;
-    ept_entry_t *table = map_domain_page(ept_get_asr(d));
-    unsigned long gfn_remainder = gfn;
-    ept_entry_t *ept_entry;
-    u32 index;
-    int i;
-    int ret = 0;
-    mfn_t mfn = _mfn(INVALID_MFN);
-
-    *t = p2m_mmio_dm;
-    *a = p2m_access_n;
-
-    /* This pfn is higher than the highest the p2m map currently holds */
-    if ( gfn > p2m->max_mapped_pfn )
-        goto out;
-
-    /* Should check if gfn obeys GAW here. */
-
-    for ( i = ept_get_wl(d); i > 0; i-- )
-    {
-    retry:
-        ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
-        if ( !ret )
-            goto out;
-        else if ( ret == GUEST_TABLE_POD_PAGE )
-        {
-            if ( q == p2m_query )
-            {
-                *t = p2m_populate_on_demand;
-                goto out;
-            }
-
-            /* Populate this superpage */
-            ASSERT(i == 1);
-
-            index = gfn_remainder >> ( i * EPT_TABLE_ORDER);
-            ept_entry = table + index;
-
-            if ( !ept_pod_check_and_populate(p2m, gfn,
-                                             ept_entry, 9, q) )
-                goto retry;
-            else
-                goto out;
-        }
-        else if ( ret == GUEST_TABLE_SUPER_PAGE )
-            break;
-    }
-
-    index = gfn_remainder >> (i * EPT_TABLE_ORDER);
-    ept_entry = table + index;
-
-    if ( ept_entry->sa_p2mt == p2m_populate_on_demand )
-    {
-        if ( q == p2m_query )
-        {
-            *t = p2m_populate_on_demand;
-            goto out;
-        }
-
-        ASSERT(i == 0);
-        
-        if ( ept_pod_check_and_populate(p2m, gfn,
-                                        ept_entry, 0, q) )
-            goto out;
-    }
-
-    /* Need to check for all-zeroes because typecode 0 is p2m_ram and an
-     * entirely empty entry shouldn't have RAM type. */
-    if ( ept_entry->epte != 0 && ept_entry->sa_p2mt != p2m_invalid )
-    {
-        *t = ept_entry->sa_p2mt;
-        *a = ept_entry->access;
-
-        mfn = _mfn(ept_entry->mfn);
-        if ( i )
-        {
-            /* 
-             * We may meet super pages, and to split into 4k pages
-             * to emulate p2m table
-             */
-            unsigned long split_mfn = mfn_x(mfn) +
-                (gfn_remainder &
-                 ((1 << (i * EPT_TABLE_ORDER)) - 1));
-            mfn = _mfn(split_mfn);
-        }
-    }
-
-out:
-    unmap_domain_page(table);
-    return mfn;
-}
-
-/* WARNING: Only caller doesn't care about PoD pages.  So this function will
- * always return 0 for PoD pages, not populate them.  If that becomes necessary,
- * pass a p2m_query_t type along to distinguish. */
-static ept_entry_t ept_get_entry_content(struct p2m_domain *p2m,
-    unsigned long gfn, int *level)
-{
-    ept_entry_t *table = map_domain_page(ept_get_asr(p2m->domain));
-    unsigned long gfn_remainder = gfn;
-    ept_entry_t *ept_entry;
-    ept_entry_t content = { .epte = 0 };
-    u32 index;
-    int i;
-    int ret=0;
-
-    /* This pfn is higher than the highest the p2m map currently holds */
-    if ( gfn > p2m->max_mapped_pfn )
-        goto out;
-
-    for ( i = ept_get_wl(p2m->domain); i > 0; i-- )
-    {
-        ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
-        if ( !ret || ret == GUEST_TABLE_POD_PAGE )
-            goto out;
-        else if ( ret == GUEST_TABLE_SUPER_PAGE )
-            break;
-    }
-
-    index = gfn_remainder >> (i * EPT_TABLE_ORDER);
-    ept_entry = table + index;
-    content = *ept_entry;
-    *level = i;
-
- out:
-    unmap_domain_page(table);
-    return content;
-}
-
-void ept_walk_table(struct domain *d, unsigned long gfn)
-{
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-    ept_entry_t *table = map_domain_page(ept_get_asr(d));
-    unsigned long gfn_remainder = gfn;
-
-    int i;
-
-    gdprintk(XENLOG_ERR, "Walking EPT tables for domain %d gfn %lx\n",
-           d->domain_id, gfn);
-
-    /* This pfn is higher than the highest the p2m map currently holds */
-    if ( gfn > p2m->max_mapped_pfn )
-    {
-        gdprintk(XENLOG_ERR, " gfn exceeds max_mapped_pfn %lx\n",
-               p2m->max_mapped_pfn);
-        goto out;
-    }
-
-    for ( i = ept_get_wl(d); i >= 0; i-- )
-    {
-        ept_entry_t *ept_entry, *next;
-        u32 index;
-
-        /* Stolen from ept_next_level */
-        index = gfn_remainder >> (i*EPT_TABLE_ORDER);
-        ept_entry = table + index;
-
-        gdprintk(XENLOG_ERR, " epte %"PRIx64"\n", ept_entry->epte);
-
-        if ( (i == 0) || !is_epte_present(ept_entry) ||
-             is_epte_superpage(ept_entry) )
-            goto out;
-        else
-        {
-            gfn_remainder &= (1UL << (i*EPT_TABLE_ORDER)) - 1;
-
-            next = map_domain_page(ept_entry->mfn);
-
-            unmap_domain_page(table);
-
-            table = next;
-        }
-    }
-
-out:
-    unmap_domain_page(table);
-    return;
-}
-
-static mfn_t ept_get_entry_current(struct p2m_domain *p2m,
-                                   unsigned long gfn, p2m_type_t *t, p2m_access_t *a,
-                                   p2m_query_t q)
-{
-    return ept_get_entry(p2m, gfn, t, a, q);
-}
-
-/*
- * To test if the new emt type is the same with old,
- * return 1 to not to reset ept entry.
- */
-static int need_modify_ept_entry(struct p2m_domain *p2m, unsigned long gfn,
-                                 mfn_t mfn, uint8_t o_ipat, uint8_t o_emt,
-                                 p2m_type_t p2mt)
-{
-    uint8_t ipat;
-    uint8_t emt;
-    bool_t direct_mmio = (p2mt == p2m_mmio_direct);
-
-    emt = epte_get_entry_emt(p2m->domain, gfn, mfn, &ipat, direct_mmio);
-
-    if ( (emt == o_emt) && (ipat == o_ipat) )
-        return 0;
-
-    return 1;
-}
-
-void ept_change_entry_emt_with_range(struct domain *d,
-                                     unsigned long start_gfn,
-                                     unsigned long end_gfn)
-{
-    unsigned long gfn;
-    ept_entry_t e;
-    mfn_t mfn;
-    int order = 0;
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-
-    p2m_lock(p2m);
-    for ( gfn = start_gfn; gfn <= end_gfn; gfn++ )
-    {
-        int level = 0;
-        uint64_t trunk = 0;
-
-        e = ept_get_entry_content(p2m, gfn, &level);
-        if ( !p2m_has_emt(e.sa_p2mt) )
-            continue;
-
-        order = 0;
-        mfn = _mfn(e.mfn);
-
-        if ( is_epte_superpage(&e) )
-        {
-            while ( level )
-            {
-                trunk = (1UL << (level * EPT_TABLE_ORDER)) - 1;
-                if ( !(gfn & trunk) && (gfn + trunk <= end_gfn) )
-                {
-                    /* gfn assigned with 2M or 1G, and the end covers more than
-                     * the super page areas.
-                     * Set emt for super page.
-                     */
-                    order = level * EPT_TABLE_ORDER;
-                    if ( need_modify_ept_entry(p2m, gfn, mfn, 
-                          e.ipat, e.emt, e.sa_p2mt) )
-                        ept_set_entry(p2m, gfn, mfn, order, e.sa_p2mt, e.access);
-                    gfn += trunk;
-                    break;
-                }
-                level--;
-             }
-        }
-        else /* gfn assigned with 4k */
-        {
-            if ( need_modify_ept_entry(p2m, gfn, mfn, e.ipat, e.emt, e.sa_p2mt) )
-                ept_set_entry(p2m, gfn, mfn, order, e.sa_p2mt, e.access);
-        }
-    }
-    p2m_unlock(p2m);
-}
-
-/*
- * Walk the whole p2m table, changing any entries of the old type
- * to the new type.  This is used in hardware-assisted paging to
- * quickly enable or diable log-dirty tracking
- */
-static void ept_change_entry_type_page(mfn_t ept_page_mfn, int ept_page_level,
-                                       p2m_type_t ot, p2m_type_t nt)
-{
-    ept_entry_t e, *epte = map_domain_page(mfn_x(ept_page_mfn));
-
-    for ( int i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
-    {
-        if ( !is_epte_present(epte + i) )
-            continue;
-
-        if ( (ept_page_level > 0) && !is_epte_superpage(epte + i) )
-            ept_change_entry_type_page(_mfn(epte[i].mfn),
-                                       ept_page_level - 1, ot, nt);
-        else
-        {
-            e = atomic_read_ept_entry(&epte[i]);
-            if ( e.sa_p2mt != ot )
-                continue;
-
-            e.sa_p2mt = nt;
-            ept_p2m_type_to_flags(&e, nt, e.access);
-            atomic_write_ept_entry(&epte[i], e);
-        }
-    }
-
-    unmap_domain_page(epte);
-}
-
-static void ept_change_entry_type_global(struct p2m_domain *p2m,
-                                         p2m_type_t ot, p2m_type_t nt)
-{
-    struct domain *d = p2m->domain;
-    if ( ept_get_asr(d) == 0 )
-        return;
-
-    BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
-    BUG_ON(ot != nt && (ot == p2m_mmio_direct || nt == p2m_mmio_direct));
-
-    ept_change_entry_type_page(_mfn(ept_get_asr(d)), ept_get_wl(d), ot, nt);
-
-    ept_sync_domain(d);
-}
-
-void ept_p2m_init(struct domain *d)
-{
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-    p2m->set_entry = ept_set_entry;
-    p2m->get_entry = ept_get_entry;
-    p2m->get_entry_current = ept_get_entry_current;
-    p2m->change_entry_type_global = ept_change_entry_type_global;
-}
-
-static void ept_dump_p2m_table(unsigned char key)
-{
-    struct domain *d;
-    ept_entry_t *table, *ept_entry;
-    mfn_t mfn;
-    int order;
-    int i;
-    int is_pod;
-    int ret = 0;
-    unsigned long index;
-    unsigned long gfn, gfn_remainder;
-    unsigned long record_counter = 0;
-    struct p2m_domain *p2m;
-
-    for_each_domain(d)
-    {
-        if ( !hap_enabled(d) )
-            continue;
-
-        p2m = p2m_get_hostp2m(d);
-        printk("\ndomain%d EPT p2m table: \n", d->domain_id);
-
-        for ( gfn = 0; gfn <= p2m->max_mapped_pfn; gfn += (1 << order) )
-        {
-            gfn_remainder = gfn;
-            mfn = _mfn(INVALID_MFN);
-            table = map_domain_page(ept_get_asr(d));
-
-            for ( i = ept_get_wl(d); i > 0; i-- )
-            {
-                ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
-                if ( ret != GUEST_TABLE_NORMAL_PAGE )
-                    break;
-            }
-
-            order = i * EPT_TABLE_ORDER;
-
-            if ( ret == GUEST_TABLE_MAP_FAILED )
-                goto out;
-
-            index = gfn_remainder >> order;
-            ept_entry = table + index;
-            if ( ept_entry->sa_p2mt != p2m_invalid )
-            {
-                ( ept_entry->sa_p2mt == p2m_populate_on_demand ) ? 
-                ( mfn = _mfn(INVALID_MFN), is_pod = 1 ) :
-                ( mfn = _mfn(ept_entry->mfn), is_pod = 0 );
-
-                printk("gfn: %-16lx  mfn: %-16lx  order: %2d  is_pod: %d\n",
-                       gfn, mfn_x(mfn), order, is_pod);
-
-                if ( !(record_counter++ % 100) )
-                    process_pending_softirqs();
-            }
-out:
-            unmap_domain_page(table);
-        }
-    }
-}
-
-static struct keyhandler ept_p2m_table = {
-    .diagnostic = 0,
-    .u.fn = ept_dump_p2m_table,
-    .desc = "dump ept p2m table"
-};
-
-void setup_ept_dump(void)
-{
-    register_keyhandler('D', &ept_p2m_table);
-}
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
diff -r d9982136d8fa -r 19452acd2304 xen/arch/x86/mm/p2m-ept.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/mm/p2m-ept.c Fri May 06 11:15:35 2011 +0100
@@ -0,0 +1,910 @@
+/*
+ * ept-p2m.c: use the EPT page table as p2m
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#include <xen/config.h>
+#include <xen/domain_page.h>
+#include <xen/sched.h>
+#include <asm/current.h>
+#include <asm/paging.h>
+#include <asm/types.h>
+#include <asm/domain.h>
+#include <asm/p2m.h>
+#include <asm/hvm/vmx/vmx.h>
+#include <asm/hvm/vmx/vmcs.h>
+#include <xen/iommu.h>
+#include <asm/mtrr.h>
+#include <asm/hvm/cacheattr.h>
+#include <xen/keyhandler.h>
+#include <xen/softirq.h>
+
+#define atomic_read_ept_entry(__pepte)                              \
+    ( (ept_entry_t) { .epte = atomic_read64(&(__pepte)->epte) } )
+#define atomic_write_ept_entry(__pepte, __epte)                     \
+    atomic_write64(&(__pepte)->epte, (__epte).epte)
+
+#define is_epte_present(ept_entry)      ((ept_entry)->epte & 0x7)
+#define is_epte_superpage(ept_entry)    ((ept_entry)->sp)
+
+/* Non-ept "lock-and-check" wrapper */
+static int ept_pod_check_and_populate(struct p2m_domain *p2m, unsigned long gfn,
+                                      ept_entry_t *entry, int order,
+                                      p2m_query_t q)
+{
+    /* Only take the lock if we don't already have it.  Otherwise it
+     * wouldn't be safe to do p2m lookups with the p2m lock held */
+    int do_locking = !p2m_locked_by_me(p2m);
+    int r;
+
+    if ( do_locking )
+        p2m_lock(p2m);
+
+    /* Check to make sure this is still PoD */
+    if ( entry->sa_p2mt != p2m_populate_on_demand )
+    {
+        if ( do_locking )
+            p2m_unlock(p2m);
+        return 0;
+    }
+
+    r = p2m_pod_demand_populate(p2m, gfn, order, q);
+
+    if ( do_locking )
+        p2m_unlock(p2m);
+
+    return r;
+}
+
+static void ept_p2m_type_to_flags(ept_entry_t *entry, p2m_type_t type, p2m_access_t access)
+{
+    /* First apply type permissions */
+    switch(type)
+    {
+        case p2m_invalid:
+        case p2m_mmio_dm:
+        case p2m_populate_on_demand:
+        case p2m_ram_paging_out:
+        case p2m_ram_paged:
+        case p2m_ram_paging_in:
+        case p2m_ram_paging_in_start:
+        default:
+            entry->r = entry->w = entry->x = 0;
+            break;
+        case p2m_ram_rw:
+            entry->r = entry->w = entry->x = 1;
+            break;
+        case p2m_mmio_direct:
+            entry->r = entry->x = 1;
+            entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
+                                                    entry->mfn);
+            break;
+        case p2m_ram_logdirty:
+        case p2m_ram_ro:
+        case p2m_ram_shared:
+            entry->r = entry->x = 1;
+            entry->w = 0;
+            break;
+        case p2m_grant_map_rw:
+            entry->r = entry->w = 1;
+            entry->x = 0;
+            break;
+        case p2m_grant_map_ro:
+            entry->r = 1;
+            entry->w = entry->x = 0;
+            break;
+    }
+
+
+    /* Then restrict with access permissions */
+    switch (access) 
+    {
+        case p2m_access_n:
+            entry->r = entry->w = entry->x = 0;
+            break;
+        case p2m_access_r:
+            entry->w = entry->x = 0;
+            break;
+        case p2m_access_w:
+            entry->r = entry->x = 0;
+            break;
+        case p2m_access_x:
+            entry->r = entry->w = 0;
+            break;
+        case p2m_access_rx:
+        case p2m_access_rx2rw:
+            entry->w = 0;
+            break;
+        case p2m_access_wx:
+            entry->r = 0;
+            break;
+        case p2m_access_rw:
+            entry->x = 0;
+            break;           
+        case p2m_access_rwx:
+            break;
+    }
+    
+}
+
+#define GUEST_TABLE_MAP_FAILED  0
+#define GUEST_TABLE_NORMAL_PAGE 1
+#define GUEST_TABLE_SUPER_PAGE  2
+#define GUEST_TABLE_POD_PAGE    3
+
+/* Fill in middle levels of ept table */
+static int ept_set_middle_entry(struct p2m_domain *p2m, ept_entry_t *ept_entry)
+{
+    struct page_info *pg;
+
+    pg = p2m_alloc_ptp(p2m, 0);
+    if ( pg == NULL )
+        return 0;
+
+    ept_entry->epte = 0;
+    ept_entry->mfn = page_to_mfn(pg);
+    ept_entry->access = p2m->default_access;
+
+    ept_entry->r = ept_entry->w = ept_entry->x = 1;
+
+    return 1;
+}
+
+/* free ept sub tree behind an entry */
+void ept_free_entry(struct p2m_domain *p2m, ept_entry_t *ept_entry, int level)
+{
+    /* End if the entry is a leaf entry. */
+    if ( level == 0 || !is_epte_present(ept_entry) ||
+         is_epte_superpage(ept_entry) )
+        return;
+
+    if ( level > 1 )
+    {
+        ept_entry_t *epte = map_domain_page(ept_entry->mfn);
+        for ( int i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
+            ept_free_entry(p2m, epte + i, level - 1);
+        unmap_domain_page(epte);
+    }
+    
+    p2m_free_ptp(p2m, mfn_to_page(ept_entry->mfn));
+}
+
+static int ept_split_super_page(struct p2m_domain *p2m, ept_entry_t *ept_entry,
+                                int level, int target)
+{
+    ept_entry_t new_ept, *table;
+    uint64_t trunk;
+    int rv = 1;
+
+    /* End if the entry is a leaf entry or reaches the target level. */
+    if ( level == 0 || level == target )
+        return rv;
+
+    ASSERT(is_epte_superpage(ept_entry));
+
+    if ( !ept_set_middle_entry(p2m, &new_ept) )
+        return 0;
+
+    table = map_domain_page(new_ept.mfn);
+    trunk = 1UL << ((level - 1) * EPT_TABLE_ORDER);
+
+    for ( int i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
+    {
+        ept_entry_t *epte = table + i;
+
+        epte->epte = 0;
+        epte->emt = ept_entry->emt;
+        epte->ipat = ept_entry->ipat;
+        epte->sp = (level > 1) ? 1 : 0;
+        epte->access = ept_entry->access;
+        epte->sa_p2mt = ept_entry->sa_p2mt;
+        epte->mfn = ept_entry->mfn + i * trunk;
+        epte->rsvd2_snp = ( iommu_enabled && iommu_snoop ) ? 1 : 0;
+
+        ept_p2m_type_to_flags(epte, epte->sa_p2mt, epte->access);
+
+        if ( (level - 1) == target )
+            continue;
+
+        ASSERT(is_epte_superpage(epte));
+
+        if ( !(rv = ept_split_super_page(p2m, epte, level - 1, target)) )
+            break;
+    }
+
+    unmap_domain_page(table);
+
+    /* Even failed we should install the newly allocated ept page. */
+    *ept_entry = new_ept;
+
+    return rv;
+}
+
+/* Take the currently mapped table, find the corresponding gfn entry,
+ * and map the next table, if available.  If the entry is empty
+ * and read_only is set, 
+ * Return values:
+ *  0: Failed to map.  Either read_only was set and the entry was
+ *   empty, or allocating a new page failed.
+ *  GUEST_TABLE_NORMAL_PAGE: next level mapped normally
+ *  GUEST_TABLE_SUPER_PAGE:
+ *   The next entry points to a superpage, and caller indicates
+ *   that they are going to the superpage level, or are only doing
+ *   a read.
+ *  GUEST_TABLE_POD:
+ *   The next entry is marked populate-on-demand.
+ */
+static int ept_next_level(struct p2m_domain *p2m, bool_t read_only,
+                          ept_entry_t **table, unsigned long *gfn_remainder,
+                          int next_level)
+{
+    unsigned long mfn;
+    ept_entry_t *ept_entry, e;
+    u32 shift, index;
+
+    shift = next_level * EPT_TABLE_ORDER;
+
+    index = *gfn_remainder >> shift;
+
+    /* index must be falling into the page */
+    ASSERT(index < EPT_PAGETABLE_ENTRIES);
+
+    ept_entry = (*table) + index;
+
+    /* ept_next_level() is called (sometimes) without a lock.  Read
+     * the entry once, and act on the "cached" entry after that to
+     * avoid races. */
+    e = atomic_read_ept_entry(ept_entry);
+
+    if ( !is_epte_present(&e) )
+    {
+        if ( e.sa_p2mt == p2m_populate_on_demand )
+            return GUEST_TABLE_POD_PAGE;
+
+        if ( read_only )
+            return GUEST_TABLE_MAP_FAILED;
+
+        if ( !ept_set_middle_entry(p2m, ept_entry) )
+            return GUEST_TABLE_MAP_FAILED;
+        else
+            e = atomic_read_ept_entry(ept_entry); /* Refresh */
+    }
+
+    /* The only time sp would be set here is if we had hit a superpage */
+    if ( is_epte_superpage(&e) )
+        return GUEST_TABLE_SUPER_PAGE;
+
+    mfn = e.mfn;
+    unmap_domain_page(*table);
+    *table = map_domain_page(mfn);
+    *gfn_remainder &= (1UL << shift) - 1;
+    return GUEST_TABLE_NORMAL_PAGE;
+}
+
+/*
+ * ept_set_entry() computes 'need_modify_vtd_table' for itself,
+ * by observing whether any gfn->mfn translations are modified.
+ */
+static int
+ept_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn, 
+              unsigned int order, p2m_type_t p2mt, p2m_access_t p2ma)
+{
+    ept_entry_t *table, *ept_entry = NULL;
+    unsigned long gfn_remainder = gfn;
+    unsigned long offset = 0;
+    u32 index;
+    int i, target = order / EPT_TABLE_ORDER;
+    int rv = 0;
+    int ret = 0;
+    bool_t direct_mmio = (p2mt == p2m_mmio_direct);
+    uint8_t ipat = 0;
+    int need_modify_vtd_table = 1;
+    int vtd_pte_present = 0;
+    int needs_sync = 1;
+    struct domain *d = p2m->domain;
+    ept_entry_t old_entry = { .epte = 0 };
+
+    /*
+     * the caller must make sure:
+     * 1. passing valid gfn and mfn at order boundary.
+     * 2. gfn not exceeding guest physical address width.
+     * 3. passing a valid order.
+     */
+    if ( ((gfn | mfn_x(mfn)) & ((1UL << order) - 1)) ||
+         ((u64)gfn >> ((ept_get_wl(d) + 1) * EPT_TABLE_ORDER)) ||
+         (order % EPT_TABLE_ORDER) )
+        return 0;
+
+    ASSERT((target == 2 && hvm_hap_has_1gb(d)) ||
+           (target == 1 && hvm_hap_has_2mb(d)) ||
+           (target == 0));
+
+    table = map_domain_page(ept_get_asr(d));
+
+    ASSERT(table != NULL);
+
+    for ( i = ept_get_wl(d); i > target; i-- )
+    {
+        ret = ept_next_level(p2m, 0, &table, &gfn_remainder, i);
+        if ( !ret )
+            goto out;
+        else if ( ret != GUEST_TABLE_NORMAL_PAGE )
+            break;
+    }
+
+    ASSERT(ret != GUEST_TABLE_POD_PAGE || i != target);
+
+    index = gfn_remainder >> (i * EPT_TABLE_ORDER);
+    offset = gfn_remainder & ((1UL << (i * EPT_TABLE_ORDER)) - 1);
+
+    ept_entry = table + index;
+
+    /* In case VT-d uses same page table, this flag is needed by VT-d */ 
+    vtd_pte_present = is_epte_present(ept_entry) ? 1 : 0;
+
+    /*
+     * If we're here with i > target, we must be at a leaf node, and
+     * we need to break up the superpage.
+     *
+     * If we're here with i == target and i > 0, we need to check to see
+     * if we're replacing a non-leaf entry (i.e., pointing to an N-1 table)
+     * with a leaf entry (a 1GiB or 2MiB page), and handle things appropriately.
+     */
+
+    if ( i == target )
+    {
+        /* We reached the target level. */
+        ept_entry_t new_entry = { .epte = 0 };
+
+        /* No need to flush if the old entry wasn't valid */
+        if ( !is_epte_present(ept_entry) )
+            needs_sync = 0;
+
+        /* If we're replacing a non-leaf entry with a leaf entry (1GiB or 2MiB),
+         * the intermediate tables will be freed below after the ept flush
+         *
+         * Read-then-write is OK because we hold the p2m lock. */
+        old_entry = *ept_entry;
+
+        if ( mfn_valid(mfn_x(mfn)) || direct_mmio || p2m_is_paged(p2mt) ||
+             (p2mt == p2m_ram_paging_in_start) )
+        {
+            /* Construct the new entry, and then write it once */
+            new_entry.emt = epte_get_entry_emt(p2m->domain, gfn, mfn, &ipat,
+                                                direct_mmio);
+
+            new_entry.ipat = ipat;
+            new_entry.sp = order ? 1 : 0;
+            new_entry.sa_p2mt = p2mt;
+            new_entry.access = p2ma;
+            new_entry.rsvd2_snp = (iommu_enabled && iommu_snoop);
+
+            new_entry.mfn = mfn_x(mfn);
+
+            if ( old_entry.mfn == new_entry.mfn )
+                need_modify_vtd_table = 0;
+
+            ept_p2m_type_to_flags(&new_entry, p2mt, p2ma);
+        }
+
+        atomic_write_ept_entry(ept_entry, new_entry);
+    }
+    else
+    {
+        /* We need to split the original page. */
+        ept_entry_t split_ept_entry;
+        ept_entry_t new_entry = { .epte = 0 };
+
+        ASSERT(is_epte_superpage(ept_entry));
+
+        split_ept_entry = atomic_read_ept_entry(ept_entry);
+
+        if ( !ept_split_super_page(p2m, &split_ept_entry, i, target) )
+        {
+            ept_free_entry(p2m, &split_ept_entry, i);
+            goto out;
+        }
+
+        /* now install the newly split ept sub-tree */
+        /* NB: please make sure domian is paused and no in-fly VT-d DMA. */
+        atomic_write_ept_entry(ept_entry, split_ept_entry);
+
+        /* then move to the level we want to make real changes */
+        for ( ; i > target; i-- )
+            ept_next_level(p2m, 0, &table, &gfn_remainder, i);
+
+        ASSERT(i == target);
+
+        index = gfn_remainder >> (i * EPT_TABLE_ORDER);
+        offset = gfn_remainder & ((1UL << (i * EPT_TABLE_ORDER)) - 1);
+
+        ept_entry = table + index;
+
+        new_entry.emt = epte_get_entry_emt(d, gfn, mfn, &ipat, direct_mmio);
+        new_entry.ipat = ipat;
+        new_entry.sp = i ? 1 : 0;
+        new_entry.sa_p2mt = p2mt;
+        new_entry.access = p2ma;
+        new_entry.rsvd2_snp = (iommu_enabled && iommu_snoop);
+
+        /* the caller should take care of the previous page */
+        new_entry.mfn = mfn_x(mfn);
+
+        /* Safe to read-then-write because we hold the p2m lock */
+        if ( ept_entry->mfn == new_entry.mfn )
+             need_modify_vtd_table = 0;
+
+        ept_p2m_type_to_flags(&new_entry, p2mt, p2ma);
+
+        atomic_write_ept_entry(ept_entry, new_entry);
+    }
+
+    /* Track the highest gfn for which we have ever had a valid mapping */
+    if ( mfn_valid(mfn_x(mfn)) &&
+         (gfn + (1UL << order) - 1 > p2m->max_mapped_pfn) )
+        p2m->max_mapped_pfn = gfn + (1UL << order) - 1;
+
+    /* Success */
+    rv = 1;
+
+out:
+    unmap_domain_page(table);
+
+    if ( needs_sync )
+        ept_sync_domain(p2m->domain);
+
+    if ( rv && iommu_enabled && need_iommu(p2m->domain) && need_modify_vtd_table )
+    {
+        if ( iommu_hap_pt_share )
+            iommu_pte_flush(d, gfn, (u64*)ept_entry, order, vtd_pte_present);
+        else
+        {
+            if ( p2mt == p2m_ram_rw )
+            {
+                if ( order > 0 )
+                {
+                    for ( i = 0; i < (1 << order); i++ )
+                        iommu_map_page(
+                            p2m->domain, gfn - offset + i, mfn_x(mfn) - offset + i,
+                            IOMMUF_readable | IOMMUF_writable);
+                }
+                else if ( !order )
+                    iommu_map_page(
+                        p2m->domain, gfn, mfn_x(mfn), IOMMUF_readable | IOMMUF_writable);
+            }
+            else
+            {
+                if ( order > 0 )
+                {
+                    for ( i = 0; i < (1 << order); i++ )
+                        iommu_unmap_page(p2m->domain, gfn - offset + i);
+                }
+                else if ( !order )
+                    iommu_unmap_page(p2m->domain, gfn);
+            }
+        }
+    }
+
+    /* Release the old intermediate tables, if any.  This has to be the
+       last thing we do, after the ept_sync_domain() and removal
+       from the iommu tables, so as to avoid a potential
+       use-after-free. */
+    if ( is_epte_present(&old_entry) )
+        ept_free_entry(p2m, &old_entry, target);
+
+    return rv;
+}
+
+/* Read ept p2m entries */
+static mfn_t ept_get_entry(struct p2m_domain *p2m,
+                           unsigned long gfn, p2m_type_t *t, p2m_access_t* a,
+                           p2m_query_t q)
+{
+    struct domain *d = p2m->domain;
+    ept_entry_t *table = map_domain_page(ept_get_asr(d));
+    unsigned long gfn_remainder = gfn;
+    ept_entry_t *ept_entry;
+    u32 index;
+    int i;
+    int ret = 0;
+    mfn_t mfn = _mfn(INVALID_MFN);
+
+    *t = p2m_mmio_dm;
+    *a = p2m_access_n;
+
+    /* This pfn is higher than the highest the p2m map currently holds */
+    if ( gfn > p2m->max_mapped_pfn )
+        goto out;
+
+    /* Should check if gfn obeys GAW here. */
+
+    for ( i = ept_get_wl(d); i > 0; i-- )
+    {
+    retry:
+        ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
+        if ( !ret )
+            goto out;
+        else if ( ret == GUEST_TABLE_POD_PAGE )
+        {
+            if ( q == p2m_query )
+            {
+                *t = p2m_populate_on_demand;
+                goto out;
+            }
+
+            /* Populate this superpage */
+            ASSERT(i == 1);
+
+            index = gfn_remainder >> ( i * EPT_TABLE_ORDER);
+            ept_entry = table + index;
+
+            if ( !ept_pod_check_and_populate(p2m, gfn,
+                                             ept_entry, 9, q) )
+                goto retry;
+            else
+                goto out;
+        }
+        else if ( ret == GUEST_TABLE_SUPER_PAGE )
+            break;
+    }
+
+    index = gfn_remainder >> (i * EPT_TABLE_ORDER);
+    ept_entry = table + index;
+
+    if ( ept_entry->sa_p2mt == p2m_populate_on_demand )
+    {
+        if ( q == p2m_query )
+        {
+            *t = p2m_populate_on_demand;
+            goto out;
+        }
+
+        ASSERT(i == 0);
+        
+        if ( ept_pod_check_and_populate(p2m, gfn,
+                                        ept_entry, 0, q) )
+            goto out;
+    }
+
+    /* Need to check for all-zeroes because typecode 0 is p2m_ram and an
+     * entirely empty entry shouldn't have RAM type. */
+    if ( ept_entry->epte != 0 && ept_entry->sa_p2mt != p2m_invalid )
+    {
+        *t = ept_entry->sa_p2mt;
+        *a = ept_entry->access;
+
+        mfn = _mfn(ept_entry->mfn);
+        if ( i )
+        {
+            /* 
+             * We may meet super pages, and to split into 4k pages
+             * to emulate p2m table
+             */
+            unsigned long split_mfn = mfn_x(mfn) +
+                (gfn_remainder &
+                 ((1 << (i * EPT_TABLE_ORDER)) - 1));
+            mfn = _mfn(split_mfn);
+        }
+    }
+
+out:
+    unmap_domain_page(table);
+    return mfn;
+}
+
+/* WARNING: Only caller doesn't care about PoD pages.  So this function will
+ * always return 0 for PoD pages, not populate them.  If that becomes necessary,
+ * pass a p2m_query_t type along to distinguish. */
+static ept_entry_t ept_get_entry_content(struct p2m_domain *p2m,
+    unsigned long gfn, int *level)
+{
+    ept_entry_t *table = map_domain_page(ept_get_asr(p2m->domain));
+    unsigned long gfn_remainder = gfn;
+    ept_entry_t *ept_entry;
+    ept_entry_t content = { .epte = 0 };
+    u32 index;
+    int i;
+    int ret=0;
+
+    /* This pfn is higher than the highest the p2m map currently holds */
+    if ( gfn > p2m->max_mapped_pfn )
+        goto out;
+
+    for ( i = ept_get_wl(p2m->domain); i > 0; i-- )
+    {
+        ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
+        if ( !ret || ret == GUEST_TABLE_POD_PAGE )
+            goto out;
+        else if ( ret == GUEST_TABLE_SUPER_PAGE )
+            break;
+    }
+
+    index = gfn_remainder >> (i * EPT_TABLE_ORDER);
+    ept_entry = table + index;
+    content = *ept_entry;
+    *level = i;
+
+ out:
+    unmap_domain_page(table);
+    return content;
+}
+
+void ept_walk_table(struct domain *d, unsigned long gfn)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+    ept_entry_t *table = map_domain_page(ept_get_asr(d));
+    unsigned long gfn_remainder = gfn;
+
+    int i;
+
+    gdprintk(XENLOG_ERR, "Walking EPT tables for domain %d gfn %lx\n",
+           d->domain_id, gfn);
+
+    /* This pfn is higher than the highest the p2m map currently holds */
+    if ( gfn > p2m->max_mapped_pfn )
+    {
+        gdprintk(XENLOG_ERR, " gfn exceeds max_mapped_pfn %lx\n",
+               p2m->max_mapped_pfn);
+        goto out;
+    }
+
+    for ( i = ept_get_wl(d); i >= 0; i-- )
+    {
+        ept_entry_t *ept_entry, *next;
+        u32 index;
+
+        /* Stolen from ept_next_level */
+        index = gfn_remainder >> (i*EPT_TABLE_ORDER);
+        ept_entry = table + index;
+
+        gdprintk(XENLOG_ERR, " epte %"PRIx64"\n", ept_entry->epte);
+
+        if ( (i == 0) || !is_epte_present(ept_entry) ||
+             is_epte_superpage(ept_entry) )
+            goto out;
+        else
+        {
+            gfn_remainder &= (1UL << (i*EPT_TABLE_ORDER)) - 1;
+
+            next = map_domain_page(ept_entry->mfn);
+
+            unmap_domain_page(table);
+
+            table = next;
+        }
+    }
+
+out:
+    unmap_domain_page(table);
+    return;
+}
+
+static mfn_t ept_get_entry_current(struct p2m_domain *p2m,
+                                   unsigned long gfn, p2m_type_t *t, p2m_access_t *a,
+                                   p2m_query_t q)
+{
+    return ept_get_entry(p2m, gfn, t, a, q);
+}
+
+/*
+ * To test if the new emt type is the same with old,
+ * return 1 to not to reset ept entry.
+ */
+static int need_modify_ept_entry(struct p2m_domain *p2m, unsigned long gfn,
+                                 mfn_t mfn, uint8_t o_ipat, uint8_t o_emt,
+                                 p2m_type_t p2mt)
+{
+    uint8_t ipat;
+    uint8_t emt;
+    bool_t direct_mmio = (p2mt == p2m_mmio_direct);
+
+    emt = epte_get_entry_emt(p2m->domain, gfn, mfn, &ipat, direct_mmio);
+
+    if ( (emt == o_emt) && (ipat == o_ipat) )
+        return 0;
+
+    return 1;
+}
+
+void ept_change_entry_emt_with_range(struct domain *d,
+                                     unsigned long start_gfn,
+                                     unsigned long end_gfn)
+{
+    unsigned long gfn;
+    ept_entry_t e;
+    mfn_t mfn;
+    int order = 0;
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+    p2m_lock(p2m);
+    for ( gfn = start_gfn; gfn <= end_gfn; gfn++ )
+    {
+        int level = 0;
+        uint64_t trunk = 0;
+
+        e = ept_get_entry_content(p2m, gfn, &level);
+        if ( !p2m_has_emt(e.sa_p2mt) )
+            continue;
+
+        order = 0;
+        mfn = _mfn(e.mfn);
+
+        if ( is_epte_superpage(&e) )
+        {
+            while ( level )
+            {
+                trunk = (1UL << (level * EPT_TABLE_ORDER)) - 1;
+                if ( !(gfn & trunk) && (gfn + trunk <= end_gfn) )
+                {
+                    /* gfn assigned with 2M or 1G, and the end covers more than
+                     * the super page areas.
+                     * Set emt for super page.
+                     */
+                    order = level * EPT_TABLE_ORDER;
+                    if ( need_modify_ept_entry(p2m, gfn, mfn, 
+                          e.ipat, e.emt, e.sa_p2mt) )
+                        ept_set_entry(p2m, gfn, mfn, order, e.sa_p2mt, e.access);
+                    gfn += trunk;
+                    break;
+                }
+                level--;
+             }
+        }
+        else /* gfn assigned with 4k */
+        {
+            if ( need_modify_ept_entry(p2m, gfn, mfn, e.ipat, e.emt, e.sa_p2mt) )
+                ept_set_entry(p2m, gfn, mfn, order, e.sa_p2mt, e.access);
+        }
+    }
+    p2m_unlock(p2m);
+}
+
+/*
+ * Walk the whole p2m table, changing any entries of the old type
+ * to the new type.  This is used in hardware-assisted paging to
+ * quickly enable or diable log-dirty tracking
+ */
+static void ept_change_entry_type_page(mfn_t ept_page_mfn, int ept_page_level,
+                                       p2m_type_t ot, p2m_type_t nt)
+{
+    ept_entry_t e, *epte = map_domain_page(mfn_x(ept_page_mfn));
+
+    for ( int i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
+    {
+        if ( !is_epte_present(epte + i) )
+            continue;
+
+        if ( (ept_page_level > 0) && !is_epte_superpage(epte + i) )
+            ept_change_entry_type_page(_mfn(epte[i].mfn),
+                                       ept_page_level - 1, ot, nt);
+        else
+        {
+            e = atomic_read_ept_entry(&epte[i]);
+            if ( e.sa_p2mt != ot )
+                continue;
+
+            e.sa_p2mt = nt;
+            ept_p2m_type_to_flags(&e, nt, e.access);
+            atomic_write_ept_entry(&epte[i], e);
+        }
+    }
+
+    unmap_domain_page(epte);
+}
+
+static void ept_change_entry_type_global(struct p2m_domain *p2m,
+                                         p2m_type_t ot, p2m_type_t nt)
+{
+    struct domain *d = p2m->domain;
+    if ( ept_get_asr(d) == 0 )
+        return;
+
+    BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
+    BUG_ON(ot != nt && (ot == p2m_mmio_direct || nt == p2m_mmio_direct));
+
+    ept_change_entry_type_page(_mfn(ept_get_asr(d)), ept_get_wl(d), ot, nt);
+
+    ept_sync_domain(d);
+}
+
+void ept_p2m_init(struct p2m_domain *p2m)
+{
+    p2m->set_entry = ept_set_entry;
+    p2m->get_entry = ept_get_entry;
+    p2m->get_entry_current = ept_get_entry_current;
+    p2m->change_entry_type_global = ept_change_entry_type_global;
+}
+
+static void ept_dump_p2m_table(unsigned char key)
+{
+    struct domain *d;
+    ept_entry_t *table, *ept_entry;
+    mfn_t mfn;
+    int order;
+    int i;
+    int is_pod;
+    int ret = 0;
+    unsigned long index;
+    unsigned long gfn, gfn_remainder;
+    unsigned long record_counter = 0;
+    struct p2m_domain *p2m;
+
+    for_each_domain(d)
+    {
+        if ( !hap_enabled(d) )
+            continue;
+
+        p2m = p2m_get_hostp2m(d);
+        printk("\ndomain%d EPT p2m table: \n", d->domain_id);
+
+        for ( gfn = 0; gfn <= p2m->max_mapped_pfn; gfn += (1 << order) )
+        {
+            gfn_remainder = gfn;
+            mfn = _mfn(INVALID_MFN);
+            table = map_domain_page(ept_get_asr(d));
+
+            for ( i = ept_get_wl(d); i > 0; i-- )
+            {
+                ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
+                if ( ret != GUEST_TABLE_NORMAL_PAGE )
+                    break;
+            }
+
+            order = i * EPT_TABLE_ORDER;
+
+            if ( ret == GUEST_TABLE_MAP_FAILED )
+                goto out;
+
+            index = gfn_remainder >> order;
+            ept_entry = table + index;
+            if ( ept_entry->sa_p2mt != p2m_invalid )
+            {
+                ( ept_entry->sa_p2mt == p2m_populate_on_demand ) ? 
+                ( mfn = _mfn(INVALID_MFN), is_pod = 1 ) :
+                ( mfn = _mfn(ept_entry->mfn), is_pod = 0 );
+
+                printk("gfn: %-16lx  mfn: %-16lx  order: %2d  is_pod: %d\n",
+                       gfn, mfn_x(mfn), order, is_pod);
+
+                if ( !(record_counter++ % 100) )
+                    process_pending_softirqs();
+            }
+out:
+            unmap_domain_page(table);
+        }
+    }
+}
+
+static struct keyhandler ept_p2m_table = {
+    .diagnostic = 0,
+    .u.fn = ept_dump_p2m_table,
+    .desc = "dump ept p2m table"
+};
+
+void setup_ept_dump(void)
+{
+    register_keyhandler('D', &ept_p2m_table);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r d9982136d8fa -r 19452acd2304 xen/arch/x86/mm/p2m-pod.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/mm/p2m-pod.c Fri May 06 11:15:35 2011 +0100
@@ -0,0 +1,1151 @@
+/******************************************************************************
+ * arch/x86/mm/p2m-pod.c
+ *
+ * Populate-on-demand p2m entries. 
+ *
+ * Copyright (c) 2009-2011 Citrix Systems, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <asm/domain.h>
+#include <asm/page.h>
+#include <asm/paging.h>
+#include <asm/p2m.h>
+#include <asm/hvm/vmx/vmx.h> /* ept_p2m_init() */
+#include <xen/iommu.h>
+#include <asm/mem_event.h>
+#include <public/mem_event.h>
+#include <asm/mem_sharing.h>
+#include <xen/event.h>
+#include <asm/hvm/nestedhvm.h>
+#include <asm/hvm/svm/amd-iommu-proto.h>
+ 
+/* Printouts */
+#define P2M_PRINTK(_f, _a...)                                \
+    debugtrace_printk("p2m: %s(): " _f, __func__, ##_a)
+#define P2M_ERROR(_f, _a...)                                 \
+    printk("pg error: %s(): " _f, __func__, ##_a)
+#if P2M_DEBUGGING
+#define P2M_DEBUG(_f, _a...)                                 \
+    debugtrace_printk("p2mdebug: %s(): " _f, __func__, ##_a)
+#else
+#define P2M_DEBUG(_f, _a...) do { (void)(_f); } while(0)
+#endif
+
+
+/* Override macros from asm/page.h to make them work with mfn_t */
+#undef mfn_to_page
+#define mfn_to_page(_m) __mfn_to_page(mfn_x(_m))
+#undef mfn_valid
+#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
+#undef page_to_mfn
+#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
+
+#if P2M_AUDIT
+extern void audit_p2m(struct p2m_domain *p2m, int strict_m2p);
+#else
+# define audit_p2m(_p2m, _m2p) do { (void)(_p2m),(_m2p); } while (0)
+#endif /* P2M_AUDIT */
+
+#define SUPERPAGE_PAGES (1UL << 9)
+#define superpage_aligned(_x)  (((_x)&(SUPERPAGE_PAGES-1))==0)
+
+/*
+ * Populate-on-demand functionality
+ */
+
+static int
+p2m_pod_cache_add(struct p2m_domain *p2m,
+                  struct page_info *page,
+                  unsigned long order)
+{
+    int i;
+    struct page_info *p;
+    struct domain *d = p2m->domain;
+
+#ifndef NDEBUG
+    mfn_t mfn;
+
+    mfn = page_to_mfn(page);
+
+    /* Check to make sure this is a contiguous region */
+    if( mfn_x(mfn) & ((1 << order) - 1) )
+    {
+        printk("%s: mfn %lx not aligned order %lu! (mask %lx)\n",
+               __func__, mfn_x(mfn), order, ((1UL << order) - 1));
+        return -1;
+    }
+    
+    for(i=0; i < 1 << order ; i++) {
+        struct domain * od;
+
+        p = mfn_to_page(_mfn(mfn_x(mfn) + i));
+        od = page_get_owner(p);
+        if(od != d)
+        {
+            printk("%s: mfn %lx expected owner d%d, got owner d%d!\n",
+                   __func__, mfn_x(mfn), d->domain_id,
+                   od?od->domain_id:-1);
+            return -1;
+        }
+    }
+#endif
+
+    ASSERT(p2m_locked_by_me(p2m));
+
+    /*
+     * Pages from domain_alloc and returned by the balloon driver aren't
+     * guaranteed to be zero; but by reclaiming zero pages, we implicitly
+     * promise to provide zero pages. So we scrub pages before using.
+     */
+    for ( i = 0; i < (1 << order); i++ )
+    {
+        char *b = map_domain_page(mfn_x(page_to_mfn(page)) + i);
+        clear_page(b);
+        unmap_domain_page(b);
+    }
+
+    spin_lock(&d->page_alloc_lock);
+
+    /* First, take all pages off the domain list */
+    for(i=0; i < 1 << order ; i++)
+    {
+        p = page + i;
+        page_list_del(p, &d->page_list);
+    }
+
+    /* Then add the first one to the appropriate populate-on-demand list */
+    switch(order)
+    {
+    case 9:
+        page_list_add_tail(page, &p2m->pod.super); /* lock: page_alloc */
+        p2m->pod.count += 1 << order;
+        break;
+    case 0:
+        page_list_add_tail(page, &p2m->pod.single); /* lock: page_alloc */
+        p2m->pod.count += 1;
+        break;
+    default:
+        BUG();
+    }
+
+    /* Ensure that the PoD cache has never been emptied.  
+     * This may cause "zombie domains" since the page will never be freed. */
+    BUG_ON( d->arch.relmem != RELMEM_not_started );
+
+    spin_unlock(&d->page_alloc_lock);
+
+    return 0;
+}
+
+/* Get a page of size order from the populate-on-demand cache.  Will break
+ * down 2-meg pages into singleton pages automatically.  Returns null if
+ * a superpage is requested and no superpages are available.  Must be called
+ * with the d->page_alloc_lock held. */
+static struct page_info * p2m_pod_cache_get(struct p2m_domain *p2m,
+                                            unsigned long order)
+{
+    struct page_info *p = NULL;
+    int i;
+
+    if ( order == 9 && page_list_empty(&p2m->pod.super) )
+    {
+        return NULL;
+    }
+    else if ( order == 0 && page_list_empty(&p2m->pod.single) )
+    {
+        unsigned long mfn;
+        struct page_info *q;
+
+        BUG_ON( page_list_empty(&p2m->pod.super) );
+
+        /* Break up a superpage to make single pages. NB count doesn't
+         * need to be adjusted. */
+        p = page_list_remove_head(&p2m->pod.super);
+        mfn = mfn_x(page_to_mfn(p));
+
+        for ( i=0; i<SUPERPAGE_PAGES; i++ )
+        {
+            q = mfn_to_page(_mfn(mfn+i));
+            page_list_add_tail(q, &p2m->pod.single);
+        }
+    }
+
+    switch ( order )
+    {
+    case 9:
+        BUG_ON( page_list_empty(&p2m->pod.super) );
+        p = page_list_remove_head(&p2m->pod.super);
+        p2m->pod.count -= 1 << order; /* Lock: page_alloc */
+        break;
+    case 0:
+        BUG_ON( page_list_empty(&p2m->pod.single) );
+        p = page_list_remove_head(&p2m->pod.single);
+        p2m->pod.count -= 1;
+        break;
+    default:
+        BUG();
+    }
+
+    /* Put the pages back on the domain page_list */
+    for ( i = 0 ; i < (1 << order); i++ )
+    {
+        BUG_ON(page_get_owner(p + i) != p2m->domain);
+        page_list_add_tail(p + i, &p2m->domain->page_list);
+    }
+
+    return p;
+}
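+
+/* A minimal usage sketch (illustrative only, wrapped in #if 0 so it is not
+ * built): how a caller pairs the cache-count check with p2m_pod_cache_get(),
+ * the way p2m_pod_demand_populate() below does.  The helper name is made up. */
+#if 0
+static struct page_info *example_pod_cache_get_any(struct p2m_domain *p2m)
+{
+    /* Caller holds p2m->domain->page_alloc_lock and has already checked
+     * that p2m->pod.count != 0, so falling back to order 0 cannot hit the
+     * BUG_ON()s in p2m_pod_cache_get(). */
+    struct page_info *pg = p2m_pod_cache_get(p2m, 9);
+
+    if ( pg == NULL )                    /* no 2-meg pages left in the cache */
+        pg = p2m_pod_cache_get(p2m, 0);  /* take (or split out) a singleton */
+    return pg;
+}
+#endif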
+
+/* Set the size of the cache, allocating or freeing as necessary. */
+static int
+p2m_pod_set_cache_target(struct p2m_domain *p2m, unsigned long pod_target, int preemptible)
+{
+    struct domain *d = p2m->domain;
+    int ret = 0;
+
+    /* Increasing the target */
+    while ( pod_target > p2m->pod.count )
+    {
+        struct page_info * page;
+        int order;
+
+        if ( (pod_target - p2m->pod.count) >= SUPERPAGE_PAGES )
+            order = 9;
+        else
+            order = 0;
+    retry:
+        page = alloc_domheap_pages(d, order, 0);
+        if ( unlikely(page == NULL) )
+        {
+            if ( order == 9 )
+            {
+                /* If we can't allocate a superpage, try singleton pages */
+                order = 0;
+                goto retry;
+            }   
+            
+            printk("%s: Unable to allocate domheap page for pod cache.  target %lu cachesize %d\n",
+                   __func__, pod_target, p2m->pod.count);
+            ret = -ENOMEM;
+            goto out;
+        }
+
+        p2m_pod_cache_add(p2m, page, order);
+
+        if ( hypercall_preempt_check() && preemptible )
+        {
+            ret = -EAGAIN;
+            goto out;
+        }
+    }
+
+    /* Decreasing the target */
+    /* We hold the p2m lock here, so we don't need to worry about
+     * cache disappearing under our feet. */
+    while ( pod_target < p2m->pod.count )
+    {
+        struct page_info * page;
+        int order, i;
+
+        /* Grab the lock before checking that pod.super is empty, or the last
+         * entries may disappear before we grab the lock. */
+        spin_lock(&d->page_alloc_lock);
+
+        if ( (p2m->pod.count - pod_target) > SUPERPAGE_PAGES
+             && !page_list_empty(&p2m->pod.super) )
+            order = 9;
+        else
+            order = 0;
+
+        page = p2m_pod_cache_get(p2m, order);
+
+        ASSERT(page != NULL);
+
+        spin_unlock(&d->page_alloc_lock);
+
+        /* Then free them */
+        for ( i = 0 ; i < (1 << order) ; i++ )
+        {
+            /* Copied from common/memory.c:guest_remove_page() */
+            if ( unlikely(!get_page(page+i, d)) )
+            {
+                gdprintk(XENLOG_INFO, "Bad page free for domain %u\n", d->domain_id);
+                ret = -EINVAL;
+                goto out;
+            }
+
+            if ( test_and_clear_bit(_PGT_pinned, &(page+i)->u.inuse.type_info) )
+                put_page_and_type(page+i);
+            
+            if ( test_and_clear_bit(_PGC_allocated, &(page+i)->count_info) )
+                put_page(page+i);
+
+            put_page(page+i);
+
+            if ( hypercall_preempt_check() && preemptible )
+            {
+                ret = -EAGAIN;
+                goto out;
+            }
+        }
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * The "right behavior" here requires some careful thought.  First, some
+ * definitions:
+ * + M: static_max
+ * + B: number of pages the balloon driver has ballooned down to.
+ * + P: Number of populated pages. 
+ * + T: Old target
+ * + T': New target
+ *
+ * The following equations should hold:
+ *  0 <= P <= T <= B <= M
+ *  d->arch.p2m->pod.entry_count == B - P
+ *  d->tot_pages == P + d->arch.p2m->pod.count
+ *
+ * Now we have the following potential cases to cover:
+ *  B < T'     : Set the PoD cache size equal to the number of outstanding PoD
+ *   entries.  The balloon driver will deflate the balloon to give back
+ *   the remainder of the ram to the guest OS.
+ *  T < T' < B : Increase the PoD cache size.
+ *  T' < T <= B: Here we have a choice.  We could decrease the size of the
+ *   cache and get the memory back right away.  However, that means every time
+ *   we reduce the memory target we risk the guest attempting to populate the
+ *   memory before the balloon driver has reached its new target.  It is safer
+ *   never to reduce the cache size here, and only to do so when the balloon
+ *   driver frees PoD ranges.
+ *
+ * If there are many zero pages, we could reach the target also by doing
+ * zero sweeps and marking the ranges PoD; but the balloon driver will have
+ * to free this memory eventually anyway, so we don't actually gain that much
+ * by doing so.
+ *
+ * NB that the (B < T') case may require adjustment to the cache
+ * size as PoD pages are freed as well; i.e., freeing a PoD-backed
+ * entry when pod.entry_count == pod.count requires us to reduce both
+ * pod.entry_count and pod.count.
+ */
+int
+p2m_pod_set_mem_target(struct domain *d, unsigned long target)
+{
+    unsigned pod_target;
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+    int ret = 0;
+    unsigned long populated;
+
+    p2m_lock(p2m);
+
+    /* P == B: Nothing to do. */
+    if ( p2m->pod.entry_count == 0 )
+        goto out;
+
+    /* Don't do anything if the domain is being torn down */
+    if ( d->is_dying )
+        goto out;
+
+    /* T' < B: Don't reduce the cache size; let the balloon driver
+     * take care of it. */
+    if ( target < d->tot_pages )
+        goto out;
+
+    populated  = d->tot_pages - p2m->pod.count;
+
+    pod_target = target - populated;
+
+    /* B < T': Set the cache size equal to # of outstanding entries,
+     * let the balloon driver fill in the rest. */
+    if ( pod_target > p2m->pod.entry_count )
+        pod_target = p2m->pod.entry_count;
+
+    ASSERT( pod_target >= p2m->pod.count );
+
+    ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
+
+out:
+    p2m_unlock(p2m);
+
+    return ret;
+}
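+
+/* Worked example of the accounting above (illustrative numbers only):
+ * suppose M = 1024, the guest has ballooned to B = 768 and has populated
+ * P = 512 pages, so pod.entry_count = B - P = 256; with a full cache,
+ * pod.count = 256 and d->tot_pages = P + pod.count = 768.  A new target
+ * T' = 896 gives populated = tot_pages - pod.count = 512 and
+ * pod_target = T' - populated = 384, which is then clamped to
+ * entry_count = 256: the cache never needs to cover more than the
+ * outstanding PoD entries, and the balloon driver supplies the rest. */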
+
+void
+p2m_pod_empty_cache(struct domain *d)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+    struct page_info *page;
+
+    /* After this barrier no new PoD activities can happen. */
+    BUG_ON(!d->is_dying);
+    spin_barrier(&p2m->lock);
+
+    spin_lock(&d->page_alloc_lock);
+
+    while ( (page = page_list_remove_head(&p2m->pod.super)) )
+    {
+        int i;
+            
+        for ( i = 0 ; i < SUPERPAGE_PAGES ; i++ )
+        {
+            BUG_ON(page_get_owner(page + i) != d);
+            page_list_add_tail(page + i, &d->page_list);
+        }
+
+        p2m->pod.count -= SUPERPAGE_PAGES;
+    }
+
+    while ( (page = page_list_remove_head(&p2m->pod.single)) )
+    {
+        BUG_ON(page_get_owner(page) != d);
+        page_list_add_tail(page, &d->page_list);
+
+        p2m->pod.count -= 1;
+    }
+
+    BUG_ON(p2m->pod.count != 0);
+
+    spin_unlock(&d->page_alloc_lock);
+}
+
+int
+p2m_pod_offline_or_broken_hit(struct page_info *p)
+{
+    struct domain *d;
+    struct p2m_domain *p2m;
+    struct page_info *q, *tmp;
+    unsigned long mfn, bmfn;
+
+    if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) )
+        return 0;
+
+    spin_lock(&d->page_alloc_lock);
+    bmfn = mfn_x(page_to_mfn(p));
+    page_list_for_each_safe(q, tmp, &p2m->pod.super)
+    {
+        mfn = mfn_x(page_to_mfn(q));
+        if ( (bmfn >= mfn) && ((bmfn - mfn) < SUPERPAGE_PAGES) )
+        {
+            unsigned long i;
+            page_list_del(q, &p2m->pod.super);
+            for ( i = 0; i < SUPERPAGE_PAGES; i++)
+            {
+                q = mfn_to_page(_mfn(mfn + i));
+                page_list_add_tail(q, &p2m->pod.single);
+            }
+            page_list_del(p, &p2m->pod.single);
+            p2m->pod.count--;
+            goto pod_hit;
+        }
+    }
+
+    page_list_for_each_safe(q, tmp, &p2m->pod.single)
+    {
+        mfn = mfn_x(page_to_mfn(q));
+        if ( mfn == bmfn )
+        {
+            page_list_del(p, &p2m->pod.single);
+            p2m->pod.count--;
+            goto pod_hit;
+        }
+    }
+
+    spin_unlock(&d->page_alloc_lock);
+    return 0;
+
+pod_hit:
+    page_list_add_tail(p, &d->arch.relmem_list);
+    spin_unlock(&d->page_alloc_lock);
+    return 1;
+}
+
+void
+p2m_pod_offline_or_broken_replace(struct page_info *p)
+{
+    struct domain *d;
+    struct p2m_domain *p2m;
+
+    if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) )
+        return;
+
+    free_domheap_page(p);
+
+    p = alloc_domheap_page(d, 0);
+    if ( unlikely(!p) )
+        return;
+
+    p2m_lock(p2m);
+    p2m_pod_cache_add(p2m, p, 0);
+    p2m_unlock(p2m);
+    return;
+}
+
+/* This function is needed for two reasons:
+ * + To properly handle clearing of PoD entries
+ * + To "steal back" memory being freed for the PoD cache, rather than
+ *   releasing it.
+ *
+ * Once both of these tasks have been completed, we can return and
+ * allow decrease_reservation() to handle everything else.
+ */
+int
+p2m_pod_decrease_reservation(struct domain *d,
+                             xen_pfn_t gpfn,
+                             unsigned int order)
+{
+    int ret=0;
+    int i;
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+    int steal_for_cache = 0;
+    int pod = 0, nonpod = 0, ram = 0;
+    
+
+    /* If we don't have any outstanding PoD entries, let things take their
+     * course */
+    if ( p2m->pod.entry_count == 0 )
+        goto out;
+
+    /* Figure out if we need to steal some freed memory for our cache */
+    steal_for_cache =  ( p2m->pod.entry_count > p2m->pod.count );
+
+    p2m_lock(p2m);
+    audit_p2m(p2m, 1);
+
+    if ( unlikely(d->is_dying) )
+        goto out_unlock;
+
+    /* See what's in here. */
+    /* FIXME: Add contiguous; query for PSE entries? */
+    for ( i=0; i<(1<<order); i++)
+    {
+        p2m_type_t t;
+
+        gfn_to_mfn_query(p2m, gpfn + i, &t);
+
+        if ( t == p2m_populate_on_demand )
+            pod++;
+        else
+        {
+            nonpod++;
+            if ( p2m_is_ram(t) )
+                ram++;
+        }
+    }
+
+    /* No populate-on-demand?  Don't need to steal anything?  Then we're done! */
+    if(!pod && !steal_for_cache)
+        goto out_unlock;
+
+    if ( !nonpod )
+    {
+        /* All PoD: Mark the whole region invalid and tell caller
+         * we're done. */
+        set_p2m_entry(p2m, gpfn, _mfn(INVALID_MFN), order, p2m_invalid, p2m->default_access);
+        p2m->pod.entry_count-=(1<<order); /* Lock: p2m */
+        BUG_ON(p2m->pod.entry_count < 0);
+        ret = 1;
+        goto out_entry_check;
+    }
+
+    /* FIXME: Steal contig 2-meg regions for cache */
+
+    /* Process as long as:
+     * + There are PoD entries to handle, or
+     * + There is ram left, and we want to steal it
+     */
+    for ( i=0;
+          i<(1<<order) && (pod>0 || (steal_for_cache && ram > 0));
+          i++)
+    {
+        mfn_t mfn;
+        p2m_type_t t;
+
+        mfn = gfn_to_mfn_query(p2m, gpfn + i, &t);
+        if ( t == p2m_populate_on_demand )
+        {
+            set_p2m_entry(p2m, gpfn + i, _mfn(INVALID_MFN), 0, p2m_invalid, p2m->default_access);
+            p2m->pod.entry_count--; /* Lock: p2m */
+            BUG_ON(p2m->pod.entry_count < 0);
+            pod--;
+        }
+        else if ( steal_for_cache && p2m_is_ram(t) )
+        {
+            struct page_info *page;
+
+            ASSERT(mfn_valid(mfn));
+
+            page = mfn_to_page(mfn);
+
+            set_p2m_entry(p2m, gpfn + i, _mfn(INVALID_MFN), 0, p2m_invalid, p2m->default_access);
+            set_gpfn_from_mfn(mfn_x(mfn), INVALID_M2P_ENTRY);
+
+            p2m_pod_cache_add(p2m, page, 0);
+
+            steal_for_cache =  ( p2m->pod.entry_count > p2m->pod.count );
+
+            nonpod--;
+            ram--;
+        }
+    }    
+
+    /* If there are no more non-PoD entries, tell decrease_reservation() that
+     * there's nothing left to do. */
+    if ( nonpod == 0 )
+        ret = 1;
+
+out_entry_check:
+    /* If we've reduced our "liabilities" beyond our "assets", free some */
+    if ( p2m->pod.entry_count < p2m->pod.count )
+    {
+        p2m_pod_set_cache_target(p2m, p2m->pod.entry_count, 0/*can't preempt*/);
+    }
+
+out_unlock:
+    audit_p2m(p2m, 1);
+    p2m_unlock(p2m);
+
+out:
+    return ret;
+}
+
+void
+p2m_pod_dump_data(struct p2m_domain *p2m)
+{
+    printk("    PoD entries=%d cachesize=%d\n",
+           p2m->pod.entry_count, p2m->pod.count);
+}
+
+
+/* Search for all-zero superpages to be reclaimed as superpages for the
+ * PoD cache. Must be called w/ p2m lock held, page_alloc lock not held. */
+static int
+p2m_pod_zero_check_superpage(struct p2m_domain *p2m, unsigned long gfn)
+{
+    mfn_t mfn, mfn0 = _mfn(INVALID_MFN);
+    p2m_type_t type, type0 = 0;
+    unsigned long * map = NULL;
+    int ret=0, reset = 0;
+    int i, j;
+    int max_ref = 1;
+    struct domain *d = p2m->domain;
+
+    if ( !superpage_aligned(gfn) )
+        goto out;
+
+    /* Allow an extra refcount for one shadow pt mapping in shadowed domains */
+    if ( paging_mode_shadow(d) )
+        max_ref++;
+
+    /* Look up the mfns, checking to make sure they're the same mfn
+     * and aligned, and mapping them. */
+    for ( i=0; i<SUPERPAGE_PAGES; i++ )
+    {
+        
+        mfn = gfn_to_mfn_query(p2m, gfn + i, &type);
+
+        if ( i == 0 )
+        {
+            mfn0 = mfn;
+            type0 = type;
+        }
+
+        /* Conditions that must be met for superpage-superpage:
+         * + All gfns are ram types
+         * + All gfns have the same type
+         * + All of the mfns are allocated to a domain
+         * + None of the mfns are used as pagetables, or allocated via xenheap
+         * + The first mfn is 2-meg aligned
+         * + All the other mfns are in sequence
+         * Adding for good measure:
+         * + None of the mfns are likely to be mapped elsewhere (refcount
+         *   2 or less for shadow, 1 for hap)
+         */
+        if ( !p2m_is_ram(type)
+             || type != type0
+             || ( (mfn_to_page(mfn)->count_info & PGC_allocated) == 0 )
+             || ( (mfn_to_page(mfn)->count_info & (PGC_page_table|PGC_xen_heap)) != 0 )
+             || ( (mfn_to_page(mfn)->count_info & PGC_xen_heap  ) != 0 )
+             || ( (mfn_to_page(mfn)->count_info & PGC_count_mask) > max_ref )
+             || !( ( i == 0 && superpage_aligned(mfn_x(mfn0)) )
+                   || ( i != 0 && mfn_x(mfn) == (mfn_x(mfn0) + i) ) ) )
+            goto out;
+    }
+
+    /* Now, do a quick check to see if it may be zero before unmapping. */
+    for ( i=0; i<SUPERPAGE_PAGES; i++ )
+    {
+        /* Quick zero-check */
+        map = map_domain_page(mfn_x(mfn0) + i);
+
+        for ( j=0; j<16; j++ )
+            if( *(map+j) != 0 )
+                break;
+
+        unmap_domain_page(map);
+
+        if ( j < 16 )
+            goto out;
+
+    }
+
+    /* Try to remove the page, restoring old mapping if it fails. */
+    set_p2m_entry(p2m, gfn,
+                  _mfn(POPULATE_ON_DEMAND_MFN), 9,
+                  p2m_populate_on_demand, p2m->default_access);
+
+    /* Make sure none of the MFNs are used elsewhere... for example, mapped
+     * via the grant table interface, or by qemu.  Allow one refcount for
+     * being allocated to the domain. */
+    for ( i=0; i < SUPERPAGE_PAGES; i++ )
+    {
+        mfn = _mfn(mfn_x(mfn0) + i);
+        if ( (mfn_to_page(mfn)->count_info & PGC_count_mask) > 1 )
+        {
+            reset = 1;
+            goto out_reset;
+        }
+    }
+
+    /* Finally, do a full zero-check */
+    for ( i=0; i < SUPERPAGE_PAGES; i++ )
+    {
+        map = map_domain_page(mfn_x(mfn0) + i);
+
+        for ( j=0; j<PAGE_SIZE/sizeof(*map); j++ )
+            if( *(map+j) != 0 )
+            {
+                reset = 1;
+                break;
+            }
+
+        unmap_domain_page(map);
+
+        if ( reset )
+            goto out_reset;
+    }
+
+    if ( tb_init_done )
+    {
+        struct {
+            u64 gfn, mfn;
+            int d:16,order:16;
+        } t;
+
+        t.gfn = gfn;
+        t.mfn = mfn_x(mfn);
+        t.d = d->domain_id;
+        t.order = 9;
+
+        __trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), &t);
+    }
+
+    /* Finally!  We've passed all the checks, and can add the mfn superpage
+     * back on the PoD cache, and account for the new p2m PoD entries */
+    p2m_pod_cache_add(p2m, mfn_to_page(mfn0), 9);
+    p2m->pod.entry_count += SUPERPAGE_PAGES;
+
+out_reset:
+    if ( reset )
+        set_p2m_entry(p2m, gfn, mfn0, 9, type0, p2m->default_access);
+    
+out:
+    return ret;
+}
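+
+/* Illustrative restatement (in #if 0, not built) of the per-page count_info
+ * tests used in the reclaim checks above; the type, alignment and contiguity
+ * checks are omitted here, and the helper name is made up. */
+#if 0
+static int example_page_reclaimable(struct page_info *pg, int max_ref)
+{
+    return (pg->count_info & PGC_allocated) != 0 &&
+           (pg->count_info & (PGC_page_table | PGC_xen_heap)) == 0 &&
+           (pg->count_info & PGC_count_mask) <= max_ref;
+}
+#endif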
+
+static void
+p2m_pod_zero_check(struct p2m_domain *p2m, unsigned long *gfns, int count)
+{
+    mfn_t mfns[count];
+    p2m_type_t types[count];
+    unsigned long * map[count];
+    struct domain *d = p2m->domain;
+
+    int i, j;
+    int max_ref = 1;
+
+    /* Allow an extra refcount for one shadow pt mapping in shadowed domains */
+    if ( paging_mode_shadow(d) )
+        max_ref++;
+
+    /* First, get the gfn list, translate to mfns, and map the pages. */
+    for ( i=0; i<count; i++ )
+    {
+        mfns[i] = gfn_to_mfn_query(p2m, gfns[i], types + i);
+        /* If this is ram, and not a pagetable or from the xen heap, and probably not mapped
+           elsewhere, map it; otherwise, skip. */
+        if ( p2m_is_ram(types[i])
+             && ( (mfn_to_page(mfns[i])->count_info & PGC_allocated) != 0 ) 
+             && ( (mfn_to_page(mfns[i])->count_info & (PGC_page_table|PGC_xen_heap)) == 0 )
+             && ( (mfn_to_page(mfns[i])->count_info & PGC_count_mask) <= max_ref ) )
+            map[i] = map_domain_page(mfn_x(mfns[i]));
+        else
+            map[i] = NULL;
+    }
+
+    /* Then, go through and check for zeroed pages, removing write permission
+     * for those with zeroes. */
+    for ( i=0; i<count; i++ )
+    {
+        if(!map[i])
+            continue;
+
+        /* Quick zero-check */
+        for ( j=0; j<16; j++ )
+            if( *(map[i]+j) != 0 )
+                break;
+
+        if ( j < 16 )
+        {
+            unmap_domain_page(map[i]);
+            map[i] = NULL;
+            continue;
+        }
+
+        /* Try to remove the page, restoring old mapping if it fails. */
+        set_p2m_entry(p2m, gfns[i],
+                      _mfn(POPULATE_ON_DEMAND_MFN), 0,
+                      p2m_populate_on_demand, p2m->default_access);
+
+        /* See if the page was successfully unmapped.  (Allow one refcount
+         * for being allocated to a domain.) */
+        if ( (mfn_to_page(mfns[i])->count_info & PGC_count_mask) > 1 )
+        {
+            unmap_domain_page(map[i]);
+            map[i] = NULL;
+
+            set_p2m_entry(p2m, gfns[i], mfns[i], 0, types[i], p2m->default_access);
+
+            continue;
+        }
+    }
+
+    /* Now check each page for real */
+    for ( i=0; i < count; i++ )
+    {
+        if(!map[i])
+            continue;
+
+        for ( j=0; j<PAGE_SIZE/sizeof(*map[i]); j++ )
+            if( *(map[i]+j) != 0 )
+                break;
+
+        unmap_domain_page(map[i]);
+
+        /* See comment in p2m_pod_zero_check_superpage() re gnttab
+         * check timing.  */
+        if ( j < PAGE_SIZE/sizeof(*map[i]) )
+        {
+            set_p2m_entry(p2m, gfns[i], mfns[i], 0, types[i], p2m->default_access);
+        }
+        else
+        {
+            if ( tb_init_done )
+            {
+                struct {
+                    u64 gfn, mfn;
+                    int d:16,order:16;
+                } t;
+
+                t.gfn = gfns[i];
+                t.mfn = mfn_x(mfns[i]);
+                t.d = d->domain_id;
+                t.order = 0;
+        
+                __trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), &t);
+            }
+
+            /* Add to cache, and account for the new p2m PoD entry */
+            p2m_pod_cache_add(p2m, mfn_to_page(mfns[i]), 0);
+            p2m->pod.entry_count++;
+        }
+    }
+    
+}
+
+#define POD_SWEEP_LIMIT 1024
+static void
+p2m_pod_emergency_sweep_super(struct p2m_domain *p2m)
+{
+    unsigned long i, start, limit;
+
+    if ( p2m->pod.reclaim_super == 0 )
+    {
+        p2m->pod.reclaim_super = (p2m->pod.max_guest>>9)<<9;
+        p2m->pod.reclaim_super -= SUPERPAGE_PAGES;
+    }
+    
+    start = p2m->pod.reclaim_super;
+    limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;
+
+    for ( i=p2m->pod.reclaim_super ; i > 0 ; i -= SUPERPAGE_PAGES )
+    {
+        p2m_pod_zero_check_superpage(p2m, i);
+        /* Stop if we're past our limit and we have found *something*.
+         *
+         * NB that this is a zero-sum game; we're increasing our cache size
+         * by increasing our 'debt'.  Since we hold the p2m lock,
+         * (entry_count - count) must remain the same. */
+        if ( !page_list_empty(&p2m->pod.super) &&  i < limit )
+            break;
+    }
+
+    p2m->pod.reclaim_super = i ? i - SUPERPAGE_PAGES : 0;
+}
+
+#define POD_SWEEP_STRIDE  16
+static void
+p2m_pod_emergency_sweep(struct p2m_domain *p2m)
+{
+    unsigned long gfns[POD_SWEEP_STRIDE];
+    unsigned long i, j=0, start, limit;
+    p2m_type_t t;
+
+
+    if ( p2m->pod.reclaim_single == 0 )
+        p2m->pod.reclaim_single = p2m->pod.max_guest;
+
+    start = p2m->pod.reclaim_single;
+    limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;
+
+    /* FIXME: Figure out how to avoid superpages */
+    for ( i=p2m->pod.reclaim_single; i > 0 ; i-- )
+    {
+        gfn_to_mfn_query(p2m, i, &t );
+        if ( p2m_is_ram(t) )
+        {
+            gfns[j] = i;
+            j++;
+            BUG_ON(j > POD_SWEEP_STRIDE);
+            if ( j == POD_SWEEP_STRIDE )
+            {
+                p2m_pod_zero_check(p2m, gfns, j);
+                j = 0;
+            }
+        }
+        /* Stop if we're past our limit and we have found *something*.
+         *
+         * NB that this is a zero-sum game; we're increasing our cache size
+         * by re-increasing our 'debt'.  Since we hold the p2m lock,
+         * (entry_count - count) must remain the same. */
+        if ( p2m->pod.count > 0 && i < limit )
+            break;
+    }
+
+    if ( j )
+        p2m_pod_zero_check(p2m, gfns, j);
+
+    p2m->pod.reclaim_single = i ? i - 1 : i;
+
+}
+
+int
+p2m_pod_demand_populate(struct p2m_domain *p2m, unsigned long gfn,
+                        unsigned int order,
+                        p2m_query_t q)
+{
+    struct domain *d = p2m->domain;
+    struct page_info *p = NULL; /* Compiler warnings */
+    unsigned long gfn_aligned;
+    mfn_t mfn;
+    int i;
+
+    ASSERT(p2m_locked_by_me(p2m));
+
+    /* This check is done with the p2m lock held.  This will make sure that
+     * even if d->is_dying changes under our feet, p2m_pod_empty_cache() 
+     * won't start until we're done. */
+    if ( unlikely(d->is_dying) )
+        goto out_fail;
+
+    /* Because PoD does not keep a cache list for 1GB pages, it has to remap
+     * the 1GB region as 2MB chunks and retry. */
+    if ( order == 18 )
+    {
+        gfn_aligned = (gfn >> order) << order;
+        /* Note that we are supposed to call set_p2m_entry() 512 times to
+         * split the 1GB region into 512 2MB pages here, but we only call it
+         * once: set_p2m_entry() automatically shatters the 1GB page into
+         * 512 2MB pages, so the remaining 511 calls are unnecessary.
+         */
+        set_p2m_entry(p2m, gfn_aligned, _mfn(POPULATE_ON_DEMAND_MFN), 9,
+                      p2m_populate_on_demand, p2m->default_access);
+        audit_p2m(p2m, 1);
+        p2m_unlock(p2m);
+        return 0;
+    }
+
+    /* Once we've ballooned down enough that we can fill the remaining
+     * PoD entries from the cache, don't sweep even if the particular
+     * list we want to use is empty: that can lead to thrashing zero pages 
+     * through the cache for no good reason.  */
+    if ( p2m->pod.entry_count > p2m->pod.count )
+    {
+
+        /* If we're low, start a sweep */
+        if ( order == 9 && page_list_empty(&p2m->pod.super) )
+            p2m_pod_emergency_sweep_super(p2m);
+
+        if ( page_list_empty(&p2m->pod.single) &&
+             ( ( order == 0 )
+               || (order == 9 && page_list_empty(&p2m->pod.super) ) ) )
+            p2m_pod_emergency_sweep(p2m);
+    }
+
+    /* Keep track of the highest gfn demand-populated by a guest fault */
+    if ( q == p2m_guest && gfn > p2m->pod.max_guest )
+        p2m->pod.max_guest = gfn;
+
+    spin_lock(&d->page_alloc_lock);
+
+    if ( p2m->pod.count == 0 )
+        goto out_of_memory;
+
+    /* Get a page from the cache.  A NULL return value indicates that the
+     * 2-meg range should be marked singleton PoD, and retried */
+    if ( (p = p2m_pod_cache_get(p2m, order)) == NULL )
+        goto remap_and_retry;
+
+    mfn = page_to_mfn(p);
+
+    BUG_ON((mfn_x(mfn) & ((1 << order)-1)) != 0);
+
+    spin_unlock(&d->page_alloc_lock);
+
+    gfn_aligned = (gfn >> order) << order;
+
+    set_p2m_entry(p2m, gfn_aligned, mfn, order, p2m_ram_rw, 
p2m->default_access);
+
+    for( i = 0; i < (1UL << order); i++ )
+    {
+        set_gpfn_from_mfn(mfn_x(mfn) + i, gfn_aligned + i);
+        paging_mark_dirty(d, mfn_x(mfn) + i);
+    }
+    
+    p2m->pod.entry_count -= (1 << order); /* Lock: p2m */
+    BUG_ON(p2m->pod.entry_count < 0);
+
+    if ( tb_init_done )
+    {
+        struct {
+            u64 gfn, mfn;
+            int d:16,order:16;
+        } t;
+
+        t.gfn = gfn;
+        t.mfn = mfn_x(mfn);
+        t.d = d->domain_id;
+        t.order = order;
+        
+        __trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), &t);
+    }
+
+    return 0;
+out_of_memory:
+    spin_unlock(&d->page_alloc_lock);
+
+    printk("%s: Out of populate-on-demand memory! tot_pages %" PRIu32 " pod_entries %" PRIi32 "\n",
+           __func__, d->tot_pages, p2m->pod.entry_count);
+    domain_crash(d);
+out_fail:
+    return -1;
+remap_and_retry:
+    BUG_ON(order != 9);
+    spin_unlock(&d->page_alloc_lock);
+
+    /* Remap this 2-meg region in singleton chunks */
+    gfn_aligned = (gfn>>order)<<order;
+    for(i=0; i<(1<<order); i++)
+        set_p2m_entry(p2m, gfn_aligned+i, _mfn(POPULATE_ON_DEMAND_MFN), 0,
+                      p2m_populate_on_demand, p2m->default_access);
+    if ( tb_init_done )
+    {
+        struct {
+            u64 gfn;
+            int d:16;
+        } t;
+
+        t.gfn = gfn;
+        t.d = d->domain_id;
+        
+        __trace_var(TRC_MEM_POD_SUPERPAGE_SPLINTER, 0, sizeof(t), &t);
+    }
+
+    return 0;
+}
+
+
+int
+guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
+                                      unsigned int order)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+    unsigned long i;
+    p2m_type_t ot;
+    mfn_t omfn;
+    int pod_count = 0;
+    int rc = 0;
+
+    BUG_ON(!paging_mode_translate(d));
+
+    rc = p2m_gfn_check_limit(d, gfn, order);
+    if ( rc != 0 )
+        return rc;
+
+    p2m_lock(p2m);
+    audit_p2m(p2m, 1);
+
+    P2M_DEBUG("mark pod gfn=%#lx\n", gfn);
+
+    /* Make sure all gpfns are unused */
+    for ( i = 0; i < (1UL << order); i++ )
+    {
+        omfn = gfn_to_mfn_query(p2m, gfn + i, &ot);
+        if ( p2m_is_ram(ot) )
+        {
+            printk("%s: gfn_to_mfn returned type %d!\n",
+                   __func__, ot);
+            rc = -EBUSY;
+            goto out;
+        }
+        else if ( ot == p2m_populate_on_demand )
+        {
+            /* Count how many PoD entries we'll be replacing if successful */
+            pod_count++;
+        }
+    }
+
+    /* Now, actually do the two-way mapping */
+    if ( !set_p2m_entry(p2m, gfn, _mfn(POPULATE_ON_DEMAND_MFN), order,
+                        p2m_populate_on_demand, p2m->default_access) )
+        rc = -EINVAL;
+    else
+    {
+        p2m->pod.entry_count += 1 << order; /* Lock: p2m */
+        p2m->pod.entry_count -= pod_count;
+        BUG_ON(p2m->pod.entry_count < 0);
+    }
+
+    audit_p2m(p2m, 1);
+    p2m_unlock(p2m);
+
+out:
+    return rc;
+}
+
diff -r d9982136d8fa -r 19452acd2304 xen/arch/x86/mm/p2m-pt.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/mm/p2m-pt.c  Fri May 06 11:15:35 2011 +0100
@@ -0,0 +1,1301 @@
+/******************************************************************************
+ * arch/x86/mm/p2m-pt.c
+ *
+ * Implementation of p2m datastructures as pagetables, for use by 
+ * NPT and shadow-pagetable code
+ *
+ * Parts of this code are Copyright (c) 2009-2011 by Citrix Systems, Inc.
+ * Parts of this code are Copyright (c) 2007 by Advanced Micro Devices.
+ * Parts of this code are Copyright (c) 2006-2007 by XenSource Inc.
+ * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
+ * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <asm/domain.h>
+#include <asm/page.h>
+#include <asm/paging.h>
+#include <asm/p2m.h>
+#include <xen/iommu.h>
+#include <asm/mem_event.h>
+#include <public/mem_event.h>
+#include <asm/mem_sharing.h>
+#include <xen/event.h>
+#include <xen/trace.h>
+#include <asm/hvm/nestedhvm.h>
+#include <asm/hvm/svm/amd-iommu-proto.h>
+
+/* Debugging and auditing of the P2M code? */
+#define P2M_AUDIT     0
+#define P2M_DEBUGGING 0
+
+/* Printouts */
+#define P2M_PRINTK(_f, _a...)                                \
+    debugtrace_printk("p2m: %s(): " _f, __func__, ##_a)
+#define P2M_ERROR(_f, _a...)                                 \
+    printk("pg error: %s(): " _f, __func__, ##_a)
+#if P2M_DEBUGGING
+#define P2M_DEBUG(_f, _a...)                                 \
+    debugtrace_printk("p2mdebug: %s(): " _f, __func__, ##_a)
+#else
+#define P2M_DEBUG(_f, _a...) do { (void)(_f); } while(0)
+#endif
+
+
+/* Override macros from asm/page.h to make them work with mfn_t */
+#undef mfn_to_page
+#define mfn_to_page(_m) __mfn_to_page(mfn_x(_m))
+#undef mfn_valid
+#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
+#undef page_to_mfn
+#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
+
+
+/* PTE flags for the various types of p2m entry */
+#define P2M_BASE_FLAGS \
+        (_PAGE_PRESENT | _PAGE_USER | _PAGE_DIRTY | _PAGE_ACCESSED)
+
+#define SUPERPAGE_PAGES (1UL << 9)
+#define superpage_aligned(_x)  (((_x)&(SUPERPAGE_PAGES-1))==0)
+
+unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
+{
+    unsigned long flags;
+#ifdef __x86_64__
+    /*
+     * AMD IOMMU: When we share the p2m table with the iommu, bits 9 - 11 are
+     * used by the iommu hardware to encode the next io page level, and bits
+     * 59 - 62 are used for iommu flags, so we cannot use these bits to store
+     * p2m types.
+     */
+    flags = (unsigned long)(t & 0x7f) << 12;
+#else
+    flags = (t & 0x7UL) << 9;
+#endif
+#ifndef HAVE_GRANT_MAP_P2M
+    BUG_ON(p2m_is_grant(t));
+#endif
+    switch(t)
+    {
+    case p2m_invalid:
+    default:
+        return flags;
+    case p2m_ram_rw:
+    case p2m_grant_map_rw:
+        return flags | P2M_BASE_FLAGS | _PAGE_RW;
+    case p2m_ram_logdirty:
+        return flags | P2M_BASE_FLAGS;
+    case p2m_ram_ro:
+    case p2m_grant_map_ro:
+        return flags | P2M_BASE_FLAGS;
+    case p2m_ram_shared:
+        return flags | P2M_BASE_FLAGS;
+    case p2m_mmio_dm:
+        return flags;
+    case p2m_mmio_direct:
+        if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) )
+            flags |= _PAGE_RW;
+        return flags | P2M_BASE_FLAGS | _PAGE_PCD;
+    case p2m_populate_on_demand:
+        return flags;
+    }
+}
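+
+/* Sketch of the inverse mapping (in #if 0, not built), just to make the bit
+ * layout above explicit: 64-bit builds keep the type in bits 12 and up
+ * (bits 9 - 11 and 59 - 62 belong to the shared AMD IOMMU pagetable format),
+ * while 32-bit builds keep it in bits 9 - 11.  The real decoder is
+ * p2m_flags_to_type(), defined outside this file; the name below is made up. */
+#if 0
+static p2m_type_t example_flags_to_type(unsigned long flags)
+{
+#ifdef __x86_64__
+    return (flags >> 12) & 0x7f;
+#else
+    return (flags >> 9) & 0x7;
+#endif
+}
+#endif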
+
+#if P2M_AUDIT
+void audit_p2m(struct p2m_domain *p2m, int strict_m2p);
+#else
+# define audit_p2m(_p2m, _m2p) do { (void)(_p2m),(_m2p); } while (0)
+#endif /* P2M_AUDIT */
+
+// Find the next level's P2M entry, checking for out-of-range gfn's...
+// Returns NULL on error.
+//
+l1_pgentry_t *
+p2m_find_entry(void *table, unsigned long *gfn_remainder,
+                   unsigned long gfn, uint32_t shift, uint32_t max)
+{
+    u32 index;
+
+    index = *gfn_remainder >> shift;
+    if ( index >= max )
+    {
+        P2M_DEBUG("gfn=0x%lx out of range "
+                  "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n",
+                  gfn, *gfn_remainder, shift, index, max);
+        return NULL;
+    }
+    *gfn_remainder &= (1 << shift) - 1;
+    return (l1_pgentry_t *)table + index;
+}
+
+struct page_info *
+p2m_alloc_ptp(struct p2m_domain *p2m, unsigned long type)
+{
+    struct page_info *pg;
+
+    ASSERT(p2m);
+    ASSERT(p2m->domain);
+    ASSERT(p2m->domain->arch.paging.alloc_page);
+    pg = p2m->domain->arch.paging.alloc_page(p2m->domain);
+    if (pg == NULL)
+        return NULL;
+
+    page_list_add_tail(pg, &p2m->pages);
+    pg->u.inuse.type_info = type | 1 | PGT_validated;
+
+    return pg;
+}
+
+void
+p2m_free_ptp(struct p2m_domain *p2m, struct page_info *pg)
+{
+    ASSERT(pg);
+    ASSERT(p2m);
+    ASSERT(p2m->domain);
+    ASSERT(p2m->domain->arch.paging.free_page);
+
+    page_list_del(pg, &p2m->pages);
+    p2m->domain->arch.paging.free_page(p2m->domain, pg);
+
+    return;
+}
+
+/* Free intermediate tables from a p2m sub-tree */
+void
+p2m_free_entry(struct p2m_domain *p2m, l1_pgentry_t *p2m_entry, int page_order)
+{
+    /* End if the entry is a leaf entry. */
+    if ( page_order == 0
+         || !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT)
+         || (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+        return;
+
+    if ( page_order > 9 )
+    {
+        l1_pgentry_t *l3_table = map_domain_page(l1e_get_pfn(*p2m_entry));
+        for ( int i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
+            p2m_free_entry(p2m, l3_table + i, page_order - 9);
+        unmap_domain_page(l3_table);
+    }
+
+    p2m_free_ptp(p2m, mfn_to_page(_mfn(l1e_get_pfn(*p2m_entry))));
+}
+
+// Walk one level of the P2M table, allocating a new table if required.
+// Returns 0 on error.
+//
+
+/* AMD IOMMU: Convert next level bits and r/w bits into 24 bits p2m flags */
+#define iommu_nlevel_to_flags(nl, f) ((((nl) & 0x7) << 9 )|(((f) & 0x3) << 21))
+
+static void p2m_add_iommu_flags(l1_pgentry_t *p2m_entry,
+                                unsigned int nlevel, unsigned int flags)
+{
+#if CONFIG_PAGING_LEVELS == 4
+    if ( iommu_hap_pt_share )
+        l1e_add_flags(*p2m_entry, iommu_nlevel_to_flags(nlevel, flags));
+#endif
+}
+
+static int
+p2m_next_level(struct p2m_domain *p2m, mfn_t *table_mfn, void **table,
+               unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
+               u32 max, unsigned long type)
+{
+    l1_pgentry_t *l1_entry;
+    l1_pgentry_t *p2m_entry;
+    l1_pgentry_t new_entry;
+    void *next;
+    int i;
+
+    if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
+                                      shift, max)) )
+        return 0;
+
+    /* PoD: Not present doesn't imply empty. */
+    if ( !l1e_get_flags(*p2m_entry) )
+    {
+        struct page_info *pg;
+
+        pg = p2m_alloc_ptp(p2m, type);
+        if ( pg == NULL )
+            return 0;
+
+        new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
+                                 __PAGE_HYPERVISOR | _PAGE_USER);
+
+        switch ( type ) {
+        case PGT_l3_page_table:
+            p2m_add_iommu_flags(&new_entry, 3, IOMMUF_readable|IOMMUF_writable);
+            p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 4);
+            break;
+        case PGT_l2_page_table:
+#if CONFIG_PAGING_LEVELS == 3
+            /* for PAE mode, PDPE only has PCD/PWT/P bits available */
+            new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), _PAGE_PRESENT);
+#endif
+            p2m_add_iommu_flags(&new_entry, 2, IOMMUF_readable|IOMMUF_writable);
+            p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3);
+            break;
+        case PGT_l1_page_table:
+            p2m_add_iommu_flags(&new_entry, 1, IOMMUF_readable|IOMMUF_writable);
+            p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 2);
+            break;
+        default:
+            BUG();
+            break;
+        }
+    }
+
+    ASSERT(l1e_get_flags(*p2m_entry) & (_PAGE_PRESENT|_PAGE_PSE));
+
+    /* split 1GB pages into 2MB pages */
+    if ( type == PGT_l2_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+    {
+        unsigned long flags, pfn;
+        struct page_info *pg;
+
+        pg = p2m_alloc_ptp(p2m, PGT_l2_page_table);
+        if ( pg == NULL )
+            return 0;
+
+        flags = l1e_get_flags(*p2m_entry);
+        pfn = l1e_get_pfn(*p2m_entry);
+
+        l1_entry = map_domain_page(mfn_x(page_to_mfn(pg)));
+        for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
+        {
+            new_entry = l1e_from_pfn(pfn + (i * L1_PAGETABLE_ENTRIES), flags);
+            p2m_add_iommu_flags(&new_entry, 1, IOMMUF_readable|IOMMUF_writable);
+            p2m->write_p2m_entry(p2m, gfn,
+                l1_entry+i, *table_mfn, new_entry, 2);
+        }
+        unmap_domain_page(l1_entry);
+        new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
+                                 __PAGE_HYPERVISOR|_PAGE_USER); //disable PSE
+        p2m_add_iommu_flags(&new_entry, 2, IOMMUF_readable|IOMMUF_writable);
+        p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3);
+    }
+
+
+    /* split single 2MB large page into 4KB page in P2M table */
+    if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+    {
+        unsigned long flags, pfn;
+        struct page_info *pg;
+
+        pg = p2m_alloc_ptp(p2m, PGT_l1_page_table);
+        if ( pg == NULL )
+            return 0;
+
+        /* New splintered mappings inherit the flags of the old superpage, 
+         * with a little reorganisation for the _PAGE_PSE_PAT bit. */
+        flags = l1e_get_flags(*p2m_entry);
+        pfn = l1e_get_pfn(*p2m_entry);
+        if ( pfn & 1 )           /* ==> _PAGE_PSE_PAT was set */
+            pfn -= 1;            /* Clear it; _PAGE_PSE becomes _PAGE_PAT */
+        else
+            flags &= ~_PAGE_PSE; /* Clear _PAGE_PSE (== _PAGE_PAT) */
+        
+        l1_entry = __map_domain_page(pg);
+        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+        {
+            new_entry = l1e_from_pfn(pfn + i, flags);
+            p2m_add_iommu_flags(&new_entry, 0, 0);
+            p2m->write_p2m_entry(p2m, gfn,
+                l1_entry+i, *table_mfn, new_entry, 1);
+        }
+        unmap_domain_page(l1_entry);
+        
+        new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
+                                 __PAGE_HYPERVISOR|_PAGE_USER);
+        p2m_add_iommu_flags(&new_entry, 1, IOMMUF_readable|IOMMUF_writable);
+        p2m->write_p2m_entry(p2m, gfn,
+            p2m_entry, *table_mfn, new_entry, 2);
+    }
+
+    *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
+    next = map_domain_page(mfn_x(*table_mfn));
+    unmap_domain_page(*table);
+    *table = next;
+
+    return 1;
+}
+
+// Returns 0 on error (out of memory)
+static int
+p2m_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn, 
+              unsigned int page_order, p2m_type_t p2mt, p2m_access_t p2ma)
+{
+    // XXX -- this might be able to be faster iff current->domain == d
+    mfn_t table_mfn = pagetable_get_mfn(p2m_get_pagetable(p2m));
+    void *table =map_domain_page(mfn_x(table_mfn));
+    unsigned long i, gfn_remainder = gfn;
+    l1_pgentry_t *p2m_entry;
+    l1_pgentry_t entry_content;
+    l2_pgentry_t l2e_content;
+    l3_pgentry_t l3e_content;
+    int rv=0;
+    unsigned int iommu_pte_flags = (p2mt == p2m_ram_rw) ?
+                                   IOMMUF_readable|IOMMUF_writable:
+                                   0; 
+    unsigned long old_mfn = 0;
+
+    if ( tb_init_done )
+    {
+        struct {
+            u64 gfn, mfn;
+            int p2mt;
+            int d:16,order:16;
+        } t;
+
+        t.gfn = gfn;
+        t.mfn = mfn_x(mfn);
+        t.p2mt = p2mt;
+        t.d = p2m->domain->domain_id;
+        t.order = page_order;
+
+        __trace_var(TRC_MEM_SET_P2M_ENTRY, 0, sizeof(t), &t);
+    }
+
+#if CONFIG_PAGING_LEVELS >= 4
+    if ( !p2m_next_level(p2m, &table_mfn, &table, &gfn_remainder, gfn,
+                         L4_PAGETABLE_SHIFT - PAGE_SHIFT,
+                         L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
+        goto out;
+#endif
+    /*
+     * Try to allocate 1GB page table if this feature is supported.
+     */
+    if ( page_order == 18 )
+    {
+        l1_pgentry_t old_entry = l1e_empty();
+        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+                                   L3_PAGETABLE_SHIFT - PAGE_SHIFT,
+                                   L3_PAGETABLE_ENTRIES);
+        ASSERT(p2m_entry);
+        if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
+             !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+        {
+            /* We're replacing a non-SP page with a superpage.  Make sure to
+             * handle freeing the table properly. */
+            old_entry = *p2m_entry;
+        }
+
+        ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
+        l3e_content = mfn_valid(mfn) 
+            ? l3e_from_pfn(mfn_x(mfn),
+                           p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE)
+            : l3e_empty();
+        entry_content.l1 = l3e_content.l3;
+
+        if ( entry_content.l1 != 0 )
+        {
+            p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
+            old_mfn = l1e_get_pfn(*p2m_entry);
+        }
+
+        p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 3);
+        /* NB: paging_write_p2m_entry() handles tlb flushes properly */
+
+        /* Free old intermediate tables if necessary */
+        if ( l1e_get_flags(old_entry) & _PAGE_PRESENT )
+            p2m_free_entry(p2m, &old_entry, page_order);
+    }
+    /*
+     * When using PAE Xen, we only allow 33 bits of pseudo-physical
+     * address in translated guests (i.e. 8 GBytes).  This restriction
+     * comes from wanting to map the P2M table into the 16MB RO_MPT hole
+     * in Xen's address space for translated PV guests.
+     * When using AMD's NPT on PAE Xen, we are restricted to 4GB.
+     */
+    else if ( !p2m_next_level(p2m, &table_mfn, &table, &gfn_remainder, gfn,
+                              L3_PAGETABLE_SHIFT - PAGE_SHIFT,
+                              ((CONFIG_PAGING_LEVELS == 3)
+                               ? (hap_enabled(p2m->domain) ? 4 : 8)
+                               : L3_PAGETABLE_ENTRIES),
+                              PGT_l2_page_table) )
+        goto out;
+
+    if ( page_order == 0 )
+    {
+        if ( !p2m_next_level(p2m, &table_mfn, &table, &gfn_remainder, gfn,
+                             L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+                             L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
+            goto out;
+
+        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+                                   0, L1_PAGETABLE_ENTRIES);
+        ASSERT(p2m_entry);
+        
+        if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) )
+            entry_content = l1e_from_pfn(mfn_x(mfn),
+                                         p2m_type_to_flags(p2mt, mfn));
+        else
+            entry_content = l1e_empty();
+
+        if ( entry_content.l1 != 0 )
+        {
+            p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
+            old_mfn = l1e_get_pfn(*p2m_entry);
+        }
+        /* level 1 entry */
+        p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 1);
+        /* NB: paging_write_p2m_entry() handles tlb flushes properly */
+    }
+    else if ( page_order == 9 )
+    {
+        l1_pgentry_t old_entry = l1e_empty();
+        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+                                   L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+                                   L2_PAGETABLE_ENTRIES);
+        ASSERT(p2m_entry);
+        
+        /* FIXME: Deal with 4k replaced by 2meg pages */
+        if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
+             !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+        {
+            /* We're replacing a non-SP page with a superpage.  Make sure to
+             * handle freeing the table properly. */
+            old_entry = *p2m_entry;
+        }
+        
+        ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
+        if ( mfn_valid(mfn) || p2m_is_magic(p2mt) )
+            l2e_content = l2e_from_pfn(mfn_x(mfn),
+                                       p2m_type_to_flags(p2mt, mfn) |
+                                       _PAGE_PSE);
+        else
+            l2e_content = l2e_empty();
+        
+        entry_content.l1 = l2e_content.l2;
+
+        if ( entry_content.l1 != 0 )
+        {
+            p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
+            old_mfn = l1e_get_pfn(*p2m_entry);
+        }
+
+        p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 2);
+        /* NB: paging_write_p2m_entry() handles tlb flushes properly */
+
+        /* Free old intermediate tables if necessary */
+        if ( l1e_get_flags(old_entry) & _PAGE_PRESENT )
+            p2m_free_entry(p2m, &old_entry, page_order);
+    }
+
+    /* Track the highest gfn for which we have ever had a valid mapping */
+    if ( mfn_valid(mfn) 
+         && (gfn + (1UL << page_order) - 1 > p2m->max_mapped_pfn) )
+        p2m->max_mapped_pfn = gfn + (1UL << page_order) - 1;
+
+    if ( iommu_enabled && need_iommu(p2m->domain) )
+    {
+        if ( iommu_hap_pt_share )
+        {
+            if ( old_mfn && (old_mfn != mfn_x(mfn)) )
+                amd_iommu_flush_pages(p2m->domain, gfn, page_order);
+        }
+        else
+        {
+            if ( p2mt == p2m_ram_rw )
+                for ( i = 0; i < (1UL << page_order); i++ )
+                    iommu_map_page(p2m->domain, gfn+i, mfn_x(mfn)+i,
+                                   IOMMUF_readable|IOMMUF_writable);
+            else
+                for ( int i = 0; i < (1UL << page_order); i++ )
+                    iommu_unmap_page(p2m->domain, gfn+i);
+        }
+    }
+
+    /* Success */
+    rv = 1;
+
+out:
+    unmap_domain_page(table);
+    return rv;
+}
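+
+/* Usage note for the function above (illustrative summary only): page_order
+ * selects the level that receives the leaf entry -- order 0 writes a 4k L1
+ * entry, order 9 a 2M L2 entry with _PAGE_PSE, and order 18 a 1G L3 entry
+ * with _PAGE_PSE; other orders fall through without writing a leaf. */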
+
+
+/* Non-ept "lock-and-check" wrapper */
+static int p2m_pod_check_and_populate(struct p2m_domain *p2m, unsigned long gfn,
+                                      l1_pgentry_t *p2m_entry, int order,
+                                      p2m_query_t q)
+{
+    /* Only take the lock if we don't already have it.  Otherwise it
+     * wouldn't be safe to do p2m lookups with the p2m lock held */
+    int do_locking = !p2m_locked_by_me(p2m);
+    int r;
+
+    if ( do_locking )
+        p2m_lock(p2m);
+
+    audit_p2m(p2m, 1);
+
+    /* Check to make sure this is still PoD */
+    if ( p2m_flags_to_type(l1e_get_flags(*p2m_entry)) != p2m_populate_on_demand )
+    {
+        if ( do_locking )
+            p2m_unlock(p2m);
+        return 0;
+    }
+
+    r = p2m_pod_demand_populate(p2m, gfn, order, q);
+
+    audit_p2m(p2m, 1);
+    if ( do_locking )
+        p2m_unlock(p2m);
+
+    return r;
+}
+
+
+static mfn_t
+p2m_gfn_to_mfn(struct p2m_domain *p2m, unsigned long gfn, p2m_type_t *t, p2m_access_t *a,
+               p2m_query_t q)
+{
+    mfn_t mfn;
+    paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
+    l2_pgentry_t *l2e;
+    l1_pgentry_t *l1e;
+
+    ASSERT(paging_mode_translate(p2m->domain));
+
+    /* XXX This is for compatibility with the old model, where anything not 
+     * XXX marked as RAM was considered to be emulated MMIO space.
+     * XXX Once we start explicitly registering MMIO regions in the p2m 
+     * XXX we will return p2m_invalid for unmapped gfns */
+    *t = p2m_mmio_dm;
+    /* Not implemented except with EPT */
+    *a = p2m_access_rwx; 
+
+    mfn = pagetable_get_mfn(p2m_get_pagetable(p2m));
+
+    if ( gfn > p2m->max_mapped_pfn )
+        /* This pfn is higher than the highest the p2m map currently holds */
+        return _mfn(INVALID_MFN);
+
+#if CONFIG_PAGING_LEVELS >= 4
+    {
+        l4_pgentry_t *l4e = map_domain_page(mfn_x(mfn));
+        l4e += l4_table_offset(addr);
+        if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
+        {
+            unmap_domain_page(l4e);
+            return _mfn(INVALID_MFN);
+        }
+        mfn = _mfn(l4e_get_pfn(*l4e));
+        unmap_domain_page(l4e);
+    }
+#endif
+    {
+        l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn));
+#if CONFIG_PAGING_LEVELS == 3
+        /* On PAE hosts the p2m has eight l3 entries, not four (see
+         * shadow_set_p2m_entry()) so we can't use l3_table_offset.
+         * Instead, just count the number of l3es from zero.  It's safe
+         * to do this because we already checked that the gfn is within
+         * the bounds of the p2m. */
+        l3e += (addr >> L3_PAGETABLE_SHIFT);
+#else
+        l3e += l3_table_offset(addr);
+#endif
+pod_retry_l3:
+        if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
+        {
+            if ( p2m_flags_to_type(l3e_get_flags(*l3e)) == p2m_populate_on_demand )
+            {
+                if ( q != p2m_query )
+                {
+                    if ( !p2m_pod_demand_populate(p2m, gfn, 18, q) )
+                        goto pod_retry_l3;
+                }
+                else
+                    *t = p2m_populate_on_demand;
+            }
+            unmap_domain_page(l3e);
+            return _mfn(INVALID_MFN);
+        }
+        else if ( (l3e_get_flags(*l3e) & _PAGE_PSE) )
+        {
+            mfn = _mfn(l3e_get_pfn(*l3e) +
+                       l2_table_offset(addr) * L1_PAGETABLE_ENTRIES +
+                       l1_table_offset(addr));
+            *t = p2m_flags_to_type(l3e_get_flags(*l3e));
+            unmap_domain_page(l3e);
+
+            ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
+            return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
+        }
+
+        mfn = _mfn(l3e_get_pfn(*l3e));
+        unmap_domain_page(l3e);
+    }
+
+    l2e = map_domain_page(mfn_x(mfn));
+    l2e += l2_table_offset(addr);
+
+pod_retry_l2:
+    if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
+    {
+        /* PoD: Try to populate a 2-meg chunk */
+        if ( p2m_flags_to_type(l2e_get_flags(*l2e)) == p2m_populate_on_demand )
+        {
+            if ( q != p2m_query ) {
+                if ( !p2m_pod_check_and_populate(p2m, gfn,
+                                                 (l1_pgentry_t *)l2e, 9, q) )
+                    goto pod_retry_l2;
+            } else
+                *t = p2m_populate_on_demand;
+        }
+    
+        unmap_domain_page(l2e);
+        return _mfn(INVALID_MFN);
+    }
+    else if ( (l2e_get_flags(*l2e) & _PAGE_PSE) )
+    {
+        mfn = _mfn(l2e_get_pfn(*l2e) + l1_table_offset(addr));
+        *t = p2m_flags_to_type(l2e_get_flags(*l2e));
+        unmap_domain_page(l2e);
+        
+        ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
+        return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
+    }
+
+    mfn = _mfn(l2e_get_pfn(*l2e));
+    unmap_domain_page(l2e);
+
+    l1e = map_domain_page(mfn_x(mfn));
+    l1e += l1_table_offset(addr);
+pod_retry_l1:
+    if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
+    {
+        /* PoD: Try to populate */
+        if ( p2m_flags_to_type(l1e_get_flags(*l1e)) == p2m_populate_on_demand )
+        {
+            if ( q != p2m_query ) {
+                if ( !p2m_pod_check_and_populate(p2m, gfn,
+                                                 (l1_pgentry_t *)l1e, 0, q) )
+                    goto pod_retry_l1;
+            } else
+                *t = p2m_populate_on_demand;
+        }
+    
+        unmap_domain_page(l1e);
+        return _mfn(INVALID_MFN);
+    }
+    mfn = _mfn(l1e_get_pfn(*l1e));
+    *t = p2m_flags_to_type(l1e_get_flags(*l1e));
+    unmap_domain_page(l1e);
+
+    ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
+    return (p2m_is_valid(*t) || p2m_is_grant(*t)) ? mfn : _mfn(INVALID_MFN);
+}
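+
+(The walk above peels one 9-bit index off the gfn per level; l3_table_offset() down to l1_table_offset() operate on the full address, i.e. the gfn shifted left by the 12-bit page offset.  A tiny standalone illustration of that split; the constants and helper name are illustrative, not the Xen macros:)
+
+    #include <stdint.h>
+    #include <stdio.h>
+
+    /* Each x86 long-mode paging level indexes 512 (2^9) entries; the gfn
+     * already excludes the 12-bit byte offset within the page. */
+    #define LEVEL_BITS 9
+    #define LEVEL_MASK ((1UL << LEVEL_BITS) - 1)
+
+    static unsigned int table_index(uint64_t gfn, int level)
+    {
+        /* level 1 is the leaf; each level above shifts by another 9 bits */
+        return (unsigned int)((gfn >> ((level - 1) * LEVEL_BITS)) & LEVEL_MASK);
+    }
+
+    int main(void)
+    {
+        uint64_t gfn = 0x12345;   /* arbitrary example guest frame number */
+
+        for (int level = 4; level >= 1; level--)
+            printf("level %d index = %u\n", level, table_index(gfn, level));
+        return 0;
+    }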
+
+/* Read the current domain's p2m table (through the linear mapping). */
+static mfn_t p2m_gfn_to_mfn_current(struct p2m_domain *p2m,
+                                    unsigned long gfn, p2m_type_t *t, p2m_access_t *a,
+                                    p2m_query_t q)
+{
+    mfn_t mfn = _mfn(INVALID_MFN);
+    p2m_type_t p2mt = p2m_mmio_dm;
+    paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
+    /* XXX This is for compatibility with the old model, where anything not 
+     * XXX marked as RAM was considered to be emulated MMIO space.
+     * XXX Once we start explicitly registering MMIO regions in the p2m 
+     * XXX we will return p2m_invalid for unmapped gfns */
+
+    /* Not currently implemented except for EPT */
+    *a = p2m_access_rwx;
+
+    if ( gfn <= p2m->max_mapped_pfn )
+    {
+        l1_pgentry_t l1e = l1e_empty(), *p2m_entry;
+        l2_pgentry_t l2e = l2e_empty();
+        int ret;
+#if CONFIG_PAGING_LEVELS >= 4
+        l3_pgentry_t l3e = l3e_empty();
+#endif
+
+        ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START) 
+               / sizeof(l1_pgentry_t));
+
+#if CONFIG_PAGING_LEVELS >= 4
+        /*
+         * Read & process L3
+         */
+        p2m_entry = (l1_pgentry_t *)
+            &__linear_l2_table[l2_linear_offset(RO_MPT_VIRT_START)
+                               + l3_linear_offset(addr)];
+    pod_retry_l3:
+        ret = __copy_from_user(&l3e, p2m_entry, sizeof(l3e));
+
+        if ( ret != 0 || !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
+        {
+            if ( (l3e_get_flags(l3e) & _PAGE_PSE) &&
+                 (p2m_flags_to_type(l3e_get_flags(l3e)) == p2m_populate_on_demand) )
+            {
+                /* The read has succeeded, so we know that mapping exists */
+                if ( q != p2m_query )
+                {
+                    if ( !p2m_pod_demand_populate(p2m, gfn, 18, q) )
+                        goto pod_retry_l3;
+                    p2mt = p2m_invalid;
+                    printk("%s: Allocate 1GB failed!\n", __func__);
+                    goto out;
+                }
+                else
+                {
+                    p2mt = p2m_populate_on_demand;
+                    goto out;
+                }
+            }
+            goto pod_retry_l2;
+        }
+
+        if ( l3e_get_flags(l3e) & _PAGE_PSE )
+        {
+            p2mt = p2m_flags_to_type(l3e_get_flags(l3e));
+            ASSERT(l3e_get_pfn(l3e) != INVALID_MFN || !p2m_is_ram(p2mt));
+            if (p2m_is_valid(p2mt) )
+                mfn = _mfn(l3e_get_pfn(l3e) + 
+                           l2_table_offset(addr) * L1_PAGETABLE_ENTRIES + 
+                           l1_table_offset(addr));
+            else
+                p2mt = p2m_mmio_dm;
+            
+            goto out;
+        }
+#endif
+        /*
+         * Read & process L2
+         */
+        p2m_entry = &__linear_l1_table[l1_linear_offset(RO_MPT_VIRT_START)
+                                       + l2_linear_offset(addr)];
+
+    pod_retry_l2:
+        ret = __copy_from_user(&l2e,
+                               p2m_entry,
+                               sizeof(l2e));
+        if ( ret != 0
+             || !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
+        {
+            if( (l2e_get_flags(l2e) & _PAGE_PSE)
+                && ( p2m_flags_to_type(l2e_get_flags(l2e))
+                     == p2m_populate_on_demand ) )
+            {
+                /* The read has succeeded, so we know that the mapping
+                 * exists at this point.  */
+                if ( q != p2m_query )
+                {
+                    if ( !p2m_pod_check_and_populate(p2m, gfn,
+                                                     p2m_entry, 9, q) )
+                        goto pod_retry_l2;
+
+                    /* Allocate failed. */
+                    p2mt = p2m_invalid;
+                    printk("%s: Allocate failed!\n", __func__);
+                    goto out;
+                }
+                else
+                {
+                    p2mt = p2m_populate_on_demand;
+                    goto out;
+                }
+            }
+
+            goto pod_retry_l1;
+        }
+        
+        if (l2e_get_flags(l2e) & _PAGE_PSE)
+        {
+            p2mt = p2m_flags_to_type(l2e_get_flags(l2e));
+            ASSERT(l2e_get_pfn(l2e) != INVALID_MFN || !p2m_is_ram(p2mt));
+
+            if ( p2m_is_valid(p2mt) )
+                mfn = _mfn(l2e_get_pfn(l2e) + l1_table_offset(addr));
+            else
+                p2mt = p2m_mmio_dm;
+
+            goto out;
+        }
+
+        /*
+         * Read and process L1
+         */
+
+        /* Need to __copy_from_user because the p2m is sparse and this
+         * part might not exist */
+    pod_retry_l1:
+        p2m_entry = &phys_to_machine_mapping[gfn];
+
+        ret = __copy_from_user(&l1e,
+                               p2m_entry,
+                               sizeof(l1e));
+            
+        if ( ret == 0 ) {
+            p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
+            ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
+
+            if ( p2m_flags_to_type(l1e_get_flags(l1e))
+                 == p2m_populate_on_demand )
+            {
+                /* The read has succeeded, so we know that the mapping
+                 * exists at this point.  */
+                if ( q != p2m_query )
+                {
+                    if ( !p2m_pod_check_and_populate(p2m, gfn,
+                                                     (l1_pgentry_t *)p2m_entry, 0, q) )
+                        goto pod_retry_l1;
+
+                    /* Allocate failed. */
+                    p2mt = p2m_invalid;
+                    goto out;
+                }
+                else
+                {
+                    p2mt = p2m_populate_on_demand;
+                    goto out;
+                }
+            }
+
+            if ( p2m_is_valid(p2mt) || p2m_is_grant(p2mt) )
+                mfn = _mfn(l1e_get_pfn(l1e));
+            else 
+                /* XXX see above */
+                p2mt = p2m_mmio_dm;
+        }
+    }
+out:
+    *t = p2mt;
+    return mfn;
+}
+
+/* Walk the whole p2m table, changing any entries of the old type
+ * to the new type.  This is used in hardware-assisted paging to 
+ * quickly enable or disable log-dirty tracking */
+void p2m_change_type_global(struct p2m_domain *p2m, p2m_type_t ot, p2m_type_t nt)
+{
+    unsigned long mfn, gfn, flags;
+    l1_pgentry_t l1e_content;
+    l1_pgentry_t *l1e;
+    l2_pgentry_t *l2e;
+    mfn_t l1mfn, l2mfn, l3mfn;
+    unsigned long i1, i2, i3;
+    l3_pgentry_t *l3e;
+#if CONFIG_PAGING_LEVELS == 4
+    l4_pgentry_t *l4e;
+    unsigned long i4;
+#endif /* CONFIG_PAGING_LEVELS == 4 */
+
+    BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
+    BUG_ON(ot != nt && (ot == p2m_mmio_direct || nt == p2m_mmio_direct));
+
+    if ( !paging_mode_translate(p2m->domain) )
+        return;
+
+    if ( pagetable_get_pfn(p2m_get_pagetable(p2m)) == 0 )
+        return;
+
+    ASSERT(p2m_locked_by_me(p2m));
+
+#if CONFIG_PAGING_LEVELS == 4
+    l4e = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
+#else /* CONFIG_PAGING_LEVELS == 3 */
+    l3mfn = _mfn(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
+    l3e = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 4
+    for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
+    {
+        if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
+        {
+            continue;
+        }
+        l3mfn = _mfn(l4e_get_pfn(l4e[i4]));
+        l3e = map_domain_page(l4e_get_pfn(l4e[i4]));
+#endif
+        for ( i3 = 0;
+              i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
+              i3++ )
+        {
+            if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
+            {
+                continue;
+            }
+            if ( (l3e_get_flags(l3e[i3]) & _PAGE_PSE) )
+            {
+                flags = l3e_get_flags(l3e[i3]);
+                if ( p2m_flags_to_type(flags) != ot )
+                    continue;
+                mfn = l3e_get_pfn(l3e[i3]);
+                gfn = get_gpfn_from_mfn(mfn);
+                flags = p2m_type_to_flags(nt, _mfn(mfn));
+                l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
+                p2m->write_p2m_entry(p2m, gfn,
+                                     (l1_pgentry_t *)&l3e[i3],
+                                     l3mfn, l1e_content, 3);
+                continue;
+            }
+
+            l2mfn = _mfn(l3e_get_pfn(l3e[i3]));
+            l2e = map_domain_page(l3e_get_pfn(l3e[i3]));
+            for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
+            {
+                if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
+                {
+                    continue;
+                }
+
+                if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE) )
+                {
+                    flags = l2e_get_flags(l2e[i2]);
+                    if ( p2m_flags_to_type(flags) != ot )
+                        continue;
+                    mfn = l2e_get_pfn(l2e[i2]);
+                    /* Do not use get_gpfn_from_mfn because it may return 
+                       SHARED_M2P_ENTRY */
+                    gfn = (i2 + (i3
+#if CONFIG_PAGING_LEVELS >= 4
+                                  + (i4 * L3_PAGETABLE_ENTRIES)
+#endif
+                               )
+                           * L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES; 
+                    flags = p2m_type_to_flags(nt, _mfn(mfn));
+                    l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
+                    p2m->write_p2m_entry(p2m, gfn,
+                                         (l1_pgentry_t *)&l2e[i2],
+                                         l2mfn, l1e_content, 2);
+                    continue;
+                }
+
+                l1mfn = _mfn(l2e_get_pfn(l2e[i2]));
+                l1e = map_domain_page(mfn_x(l1mfn));
+
+                for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
+                {
+                    flags = l1e_get_flags(l1e[i1]);
+                    if ( p2m_flags_to_type(flags) != ot )
+                        continue;
+                    mfn = l1e_get_pfn(l1e[i1]);
+                    gfn = i1 + (i2 + (i3
+#if CONFIG_PAGING_LEVELS >= 4
+                                       + (i4 * L3_PAGETABLE_ENTRIES)
+#endif
+                                    )
+                           * L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES; 
+                    /* create a new l1e entry with the new type */
+                    flags = p2m_type_to_flags(nt, _mfn(mfn));
+                    l1e_content = l1e_from_pfn(mfn, flags);
+                    p2m->write_p2m_entry(p2m, gfn, &l1e[i1],
+                                         l1mfn, l1e_content, 1);
+                }
+                unmap_domain_page(l1e);
+            }
+            unmap_domain_page(l2e);
+        }
+#if CONFIG_PAGING_LEVELS >= 4
+        unmap_domain_page(l3e);
+    }
+#endif
+
+#if CONFIG_PAGING_LEVELS == 4
+    unmap_domain_page(l4e);
+#else /* CONFIG_PAGING_LEVELS == 3 */
+    unmap_domain_page(l3e);
+#endif
+
+}
+
+/* Set up the p2m function pointers for pagetable format */
+void p2m_pt_init(struct p2m_domain *p2m)
+{
+    p2m->set_entry = p2m_set_entry;
+    p2m->get_entry = p2m_gfn_to_mfn;
+    p2m->get_entry_current = p2m_gfn_to_mfn_current;
+    p2m->change_entry_type_global = p2m_change_type_global;
+    p2m->write_p2m_entry = paging_write_p2m_entry;
+}
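+
+(p2m_pt_init() is the only place where the common p2m code learns about the pagetable implementation; everything else goes through these function pointers.  A minimal sketch of that dispatch pattern with simplified, hypothetical types, not the real struct p2m_domain:)
+
+    #include <stdint.h>
+    #include <stdio.h>
+
+    typedef uint64_t mfn_t;          /* simplified stand-ins */
+    typedef int p2m_type_t;
+
+    struct toy_p2m {
+        /* Backend hook, filled in by the implementation's init function. */
+        mfn_t (*get_entry)(struct toy_p2m *p2m, unsigned long gfn, p2m_type_t *t);
+    };
+
+    /* A trivial "implementation": identity-map every gfn. */
+    static mfn_t toy_get_entry(struct toy_p2m *p2m, unsigned long gfn, p2m_type_t *t)
+    {
+        (void)p2m;
+        *t = 0;
+        return (mfn_t)gfn;
+    }
+
+    static void toy_impl_init(struct toy_p2m *p2m)
+    {
+        p2m->get_entry = toy_get_entry;   /* analogous role to p2m_pt_init() */
+    }
+
+    /* Common layer: callers only ever use the hook. */
+    static mfn_t lookup(struct toy_p2m *p2m, unsigned long gfn, p2m_type_t *t)
+    {
+        return p2m->get_entry(p2m, gfn, t);
+    }
+
+    int main(void)
+    {
+        struct toy_p2m p2m;
+        p2m_type_t t;
+
+        toy_impl_init(&p2m);
+        printf("gfn 42 -> mfn %llu\n", (unsigned long long)lookup(&p2m, 42, &t));
+        return 0;
+    }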
+
+
+#if P2M_AUDIT
+/* strict_m2p == 0 allows m2p mappings that don't match the p2m.
+ * It's intended for add_to_physmap, when the domain has just been allocated 
+ * new mfns that might have stale m2p entries from previous owners */
+void audit_p2m(struct p2m_domain *p2m, int strict_m2p)
+{
+    struct page_info *page;
+    struct domain *od;
+    unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
+    int entry_count = 0;
+    mfn_t p2mfn;
+    unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
+    int test_linear;
+    p2m_type_t type;
+    struct domain *d = p2m->domain;
+
+    if ( !paging_mode_translate(d) )
+        return;
+
+    //P2M_PRINTK("p2m audit starts\n");
+
+    test_linear = ( (d == current->domain)
+                    && !pagetable_is_null(current->arch.monitor_table) );
+    if ( test_linear )
+        flush_tlb_local();
+
+    spin_lock(&d->page_alloc_lock);
+
+    /* Audit part one: walk the domain's page allocation list, checking
+     * the m2p entries. */
+    page_list_for_each ( page, &d->page_list )
+    {
+        mfn = mfn_x(page_to_mfn(page));
+
+        // P2M_PRINTK("auditing guest page, mfn=%#lx\n", mfn);
+
+        od = page_get_owner(page);
+
+        if ( od != d )
+        {
+            P2M_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n",
+                       mfn, od, (od?od->domain_id:-1), d, d->domain_id);
+            continue;
+        }
+
+        gfn = get_gpfn_from_mfn(mfn);
+        if ( gfn == INVALID_M2P_ENTRY )
+        {
+            orphans_i++;
+            //P2M_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n",
+            //               mfn);
+            continue;
+        }
+
+        if ( gfn == 0x55555555 || gfn == 0x5555555555555555 )
+        {
+            orphans_d++;
+            //P2M_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n",
+            //               mfn);
+            continue;
+        }
+
+        if ( gfn == SHARED_M2P_ENTRY )
+        {
+            P2M_PRINTK("shared mfn (%lx) on domain page list!\n",
+                    mfn);
+            continue;
+        }
+
+        p2mfn = gfn_to_mfn_type_p2m(p2m, gfn, &type, p2m_query);
+        if ( strict_m2p && mfn_x(p2mfn) != mfn )
+        {
+            mpbad++;
+            P2M_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx"
+                       " (-> gfn %#lx)\n",
+                       mfn, gfn, mfn_x(p2mfn),
+                       (mfn_valid(p2mfn)
+                        ? get_gpfn_from_mfn(mfn_x(p2mfn))
+                        : -1u));
+            /* This m2p entry is stale: the domain has another frame in
+             * this physical slot.  No great disaster, but for neatness,
+             * blow away the m2p entry. */
+            set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
+        }
+
+        if ( test_linear && (gfn <= p2m->max_mapped_pfn) )
+        {
+            lp2mfn = mfn_x(gfn_to_mfn_query(p2m, gfn, &type));
+            if ( lp2mfn != mfn_x(p2mfn) )
+            {
+                P2M_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
+                           "(!= mfn %#lx)\n", gfn, lp2mfn, mfn_x(p2mfn));
+            }
+        }
+
+        // P2M_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n",
+        //                mfn, gfn, mfn_x(p2mfn), lp2mfn);
+    }
+
+    spin_unlock(&d->page_alloc_lock);
+
+    /* Audit part two: walk the domain's p2m table, checking the entries. */
+    if ( pagetable_get_pfn(p2m_get_pagetable(p2m)) != 0 )
+    {
+        l2_pgentry_t *l2e;
+        l1_pgentry_t *l1e;
+        int i1, i2;
+
+#if CONFIG_PAGING_LEVELS == 4
+        l4_pgentry_t *l4e;
+        l3_pgentry_t *l3e;
+        int i4, i3;
+        l4e = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
+#else /* CONFIG_PAGING_LEVELS == 3 */
+        l3_pgentry_t *l3e;
+        int i3;
+        l3e = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
+#endif
+
+        gfn = 0;
+#if CONFIG_PAGING_LEVELS >= 4
+        for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
+        {
+            if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
+            {
+                gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
+                continue;
+            }
+            l3e = map_domain_page(mfn_x(_mfn(l4e_get_pfn(l4e[i4]))));
+#endif
+            for ( i3 = 0;
+                  i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
+                  i3++ )
+            {
+                if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
+                {
+                    gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
+                    continue;
+                }
+
+                /* check for 1GB super page */
+                if ( l3e_get_flags(l3e[i3]) & _PAGE_PSE )
+                {
+                    mfn = l3e_get_pfn(l3e[i3]);
+                    ASSERT(mfn_valid(_mfn(mfn)));
+                    /* we have to cover 512x512 4K pages */
+                    for ( i2 = 0; 
+                          i2 < (L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES);
+                          i2++)
+                    {
+                        m2pfn = get_gpfn_from_mfn(mfn+i2);
+                        if ( m2pfn != (gfn + i2) )
+                        {
+                            pmbad++;
+                            P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
+                                       " -> gfn %#lx\n", gfn+i2, mfn+i2,
+                                       m2pfn);
+                            BUG();
+                        }
+                        gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
+                        continue;
+                    }
+                }
+
+                l2e = map_domain_page(mfn_x(_mfn(l3e_get_pfn(l3e[i3]))));
+                for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
+                {
+                    if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
+                    {
+                        if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE)
+                             && ( p2m_flags_to_type(l2e_get_flags(l2e[i2]))
+                                  == p2m_populate_on_demand ) )
+                            entry_count+=SUPERPAGE_PAGES;
+                        gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
+                        continue;
+                    }
+                    
+                    /* check for super page */
+                    if ( l2e_get_flags(l2e[i2]) & _PAGE_PSE )
+                    {
+                        mfn = l2e_get_pfn(l2e[i2]);
+                        ASSERT(mfn_valid(_mfn(mfn)));
+                        for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++)
+                        {
+                            m2pfn = get_gpfn_from_mfn(mfn+i1);
+                            /* Allow shared M2Ps */
+                            if ( (m2pfn != (gfn + i1)) &&
+                                 (m2pfn != SHARED_M2P_ENTRY) )
+                            {
+                                pmbad++;
+                                P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
+                                           " -> gfn %#lx\n", gfn+i1, mfn+i1,
+                                           m2pfn);
+                                BUG();
+                            }
+                        }
+                        gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
+                        continue;
+                    }
+
+                    l1e = map_domain_page(mfn_x(_mfn(l2e_get_pfn(l2e[i2]))));
+
+                    for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
+                    {
+                        p2m_type_t type;
+
+                        type = p2m_flags_to_type(l1e_get_flags(l1e[i1]));
+                        if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
+                        {
+                            if ( type == p2m_populate_on_demand )
+                                entry_count++;
+                            continue;
+                        }
+                        mfn = l1e_get_pfn(l1e[i1]);
+                        ASSERT(mfn_valid(_mfn(mfn)));
+                        m2pfn = get_gpfn_from_mfn(mfn);
+                        if ( m2pfn != gfn &&
+                             type != p2m_mmio_direct &&
+                             !p2m_is_grant(type) &&
+                             !p2m_is_shared(type) )
+                        {
+                            pmbad++;
+                            printk("mismatch: gfn %#lx -> mfn %#lx"
+                                   " -> gfn %#lx\n", gfn, mfn, m2pfn);
+                            P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
+                                       " -> gfn %#lx\n", gfn, mfn, m2pfn);
+                            BUG();
+                        }
+                    }
+                    unmap_domain_page(l1e);
+                }
+                unmap_domain_page(l2e);
+            }
+#if CONFIG_PAGING_LEVELS >= 4
+            unmap_domain_page(l3e);
+        }
+#endif
+
+#if CONFIG_PAGING_LEVELS == 4
+        unmap_domain_page(l4e);
+#else /* CONFIG_PAGING_LEVELS == 3 */
+        unmap_domain_page(l3e);
+#endif
+
+    }
+
+    if ( entry_count != p2m->pod.entry_count )
+    {
+        printk("%s: refcounted entry count %d, audit count %d!\n",
+               __func__,
+               p2m->pod.entry_count,
+               entry_count);
+        BUG();
+    }
+        
+    //P2M_PRINTK("p2m audit complete\n");
+    //if ( orphans_i | orphans_d | mpbad | pmbad )
+    //    P2M_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
+    //                   orphans_i + orphans_d, orphans_i, orphans_d);
+    if ( mpbad | pmbad )
+    {
+        P2M_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
+                   pmbad, mpbad);
+        WARN();
+    }
+}
+#endif /* P2M_AUDIT */
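+
+(The heart of both audit passes is the round-trip invariant: for ordinary RAM, the m2p and p2m maps must be inverses, so gfn -> mfn -> gfn lands back where it started.  A toy standalone version of just that check, with plain arrays standing in for the real tables:)
+
+    #include <assert.h>
+    #include <stdio.h>
+
+    #define NPAGES  8
+    #define INVALID (~0UL)
+
+    /* Toy stand-ins for the p2m (gfn -> mfn) and m2p (mfn -> gfn) tables. */
+    static unsigned long p2m[NPAGES];
+    static unsigned long m2p[NPAGES];
+
+    int main(void)
+    {
+        unsigned long gfn, mfn;
+
+        for (gfn = 0; gfn < NPAGES; gfn++)
+            p2m[gfn] = m2p[gfn] = INVALID;
+
+        /* Establish one mapping in both directions: gfn 3 <-> mfn 5. */
+        p2m[3] = 5;
+        m2p[5] = 3;
+
+        /* Audit: every mapped gfn must round-trip through the m2p. */
+        for (gfn = 0; gfn < NPAGES; gfn++)
+        {
+            mfn = p2m[gfn];
+            if (mfn == INVALID)
+                continue;
+            assert(m2p[mfn] == gfn);
+        }
+        printf("toy audit passed\n");
+        return 0;
+    }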
+
diff -r d9982136d8fa -r 19452acd2304 xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c     Mon May 09 15:00:57 2011 +0100
+++ b/xen/arch/x86/mm/p2m.c     Fri May 06 11:15:35 2011 +0100
@@ -37,10 +37,6 @@
 #include <asm/hvm/nestedhvm.h>
 #include <asm/hvm/svm/amd-iommu-proto.h>
 
-/* Debugging and auditing of the P2M code? */
-#define P2M_AUDIT     0
-#define P2M_DEBUGGING 0
-
 /* turn on/off 1GB host page table support for hap, default on */
 static bool_t __read_mostly opt_hap_1gb = 1;
 boolean_param("hap_1gb", opt_hap_1gb);
@@ -69,1853 +65,14 @@
 #undef page_to_mfn
 #define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
 
-
-/* PTE flags for the various types of p2m entry */
-#define P2M_BASE_FLAGS \
-        (_PAGE_PRESENT | _PAGE_USER | _PAGE_DIRTY | _PAGE_ACCESSED)
-
-#define SUPERPAGE_PAGES (1UL << 9)
-#define superpage_aligned(_x)  (((_x)&(SUPERPAGE_PAGES-1))==0)
-
-unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
-{
-    unsigned long flags;
-#ifdef __x86_64__
-    /*
-     * AMD IOMMU: When we share p2m table with iommu, bit 9 - bit 11 will be
-     * used for iommu hardware to encode next io page level. Bit 59 - bit 62
-     * are used for iommu flags, We could not use these bits to store p2m types.
-     */
-    flags = (unsigned long)(t & 0x7f) << 12;
-#else
-    flags = (t & 0x7UL) << 9;
-#endif
-#ifndef HAVE_GRANT_MAP_P2M
-    BUG_ON(p2m_is_grant(t));
-#endif
-    switch(t)
-    {
-    case p2m_invalid:
-    default:
-        return flags;
-    case p2m_ram_rw:
-    case p2m_grant_map_rw:
-        return flags | P2M_BASE_FLAGS | _PAGE_RW;
-    case p2m_ram_logdirty:
-        return flags | P2M_BASE_FLAGS;
-    case p2m_ram_ro:
-    case p2m_grant_map_ro:
-        return flags | P2M_BASE_FLAGS;
-    case p2m_ram_shared:
-        return flags | P2M_BASE_FLAGS;
-    case p2m_mmio_dm:
-        return flags;
-    case p2m_mmio_direct:
-        if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn_x(mfn)) )
-            flags |= _PAGE_RW;
-        return flags | P2M_BASE_FLAGS | _PAGE_PCD;
-    case p2m_populate_on_demand:
-        return flags;
-    }
-}
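-
(On x86_64 the low seven bits of the p2m type are packed in at bit 12 of the software flags word, which the pagetable code maps onto the PTE's available bits, so p2m_flags_to_type() can recover the type with the inverse shift-and-mask.  A standalone round-trip sketch of just that packing arithmetic; the macro names are illustrative:)

    #include <assert.h>
    #include <stdio.h>

    /* Illustrative packing: 7 bits of type at bit 12 of a flags word. */
    #define TYPE_SHIFT 12
    #define TYPE_MASK  0x7fUL

    static unsigned long type_to_flags(unsigned int type)
    {
        return ((unsigned long)type & TYPE_MASK) << TYPE_SHIFT;
    }

    static unsigned int flags_to_type(unsigned long flags)
    {
        return (unsigned int)((flags >> TYPE_SHIFT) & TYPE_MASK);
    }

    int main(void)
    {
        unsigned int t;

        for (t = 0; t < 16; t++)
            assert(flags_to_type(type_to_flags(t)) == t);
        printf("type <-> flags round-trip OK\n");
        return 0;
    }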
-
 #if P2M_AUDIT
-static void audit_p2m(struct p2m_domain *p2m, int strict_m2p);
+extern void audit_p2m(struct p2m_domain *p2m, int strict_m2p);
 #else
 # define audit_p2m(_p2m, _m2p) do { (void)(_p2m),(_m2p); } while (0)
 #endif /* P2M_AUDIT */
 
-// Find the next level's P2M entry, checking for out-of-range gfn's...
-// Returns NULL on error.
-//
-l1_pgentry_t *
-p2m_find_entry(void *table, unsigned long *gfn_remainder,
-                   unsigned long gfn, uint32_t shift, uint32_t max)
-{
-    u32 index;
-
-    index = *gfn_remainder >> shift;
-    if ( index >= max )
-    {
-        P2M_DEBUG("gfn=0x%lx out of range "
-                  "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n",
-                  gfn, *gfn_remainder, shift, index, max);
-        return NULL;
-    }
-    *gfn_remainder &= (1 << shift) - 1;
-    return (l1_pgentry_t *)table + index;
-}
-
-struct page_info *
-p2m_alloc_ptp(struct p2m_domain *p2m, unsigned long type)
-{
-    struct page_info *pg;
-
-    ASSERT(p2m);
-    ASSERT(p2m->domain);
-    ASSERT(p2m->domain->arch.paging.alloc_page);
-    pg = p2m->domain->arch.paging.alloc_page(p2m->domain);
-    if (pg == NULL)
-        return NULL;
-
-    page_list_add_tail(pg, &p2m->pages);
-    pg->u.inuse.type_info = type | 1 | PGT_validated;
-
-    return pg;
-}
-
-void
-p2m_free_ptp(struct p2m_domain *p2m, struct page_info *pg)
-{
-    ASSERT(pg);
-    ASSERT(p2m);
-    ASSERT(p2m->domain);
-    ASSERT(p2m->domain->arch.paging.free_page);
-
-    page_list_del(pg, &p2m->pages);
-    p2m->domain->arch.paging.free_page(p2m->domain, pg);
-
-    return;
-}
-
-/* Free intermediate tables from a p2m sub-tree */
-void
-p2m_free_entry(struct p2m_domain *p2m, l1_pgentry_t *p2m_entry, int page_order)
-{
-    /* End if the entry is a leaf entry. */
-    if ( page_order == 0
-         || !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT)
-         || (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
-        return;
-
-    if ( page_order > 9 )
-    {
-        l1_pgentry_t *l3_table = map_domain_page(l1e_get_pfn(*p2m_entry));
-        for ( int i = 0; i < L3_PAGETABLE_ENTRIES; i++ )
-            p2m_free_entry(p2m, l3_table + i, page_order - 9);
-        unmap_domain_page(l3_table);
-    }
-
-    p2m_free_ptp(p2m, mfn_to_page(_mfn(l1e_get_pfn(*p2m_entry))));
-}
-
-// Walk one level of the P2M table, allocating a new table if required.
-// Returns 0 on error.
-//
-
-/* AMD IOMMU: Convert next level bits and r/w bits into 24 bits p2m flags */
-#define iommu_nlevel_to_flags(nl, f) ((((nl) & 0x7) << 9 )|(((f) & 0x3) << 21))
-
-static void p2m_add_iommu_flags(l1_pgentry_t *p2m_entry,
-                                unsigned int nlevel, unsigned int flags)
-{
-#if CONFIG_PAGING_LEVELS == 4
-    if ( iommu_hap_pt_share )
-        l1e_add_flags(*p2m_entry, iommu_nlevel_to_flags(nlevel, flags));
-#endif
-}
-
-static int
-p2m_next_level(struct p2m_domain *p2m, mfn_t *table_mfn, void **table,
-               unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
-               u32 max, unsigned long type)
-{
-    l1_pgentry_t *l1_entry;
-    l1_pgentry_t *p2m_entry;
-    l1_pgentry_t new_entry;
-    void *next;
-    int i;
-
-    if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
-                                      shift, max)) )
-        return 0;
-
-    /* PoD: Not present doesn't imply empty. */
-    if ( !l1e_get_flags(*p2m_entry) )
-    {
-        struct page_info *pg;
-
-        pg = p2m_alloc_ptp(p2m, type);
-        if ( pg == NULL )
-            return 0;
-
-        new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
-                                 __PAGE_HYPERVISOR | _PAGE_USER);
-
-        switch ( type ) {
-        case PGT_l3_page_table:
-            p2m_add_iommu_flags(&new_entry, 3, IOMMUF_readable|IOMMUF_writable);
-            p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 4);
-            break;
-        case PGT_l2_page_table:
-#if CONFIG_PAGING_LEVELS == 3
-            /* for PAE mode, PDPE only has PCD/PWT/P bits available */
-            new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), _PAGE_PRESENT);
-#endif
-            p2m_add_iommu_flags(&new_entry, 2, IOMMUF_readable|IOMMUF_writable);
-            p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3);
-            break;
-        case PGT_l1_page_table:
-            p2m_add_iommu_flags(&new_entry, 1, IOMMUF_readable|IOMMUF_writable);
-            p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 2);
-            break;
-        default:
-            BUG();
-            break;
-        }
-    }
-
-    ASSERT(l1e_get_flags(*p2m_entry) & (_PAGE_PRESENT|_PAGE_PSE));
-
-    /* split 1GB pages into 2MB pages */
-    if ( type == PGT_l2_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
-    {
-        unsigned long flags, pfn;
-        struct page_info *pg;
-
-        pg = p2m_alloc_ptp(p2m, PGT_l2_page_table);
-        if ( pg == NULL )
-            return 0;
-
-        flags = l1e_get_flags(*p2m_entry);
-        pfn = l1e_get_pfn(*p2m_entry);
-
-        l1_entry = map_domain_page(mfn_x(page_to_mfn(pg)));
-        for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
-        {
-            new_entry = l1e_from_pfn(pfn + (i * L1_PAGETABLE_ENTRIES), flags);
-            p2m_add_iommu_flags(&new_entry, 1, IOMMUF_readable|IOMMUF_writable);
-            p2m->write_p2m_entry(p2m, gfn,
-                l1_entry+i, *table_mfn, new_entry, 2);
-        }
-        unmap_domain_page(l1_entry);
-        new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
-                                 __PAGE_HYPERVISOR|_PAGE_USER); //disable PSE
-        p2m_add_iommu_flags(&new_entry, 2, IOMMUF_readable|IOMMUF_writable);
-        p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3);
-    }
-
-
-    /* split single 2MB large page into 4KB page in P2M table */
-    if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
-    {
-        unsigned long flags, pfn;
-        struct page_info *pg;
-
-        pg = p2m_alloc_ptp(p2m, PGT_l1_page_table);
-        if ( pg == NULL )
-            return 0;
-
-        /* New splintered mappings inherit the flags of the old superpage, 
-         * with a little reorganisation for the _PAGE_PSE_PAT bit. */
-        flags = l1e_get_flags(*p2m_entry);
-        pfn = l1e_get_pfn(*p2m_entry);
-        if ( pfn & 1 )           /* ==> _PAGE_PSE_PAT was set */
-            pfn -= 1;            /* Clear it; _PAGE_PSE becomes _PAGE_PAT */
-        else
-            flags &= ~_PAGE_PSE; /* Clear _PAGE_PSE (== _PAGE_PAT) */
-        
-        l1_entry = __map_domain_page(pg);
-        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
-        {
-            new_entry = l1e_from_pfn(pfn + i, flags);
-            p2m_add_iommu_flags(&new_entry, 0, 0);
-            p2m->write_p2m_entry(p2m, gfn,
-                l1_entry+i, *table_mfn, new_entry, 1);
-        }
-        unmap_domain_page(l1_entry);
-        
-        new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
-                                 __PAGE_HYPERVISOR|_PAGE_USER);
-        p2m_add_iommu_flags(&new_entry, 1, IOMMUF_readable|IOMMUF_writable);
-        p2m->write_p2m_entry(p2m, gfn,
-            p2m_entry, *table_mfn, new_entry, 2);
-    }
-
-    *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
-    next = map_domain_page(mfn_x(*table_mfn));
-    unmap_domain_page(*table);
-    *table = next;
-
-    return 1;
-}
-
-/*
- * Populate-on-demand functionality
- */
-static
-int set_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn, 
-                  unsigned int page_order, p2m_type_t p2mt, p2m_access_t p2ma);
-
-static int
-p2m_pod_cache_add(struct p2m_domain *p2m,
-                  struct page_info *page,
-                  unsigned long order)
-{
-    int i;
-    struct page_info *p;
-    struct domain *d = p2m->domain;
-
-#ifndef NDEBUG
-    mfn_t mfn;
-
-    mfn = page_to_mfn(page);
-
-    /* Check to make sure this is a contiguous region */
-    if( mfn_x(mfn) & ((1 << order) - 1) )
-    {
-        printk("%s: mfn %lx not aligned order %lu! (mask %lx)\n",
-               __func__, mfn_x(mfn), order, ((1UL << order) - 1));
-        return -1;
-    }
-    
-    for(i=0; i < 1 << order ; i++) {
-        struct domain * od;
-
-        p = mfn_to_page(_mfn(mfn_x(mfn) + i));
-        od = page_get_owner(p);
-        if(od != d)
-        {
-            printk("%s: mfn %lx expected owner d%d, got owner d%d!\n",
-                   __func__, mfn_x(mfn), d->domain_id,
-                   od?od->domain_id:-1);
-            return -1;
-        }
-    }
-#endif
-
-    ASSERT(p2m_locked_by_me(p2m));
-
-    /*
-     * Pages from domain_alloc and returned by the balloon driver aren't
-     * guaranteed to be zero; but by reclaiming zero pages, we implicitly
-     * promise to provide zero pages. So we scrub pages before using.
-     */
-    for ( i = 0; i < (1 << order); i++ )
-    {
-        char *b = map_domain_page(mfn_x(page_to_mfn(page)) + i);
-        clear_page(b);
-        unmap_domain_page(b);
-    }
-
-    spin_lock(&d->page_alloc_lock);
-
-    /* First, take all pages off the domain list */
-    for(i=0; i < 1 << order ; i++)
-    {
-        p = page + i;
-        page_list_del(p, &d->page_list);
-    }
-
-    /* Then add the first one to the appropriate populate-on-demand list */
-    switch(order)
-    {
-    case 9:
-        page_list_add_tail(page, &p2m->pod.super); /* lock: page_alloc */
-        p2m->pod.count += 1 << order;
-        break;
-    case 0:
-        page_list_add_tail(page, &p2m->pod.single); /* lock: page_alloc */
-        p2m->pod.count += 1;
-        break;
-    default:
-        BUG();
-    }
-
-    /* Ensure that the PoD cache has never been emptied.  
-     * This may cause "zombie domains" since the page will never be freed. */
-    BUG_ON( d->arch.relmem != RELMEM_not_started );
-
-    spin_unlock(&d->page_alloc_lock);
-
-    return 0;
-}
-
-/* Get a page of size order from the populate-on-demand cache.  Will break
- * down 2-meg pages into singleton pages automatically.  Returns null if
- * a superpage is requested and no superpages are available.  Must be called
- * with the d->page_lock held. */
-static struct page_info * p2m_pod_cache_get(struct p2m_domain *p2m,
-                                            unsigned long order)
-{
-    struct page_info *p = NULL;
-    int i;
-
-    if ( order == 9 && page_list_empty(&p2m->pod.super) )
-    {
-        return NULL;
-    }
-    else if ( order == 0 && page_list_empty(&p2m->pod.single) )
-    {
-        unsigned long mfn;
-        struct page_info *q;
-
-        BUG_ON( page_list_empty(&p2m->pod.super) );
-
-        /* Break up a superpage to make single pages. NB count doesn't
-         * need to be adjusted. */
-        p = page_list_remove_head(&p2m->pod.super);
-        mfn = mfn_x(page_to_mfn(p));
-
-        for ( i=0; i<SUPERPAGE_PAGES; i++ )
-        {
-            q = mfn_to_page(_mfn(mfn+i));
-            page_list_add_tail(q, &p2m->pod.single);
-        }
-    }
-
-    switch ( order )
-    {
-    case 9:
-        BUG_ON( page_list_empty(&p2m->pod.super) );
-        p = page_list_remove_head(&p2m->pod.super);
-        p2m->pod.count -= 1 << order; /* Lock: page_alloc */
-        break;
-    case 0:
-        BUG_ON( page_list_empty(&p2m->pod.single) );
-        p = page_list_remove_head(&p2m->pod.single);
-        p2m->pod.count -= 1;
-        break;
-    default:
-        BUG();
-    }
-
-    /* Put the pages back on the domain page_list */
-    for ( i = 0 ; i < (1 << order); i++ )
-    {
-        BUG_ON(page_get_owner(p + i) != p2m->domain);
-        page_list_add_tail(p + i, &p2m->domain->page_list);
-    }
-
-    return p;
-}
-
-/* Set the size of the cache, allocating or freeing as necessary. */
-static int
-p2m_pod_set_cache_target(struct p2m_domain *p2m, unsigned long pod_target, int preemptible)
-{
-    struct domain *d = p2m->domain;
-    int ret = 0;
-
-    /* Increasing the target */
-    while ( pod_target > p2m->pod.count )
-    {
-        struct page_info * page;
-        int order;
-
-        if ( (pod_target - p2m->pod.count) >= SUPERPAGE_PAGES )
-            order = 9;
-        else
-            order = 0;
-    retry:
-        page = alloc_domheap_pages(d, order, 0);
-        if ( unlikely(page == NULL) )
-        {
-            if ( order == 9 )
-            {
-                /* If we can't allocate a superpage, try singleton pages */
-                order = 0;
-                goto retry;
-            }   
-            
-            printk("%s: Unable to allocate domheap page for pod cache.  target 
%lu cachesize %d\n",
-                   __func__, pod_target, p2m->pod.count);
-            ret = -ENOMEM;
-            goto out;
-        }
-
-        p2m_pod_cache_add(p2m, page, order);
-
-        if ( hypercall_preempt_check() && preemptible )
-        {
-            ret = -EAGAIN;
-            goto out;
-        }
-    }
-
-    /* Decreasing the target */
-    /* We hold the p2m lock here, so we don't need to worry about
-     * cache disappearing under our feet. */
-    while ( pod_target < p2m->pod.count )
-    {
-        struct page_info * page;
-        int order, i;
-
-        /* Grab the lock before checking that pod.super is empty, or the last
-         * entries may disappear before we grab the lock. */
-        spin_lock(&d->page_alloc_lock);
-
-        if ( (p2m->pod.count - pod_target) > SUPERPAGE_PAGES
-             && !page_list_empty(&p2m->pod.super) )
-            order = 9;
-        else
-            order = 0;
-
-        page = p2m_pod_cache_get(p2m, order);
-
-        ASSERT(page != NULL);
-
-        spin_unlock(&d->page_alloc_lock);
-
-        /* Then free them */
-        for ( i = 0 ; i < (1 << order) ; i++ )
-        {
-            /* Copied from common/memory.c:guest_remove_page() */
-            if ( unlikely(!get_page(page+i, d)) )
-            {
-                gdprintk(XENLOG_INFO, "Bad page free for domain %u\n", d->domain_id);
-                ret = -EINVAL;
-                goto out;
-            }
-
-            if ( test_and_clear_bit(_PGT_pinned, &(page+i)->u.inuse.type_info) )
-                put_page_and_type(page+i);
-            
-            if ( test_and_clear_bit(_PGC_allocated, &(page+i)->count_info) )
-                put_page(page+i);
-
-            put_page(page+i);
-
-            if ( hypercall_preempt_check() && preemptible )
-            {
-                ret = -EAGAIN;
-                goto out;
-            }
-        }
-    }
-
-out:
-    return ret;
-}
-
-/*
- * The "right behavior" here requires some careful thought.  First, some
- * definitions:
- * + M: static_max
- * + B: number of pages the balloon driver has ballooned down to.
- * + P: Number of populated pages. 
- * + T: Old target
- * + T': New target
- *
- * The following equations should hold:
- *  0 <= P <= T <= B <= M
- *  d->arch.p2m->pod.entry_count == B - P
- *  d->tot_pages == P + d->arch.p2m->pod.count
- *
- * Now we have the following potential cases to cover:
- *     B <T': Set the PoD cache size equal to the number of outstanding PoD
- *   entries.  The balloon driver will deflate the balloon to give back
- *   the remainder of the ram to the guest OS.
- *  T <T'<B : Increase PoD cache size.
- *  T'<T<=B : Here we have a choice.  We can decrease the size of the cache,
- *   get the memory right away.  However, that means every time we 
- *   reduce the memory target we risk the guest attempting to populate the 
- *   memory before the balloon driver has reached its new target.  Safer to
- *   never reduce the cache size here, but only when the balloon driver frees 
- *   PoD ranges.
- *
- * If there are many zero pages, we could reach the target also by doing
- * zero sweeps and marking the ranges PoD; but the balloon driver will have
- * to free this memory eventually anyway, so we don't actually gain that much
- * by doing so.
- *
- * NB that the equation (B<T') may require adjustment to the cache
- * size as PoD pages are freed as well; i.e., freeing a PoD-backed
- * entry when pod.entry_count == pod.count requires us to reduce both
- * pod.entry_count and pod.count.
- */
-int
-p2m_pod_set_mem_target(struct domain *d, unsigned long target)
-{
-    unsigned pod_target;
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-    int ret = 0;
-    unsigned long populated;
-
-    p2m_lock(p2m);
-
-    /* P == B: Nothing to do. */
-    if ( p2m->pod.entry_count == 0 )
-        goto out;
-
-    /* Don't do anything if the domain is being torn down */
-    if ( d->is_dying )
-        goto out;
-
-    /* T' < B: Don't reduce the cache size; let the balloon driver
-     * take care of it. */
-    if ( target < d->tot_pages )
-        goto out;
-
-    populated  = d->tot_pages - p2m->pod.count;
-
-    pod_target = target - populated;
-
-    /* B < T': Set the cache size equal to # of outstanding entries,
-     * let the balloon driver fill in the rest. */
-    if ( pod_target > p2m->pod.entry_count )
-        pod_target = p2m->pod.entry_count;
-
-    ASSERT( pod_target >= p2m->pod.count );
-
-    ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
-
-out:
-    p2m_unlock(p2m);
-
-    return ret;
-}
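-
(A worked instance of the accounting described in the comment above, with made-up numbers: if M = 1024, B = 768 and P = 512, then pod.entry_count = B - P = 256 and tot_pages = P + pod.count.  The sketch below just replays the clamping done by p2m_pod_set_mem_target() on those numbers; all values and field names are illustrative:)

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned long M = 1024;          /* static_max */
        unsigned long B = 768;           /* balloon target */
        unsigned long P = 512;           /* populated pages */
        unsigned long pod_count = 200;   /* pages held in the PoD cache */

        unsigned long entry_count = B - P;       /* outstanding PoD entries */
        unsigned long tot_pages = P + pod_count;

        assert(P <= B && B <= M);

        /* New target T' above tot_pages: grow the cache, but never beyond the
         * number of outstanding entries; the balloon driver covers the rest. */
        unsigned long target = 900;
        unsigned long populated = tot_pages - pod_count;   /* == P */
        unsigned long pod_target = target - populated;

        if (pod_target > entry_count)
            pod_target = entry_count;

        printf("entry_count=%lu tot_pages=%lu pod_target=%lu\n",
               entry_count, tot_pages, pod_target);
        return 0;
    }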
-
-void
-p2m_pod_empty_cache(struct domain *d)
-{
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-    struct page_info *page;
-
-    /* After this barrier no new PoD activities can happen. */
-    BUG_ON(!d->is_dying);
-    spin_barrier(&p2m->lock);
-
-    spin_lock(&d->page_alloc_lock);
-
-    while ( (page = page_list_remove_head(&p2m->pod.super)) )
-    {
-        int i;
-            
-        for ( i = 0 ; i < SUPERPAGE_PAGES ; i++ )
-        {
-            BUG_ON(page_get_owner(page + i) != d);
-            page_list_add_tail(page + i, &d->page_list);
-        }
-
-        p2m->pod.count -= SUPERPAGE_PAGES;
-    }
-
-    while ( (page = page_list_remove_head(&p2m->pod.single)) )
-    {
-        BUG_ON(page_get_owner(page) != d);
-        page_list_add_tail(page, &d->page_list);
-
-        p2m->pod.count -= 1;
-    }
-
-    BUG_ON(p2m->pod.count != 0);
-
-    spin_unlock(&d->page_alloc_lock);
-}
-
-int
-p2m_pod_offline_or_broken_hit(struct page_info *p)
-{
-    struct domain *d;
-    struct p2m_domain *p2m;
-    struct page_info *q, *tmp;
-    unsigned long mfn, bmfn;
-
-    if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) )
-        return 0;
-
-    spin_lock(&d->page_alloc_lock);
-    bmfn = mfn_x(page_to_mfn(p));
-    page_list_for_each_safe(q, tmp, &p2m->pod.super)
-    {
-        mfn = mfn_x(page_to_mfn(q));
-        if ( (bmfn >= mfn) && ((bmfn - mfn) < SUPERPAGE_PAGES) )
-        {
-            unsigned long i;
-            page_list_del(q, &p2m->pod.super);
-            for ( i = 0; i < SUPERPAGE_PAGES; i++)
-            {
-                q = mfn_to_page(_mfn(mfn + i));
-                page_list_add_tail(q, &p2m->pod.single);
-            }
-            page_list_del(p, &p2m->pod.single);
-            p2m->pod.count--;
-            goto pod_hit;
-        }
-    }
-
-    page_list_for_each_safe(q, tmp, &p2m->pod.single)
-    {
-        mfn = mfn_x(page_to_mfn(q));
-        if ( mfn == bmfn )
-        {
-            page_list_del(p, &p2m->pod.single);
-            p2m->pod.count--;
-            goto pod_hit;
-        }
-    }
-
-    spin_unlock(&d->page_alloc_lock);
-    return 0;
-
-pod_hit:
-    page_list_add_tail(p, &d->arch.relmem_list);
-    spin_unlock(&d->page_alloc_lock);
-    return 1;
-}
-
-void
-p2m_pod_offline_or_broken_replace(struct page_info *p)
-{
-    struct domain *d;
-    struct p2m_domain *p2m;
-
-    if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) )
-        return;
-
-    free_domheap_page(p);
-
-    p = alloc_domheap_page(d, 0);
-    if ( unlikely(!p) )
-        return;
-
-    p2m_lock(p2m);
-    p2m_pod_cache_add(p2m, p, 0);
-    p2m_unlock(p2m);
-    return;
-}
-
-/* This function is needed for two reasons:
- * + To properly handle clearing of PoD entries
- * + To "steal back" memory being freed for the PoD cache, rather than
- *   releasing it.
- *
- * Once both of these functions have been completed, we can return and
- * allow decrease_reservation() to handle everything else.
- */
-int
-p2m_pod_decrease_reservation(struct domain *d,
-                             xen_pfn_t gpfn,
-                             unsigned int order)
-{
-    int ret=0;
-    int i;
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-
-    int steal_for_cache = 0;
-    int pod = 0, nonpod = 0, ram = 0;
-    
-
-    /* If we don't have any outstanding PoD entries, let things take their
-     * course */
-    if ( p2m->pod.entry_count == 0 )
-        goto out;
-
-    /* Figure out if we need to steal some freed memory for our cache */
-    steal_for_cache =  ( p2m->pod.entry_count > p2m->pod.count );
-
-    p2m_lock(p2m);
-    audit_p2m(p2m, 1);
-
-    if ( unlikely(d->is_dying) )
-        goto out_unlock;
-
-    /* See what's in here. */
-    /* FIXME: Add contiguous; query for PSE entries? */
-    for ( i=0; i<(1<<order); i++)
-    {
-        p2m_type_t t;
-
-        gfn_to_mfn_query(p2m, gpfn + i, &t);
-
-        if ( t == p2m_populate_on_demand )
-            pod++;
-        else
-        {
-            nonpod++;
-            if ( p2m_is_ram(t) )
-                ram++;
-        }
-    }
-
-    /* No populate-on-demand?  Don't need to steal anything?  Then we're done!*/
-    if(!pod && !steal_for_cache)
-        goto out_unlock;
-
-    if ( !nonpod )
-    {
-        /* All PoD: Mark the whole region invalid and tell caller
-         * we're done. */
-        set_p2m_entry(p2m, gpfn, _mfn(INVALID_MFN), order, p2m_invalid, p2m->default_access);
-        p2m->pod.entry_count-=(1<<order); /* Lock: p2m */
-        BUG_ON(p2m->pod.entry_count < 0);
-        ret = 1;
-        goto out_entry_check;
-    }
-
-    /* FIXME: Steal contig 2-meg regions for cache */
-
-    /* Process as long as:
-     * + There are PoD entries to handle, or
-     * + There is ram left, and we want to steal it
-     */
-    for ( i=0;
-          i<(1<<order) && (pod>0 || (steal_for_cache && ram > 0));
-          i++)
-    {
-        mfn_t mfn;
-        p2m_type_t t;
-
-        mfn = gfn_to_mfn_query(p2m, gpfn + i, &t);
-        if ( t == p2m_populate_on_demand )
-        {
-            set_p2m_entry(p2m, gpfn + i, _mfn(INVALID_MFN), 0, p2m_invalid, p2m->default_access);
-            p2m->pod.entry_count--; /* Lock: p2m */
-            BUG_ON(p2m->pod.entry_count < 0);
-            pod--;
-        }
-        else if ( steal_for_cache && p2m_is_ram(t) )
-        {
-            struct page_info *page;
-
-            ASSERT(mfn_valid(mfn));
-
-            page = mfn_to_page(mfn);
-
-            set_p2m_entry(p2m, gpfn + i, _mfn(INVALID_MFN), 0, p2m_invalid, p2m->default_access);
-            set_gpfn_from_mfn(mfn_x(mfn), INVALID_M2P_ENTRY);
-
-            p2m_pod_cache_add(p2m, page, 0);
-
-            steal_for_cache =  ( p2m->pod.entry_count > p2m->pod.count );
-
-            nonpod--;
-            ram--;
-        }
-    }    
-
-    /* If there are no more non-PoD entries, tell decrease_reservation() that
-     * there's nothing left to do. */
-    if ( nonpod == 0 )
-        ret = 1;
-
-out_entry_check:
-    /* If we've reduced our "liabilities" beyond our "assets", free some */
-    if ( p2m->pod.entry_count < p2m->pod.count )
-    {
-        p2m_pod_set_cache_target(p2m, p2m->pod.entry_count, 0/*can't preempt*/);
-    }
-
-out_unlock:
-    audit_p2m(p2m, 1);
-    p2m_unlock(p2m);
-
-out:
-    return ret;
-}
-
-void
-p2m_pod_dump_data(struct p2m_domain *p2m)
-{
-    printk("    PoD entries=%d cachesize=%d\n",
-           p2m->pod.entry_count, p2m->pod.count);
-}
-
-
-/* Search for all-zero superpages to be reclaimed as superpages for the
- * PoD cache. Must be called w/ p2m lock held, page_alloc lock not held. */
-static int
-p2m_pod_zero_check_superpage(struct p2m_domain *p2m, unsigned long gfn)
-{
-    mfn_t mfn, mfn0 = _mfn(INVALID_MFN);
-    p2m_type_t type, type0 = 0;
-    unsigned long * map = NULL;
-    int ret=0, reset = 0;
-    int i, j;
-    int max_ref = 1;
-    struct domain *d = p2m->domain;
-
-    if ( !superpage_aligned(gfn) )
-        goto out;
-
-    /* Allow an extra refcount for one shadow pt mapping in shadowed domains */
-    if ( paging_mode_shadow(d) )
-        max_ref++;
-
-    /* Look up the mfns, checking to make sure they're the same mfn
-     * and aligned, and mapping them. */
-    for ( i=0; i<SUPERPAGE_PAGES; i++ )
-    {
-        
-        mfn = gfn_to_mfn_query(p2m, gfn + i, &type);
-
-        if ( i == 0 )
-        {
-            mfn0 = mfn;
-            type0 = type;
-        }
-
-        /* Conditions that must be met for superpage-superpage:
-         * + All gfns are ram types
-         * + All gfns have the same type
-         * + All of the mfns are allocated to a domain
-         * + None of the mfns are used as pagetables, or allocated via xenheap
-         * + The first mfn is 2-meg aligned
-         * + All the other mfns are in sequence
-         * Adding for good measure:
-         * + None of the mfns are likely to be mapped elsewhere (refcount
-         *   2 or less for shadow, 1 for hap)
-         */
-        if ( !p2m_is_ram(type)
-             || type != type0
-             || ( (mfn_to_page(mfn)->count_info & PGC_allocated) == 0 )
-             || ( (mfn_to_page(mfn)->count_info & (PGC_page_table|PGC_xen_heap)) != 0 )
-             || ( (mfn_to_page(mfn)->count_info & PGC_xen_heap  ) != 0 )
-             || ( (mfn_to_page(mfn)->count_info & PGC_count_mask) > max_ref )
-             || !( ( i == 0 && superpage_aligned(mfn_x(mfn0)) )
-                   || ( i != 0 && mfn_x(mfn) == (mfn_x(mfn0) + i) ) ) )
-            goto out;
-    }
-
-    /* Now, do a quick check to see if it may be zero before unmapping. */
-    for ( i=0; i<SUPERPAGE_PAGES; i++ )
-    {
-        /* Quick zero-check */
-        map = map_domain_page(mfn_x(mfn0) + i);
-
-        for ( j=0; j<16; j++ )
-            if( *(map+j) != 0 )
-                break;
-
-        unmap_domain_page(map);
-
-        if ( j < 16 )
-            goto out;
-
-    }
-
-    /* Try to remove the page, restoring old mapping if it fails. */
-    set_p2m_entry(p2m, gfn,
-                  _mfn(POPULATE_ON_DEMAND_MFN), 9,
-                  p2m_populate_on_demand, p2m->default_access);
-
-    /* Make sure none of the MFNs are used elsewhere... for example, mapped
-     * via the grant table interface, or by qemu.  Allow one refcount for
-     * being allocated to the domain. */
-    for ( i=0; i < SUPERPAGE_PAGES; i++ )
-    {
-        mfn = _mfn(mfn_x(mfn0) + i);
-        if ( (mfn_to_page(mfn)->count_info & PGC_count_mask) > 1 )
-        {
-            reset = 1;
-            goto out_reset;
-        }
-    }
-
-    /* Finally, do a full zero-check */
-    for ( i=0; i < SUPERPAGE_PAGES; i++ )
-    {
-        map = map_domain_page(mfn_x(mfn0) + i);
-
-        for ( j=0; j<PAGE_SIZE/sizeof(*map); j++ )
-            if( *(map+j) != 0 )
-            {
-                reset = 1;
-                break;
-            }
-
-        unmap_domain_page(map);
-
-        if ( reset )
-            goto out_reset;
-    }
-
-    if ( tb_init_done )
-    {
-        struct {
-            u64 gfn, mfn;
-            int d:16,order:16;
-        } t;
-
-        t.gfn = gfn;
-        t.mfn = mfn_x(mfn);
-        t.d = d->domain_id;
-        t.order = 9;
-
-        __trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), &t);
-    }
-
-    /* Finally!  We've passed all the checks, and can add the mfn superpage
-     * back on the PoD cache, and account for the new p2m PoD entries */
-    p2m_pod_cache_add(p2m, mfn_to_page(mfn0), 9);
-    p2m->pod.entry_count += SUPERPAGE_PAGES;
-
-out_reset:
-    if ( reset )
-        set_p2m_entry(p2m, gfn, mfn0, 9, type0, p2m->default_access);
-    
-out:
-    return ret;
-}
-
-static void
-p2m_pod_zero_check(struct p2m_domain *p2m, unsigned long *gfns, int count)
-{
-    mfn_t mfns[count];
-    p2m_type_t types[count];
-    unsigned long * map[count];
-    struct domain *d = p2m->domain;
-
-    int i, j;
-    int max_ref = 1;
-
-    /* Allow an extra refcount for one shadow pt mapping in shadowed domains */
-    if ( paging_mode_shadow(d) )
-        max_ref++;
-
-    /* First, get the gfn list, translate to mfns, and map the pages. */
-    for ( i=0; i<count; i++ )
-    {
-        mfns[i] = gfn_to_mfn_query(p2m, gfns[i], types + i);
-        /* If this is ram, and not a pagetable or from the xen heap, and probably not mapped
-           elsewhere, map it; otherwise, skip. */
-        if ( p2m_is_ram(types[i])
-             && ( (mfn_to_page(mfns[i])->count_info & PGC_allocated) != 0 ) 
-             && ( (mfn_to_page(mfns[i])->count_info & (PGC_page_table|PGC_xen_heap)) == 0 )
-             && ( (mfn_to_page(mfns[i])->count_info & PGC_count_mask) <= max_ref ) )
-            map[i] = map_domain_page(mfn_x(mfns[i]));
-        else
-            map[i] = NULL;
-    }
-
-    /* Then, go through and check for zeroed pages, removing write permission
-     * for those with zeroes. */
-    for ( i=0; i<count; i++ )
-    {
-        if(!map[i])
-            continue;
-
-        /* Quick zero-check */
-        for ( j=0; j<16; j++ )
-            if( *(map[i]+j) != 0 )
-                break;
-
-        if ( j < 16 )
-        {
-            unmap_domain_page(map[i]);
-            map[i] = NULL;
-            continue;
-        }
-
-        /* Try to remove the page, restoring old mapping if it fails. */
-        set_p2m_entry(p2m, gfns[i],
-                      _mfn(POPULATE_ON_DEMAND_MFN), 0,
-                      p2m_populate_on_demand, p2m->default_access);
-
-        /* See if the page was successfully unmapped.  (Allow one refcount
-         * for being allocated to a domain.) */
-        if ( (mfn_to_page(mfns[i])->count_info & PGC_count_mask) > 1 )
-        {
-            unmap_domain_page(map[i]);
-            map[i] = NULL;
-
-            set_p2m_entry(p2m, gfns[i], mfns[i], 0, types[i], p2m->default_access);
-
-            continue;
-        }
-    }
-
-    /* Now check each page for real */
-    for ( i=0; i < count; i++ )
-    {
-        if(!map[i])
-            continue;
-
-        for ( j=0; j<PAGE_SIZE/sizeof(*map[i]); j++ )
-            if( *(map[i]+j) != 0 )
-                break;
-
-        unmap_domain_page(map[i]);
-
-        /* See comment in p2m_pod_zero_check_superpage() re gnttab
-         * check timing.  */
-        if ( j < PAGE_SIZE/sizeof(*map[i]) )
-        {
-            set_p2m_entry(p2m, gfns[i], mfns[i], 0, types[i], p2m->default_access);
-        }
-        else
-        {
-            if ( tb_init_done )
-            {
-                struct {
-                    u64 gfn, mfn;
-                    int d:16,order:16;
-                } t;
-
-                t.gfn = gfns[i];
-                t.mfn = mfn_x(mfns[i]);
-                t.d = d->domain_id;
-                t.order = 0;
-        
-                __trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), &t);
-            }
-
-            /* Add to cache, and account for the new p2m PoD entry */
-            p2m_pod_cache_add(p2m, mfn_to_page(mfns[i]), 0);
-            p2m->pod.entry_count++;
-        }
-    }
-    
-}
-
-#define POD_SWEEP_LIMIT 1024
-static void
-p2m_pod_emergency_sweep_super(struct p2m_domain *p2m)
-{
-    unsigned long i, start, limit;
-
-    if ( p2m->pod.reclaim_super == 0 )
-    {
-        p2m->pod.reclaim_super = (p2m->pod.max_guest>>9)<<9;
-        p2m->pod.reclaim_super -= SUPERPAGE_PAGES;
-    }
-    
-    start = p2m->pod.reclaim_super;
-    limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;
-
-    for ( i=p2m->pod.reclaim_super ; i > 0 ; i -= SUPERPAGE_PAGES )
-    {
-        p2m_pod_zero_check_superpage(p2m, i);
-        /* Stop if we're past our limit and we have found *something*.
-         *
-         * NB that this is a zero-sum game; we're increasing our cache size
-         * by increasing our 'debt'.  Since we hold the p2m lock,
-         * (entry_count - count) must remain the same. */
-        if ( !page_list_empty(&p2m->pod.super) &&  i < limit )
-            break;
-    }
-
-    p2m->pod.reclaim_super = i ? i - SUPERPAGE_PAGES : 0;
-}
-
-#define POD_SWEEP_STRIDE  16
-static void
-p2m_pod_emergency_sweep(struct p2m_domain *p2m)
-{
-    unsigned long gfns[POD_SWEEP_STRIDE];
-    unsigned long i, j=0, start, limit;
-    p2m_type_t t;
-
-
-    if ( p2m->pod.reclaim_single == 0 )
-        p2m->pod.reclaim_single = p2m->pod.max_guest;
-
-    start = p2m->pod.reclaim_single;
-    limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;
-
-    /* FIXME: Figure out how to avoid superpages */
-    for ( i=p2m->pod.reclaim_single; i > 0 ; i-- )
-    {
-        gfn_to_mfn_query(p2m, i, &t );
-        if ( p2m_is_ram(t) )
-        {
-            gfns[j] = i;
-            j++;
-            BUG_ON(j > POD_SWEEP_STRIDE);
-            if ( j == POD_SWEEP_STRIDE )
-            {
-                p2m_pod_zero_check(p2m, gfns, j);
-                j = 0;
-            }
-        }
-        /* Stop if we're past our limit and we have found *something*.
-         *
-         * NB that this is a zero-sum game; we're increasing our cache size
-         * by re-increasing our 'debt'.  Since we hold the p2m lock,
-         * (entry_count - count) must remain the same. */
-        if ( p2m->pod.count > 0 && i < limit )
-            break;
-    }
-
-    if ( j )
-        p2m_pod_zero_check(p2m, gfns, j);
-
-    p2m->pod.reclaim_single = i ? i - 1 : i;
-
-}
-
-int
-p2m_pod_demand_populate(struct p2m_domain *p2m, unsigned long gfn,
-                        unsigned int order,
-                        p2m_query_t q)
-{
-    struct domain *d = p2m->domain;
-    struct page_info *p = NULL; /* Compiler warnings */
-    unsigned long gfn_aligned;
-    mfn_t mfn;
-    int i;
-
-    ASSERT(p2m_locked_by_me(p2m));
-
-    /* This check is done with the p2m lock held.  This will make sure that
-     * even if d->is_dying changes under our feet, p2m_pod_empty_cache() 
-     * won't start until we're done. */
-    if ( unlikely(d->is_dying) )
-        goto out_fail;
-
-    /* Because PoD does not have cache list for 1GB pages, it has to remap
-     * 1GB region to 2MB chunks for a retry. */
-    if ( order == 18 )
-    {
-        gfn_aligned = (gfn >> order) << order;
-        /* Note that we are supposed to call set_p2m_entry() 512 times to 
-         * split 1GB into 512 2MB pages here. But We only do once here because
-         * set_p2m_entry() should automatically shatter the 1GB page into 
-         * 512 2MB pages. The rest of 511 calls are unnecessary.
-         */
-        set_p2m_entry(p2m, gfn_aligned, _mfn(POPULATE_ON_DEMAND_MFN), 9,
-                      p2m_populate_on_demand, p2m->default_access);
-        audit_p2m(p2m, 1);
-        p2m_unlock(p2m);
-        return 0;
-    }
-
-    /* Once we've ballooned down enough that we can fill the remaining
-     * PoD entries from the cache, don't sweep even if the particular
-     * list we want to use is empty: that can lead to thrashing zero pages 
-     * through the cache for no good reason.  */
-    if ( p2m->pod.entry_count > p2m->pod.count )
-    {
-
-        /* If we're low, start a sweep */
-        if ( order == 9 && page_list_empty(&p2m->pod.super) )
-            p2m_pod_emergency_sweep_super(p2m);
-
-        if ( page_list_empty(&p2m->pod.single) &&
-             ( ( order == 0 )
-               || (order == 9 && page_list_empty(&p2m->pod.super) ) ) )
-            p2m_pod_emergency_sweep(p2m);
-    }
-
-    /* Keep track of the highest gfn demand-populated by a guest fault */
-    if ( q == p2m_guest && gfn > p2m->pod.max_guest )
-        p2m->pod.max_guest = gfn;
-
-    spin_lock(&d->page_alloc_lock);
-
-    if ( p2m->pod.count == 0 )
-        goto out_of_memory;
-
-    /* Get a page f/ the cache.  A NULL return value indicates that the
-     * 2-meg range should be marked singleton PoD, and retried */
-    if ( (p = p2m_pod_cache_get(p2m, order)) == NULL )
-        goto remap_and_retry;
-
-    mfn = page_to_mfn(p);
-
-    BUG_ON((mfn_x(mfn) & ((1 << order)-1)) != 0);
-
-    spin_unlock(&d->page_alloc_lock);
-
-    gfn_aligned = (gfn >> order) << order;
-
-    set_p2m_entry(p2m, gfn_aligned, mfn, order, p2m_ram_rw, p2m->default_access);
-
-    for( i = 0; i < (1UL << order); i++ )
-    {
-        set_gpfn_from_mfn(mfn_x(mfn) + i, gfn_aligned + i);
-        paging_mark_dirty(d, mfn_x(mfn) + i);
-    }
-    
-    p2m->pod.entry_count -= (1 << order); /* Lock: p2m */
-    BUG_ON(p2m->pod.entry_count < 0);
-
-    if ( tb_init_done )
-    {
-        struct {
-            u64 gfn, mfn;
-            int d:16,order:16;
-        } t;
-
-        t.gfn = gfn;
-        t.mfn = mfn_x(mfn);
-        t.d = d->domain_id;
-        t.order = order;
-        
-        __trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), &t);
-    }
-
-    return 0;
-out_of_memory:
-    spin_unlock(&d->page_alloc_lock);
-
-    printk("%s: Out of populate-on-demand memory! tot_pages %" PRIu32 " 
pod_entries %" PRIi32 "\n",
-           __func__, d->tot_pages, p2m->pod.entry_count);
-    domain_crash(d);
-out_fail:
-    return -1;
-remap_and_retry:
-    BUG_ON(order != 9);
-    spin_unlock(&d->page_alloc_lock);
-
-    /* Remap this 2-meg region in singleton chunks */
-    gfn_aligned = (gfn>>order)<<order;
-    for(i=0; i<(1<<order); i++)
-        set_p2m_entry(p2m, gfn_aligned+i, _mfn(POPULATE_ON_DEMAND_MFN), 0,
-                      p2m_populate_on_demand, p2m->default_access);
-    if ( tb_init_done )
-    {
-        struct {
-            u64 gfn;
-            int d:16;
-        } t;
-
-        t.gfn = gfn;
-        t.d = d->domain_id;
-        
-        __trace_var(TRC_MEM_POD_SUPERPAGE_SPLINTER, 0, sizeof(t), &t);
-    }
-
-    return 0;
-}
-
-/* Non-ept "lock-and-check" wrapper */
-static int p2m_pod_check_and_populate(struct p2m_domain *p2m, unsigned long gfn,
-                                      l1_pgentry_t *p2m_entry, int order,
-                                      p2m_query_t q)
-{
-    /* Only take the lock if we don't already have it.  Otherwise it
-     * wouldn't be safe to do p2m lookups with the p2m lock held */
-    int do_locking = !p2m_locked_by_me(p2m);
-    int r;
-
-    if ( do_locking )
-        p2m_lock(p2m);
-
-    audit_p2m(p2m, 1);
-
-    /* Check to make sure this is still PoD */
-    if ( p2m_flags_to_type(l1e_get_flags(*p2m_entry)) != p2m_populate_on_demand )
-    {
-        if ( do_locking )
-            p2m_unlock(p2m);
-        return 0;
-    }
-
-    r = p2m_pod_demand_populate(p2m, gfn, order, q);
-
-    audit_p2m(p2m, 1);
-    if ( do_locking )
-        p2m_unlock(p2m);
-
-    return r;
-}
-
-// Returns 0 on error (out of memory)
-static int
-p2m_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn, 
-              unsigned int page_order, p2m_type_t p2mt, p2m_access_t p2ma)
-{
-    // XXX -- this might be able to be faster iff current->domain == d
-    mfn_t table_mfn = pagetable_get_mfn(p2m_get_pagetable(p2m));
-    void *table =map_domain_page(mfn_x(table_mfn));
-    unsigned long i, gfn_remainder = gfn;
-    l1_pgentry_t *p2m_entry;
-    l1_pgentry_t entry_content;
-    l2_pgentry_t l2e_content;
-    l3_pgentry_t l3e_content;
-    int rv=0;
-    unsigned int iommu_pte_flags = (p2mt == p2m_ram_rw) ?
-                                   IOMMUF_readable|IOMMUF_writable:
-                                   0; 
-    unsigned long old_mfn = 0;
-
-    if ( tb_init_done )
-    {
-        struct {
-            u64 gfn, mfn;
-            int p2mt;
-            int d:16,order:16;
-        } t;
-
-        t.gfn = gfn;
-        t.mfn = mfn_x(mfn);
-        t.p2mt = p2mt;
-        t.d = p2m->domain->domain_id;
-        t.order = page_order;
-
-        __trace_var(TRC_MEM_SET_P2M_ENTRY, 0, sizeof(t), &t);
-    }
-
-#if CONFIG_PAGING_LEVELS >= 4
-    if ( !p2m_next_level(p2m, &table_mfn, &table, &gfn_remainder, gfn,
-                         L4_PAGETABLE_SHIFT - PAGE_SHIFT,
-                         L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
-        goto out;
-#endif
-    /*
-     * Try to allocate 1GB page table if this feature is supported.
-     */
-    if ( page_order == 18 )
-    {
-        l1_pgentry_t old_entry = l1e_empty();
-        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
-                                   L3_PAGETABLE_SHIFT - PAGE_SHIFT,
-                                   L3_PAGETABLE_ENTRIES);
-        ASSERT(p2m_entry);
-        if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
-             !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
-        {
-            /* We're replacing a non-SP page with a superpage.  Make sure to
-             * handle freeing the table properly. */
-            old_entry = *p2m_entry;
-        }
-
-        ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
-        l3e_content = mfn_valid(mfn) 
-            ? l3e_from_pfn(mfn_x(mfn),
-                           p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE)
-            : l3e_empty();
-        entry_content.l1 = l3e_content.l3;
-
-        if ( entry_content.l1 != 0 )
-        {
-            p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
-            old_mfn = l1e_get_pfn(*p2m_entry);
-        }
-
-        p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 3);
-        /* NB: paging_write_p2m_entry() handles tlb flushes properly */
-
-        /* Free old intermediate tables if necessary */
-        if ( l1e_get_flags(old_entry) & _PAGE_PRESENT )
-            p2m_free_entry(p2m, &old_entry, page_order);
-    }
-    /*
-     * When using PAE Xen, we only allow 33 bits of pseudo-physical
-     * address in translated guests (i.e. 8 GBytes).  This restriction
-     * comes from wanting to map the P2M table into the 16MB RO_MPT hole
-     * in Xen's address space for translated PV guests.
-     * When using AMD's NPT on PAE Xen, we are restricted to 4GB.
-     */
-    else if ( !p2m_next_level(p2m, &table_mfn, &table, &gfn_remainder, gfn,
-                              L3_PAGETABLE_SHIFT - PAGE_SHIFT,
-                              ((CONFIG_PAGING_LEVELS == 3)
-                               ? (hap_enabled(p2m->domain) ? 4 : 8)
-                               : L3_PAGETABLE_ENTRIES),
-                              PGT_l2_page_table) )
-        goto out;
-
-    if ( page_order == 0 )
-    {
-        if ( !p2m_next_level(p2m, &table_mfn, &table, &gfn_remainder, gfn,
-                             L2_PAGETABLE_SHIFT - PAGE_SHIFT,
-                             L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
-            goto out;
-
-        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
-                                   0, L1_PAGETABLE_ENTRIES);
-        ASSERT(p2m_entry);
-        
-        if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) )
-            entry_content = l1e_from_pfn(mfn_x(mfn),
-                                         p2m_type_to_flags(p2mt, mfn));
-        else
-            entry_content = l1e_empty();
-
-        if ( entry_content.l1 != 0 )
-        {
-            p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
-            old_mfn = l1e_get_pfn(*p2m_entry);
-        }
-        /* level 1 entry */
-        p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 1);
-        /* NB: paging_write_p2m_entry() handles tlb flushes properly */
-    }
-    else if ( page_order == 9 )
-    {
-        l1_pgentry_t old_entry = l1e_empty();
-        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
-                                   L2_PAGETABLE_SHIFT - PAGE_SHIFT,
-                                   L2_PAGETABLE_ENTRIES);
-        ASSERT(p2m_entry);
-        
-        /* FIXME: Deal with 4k replaced by 2meg pages */
-        if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
-             !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
-        {
-            /* We're replacing a non-SP page with a superpage.  Make sure to
-             * handle freeing the table properly. */
-            old_entry = *p2m_entry;
-        }
-        
-        ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
-        if ( mfn_valid(mfn) || p2m_is_magic(p2mt) )
-            l2e_content = l2e_from_pfn(mfn_x(mfn),
-                                       p2m_type_to_flags(p2mt, mfn) |
-                                       _PAGE_PSE);
-        else
-            l2e_content = l2e_empty();
-        
-        entry_content.l1 = l2e_content.l2;
-
-        if ( entry_content.l1 != 0 )
-        {
-            p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
-            old_mfn = l1e_get_pfn(*p2m_entry);
-        }
-
-        p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 2);
-        /* NB: paging_write_p2m_entry() handles tlb flushes properly */
-
-        /* Free old intermediate tables if necessary */
-        if ( l1e_get_flags(old_entry) & _PAGE_PRESENT )
-            p2m_free_entry(p2m, &old_entry, page_order);
-    }
-
-    /* Track the highest gfn for which we have ever had a valid mapping */
-    if ( mfn_valid(mfn) 
-         && (gfn + (1UL << page_order) - 1 > p2m->max_mapped_pfn) )
-        p2m->max_mapped_pfn = gfn + (1UL << page_order) - 1;
-
-    if ( iommu_enabled && need_iommu(p2m->domain) )
-    {
-        if ( iommu_hap_pt_share )
-        {
-            if ( old_mfn && (old_mfn != mfn_x(mfn)) )
-                amd_iommu_flush_pages(p2m->domain, gfn, page_order);
-        }
-        else
-        {
-            if ( p2mt == p2m_ram_rw )
-                for ( i = 0; i < (1UL << page_order); i++ )
-                    iommu_map_page(p2m->domain, gfn+i, mfn_x(mfn)+i,
-                                   IOMMUF_readable|IOMMUF_writable);
-            else
-                for ( int i = 0; i < (1UL << page_order); i++ )
-                    iommu_unmap_page(p2m->domain, gfn+i);
-        }
-    }
-
-    /* Success */
-    rv = 1;
-
-out:
-    unmap_domain_page(table);
-    return rv;
-}
-
-static mfn_t
-p2m_gfn_to_mfn(struct p2m_domain *p2m, unsigned long gfn, p2m_type_t *t, p2m_access_t *a,
-               p2m_query_t q)
-{
-    mfn_t mfn;
-    paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
-    l2_pgentry_t *l2e;
-    l1_pgentry_t *l1e;
-
-    ASSERT(paging_mode_translate(p2m->domain));
-
-    /* XXX This is for compatibility with the old model, where anything not 
-     * XXX marked as RAM was considered to be emulated MMIO space.
-     * XXX Once we start explicitly registering MMIO regions in the p2m 
-     * XXX we will return p2m_invalid for unmapped gfns */
-    *t = p2m_mmio_dm;
-    /* Not implemented except with EPT */
-    *a = p2m_access_rwx; 
-
-    mfn = pagetable_get_mfn(p2m_get_pagetable(p2m));
-
-    if ( gfn > p2m->max_mapped_pfn )
-        /* This pfn is higher than the highest the p2m map currently holds */
-        return _mfn(INVALID_MFN);
-
-#if CONFIG_PAGING_LEVELS >= 4
-    {
-        l4_pgentry_t *l4e = map_domain_page(mfn_x(mfn));
-        l4e += l4_table_offset(addr);
-        if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
-        {
-            unmap_domain_page(l4e);
-            return _mfn(INVALID_MFN);
-        }
-        mfn = _mfn(l4e_get_pfn(*l4e));
-        unmap_domain_page(l4e);
-    }
-#endif
-    {
-        l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn));
-#if CONFIG_PAGING_LEVELS == 3
-        /* On PAE hosts the p2m has eight l3 entries, not four (see
-         * shadow_set_p2m_entry()) so we can't use l3_table_offset.
-         * Instead, just count the number of l3es from zero.  It's safe
-         * to do this because we already checked that the gfn is within
-         * the bounds of the p2m. */
-        l3e += (addr >> L3_PAGETABLE_SHIFT);
-#else
-        l3e += l3_table_offset(addr);
-#endif
-pod_retry_l3:
-        if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
-        {
-            if ( p2m_flags_to_type(l3e_get_flags(*l3e)) == p2m_populate_on_demand )
-            {
-                if ( q != p2m_query )
-                {
-                    if ( !p2m_pod_demand_populate(p2m, gfn, 18, q) )
-                        goto pod_retry_l3;
-                }
-                else
-                    *t = p2m_populate_on_demand;
-            }
-            unmap_domain_page(l3e);
-            return _mfn(INVALID_MFN);
-        }
-        else if ( (l3e_get_flags(*l3e) & _PAGE_PSE) )
-        {
-            mfn = _mfn(l3e_get_pfn(*l3e) +
-                       l2_table_offset(addr) * L1_PAGETABLE_ENTRIES +
-                       l1_table_offset(addr));
-            *t = p2m_flags_to_type(l3e_get_flags(*l3e));
-            unmap_domain_page(l3e);
-
-            ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
-            return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
-        }
-
-        mfn = _mfn(l3e_get_pfn(*l3e));
-        unmap_domain_page(l3e);
-    }
-
-    l2e = map_domain_page(mfn_x(mfn));
-    l2e += l2_table_offset(addr);
-
-pod_retry_l2:
-    if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
-    {
-        /* PoD: Try to populate a 2-meg chunk */
-        if ( p2m_flags_to_type(l2e_get_flags(*l2e)) == p2m_populate_on_demand )
-        {
-            if ( q != p2m_query ) {
-                if ( !p2m_pod_check_and_populate(p2m, gfn,
-                                                 (l1_pgentry_t *)l2e, 9, q) )
-                    goto pod_retry_l2;
-            } else
-                *t = p2m_populate_on_demand;
-        }
-    
-        unmap_domain_page(l2e);
-        return _mfn(INVALID_MFN);
-    }
-    else if ( (l2e_get_flags(*l2e) & _PAGE_PSE) )
-    {
-        mfn = _mfn(l2e_get_pfn(*l2e) + l1_table_offset(addr));
-        *t = p2m_flags_to_type(l2e_get_flags(*l2e));
-        unmap_domain_page(l2e);
-        
-        ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
-        return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
-    }
-
-    mfn = _mfn(l2e_get_pfn(*l2e));
-    unmap_domain_page(l2e);
-
-    l1e = map_domain_page(mfn_x(mfn));
-    l1e += l1_table_offset(addr);
-pod_retry_l1:
-    if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
-    {
-        /* PoD: Try to populate */
-        if ( p2m_flags_to_type(l1e_get_flags(*l1e)) == p2m_populate_on_demand )
-        {
-            if ( q != p2m_query ) {
-                if ( !p2m_pod_check_and_populate(p2m, gfn,
-                                                 (l1_pgentry_t *)l1e, 0, q) )
-                    goto pod_retry_l1;
-            } else
-                *t = p2m_populate_on_demand;
-        }
-    
-        unmap_domain_page(l1e);
-        return _mfn(INVALID_MFN);
-    }
-    mfn = _mfn(l1e_get_pfn(*l1e));
-    *t = p2m_flags_to_type(l1e_get_flags(*l1e));
-    unmap_domain_page(l1e);
-
-    ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
-    return (p2m_is_valid(*t) || p2m_is_grant(*t)) ? mfn : _mfn(INVALID_MFN);
-}
-
-/* Read the current domain's p2m table (through the linear mapping). */
-static mfn_t p2m_gfn_to_mfn_current(struct p2m_domain *p2m,
-                                    unsigned long gfn, p2m_type_t *t, p2m_access_t *a,
-                                    p2m_query_t q)
-{
-    mfn_t mfn = _mfn(INVALID_MFN);
-    p2m_type_t p2mt = p2m_mmio_dm;
-    paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
-    /* XXX This is for compatibility with the old model, where anything not 
-     * XXX marked as RAM was considered to be emulated MMIO space.
-     * XXX Once we start explicitly registering MMIO regions in the p2m 
-     * XXX we will return p2m_invalid for unmapped gfns */
-
-    /* Not currently implemented except for EPT */
-    *a = p2m_access_rwx;
-
-    if ( gfn <= p2m->max_mapped_pfn )
-    {
-        l1_pgentry_t l1e = l1e_empty(), *p2m_entry;
-        l2_pgentry_t l2e = l2e_empty();
-        int ret;
-#if CONFIG_PAGING_LEVELS >= 4
-        l3_pgentry_t l3e = l3e_empty();
-#endif
-
-        ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START) 
-               / sizeof(l1_pgentry_t));
-
-#if CONFIG_PAGING_LEVELS >= 4
-        /*
-         * Read & process L3
-         */
-        p2m_entry = (l1_pgentry_t *)
-            &__linear_l2_table[l2_linear_offset(RO_MPT_VIRT_START)
-                               + l3_linear_offset(addr)];
-    pod_retry_l3:
-        ret = __copy_from_user(&l3e, p2m_entry, sizeof(l3e));
-
-        if ( ret != 0 || !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
-        {
-            if ( (l3e_get_flags(l3e) & _PAGE_PSE) &&
-                 (p2m_flags_to_type(l3e_get_flags(l3e)) == p2m_populate_on_demand) )
-            {
-                /* The read has succeeded, so we know that mapping exists */
-                if ( q != p2m_query )
-                {
-                    if ( !p2m_pod_demand_populate(p2m, gfn, 18, q) )
-                        goto pod_retry_l3;
-                    p2mt = p2m_invalid;
-                    printk("%s: Allocate 1GB failed!\n", __func__);
-                    goto out;
-                }
-                else
-                {
-                    p2mt = p2m_populate_on_demand;
-                    goto out;
-                }
-            }
-            goto pod_retry_l2;
-        }
-
-        if ( l3e_get_flags(l3e) & _PAGE_PSE )
-        {
-            p2mt = p2m_flags_to_type(l3e_get_flags(l3e));
-            ASSERT(l3e_get_pfn(l3e) != INVALID_MFN || !p2m_is_ram(p2mt));
-            if (p2m_is_valid(p2mt) )
-                mfn = _mfn(l3e_get_pfn(l3e) + 
-                           l2_table_offset(addr) * L1_PAGETABLE_ENTRIES + 
-                           l1_table_offset(addr));
-            else
-                p2mt = p2m_mmio_dm;
-            
-            goto out;
-        }
-#endif
-        /*
-         * Read & process L2
-         */
-        p2m_entry = &__linear_l1_table[l1_linear_offset(RO_MPT_VIRT_START)
-                                       + l2_linear_offset(addr)];
-
-    pod_retry_l2:
-        ret = __copy_from_user(&l2e,
-                               p2m_entry,
-                               sizeof(l2e));
-        if ( ret != 0
-             || !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
-        {
-            if( (l2e_get_flags(l2e) & _PAGE_PSE)
-                && ( p2m_flags_to_type(l2e_get_flags(l2e))
-                     == p2m_populate_on_demand ) )
-            {
-                /* The read has succeeded, so we know that the mapping
-                 * exits at this point.  */
-                if ( q != p2m_query )
-                {
-                    if ( !p2m_pod_check_and_populate(p2m, gfn,
-                                                     p2m_entry, 9, q) )
-                        goto pod_retry_l2;
-
-                    /* Allocate failed. */
-                    p2mt = p2m_invalid;
-                    printk("%s: Allocate failed!\n", __func__);
-                    goto out;
-                }
-                else
-                {
-                    p2mt = p2m_populate_on_demand;
-                    goto out;
-                }
-            }
-
-            goto pod_retry_l1;
-        }
-        
-        if (l2e_get_flags(l2e) & _PAGE_PSE)
-        {
-            p2mt = p2m_flags_to_type(l2e_get_flags(l2e));
-            ASSERT(l2e_get_pfn(l2e) != INVALID_MFN || !p2m_is_ram(p2mt));
-
-            if ( p2m_is_valid(p2mt) )
-                mfn = _mfn(l2e_get_pfn(l2e) + l1_table_offset(addr));
-            else
-                p2mt = p2m_mmio_dm;
-
-            goto out;
-        }
-
-        /*
-         * Read and process L1
-         */
-
-        /* Need to __copy_from_user because the p2m is sparse and this
-         * part might not exist */
-    pod_retry_l1:
-        p2m_entry = &phys_to_machine_mapping[gfn];
-
-        ret = __copy_from_user(&l1e,
-                               p2m_entry,
-                               sizeof(l1e));
-            
-        if ( ret == 0 ) {
-            p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
-            ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
-
-            if ( p2m_flags_to_type(l1e_get_flags(l1e))
-                 == p2m_populate_on_demand )
-            {
-                /* The read has succeeded, so we know that the mapping
-                 * exits at this point.  */
-                if ( q != p2m_query )
-                {
-                    if ( !p2m_pod_check_and_populate(p2m, gfn,
-                                                     (l1_pgentry_t *)p2m_entry, 0, q) )
-                        goto pod_retry_l1;
-
-                    /* Allocate failed. */
-                    p2mt = p2m_invalid;
-                    goto out;
-                }
-                else
-                {
-                    p2mt = p2m_populate_on_demand;
-                    goto out;
-                }
-            }
-
-            if ( p2m_is_valid(p2mt) || p2m_is_grant(p2mt) )
-                mfn = _mfn(l1e_get_pfn(l1e));
-            else 
-                /* XXX see above */
-                p2mt = p2m_mmio_dm;
-        }
-    }
-out:
-    *t = p2mt;
-    return mfn;
-}
+/* XXX declare functions moved to p2m-pt.c */
+extern void p2m_pt_init(struct p2m_domain *p2m);
 
 /* Init the datastructures for later use by the p2m code */
 static void p2m_initialise(struct domain *d, struct p2m_domain *p2m)
@@ -1930,15 +87,12 @@
     p2m->default_access = p2m_access_rwx;
 
     p2m->cr3 = CR3_EADDR;
-    p2m->set_entry = p2m_set_entry;
-    p2m->get_entry = p2m_gfn_to_mfn;
-    p2m->get_entry_current = p2m_gfn_to_mfn_current;
-    p2m->change_entry_type_global = p2m_change_type_global;
-    p2m->write_p2m_entry = paging_write_p2m_entry;
     cpus_clear(p2m->p2m_dirty_cpumask);
 
     if ( hap_enabled(d) && (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) )
-        ept_p2m_init(d);
+        ept_p2m_init(p2m);
+    else
+        p2m_pt_init(p2m);
 
     return;
 }
@@ -1986,7 +140,6 @@
     p2m_unlock(p2m);
 }
 
-static
 int set_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn, 
                   unsigned int page_order, p2m_type_t p2mt, p2m_access_t p2ma)
 {
@@ -2162,275 +315,6 @@
     p2m_teardown_nestedp2m(d);
 }
 
-#if P2M_AUDIT
-/* strict_m2p == 0 allows m2p mappings that don't match the p2m. 
- * It's intended for add_to_physmap, when the domain has just been allocated 
- * new mfns that might have stale m2p entries from previous owners */
-static void audit_p2m(struct p2m_domain *p2m, int strict_m2p)
-{
-    struct page_info *page;
-    struct domain *od;
-    unsigned long mfn, gfn, m2pfn, lp2mfn = 0;
-    int entry_count = 0;
-    mfn_t p2mfn;
-    unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0;
-    int test_linear;
-    p2m_type_t type;
-    struct domain *d = p2m->domain;
-
-    if ( !paging_mode_translate(d) )
-        return;
-
-    //P2M_PRINTK("p2m audit starts\n");
-
-    test_linear = ( (d == current->domain)
-                    && !pagetable_is_null(current->arch.monitor_table) );
-    if ( test_linear )
-        flush_tlb_local();
-
-    spin_lock(&d->page_alloc_lock);
-
-    /* Audit part one: walk the domain's page allocation list, checking
-     * the m2p entries. */
-    page_list_for_each ( page, &d->page_list )
-    {
-        mfn = mfn_x(page_to_mfn(page));
-
-        // P2M_PRINTK("auditing guest page, mfn=%#lx\n", mfn);
-
-        od = page_get_owner(page);
-
-        if ( od != d )
-        {
-            P2M_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n",
-                       mfn, od, (od?od->domain_id:-1), d, d->domain_id);
-            continue;
-        }
-
-        gfn = get_gpfn_from_mfn(mfn);
-        if ( gfn == INVALID_M2P_ENTRY )
-        {
-            orphans_i++;
-            //P2M_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n",
-            //               mfn);
-            continue;
-        }
-
-        if ( gfn == 0x55555555 || gfn == 0x5555555555555555 )
-        {
-            orphans_d++;
-            //P2M_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n",
-            //               mfn);
-            continue;
-        }
-
-        if ( gfn == SHARED_M2P_ENTRY )
-        {
-            P2M_PRINTK("shared mfn (%lx) on domain page list!\n",
-                    mfn);
-            continue;
-        }
-
-        p2mfn = gfn_to_mfn_type_p2m(p2m, gfn, &type, p2m_query);
-        if ( strict_m2p && mfn_x(p2mfn) != mfn )
-        {
-            mpbad++;
-            P2M_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx"
-                       " (-> gfn %#lx)\n",
-                       mfn, gfn, mfn_x(p2mfn),
-                       (mfn_valid(p2mfn)
-                        ? get_gpfn_from_mfn(mfn_x(p2mfn))
-                        : -1u));
-            /* This m2p entry is stale: the domain has another frame in
-             * this physical slot.  No great disaster, but for neatness,
-             * blow away the m2p entry. */
-            set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
-        }
-
-        if ( test_linear && (gfn <= p2m->max_mapped_pfn) )
-        {
-            lp2mfn = mfn_x(gfn_to_mfn_query(p2m, gfn, &type));
-            if ( lp2mfn != mfn_x(p2mfn) )
-            {
-                P2M_PRINTK("linear mismatch gfn %#lx -> mfn %#lx "
-                           "(!= mfn %#lx)\n", gfn, lp2mfn, mfn_x(p2mfn));
-            }
-        }
-
-        // P2M_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n",
-        //                mfn, gfn, mfn_x(p2mfn), lp2mfn);
-    }
-
-    spin_unlock(&d->page_alloc_lock);
-
-    /* Audit part two: walk the domain's p2m table, checking the entries. */
-    if ( pagetable_get_pfn(p2m_get_pagetable(p2m)) != 0 )
-    {
-        l2_pgentry_t *l2e;
-        l1_pgentry_t *l1e;
-        int i1, i2;
-
-#if CONFIG_PAGING_LEVELS == 4
-        l4_pgentry_t *l4e;
-        l3_pgentry_t *l3e;
-        int i4, i3;
-        l4e = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
-#else /* CONFIG_PAGING_LEVELS == 3 */
-        l3_pgentry_t *l3e;
-        int i3;
-        l3e = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
-#endif
-
-        gfn = 0;
-#if CONFIG_PAGING_LEVELS >= 4
-        for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
-        {
-            if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
-            {
-                gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT);
-                continue;
-            }
-            l3e = map_domain_page(mfn_x(_mfn(l4e_get_pfn(l4e[i4]))));
-#endif
-            for ( i3 = 0;
-                  i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
-                  i3++ )
-            {
-                if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
-                {
-                    gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
-                    continue;
-                }
-
-                /* check for 1GB super page */
-                if ( l3e_get_flags(l3e[i3]) & _PAGE_PSE )
-                {
-                    mfn = l3e_get_pfn(l3e[i3]);
-                    ASSERT(mfn_valid(_mfn(mfn)));
-                    /* we have to cover 512x512 4K pages */
-                    for ( i2 = 0; 
-                          i2 < (L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES);
-                          i2++)
-                    {
-                        m2pfn = get_gpfn_from_mfn(mfn+i2);
-                        if ( m2pfn != (gfn + i2) )
-                        {
-                            pmbad++;
-                            P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
-                                       " -> gfn %#lx\n", gfn+i2, mfn+i2,
-                                       m2pfn);
-                            BUG();
-                        }
-                        gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
-                        continue;
-                    }
-                }
-
-                l2e = map_domain_page(mfn_x(_mfn(l3e_get_pfn(l3e[i3]))));
-                for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
-                {
-                    if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
-                    {
-                        if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE)
-                             && ( p2m_flags_to_type(l2e_get_flags(l2e[i2]))
-                                  == p2m_populate_on_demand ) )
-                            entry_count+=SUPERPAGE_PAGES;
-                        gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
-                        continue;
-                    }
-                    
-                    /* check for super page */
-                    if ( l2e_get_flags(l2e[i2]) & _PAGE_PSE )
-                    {
-                        mfn = l2e_get_pfn(l2e[i2]);
-                        ASSERT(mfn_valid(_mfn(mfn)));
-                        for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++)
-                        {
-                            m2pfn = get_gpfn_from_mfn(mfn+i1);
-                            /* Allow shared M2Ps */
-                            if ( (m2pfn != (gfn + i1)) &&
-                                 (m2pfn != SHARED_M2P_ENTRY) )
-                            {
-                                pmbad++;
-                                P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
-                                           " -> gfn %#lx\n", gfn+i1, mfn+i1,
-                                           m2pfn);
-                                BUG();
-                            }
-                        }
-                        gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
-                        continue;
-                    }
-
-                    l1e = map_domain_page(mfn_x(_mfn(l2e_get_pfn(l2e[i2]))));
-
-                    for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
-                    {
-                        p2m_type_t type;
-
-                        type = p2m_flags_to_type(l1e_get_flags(l1e[i1]));
-                        if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) )
-                        {
-                            if ( type == p2m_populate_on_demand )
-                                entry_count++;
-                            continue;
-                        }
-                        mfn = l1e_get_pfn(l1e[i1]);
-                        ASSERT(mfn_valid(_mfn(mfn)));
-                        m2pfn = get_gpfn_from_mfn(mfn);
-                        if ( m2pfn != gfn &&
-                             type != p2m_mmio_direct &&
-                             !p2m_is_grant(type) &&
-                             !p2m_is_shared(type) )
-                        {
-                            pmbad++;
-                            printk("mismatch: gfn %#lx -> mfn %#lx"
-                                   " -> gfn %#lx\n", gfn, mfn, m2pfn);
-                            P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
-                                       " -> gfn %#lx\n", gfn, mfn, m2pfn);
-                            BUG();
-                        }
-                    }
-                    unmap_domain_page(l1e);
-                }
-                unmap_domain_page(l2e);
-            }
-#if CONFIG_PAGING_LEVELS >= 4
-            unmap_domain_page(l3e);
-        }
-#endif
-
-#if CONFIG_PAGING_LEVELS == 4
-        unmap_domain_page(l4e);
-#else /* CONFIG_PAGING_LEVELS == 3 */
-        unmap_domain_page(l3e);
-#endif
-
-    }
-
-    if ( entry_count != p2m->pod.entry_count )
-    {
-        printk("%s: refcounted entry count %d, audit count %d!\n",
-               __func__,
-               p2m->pod.entry_count,
-               entry_count);
-        BUG();
-    }
-        
-    //P2M_PRINTK("p2m audit complete\n");
-    //if ( orphans_i | orphans_d | mpbad | pmbad )
-    //    P2M_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n",
-    //                   orphans_i + orphans_d, orphans_i, orphans_d);
-    if ( mpbad | pmbad )
-    {
-        P2M_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n",
-                   pmbad, mpbad);
-        WARN();
-    }
-}
-#endif /* P2M_AUDIT */
-
-
 
 static void
 p2m_remove_page(struct p2m_domain *p2m, unsigned long gfn, unsigned long mfn,
@@ -2475,88 +359,6 @@
     p2m_unlock(p2m);
 }
 
-#if CONFIG_PAGING_LEVELS == 3
-static int gfn_check_limit(
-    struct domain *d, unsigned long gfn, unsigned int order)
-{
-    /*
-     * 32bit AMD nested paging does not support over 4GB guest due to 
-     * hardware translation limit. This limitation is checked by comparing
-     * gfn with 0xfffffUL.
-     */
-    if ( !hap_enabled(d) || ((gfn + (1ul << order)) <= 0x100000UL) ||
-         (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) )
-        return 0;
-
-    if ( !test_and_set_bool(d->arch.hvm_domain.svm.npt_4gb_warning) )
-        dprintk(XENLOG_WARNING, "Dom%d failed to populate memory beyond"
-                " 4GB: specify 'hap=0' domain config option.\n",
-                d->domain_id);
-
-    return -EINVAL;
-}
-#else
-#define gfn_check_limit(d, g, o) 0
-#endif
-
-int
-guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn,
-                                      unsigned int order)
-{
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-    unsigned long i;
-    p2m_type_t ot;
-    mfn_t omfn;
-    int pod_count = 0;
-    int rc = 0;
-
-    BUG_ON(!paging_mode_translate(d));
-
-    rc = gfn_check_limit(d, gfn, order);
-    if ( rc != 0 )
-        return rc;
-
-    p2m_lock(p2m);
-    audit_p2m(p2m, 1);
-
-    P2M_DEBUG("mark pod gfn=%#lx\n", gfn);
-
-    /* Make sure all gpfns are unused */
-    for ( i = 0; i < (1UL << order); i++ )
-    {
-        omfn = gfn_to_mfn_query(p2m, gfn + i, &ot);
-        if ( p2m_is_ram(ot) )
-        {
-            printk("%s: gfn_to_mfn returned type %d!\n",
-                   __func__, ot);
-            rc = -EBUSY;
-            goto out;
-        }
-        else if ( ot == p2m_populate_on_demand )
-        {
-            /* Count how man PoD entries we'll be replacing if successful */
-            pod_count++;
-        }
-    }
-
-    /* Now, actually do the two-way mapping */
-    if ( !set_p2m_entry(p2m, gfn, _mfn(POPULATE_ON_DEMAND_MFN), order,
-                        p2m_populate_on_demand, p2m->default_access) )
-        rc = -EINVAL;
-    else
-    {
-        p2m->pod.entry_count += 1 << order; /* Lock: p2m */
-        p2m->pod.entry_count -= pod_count;
-        BUG_ON(p2m->pod.entry_count < 0);
-    }
-
-    audit_p2m(p2m, 1);
-    p2m_unlock(p2m);
-
-out:
-    return rc;
-}
-
 int
 guest_physmap_add_entry(struct p2m_domain *p2m, unsigned long gfn,
                         unsigned long mfn, unsigned int page_order, 
@@ -2588,7 +390,7 @@
         return 0;
     }
 
-    rc = gfn_check_limit(d, gfn, page_order);
+    rc = p2m_gfn_check_limit(d, gfn, page_order);
     if ( rc != 0 )
         return rc;
 
@@ -2682,142 +484,6 @@
     return rc;
 }
 
-/* Walk the whole p2m table, changing any entries of the old type
- * to the new type.  This is used in hardware-assisted paging to 
- * quickly enable or diable log-dirty tracking */
-void p2m_change_type_global(struct p2m_domain *p2m, p2m_type_t ot, p2m_type_t nt)
-{
-    unsigned long mfn, gfn, flags;
-    l1_pgentry_t l1e_content;
-    l1_pgentry_t *l1e;
-    l2_pgentry_t *l2e;
-    mfn_t l1mfn, l2mfn, l3mfn;
-    unsigned long i1, i2, i3;
-    l3_pgentry_t *l3e;
-#if CONFIG_PAGING_LEVELS == 4
-    l4_pgentry_t *l4e;
-    unsigned long i4;
-#endif /* CONFIG_PAGING_LEVELS == 4 */
-
-    BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
-    BUG_ON(ot != nt && (ot == p2m_mmio_direct || nt == p2m_mmio_direct));
-
-    if ( !paging_mode_translate(p2m->domain) )
-        return;
-
-    if ( pagetable_get_pfn(p2m_get_pagetable(p2m)) == 0 )
-        return;
-
-    ASSERT(p2m_locked_by_me(p2m));
-
-#if CONFIG_PAGING_LEVELS == 4
-    l4e = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
-#else /* CONFIG_PAGING_LEVELS == 3 */
-    l3mfn = _mfn(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
-    l3e = map_domain_page(mfn_x(pagetable_get_mfn(p2m_get_pagetable(p2m))));
-#endif
-
-#if CONFIG_PAGING_LEVELS >= 4
-    for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ )
-    {
-        if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) )
-        {
-            continue;
-        }
-        l3mfn = _mfn(l4e_get_pfn(l4e[i4]));
-        l3e = map_domain_page(l4e_get_pfn(l4e[i4]));
-#endif
-        for ( i3 = 0;
-              i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8);
-              i3++ )
-        {
-            if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) )
-            {
-                continue;
-            }
-            if ( (l3e_get_flags(l3e[i3]) & _PAGE_PSE) )
-            {
-                flags = l3e_get_flags(l3e[i3]);
-                if ( p2m_flags_to_type(flags) != ot )
-                    continue;
-                mfn = l3e_get_pfn(l3e[i3]);
-                gfn = get_gpfn_from_mfn(mfn);
-                flags = p2m_type_to_flags(nt, _mfn(mfn));
-                l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
-                p2m->write_p2m_entry(p2m, gfn,
-                                     (l1_pgentry_t *)&l3e[i3],
-                                     l3mfn, l1e_content, 3);
-                continue;
-            }
-
-            l2mfn = _mfn(l3e_get_pfn(l3e[i3]));
-            l2e = map_domain_page(l3e_get_pfn(l3e[i3]));
-            for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
-            {
-                if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
-                {
-                    continue;
-                }
-
-                if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE) )
-                {
-                    flags = l2e_get_flags(l2e[i2]);
-                    if ( p2m_flags_to_type(flags) != ot )
-                        continue;
-                    mfn = l2e_get_pfn(l2e[i2]);
-                    /* Do not use get_gpfn_from_mfn because it may return 
-                       SHARED_M2P_ENTRY */
-                    gfn = (i2 + (i3
-#if CONFIG_PAGING_LEVELS >= 4
-                                  + (i4 * L3_PAGETABLE_ENTRIES)
-#endif
-                               )
-                           * L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES; 
-                    flags = p2m_type_to_flags(nt, _mfn(mfn));
-                    l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
-                    p2m->write_p2m_entry(p2m, gfn,
-                                         (l1_pgentry_t *)&l2e[i2],
-                                         l2mfn, l1e_content, 2);
-                    continue;
-                }
-
-                l1mfn = _mfn(l2e_get_pfn(l2e[i2]));
-                l1e = map_domain_page(mfn_x(l1mfn));
-
-                for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
-                {
-                    flags = l1e_get_flags(l1e[i1]);
-                    if ( p2m_flags_to_type(flags) != ot )
-                        continue;
-                    mfn = l1e_get_pfn(l1e[i1]);
-                    gfn = i1 + (i2 + (i3
-#if CONFIG_PAGING_LEVELS >= 4
-                                       + (i4 * L3_PAGETABLE_ENTRIES)
-#endif
-                                    )
-                           * L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES; 
-                    /* create a new 1le entry with the new type */
-                    flags = p2m_type_to_flags(nt, _mfn(mfn));
-                    l1e_content = l1e_from_pfn(mfn, flags);
-                    p2m->write_p2m_entry(p2m, gfn, &l1e[i1],
-                                         l1mfn, l1e_content, 1);
-                }
-                unmap_domain_page(l1e);
-            }
-            unmap_domain_page(l2e);
-        }
-#if CONFIG_PAGING_LEVELS >= 4
-        unmap_domain_page(l3e);
-    }
-#endif
-
-#if CONFIG_PAGING_LEVELS == 4
-    unmap_domain_page(l4e);
-#else /* CONFIG_PAGING_LEVELS == 3 */
-    unmap_domain_page(l3e);
-#endif
-
-}
 
 /* Modify the p2m type of a single gfn from ot to nt, returning the 
  * entry's previous type.  Resets the access permissions. */
diff -r d9982136d8fa -r 19452acd2304 xen/include/asm-x86/hvm/vmx/vmx.h
--- a/xen/include/asm-x86/hvm/vmx/vmx.h Mon May 09 15:00:57 2011 +0100
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h Fri May 06 11:15:35 2011 +0100
@@ -399,7 +399,7 @@
 void vmx_inject_extint(int trap);
 void vmx_inject_nmi(void);
 
-void ept_p2m_init(struct domain *d);
+void ept_p2m_init(struct p2m_domain *p2m);
 void ept_walk_table(struct domain *d, unsigned long gfn);
 void setup_ept_dump(void);
 
diff -r d9982136d8fa -r 19452acd2304 xen/include/asm-x86/p2m.h
--- a/xen/include/asm-x86/p2m.h Mon May 09 15:00:57 2011 +0100
+++ b/xen/include/asm-x86/p2m.h Fri May 06 11:15:35 2011 +0100
@@ -638,6 +638,34 @@
 struct page_info *p2m_alloc_ptp(struct p2m_domain *p2m, unsigned long type);
 void p2m_free_ptp(struct p2m_domain *p2m, struct page_info *pg);
 
+#if CONFIG_PAGING_LEVELS == 3
+static inline int p2m_gfn_check_limit(
+    struct domain *d, unsigned long gfn, unsigned int order)
+{
+    /*
+     * 32bit AMD nested paging does not support over 4GB guest due to 
+     * hardware translation limit. This limitation is checked by comparing
+     * gfn with 0xfffffUL.
+     */
+    if ( !hap_enabled(d) || ((gfn + (1ul << order)) <= 0x100000UL) ||
+         (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) )
+        return 0;
+
+    if ( !test_and_set_bool(d->arch.hvm_domain.svm.npt_4gb_warning) )
+        dprintk(XENLOG_WARNING, "Dom%d failed to populate memory beyond"
+                " 4GB: specify 'hap=0' domain config option.\n",
+                d->domain_id);
+
+    return -EINVAL;
+}
+#else
+#define p2m_gfn_check_limit(d, g, o) 0
+#endif
+
+/* Directly set a p2m entry: only for use by p2m code */
+int set_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn, 
+                  unsigned int page_order, p2m_type_t p2mt, p2m_access_t p2ma);
+
 #endif /* _XEN_P2M_H */
 
 /*
