[Xen-changelog] [xen-unstable] EPT/VT-d page table sharing
# HG changeset patch
# User Keir Fraser <keir@xxxxxxx>
# Date 1292422563 0
# Node ID 764e95f64b28b381abcc40ddad5e77d47572f4d6
# Parent  f0d26fdebf40d3a8f1839408f72bf182ce93fee0
EPT/VT-d page table sharing

The basic idea is to leverage the 2MB and 1GB page size support in EPT
by having VT-d use the same page tables as EPT. Whenever an EPT page
table entry changes, the VT-d IOTLB cache is flushed.

Signed-off-by: Weidong Han <weidong.han@xxxxxxxxx>
Signed-off-by: Allen Kay <allen.m.kay@xxxxxxxxx>
---
 xen/arch/x86/mm/hap/p2m-ept.c       |  104 +++++++++++++++++-----------------
 xen/arch/x86/mm/p2m.c               |    5 +
 xen/drivers/passthrough/iommu.c     |    6 +-
 xen/drivers/passthrough/vtd/iommu.c |  105 ++++++++++++++++++++++++++++++++--
 xen/drivers/passthrough/vtd/iommu.h |   16 +++--
 xen/include/asm-x86/hvm/vmx/vmx.h   |   29 ++++++---
 xen/include/xen/iommu.h             |    4 +
 7 files changed, 197 insertions(+), 72 deletions(-)
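In outline, every EPT update now takes one of two paths: flush the VT-d
IOTLB (shared tables) or replay the change into a separate VT-d table.
A compilable sketch of that decision follows; flush_vtd_iotlb() and
mirror_into_vtd_table() are hypothetical stand-ins for the patch's
iommu_pte_flush() and iommu_map_page(), and the offset arithmetic of
the real loop is elided:

    #include <stdio.h>
    #include <stdint.h>

    static int iommu_hap_pt_share;  /* decided at boot, cf. init_vtd_hw() */

    /* Hypothetical stand-ins for iommu_pte_flush() / iommu_map_page(). */
    static void flush_vtd_iotlb(uint64_t gfn)
    {
        printf("page-selective IOTLB flush, gfn %#llx\n",
               (unsigned long long)gfn);
    }

    static void mirror_into_vtd_table(uint64_t gfn, uint64_t mfn)
    {
        printf("VT-d map gfn %#llx -> mfn %#llx\n",
               (unsigned long long)gfn, (unsigned long long)mfn);
    }

    /* Shape of what ept_set_entry() does after an EPT entry is written. */
    static void on_ept_entry_update(uint64_t gfn, uint64_t mfn,
                                    unsigned int order)
    {
        if ( iommu_hap_pt_share )
            /* Shared tables: VT-d walks the EPT table itself, so only
             * the stale IOTLB translation has to go. */
            flush_vtd_iotlb(gfn);
        else
            /* Separate tables: replay the change one 4K frame at a time. */
            for ( uint64_t i = 0; i < (1ULL << order); i++ )
                mirror_into_vtd_table(gfn + i, mfn + i);
    }

    int main(void)
    {
        iommu_hap_pt_share = 1;
        on_ept_entry_update(0x1000, 0x2000, 0);  /* shared: flush only */

        iommu_hap_pt_share = 0;
        on_ept_entry_update(0x1000, 0x2000, 2);  /* separate: 4 frames
                                                    replayed; real superpages
                                                    use order 9 */
        return 0;
    }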
diff -r f0d26fdebf40 -r 764e95f64b28 xen/arch/x86/mm/hap/p2m-ept.c
--- a/xen/arch/x86/mm/hap/p2m-ept.c	Wed Dec 15 13:34:26 2010 +0000
+++ b/xen/arch/x86/mm/hap/p2m-ept.c	Wed Dec 15 14:16:03 2010 +0000
@@ -44,7 +44,7 @@ static int ept_pod_check_and_populate(st
     p2m_lock(p2m);
 
     /* Check to make sure this is still PoD */
-    if ( entry->avail1 != p2m_populate_on_demand )
+    if ( entry->sa_p2mt != p2m_populate_on_demand )
     {
         p2m_unlock(p2m);
         return 0;
@@ -110,13 +110,8 @@ static int ept_set_middle_entry(struct p
     if ( pg == NULL )
         return 0;
 
-    ept_entry->emt = 0;
-    ept_entry->ipat = 0;
-    ept_entry->sp = 0;
-    ept_entry->avail1 = 0;
+    ept_entry->epte = 0;
     ept_entry->mfn = page_to_mfn(pg);
-    ept_entry->avail2 = 0;
 
     ept_entry->r = ept_entry->w = ept_entry->x = 1;
 
     return 1;
@@ -166,14 +161,15 @@ static int ept_split_super_page(struct p
     {
         ept_entry_t *epte = table + i;
 
+        epte->epte = 0;
         epte->emt = ept_entry->emt;
         epte->ipat = ept_entry->ipat;
         epte->sp = (level > 1) ? 1 : 0;
-        epte->avail1 = ept_entry->avail1;
-        epte->avail2 = 0;
+        epte->sa_p2mt = ept_entry->sa_p2mt;
         epte->mfn = ept_entry->mfn + i * trunk;
-
-        ept_p2m_type_to_flags(epte, epte->avail1);
+        epte->rsvd2_snp = ( iommu_enabled && iommu_snoop ) ? 1 : 0;
+
+        ept_p2m_type_to_flags(epte, epte->sa_p2mt);
 
         if ( (level - 1) == target )
             continue;
@@ -230,7 +226,7 @@ static int ept_next_level(struct p2m_dom
 
     if ( !is_epte_present(&e) )
     {
-        if ( e.avail1 == p2m_populate_on_demand )
+        if ( e.sa_p2mt == p2m_populate_on_demand )
             return GUEST_TABLE_POD_PAGE;
 
         if ( read_only )
@@ -261,7 +257,7 @@ ept_set_entry(struct p2m_domain *p2m, un
 ept_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
               unsigned int order, p2m_type_t p2mt)
 {
-    ept_entry_t *table, *ept_entry;
+    ept_entry_t *table, *ept_entry = NULL;
     unsigned long gfn_remainder = gfn;
     unsigned long offset = 0;
     u32 index;
@@ -271,6 +267,7 @@ ept_set_entry(struct p2m_domain *p2m, un
     bool_t direct_mmio = (p2mt == p2m_mmio_direct);
     uint8_t ipat = 0;
     int need_modify_vtd_table = 1;
+    int vtd_pte_present = 0;
    int needs_sync = 1;
     struct domain *d = p2m->domain;
@@ -309,6 +306,9 @@ ept_set_entry(struct p2m_domain *p2m, un
 
     ept_entry = table + index;
 
+    /* In case VT-d uses same page table, this flag is needed by VT-d */
+    vtd_pte_present = is_epte_present(ept_entry) ? 1 : 0;
+
     /*
      * When we are here, we must be on a leaf ept entry
      * with i == target or i > target.
@@ -325,15 +325,15 @@ ept_set_entry(struct p2m_domain *p2m, un
     if ( mfn_valid(mfn_x(mfn)) || direct_mmio || p2m_is_paged(p2mt) ||
          (p2mt == p2m_ram_paging_in_start) )
     {
-        ept_entry_t new_entry;
+        ept_entry_t new_entry = { .epte = 0 };
 
         /* Construct the new entry, and then write it once */
         new_entry.emt = epte_get_entry_emt(p2m->domain, gfn, mfn, &ipat,
                                            direct_mmio);
 
         new_entry.ipat = ipat;
         new_entry.sp = order ? 1 : 0;
-        new_entry.avail1 = p2mt;
-        new_entry.avail2 = 0;
+        new_entry.sa_p2mt = p2mt;
+        new_entry.rsvd2_snp = (iommu_enabled && iommu_snoop);
 
         if ( new_entry.mfn == mfn_x(mfn) )
             need_modify_vtd_table = 0;
@@ -351,7 +351,7 @@ ept_set_entry(struct p2m_domain *p2m, un
     {
         /* We need to split the original page. */
         ept_entry_t split_ept_entry;
-        ept_entry_t new_entry;
+        ept_entry_t new_entry = { .epte = 0 };
 
         ASSERT(is_epte_superpage(ept_entry));
@@ -381,8 +381,8 @@ ept_set_entry(struct p2m_domain *p2m, un
         new_entry.emt = epte_get_entry_emt(d, gfn, mfn, &ipat, direct_mmio);
         new_entry.ipat = ipat;
         new_entry.sp = i ? 1 : 0;
-        new_entry.avail1 = p2mt;
-        new_entry.avail2 = 0;
+        new_entry.sa_p2mt = p2mt;
+        new_entry.rsvd2_snp = (iommu_enabled && iommu_snoop);
 
         if ( new_entry.mfn == mfn_x(mfn) )
             need_modify_vtd_table = 0;
@@ -408,31 +408,35 @@ out:
     if ( needs_sync )
         ept_sync_domain(p2m->domain);
 
-    /* Now the p2m table is not shared with vt-d page table */
     if ( rv && iommu_enabled && need_iommu(p2m->domain) &&
          need_modify_vtd_table )
     {
-        if ( p2mt == p2m_ram_rw )
-        {
-            if ( order == EPT_TABLE_ORDER )
+        if ( iommu_hap_pt_share )
+            iommu_pte_flush(d, gfn, (u64*)ept_entry, vtd_pte_present);
+        else
+        {
+            if ( p2mt == p2m_ram_rw )
             {
-                for ( i = 0; i < (1 << order); i++ )
+                if ( order == EPT_TABLE_ORDER )
+                {
+                    for ( i = 0; i < (1 << order); i++ )
+                        iommu_map_page(
+                            p2m->domain, gfn - offset + i, mfn_x(mfn) - offset + i,
+                            IOMMUF_readable | IOMMUF_writable);
+                }
+                else if ( !order )
                     iommu_map_page(
-                        p2m->domain, gfn - offset + i, mfn_x(mfn) - offset + i,
-                        IOMMUF_readable | IOMMUF_writable);
+                        p2m->domain, gfn, mfn_x(mfn), IOMMUF_readable | IOMMUF_writable);
             }
-            else if ( !order )
-                iommu_map_page(
-                    p2m->domain, gfn, mfn_x(mfn), IOMMUF_readable | IOMMUF_writable);
-        }
-        else
-        {
-            if ( order == EPT_TABLE_ORDER )
+            else
             {
-                for ( i = 0; i < (1 << order); i++ )
-                    iommu_unmap_page(p2m->domain, gfn - offset + i);
+                if ( order == EPT_TABLE_ORDER )
+                {
+                    for ( i = 0; i < (1 << order); i++ )
+                        iommu_unmap_page(p2m->domain, gfn - offset + i);
+                }
+                else if ( !order )
+                    iommu_unmap_page(p2m->domain, gfn);
            }
-            else if ( !order )
-                iommu_unmap_page(p2m->domain, gfn);
        }
    }
@@ -494,7 +498,7 @@ static mfn_t ept_get_entry(struct p2m_do
     index = gfn_remainder >> (i * EPT_TABLE_ORDER);
     ept_entry = table + index;
 
-    if ( ept_entry->avail1 == p2m_populate_on_demand )
+    if ( ept_entry->sa_p2mt == p2m_populate_on_demand )
     {
         if ( q == p2m_query )
         {
@@ -510,9 +514,9 @@ static mfn_t ept_get_entry(struct p2m_do
     }
 
-    if ( ept_entry->avail1 != p2m_invalid )
-    {
-        *t = ept_entry->avail1;
+    if ( ept_entry->sa_p2mt != p2m_invalid )
+    {
+        *t = ept_entry->sa_p2mt;
         mfn = _mfn(ept_entry->mfn);
         if ( i )
         {
@@ -663,7 +667,7 @@ void ept_change_entry_emt_with_range(str
         uint64_t trunk = 0;
 
         e = ept_get_entry_content(p2m, gfn, &level);
-        if ( !p2m_has_emt(e.avail1) )
+        if ( !p2m_has_emt(e.sa_p2mt) )
             continue;
 
         order = 0;
@@ -682,8 +686,8 @@ void ept_change_entry_emt_with_range(str
                  */
                 order = level * EPT_TABLE_ORDER;
                 if ( need_modify_ept_entry(p2m, gfn, mfn,
-                                           e.ipat, e.emt, e.avail1) )
-                    ept_set_entry(p2m, gfn, mfn, order, e.avail1);
+                                           e.ipat, e.emt, e.sa_p2mt) )
+                    ept_set_entry(p2m, gfn, mfn, order, e.sa_p2mt);
                 gfn += trunk;
                 break;
             }
@@ -692,8 +696,8 @@ void ept_change_entry_emt_with_range(str
         }
         else /* gfn assigned with 4k */
         {
-            if ( need_modify_ept_entry(p2m, gfn, mfn, e.ipat, e.emt, e.avail1) )
-                ept_set_entry(p2m, gfn, mfn, order, e.avail1);
+            if ( need_modify_ept_entry(p2m, gfn, mfn, e.ipat, e.emt, e.sa_p2mt) )
+                ept_set_entry(p2m, gfn, mfn, order, e.sa_p2mt);
         }
     }
     p2m_unlock(p2m);
@@ -719,10 +723,10 @@ static void ept_change_entry_type_page(m
                                        ept_page_level - 1, ot, nt);
         else
         {
-            if ( epte[i].avail1 != ot )
+            if ( epte[i].sa_p2mt != ot )
                 continue;
 
-            epte[i].avail1 = nt;
+            epte[i].sa_p2mt = nt;
             ept_p2m_type_to_flags(epte + i, nt);
         }
     }
@@ -796,9 +800,9 @@ static void ept_dump_p2m_table(unsigned
             index = gfn_remainder >> order;
             ept_entry = table + index;
-            if ( ept_entry->avail1 != p2m_invalid )
+            if ( ept_entry->sa_p2mt != p2m_invalid )
             {
-                ( ept_entry->avail1 == p2m_populate_on_demand ) ?
+                ( ept_entry->sa_p2mt == p2m_populate_on_demand ) ?
                 ( mfn = _mfn(INVALID_MFN), is_pod = 1 ) :
                 ( mfn = _mfn(ept_entry->mfn), is_pod = 0 );
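Several hunks above replace field-by-field initialisation with
"ept_entry->epte = 0" / "= { .epte = 0 }" and construct entries in full
before writing them. The reason: once VT-d walks the same table, the
IOMMU can read an entry at any instant, so a half-initialised entry
must never be observable. A standalone illustration (the field split
below is invented, and the real code must additionally order the final
store against IOMMU reads, which this sketch elides):

    #include <stdio.h>
    #include <stdint.h>

    /* Simplified entry with an invented field split; the union trick
     * is the point here, not the layout. */
    typedef union {
        struct {
            uint64_t r : 1, w : 1, x : 1, rsvd : 9, mfn : 40, type : 12;
        };
        uint64_t epte;
    } entry_t;

    int main(void)
    {
        entry_t live = { .epte = 0 };       /* slot the IOMMU can see */

        entry_t new_entry = { .epte = 0 };  /* build off to the side... */
        new_entry.r = new_entry.w = 1;
        new_entry.mfn = 0x2000;

        /* ...then publish it with one 64-bit store. */
        live.epte = new_entry.epte;

        printf("published epte = %#llx\n", (unsigned long long)live.epte);
        return 0;
    }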
diff -r f0d26fdebf40 -r 764e95f64b28 xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c	Wed Dec 15 13:34:26 2010 +0000
+++ b/xen/arch/x86/mm/p2m.c	Wed Dec 15 14:16:03 2010 +0000
@@ -1800,6 +1800,7 @@ int p2m_alloc_table(struct p2m_domain *p
     struct page_info *page, *p2m_top;
     unsigned int page_count = 0;
     unsigned long gfn = -1UL;
+    struct domain *d = p2m->domain;
 
     p2m_lock(p2m);
 
@@ -1827,6 +1828,10 @@ int p2m_alloc_table(struct p2m_domain *p
     }
 
     p2m->phys_table = pagetable_from_mfn(page_to_mfn(p2m_top));
+
+    if ( is_hvm_domain(d) && d->arch.hvm_domain.hap_enabled &&
+         (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) )
+        iommu_set_pgd(d);
 
     P2M_PRINTK("populating p2m table\n");
diff -r f0d26fdebf40 -r 764e95f64b28 xen/drivers/passthrough/iommu.c
--- a/xen/drivers/passthrough/iommu.c	Wed Dec 15 13:34:26 2010 +0000
+++ b/xen/drivers/passthrough/iommu.c	Wed Dec 15 14:16:03 2010 +0000
@@ -47,6 +47,7 @@ bool_t __read_mostly iommu_snoop = 1;
 bool_t __read_mostly iommu_snoop = 1;
 bool_t __read_mostly iommu_qinval = 1;
 bool_t __read_mostly iommu_intremap = 1;
+bool_t __read_mostly iommu_hap_pt_share;
 bool_t __read_mostly amd_iommu_debug;
 bool_t __read_mostly amd_iommu_perdev_intremap;
 
@@ -174,10 +175,11 @@ int assign_device(struct domain *d, u8 b
     if ( has_arch_pdevs(d) && !need_iommu(d) )
     {
         d->need_iommu = 1;
-        rc = iommu_populate_page_table(d);
+        if ( !iommu_hap_pt_share )
+            rc = iommu_populate_page_table(d);
         goto done;
     }
-done:
+done:
     spin_unlock(&pcidevs_lock);
     return rc;
 }
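The p2m_alloc_table() hunk above hands the freshly built p2m/EPT root
to the IOMMU via iommu_set_pgd(), which is defined in the vtd/iommu.c
part of this patch, below. Stripped of Xen types, it boils down to the
following (struct names and fields here are invented for illustration):

    #include <stdio.h>
    #include <stdint.h>

    struct p2m       { uint64_t root_maddr; };  /* EPT top-level table */
    struct hvm_iommu { uint64_t pgd_maddr; };   /* VT-d top-level table */

    static int iommu_hap_pt_share = 1;

    static void iommu_set_pgd(struct hvm_iommu *hd, const struct p2m *p2m)
    {
        if ( !iommu_hap_pt_share )
            return;                          /* keep a separate VT-d table */
        hd->pgd_maddr = p2m->root_maddr;     /* share: one table, two walkers */
    }

    int main(void)
    {
        struct p2m p2m = { .root_maddr = 0x123000 };
        struct hvm_iommu hd = { .pgd_maddr = 0 };

        iommu_set_pgd(&hd, &p2m);
        printf("EPT root %#llx, VT-d root %#llx\n",
               (unsigned long long)p2m.root_maddr,
               (unsigned long long)hd.pgd_maddr);
        return 0;
    }

This is also why assign_device() above skips iommu_populate_page_table()
when sharing: there is no second table to populate.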
diff -r f0d26fdebf40 -r 764e95f64b28 xen/drivers/passthrough/vtd/iommu.c
--- a/xen/drivers/passthrough/vtd/iommu.c	Wed Dec 15 13:34:26 2010 +0000
+++ b/xen/drivers/passthrough/vtd/iommu.c	Wed Dec 15 14:16:03 2010 +0000
@@ -33,6 +33,8 @@
 #include <xen/keyhandler.h>
 #include <asm/msi.h>
 #include <asm/irq.h>
+#include <asm/hvm/vmx/vmx.h>
+#include <asm/p2m.h>
 #include <mach_apic.h>
 #include "iommu.h"
 #include "dmar.h"
@@ -42,6 +44,9 @@
 #ifdef __ia64__
 #define nr_ioapics iosapic_get_nr_iosapics()
 #endif
+
+static int sharept = 1;
+boolean_param("sharept", sharept);
 
 int nr_iommus;
 
@@ -1627,6 +1632,9 @@ void iommu_domain_teardown(struct domain
     if ( list_empty(&acpi_drhd_units) )
         return;
 
+    if ( iommu_hap_pt_share )
+        return;
+
     spin_lock(&hd->mapping_lock);
     iommu_free_pagetable(hd->pgd_maddr, agaw_to_level(hd->agaw));
     hd->pgd_maddr = 0;
@@ -1644,6 +1652,10 @@ static int intel_iommu_map_page(
     u64 pg_maddr;
     int flush_dev_iotlb;
     int iommu_domid;
+
+    /* Do nothing if VT-d shares EPT page table */
+    if ( iommu_hap_pt_share )
+        return 0;
 
     /* do nothing if dom0 and iommu supports pass thru */
     if ( iommu_passthrough && (d->domain_id == 0) )
@@ -1715,6 +1727,63 @@ static int intel_iommu_unmap_page(struct
     return 0;
 }
 
+void iommu_pte_flush(struct domain *d, u64 gfn, u64 *pte, int present)
+{
+    struct acpi_drhd_unit *drhd;
+    struct iommu *iommu = NULL;
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+    int flush_dev_iotlb;
+    int iommu_domid;
+
+    iommu_flush_cache_entry(pte, sizeof(struct dma_pte));
+
+    for_each_drhd_unit ( drhd )
+    {
+        iommu = drhd->iommu;
+        if ( !test_bit(iommu->index, &hd->iommu_bitmap) )
+            continue;
+
+        flush_dev_iotlb = find_ats_dev_drhd(iommu) ? 1 : 0;
+        iommu_domid= domain_iommu_domid(d, iommu);
+        if ( iommu_domid == -1 )
+            continue;
+        if ( iommu_flush_iotlb_psi(iommu, iommu_domid,
+                                   (paddr_t)gfn << PAGE_SHIFT_4K, 1,
+                                   !present, flush_dev_iotlb) )
+            iommu_flush_write_buffer(iommu);
+    }
+}
+
+static int vtd_ept_page_compatible(struct iommu *iommu)
+{
+    u64 cap = iommu->cap;
+
+    if ( ept_has_2mb(cpu_has_vmx_ept_2mb) != cap_sps_2mb(cap) )
+        return 0;
+
+    if ( ept_has_1gb(cpu_has_vmx_ept_1gb) != cap_sps_1gb(cap) )
+        return 0;
+
+    return 1;
+}
+
+/*
+ * set VT-d page table directory to EPT table if allowed
+ */
+void iommu_set_pgd(struct domain *d)
+{
+    struct hvm_iommu *hd = domain_hvm_iommu(d);
+    mfn_t pgd_mfn;
+
+    ASSERT( is_hvm_domain(d) && d->arch.hvm_domain.hap_enabled );
+
+    if ( !iommu_hap_pt_share )
+        return;
+
+    pgd_mfn = pagetable_get_mfn(p2m_get_pagetable(p2m_get_hostp2m(d)));
+    hd->pgd_maddr = pagetable_get_paddr(pagetable_from_mfn(pgd_mfn));
+}
+
 static int domain_rmrr_mapped(struct domain *d,
                               struct acpi_rmrr_unit *rmrr)
 {
@@ -1871,6 +1940,9 @@ static int init_vtd_hw(void)
     unsigned long flags;
     struct irq_cfg *cfg;
 
+    /*
+     * Basic VT-d HW init: set VT-d interrupt, clear VT-d faults.
+     */
     for_each_drhd_unit ( drhd )
     {
         iommu = drhd->iommu;
@@ -1895,6 +1967,9 @@ static int init_vtd_hw(void)
         spin_unlock_irqrestore(&iommu->register_lock, flags);
     }
 
+    /*
+     * Enable queue invalidation
+     */
     for_each_drhd_unit ( drhd )
     {
         iommu = drhd->iommu;
@@ -1910,6 +1985,9 @@ static int init_vtd_hw(void)
         }
     }
 
+    /*
+     * Enable interrupt remapping
+     */
     if ( iommu_intremap )
     {
         int apic;
@@ -1926,7 +2004,6 @@ static int init_vtd_hw(void)
             }
         }
     }
-
     if ( iommu_intremap )
     {
         for_each_drhd_unit ( drhd )
@@ -1941,6 +2018,11 @@ static int init_vtd_hw(void)
         }
     }
 
+    /*
+     * Set root entries for each VT-d engine.  After set root entry,
+     * must globally invalidate context cache, and then globally
+     * invalidate IOTLB
+     */
     for_each_drhd_unit ( drhd )
     {
         iommu = drhd->iommu;
@@ -1951,12 +2033,27 @@ static int init_vtd_hw(void)
             return -EIO;
         }
     }
+    iommu_flush_all();
 
     /*
-     * After set root entry, must globally invalidate context cache, and
-     * then globally invalidate IOTLB
+     * Determine whether EPT and VT-d page tables can be shared or not.
      */
-    iommu_flush_all();
+    iommu_hap_pt_share = TRUE;
+    for_each_drhd_unit ( drhd )
+    {
+        iommu = drhd->iommu;
+        if ( (drhd->iommu->nr_pt_levels != VTD_PAGE_TABLE_LEVEL_4) ||
+             !vtd_ept_page_compatible(drhd->iommu) )
+            iommu_hap_pt_share = FALSE;
+    }
+
+    /* keep boot flag sharept as safe fallback. remove after feature matures */
+    if ( !sharept )
+        iommu_hap_pt_share = FALSE;
+
+    gdprintk(XENLOG_INFO VTDPREFIX,
+             "VT-d page table %s with EPT table\n",
+             iommu_hap_pt_share ? "shares" : "not shares");
 
     return 0;
 }
diff -r f0d26fdebf40 -r 764e95f64b28 xen/drivers/passthrough/vtd/iommu.h
--- a/xen/drivers/passthrough/vtd/iommu.h	Wed Dec 15 13:34:26 2010 +0000
+++ b/xen/drivers/passthrough/vtd/iommu.h	Wed Dec 15 14:16:03 2010 +0000
@@ -68,15 +68,19 @@
 /*
  * Decoding Capability Register
  */
-#define cap_read_drain(c)    (((c) >> 55) & 1)
-#define cap_write_drain(c)   (((c) >> 54) & 1)
-#define cap_max_amask_val(c) (((c) >> 48) & 0x3f)
-#define cap_num_fault_regs(c) ((((c) >> 40) & 0xff) + 1)
+#define cap_read_drain(c)      (((c) >> 55) & 1)
+#define cap_write_drain(c)     (((c) >> 54) & 1)
+#define cap_max_amask_val(c)   (((c) >> 48) & 0x3f)
+#define cap_num_fault_regs(c)  ((((c) >> 40) & 0xff) + 1)
 #define cap_pgsel_inv(c)       (((c) >> 39) & 1)
 
-#define cap_super_page_val(c) (((c) >> 34) & 0xf)
+#define cap_super_page_val(c)  (((c) >> 34) & 0xf)
 #define cap_super_offset(c)    (((find_first_bit(&cap_super_page_val(c), 4)) \
-                              * OFFSET_STRIDE) + 21)
+                                 * OFFSET_STRIDE) + 21)
+#define cap_sps_2mb(c)         ((c >> 34) & 1)
+#define cap_sps_1gb(c)         ((c >> 35) & 1)
+#define cap_sps_512gb(c)       ((c >> 36) & 1)
+#define cap_sps_1tb(c)         ((c >> 37) & 1)
 
 #define cap_fault_reg_offset(c) ((((c) >> 24) & 0x3ff) * 16)
diff -r f0d26fdebf40 -r 764e95f64b28 xen/include/asm-x86/hvm/vmx/vmx.h
--- a/xen/include/asm-x86/hvm/vmx/vmx.h	Wed Dec 15 13:34:26 2010 +0000
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h	Wed Dec 15 14:16:03 2010 +0000
@@ -30,15 +30,21 @@
 
 typedef union {
     struct {
-        u64 r       :   1,
-        w           :   1,
-        x           :   1,
-        emt         :   3, /* EPT Memory type */
-        ipat        :   1, /* Ignore PAT memory type */
-        sp          :   1, /* Is this a superpage? */
-        avail1      :   4,
-        mfn         :   40,
-        avail2      :   12;
+        u64 r       :   1,  /* bit 0 - Read permission */
+        w           :   1,  /* bit 1 - Write permission */
+        x           :   1,  /* bit 2 - Execute permission */
+        emt         :   3,  /* bits 5:3 - EPT Memory type */
+        ipat        :   1,  /* bit 6 - Ignore PAT memory type */
+        sp          :   1,  /* bit 7 - Is this a superpage? */
+        rsvd1       :   2,  /* bits 9:8 - Reserved for future use */
+        avail1      :   1,  /* bit 10 - Software available 1 */
+        rsvd2_snp   :   1,  /* bit 11 - Used for VT-d snoop control
+                               in shared EPT/VT-d usage */
+        mfn         :   40, /* bits 51:12 - Machine physical frame number */
+        sa_p2mt     :   10, /* bits 61:52 - Software available 2 */
+        rsvd3_tm    :   1,  /* bit 62 - Used for VT-d transient-mapping
+                               hint in shared EPT/VT-d usage */
+        avail3      :   1;  /* bit 63 - Software available 3 */
     };
     u64 epte;
 } ept_entry_t;
@@ -208,6 +214,11 @@ extern u64 vmx_ept_vpid_cap;
 #define cpu_has_vmx_ept_invept_single_context   \
     (vmx_ept_vpid_cap & VMX_EPT_INVEPT_SINGLE_CONTEXT)
 
+#define EPT_2MB_SHIFT     16
+#define EPT_1GB_SHIFT     17
+#define ept_has_2mb(c)    ((c >> EPT_2MB_SHIFT) & 1)
+#define ept_has_1gb(c)    ((c >> EPT_1GB_SHIFT) & 1)
+
 #define INVEPT_SINGLE_CONTEXT   1
 #define INVEPT_ALL_CONTEXT      2
diff -r f0d26fdebf40 -r 764e95f64b28 xen/include/xen/iommu.h
--- a/xen/include/xen/iommu.h	Wed Dec 15 13:34:26 2010 +0000
+++ b/xen/include/xen/iommu.h	Wed Dec 15 14:16:03 2010 +0000
@@ -30,6 +30,7 @@ extern bool_t force_iommu, iommu_verbose
 extern bool_t force_iommu, iommu_verbose;
 extern bool_t iommu_workaround_bios_bug, iommu_passthrough;
 extern bool_t iommu_snoop, iommu_qinval, iommu_intremap;
+extern bool_t iommu_hap_pt_share;
 extern bool_t amd_iommu_debug;
 extern bool_t amd_iommu_perdev_intremap;
 
@@ -84,7 +85,8 @@ int iommu_map_page(struct domain *d, uns
 int iommu_map_page(struct domain *d, unsigned long gfn, unsigned long mfn,
                    unsigned int flags);
 int iommu_unmap_page(struct domain *d, unsigned long gfn);
-
+void iommu_pte_flush(struct domain *d, u64 gfn, u64 *pte, int present);
+void iommu_set_pgd(struct domain *d);
 void iommu_domain_teardown(struct domain *d);
 int hvm_do_IRQ_dpci(struct domain *d, unsigned int irq);
 int dpci_ioport_intercept(ioreq_t *p);
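vtd_ept_page_compatible() in the vtd/iommu.c hunks above is the gate
for sharing: EPT and every VT-d unit must advertise the same superpage
sizes, otherwise a 2MB or 1GB EPT entry would be unparseable to the
IOMMU. A standalone rendering of that check, using the bit positions
defined by ept_has_*() and cap_sps_*() in this changeset but with
made-up capability register values:

    #include <stdio.h>
    #include <stdint.h>

    #define ept_has_2mb(c)  (((c) >> 16) & 1)   /* VMX EPT/VPID cap bits */
    #define ept_has_1gb(c)  (((c) >> 17) & 1)
    #define cap_sps_2mb(c)  (((c) >> 34) & 1)   /* VT-d capability bits */
    #define cap_sps_1gb(c)  (((c) >> 35) & 1)

    static int vtd_ept_page_compatible(uint64_t ept_cap, uint64_t vtd_cap)
    {
        /* Same predicate as the patch, written as one expression. */
        return ept_has_2mb(ept_cap) == cap_sps_2mb(vtd_cap) &&
               ept_has_1gb(ept_cap) == cap_sps_1gb(vtd_cap);
    }

    int main(void)
    {
        uint64_t ept_cap = 1ULL << 16;                   /* 2MB only */
        uint64_t vtd_ok  = 1ULL << 34;                   /* 2MB only */
        uint64_t vtd_bad = (1ULL << 34) | (1ULL << 35);  /* 2MB and 1GB */

        printf("matching unit:   %s\n",
               vtd_ept_page_compatible(ept_cap, vtd_ok) ? "share" : "don't share");
        printf("mismatched unit: %s\n",
               vtd_ept_page_compatible(ept_cap, vtd_bad) ? "share" : "don't share");
        return 0;
    }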
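The reworked ept_entry_t can also be sanity-checked in isolation. The
program below (assuming a little-endian x86-64 compiler, as for Xen
itself) verifies that rsvd2_snp, mfn and sa_p2mt land on bits 11, 12
and 52, as the comments in the vmx.h hunk claim:

    #include <stdio.h>
    #include <stdint.h>

    typedef union {
        struct {
            uint64_t r         : 1,   /* bit 0 */
                     w         : 1,   /* bit 1 */
                     x         : 1,   /* bit 2 */
                     emt       : 3,   /* bits 5:3 */
                     ipat      : 1,   /* bit 6 */
                     sp        : 1,   /* bit 7 */
                     rsvd1     : 2,   /* bits 9:8 */
                     avail1    : 1,   /* bit 10 */
                     rsvd2_snp : 1,   /* bit 11 - VT-d snoop control */
                     mfn       : 40,  /* bits 51:12 */
                     sa_p2mt   : 10,  /* bits 61:52 - p2m type */
                     rsvd3_tm  : 1,   /* bit 62 - VT-d transient mapping */
                     avail3    : 1;   /* bit 63 */
        };
        uint64_t epte;
    } ept_entry_t;

    int main(void)
    {
        ept_entry_t e = { .epte = 0 };

        e.rsvd2_snp = 1;   /* should set bit 11 */
        e.mfn = 1;         /* should set bit 12 */
        e.sa_p2mt = 1;     /* should set bit 52 */

        printf("sizeof = %zu, epte = %#llx (expect 0x10000000001800)\n",
               sizeof(e), (unsigned long long)e.epte);
        return 0;
    }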
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog