[Xen-changelog] [xen-unstable] EPT: 1GB large page support.
# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1270534496 -3600
# Node ID d7370232060a31d17cd27c9d40a4a6cf2f09935d
# Parent  b20f897d6010457ec507138d450a332eba5147ea
EPT: 1GB large page support.

Allocate 1GB large pages for EPT if possible. The patch also contains the
logic to split a large page into smaller ones (2M or 4K).

Signed-off-by: Dongxiao Xu <dongxiao.xu@xxxxxxxxx>
Signed-off-by: Xiaohui Xin <xiaohui.xin@xxxxxxxxx>
Acked-by: Tim Deegan <Tim.Deegan@xxxxxxxxxx>
---
 xen/arch/x86/hvm/hvm.c             |    5
 xen/arch/x86/hvm/vmx/vmcs.c        |   16 ++
 xen/arch/x86/hvm/vmx/vmx.c         |    3
 xen/arch/x86/mm/hap/p2m-ept.c      |  199 +++++++++++++++++++------------------
 xen/include/asm-x86/hvm/vmx/vmcs.h |    7 +
 xen/include/asm-x86/msr-index.h    |    1
 6 files changed, 134 insertions(+), 97 deletions(-)

diff -r b20f897d6010 -r d7370232060a xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c	Tue Apr 06 07:13:19 2010 +0100
+++ b/xen/arch/x86/hvm/hvm.c	Tue Apr 06 07:14:56 2010 +0100
@@ -966,6 +966,11 @@ bool_t hvm_hap_nested_page_fault(unsigne
     /* Spurious fault? PoD and log-dirty also take this path. */
     if ( p2m_is_ram(p2mt) )
     {
+        /*
+         * Page log dirty is always done with order 0. If this mfn resides in
+         * a large page, we do not change other pages type within that large
+         * page.
+         */
         paging_mark_dirty(current->domain, mfn_x(mfn));
         p2m_change_type(current->domain, gfn, p2m_ram_logdirty, p2m_ram_rw);
         return 1;
diff -r b20f897d6010 -r d7370232060a xen/arch/x86/hvm/vmx/vmcs.c
--- a/xen/arch/x86/hvm/vmx/vmcs.c	Tue Apr 06 07:13:19 2010 +0100
+++ b/xen/arch/x86/hvm/vmx/vmcs.c	Tue Apr 06 07:14:56 2010 +0100
@@ -64,6 +64,7 @@ u32 vmx_secondary_exec_control __read_mo
 u32 vmx_secondary_exec_control __read_mostly;
 u32 vmx_vmexit_control __read_mostly;
 u32 vmx_vmentry_control __read_mostly;
+u8 vmx_ept_super_page_level_limit __read_mostly;
 bool_t cpu_has_vmx_ins_outs_instr_info __read_mostly;
 
 static DEFINE_PER_CPU_READ_MOSTLY(struct vmcs_struct *, host_vmcs);
@@ -183,6 +184,21 @@ static void vmx_init_vmcs_config(void)
             _vmx_secondary_exec_control &=
                 ~(SECONDARY_EXEC_ENABLE_EPT |
                   SECONDARY_EXEC_UNRESTRICTED_GUEST);
+        if ( _vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT )
+        {
+            uint64_t cap;
+            rdmsrl(MSR_IA32_VMX_EPT_VPID_CAP, cap);
+            if ( cap & VMX_EPT_SUPER_PAGE_1G )
+            {
+                vmx_ept_super_page_level_limit = 2;
+                printk("EPT support 1G super page.\n");
+            }
+            else if ( cap & VMX_EPT_SUPER_PAGE_2M )
+            {
+                vmx_ept_super_page_level_limit = 1;
+                printk("EPT support 2M super page.\n");
+            }
+        }
     }
 
     if ( (_vmx_secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) &&
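The probe added to vmx_init_vmcs_config() above reduces to two bit tests on
MSR_IA32_VMX_EPT_VPID_CAP, mapped onto a "level limit" (2 = 1G leaves
allowed, 1 = 2M, 0 = 4K only). Below is a minimal standalone sketch of that
mapping, compilable on its own; the sample capability value is hypothetical
and stands in for the rdmsrl() read:

    #include <stdint.h>
    #include <stdio.h>

    #define VMX_EPT_SUPER_PAGE_2M 0x00010000ULL  /* MSR bit 16: 2M EPT pages */
    #define VMX_EPT_SUPER_PAGE_1G 0x00020000ULL  /* MSR bit 17: 1G EPT pages */

    /* Translate the capability bits into the superpage level limit used by
     * the patch: 2 => 1G leaves allowed, 1 => 2M leaves, 0 => 4K only. */
    static unsigned int ept_level_limit(uint64_t cap)
    {
        if ( cap & VMX_EPT_SUPER_PAGE_1G )
            return 2;
        if ( cap & VMX_EPT_SUPER_PAGE_2M )
            return 1;
        return 0;
    }

    int main(void)
    {
        /* Hypothetical value standing in for
         * rdmsrl(MSR_IA32_VMX_EPT_VPID_CAP, cap); here both bits are set. */
        uint64_t cap = VMX_EPT_SUPER_PAGE_2M | VMX_EPT_SUPER_PAGE_1G;

        printf("EPT super page level limit = %u\n", ept_level_limit(cap));
        return 0;
    }

The limit is consumed in p2m-ept.c below, via the
cpu_vmx_ept_super_page_level_limit alias, to bound how deep large mappings
may be kept.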
diff -r b20f897d6010 -r d7370232060a xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c	Tue Apr 06 07:13:19 2010 +0100
+++ b/xen/arch/x86/hvm/vmx/vmx.c	Tue Apr 06 07:14:56 2010 +0100
@@ -1446,7 +1446,8 @@ void start_vmx(void)
     if ( cpu_has_vmx_ept )
         vmx_function_table.hap_supported = 1;
 
-    vmx_function_table.hap_1gb_pgtb = 0;
+    vmx_function_table.hap_1gb_pgtb = ( vmx_ept_super_page_level_limit == 2 ) ?
+        1 : 0;
 
     setup_vmcs_dump();
diff -r b20f897d6010 -r d7370232060a xen/arch/x86/mm/hap/p2m-ept.c
--- a/xen/arch/x86/mm/hap/p2m-ept.c	Tue Apr 06 07:13:19 2010 +0100
+++ b/xen/arch/x86/mm/hap/p2m-ept.c	Tue Apr 06 07:14:56 2010 +0100
@@ -25,6 +25,7 @@
 #include <asm/domain.h>
 #include <asm/p2m.h>
 #include <asm/hvm/vmx/vmx.h>
+#include <asm/hvm/vmx/vmcs.h>
 #include <xen/iommu.h>
 #include <asm/mtrr.h>
 #include <asm/hvm/cacheattr.h>
@@ -167,6 +168,61 @@ static int ept_next_level(struct domain
     }
 }
 
+/* It's super page before and we should break down it now. */
+static int ept_split_large_page(struct domain *d,
+                                ept_entry_t **table, u32 *index,
+                                unsigned long gfn, int level)
+{
+    ept_entry_t *prev_table = *table;
+    ept_entry_t *split_table = NULL;
+    ept_entry_t *split_entry = NULL;
+    ept_entry_t *ept_entry = (*table) + (*index);
+    ept_entry_t temp_ept_entry;
+    unsigned long s_gfn, s_mfn;
+    unsigned long offset, trunk;
+    int i;
+
+    /* alloc new page for new ept middle level entry which is
+     * before a leaf super entry
+     */
+
+    if ( !ept_set_middle_entry(d, &temp_ept_entry) )
+        return 0;
+
+    /* split the super page to small next level pages */
+    split_table = map_domain_page(temp_ept_entry.mfn);
+    offset = gfn & ((1UL << (level * EPT_TABLE_ORDER)) - 1);
+    trunk = (1UL << ((level-1) * EPT_TABLE_ORDER));
+
+    for ( i = 0; i < (1UL << EPT_TABLE_ORDER); i++ )
+    {
+        s_gfn = gfn - offset + i * trunk;
+        s_mfn = ept_entry->mfn + i * trunk;
+
+        split_entry = split_table + i;
+        split_entry->emt = ept_entry->emt;
+        split_entry->ipat = ept_entry->ipat;
+
+        split_entry->sp_avail = (level > 1) ? 1 : 0;
+
+        split_entry->mfn = s_mfn;
+
+        split_entry->avail1 = ept_entry->avail1;
+        split_entry->avail2 = 0;
+        /* last step */
+        split_entry->r = split_entry->w = split_entry->x = 1;
+        ept_p2m_type_to_flags(split_entry, ept_entry->avail1);
+    }
+
+    *ept_entry = temp_ept_entry;
+
+    *index = offset / trunk;
+    *table = split_table;
+    unmap_domain_page(prev_table);
+
+    return 1;
+}
+
 /*
  * ept_set_entry() computes 'need_modify_vtd_table' for itself,
  * by observing whether any gfn->mfn translations are modified.
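In ept_split_large_page() above, offset is the gfn's displacement (in 4K
frames) within the level-'level' superpage and trunk is the number of 4K
frames covered by one entry of the next level down, so offset / trunk is the
slot of the new table that covers the original gfn. A standalone sketch of
that arithmetic, using a hypothetical gfn value:

    #include <stdio.h>

    #define EPT_TABLE_ORDER 9   /* 512 entries per EPT table level */

    /* For a gfn currently mapped by a level-'level' superpage, compute which
     * slot of the newly allocated next-level table covers it -- the same
     * offset/trunk computation as ept_split_large_page(). */
    static unsigned long split_slot(unsigned long gfn, int level)
    {
        unsigned long offset = gfn & ((1UL << (level * EPT_TABLE_ORDER)) - 1);
        unsigned long trunk  = 1UL << ((level - 1) * EPT_TABLE_ORDER);

        return offset / trunk;
    }

    int main(void)
    {
        unsigned long gfn = 0x4a123UL;  /* hypothetical gfn in a 1G superpage */

        /* Splitting the 1G (level 2) page: slot 0xa123 >> 9 = 80 of the new
         * table of 2M entries covers this gfn. */
        printf("level 2 -> slot %lu\n", split_slot(gfn, 2));
        /* Splitting the containing 2M (level 1) page: slot 0x123 = 291 of
         * the new table of 4K entries. */
        printf("level 1 -> slot %lu\n", split_slot(gfn, 1));
        return 0;
    }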
@@ -183,13 +239,11 @@ ept_set_entry(struct domain *d, unsigned
     int i;
     int rv = 0;
     int ret = 0;
+    int split_level = 0;
     int walk_level = order / EPT_TABLE_ORDER;
     int direct_mmio = (p2mt == p2m_mmio_direct);
     uint8_t ipat = 0;
     int need_modify_vtd_table = 1;
-
-    /* We only support 4k and 2m pages now */
-    BUG_ON(order && order != EPT_TABLE_ORDER);
 
     if ( order != 0 )
         if ( (gfn & ((1UL << order) - 1)) )
@@ -208,15 +262,15 @@ ept_set_entry(struct domain *d, unsigned
             break;
     }
 
-    /* If order == 9, we should never get SUPERPAGE or PoD.
-     * If order == 0, we should only get POD if we have a POD superpage.
+    /* If order == 0, we should only get POD if we have a POD superpage.
      * If i > walk_level, we need to split the page; otherwise,
      * just behave as normal. */
-    ASSERT(order == 0 || ret == GUEST_TABLE_NORMAL_PAGE);
     ASSERT(ret != GUEST_TABLE_POD_PAGE || i != walk_level);
 
     index = gfn_remainder >> ( i ?  (i * EPT_TABLE_ORDER): order);
     offset = (gfn_remainder & ( ((1 << (i*EPT_TABLE_ORDER)) - 1)));
+
+    split_level = i;
 
     ept_entry = table + index;
 
@@ -231,25 +285,10 @@ ept_set_entry(struct domain *d, unsigned
         ept_entry->ipat = ipat;
         ept_entry->sp_avail = order ? 1 : 0;
 
-        if ( ret == GUEST_TABLE_SUPER_PAGE )
-        {
-            if ( ept_entry->mfn == (mfn_x(mfn) - offset) )
-                need_modify_vtd_table = 0;
-            else
-                ept_entry->mfn = mfn_x(mfn) - offset;
-
-            if ( (ept_entry->avail1 == p2m_ram_logdirty)
-                 && (p2mt == p2m_ram_rw) )
-                for ( i = 0; i < 512; i++ )
-                    paging_mark_dirty(d, mfn_x(mfn) - offset + i);
-        }
+        if ( ept_entry->mfn == mfn_x(mfn) )
+            need_modify_vtd_table = 0;
         else
-        {
-            if ( ept_entry->mfn == mfn_x(mfn) )
-                need_modify_vtd_table = 0;
-            else
-                ept_entry->mfn = mfn_x(mfn);
-        }
+            ept_entry->mfn = mfn_x(mfn);
 
         ept_entry->avail1 = p2mt;
         ept_entry->avail2 = 0;
@@ -261,51 +300,22 @@ ept_set_entry(struct domain *d, unsigned
     }
     else
     {
-        /*
-         * It's super page before, now set one of the 4k pages, so
-         * we should split the 2m page to 4k pages now.
-         */
-        /* Pointers to / into new (split) middle-level table */
-        ept_entry_t *split_table = NULL;
-        ept_entry_t *split_ept_entry = NULL;
-        /* Info about old (superpage) table */
-        unsigned long super_mfn = ept_entry->mfn;
-        p2m_type_t super_p2mt = ept_entry->avail1;
-        /* The new l2 entry which we'll write after we've build the new l1 table */
-        ept_entry_t l2_ept_entry;
-
-        /*
-         * Allocate new page for new ept middle level entry which is
-         * before a leaf super entry
-         */
-        if ( !ept_set_middle_entry(d, &l2_ept_entry) )
-            goto out;
-
-        /* Split the super page before to 4k pages */
-        split_table = map_domain_page(l2_ept_entry.mfn);
-        offset = gfn & ((1 << EPT_TABLE_ORDER) - 1);
-
-        for ( i = 0; i < 512; i++ )
-        {
-            split_ept_entry = split_table + i;
-            split_ept_entry->emt = epte_get_entry_emt(d, gfn - offset + i,
-                                                      _mfn(super_mfn + i),
-                                                      &ipat, direct_mmio);
-            split_ept_entry->ipat = ipat;
-            split_ept_entry->sp_avail = 0;
-            /* Don't increment mfn if it's a PoD mfn */
-            if ( super_p2mt != p2m_populate_on_demand )
-                split_ept_entry->mfn = super_mfn + i;
-            else
-                split_ept_entry->mfn = super_mfn;
-            split_ept_entry->avail1 = super_p2mt;
-            split_ept_entry->avail2 = 0;
-
-            ept_p2m_type_to_flags(split_ept_entry, super_p2mt);
-        }
-
-        /* Set the destinated 4k page as normal */
-        split_ept_entry = split_table + offset;
+        int num = order / EPT_TABLE_ORDER;
+        int level;
+        ept_entry_t *split_ept_entry;
+
+        if ( num >= cpu_vmx_ept_super_page_level_limit )
+            num = cpu_vmx_ept_super_page_level_limit;
+        for ( level = split_level; level > num ; level-- )
+        {
+            rv = ept_split_large_page(d, &table, &index, gfn, level);
+            if ( !rv )
+                goto out;
+        }
+
+        split_ept_entry = table + index;
+        split_ept_entry->avail1 = p2mt;
+        ept_p2m_type_to_flags(split_ept_entry, p2mt);
 
         split_ept_entry->emt = epte_get_entry_emt(d, gfn, mfn, &ipat,
                                                   direct_mmio);
         split_ept_entry->ipat = ipat;
@@ -314,12 +324,6 @@ ept_set_entry(struct domain *d, unsigned
             need_modify_vtd_table = 0;
         else
             split_ept_entry->mfn = mfn_x(mfn);
-
-        split_ept_entry->avail1 = p2mt;
-        ept_p2m_type_to_flags(split_ept_entry, p2mt);
-
-        unmap_domain_page(split_table);
-        *ept_entry = l2_ept_entry;
     }
 
     /* Track the highest gfn for which we have ever had a valid mapping */
@@ -336,7 +340,7 @@ out:
     ept_sync_domain(d);
 
     /* Now the p2m table is not shared with vt-d page table */
-    if ( iommu_enabled && need_iommu(d) && need_modify_vtd_table )
+    if ( rv && iommu_enabled && need_iommu(d) && need_modify_vtd_table )
    {
         if ( p2mt == p2m_ram_rw )
         {
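The rewritten else-branch of ept_set_entry() above replaces the old fixed
2M-to-4K split with a loop: starting from the level where the table walk
stopped (split_level), it splits one level per iteration until it reaches
the level the new mapping needs (num), so a 4K write into a 1G mapping
triggers two splits. A small sketch of that driver loop; hw_level_limit
stands in for cpu_vmx_ept_super_page_level_limit and the printf for the real
ept_split_large_page() call:

    #include <stdio.h>

    #define EPT_TABLE_ORDER 9

    /* Mirror the split loop in ept_set_entry(): walk from the level where
     * the walk stopped ('split_level') down to the level the new mapping
     * wants ('order / EPT_TABLE_ORDER'), clamped by the hardware limit. */
    static void plan_splits(int split_level, int order, int hw_level_limit)
    {
        int num = order / EPT_TABLE_ORDER;
        int level;

        if ( num >= hw_level_limit )
            num = hw_level_limit;

        for ( level = split_level; level > num; level-- )
            printf("split level-%d superpage into 512 level-%d entries\n",
                   level, level - 1);
    }

    int main(void)
    {
        /* Hypothetical: an order-0 (4K) update hits a 1G (level 2) mapping
         * on hardware that allows 1G leaves (limit 2): two splits. */
        plan_splits(2, 0, 2);
        return 0;
    }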
@@ -459,7 +463,7 @@ out:
 /* WARNING: Only caller doesn't care about PoD pages.  So this function will
  * always return 0 for PoD pages, not populate them.  If that becomes necessary,
  * pass a p2m_query_t type along to distinguish. */
-static ept_entry_t ept_get_entry_content(struct domain *d, unsigned long gfn)
+static ept_entry_t ept_get_entry_content(struct domain *d, unsigned long gfn, int *level)
 {
     ept_entry_t *table =
         map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
@@ -487,6 +491,7 @@ static ept_entry_t ept_get_entry_content
     index = gfn_remainder >> (i * EPT_TABLE_ORDER);
     ept_entry = table + index;
     content = *ept_entry;
+    *level = i;
 
 out:
     unmap_domain_page(table);
@@ -579,7 +584,10 @@ void ept_change_entry_emt_with_range(str
     p2m_lock(d->arch.p2m);
     for ( gfn = start_gfn; gfn <= end_gfn; gfn++ )
     {
-        e = ept_get_entry_content(d, gfn);
+        int level = 0;
+        uint64_t trunk = 0;
+
+        e = ept_get_entry_content(d, gfn, &level);
         if ( !p2m_has_emt(e.avail1) )
             continue;
 
@@ -588,25 +596,24 @@ void ept_change_entry_emt_with_range(str
 
         if ( e.sp_avail )
         {
-            if ( !(gfn & ((1 << EPT_TABLE_ORDER) - 1)) &&
-                 ((gfn + 0x1FF) <= end_gfn) )
+            while ( level )
             {
-                /*
-                 * gfn assigned with 2M, and the end covers more than 2m areas.
-                 * Set emt for super page.
-                 */
-                order = EPT_TABLE_ORDER;
-                if ( need_modify_ept_entry(d, gfn, mfn, e.ipat, e.emt, e.avail1) )
-                    ept_set_entry(d, gfn, mfn, order, e.avail1);
-                gfn += 0x1FF;
-            }
-            else
-            {
-                /* Change emt for partial entries of the 2m area. */
-                if ( need_modify_ept_entry(d, gfn, mfn, e.ipat, e.emt, e.avail1) )
-                    ept_set_entry(d, gfn, mfn, order, e.avail1);
-                gfn = ((gfn >> EPT_TABLE_ORDER) << EPT_TABLE_ORDER) + 0x1FF;
-            }
+                trunk = (1UL << (level * EPT_TABLE_ORDER)) - 1;
+                if ( !(gfn & trunk) && (gfn + trunk <= end_gfn) )
+                {
+                    /* gfn assigned with 2M or 1G, and the end covers more than
+                     * the super page areas.
+                     * Set emt for super page.
+                     */
+                    order = level * EPT_TABLE_ORDER;
+                    if ( need_modify_ept_entry(d, gfn, mfn,
+                                               e.ipat, e.emt, e.avail1) )
+                        ept_set_entry(d, gfn, mfn, order, e.avail1);
+                    gfn += trunk;
+                    break;
+                }
+                level--;
+            }
         }
         else /* gfn assigned with 4k */
         {
diff -r b20f897d6010 -r d7370232060a xen/include/asm-x86/hvm/vmx/vmcs.h
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h	Tue Apr 06 07:13:19 2010 +0100
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h	Tue Apr 06 07:14:56 2010 +0100
@@ -175,6 +175,11 @@ extern u32 vmx_secondary_exec_control;
 extern u32 vmx_secondary_exec_control;
 extern bool_t cpu_has_vmx_ins_outs_instr_info;
+
+extern u8 vmx_ept_super_page_level_limit;
+
+#define VMX_EPT_SUPER_PAGE_2M 0x00010000
+#define VMX_EPT_SUPER_PAGE_1G 0x00020000
 
 #define cpu_has_wbinvd_exiting \
     (vmx_secondary_exec_control & SECONDARY_EXEC_WBINVD_EXITING)
@@ -203,6 +208,8 @@ extern bool_t cpu_has_vmx_ins_outs_instr
     SECONDARY_EXEC_UNRESTRICTED_GUEST)
 #define cpu_has_vmx_ple \
     (vmx_secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
+#define cpu_vmx_ept_super_page_level_limit \
+    vmx_ept_super_page_level_limit
 
 /* GUEST_INTERRUPTIBILITY_INFO flags. */
 #define VMX_INTR_SHADOW_STI             0x00000001
diff -r b20f897d6010 -r d7370232060a xen/include/asm-x86/msr-index.h
--- a/xen/include/asm-x86/msr-index.h	Tue Apr 06 07:13:19 2010 +0100
+++ b/xen/include/asm-x86/msr-index.h	Tue Apr 06 07:14:56 2010 +0100
@@ -166,6 +166,7 @@
 #define MSR_IA32_VMX_CR4_FIXED0                 0x488
 #define MSR_IA32_VMX_CR4_FIXED1                 0x489
 #define MSR_IA32_VMX_PROCBASED_CTLS2            0x48b
+#define MSR_IA32_VMX_EPT_VPID_CAP               0x48c
 #define MSR_IA32_VMX_TRUE_PINBASED_CTLS         0x48d
 #define MSR_IA32_VMX_TRUE_PROCBASED_CTLS        0x48e
 #define MSR_IA32_VMX_TRUE_EXIT_CTLS             0x48f
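The while ( level ) loop added to ept_change_entry_emt_with_range() above
generalises the old 2M-only test: at each level it checks that gfn is
superpage-aligned and that the whole superpage fits inside the requested
range, dropping one level whenever either test fails. A standalone sketch of
that order selection, with hypothetical gfn ranges:

    #include <stdint.h>
    #include <stdio.h>

    #define EPT_TABLE_ORDER 9

    /* Pick the largest superpage order usable at 'gfn' without overshooting
     * 'end_gfn' -- the same alignment/coverage test as the new while-loop in
     * ept_change_entry_emt_with_range(). Returns 0 for a 4K entry. */
    static int usable_order(unsigned long gfn, unsigned long end_gfn, int level)
    {
        while ( level )
        {
            uint64_t trunk = (1UL << (level * EPT_TABLE_ORDER)) - 1;

            if ( !(gfn & trunk) && (gfn + trunk <= end_gfn) )
                return level * EPT_TABLE_ORDER;
            level--;
        }
        return 0;
    }

    int main(void)
    {
        /* Hypothetical: 1G-aligned gfn, range covering a full 1G page. */
        printf("order = %d\n", usable_order(0x40000, 0x80000, 2));  /* 18 */
        /* Same gfn, but the range ends after only 2M. */
        printf("order = %d\n", usable_order(0x40000, 0x40200, 2));  /* 9 */
        return 0;
    }

Level 2 yields order 18 (1G) and level 1 order 9 (2M); level 0 falls through
to the 4K path.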