[Xen-changelog] [xen-unstable] Create new vmassist type 'pae_extended_cr3'. Only advertise
# HG changeset patch
# User kaf24@xxxxxxxxxxxxxxxxxxxx
# Node ID 8aca850f66adbbc6e42469d27374b685ea9787ed
# Parent c9e6255cb44a2ec123b7bec9c9447697c747a5f3
Create new vmassist type 'pae_extended_cr3'. Only advertise
pae_pgdir_above_4gb to guests that have enabled this vmassist. Control
tools ensure all PAE page directories are below 4GB unless the vmassist
is enabled (triggered via an extended-cr3 option in the guest ELF header).

Signed-off-by: Keir Fraser <keir@xxxxxxxxxxxxx>
---
 tools/libxc/xc_linux_build.c   |   37 +++++--
 tools/libxc/xc_linux_restore.c |  196 +++++++++++++++++++++++++++++++++++++----
 tools/libxc/xc_linux_save.c    |   25 ++++-
 tools/libxc/xc_load_elf.c      |   11 +-
 tools/libxc/xc_private.c       |   22 ++++
 tools/libxc/xenctrl.h          |    3
 tools/libxc/xg_private.h       |    3
 xen/arch/x86/domain_build.c    |    3
 xen/arch/x86/mm.c              |   15 +++
 xen/common/kernel.c            |    5 -
 xen/common/keyhandler.c        |    5 -
 xen/include/public/xen.h       |   16 +++
 12 files changed, 306 insertions(+), 35 deletions(-)

diff -r c9e6255cb44a -r 8aca850f66ad tools/libxc/xc_linux_build.c
--- a/tools/libxc/xc_linux_build.c Fri Jun 02 19:14:44 2006 +0100
+++ b/tools/libxc/xc_linux_build.c Mon Jun 05 10:42:40 2006 +0100
@@ -254,16 +254,32 @@ static int setup_pg_tables_pae(int xc_ha unsigned long *page_array, unsigned long vpt_start, unsigned long vpt_end, - unsigned shadow_mode_enabled) + unsigned shadow_mode_enabled, + unsigned pae_mode) { l1_pgentry_64_t *vl1tab = NULL, *vl1e = NULL; l2_pgentry_64_t *vl2tab = NULL, *vl2e = NULL; l3_pgentry_64_t *vl3tab = NULL, *vl3e = NULL; uint64_t l1tab, l2tab, l3tab, pl1tab, pl2tab, pl3tab; - unsigned long ppt_alloc, count; + unsigned long ppt_alloc, count, nmfn; /* First allocate page for page dir. */ ppt_alloc = (vpt_start - dsi_v_start) >> PAGE_SHIFT; + + if ( pae_mode == PAEKERN_extended_cr3 ) + { + ctxt->vm_assist |= (1UL << VMASST_TYPE_pae_extended_cr3); + } + else if ( page_array[ppt_alloc] > 0xfffff ) + { + nmfn = xc_make_page_below_4G(xc_handle, dom, page_array[ppt_alloc]); + if ( nmfn == 0 ) + { + fprintf(stderr, "Couldn't get a page below 4GB :-(\n"); + goto error_out; + } + page_array[ppt_alloc] = nmfn; + } alloc_pt(l3tab, vl3tab, pl3tab); vl3e = &vl3tab[l3_table_offset_pae(dsi_v_start)];
@@ -579,11 +595,11 @@ static int compat_check(int xc_handle, s } if (strstr(xen_caps, "xen-3.0-x86_32p")) { - if (!dsi->pae_kernel) { + if (dsi->pae_kernel == PAEKERN_no) { ERROR("Non PAE-kernel on PAE host."); return 0; } - } else if (dsi->pae_kernel) { + } else if (dsi->pae_kernel != PAEKERN_no) { ERROR("PAE-kernel on non-PAE host."); return 0; }
@@ -673,7 +689,8 @@ static int setup_guest(int xc_handle, for ( i = 0; i < XENFEAT_NR_SUBMAPS; i++ ) { - if ( (supported_features[i]&required_features[i]) != required_features[i] ) + if ( (supported_features[i] & required_features[i]) != + required_features[i] ) { ERROR("Guest kernel does not support a required feature."); goto error_out;
@@ -719,7 +736,7 @@ static int setup_guest(int xc_handle, (((((_h) + ((1UL<<(_s))-1)) & ~((1UL<<(_s))-1)) - \ ((_l) & ~((1UL<<(_s))-1))) >> (_s)) #if defined(__i386__) - if ( dsi.pae_kernel ) + if ( dsi.pae_kernel != PAEKERN_no ) { if ( (1 + /* # L3 */ NR(dsi.v_start, v_end, L3_PAGETABLE_SHIFT_PAE) + /* # L2 */
@@ -797,11 +814,11 @@ static int setup_guest(int xc_handle, /* setup page tables */ #if defined(__i386__) - if (dsi.pae_kernel) + if (dsi.pae_kernel != PAEKERN_no) rc = setup_pg_tables_pae(xc_handle, dom, ctxt, dsi.v_start, v_end, page_array, vpt_start, vpt_end, - shadow_mode_enabled); + shadow_mode_enabled, dsi.pae_kernel); else rc =
setup_pg_tables(xc_handle, dom, ctxt, dsi.v_start, v_end, @@ -824,7 +841,7 @@ static int setup_guest(int xc_handle, */ if ( !shadow_mode_enabled ) { - if ( dsi.pae_kernel ) + if ( dsi.pae_kernel != PAEKERN_no ) { if ( pin_table(xc_handle, MMUEXT_PIN_L3_TABLE, xen_cr3_to_pfn(ctxt->ctrlreg[3]), dom) ) @@ -958,7 +975,7 @@ static int setup_guest(int xc_handle, rc = xc_version(xc_handle, XENVER_version, NULL); sprintf(start_info->magic, "xen-%i.%i-x86_%d%s", rc >> 16, rc & (0xFFFF), (unsigned int)sizeof(long)*8, - dsi.pae_kernel ? "p" : ""); + (dsi.pae_kernel != PAEKERN_no) ? "p" : ""); start_info->nr_pages = nr_pages; start_info->shared_info = guest_shared_info_mfn << PAGE_SHIFT; start_info->flags = flags; diff -r c9e6255cb44a -r 8aca850f66ad tools/libxc/xc_linux_restore.c --- a/tools/libxc/xc_linux_restore.c Fri Jun 02 19:14:44 2006 +0100 +++ b/tools/libxc/xc_linux_restore.c Mon Jun 05 10:42:40 2006 +0100 @@ -108,7 +108,7 @@ int xc_linux_restore(int xc_handle, int unsigned int console_evtchn, unsigned long *console_mfn) { DECLARE_DOM0_OP; - int rc = 1, i, n; + int rc = 1, i, n, pae_extended_cr3 = 0; unsigned long mfn, pfn; unsigned int prev_pc, this_pc; int verify = 0; @@ -162,25 +162,83 @@ int xc_linux_restore(int xc_handle, int return 1; } - if (mlock(&ctxt, sizeof(ctxt))) { /* needed for build dom0 op, but might as well do early */ ERR("Unable to mlock ctxt"); return 1; } - - /* Read the saved P2M frame list */ - if(!(p2m_frame_list = malloc(P2M_FL_SIZE))) { + if (!(p2m_frame_list = malloc(P2M_FL_SIZE))) { ERR("Couldn't allocate p2m_frame_list array"); goto out; } - if (!read_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) { + /* Read first entry of P2M list, or extended-info signature (~0UL). */ + if (!read_exact(io_fd, p2m_frame_list, sizeof(long))) { + ERR("read extended-info signature failed"); + goto out; + } + + if (p2m_frame_list[0] == ~0UL) { + uint32_t tot_bytes; + + /* Next 4 bytes: total size of following extended info. */ + if (!read_exact(io_fd, &tot_bytes, sizeof(tot_bytes))) { + ERR("read extended-info size failed"); + goto out; + } + + while (tot_bytes) { + uint32_t chunk_bytes; + char chunk_sig[4]; + + /* 4-character chunk signature + 4-byte remaining chunk size. */ + if (!read_exact(io_fd, chunk_sig, sizeof(chunk_sig)) || + !read_exact(io_fd, &chunk_bytes, sizeof(chunk_bytes))) { + ERR("read extended-info chunk signature failed"); + goto out; + } + tot_bytes -= 8; + + /* VCPU context structure? */ + if (!strncmp(chunk_sig, "vcpu", 4)) { + if (!read_exact(io_fd, &ctxt, sizeof(ctxt))) { + ERR("read extended-info vcpu context failed"); + goto out; + } + tot_bytes -= sizeof(struct vcpu_guest_context); + chunk_bytes -= sizeof(struct vcpu_guest_context); + + if (ctxt.vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3)) + pae_extended_cr3 = 1; + } + + /* Any remaining bytes of this chunk: read and discard. */ + while (chunk_bytes) { + unsigned long sz = chunk_bytes; + if ( sz > P2M_FL_SIZE ) + sz = P2M_FL_SIZE; + if (!read_exact(io_fd, p2m_frame_list, sz)) { + ERR("read-and-discard extended-info chunk bytes failed"); + goto out; + } + chunk_bytes -= sz; + tot_bytes -= sz; + } + } + + /* Now read the real first entry of P2M list. */ + if (!read_exact(io_fd, p2m_frame_list, sizeof(long))) { + ERR("read first entry of p2m_frame_list failed"); + goto out; + } + } + + /* First entry is already read into the p2m array. 
*/ + if (!read_exact(io_fd, &p2m_frame_list[1], P2M_FL_SIZE - sizeof(long))) { ERR("read p2m_frame_list failed"); goto out; } - /* We want zeroed memory so use calloc rather than malloc. */ p2m = calloc(max_pfn, sizeof(unsigned long)); @@ -331,17 +389,27 @@ int xc_linux_restore(int xc_handle, int ** A page table page - need to 'uncanonicalize' it, i.e. ** replace all the references to pfns with the corresponding ** mfns for the new domain. + ** + ** On PAE we need to ensure that PGDs are in MFNs < 4G, and + ** so we may need to update the p2m after the main loop. + ** Hence we defer canonicalization of L1s until then. */ - if(!uncanonicalize_pagetable(pagetype, page)) { - /* - ** Failing to uncanonicalize a page table can be ok - ** under live migration since the pages type may have - ** changed by now (and we'll get an update later). - */ - DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n", - pagetype >> 28, pfn, mfn); - nraces++; - continue; + if ((pt_levels != 3) || + pae_extended_cr3 || + (pagetype != L1TAB)) { + + if (!uncanonicalize_pagetable(pagetype, page)) { + /* + ** Failing to uncanonicalize a page table can be ok + ** under live migration since the pages type may have + ** changed by now (and we'll get an update later). + */ + DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n", + pagetype >> 28, pfn, mfn); + nraces++; + continue; + } + } } else if(pagetype != NOTAB) { @@ -389,6 +457,100 @@ int xc_linux_restore(int xc_handle, int } DPRINTF("Received all pages (%d races)\n", nraces); + + if ((pt_levels == 3) && !pae_extended_cr3) { + + /* + ** XXX SMH on PAE we need to ensure PGDs are in MFNs < 4G. This + ** is a little awkward and involves (a) finding all such PGDs and + ** replacing them with 'lowmem' versions; (b) upating the p2m[] + ** with the new info; and (c) canonicalizing all the L1s using the + ** (potentially updated) p2m[]. + ** + ** This is relatively slow (and currently involves two passes through + ** the pfn_type[] array), but at least seems to be correct. May wish + ** to consider more complex approaches to optimize this later. 
+ */ + + int j, k; + + /* First pass: find all L3TABs current in > 4G mfns and get new mfns */ + for (i = 0; i < max_pfn; i++) { + + if (((pfn_type[i] & LTABTYPE_MASK)==L3TAB) && (p2m[i]>0xfffffUL)) { + + unsigned long new_mfn; + uint64_t l3ptes[4]; + uint64_t *l3tab; + + l3tab = (uint64_t *) + xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, + PROT_READ, p2m[i]); + + for(j = 0; j < 4; j++) + l3ptes[j] = l3tab[j]; + + munmap(l3tab, PAGE_SIZE); + + if (!(new_mfn=xc_make_page_below_4G(xc_handle, dom, p2m[i]))) { + ERR("Couldn't get a page below 4GB :-("); + goto out; + } + + p2m[i] = new_mfn; + if (xc_add_mmu_update(xc_handle, mmu, + (((unsigned long long)new_mfn) + << PAGE_SHIFT) | + MMU_MACHPHYS_UPDATE, i)) { + ERR("Couldn't m2p on PAE root pgdir"); + goto out; + } + + l3tab = (uint64_t *) + xc_map_foreign_range(xc_handle, dom, PAGE_SIZE, + PROT_READ | PROT_WRITE, p2m[i]); + + for(j = 0; j < 4; j++) + l3tab[j] = l3ptes[j]; + + munmap(l3tab, PAGE_SIZE); + + } + } + + /* Second pass: find all L1TABs and uncanonicalize them */ + j = 0; + + for(i = 0; i < max_pfn; i++) { + + if (((pfn_type[i] & LTABTYPE_MASK)==L1TAB)) { + region_mfn[j] = p2m[i]; + j++; + } + + if(i == (max_pfn-1) || j == MAX_BATCH_SIZE) { + + if (!(region_base = xc_map_foreign_batch( + xc_handle, dom, PROT_READ | PROT_WRITE, + region_mfn, j))) { + ERR("map batch failed"); + goto out; + } + + for(k = 0; k < j; k++) { + if(!uncanonicalize_pagetable(L1TAB, + region_base + k*PAGE_SIZE)) { + ERR("failed uncanonicalize pt!"); + goto out; + } + } + + munmap(region_base, j*PAGE_SIZE); + j = 0; + } + } + + } if (xc_finish_mmu_updates(xc_handle, mmu)) { diff -r c9e6255cb44a -r 8aca850f66ad tools/libxc/xc_linux_save.c --- a/tools/libxc/xc_linux_save.c Fri Jun 02 19:14:44 2006 +0100 +++ b/tools/libxc/xc_linux_save.c Mon Jun 05 10:42:40 2006 +0100 @@ -818,12 +818,33 @@ int xc_linux_save(int xc_handle, int io_ /* Start writing out the saved-domain record. */ - if(!write_exact(io_fd, &max_pfn, sizeof(unsigned long))) { + if (!write_exact(io_fd, &max_pfn, sizeof(unsigned long))) { ERR("write: max_pfn"); goto out; } - if(!write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) { + /* + * Write an extended-info structure to inform the restore code that + * a PAE guest understands extended CR3 (PDPTs above 4GB). Turns off + * slow paths in the restore code. 
+ */ + if ((pt_levels == 3) && + (ctxt.vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3))) { + unsigned long signature = ~0UL; + uint32_t tot_sz = sizeof(struct vcpu_guest_context) + 8; + uint32_t chunk_sz = sizeof(struct vcpu_guest_context); + char chunk_sig[] = "vcpu"; + if (!write_exact(io_fd, &signature, sizeof(signature)) || + !write_exact(io_fd, &tot_sz, sizeof(tot_sz)) || + !write_exact(io_fd, &chunk_sig, 4) || + !write_exact(io_fd, &chunk_sz, sizeof(chunk_sz)) || + !write_exact(io_fd, &ctxt, sizeof(ctxt))) { + ERR("write: extended info"); + goto out; + } + } + + if (!write_exact(io_fd, p2m_frame_list, P2M_FL_SIZE)) { ERR("write: p2m_frame_list"); goto out; } diff -r c9e6255cb44a -r 8aca850f66ad tools/libxc/xc_load_elf.c --- a/tools/libxc/xc_load_elf.c Fri Jun 02 19:14:44 2006 +0100 +++ b/tools/libxc/xc_load_elf.c Mon Jun 05 10:42:40 2006 +0100 @@ -122,8 +122,15 @@ static int parseelfimage(const char *ima ERROR("Actually saw: '%s'", guestinfo); return -EINVAL; } - if ( (strstr(guestinfo, "PAE=yes") != NULL) ) - dsi->pae_kernel = 1; + + dsi->pae_kernel = PAEKERN_no; + p = strstr(guestinfo, "PAE=yes"); + if ( p != NULL ) + { + dsi->pae_kernel = PAEKERN_yes; + if ( !strncmp(p+7, "[extended-cr3]", 14) ) + dsi->pae_kernel = PAEKERN_extended_cr3; + } break; } diff -r c9e6255cb44a -r 8aca850f66ad tools/libxc/xc_private.c --- a/tools/libxc/xc_private.c Fri Jun 02 19:14:44 2006 +0100 +++ b/tools/libxc/xc_private.c Mon Jun 05 10:42:40 2006 +0100 @@ -430,6 +430,28 @@ int xc_version(int xc_handle, int cmd, v return rc; } +unsigned long xc_make_page_below_4G( + int xc_handle, uint32_t domid, unsigned long mfn) +{ + unsigned long new_mfn; + + if ( xc_domain_memory_decrease_reservation( + xc_handle, domid, 1, 0, &mfn) != 0 ) + { + fprintf(stderr,"xc_make_page_below_4G decrease failed. mfn=%lx\n",mfn); + return 0; + } + + if ( xc_domain_memory_increase_reservation( + xc_handle, domid, 1, 0, 32, &new_mfn) != 0 ) + { + fprintf(stderr,"xc_make_page_below_4G increase failed. mfn=%lx\n",mfn); + return 0; + } + + return new_mfn; +} + /* * Local variables: * mode: C diff -r c9e6255cb44a -r 8aca850f66ad tools/libxc/xenctrl.h --- a/tools/libxc/xenctrl.h Fri Jun 02 19:14:44 2006 +0100 +++ b/tools/libxc/xenctrl.h Mon Jun 05 10:42:40 2006 +0100 @@ -453,6 +453,9 @@ int xc_domain_iomem_permission(int xc_ha unsigned long nr_mfns, uint8_t allow_access); +unsigned long xc_make_page_below_4G(int xc_handle, uint32_t domid, + unsigned long mfn); + typedef dom0_perfc_desc_t xc_perfc_desc_t; /* IMPORTANT: The caller is responsible for mlock()'ing the @desc array. */ int xc_perfc_control(int xc_handle, diff -r c9e6255cb44a -r 8aca850f66ad tools/libxc/xg_private.h --- a/tools/libxc/xg_private.h Fri Jun 02 19:14:44 2006 +0100 +++ b/tools/libxc/xg_private.h Mon Jun 05 10:42:40 2006 +0100 @@ -156,6 +156,9 @@ struct domain_setup_info unsigned long elf_paddr_offset; +#define PAEKERN_no 0 +#define PAEKERN_yes 1 +#define PAEKERN_extended_cr3 2 unsigned int pae_kernel; unsigned int load_symtab; diff -r c9e6255cb44a -r 8aca850f66ad xen/arch/x86/domain_build.c --- a/xen/arch/x86/domain_build.c Fri Jun 02 19:14:44 2006 +0100 +++ b/xen/arch/x86/domain_build.c Mon Jun 05 10:42:40 2006 +0100 @@ -301,6 +301,9 @@ int construct_dom0(struct domain *d, xen_pae ? "yes" : "no", dom0_pae ? 
"yes" : "no"); return -EINVAL; } + + if ( xen_pae && !!strstr(dsi.xen_section_string, "PAE=yes[extended-cr3]") ) + set_bit(VMASST_TYPE_pae_extended_cr3, &d->vm_assist); if ( (p = strstr(dsi.xen_section_string, "FEATURES=")) != NULL ) { diff -r c9e6255cb44a -r 8aca850f66ad xen/arch/x86/mm.c --- a/xen/arch/x86/mm.c Fri Jun 02 19:14:44 2006 +0100 +++ b/xen/arch/x86/mm.c Mon Jun 05 10:42:40 2006 +0100 @@ -996,6 +996,21 @@ static int alloc_l3_table(struct page_in int i; ASSERT(!shadow_mode_refcounts(d)); + +#ifdef CONFIG_X86_PAE + /* + * PAE pgdirs above 4GB are unacceptable if the guest does not understand + * the weird 'extended cr3' format for dealing with high-order address + * bits. We cut some slack for control tools (before vcpu0 is initialised). + */ + if ( (pfn >= 0x100000) && + unlikely(!VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3)) && + d->vcpu[0] && test_bit(_VCPUF_initialised, &d->vcpu[0]->vcpu_flags) ) + { + MEM_LOG("PAE pgd must be below 4GB (0x%lx >= 0x100000)", pfn); + return 0; + } +#endif pl3e = map_domain_page(pfn); for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ ) diff -r c9e6255cb44a -r 8aca850f66ad xen/common/kernel.c --- a/xen/common/kernel.c Fri Jun 02 19:14:44 2006 +0100 +++ b/xen/common/kernel.c Mon Jun 05 10:42:40 2006 +0100 @@ -184,6 +184,7 @@ long do_xen_version(int cmd, XEN_GUEST_H case XENVER_get_features: { xen_feature_info_t fi; + struct domain *d = current->domain; if ( copy_from_guest(&fi, arg, 1) ) return -EFAULT; @@ -191,7 +192,9 @@ long do_xen_version(int cmd, XEN_GUEST_H switch ( fi.submap_idx ) { case 0: - fi.submap = (1U << XENFEAT_pae_pgdir_above_4gb); + fi.submap = 0; + if ( VM_ASSIST(d, VMASST_TYPE_pae_extended_cr3) ) + fi.submap |= (1U << XENFEAT_pae_pgdir_above_4gb); if ( shadow_mode_translate(current->domain) ) fi.submap |= (1U << XENFEAT_writable_page_tables) | diff -r c9e6255cb44a -r 8aca850f66ad xen/common/keyhandler.c --- a/xen/common/keyhandler.c Fri Jun 02 19:14:44 2006 +0100 +++ b/xen/common/keyhandler.c Mon Jun 05 10:42:40 2006 +0100 @@ -128,11 +128,12 @@ static void dump_domains(unsigned char k d->domain_flags, atomic_read(&d->refcnt), d->tot_pages, d->xenheap_pages, cpuset); printk(" handle=%02x%02x%02x%02x-%02x%02x-%02x%02x-" - "%02x%02x-%02x%02x%02x%02x%02x%02x\n", + "%02x%02x-%02x%02x%02x%02x%02x%02x vm_assist=%08lx\n", d->handle[ 0], d->handle[ 1], d->handle[ 2], d->handle[ 3], d->handle[ 4], d->handle[ 5], d->handle[ 6], d->handle[ 7], d->handle[ 8], d->handle[ 9], d->handle[10], d->handle[11], - d->handle[12], d->handle[13], d->handle[14], d->handle[15]); + d->handle[12], d->handle[13], d->handle[14], d->handle[15], + d->vm_assist); arch_dump_domain_info(d); diff -r c9e6255cb44a -r 8aca850f66ad xen/include/public/xen.h --- a/xen/include/public/xen.h Fri Jun 02 19:14:44 2006 +0100 +++ b/xen/include/public/xen.h Mon Jun 05 10:42:40 2006 +0100 @@ -234,10 +234,24 @@ DEFINE_XEN_GUEST_HANDLE(mmuext_op_t); */ #define VMASST_CMD_enable 0 #define VMASST_CMD_disable 1 + +/* x86/32 guests: simulate full 4GB segment limits. */ #define VMASST_TYPE_4gb_segments 0 + +/* x86/32 guests: trap (vector 15) whenever above vmassist is used. */ #define VMASST_TYPE_4gb_segments_notify 1 + +/* + * x86 guests: support writes to bottom-level PTEs. + * NB1. Page-directory entries cannot be written. + * NB2. Guest must continue to remove all writable mappings of PTEs. + */ #define VMASST_TYPE_writable_pagetables 2 -#define MAX_VMASST_TYPE 2 + +/* x86/PAE guests: support PDPTs above 4GB. 
*/ +#define VMASST_TYPE_pae_extended_cr3 3 + +#define MAX_VMASST_TYPE 3 #ifndef __ASSEMBLY__
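
Some notes on how the pieces above fit together. On the guest side, a PAE kernel opts in by appending "[extended-cr3]" to the "PAE=yes" entry of its __xen_guest ELF note (now parsed by xc_load_elf.c and domain_build.c), then enabling the new vmassist at boot; only after that does xen/common/kernel.c report XENFEAT_pae_pgdir_above_4gb in the XENVER_get_features submap. The sketch below illustrates that sequence under the usual Linux pv-guest conventions; the HYPERVISOR_* wrapper names and include paths are assumptions of the sketch, not something defined by this patch.

/*
 * Hedged guest-side sketch (not part of this changeset): advertise and
 * enable extended cr3, then check the feature bit.
 */
#include <xen/interface/xen.h>        /* VMASST_CMD_*, VMASST_TYPE_*      */
#include <xen/interface/version.h>    /* XENVER_get_features              */
#include <xen/interface/features.h>   /* XENFEAT_pae_pgdir_above_4gb      */
#include <asm/hypercall.h>            /* HYPERVISOR_* wrappers (assumed)  */

/*
 * Build-time advertisement picked up by xc_load_elf.c / domain_build.c.
 * In a real kernel this is appended to the existing __xen_guest string.
 */
asm(".section __xen_guest; .ascii \"PAE=yes[extended-cr3]\"; .previous");

static int xen_enable_pae_extended_cr3(void)
{
    xen_feature_info_t fi = { .submap_idx = 0 };

    /*
     * Enable the assist first: after this changeset, kernel.c only sets
     * XENFEAT_pae_pgdir_above_4gb for domains that have enabled
     * VMASST_TYPE_pae_extended_cr3.
     */
    if ( HYPERVISOR_vm_assist(VMASST_CMD_enable,
                              VMASST_TYPE_pae_extended_cr3) != 0 )
        return 0;

    if ( HYPERVISOR_xen_version(XENVER_get_features, &fi) != 0 )
        return 0;

    return !!(fi.submap & (1U << XENFEAT_pae_pgdir_above_4gb));
}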
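
The save/restore change amounts to an optional extended-info block inserted where the first P2M frame-list entry used to be: an ~0UL signature, a 32-bit total byte count, then tagged chunks of {4-character signature, 32-bit length, payload}. Currently the only chunk is "vcpu", carrying the saver's vcpu_guest_context so the restorer can see whether VMASST_TYPE_pae_extended_cr3 was enabled. A minimal chunk-skipping reader, equivalent in spirit to the new code in xc_linux_restore.c, might look like the sketch below; read_all() stands in for libxc's read_exact() and is an assumption of the sketch.

/*
 * Hedged sketch of an extended-info scanner.  "xenctrl.h" is assumed to
 * pull in the Xen public headers (struct vcpu_guest_context and the
 * VMASST_* definitions), as it does inside libxc.
 */
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include "xenctrl.h"

static int read_all(int fd, void *buf, size_t len)
{
    while (len) {
        ssize_t n = read(fd, buf, len);
        if (n <= 0)
            return 0;
        buf = (char *)buf + n;
        len -= n;
    }
    return 1;
}

/* Returns 1 if a "vcpu" chunk enables pae_extended_cr3, 0 if not, -1 on error. */
static int scan_extended_info(int io_fd, uint32_t tot_bytes,
                              struct vcpu_guest_context *ctxt)
{
    int pae_extended_cr3 = 0;

    while (tot_bytes) {
        char sig[4];
        uint32_t chunk_bytes;

        /* 4-character chunk signature + 4-byte chunk size. */
        if (!read_all(io_fd, sig, 4) || !read_all(io_fd, &chunk_bytes, 4))
            return -1;
        tot_bytes -= 8;

        if (!memcmp(sig, "vcpu", 4) && chunk_bytes >= sizeof(*ctxt)) {
            if (!read_all(io_fd, ctxt, sizeof(*ctxt)))
                return -1;
            chunk_bytes -= sizeof(*ctxt);
            tot_bytes   -= sizeof(*ctxt);
            if (ctxt->vm_assist & (1UL << VMASST_TYPE_pae_extended_cr3))
                pae_extended_cr3 = 1;
        }

        /* Unknown chunks (or trailing bytes) are read and discarded. */
        while (chunk_bytes) {
            char scratch[4096];
            uint32_t n = chunk_bytes < sizeof(scratch) ? chunk_bytes
                                                       : (uint32_t)sizeof(scratch);
            if (!read_all(io_fd, scratch, n))
                return -1;
            chunk_bytes -= n;
            tot_bytes   -= n;
        }
    }
    return pae_extended_cr3;
}

Because unrecognised chunk signatures are skipped by length, the image format can grow new chunk types later without breaking older restore code.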
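
The new libxc helper xc_make_page_below_4G() swaps one frame for another allocated under a 32-bit address limit, by pairing xc_domain_memory_decrease_reservation() on the old MFN with xc_domain_memory_increase_reservation() using address_bits=32. The 0xfffff / 0x100000 thresholds used by its callers mark the 4GB boundary: with 4KB pages, MFN 0x100000 corresponds to byte address 0x100000 << 12 = 4GB, so 0xfffff is the last MFN whose address still fits in 32 bits. A hedged caller sketch follows; the wrapper name ensure_below_4g is invented for illustration and is not part of the patch.

/*
 * Hedged usage sketch: ensure a single frame sits below 4GB, as the
 * domain builder and restorer now do for PAE L3 page directories when
 * the guest has not enabled pae_extended_cr3.
 */
#include <stdint.h>
#include <stdio.h>
#include "xenctrl.h"   /* xc_make_page_below_4G(), added by this patch */

static int ensure_below_4g(int xc_handle, uint32_t dom, unsigned long *mfn)
{
    unsigned long new_mfn;

    if ( *mfn <= 0xfffffUL )
        return 0;                      /* already below 4GB */

    new_mfn = xc_make_page_below_4G(xc_handle, dom, *mfn);
    if ( new_mfn == 0 )
    {
        fprintf(stderr, "Couldn't get a page below 4GB\n");
        return -1;
    }

    /* Caller must also fix up any p2m/m2p bookkeeping for the old frame. */
    *mfn = new_mfn;
    return 0;
}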