diff -r 3191627e5ad6 tools/libxc/xc_hvm_build.c --- a/tools/libxc/xc_hvm_build.c Wed Oct 31 16:21:18 2007 +0000 +++ b/tools/libxc/xc_hvm_build.c Wed Nov 07 07:16:57 2007 -0600 @@ -149,7 +149,9 @@ static int setup_guest(int xc_handle, char *image, unsigned long image_size) { xen_pfn_t *page_array = NULL; + xen_pfn_t *page_array_2MB = NULL; unsigned long i, nr_pages = (unsigned long)memsize << (20 - PAGE_SHIFT); + unsigned long nr_pages_2MB = (unsigned long)memsize >> 1; unsigned long shared_page_nr, entry_eip; struct xen_add_to_physmap xatp; struct shared_info *shared_info; @@ -189,7 +191,8 @@ static int setup_guest(int xc_handle, v_start, v_end, elf_uval(&elf, elf.ehdr, e_entry)); - if ( (page_array = malloc(nr_pages * sizeof(xen_pfn_t))) == NULL ) + if ( (page_array = malloc(nr_pages * sizeof(xen_pfn_t))) == NULL || + (page_array_2MB = malloc(nr_pages_2MB * sizeof(xen_pfn_t))) == NULL ) { PERROR("Could not allocate memory.\n"); goto error_out; @@ -197,15 +200,33 @@ static int setup_guest(int xc_handle, for ( i = 0; i < nr_pages; i++ ) page_array[i] = i; + for ( i = 0; i < nr_pages_2MB; i++ ) + page_array_2MB[i] = i << 9; for ( i = HVM_BELOW_4G_RAM_END >> PAGE_SHIFT; i < nr_pages; i++ ) page_array[i] += HVM_BELOW_4G_MMIO_LENGTH >> PAGE_SHIFT; - - /* Allocate memory for HVM guest, skipping VGA hole 0xA0000-0xC0000. */ + for ( i = HVM_BELOW_4G_RAM_END >> (PAGE_SHIFT + 9); i < nr_pages_2MB; i++ ) + page_array_2MB[i] += HVM_BELOW_4G_MMIO_LENGTH >> PAGE_SHIFT; + + /* Note: We try to request 2MB page allocations at this point. Hypervisor + * will fall back to 4KB allocation if it can not satisfies these requests. + * + * Allocate memory for HVM guest from 0 - 2MB space using 4KB pages, + * skipping VGA hole 0xA0000-0xC0000. + */ rc = xc_domain_memory_populate_physmap( xc_handle, dom, 0xa0, 0, 0, &page_array[0x00]); if ( rc == 0 ) rc = xc_domain_memory_populate_physmap( - xc_handle, dom, nr_pages - 0xc0, 0, 0, &page_array[0xc0]); + xc_handle, dom, 0x200 - 0xc0, 0, 0, &page_array[0xc0]); + /* Allocate memory for HVM guest beyond 2MB space using 2MB pages */ + if ( rc == 0 ) + rc = xc_domain_memory_populate_physmap( + xc_handle, dom, nr_pages_2MB - 1, 9, 0, &page_array_2MB[1]); + /* Handle the case of odd number physical memory size */ + if ( rc == 0 ) + rc = xc_domain_memory_populate_physmap( + xc_handle, dom, nr_pages - (nr_pages_2MB << 9), 0, 0, + &page_array[nr_pages_2MB << 9]); if ( rc != 0 ) { PERROR("Could not allocate memory for HVM guest.\n"); @@ -268,10 +289,12 @@ static int setup_guest(int xc_handle, } free(page_array); + free(page_array_2MB); return 0; error_out: free(page_array); + free(page_array_2MB); return -1; } diff -r 3191627e5ad6 xen/arch/x86/hvm/vmx/vmx.c --- a/xen/arch/x86/hvm/vmx/vmx.c Wed Oct 31 16:21:18 2007 +0000 +++ b/xen/arch/x86/hvm/vmx/vmx.c Wed Nov 07 11:14:53 2007 -0600 @@ -2413,7 +2413,8 @@ static int vmx_alloc_vlapic_mapping(stru return -ENOMEM; share_xen_page_with_guest(virt_to_page(apic_va), d, XENSHARE_writable); guest_physmap_add_page( - d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), virt_to_mfn(apic_va)); + d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), virt_to_mfn(apic_va), + PAGE_SIZE_ORDER_4K); d->arch.hvm_domain.vmx_apic_access_mfn = virt_to_mfn(apic_va); return 0; diff -r 3191627e5ad6 xen/arch/x86/mm.c --- a/xen/arch/x86/mm.c Wed Oct 31 16:21:18 2007 +0000 +++ b/xen/arch/x86/mm.c Mon Nov 12 07:51:32 2007 -0600 @@ -3155,7 +3155,8 @@ long arch_memory_op(int op, XEN_GUEST_HA { if ( is_xen_heap_frame(mfn_to_page(prev_mfn)) ) /* Xen heap frames are simply unhooked from 
this phys slot. */ - guest_physmap_remove_page(d, xatp.gpfn, prev_mfn); + guest_physmap_remove_page(d, xatp.gpfn, prev_mfn, + PAGE_SIZE_ORDER_4K); else /* Normal domain memory is freed, to avoid leaking memory. */ guest_remove_page(d, xatp.gpfn); @@ -3164,10 +3165,10 @@ long arch_memory_op(int op, XEN_GUEST_HA /* Unmap from old location, if any. */ gpfn = get_gpfn_from_mfn(mfn); if ( gpfn != INVALID_M2P_ENTRY ) - guest_physmap_remove_page(d, gpfn, mfn); + guest_physmap_remove_page(d, gpfn, mfn, PAGE_SIZE_ORDER_4K); /* Map at new location. */ - guest_physmap_add_page(d, xatp.gpfn, mfn); + guest_physmap_add_page(d, xatp.gpfn, mfn, PAGE_SIZE_ORDER_4K); UNLOCK_BIGLOCK(d); diff -r 3191627e5ad6 xen/arch/x86/mm/hap/hap.c --- a/xen/arch/x86/mm/hap/hap.c Wed Oct 31 16:21:18 2007 +0000 +++ b/xen/arch/x86/mm/hap/hap.c Thu Nov 15 03:54:42 2007 -0600 @@ -238,9 +238,14 @@ static void hap_install_xen_entries_in_l { struct domain *d = v->domain; l4_pgentry_t *l4e; + l3_pgentry_t *l3e; + struct page_info *pg; l4e = hap_map_domain_page(l4mfn); ASSERT(l4e != NULL); + + if ( (pg = hap_alloc(d)) == NULL ) + goto oom; /* Copy the common Xen mappings from the idle domain */ memcpy(&l4e[ROOT_PAGETABLE_FIRST_XEN_SLOT], @@ -261,7 +266,24 @@ static void hap_install_xen_entries_in_l l4e_from_pfn(mfn_x(pagetable_get_mfn(d->arch.phys_table)), __PAGE_HYPERVISOR); + /* Install the L2 pages of P2M table in linear mapping. We borrow + * SH_LINEAR_PT_VIRT_START to store such information. + */ + l4e[l4_table_offset(SH_LINEAR_PT_VIRT_START)] = + l4e_from_pfn(mfn_x(page_to_mfn(pg)), __PAGE_HYPERVISOR); + l3e = hap_map_domain_page(page_to_mfn(pg)); + l3e[l3_table_offset(SH_LINEAR_PT_VIRT_START)] = + l3e_from_pfn(mfn_x(pagetable_get_mfn(d->arch.phys_table)), + __PAGE_HYPERVISOR); + hap_unmap_domain_page(l3e); + hap_unmap_domain_page(l4e); + + return; + oom: + HAP_ERROR("out of memory building monitor pagetable\n"); + domain_crash(d); + return; } #endif /* CONFIG_PAGING_LEVELS == 4 */ @@ -269,12 +291,17 @@ static void hap_install_xen_entries_in_l static void hap_install_xen_entries_in_l2h(struct vcpu *v, mfn_t l2hmfn) { struct domain *d = v->domain; + l1_pgentry_t *l1e; l2_pgentry_t *l2e; l3_pgentry_t *p2m; + struct page_info *pg; int i; l2e = hap_map_domain_page(l2hmfn); ASSERT(l2e != NULL); + + if ( (pg = hap_alloc(d)) == NULL ) + goto oom; /* Copy the common Xen mappings from the idle domain */ memcpy(&l2e[L2_PAGETABLE_FIRST_XEN_SLOT & (L2_PAGETABLE_ENTRIES-1)], @@ -304,8 +331,27 @@ static void hap_install_xen_entries_in_l __PAGE_HYPERVISOR) : l2e_empty(); } + + /* Install the L2 pages of p2m table in linear mapping. We borrow + * SH_LINEAR_PT_VIRT_START to store such information. + */ + l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = + l2e_from_pfn(mfn_x(page_to_mfn(pg)), __PAGE_HYPERVISOR); + l1e = hap_map_domain_page(page_to_mfn(pg)); + for ( i = 0; i < L3_PAGETABLE_ENTRIES; i++ ) + l1e[l1_table_offset(SH_LINEAR_PT_VIRT_START) + i] = + (l3e_get_flags(p2m[i]) & _PAGE_PRESENT) + ? 
l1e_from_pfn(l3e_get_pfn(p2m[i]), __PAGE_HYPERVISOR) + : l1e_empty(); + + hap_unmap_domain_page(l1e); hap_unmap_domain_page(p2m); hap_unmap_domain_page(l2e); + return; + oom: + HAP_ERROR("out of memory building monitor pagetable\n"); + domain_crash(d); + return; } #endif @@ -337,6 +383,11 @@ static void hap_install_xen_entries_in_l /* Install the domain-specific P2M table */ l2e[l2_table_offset(RO_MPT_VIRT_START)] = + l2e_from_pfn(mfn_x(pagetable_get_mfn(d->arch.phys_table)), + __PAGE_HYPERVISOR); + + /* Install the domain-specific P2M table */ + l2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_from_pfn(mfn_x(pagetable_get_mfn(d->arch.phys_table)), __PAGE_HYPERVISOR); @@ -414,12 +465,33 @@ static void hap_destroy_monitor_table(st { struct domain *d = v->domain; -#if CONFIG_PAGING_LEVELS == 3 +#if CONFIG_PAGING_LEVELS == 4 + { + l4_pgentry_t *l4e = NULL; + unsigned int shl_l4e_offset = l4_table_offset(SH_LINEAR_PT_VIRT_START); + + l4e = hap_map_domain_page(_mfn(mmfn)); + ASSERT(l4e_get_flags(l4e[shl_l4e_offset]) & _PAGE_PRESENT); + hap_free(d, _mfn(l4e_get_pfn(l4e[shl_l4e_offset]))); + hap_unmap_domain_page(l4e); + } +#elif CONFIG_PAGING_LEVELS == 3 /* Need to destroy the l2 monitor page in slot 4 too */ { - l3_pgentry_t *l3e = hap_map_domain_page(mmfn); + l3_pgentry_t *l3e = NULL; + l2_pgentry_t *l2e = NULL; + unsigned int l2e_offset = l2_table_offset(SH_LINEAR_PT_VIRT_START); + + l3e = hap_map_domain_page(mmfn); ASSERT(l3e_get_flags(l3e[3]) & _PAGE_PRESENT); - hap_free(d, _mfn(l3e_get_pfn(l3e[3]))); + + /* destroy the l1 monitor page created for mapping level 2 p2m pages */ + l2e = hap_map_domain_page(_mfn(l3e_get_pfn(l3e[3]))); + ASSERT(l2e_get_flags(l2e[l2e_offset]) & _PAGE_PRESENT); + hap_free(d, _mfn(l2e_get_pfn(l2e[l2e_offset]))); + hap_unmap_domain_page(l2e); + + hap_free(d, _mfn(l3e_get_pfn(l3e[3]))); hap_unmap_domain_page(l3e); } #endif @@ -644,6 +716,8 @@ static void p2m_install_entry_in_monitor * in the monitor table. This function makes fresh copies when a p2m * l3e changes. 
*/ { + l1_pgentry_t *sh_ml1e; + unsigned int sh_l2_index; l2_pgentry_t *ml2e; struct vcpu *v; unsigned int index; @@ -658,20 +732,31 @@ static void p2m_install_entry_in_monitor ASSERT(paging_mode_external(v->domain)); - if ( v == current ) /* OK to use linear map of monitor_table */ + if ( v == current ) { /* OK to use linear map of monitor_table */ ml2e = __linear_l2_table + l2_linear_offset(RO_MPT_VIRT_START); + sh_ml1e = __linear_l1_table + l1_linear_offset(SH_LINEAR_PT_VIRT_START); + } else { l3_pgentry_t *ml3e; ml3e = hap_map_domain_page( pagetable_get_mfn(v->arch.monitor_table)); ASSERT(l3e_get_flags(ml3e[3]) & _PAGE_PRESENT); ml2e = hap_map_domain_page(_mfn(l3e_get_pfn(ml3e[3]))); + + sh_l2_index = l2_table_offset(SH_LINEAR_PT_VIRT_START); + ASSERT(l2e_get_flags(ml2e[sh_l2_index]) & _PAGE_PRESENT); + sh_ml1e = hap_map_domain_page(_mfn(l2e_get_pfn(ml2e[sh_l2_index]))); + sh_ml1e += l1_table_offset(SH_LINEAR_PT_VIRT_START); + ml2e += l2_table_offset(RO_MPT_VIRT_START); hap_unmap_domain_page(ml3e); } ml2e[index] = l2e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR); - if ( v != current ) + sh_ml1e[index] = l1e_from_pfn(l3e_get_pfn(*l3e), __PAGE_HYPERVISOR); + if ( v != current ) { hap_unmap_domain_page(ml2e); + hap_unmap_domain_page(sh_ml1e); + } } } #endif diff -r 3191627e5ad6 xen/arch/x86/mm/p2m.c --- a/xen/arch/x86/mm/p2m.c Wed Oct 31 16:21:18 2007 +0000 +++ b/xen/arch/x86/mm/p2m.c Thu Nov 15 03:54:42 2007 -0600 @@ -149,9 +149,12 @@ p2m_next_level(struct domain *d, mfn_t * unsigned long *gfn_remainder, unsigned long gfn, u32 shift, u32 max, unsigned long type) { + l1_pgentry_t *l1_entry; l1_pgentry_t *p2m_entry; l1_pgentry_t new_entry; void *next; + int i; + ASSERT(d->arch.p2m.alloc_page); if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn, @@ -192,6 +195,36 @@ p2m_next_level(struct domain *d, mfn_t * break; } } + + ASSERT(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT); + + /* split single large page into 4KB page in P2M table */ + if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) ) + { + struct page_info *pg = d->arch.p2m.alloc_page(d); + if ( pg == NULL ) + return 0; + list_add_tail(&pg->list, &d->arch.p2m.pages); + pg->u.inuse.type_info = PGT_l1_page_table | 1 | PGT_validated; + pg->count_info = 1; + + l1_entry = map_domain_page(mfn_x(page_to_mfn(pg))); + for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) + { + mfn_t mfn = _mfn(l1e_get_pfn(*p2m_entry) + i); + new_entry = l1e_from_pfn(mfn_x(mfn), + __PAGE_HYPERVISOR|_PAGE_USER); + paging_write_p2m_entry(d, gfn, + l1_entry+i, *table_mfn, new_entry, 1); + } + unmap_domain_page(l1_entry); + + new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), + __PAGE_HYPERVISOR|_PAGE_USER); + paging_write_p2m_entry(d, gfn, + p2m_entry, *table_mfn, new_entry, 2); + } + *table_mfn = _mfn(l1e_get_pfn(*p2m_entry)); next = map_domain_page(mfn_x(*table_mfn)); unmap_domain_page(*table); @@ -202,14 +235,16 @@ p2m_next_level(struct domain *d, mfn_t * // Returns 0 on error (out of memory) static int -set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt) +set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, + unsigned int order, p2m_type_t p2mt) { // XXX -- this might be able to be faster iff current->domain == d mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table); void *table =map_domain_page(mfn_x(table_mfn)); unsigned long gfn_remainder = gfn; - l1_pgentry_t *p2m_entry; + l1_pgentry_t *p2m_entry = NULL; l1_pgentry_t entry_content; + l2_pgentry_t l2e_content; int rv=0; #if CONFIG_PAGING_LEVELS >= 4 
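For readers of the p2m changes here: with 4KB base pages, an order-9 (PAGE_SIZE_ORDER_2M) extent spans 512 contiguous frames. That is the arithmetic behind the superpage split added to p2m_next_level() above and the 2MB branch added to set_p2m_entry() in the next hunk. A minimal standalone sketch of that decomposition follows; it is illustrative only, and the helper name and example frame numbers are invented for the sketch.

/* Sketch: how one 2MB PSE p2m entry decomposes into 512 contiguous 4KB
 * entries, mirroring the split loop added to p2m_next_level(). */
#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE_ORDER_2M  9                            /* as defined by this patch in asm-x86/mm.h */
#define ENTRIES_PER_2MB     (1UL << PAGE_SIZE_ORDER_2M)  /* 512 4KB frames per 2MB extent */

static void split_2mb_mapping(unsigned long gfn_base, unsigned long mfn_base)
{
    unsigned long i;

    /* A 2MB mapping must be 2MB-aligned in both frame-number spaces. */
    assert((gfn_base & (ENTRIES_PER_2MB - 1)) == 0);
    assert((mfn_base & (ENTRIES_PER_2MB - 1)) == 0);

    /* Each new l1 entry maps the next machine frame of the superpage. */
    for ( i = 0; i < ENTRIES_PER_2MB; i++ )
        printf("l1e[%3lu]: gfn %#lx -> mfn %#lx\n", i, gfn_base + i, mfn_base + i);
}

int main(void)
{
    split_2mb_mapping(0x200, 0x7b400);   /* arbitrary, 2MB-aligned example frames */
    return 0;
}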
@@ -234,30 +269,59 @@ set_p2m_entry(struct domain *d, unsigned PGT_l2_page_table) ) goto out; #endif - if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn, - L2_PAGETABLE_SHIFT - PAGE_SHIFT, - L2_PAGETABLE_ENTRIES, PGT_l1_page_table) ) - goto out; - - p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn, - 0, L1_PAGETABLE_ENTRIES); - ASSERT(p2m_entry); + + if ( order == PAGE_SIZE_ORDER_2M ) + { + p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn, + L2_PAGETABLE_SHIFT - PAGE_SHIFT, + L2_PAGETABLE_ENTRIES); + ASSERT(p2m_entry); + + if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) && + !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) ) + { + P2M_ERROR("configure P2M table 4KB L2 entry with large page\n"); + domain_crash(d); + goto out; + } + + if ( mfn_valid(mfn) ) + l2e_content = l2e_from_pfn(mfn_x(mfn), + p2m_type_to_flags(p2mt) | _PAGE_PSE); + else + l2e_content = l2e_empty(); + + entry_content.l1 = l2e_content.l2; + + paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 2); + } + else if ( order == PAGE_SIZE_ORDER_4K ) + { + if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn, + L2_PAGETABLE_SHIFT - PAGE_SHIFT, + L2_PAGETABLE_ENTRIES, PGT_l1_page_table) ) + goto out; + + p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn, + 0, L1_PAGETABLE_ENTRIES); + ASSERT(p2m_entry); + + if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) ) + entry_content = l1e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt)); + else + entry_content = l1e_empty(); + + /* level 1 entry */ + paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 1); + } + + if ( vtd_enabled && (p2mt == p2m_mmio_direct) && is_hvm_domain(d) ) + iommu_flush(d, gfn, (u64*)p2m_entry); /* Track the highest gfn for which we have ever had a valid mapping */ if ( mfn_valid(mfn) && (gfn > d->arch.p2m.max_mapped_pfn) ) d->arch.p2m.max_mapped_pfn = gfn; - if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) ) - entry_content = l1e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt)); - else - entry_content = l1e_empty(); - - /* level 1 entry */ - paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 1); - - if ( vtd_enabled && (p2mt == p2m_mmio_direct) && is_hvm_domain(d) ) - iommu_flush(d, gfn, (u64*)p2m_entry); - /* Success */ rv = 1; @@ -267,14 +331,11 @@ set_p2m_entry(struct domain *d, unsigned } -/* Init the datastructures for later use by the p2m code */ -void p2m_init(struct domain *d) -{ - p2m_lock_init(d); - INIT_LIST_HEAD(&d->arch.p2m.pages); -} - - + +/************************************************/ +/* Completed Functions */ +/************************************************/ +/*********** SUPPORTIVE FUNCTIONS ************/ // Allocate a new p2m table for a domain. // // The structure of the p2m table is that of a pagetable for xen (i.e. it is @@ -290,11 +351,11 @@ int p2m_alloc_table(struct domain *d, void (*free_page)(struct domain *d, struct page_info *pg)) { - mfn_t mfn; + mfn_t mfn = 0; struct list_head *entry; struct page_info *page, *p2m_top; unsigned int page_count = 0; - unsigned long gfn; + unsigned long gfn = 0; p2m_lock(d); @@ -334,7 +395,8 @@ int p2m_alloc_table(struct domain *d, P2M_PRINTK("populating p2m table\n"); /* Initialise physmap tables for slot zero. Other code assumes this. 
*/ - if ( !set_p2m_entry(d, 0, _mfn(INVALID_MFN), p2m_invalid) ) + if ( !set_p2m_entry(d, 0, _mfn(INVALID_MFN), PAGE_SIZE_ORDER_4K, + p2m_invalid) ) goto error; /* Copy all existing mappings from the page list and m2p */ @@ -353,7 +415,7 @@ int p2m_alloc_table(struct domain *d, (gfn != 0x55555555L) #endif && gfn != INVALID_M2P_ENTRY - && !set_p2m_entry(d, gfn, mfn, p2m_ram_rw) ) + && !set_p2m_entry(d, gfn, mfn, PAGE_SIZE_ORDER_4K, p2m_ram_rw) ) goto error; } @@ -371,109 +433,6 @@ int p2m_alloc_table(struct domain *d, PRI_mfn "\n", gfn, mfn_x(mfn)); p2m_unlock(d); return -ENOMEM; -} - -void p2m_teardown(struct domain *d) -/* Return all the p2m pages to Xen. - * We know we don't have any extra mappings to these pages */ -{ - struct list_head *entry, *n; - struct page_info *pg; - - p2m_lock(d); - d->arch.phys_table = pagetable_null(); - - list_for_each_safe(entry, n, &d->arch.p2m.pages) - { - pg = list_entry(entry, struct page_info, list); - list_del(entry); - d->arch.p2m.free_page(d, pg); - } - p2m_unlock(d); -} - -mfn_t -gfn_to_mfn_foreign(struct domain *d, unsigned long gfn, p2m_type_t *t) -/* Read another domain's p2m entries */ -{ - mfn_t mfn; - paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT; - l2_pgentry_t *l2e; - l1_pgentry_t *l1e; - - ASSERT(paging_mode_translate(d)); - - /* XXX This is for compatibility with the old model, where anything not - * XXX marked as RAM was considered to be emulated MMIO space. - * XXX Once we start explicitly registering MMIO regions in the p2m - * XXX we will return p2m_invalid for unmapped gfns */ - *t = p2m_mmio_dm; - - mfn = pagetable_get_mfn(d->arch.phys_table); - - if ( gfn > d->arch.p2m.max_mapped_pfn ) - /* This pfn is higher than the highest the p2m map currently holds */ - return _mfn(INVALID_MFN); - -#if CONFIG_PAGING_LEVELS >= 4 - { - l4_pgentry_t *l4e = map_domain_page(mfn_x(mfn)); - l4e += l4_table_offset(addr); - if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 ) - { - unmap_domain_page(l4e); - return _mfn(INVALID_MFN); - } - mfn = _mfn(l4e_get_pfn(*l4e)); - unmap_domain_page(l4e); - } -#endif -#if CONFIG_PAGING_LEVELS >= 3 - { - l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn)); -#if CONFIG_PAGING_LEVELS == 3 - /* On PAE hosts the p2m has eight l3 entries, not four (see - * shadow_set_p2m_entry()) so we can't use l3_table_offset. - * Instead, just count the number of l3es from zero. It's safe - * to do this because we already checked that the gfn is within - * the bounds of the p2m. */ - l3e += (addr >> L3_PAGETABLE_SHIFT); -#else - l3e += l3_table_offset(addr); -#endif - if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 ) - { - unmap_domain_page(l3e); - return _mfn(INVALID_MFN); - } - mfn = _mfn(l3e_get_pfn(*l3e)); - unmap_domain_page(l3e); - } -#endif - - l2e = map_domain_page(mfn_x(mfn)); - l2e += l2_table_offset(addr); - if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 ) - { - unmap_domain_page(l2e); - return _mfn(INVALID_MFN); - } - mfn = _mfn(l2e_get_pfn(*l2e)); - unmap_domain_page(l2e); - - l1e = map_domain_page(mfn_x(mfn)); - l1e += l1_table_offset(addr); - if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 ) - { - unmap_domain_page(l1e); - return _mfn(INVALID_MFN); - } - mfn = _mfn(l1e_get_pfn(*l1e)); - *t = p2m_flags_to_type(l1e_get_flags(*l1e)); - unmap_domain_page(l1e); - - ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t)); - return (p2m_is_valid(*t)) ? 
mfn : _mfn(INVALID_MFN); } #if P2M_AUDIT @@ -614,6 +573,29 @@ static void audit_p2m(struct domain *d) gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT); continue; } + + /* check for large page */ + if ( l2e_get_flags(l2e[i2]) & _PAGE_PSE ) + { + mfn = l2e_get_pfn(l2e[i2]); + ASSERT(mfn_valid(_mfn(mfn))); + for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++) + { + m2pfn = get_gpfn_from_mfn(mfn+i1); + if ( m2pfn != (gfn + i) ) + { + pmbad++; + P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx" + " -> gfn %#lx\n", gfn+i, mfn+i, + m2pfn); + BUG(); + } + } + + gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT); + continue; + } + l1e = map_domain_page(mfn_x(_mfn(l2e_get_pfn(l2e[i2])))); for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ ) @@ -664,38 +646,32 @@ static void audit_p2m(struct domain *d) #define audit_p2m(_d) do { (void)(_d); } while(0) #endif /* P2M_AUDIT */ - - static void -p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn) -{ +p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn, + unsigned int order) +{ + int i; if ( !paging_mode_translate(d) ) return; - P2M_DEBUG("removing gfn=%#lx mfn=%#lx\n", gfn, mfn); - - set_p2m_entry(d, gfn, _mfn(INVALID_MFN), p2m_invalid); - set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY); -} - -void -guest_physmap_remove_page(struct domain *d, unsigned long gfn, - unsigned long mfn) -{ - p2m_lock(d); - audit_p2m(d); - p2m_remove_page(d, gfn, mfn); - audit_p2m(d); - p2m_unlock(d); + P2M_DEBUG("removing gfn=%#lx mfn=%#lx, order=%d\n", gfn, mfn, order); + + set_p2m_entry(d, gfn, _mfn(INVALID_MFN), order, p2m_invalid); + for (i = 0; i < (1UL << order); i++ ) + set_gpfn_from_mfn(mfn+i, INVALID_M2P_ENTRY); } int guest_physmap_add_entry(struct domain *d, unsigned long gfn, - unsigned long mfn, p2m_type_t t) + unsigned long mfn, unsigned int order, p2m_type_t t) { unsigned long ogfn; p2m_type_t ot; mfn_t omfn; int rc = 0; + int i; + + /* make sure gfn and mfn are aligned at order boundary */ + ASSERT( !(gfn & ((1UL< mfn %#lx\n", ogfn , mfn_x(omfn)); if ( mfn_x(omfn) == mfn ) - p2m_remove_page(d, ogfn, mfn); - } - } - - if ( mfn_valid(_mfn(mfn)) ) - { - if ( !set_p2m_entry(d, gfn, _mfn(mfn), t) ) + p2m_remove_page(d, ogfn, mfn, order); + } + } + + if ( mfn_valid(_mfn(mfn)) ) + { + if ( !set_p2m_entry(d, gfn, _mfn(mfn), order, t) ) rc = -EINVAL; - set_gpfn_from_mfn(mfn, gfn); + for ( i = 0; i < (1UL << order); i++) + set_gpfn_from_mfn(mfn+i, gfn+i); } else { gdprintk(XENLOG_WARNING, "Adding bad mfn to p2m map (%#lx -> %#lx)\n", gfn, mfn); - if ( !set_p2m_entry(d, gfn, _mfn(INVALID_MFN), p2m_invalid) ) + if ( !set_p2m_entry(d, gfn, _mfn(INVALID_MFN), order, p2m_invalid) ) rc = -EINVAL; } @@ -774,7 +752,7 @@ void p2m_change_type_global(struct domai l1_pgentry_t l1e_content; l1_pgentry_t *l1e; l2_pgentry_t *l2e; - mfn_t l1mfn; + mfn_t l1mfn, l2mfn; int i1, i2; #if CONFIG_PAGING_LEVELS >= 3 l3_pgentry_t *l3e; @@ -819,12 +797,27 @@ void p2m_change_type_global(struct domai { continue; } + l2mfn = _mfn(l3e_get_pfn(l3e[i3])); l2e = map_domain_page(l3e_get_pfn(l3e[i3])); #endif /* all levels... 
*/ for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ ) { if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) ) { + continue; + } + + if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE) ) + { + flags = l2e_get_flags(l2e[i2]); + if ( p2m_flags_to_type(flags) != ot ) + continue; + mfn = l2e_get_pfn(l2e[i2]); + gfn = get_gpfn_from_mfn(mfn); + flags = p2m_flags_to_type(nt); + l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE); + paging_write_p2m_entry(d, gfn, (l1_pgentry_t *)&l2e[i2], + l2mfn, l1e_content, 2); continue; } @@ -878,7 +871,7 @@ p2m_type_t p2m_change_type(struct domain mfn = gfn_to_mfn(d, gfn, &pt); if ( pt == ot ) - set_p2m_entry(d, gfn, mfn, nt); + set_p2m_entry(d, gfn, mfn, PAGE_SIZE_ORDER_4K, nt); p2m_unlock(d); @@ -899,10 +892,11 @@ set_mmio_p2m_entry(struct domain *d, uns if ( p2m_is_ram(ot) ) { ASSERT(mfn_valid(omfn)); + /* 4K page modification. So we don't need to check page order */ set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY); } - rc = set_p2m_entry(d, gfn, mfn, p2m_mmio_direct); + rc = set_p2m_entry(d, gfn, mfn, PAGE_SIZE_ORDER_4K, p2m_mmio_direct); if ( 0 == rc ) gdprintk(XENLOG_ERR, "set_mmio_p2m_entry: set_p2m_entry failed! mfn=%08lx\n", @@ -926,10 +920,148 @@ clear_mmio_p2m_entry(struct domain *d, u "clear_mmio_p2m_entry: gfn_to_mfn failed! gfn=%08lx\n", gfn); return 0; } - rc = set_p2m_entry(d, gfn, _mfn(INVALID_MFN), 0); + rc = set_p2m_entry(d, gfn, _mfn(INVALID_MFN), PAGE_SIZE_ORDER_4K, 0); return rc; } + +/************************************************/ +/* PUBLIC INTERFACE FUNCTIONS */ +/************************************************/ +/* Init the datastructures for later use by the p2m code */ +void p2m_init(struct domain *d) +{ + p2m_lock_init(d); + INIT_LIST_HEAD(&d->arch.p2m.pages); +} + +void p2m_teardown(struct domain *d) +/* Return all the p2m pages to Xen. + * We know we don't have any extra mappings to these pages */ +{ + struct list_head *entry, *n; + struct page_info *pg; + + p2m_lock(d); + d->arch.phys_table = pagetable_null(); + + list_for_each_safe(entry, n, &d->arch.p2m.pages) + { + pg = list_entry(entry, struct page_info, list); + list_del(entry); + d->arch.p2m.free_page(d, pg); + } + p2m_unlock(d); +} + +mfn_t +gfn_to_mfn_foreign(struct domain *d, unsigned long gfn, p2m_type_t *t) +/* Read another domain's p2m entries */ +{ + mfn_t mfn; + paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT; + l2_pgentry_t *l2e; + l1_pgentry_t *l1e; + + ASSERT(paging_mode_translate(d)); + + /* XXX This is for compatibility with the old model, where anything not + * XXX marked as RAM was considered to be emulated MMIO space. + * XXX Once we start explicitly registering MMIO regions in the p2m + * XXX we will return p2m_invalid for unmapped gfns */ + *t = p2m_mmio_dm; + + mfn = pagetable_get_mfn(d->arch.phys_table); + + if ( gfn > d->arch.p2m.max_mapped_pfn ) + /* This pfn is higher than the highest the p2m map currently holds */ + return _mfn(INVALID_MFN); + +#if CONFIG_PAGING_LEVELS >= 4 + { + l4_pgentry_t *l4e = map_domain_page(mfn_x(mfn)); + l4e += l4_table_offset(addr); + if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 ) + { + unmap_domain_page(l4e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l4e_get_pfn(*l4e)); + unmap_domain_page(l4e); + } +#endif +#if CONFIG_PAGING_LEVELS >= 3 + { + l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn)); +#if CONFIG_PAGING_LEVELS == 3 + /* On PAE hosts the p2m has eight l3 entries, not four (see + * shadow_set_p2m_entry()) so we can't use l3_table_offset. + * Instead, just count the number of l3es from zero. 
It's safe + * to do this because we already checked that the gfn is within + * the bounds of the p2m. */ + l3e += (addr >> L3_PAGETABLE_SHIFT); +#else + l3e += l3_table_offset(addr); +#endif + if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 ) + { + unmap_domain_page(l3e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l3e_get_pfn(*l3e)); + unmap_domain_page(l3e); + } +#endif + + l2e = map_domain_page(mfn_x(mfn)); + l2e += l2_table_offset(addr); + if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 ) + { + unmap_domain_page(l2e); + return _mfn(INVALID_MFN); + } + else if ( (l2e_get_flags(*l2e) & _PAGE_PSE) ) + { + mfn = _mfn(l2e_get_pfn(*l2e) + l1_table_offset(addr)); + *t = p2m_flags_to_type(l2e_get_flags(*l2e)); + unmap_domain_page(l2e); + + ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t)); + return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN); + } + + mfn = _mfn(l2e_get_pfn(*l2e)); + unmap_domain_page(l2e); + + l1e = map_domain_page(mfn_x(mfn)); + l1e += l1_table_offset(addr); + if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 ) + { + unmap_domain_page(l1e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l1e_get_pfn(*l1e)); + *t = p2m_flags_to_type(l1e_get_flags(*l1e)); + unmap_domain_page(l1e); + + ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t)); + return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN); +} + +void +guest_physmap_remove_page(struct domain *d, unsigned long gfn, + unsigned long mfn, unsigned int order) +{ + p2m_lock(d); + audit_p2m(d); + p2m_remove_page(d, gfn, mfn, order); + audit_p2m(d); + p2m_unlock(d); +} + +/*************************************************/ +/* INCOMPLETE FUNCTIONS */ +/*************************************************/ /* * Local variables: diff -r 3191627e5ad6 xen/common/grant_table.c --- a/xen/common/grant_table.c Wed Oct 31 16:21:18 2007 +0000 +++ b/xen/common/grant_table.c Wed Nov 07 11:13:23 2007 -0600 @@ -1128,7 +1128,7 @@ gnttab_transfer( spin_lock(&e->grant_table->lock); sha = &shared_entry(e->grant_table, gop.ref); - guest_physmap_add_page(e, sha->frame, mfn); + guest_physmap_add_page(e, sha->frame, mfn, PAGE_SIZE_ORDER_4K); sha->frame = mfn; wmb(); sha->flags |= GTF_transfer_completed; diff -r 3191627e5ad6 xen/common/memory.c --- a/xen/common/memory.c Wed Oct 31 16:21:18 2007 +0000 +++ b/xen/common/memory.c Tue Nov 13 06:33:56 2007 -0600 @@ -119,30 +119,55 @@ static void populate_physmap(struct memo page = __alloc_domheap_pages(d, cpu, a->extent_order, a->memflags); if ( unlikely(page == NULL) ) { - gdprintk(XENLOG_INFO, "Could not allocate order=%d extent: " - "id=%d memflags=%x (%ld of %d)\n", - a->extent_order, d->domain_id, a->memflags, - i, a->nr_extents); - goto out; - } - - mfn = page_to_mfn(page); - - if ( unlikely(paging_mode_translate(d)) ) - { - for ( j = 0; j < (1 << a->extent_order); j++ ) - if ( guest_physmap_add_page(d, gpfn + j, mfn + j) ) - goto out; - } - else - { - for ( j = 0; j < (1 << a->extent_order); j++ ) - set_gpfn_from_mfn(mfn + j, gpfn + j); - - /* Inform the domain of the new page's machine address. 
*/ - if ( unlikely(__copy_to_guest_offset(a->extent_list, i, &mfn, 1)) ) - goto out; - } + /* fail if it is not translate mode */ + if ( !paging_mode_translate(d) ) + { + gdprintk(XENLOG_INFO, "Could not allocate order=%d extent:" + "id=%d memflags=%x (%ld of %d)\n", + a->extent_order, d->domain_id, a->memflags, + i, a->nr_extents); + goto out; + } + + /* try to do allocation using 4KB page instead */ + for ( j = 0; j < (1 << a->extent_order); j++ ) + { + page = __alloc_domheap_pages(d, cpu, 0, a->memflags); + if ( page == NULL ) + { + gdprintk(XENLOG_INFO, "Could not allocate order=%d extent:" + "id=%d memflags=%x (%ld of %d)\n", + 0, d->domain_id, a->memflags, + i, a->nr_extents); + goto out; + } + + mfn = page_to_mfn(page); + + if ( guest_physmap_add_page(d, gpfn+j, mfn, 0) ) + goto out; + } + } + else /* sucessful in allocating page of extent_order */ + { + mfn = page_to_mfn(page); + + if ( unlikely(paging_mode_translate(d)) ) + { + if ( guest_physmap_add_page(d, gpfn, mfn, a->extent_order) ) + goto out; + } + else + { + for ( j = 0; j < (1 << a->extent_order); j++ ) + set_gpfn_from_mfn(mfn + j, gpfn + j); + + /* Inform the domain of the new page's machine address. */ + if ( unlikely(__copy_to_guest_offset(a->extent_list, i, + &mfn, 1)) ) + goto out; + } + } } out: @@ -175,7 +200,7 @@ int guest_remove_page(struct domain *d, if ( test_and_clear_bit(_PGC_allocated, &page->count_info) ) put_page(page); - guest_physmap_remove_page(d, gmfn, mfn); + guest_physmap_remove_page(d, gmfn, mfn, PAGE_SIZE_ORDER_4K); put_page(page); @@ -425,7 +450,8 @@ static long memory_exchange(XEN_GUEST_HA if ( !test_and_clear_bit(_PGC_allocated, &page->count_info) ) BUG(); mfn = page_to_mfn(page); - guest_physmap_remove_page(d, mfn_to_gmfn(d, mfn), mfn); + guest_physmap_remove_page(d, mfn_to_gmfn(d, mfn), mfn, + PAGE_SIZE_ORDER_4K); put_page(page); } @@ -447,8 +473,8 @@ static long memory_exchange(XEN_GUEST_HA if ( unlikely(paging_mode_translate(d)) ) { /* Ignore failure here. There's nothing we can do. */ - for ( k = 0; k < (1UL << exch.out.extent_order); k++ ) - (void)guest_physmap_add_page(d, gpfn + k, mfn + k); + (void)guest_physmap_add_page(d, gpfn, mfn, + exch.out.extent_order); } else { diff -r 3191627e5ad6 xen/include/asm-x86/mm.h --- a/xen/include/asm-x86/mm.h Wed Oct 31 16:21:18 2007 +0000 +++ b/xen/include/asm-x86/mm.h Wed Nov 07 11:12:56 2007 -0600 @@ -128,6 +128,11 @@ static inline u32 pickle_domptr(struct d #else #define SHADOW_MAX_ORDER 2 /* Need up to 16k allocs for 32-bit on PAE/64 */ #endif + +/* the order of continuously allocated page frame */ +#define PAGE_SIZE_ORDER_4K 0 +#define PAGE_SIZE_ORDER_2M 9 +#define PAGE_SIZE_ORDER_4M 10 #define page_get_owner(_p) (unpickle_domptr((_p)->u.inuse._domain)) #define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d)) diff -r 3191627e5ad6 xen/include/asm-x86/p2m.h --- a/xen/include/asm-x86/p2m.h Wed Oct 31 16:21:18 2007 +0000 +++ b/xen/include/asm-x86/p2m.h Tue Nov 13 05:10:11 2007 -0600 @@ -42,7 +42,7 @@ * paging_mode_external() guests this mapping is in the monitor table.) */ #define phys_to_machine_mapping ((l1_pgentry_t *)RO_MPT_VIRT_START) - +#define phys_to_machine_large_page_mapping ((l2_pgentry_t *)SH_LINEAR_PT_VIRT_START) /* * The upper levels of the p2m pagetable always contain full rights; all * variation in the access control bits is made in the level-1 PTEs. 
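The phys_to_machine_large_page_mapping definition added above gives the guest-current lookup path (gfn_to_mfn_current(), next hunk) a linear view of the p2m's L2 entries, so a 2MB superpage can be resolved without walking the table. A short standalone sketch of the index/offset arithmetic it relies on, assuming the usual 4KB pages and 512-entry page tables (constants restated locally for illustration):

/* Sketch: decomposing a gfn the way the gfn_to_mfn_current() change does.
 * The real code reads an l2e through phys_to_machine_large_page_mapping and,
 * if _PAGE_PSE is set, returns l2e_pfn plus the low 9 bits of the gfn. */
#include <stdio.h>

#define PAGE_SHIFT          12
#define L2_PAGETABLE_SHIFT  21
#define L1_ENTRIES          512

int main(void)
{
    unsigned long gfn = 0x12345;

    /* Index of the l2e (potential 2MB superpage) covering this gfn. */
    unsigned long l2_index = gfn >> (L2_PAGETABLE_SHIFT - PAGE_SHIFT);

    /* Offset of the gfn within that 2MB region. */
    unsigned long offset = gfn & (L1_ENTRIES - 1);

    printf("gfn %#lx -> l2 index %#lx, offset within superpage %#lx\n",
           gfn, l2_index, offset);
    return 0;
}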
@@ -98,6 +98,7 @@ static inline mfn_t gfn_to_mfn_current(u { mfn_t mfn = _mfn(INVALID_MFN); p2m_type_t p2mt = p2m_mmio_dm; + paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT; /* XXX This is for compatibility with the old model, where anything not * XXX marked as RAM was considered to be emulated MMIO space. * XXX Once we start explicitly registering MMIO regions in the p2m @@ -106,28 +107,44 @@ static inline mfn_t gfn_to_mfn_current(u if ( gfn <= current->domain->arch.p2m.max_mapped_pfn ) { l1_pgentry_t l1e = l1e_empty(); - int ret; + l2_pgentry_t l2e = l2e_empty(); + int ret, index; ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START) / sizeof(l1_pgentry_t)); /* Need to __copy_from_user because the p2m is sparse and this * part might not exist */ - ret = __copy_from_user(&l1e, - &phys_to_machine_mapping[gfn], - sizeof(l1e)); - - if ( ret == 0 ) { - p2mt = p2m_flags_to_type(l1e_get_flags(l1e)); - ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt)); + index = gfn >> (L2_PAGETABLE_SHIFT - L1_PAGETABLE_SHIFT); + ret = __copy_from_user(&l2e, + &phys_to_machine_large_page_mapping[index], + sizeof(l2e)); + + if ( (ret == 0) && (l2e_get_flags(l2e) & _PAGE_PSE) ) { + p2mt = p2m_flags_to_type(l2e_get_flags(l2e)); + ASSERT(l2e_get_pfn(l2e) != INVALID_MFN || !p2m_is_ram(p2mt)); if ( p2m_is_valid(p2mt) ) - mfn = _mfn(l1e_get_pfn(l1e)); - else - /* XXX see above */ + mfn = _mfn(l2e_get_pfn(l2e) + l1_table_offset(addr)); + else p2mt = p2m_mmio_dm; } + else { + ret = __copy_from_user(&l1e, + &phys_to_machine_mapping[gfn], + sizeof(l1e)); + + if ( ret == 0 ) { + p2mt = p2m_flags_to_type(l1e_get_flags(l1e)); + ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt)); + if ( p2m_is_valid(p2mt) ) + mfn = _mfn(l1e_get_pfn(l1e)); + else + /* XXX see above */ + p2mt = p2m_mmio_dm; + } + } } - + *t = p2mt; return mfn; } @@ -202,21 +219,22 @@ void p2m_teardown(struct domain *d); /* Add a page to a domain's p2m table */ int guest_physmap_add_entry(struct domain *d, unsigned long gfn, - unsigned long mfn, p2m_type_t t); + unsigned long mfn, unsigned int order, + p2m_type_t t); /* Untyped version for RAM only, for compatibility * * Return 0 for success */ static inline int guest_physmap_add_page(struct domain *d, unsigned long gfn, - unsigned long mfn) -{ - return guest_physmap_add_entry(d, gfn, mfn, p2m_ram_rw); + unsigned long mfn, unsigned int order) +{ + return guest_physmap_add_entry(d, gfn, mfn, order, p2m_ram_rw); } /* Remove a page from a domain's p2m table */ void guest_physmap_remove_page(struct domain *d, unsigned long gfn, - unsigned long mfn); + unsigned long mfn, unsigned int order); /* Change types across all p2m entries in a domain */ void p2m_change_type_global(struct domain *d, p2m_type_t ot, p2m_type_t nt); diff -r 3191627e5ad6 xen/include/xen/lib.h --- a/xen/include/xen/lib.h Wed Oct 31 16:21:18 2007 +0000 +++ b/xen/include/xen/lib.h Wed Nov 07 10:03:15 2007 -0600 @@ -45,7 +45,7 @@ struct domain; void cmdline_parse(char *cmdline); -/*#define DEBUG_TRACE_DUMP*/ +#define DEBUG_TRACE_DUMP #ifdef DEBUG_TRACE_DUMP extern void debugtrace_dump(void); extern void debugtrace_printk(const char *fmt, ...); diff -r 3191627e5ad6 xen/arch/x86/mm/p2m-orig.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/arch/x86/mm/p2m-orig.c Wed Nov 07 09:45:39 2007 -0600 @@ -0,0 +1,941 @@ +/****************************************************************************** + * arch/x86/mm/p2m.c + * + * physical-to-machine mappings for automatically-translated domains. 
+ * + * Parts of this code are Copyright (c) 2007 by Advanced Micro Devices. + * Parts of this code are Copyright (c) 2006-2007 by XenSource Inc. + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include + +/* Debugging and auditing of the P2M code? */ +#define P2M_AUDIT 0 +#define P2M_DEBUGGING 1 + +/* + * The P2M lock. This protects all updates to the p2m table. + * Updates are expected to be safe against concurrent reads, + * which do *not* require the lock. + * + * Locking discipline: always acquire this lock before the shadow or HAP one + */ + +#define p2m_lock_init(_d) \ + do { \ + spin_lock_init(&(_d)->arch.p2m.lock); \ + (_d)->arch.p2m.locker = -1; \ + (_d)->arch.p2m.locker_function = "nobody"; \ + } while (0) + +#define p2m_lock(_d) \ + do { \ + if ( unlikely((_d)->arch.p2m.locker == current->processor) )\ + { \ + printk("Error: p2m lock held by %s\n", \ + (_d)->arch.p2m.locker_function); \ + BUG(); \ + } \ + spin_lock(&(_d)->arch.p2m.lock); \ + ASSERT((_d)->arch.p2m.locker == -1); \ + (_d)->arch.p2m.locker = current->processor; \ + (_d)->arch.p2m.locker_function = __func__; \ + } while (0) + +#define p2m_unlock(_d) \ + do { \ + ASSERT((_d)->arch.p2m.locker == current->processor); \ + (_d)->arch.p2m.locker = -1; \ + (_d)->arch.p2m.locker_function = "nobody"; \ + spin_unlock(&(_d)->arch.p2m.lock); \ + } while (0) + + + +/* Printouts */ +#define P2M_PRINTK(_f, _a...) \ + debugtrace_printk("p2m: %s(): " _f, __func__, ##_a) +#define P2M_ERROR(_f, _a...) \ + printk("pg error: %s(): " _f, __func__, ##_a) +#if P2M_DEBUGGING +#define P2M_DEBUG(_f, _a...) \ + debugtrace_printk("p2mdebug: %s(): " _f, __func__, ##_a) +#else +#define P2M_DEBUG(_f, _a...) do { (void)(_f); } while(0) +#endif + + +/* Override macros from asm/page.h to make them work with mfn_t */ +#undef mfn_to_page +#define mfn_to_page(_m) (frame_table + mfn_x(_m)) +#undef mfn_valid +#define mfn_valid(_mfn) (mfn_x(_mfn) < max_page) +#undef page_to_mfn +#define page_to_mfn(_pg) (_mfn((_pg) - frame_table)) + + +/* PTE flags for the various types of p2m entry */ +#define P2M_BASE_FLAGS \ + (_PAGE_PRESENT | _PAGE_USER | _PAGE_DIRTY | _PAGE_ACCESSED) + +static unsigned long p2m_type_to_flags(p2m_type_t t) +{ + unsigned long flags = (t & 0x7UL) << 9; + switch(t) + { + case p2m_invalid: + default: + return flags; + case p2m_ram_rw: + return flags | P2M_BASE_FLAGS | _PAGE_RW; + case p2m_ram_logdirty: + return flags | P2M_BASE_FLAGS; + case p2m_ram_ro: + return flags | P2M_BASE_FLAGS; + case p2m_mmio_dm: + return flags; + case p2m_mmio_direct: + return flags | P2M_BASE_FLAGS | _PAGE_RW | _PAGE_PCD; + } +} + + +// Find the next level's P2M entry, checking for out-of-range gfn's... 
+// Returns NULL on error. +// +static l1_pgentry_t * +p2m_find_entry(void *table, unsigned long *gfn_remainder, + unsigned long gfn, u32 shift, u32 max) +{ + u32 index; + + index = *gfn_remainder >> shift; + if ( index >= max ) + { + P2M_DEBUG("gfn=0x%lx out of range " + "(gfn_remainder=0x%lx shift=%d index=0x%x max=0x%x)\n", + gfn, *gfn_remainder, shift, index, max); + return NULL; + } + *gfn_remainder &= (1 << shift) - 1; + return (l1_pgentry_t *)table + index; +} + +// Walk one level of the P2M table, allocating a new table if required. +// Returns 0 on error. +// +static int +p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table, + unsigned long *gfn_remainder, unsigned long gfn, u32 shift, + u32 max, unsigned long type) +{ + l1_pgentry_t *p2m_entry; + l1_pgentry_t new_entry; + void *next; + ASSERT(d->arch.p2m.alloc_page); + + if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn, + shift, max)) ) + return 0; + + if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) ) + { + struct page_info *pg = d->arch.p2m.alloc_page(d); + if ( pg == NULL ) + return 0; + list_add_tail(&pg->list, &d->arch.p2m.pages); + pg->u.inuse.type_info = type | 1 | PGT_validated; + pg->count_info = 1; + + new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), + __PAGE_HYPERVISOR|_PAGE_USER); + + switch ( type ) { + case PGT_l3_page_table: + paging_write_p2m_entry(d, gfn, + p2m_entry, *table_mfn, new_entry, 4); + break; + case PGT_l2_page_table: +#if CONFIG_PAGING_LEVELS == 3 + /* for PAE mode, PDPE only has PCD/PWT/P bits available */ + new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), _PAGE_PRESENT); +#endif + paging_write_p2m_entry(d, gfn, + p2m_entry, *table_mfn, new_entry, 3); + break; + case PGT_l1_page_table: + paging_write_p2m_entry(d, gfn, + p2m_entry, *table_mfn, new_entry, 2); + break; + default: + BUG(); + break; + } + } + *table_mfn = _mfn(l1e_get_pfn(*p2m_entry)); + next = map_domain_page(mfn_x(*table_mfn)); + unmap_domain_page(*table); + *table = next; + + return 1; +} + +// Returns 0 on error (out of memory) +static int +set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt) +{ + // XXX -- this might be able to be faster iff current->domain == d + mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table); + void *table =map_domain_page(mfn_x(table_mfn)); + unsigned long gfn_remainder = gfn; + l1_pgentry_t *p2m_entry; + l1_pgentry_t entry_content; + int rv=0; + +#if CONFIG_PAGING_LEVELS >= 4 + if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn, + L4_PAGETABLE_SHIFT - PAGE_SHIFT, + L4_PAGETABLE_ENTRIES, PGT_l3_page_table) ) + goto out; +#endif +#if CONFIG_PAGING_LEVELS >= 3 + /* + * When using PAE Xen, we only allow 33 bits of pseudo-physical + * address in translated guests (i.e. 8 GBytes). This restriction + * comes from wanting to map the P2M table into the 16MB RO_MPT hole + * in Xen's address space for translated PV guests. + * When using AMD's NPT on PAE Xen, we are restricted to 4GB. + */ + if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn, + L3_PAGETABLE_SHIFT - PAGE_SHIFT, + ((CONFIG_PAGING_LEVELS == 3) + ? (hvm_funcs.hap_supported ? 
4 : 8) + : L3_PAGETABLE_ENTRIES), + PGT_l2_page_table) ) + goto out; +#endif + if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn, + L2_PAGETABLE_SHIFT - PAGE_SHIFT, + L2_PAGETABLE_ENTRIES, PGT_l1_page_table) ) + goto out; + + p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn, + 0, L1_PAGETABLE_ENTRIES); + ASSERT(p2m_entry); + + /* Track the highest gfn for which we have ever had a valid mapping */ + if ( mfn_valid(mfn) && (gfn > d->arch.p2m.max_mapped_pfn) ) + d->arch.p2m.max_mapped_pfn = gfn; + + if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) ) + entry_content = l1e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt)); + else + entry_content = l1e_empty(); + + /* level 1 entry */ + paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 1); + + if ( vtd_enabled && (p2mt == p2m_mmio_direct) && is_hvm_domain(d) ) + iommu_flush(d, gfn, (u64*)p2m_entry); + + /* Success */ + rv = 1; + + out: + unmap_domain_page(table); + return rv; +} + + +/* Init the datastructures for later use by the p2m code */ +void p2m_init(struct domain *d) +{ + p2m_lock_init(d); + INIT_LIST_HEAD(&d->arch.p2m.pages); +} + + +// Allocate a new p2m table for a domain. +// +// The structure of the p2m table is that of a pagetable for xen (i.e. it is +// controlled by CONFIG_PAGING_LEVELS). +// +// The alloc_page and free_page functions will be used to get memory to +// build the p2m, and to release it again at the end of day. +// +// Returns 0 for success or -errno. +// +int p2m_alloc_table(struct domain *d, + struct page_info * (*alloc_page)(struct domain *d), + void (*free_page)(struct domain *d, struct page_info *pg)) + +{ + mfn_t mfn; + struct list_head *entry; + struct page_info *page, *p2m_top; + unsigned int page_count = 0; + unsigned long gfn; + + p2m_lock(d); + + if ( pagetable_get_pfn(d->arch.phys_table) != 0 ) + { + P2M_ERROR("p2m already allocated for this domain\n"); + p2m_unlock(d); + return -EINVAL; + } + + P2M_PRINTK("allocating p2m table\n"); + + d->arch.p2m.alloc_page = alloc_page; + d->arch.p2m.free_page = free_page; + + p2m_top = d->arch.p2m.alloc_page(d); + if ( p2m_top == NULL ) + { + p2m_unlock(d); + return -ENOMEM; + } + list_add_tail(&p2m_top->list, &d->arch.p2m.pages); + + p2m_top->count_info = 1; + p2m_top->u.inuse.type_info = +#if CONFIG_PAGING_LEVELS == 4 + PGT_l4_page_table +#elif CONFIG_PAGING_LEVELS == 3 + PGT_l3_page_table +#elif CONFIG_PAGING_LEVELS == 2 + PGT_l2_page_table +#endif + | 1 | PGT_validated; + + d->arch.phys_table = pagetable_from_mfn(page_to_mfn(p2m_top)); + + P2M_PRINTK("populating p2m table\n"); + + /* Initialise physmap tables for slot zero. Other code assumes this. 
*/ + if ( !set_p2m_entry(d, 0, _mfn(INVALID_MFN), p2m_invalid) ) + goto error; + + /* Copy all existing mappings from the page list and m2p */ + for ( entry = d->page_list.next; + entry != &d->page_list; + entry = entry->next ) + { + page = list_entry(entry, struct page_info, list); + mfn = page_to_mfn(page); + gfn = get_gpfn_from_mfn(mfn_x(mfn)); + page_count++; + if ( +#ifdef __x86_64__ + (gfn != 0x5555555555555555L) +#else + (gfn != 0x55555555L) +#endif + && gfn != INVALID_M2P_ENTRY + && !set_p2m_entry(d, gfn, mfn, p2m_ram_rw) ) + goto error; + } + +#if CONFIG_PAGING_LEVELS >= 3 + if (vtd_enabled && is_hvm_domain(d)) + iommu_set_pgd(d); +#endif + + P2M_PRINTK("p2m table initialised (%u pages)\n", page_count); + p2m_unlock(d); + return 0; + + error: + P2M_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%" + PRI_mfn "\n", gfn, mfn_x(mfn)); + p2m_unlock(d); + return -ENOMEM; +} + +void p2m_teardown(struct domain *d) +/* Return all the p2m pages to Xen. + * We know we don't have any extra mappings to these pages */ +{ + struct list_head *entry, *n; + struct page_info *pg; + + p2m_lock(d); + d->arch.phys_table = pagetable_null(); + + list_for_each_safe(entry, n, &d->arch.p2m.pages) + { + pg = list_entry(entry, struct page_info, list); + list_del(entry); + d->arch.p2m.free_page(d, pg); + } + p2m_unlock(d); +} + +mfn_t +gfn_to_mfn_foreign(struct domain *d, unsigned long gfn, p2m_type_t *t) +/* Read another domain's p2m entries */ +{ + mfn_t mfn; + paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT; + l2_pgentry_t *l2e; + l1_pgentry_t *l1e; + + ASSERT(paging_mode_translate(d)); + + /* XXX This is for compatibility with the old model, where anything not + * XXX marked as RAM was considered to be emulated MMIO space. + * XXX Once we start explicitly registering MMIO regions in the p2m + * XXX we will return p2m_invalid for unmapped gfns */ + *t = p2m_mmio_dm; + + mfn = pagetable_get_mfn(d->arch.phys_table); + + if ( gfn > d->arch.p2m.max_mapped_pfn ) + /* This pfn is higher than the highest the p2m map currently holds */ + return _mfn(INVALID_MFN); + +#if CONFIG_PAGING_LEVELS >= 4 + { + l4_pgentry_t *l4e = map_domain_page(mfn_x(mfn)); + l4e += l4_table_offset(addr); + if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 ) + { + unmap_domain_page(l4e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l4e_get_pfn(*l4e)); + unmap_domain_page(l4e); + } +#endif +#if CONFIG_PAGING_LEVELS >= 3 + { + l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn)); +#if CONFIG_PAGING_LEVELS == 3 + /* On PAE hosts the p2m has eight l3 entries, not four (see + * shadow_set_p2m_entry()) so we can't use l3_table_offset. + * Instead, just count the number of l3es from zero. It's safe + * to do this because we already checked that the gfn is within + * the bounds of the p2m. 
*/ + l3e += (addr >> L3_PAGETABLE_SHIFT); +#else + l3e += l3_table_offset(addr); +#endif + if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 ) + { + unmap_domain_page(l3e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l3e_get_pfn(*l3e)); + unmap_domain_page(l3e); + } +#endif + + l2e = map_domain_page(mfn_x(mfn)); + l2e += l2_table_offset(addr); + if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 ) + { + unmap_domain_page(l2e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l2e_get_pfn(*l2e)); + unmap_domain_page(l2e); + + l1e = map_domain_page(mfn_x(mfn)); + l1e += l1_table_offset(addr); + if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 ) + { + unmap_domain_page(l1e); + return _mfn(INVALID_MFN); + } + mfn = _mfn(l1e_get_pfn(*l1e)); + *t = p2m_flags_to_type(l1e_get_flags(*l1e)); + unmap_domain_page(l1e); + + ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t)); + return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN); +} + +#if P2M_AUDIT +static void audit_p2m(struct domain *d) +{ + struct list_head *entry; + struct page_info *page; + struct domain *od; + unsigned long mfn, gfn, m2pfn, lp2mfn = 0; + mfn_t p2mfn; + unsigned long orphans_d = 0, orphans_i = 0, mpbad = 0, pmbad = 0; + int test_linear; + + if ( !paging_mode_translate(d) ) + return; + + //P2M_PRINTK("p2m audit starts\n"); + + test_linear = ( (d == current->domain) + && !pagetable_is_null(current->arch.monitor_table) ); + if ( test_linear ) + flush_tlb_local(); + + /* Audit part one: walk the domain's page allocation list, checking + * the m2p entries. */ + for ( entry = d->page_list.next; + entry != &d->page_list; + entry = entry->next ) + { + page = list_entry(entry, struct page_info, list); + mfn = mfn_x(page_to_mfn(page)); + + // P2M_PRINTK("auditing guest page, mfn=%#lx\n", mfn); + + od = page_get_owner(page); + + if ( od != d ) + { + P2M_PRINTK("wrong owner %#lx -> %p(%u) != %p(%u)\n", + mfn, od, (od?od->domain_id:-1), d, d->domain_id); + continue; + } + + gfn = get_gpfn_from_mfn(mfn); + if ( gfn == INVALID_M2P_ENTRY ) + { + orphans_i++; + //P2M_PRINTK("orphaned guest page: mfn=%#lx has invalid gfn\n", + // mfn); + continue; + } + + if ( gfn == 0x55555555 ) + { + orphans_d++; + //P2M_PRINTK("orphaned guest page: mfn=%#lx has debug gfn\n", + // mfn); + continue; + } + + p2mfn = gfn_to_mfn_foreign(d, gfn); + if ( mfn_x(p2mfn) != mfn ) + { + mpbad++; + P2M_PRINTK("map mismatch mfn %#lx -> gfn %#lx -> mfn %#lx" + " (-> gfn %#lx)\n", + mfn, gfn, mfn_x(p2mfn), + (mfn_valid(p2mfn) + ? get_gpfn_from_mfn(mfn_x(p2mfn)) + : -1u)); + /* This m2p entry is stale: the domain has another frame in + * this physical slot. No great disaster, but for neatness, + * blow away the m2p entry. */ + set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY, __PAGE_HYPERVISOR|_PAGE_USER); + } + + if ( test_linear && (gfn <= d->arch.p2m.max_mapped_pfn) ) + { + lp2mfn = mfn_x(gfn_to_mfn_current(gfn)); + if ( lp2mfn != mfn_x(p2mfn) ) + { + P2M_PRINTK("linear mismatch gfn %#lx -> mfn %#lx " + "(!= mfn %#lx)\n", gfn, lp2mfn, mfn_x(p2mfn)); + } + } + + // P2M_PRINTK("OK: mfn=%#lx, gfn=%#lx, p2mfn=%#lx, lp2mfn=%#lx\n", + // mfn, gfn, p2mfn, lp2mfn); + } + + /* Audit part two: walk the domain's p2m table, checking the entries. 
*/ + if ( pagetable_get_pfn(d->arch.phys_table) != 0 ) + { + l2_pgentry_t *l2e; + l1_pgentry_t *l1e; + int i1, i2; + +#if CONFIG_PAGING_LEVELS == 4 + l4_pgentry_t *l4e; + l3_pgentry_t *l3e; + int i3, i4; + l4e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table))); +#elif CONFIG_PAGING_LEVELS == 3 + l3_pgentry_t *l3e; + int i3; + l3e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table))); +#else /* CONFIG_PAGING_LEVELS == 2 */ + l2e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table))); +#endif + + gfn = 0; +#if CONFIG_PAGING_LEVELS >= 3 +#if CONFIG_PAGING_LEVELS >= 4 + for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ ) + { + if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) ) + { + gfn += 1 << (L4_PAGETABLE_SHIFT - PAGE_SHIFT); + continue; + } + l3e = map_domain_page(mfn_x(_mfn(l4e_get_pfn(l4e[i4])))); +#endif /* now at levels 3 or 4... */ + for ( i3 = 0; + i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8); + i3++ ) + { + if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) ) + { + gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT); + continue; + } + l2e = map_domain_page(mfn_x(_mfn(l3e_get_pfn(l3e[i3])))); +#endif /* all levels... */ + for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ ) + { + if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) ) + { + gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT); + continue; + } + l1e = map_domain_page(mfn_x(_mfn(l2e_get_pfn(l2e[i2])))); + + for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ ) + { + if ( !(l1e_get_flags(l1e[i1]) & _PAGE_PRESENT) ) + continue; + mfn = l1e_get_pfn(l1e[i1]); + ASSERT(mfn_valid(_mfn(mfn))); + m2pfn = get_gpfn_from_mfn(mfn); + if ( m2pfn != gfn ) + { + pmbad++; + P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx" + " -> gfn %#lx\n", gfn, mfn, m2pfn); + BUG(); + } + } + unmap_domain_page(l1e); + } +#if CONFIG_PAGING_LEVELS >= 3 + unmap_domain_page(l2e); + } +#if CONFIG_PAGING_LEVELS >= 4 + unmap_domain_page(l3e); + } +#endif +#endif + +#if CONFIG_PAGING_LEVELS == 4 + unmap_domain_page(l4e); +#elif CONFIG_PAGING_LEVELS == 3 + unmap_domain_page(l3e); +#else /* CONFIG_PAGING_LEVELS == 2 */ + unmap_domain_page(l2e); +#endif + + } + + //P2M_PRINTK("p2m audit complete\n"); + //if ( orphans_i | orphans_d | mpbad | pmbad ) + // P2M_PRINTK("p2m audit found %lu orphans (%lu inval %lu debug)\n", + // orphans_i + orphans_d, orphans_i, orphans_d, + if ( mpbad | pmbad ) + P2M_PRINTK("p2m audit found %lu odd p2m, %lu bad m2p entries\n", + pmbad, mpbad); +} +#else +#define audit_p2m(_d) do { (void)(_d); } while(0) +#endif /* P2M_AUDIT */ + + + +static void +p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn) +{ + if ( !paging_mode_translate(d) ) + return; + P2M_DEBUG("removing gfn=%#lx mfn=%#lx\n", gfn, mfn); + + set_p2m_entry(d, gfn, _mfn(INVALID_MFN), p2m_invalid); + set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY); +} + +void +guest_physmap_remove_page(struct domain *d, unsigned long gfn, + unsigned long mfn) +{ + p2m_lock(d); + audit_p2m(d); + p2m_remove_page(d, gfn, mfn); + audit_p2m(d); + p2m_unlock(d); +} + +int +guest_physmap_add_entry(struct domain *d, unsigned long gfn, + unsigned long mfn, p2m_type_t t) +{ + unsigned long ogfn; + p2m_type_t ot; + mfn_t omfn; + int rc = 0; + + if ( !paging_mode_translate(d) ) + return -EINVAL; + +#if CONFIG_PAGING_LEVELS == 3 + /* 32bit PAE nested paging does not support over 4GB guest due to + * hardware translation limit. This limitation is checked by comparing + * gfn with 0xfffffUL. 
+ */ + if ( paging_mode_hap(d) && (gfn > 0xfffffUL) ) + return -EINVAL; +#endif + + p2m_lock(d); + audit_p2m(d); + + P2M_DEBUG("adding gfn=%#lx mfn=%#lx\n", gfn, mfn); + + omfn = gfn_to_mfn(d, gfn, &ot); + if ( p2m_is_ram(ot) ) + { + ASSERT(mfn_valid(omfn)); + set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY); + } + + ogfn = mfn_to_gfn(d, _mfn(mfn)); + if ( +#ifdef __x86_64__ + (ogfn != 0x5555555555555555L) +#else + (ogfn != 0x55555555L) +#endif + && (ogfn != INVALID_M2P_ENTRY) + && (ogfn != gfn) ) + { + /* This machine frame is already mapped at another physical address */ + P2M_DEBUG("aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n", + mfn, ogfn, gfn); + omfn = gfn_to_mfn(d, ogfn, &ot); + if ( p2m_is_ram(ot) ) + { + ASSERT(mfn_valid(omfn)); + P2M_DEBUG("old gfn=%#lx -> mfn %#lx\n", + ogfn , mfn_x(omfn)); + if ( mfn_x(omfn) == mfn ) + p2m_remove_page(d, ogfn, mfn); + } + } + + if ( mfn_valid(_mfn(mfn)) ) + { + if ( !set_p2m_entry(d, gfn, _mfn(mfn), t) ) + rc = -EINVAL; + set_gpfn_from_mfn(mfn, gfn); + } + else + { + gdprintk(XENLOG_WARNING, "Adding bad mfn to p2m map (%#lx -> %#lx)\n", + gfn, mfn); + if ( !set_p2m_entry(d, gfn, _mfn(INVALID_MFN), p2m_invalid) ) + rc = -EINVAL; + } + + audit_p2m(d); + p2m_unlock(d); + + return rc; +} + +/* Walk the whole p2m table, changing any entries of the old type + * to the new type. This is used in hardware-assisted paging to + * quickly enable or diable log-dirty tracking */ +void p2m_change_type_global(struct domain *d, p2m_type_t ot, p2m_type_t nt) +{ + unsigned long mfn, gfn, flags; + l1_pgentry_t l1e_content; + l1_pgentry_t *l1e; + l2_pgentry_t *l2e; + mfn_t l1mfn; + int i1, i2; +#if CONFIG_PAGING_LEVELS >= 3 + l3_pgentry_t *l3e; + int i3; +#if CONFIG_PAGING_LEVELS == 4 + l4_pgentry_t *l4e; + int i4; +#endif /* CONFIG_PAGING_LEVELS == 4 */ +#endif /* CONFIG_PAGING_LEVELS >= 3 */ + + if ( !paging_mode_translate(d) ) + return; + + if ( pagetable_get_pfn(d->arch.phys_table) == 0 ) + return; + + p2m_lock(d); + +#if CONFIG_PAGING_LEVELS == 4 + l4e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table))); +#elif CONFIG_PAGING_LEVELS == 3 + l3e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table))); +#else /* CONFIG_PAGING_LEVELS == 2 */ + l2e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table))); +#endif + +#if CONFIG_PAGING_LEVELS >= 3 +#if CONFIG_PAGING_LEVELS >= 4 + for ( i4 = 0; i4 < L4_PAGETABLE_ENTRIES; i4++ ) + { + if ( !(l4e_get_flags(l4e[i4]) & _PAGE_PRESENT) ) + { + continue; + } + l3e = map_domain_page(l4e_get_pfn(l4e[i4])); +#endif /* now at levels 3 or 4... */ + for ( i3 = 0; + i3 < ((CONFIG_PAGING_LEVELS==4) ? L3_PAGETABLE_ENTRIES : 8); + i3++ ) + { + if ( !(l3e_get_flags(l3e[i3]) & _PAGE_PRESENT) ) + { + continue; + } + l2e = map_domain_page(l3e_get_pfn(l3e[i3])); +#endif /* all levels... 
*/ + for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ ) + { + if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) ) + { + continue; + } + + l1mfn = _mfn(l2e_get_pfn(l2e[i2])); + l1e = map_domain_page(mfn_x(l1mfn)); + + for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ ) + { + flags = l1e_get_flags(l1e[i1]); + if ( p2m_flags_to_type(flags) != ot ) + continue; + mfn = l1e_get_pfn(l1e[i1]); + gfn = get_gpfn_from_mfn(mfn); + /* create a new 1le entry with the new type */ + flags = p2m_flags_to_type(nt); + l1e_content = l1e_from_pfn(mfn, flags); + paging_write_p2m_entry(d, gfn, &l1e[i1], + l1mfn, l1e_content, 1); + } + unmap_domain_page(l1e); + } +#if CONFIG_PAGING_LEVELS >= 3 + unmap_domain_page(l2e); + } +#if CONFIG_PAGING_LEVELS >= 4 + unmap_domain_page(l3e); + } +#endif +#endif + +#if CONFIG_PAGING_LEVELS == 4 + unmap_domain_page(l4e); +#elif CONFIG_PAGING_LEVELS == 3 + unmap_domain_page(l3e); +#else /* CONFIG_PAGING_LEVELS == 2 */ + unmap_domain_page(l2e); +#endif + + p2m_unlock(d); +} + +/* Modify the p2m type of a single gfn from ot to nt, returning the + * entry's previous type */ +p2m_type_t p2m_change_type(struct domain *d, unsigned long gfn, + p2m_type_t ot, p2m_type_t nt) +{ + p2m_type_t pt; + mfn_t mfn; + + p2m_lock(d); + + mfn = gfn_to_mfn(d, gfn, &pt); + if ( pt == ot ) + set_p2m_entry(d, gfn, mfn, nt); + + p2m_unlock(d); + + return pt; +} + +int +set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn) +{ + int rc = 0; + p2m_type_t ot; + mfn_t omfn; + + if ( !paging_mode_translate(d) ) + return 0; + + omfn = gfn_to_mfn(d, gfn, &ot); + if ( p2m_is_ram(ot) ) + { + ASSERT(mfn_valid(omfn)); + set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY); + } + + rc = set_p2m_entry(d, gfn, mfn, p2m_mmio_direct); + if ( 0 == rc ) + gdprintk(XENLOG_ERR, + "set_mmio_p2m_entry: set_p2m_entry failed! mfn=%08lx\n", + gmfn_to_mfn(d, gfn)); + return rc; +} + +int +clear_mmio_p2m_entry(struct domain *d, unsigned long gfn) +{ + int rc = 0; + unsigned long mfn; + + if ( !paging_mode_translate(d) ) + return 0; + + mfn = gmfn_to_mfn(d, gfn); + if ( INVALID_MFN == mfn ) + { + gdprintk(XENLOG_ERR, + "clear_mmio_p2m_entry: gfn_to_mfn failed! gfn=%08lx\n", gfn); + return 0; + } + rc = set_p2m_entry(d, gfn, _mfn(INVALID_MFN), 0); + + return rc; +} + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + */ diff -r 3191627e5ad6 xen/include/asm-x86/p2m-orig.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/include/asm-x86/p2m-orig.h Wed Nov 07 09:46:02 2007 -0600 @@ -0,0 +1,241 @@ +/****************************************************************************** + * include/asm-x86/paging.h + * + * physical-to-machine mappings for automatically-translated domains. + * + * Copyright (c) 2007 Advanced Micro Devices (Wei Huang) + * Parts of this code are Copyright (c) 2006-2007 by XenSource Inc. + * Parts of this code are Copyright (c) 2006 by Michael A Fetterman + * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _XEN_P2M_H +#define _XEN_P2M_H + + +/* + * The phys_to_machine_mapping maps guest physical frame numbers + * to machine frame numbers. It only exists for paging_mode_translate + * guests. It is organised in page-table format, which: + * + * (1) allows us to use it directly as the second pagetable in hardware- + * assisted paging and (hopefully) iommu support; and + * (2) lets us map it directly into the guest vcpus' virtual address space + * as a linear pagetable, so we can read and write it easily. + * + * For (2) we steal the address space that would have normally been used + * by the read-only MPT map in a non-translated guest. (For + * paging_mode_external() guests this mapping is in the monitor table.) + */ +#define phys_to_machine_mapping ((l1_pgentry_t *)RO_MPT_VIRT_START) + +/* + * The upper levels of the p2m pagetable always contain full rights; all + * variation in the access control bits is made in the level-1 PTEs. + * + * In addition to the phys-to-machine translation, each p2m PTE contains + * *type* information about the gfn it translates, helping Xen to decide + * on the correct course of action when handling a page-fault to that + * guest frame. We store the type in the "available" bits of the PTEs + * in the table, which gives us 8 possible types on 32-bit systems. + * Further expansions of the type system will only be supported on + * 64-bit Xen. + */ +typedef enum { + p2m_invalid = 0, /* Nothing mapped here */ + p2m_ram_rw = 1, /* Normal read/write guest RAM */ + p2m_ram_logdirty = 2, /* Temporarily read-only for log-dirty */ + p2m_ram_ro = 3, /* Read-only; writes go to the device model */ + p2m_mmio_dm = 4, /* Reads and write go to the device model */ + p2m_mmio_direct = 5, /* Read/write mapping of genuine MMIO area */ +} p2m_type_t; + +/* We use bitmaps and maks to handle groups of types */ +#define p2m_to_mask(_t) (1UL << (_t)) + +/* RAM types, which map to real machine frames */ +#define P2M_RAM_TYPES (p2m_to_mask(p2m_ram_rw) \ + | p2m_to_mask(p2m_ram_logdirty) \ + | p2m_to_mask(p2m_ram_ro)) + +/* MMIO types, which don't have to map to anything in the frametable */ +#define P2M_MMIO_TYPES (p2m_to_mask(p2m_mmio_dm) \ + | p2m_to_mask(p2m_mmio_direct)) + +/* Read-only types, which must have the _PAGE_RW bit clear in their PTEs */ +#define P2M_RO_TYPES (p2m_to_mask(p2m_ram_logdirty) \ + | p2m_to_mask(p2m_ram_ro)) + +/* Useful predicates */ +#define p2m_is_ram(_t) (p2m_to_mask(_t) & P2M_RAM_TYPES) +#define p2m_is_mmio(_t) (p2m_to_mask(_t) & P2M_MMIO_TYPES) +#define p2m_is_readonly(_t) (p2m_to_mask(_t) & P2M_RO_TYPES) +#define p2m_is_valid(_t) (p2m_to_mask(_t) & (P2M_RAM_TYPES | P2M_MMIO_TYPES)) + +/* Extract the type from the PTE flags that store it */ +static inline p2m_type_t p2m_flags_to_type(unsigned long flags) +{ + /* Type is stored in the "available" bits, 9, 10 and 11 */ + return (flags >> 9) & 0x7; +} + +/* Read the current domain's p2m table (through the linear mapping). */ +static inline mfn_t gfn_to_mfn_current(unsigned long gfn, p2m_type_t *t) +{ + mfn_t mfn = _mfn(INVALID_MFN); + p2m_type_t p2mt = p2m_mmio_dm; + /* XXX This is for compatibility with the old model, where anything not + * XXX marked as RAM was considered to be emulated MMIO space. 
+ * XXX Once we start explicitly registering MMIO regions in the p2m + * XXX we will return p2m_invalid for unmapped gfns */ + + if ( gfn <= current->domain->arch.p2m.max_mapped_pfn ) + { + l1_pgentry_t l1e = l1e_empty(); + int ret; + + ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START) + / sizeof(l1_pgentry_t)); + + /* Need to __copy_from_user because the p2m is sparse and this + * part might not exist */ + ret = __copy_from_user(&l1e, + &phys_to_machine_mapping[gfn], + sizeof(l1e)); + + if ( ret == 0 ) { + p2mt = p2m_flags_to_type(l1e_get_flags(l1e)); + ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt)); + if ( p2m_is_valid(p2mt) ) + mfn = _mfn(l1e_get_pfn(l1e)); + else + /* XXX see above */ + p2mt = p2m_mmio_dm; + } + } + + *t = p2mt; + return mfn; +} + +/* Read another domain's P2M table, mapping pages as we go */ +mfn_t gfn_to_mfn_foreign(struct domain *d, unsigned long gfn, p2m_type_t *t); + +/* General conversion function from gfn to mfn */ +#define gfn_to_mfn(d, g, t) _gfn_to_mfn((d), (g), (t)) +static inline mfn_t _gfn_to_mfn(struct domain *d, + unsigned long gfn, p2m_type_t *t) +{ + if ( !paging_mode_translate(d) ) + { + /* Not necessarily true, but for non-translated guests, we claim + * it's the most generic kind of memory */ + *t = p2m_ram_rw; + return _mfn(gfn); + } + if ( likely(current->domain == d) ) + return gfn_to_mfn_current(gfn, t); + else + return gfn_to_mfn_foreign(d, gfn, t); +} + +/* Compatibility function exporting the old untyped interface */ +static inline unsigned long gmfn_to_mfn(struct domain *d, unsigned long gpfn) +{ + mfn_t mfn; + p2m_type_t t; + mfn = gfn_to_mfn(d, gpfn, &t); + if ( p2m_is_valid(t) ) + return mfn_x(mfn); + return INVALID_MFN; +} + +/* General conversion function from mfn to gfn */ +static inline unsigned long mfn_to_gfn(struct domain *d, mfn_t mfn) +{ + if ( paging_mode_translate(d) ) + return get_gpfn_from_mfn(mfn_x(mfn)); + else + return mfn_x(mfn); +} + +/* Translate the frame number held in an l1e from guest to machine */ +static inline l1_pgentry_t +gl1e_to_ml1e(struct domain *d, l1_pgentry_t l1e) +{ + if ( unlikely(paging_mode_translate(d)) ) + l1e = l1e_from_pfn(gmfn_to_mfn(d, l1e_get_pfn(l1e)), + l1e_get_flags(l1e)); + return l1e; +} + + +/* Init the datastructures for later use by the p2m code */ +void p2m_init(struct domain *d); + +/* Allocate a new p2m table for a domain. + * + * The alloc_page and free_page functions will be used to get memory to + * build the p2m, and to release it again at the end of day. + * + * Returns 0 for success or -errno. */ +int p2m_alloc_table(struct domain *d, + struct page_info * (*alloc_page)(struct domain *d), + void (*free_page)(struct domain *d, struct page_info *pg)); + +/* Return all the p2m resources to Xen. 
*/ +void p2m_teardown(struct domain *d); + +/* Add a page to a domain's p2m table */ +int guest_physmap_add_entry(struct domain *d, unsigned long gfn, + unsigned long mfn, p2m_type_t t); + +/* Untyped version for RAM only, for compatibility + * + * Return 0 for success + */ +static inline int guest_physmap_add_page(struct domain *d, unsigned long gfn, + unsigned long mfn) +{ + return guest_physmap_add_entry(d, gfn, mfn, p2m_ram_rw); +} + +/* Remove a page from a domain's p2m table */ +void guest_physmap_remove_page(struct domain *d, unsigned long gfn, + unsigned long mfn); + +/* Change types across all p2m entries in a domain */ +void p2m_change_type_global(struct domain *d, p2m_type_t ot, p2m_type_t nt); + +/* Compare-exchange the type of a single p2m entry */ +p2m_type_t p2m_change_type(struct domain *d, unsigned long gfn, + p2m_type_t ot, p2m_type_t nt); + +/* Set mmio addresses in the p2m table (for pass-through) */ +int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn); +int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn); + +#endif /* _XEN_P2M_H */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * indent-tabs-mode: nil + * End: + */
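
Illustrative sketch (not part of the patch above): the p2m header keeps a gfn's type in the free PTE bits 9-11, and p2m_flags_to_type() simply extracts those bits. The standalone C program below round-trips every p2m_type_t value through that bit position to show the convention. The encoder name p2m_type_to_flags_sketch() is invented for this example; the real (static) p2m_type_to_flags() helper in xen/arch/x86/mm/p2m.c additionally ORs in the hardware permission bits appropriate to each type, which is deliberately omitted here.

/* Standalone illustration of the "type lives in PTE bits 9-11" convention.
 * Not Xen code; compiles with any C compiler. */
#include <assert.h>
#include <stdio.h>

typedef enum {
    p2m_invalid      = 0,   /* Nothing mapped here */
    p2m_ram_rw       = 1,   /* Normal read/write guest RAM */
    p2m_ram_logdirty = 2,   /* Temporarily read-only for log-dirty */
    p2m_ram_ro       = 3,   /* Read-only; writes go to the device model */
    p2m_mmio_dm      = 4,   /* Reads and writes go to the device model */
    p2m_mmio_direct  = 5,   /* Read/write mapping of genuine MMIO area */
} p2m_type_t;

/* Same decoder as in the header above: type is in bits 9, 10 and 11. */
static p2m_type_t p2m_flags_to_type(unsigned long flags)
{
    return (flags >> 9) & 0x7;
}

/* Simplified stand-in encoder: only places the type; the real helper in
 * p2m.c also chooses _PAGE_* permission bits for each type. */
static unsigned long p2m_type_to_flags_sketch(p2m_type_t t)
{
    return (unsigned long)t << 9;
}

int main(void)
{
    p2m_type_t t;
    for ( t = p2m_invalid; t <= p2m_mmio_direct; t++ )
        assert(p2m_flags_to_type(p2m_type_to_flags_sketch(t)) == t);
    printf("every p2m type round-trips through PTE bits 9-11\n");
    return 0;
}

Storing the type in the PTE's available bits is what lets one table serve both as the hardware-walked p2m and as Xen's record of what each guest frame is.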
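
Illustrative sketch (not part of the patch above): callers of the typed interface ask gfn_to_mfn() for the mfn and its p2m_type_t together, test the type with p2m_is_ram()/p2m_is_mmio(), and use p2m_change_type_global() for bulk transitions such as turning log-dirty tracking on and off (per the comment on that function). The fragment below is a Xen-internal-style sketch only: example_query_and_toggle_logdirty() is an invented name, it would compile only inside the Xen tree against the headers shown here, and the gdprintk()/XENLOG_INFO output is just one plausible choice.

#include <xen/lib.h>
#include <xen/sched.h>
#include <asm/p2m.h>

/* Hypothetical helper, for illustration only. */
static void example_query_and_toggle_logdirty(struct domain *d,
                                              unsigned long gfn)
{
    p2m_type_t t;
    mfn_t mfn;

    /* Typed lookup: the type comes back alongside the mfn. */
    mfn = gfn_to_mfn(d, gfn, &t);
    if ( p2m_is_ram(t) )
        gdprintk(XENLOG_INFO, "gfn %#lx -> mfn %#lx (RAM, type %d)\n",
                 gfn, mfn_x(mfn), t);
    else if ( p2m_is_mmio(t) )
        gdprintk(XENLOG_INFO, "gfn %#lx is MMIO (type %d)\n", gfn, t);

    /* Bulk type change: flip ordinary RAM to the log-dirty type to start
     * tracking writes, and back again to stop, as described in the comment
     * on p2m_change_type_global() above. */
    p2m_change_type_global(d, p2m_ram_rw, p2m_ram_logdirty);
    /* ... consume the dirty log here ... */
    p2m_change_type_global(d, p2m_ram_logdirty, p2m_ram_rw);
}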