[Xen-changelog] Check in files I missed from shadow64 checkin.
# HG changeset patch # User kaf24@xxxxxxxxxxxxxxxxxxxx # Node ID 0bcfd66a431ebfc70fc068a134e684568ac02966 # Parent d332d4df452ecf6c3aaeab73c79e1e6ce751b61d Check in files I missed from shadow64 checkin. diff -r d332d4df452e -r 0bcfd66a431e xen/arch/x86/shadow_public.c --- /dev/null Mon Jul 11 09:22:15 2005 +++ b/xen/arch/x86/shadow_public.c Mon Jul 11 09:57:38 2005 @@ -0,0 +1,1654 @@ +/****************************************************************************** + * arch/x86/shadow_public.c + * + * Copyright (c) 2005 Michael A Fetterman + * Based on an earlier implementation by Ian Pratt et al + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + +#include <xen/config.h> +#include <xen/types.h> +#include <xen/mm.h> +#include <xen/domain_page.h> +#include <asm/shadow.h> +#include <asm/page.h> +#include <xen/event.h> +#include <xen/sched.h> +#include <xen/trace.h> + +#if CONFIG_PAGING_LEVELS >= 4 +#include <asm/shadow_64.h> + +extern struct shadow_ops MODE_F_HANDLER; +#endif + +extern struct shadow_ops MODE_A_HANDLER; + +/****************************************************************************/ +/************* export interface functions ***********************************/ +/****************************************************************************/ + + +int shadow_set_guest_paging_levels(struct domain *d, int levels) +{ + shadow_lock(d); + + switch(levels) { +#if CONFIG_PAGING_LEVELS >= 4 + case 4: + if ( d->arch.ops != &MODE_F_HANDLER ) + d->arch.ops = &MODE_F_HANDLER; + shadow_unlock(d); + return 1; +#endif + case 3: + case 2: + if ( d->arch.ops != &MODE_A_HANDLER ) + d->arch.ops = &MODE_A_HANDLER; + shadow_unlock(d); + return 1; + default: + shadow_unlock(d); + return 0; + } +} + +void shadow_invlpg(struct vcpu *v, unsigned long va) +{ + struct domain *d = current->domain; + d->arch.ops->invlpg(v, va); +} + +int shadow_fault(unsigned long va, struct cpu_user_regs *regs) +{ + struct domain *d = current->domain; + return d->arch.ops->fault(va, regs); +} + +void __update_pagetables(struct vcpu *v) +{ + struct domain *d = v->domain; + d->arch.ops->update_pagetables(v); +} + +void __shadow_sync_all(struct domain *d) +{ + d->arch.ops->sync_all(d); +} + +int shadow_remove_all_write_access( + struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn) +{ + return d->arch.ops->remove_all_write_access(d, readonly_gpfn, readonly_gmfn); +} + +int shadow_do_update_va_mapping(unsigned long va, + l1_pgentry_t val, + struct vcpu *v) +{ + struct domain *d = v->domain; + return d->arch.ops->do_update_va_mapping(va, val, v); +} + +struct out_of_sync_entry * +shadow_mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn, + unsigned long mfn) +{ + struct domain *d = v->domain; + return d->arch.ops->mark_mfn_out_of_sync(v, gpfn, mfn); +} + +/* + * Returns 1 if va's shadow mapping is out-of-sync. 
+ * Returns 0 otherwise. + */ +int __shadow_out_of_sync(struct vcpu *v, unsigned long va) +{ + struct domain *d = v->domain; + return d->arch.ops->is_out_of_sync(v, va); +} + +/****************************************************************************/ +/****************************************************************************/ +#if CONFIG_PAGING_LEVELS >= 4 +/* + * Convert PAE 3-level page-table to 4-level page-table + */ +#define PDP_ENTRIES 4 +static pagetable_t page_table_convert(struct domain *d) +{ + struct pfn_info *l4page, *l3page; + l4_pgentry_t *l4; + l3_pgentry_t *l3, *pae_l3; + int i; + + l4page = alloc_domheap_page(NULL); + if (l4page == NULL) + domain_crash(); + l4 = map_domain_page(page_to_pfn(l4page)); + memset(l4, 0, PAGE_SIZE); + + l3page = alloc_domheap_page(NULL); + if (l3page == NULL) + domain_crash(); + l3 = map_domain_page(page_to_pfn(l3page)); + memset(l3, 0, PAGE_SIZE); + + l4[0] = l4e_from_page(l3page, __PAGE_HYPERVISOR); + pae_l3 = map_domain_page(pagetable_get_pfn(d->arch.phys_table)); + + for (i = 0; i < PDP_ENTRIES; i++) { + l3[i] = pae_l3[i]; + l3e_add_flags(l3[i], 0x67); + } + + unmap_domain_page(l4); + unmap_domain_page(l3); + + return mk_pagetable(page_to_phys(l4page)); +} + +void alloc_monitor_pagetable(struct vcpu *v) +{ + unsigned long mmfn; + l4_pgentry_t *mpl4e; + struct pfn_info *mmfn_info; + struct domain *d = v->domain; + pagetable_t phys_table; + + ASSERT(!pagetable_get_paddr(v->arch.monitor_table)); /* we should only get called once */ + + mmfn_info = alloc_domheap_page(NULL); + ASSERT( mmfn_info ); + + mmfn = (unsigned long) (mmfn_info - frame_table); + mpl4e = (l4_pgentry_t *) map_domain_page(mmfn); + memcpy(mpl4e, &idle_pg_table[0], PAGE_SIZE); + mpl4e[l4_table_offset(PERDOMAIN_VIRT_START)] = + l4e_from_paddr(__pa(d->arch.mm_perdomain_l3), __PAGE_HYPERVISOR); + /* map the phys_to_machine map into the per domain Read-Only MPT space */ + phys_table = page_table_convert(d); + + mpl4e[l4_table_offset(RO_MPT_VIRT_START)] = + l4e_from_paddr(pagetable_get_paddr(phys_table), + __PAGE_HYPERVISOR); + v->arch.monitor_table = mk_pagetable(mmfn << PAGE_SHIFT); + v->arch.monitor_vtable = (l2_pgentry_t *) mpl4e; +} + +static void inline +free_shadow_fl1_table(struct domain *d, unsigned long smfn) +{ + l1_pgentry_t *pl1e = map_domain_page(smfn); + int i; + + for (i = 0; i < L1_PAGETABLE_ENTRIES; i++) + put_page_from_l1e(pl1e[i], d); +} + +/* + * Free l2, l3, l4 shadow tables + */ +static void inline +free_shadow_tables(struct domain *d, unsigned long smfn, u32 level) +{ + pgentry_64_t *ple = map_domain_page(smfn); + int i, external = shadow_mode_external(d); + + for ( i = 0; i < PAGETABLE_ENTRIES; i++ ) + if ( external || is_guest_l4_slot(i) ) + if ( entry_get_flags(ple[i]) & _PAGE_PRESENT ) + put_shadow_ref(entry_get_pfn(ple[i])); + + unmap_domain_page(ple); +} + +void free_monitor_pagetable(struct vcpu *v) +{ + unsigned long mfn; + +// ASSERT( pagetable_val(v->arch.monitor_table) ); + /* + * free monitor_table. 
+ */ + //mfn = (pagetable_val(v->arch.monitor_table)) >> PAGE_SHIFT; + mfn = pagetable_get_pfn(v->arch.monitor_table); + unmap_domain_page(v->arch.monitor_vtable); + free_domheap_page(&frame_table[mfn]); + v->arch.monitor_table = mk_pagetable(0); + v->arch.monitor_vtable = 0; +} + +#elif CONFIG_PAGING_LEVELS == 2 +static void alloc_monitor_pagetable(struct vcpu *v) +{ + unsigned long mmfn; + l2_pgentry_t *mpl2e; + struct pfn_info *mmfn_info; + struct domain *d = v->domain; + + ASSERT(pagetable_get_paddr(v->arch.monitor_table) == 0); + + mmfn_info = alloc_domheap_page(NULL); + ASSERT(mmfn_info != NULL); + + mmfn = page_to_pfn(mmfn_info); + mpl2e = (l2_pgentry_t *)map_domain_page(mmfn); + memset(mpl2e, 0, PAGE_SIZE); + +#ifdef __i386__ /* XXX screws x86/64 build */ + memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], + &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], + HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); +#endif + + mpl2e[l2_table_offset(PERDOMAIN_VIRT_START)] = + l2e_from_paddr(__pa(d->arch.mm_perdomain_pt), + __PAGE_HYPERVISOR); + + // map the phys_to_machine map into the Read-Only MPT space for this domain + mpl2e[l2_table_offset(RO_MPT_VIRT_START)] = + l2e_from_paddr(pagetable_get_paddr(d->arch.phys_table), + __PAGE_HYPERVISOR); + + // Don't (yet) have mappings for these... + // Don't want to accidentally see the idle_pg_table's linear mapping. + // + mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty(); + mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty(); + + v->arch.monitor_table = mk_pagetable(mmfn << PAGE_SHIFT); + v->arch.monitor_vtable = mpl2e; +} + +/* + * Free the pages for monitor_table and hl2_table + */ +void free_monitor_pagetable(struct vcpu *v) +{ + l2_pgentry_t *mpl2e, hl2e, sl2e; + unsigned long mfn; + + ASSERT( pagetable_get_paddr(v->arch.monitor_table) ); + + mpl2e = v->arch.monitor_vtable; + + /* + * First get the mfn for hl2_table by looking at monitor_table + */ + hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)]; + if ( l2e_get_flags(hl2e) & _PAGE_PRESENT ) + { + mfn = l2e_get_pfn(hl2e); + ASSERT(mfn); + put_shadow_ref(mfn); + } + + sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)]; + if ( l2e_get_flags(sl2e) & _PAGE_PRESENT ) + { + mfn = l2e_get_pfn(sl2e); + ASSERT(mfn); + put_shadow_ref(mfn); + } + + unmap_domain_page(mpl2e); + + /* + * Then free monitor_table. + */ + mfn = pagetable_get_pfn(v->arch.monitor_table); + free_domheap_page(&frame_table[mfn]); + + v->arch.monitor_table = mk_pagetable(0); + v->arch.monitor_vtable = 0; +} +#endif + +static void +shadow_free_snapshot(struct domain *d, struct out_of_sync_entry *entry) +{ + void *snapshot; + + if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE ) + return; + + // Clear the out_of_sync bit. + // + clear_bit(_PGC_out_of_sync, &frame_table[entry->gmfn].count_info); + + // XXX Need to think about how to protect the domain's + // information less expensively. + // + snapshot = map_domain_page(entry->snapshot_mfn); + memset(snapshot, 0, PAGE_SIZE); + unmap_domain_page(snapshot); + + put_shadow_ref(entry->snapshot_mfn); +} + +void +release_out_of_sync_entry(struct domain *d, struct out_of_sync_entry *entry) +{ + struct pfn_info *page; + + page = &frame_table[entry->gmfn]; + + // Decrement ref count of guest & shadow pages + // + put_page(page); + + // Only use entries that have low bits clear... 
+ // + if ( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) ) + { + put_shadow_ref(entry->writable_pl1e >> PAGE_SHIFT); + entry->writable_pl1e = -2; + } + else + ASSERT( entry->writable_pl1e == -1 ); + + // Free the snapshot + // + shadow_free_snapshot(d, entry); +} + +static void remove_out_of_sync_entries(struct domain *d, unsigned long gmfn) +{ + struct out_of_sync_entry *entry = d->arch.out_of_sync; + struct out_of_sync_entry **prev = &d->arch.out_of_sync; + struct out_of_sync_entry *found = NULL; + + // NB: Be careful not to call something that manipulates this list + // while walking it. Collect the results into a separate list + // first, then walk that list. + // + while ( entry ) + { + if ( entry->gmfn == gmfn ) + { + // remove from out of sync list + *prev = entry->next; + + // add to found list + entry->next = found; + found = entry; + + entry = *prev; + continue; + } + prev = &entry->next; + entry = entry->next; + } + + prev = NULL; + entry = found; + while ( entry ) + { + release_out_of_sync_entry(d, entry); + + prev = &entry->next; + entry = entry->next; + } + + // Add found list to free list + if ( prev ) + { + *prev = d->arch.out_of_sync_free; + d->arch.out_of_sync_free = found; + } +} + +static inline void +shadow_demote(struct domain *d, unsigned long gpfn, unsigned long gmfn) +{ + if ( !shadow_mode_refcounts(d) ) + return; + + ASSERT(frame_table[gmfn].count_info & PGC_page_table); + + if ( shadow_max_pgtable_type(d, gpfn, NULL) == PGT_none ) + { + clear_bit(_PGC_page_table, &frame_table[gmfn].count_info); + + if ( page_out_of_sync(pfn_to_page(gmfn)) ) + { + remove_out_of_sync_entries(d, gmfn); + } + } +} + +static void inline +free_shadow_l1_table(struct domain *d, unsigned long smfn) +{ + l1_pgentry_t *pl1e = map_domain_page(smfn); + int i; + struct pfn_info *spage = pfn_to_page(smfn); + u32 min_max = spage->tlbflush_timestamp; + int min = SHADOW_MIN(min_max); + int max = SHADOW_MAX(min_max); + + for ( i = min; i <= max; i++ ) + { + shadow_put_page_from_l1e(pl1e[i], d); + pl1e[i] = l1e_empty(); + } + + unmap_domain_page(pl1e); +} + +static void inline +free_shadow_hl2_table(struct domain *d, unsigned long smfn) +{ + l1_pgentry_t *hl2 = map_domain_page(smfn); + int i, limit; + + SH_VVLOG("%s: smfn=%lx freed", __func__, smfn); + +#ifdef __i386__ + if ( shadow_mode_external(d) ) + limit = L2_PAGETABLE_ENTRIES; + else + limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE; +#else + limit = 0; /* XXX x86/64 XXX */ +#endif + + for ( i = 0; i < limit; i++ ) + { + if ( l1e_get_flags(hl2[i]) & _PAGE_PRESENT ) + put_page(pfn_to_page(l1e_get_pfn(hl2[i]))); + } + + unmap_domain_page(hl2); +} + +static void inline +free_shadow_l2_table(struct domain *d, unsigned long smfn, unsigned int type) +{ + l2_pgentry_t *pl2e = map_domain_page(smfn); + int i, external = shadow_mode_external(d); + + for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) + if ( external || is_guest_l2_slot(type, i) ) + if ( l2e_get_flags(pl2e[i]) & _PAGE_PRESENT ) + put_shadow_ref(l2e_get_pfn(pl2e[i])); + + if ( (PGT_base_page_table == PGT_l2_page_table) && + shadow_mode_translate(d) && !external ) + { + // free the ref to the hl2 + // + put_shadow_ref(l2e_get_pfn(pl2e[l2_table_offset(LINEAR_PT_VIRT_START)])); + } + + unmap_domain_page(pl2e); +} + +void free_shadow_page(unsigned long smfn) +{ + struct pfn_info *page = &frame_table[smfn]; + unsigned long gmfn = page->u.inuse.type_info & PGT_mfn_mask; + struct domain *d = page_get_owner(pfn_to_page(gmfn)); + unsigned long gpfn = __mfn_to_gpfn(d, gmfn); + unsigned long type = 
page->u.inuse.type_info & PGT_type_mask; + + SH_VVLOG("%s: free'ing smfn=%lx", __func__, smfn); + + ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) ); +#if CONFIG_PAGING_LEVELS >=4 + if (type == PGT_fl1_shadow) { + unsigned long mfn; + mfn = __shadow_status(d, gpfn, PGT_fl1_shadow); + if (!mfn) + gpfn |= (1UL << 63); + } +#endif + delete_shadow_status(d, gpfn, gmfn, type); + + switch ( type ) + { + case PGT_l1_shadow: + perfc_decr(shadow_l1_pages); + shadow_demote(d, gpfn, gmfn); + free_shadow_l1_table(d, smfn); + break; +#if defined (__i386__) + case PGT_l2_shadow: + perfc_decr(shadow_l2_pages); + shadow_demote(d, gpfn, gmfn); + free_shadow_l2_table(d, smfn, page->u.inuse.type_info); + break; + + case PGT_hl2_shadow: + perfc_decr(hl2_table_pages); + shadow_demote(d, gpfn, gmfn); + free_shadow_hl2_table(d, smfn); + break; +#else + case PGT_l2_shadow: + case PGT_l3_shadow: + case PGT_l4_shadow: + shadow_demote(d, gpfn, gmfn); + free_shadow_tables(d, smfn, shadow_type_to_level(type)); + break; + + case PGT_fl1_shadow: + free_shadow_fl1_table(d, smfn); + break; + +#endif + + case PGT_snapshot: + perfc_decr(apshot_pages); + break; + + default: + printk("Free shadow weird page type mfn=%lx type=%08x\n", + page_to_pfn(page), page->u.inuse.type_info); + break; + } + + d->arch.shadow_page_count--; + + // No TLB flushes are needed the next time this page gets allocated. + // + page->tlbflush_timestamp = 0; + page->u.free.cpumask = CPU_MASK_NONE; + + if ( type == PGT_l1_shadow ) + { + list_add(&page->list, &d->arch.free_shadow_frames); + perfc_incr(free_l1_pages); + } + else + free_domheap_page(page); +} + +static void +free_writable_pte_predictions(struct domain *d) +{ + int i; + struct shadow_status *x; + + for ( i = 0; i < shadow_ht_buckets; i++ ) + { + u32 count; + unsigned long *gpfn_list; + + /* Skip empty buckets. 
*/ + if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 ) + continue; + + count = 0; + for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next ) + if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred ) + count++; + + gpfn_list = xmalloc_array(unsigned long, count); + count = 0; + for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next ) + if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred ) + gpfn_list[count++] = x->gpfn_and_flags & PGT_mfn_mask; + + while ( count ) + { + count--; + delete_shadow_status(d, gpfn_list[count], 0, PGT_writable_pred); + } + + xfree(gpfn_list); + } +} + +static void free_shadow_ht_entries(struct domain *d) +{ + struct shadow_status *x, *n; + + SH_VLOG("freed tables count=%d l1=%d l2=%d", + d->arch.shadow_page_count, perfc_value(shadow_l1_pages), + perfc_value(shadow_l2_pages)); + + n = d->arch.shadow_ht_extras; + while ( (x = n) != NULL ) + { + d->arch.shadow_extras_count--; + n = *((struct shadow_status **)(&x[shadow_ht_extra_size])); + xfree(x); + } + + d->arch.shadow_ht_extras = NULL; + d->arch.shadow_ht_free = NULL; + + ASSERT(d->arch.shadow_extras_count == 0); + SH_LOG("freed extras, now %d", d->arch.shadow_extras_count); + + if ( d->arch.shadow_dirty_bitmap != NULL ) + { + xfree(d->arch.shadow_dirty_bitmap); + d->arch.shadow_dirty_bitmap = 0; + d->arch.shadow_dirty_bitmap_size = 0; + } + + xfree(d->arch.shadow_ht); + d->arch.shadow_ht = NULL; +} + +static void free_out_of_sync_entries(struct domain *d) +{ + struct out_of_sync_entry *x, *n; + + n = d->arch.out_of_sync_extras; + while ( (x = n) != NULL ) + { + d->arch.out_of_sync_extras_count--; + n = *((struct out_of_sync_entry **)(&x[out_of_sync_extra_size])); + xfree(x); + } + + d->arch.out_of_sync_extras = NULL; + d->arch.out_of_sync_free = NULL; + d->arch.out_of_sync = NULL; + + ASSERT(d->arch.out_of_sync_extras_count == 0); + FSH_LOG("freed extra out_of_sync entries, now %d", + d->arch.out_of_sync_extras_count); +} + +void free_shadow_pages(struct domain *d) +{ + int i; + struct shadow_status *x; + struct vcpu *v; + + /* + * WARNING! The shadow page table must not currently be in use! + * e.g., You are expected to have paused the domain and synchronized CR3. + */ + + if( !d->arch.shadow_ht ) return; + + shadow_audit(d, 1); + + // first, remove any outstanding refs from out_of_sync entries... + // + free_out_of_sync_state(d); + + // second, remove any outstanding refs from v->arch.shadow_table + // and CR3. 
+ // + for_each_vcpu(d, v) + { + if ( pagetable_get_paddr(v->arch.shadow_table) ) + { + put_shadow_ref(pagetable_get_pfn(v->arch.shadow_table)); + v->arch.shadow_table = mk_pagetable(0); + } + + if ( v->arch.monitor_shadow_ref ) + { + put_shadow_ref(v->arch.monitor_shadow_ref); + v->arch.monitor_shadow_ref = 0; + } + } + +#if defined (__i386__) + // For external shadows, remove the monitor table's refs + // + if ( shadow_mode_external(d) ) + { + for_each_vcpu(d, v) + { + l2_pgentry_t *mpl2e = v->arch.monitor_vtable; + + if ( mpl2e ) + { + l2_pgentry_t hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)]; + l2_pgentry_t smfn = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)]; + + if ( l2e_get_flags(hl2e) & _PAGE_PRESENT ) + { + put_shadow_ref(l2e_get_pfn(hl2e)); + mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty(); + } + if ( l2e_get_flags(smfn) & _PAGE_PRESENT ) + { + put_shadow_ref(l2e_get_pfn(smfn)); + mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty(); + } + } + } + } +#endif + // Now, the only refs to shadow pages that are left are from the shadow + // pages themselves. We just unpin the pinned pages, and the rest + // should automatically disappear. + // + // NB: Beware: each explicitly or implicit call to free_shadow_page + // can/will result in the hash bucket getting rewritten out from + // under us... First, collect the list of pinned pages, then + // free them. + // + for ( i = 0; i < shadow_ht_buckets; i++ ) + { + u32 count; + unsigned long *mfn_list; + + /* Skip empty buckets. */ + if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 ) + continue; + + count = 0; + for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next ) + if ( MFN_PINNED(x->smfn) ) + count++; + if ( !count ) + continue; + + mfn_list = xmalloc_array(unsigned long, count); + count = 0; + for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next ) + if ( MFN_PINNED(x->smfn) ) + mfn_list[count++] = x->smfn; + + while ( count ) + { + shadow_unpin(mfn_list[--count]); + } + xfree(mfn_list); + } + + // Now free the pre-zero'ed pages from the domain + // + struct list_head *list_ent, *tmp; + list_for_each_safe(list_ent, tmp, &d->arch.free_shadow_frames) + { + list_del(list_ent); + perfc_decr(free_l1_pages); + + struct pfn_info *page = list_entry(list_ent, struct pfn_info, list); + free_domheap_page(page); + } + + shadow_audit(d, 0); + + SH_LOG("Free shadow table."); +} + +void __shadow_mode_disable(struct domain *d) +{ + if ( unlikely(!shadow_mode_enabled(d)) ) + return; + + /* + * Currently this does not fix up page ref counts, so it is valid to call + * only when a domain is being destroyed. + */ + BUG_ON(!test_bit(_DOMF_dying, &d->domain_flags) && + shadow_mode_refcounts(d)); + d->arch.shadow_tainted_refcnts = shadow_mode_refcounts(d); + + free_shadow_pages(d); + free_writable_pte_predictions(d); + +#ifndef NDEBUG + int i; + for ( i = 0; i < shadow_ht_buckets; i++ ) + { + if ( d->arch.shadow_ht[i].gpfn_and_flags != 0 ) + { + printk("%s: d->arch.shadow_ht[%x].gpfn_and_flags=%lx\n", + __FILE__, i, d->arch.shadow_ht[i].gpfn_and_flags); + BUG(); + } + } +#endif + + d->arch.shadow_mode = 0; + + free_shadow_ht_entries(d); + free_out_of_sync_entries(d); + + struct vcpu *v; + for_each_vcpu(d, v) + { + update_pagetables(v); + } +} + + +static void +free_p2m_table(struct domain *d) +{ + // uh, this needs some work... 
:) + BUG(); +} + + +int __shadow_mode_enable(struct domain *d, unsigned int mode) +{ + struct vcpu *v; + int new_modes = (mode & ~d->arch.shadow_mode); + + // Gotta be adding something to call this function. + ASSERT(new_modes); + + // can't take anything away by calling this function. + ASSERT(!(d->arch.shadow_mode & ~mode)); + +#if defined(CONFIG_PAGING_LEVELS) + if(!shadow_set_guest_paging_levels(d, + CONFIG_PAGING_LEVELS)) { + printk("Unsupported guest paging levels\n"); + domain_crash_synchronous(); /* need to take a clean path */ + } +#endif + + for_each_vcpu(d, v) + { + invalidate_shadow_ldt(v); + + // We need to set these up for __update_pagetables(). + // See the comment there. + + /* + * arch.guest_vtable + */ + if ( v->arch.guest_vtable && + (v->arch.guest_vtable != __linear_l2_table) ) + { + unmap_domain_page(v->arch.guest_vtable); + } + if ( (mode & (SHM_translate | SHM_external)) == SHM_translate ) + v->arch.guest_vtable = __linear_l2_table; + else + v->arch.guest_vtable = NULL; + + /* + * arch.shadow_vtable + */ + if ( v->arch.shadow_vtable && + (v->arch.shadow_vtable != __shadow_linear_l2_table) ) + { + unmap_domain_page(v->arch.shadow_vtable); + } + if ( !(mode & SHM_external) && d->arch.ops->guest_paging_levels == 2) + v->arch.shadow_vtable = __shadow_linear_l2_table; + else + v->arch.shadow_vtable = NULL; + +#if defined (__i386__) + /* + * arch.hl2_vtable + */ + if ( v->arch.hl2_vtable && + (v->arch.hl2_vtable != __linear_hl2_table) ) + { + unmap_domain_page(v->arch.hl2_vtable); + } + if ( (mode & (SHM_translate | SHM_external)) == SHM_translate ) + v->arch.hl2_vtable = __linear_hl2_table; + else + v->arch.hl2_vtable = NULL; +#endif + /* + * arch.monitor_table & arch.monitor_vtable + */ + if ( v->arch.monitor_vtable ) + { + free_monitor_pagetable(v); + } + if ( mode & SHM_external ) + { + alloc_monitor_pagetable(v); + } + } + + if ( new_modes & SHM_enable ) + { + ASSERT( !d->arch.shadow_ht ); + d->arch.shadow_ht = xmalloc_array(struct shadow_status, shadow_ht_buckets); + if ( d->arch.shadow_ht == NULL ) + goto nomem; + + memset(d->arch.shadow_ht, 0, + shadow_ht_buckets * sizeof(struct shadow_status)); + } + + if ( new_modes & SHM_log_dirty ) + { + ASSERT( !d->arch.shadow_dirty_bitmap ); + d->arch.shadow_dirty_bitmap_size = (d->max_pages + 63) & ~63; + d->arch.shadow_dirty_bitmap = + xmalloc_array(unsigned long, d->arch.shadow_dirty_bitmap_size / + (8 * sizeof(unsigned long))); + if ( d->arch.shadow_dirty_bitmap == NULL ) + { + d->arch.shadow_dirty_bitmap_size = 0; + goto nomem; + } + memset(d->arch.shadow_dirty_bitmap, 0, + d->arch.shadow_dirty_bitmap_size/8); + } + + if ( new_modes & SHM_translate ) + { + if ( !(new_modes & SHM_external) ) + { + ASSERT( !pagetable_get_paddr(d->arch.phys_table) ); + if ( !alloc_p2m_table(d) ) + { + printk("alloc_p2m_table failed (out-of-memory?)\n"); + goto nomem; + } + } + else + { + // external guests provide their own memory for their P2M maps. + // + ASSERT( d == page_get_owner( + &frame_table[pagetable_get_pfn(d->arch.phys_table)]) ); + } + } + + printk("audit1\n"); + _audit_domain(d, AUDIT_SHADOW_ALREADY_LOCKED | AUDIT_ERRORS_OK); + printk("audit1 done\n"); + + // Get rid of any shadow pages from any previous shadow mode. + // + free_shadow_pages(d); + + printk("audit2\n"); + _audit_domain(d, AUDIT_SHADOW_ALREADY_LOCKED | AUDIT_ERRORS_OK); + printk("audit2 done\n"); + + /* + * Tear down it's counts by disassembling its page-table-based ref counts. + * Also remove CR3's gcount/tcount. 
+ * That leaves things like GDTs and LDTs and external refs in tact. + * + * Most pages will be writable tcount=0. + * Some will still be L1 tcount=0 or L2 tcount=0. + * Maybe some pages will be type none tcount=0. + * Pages granted external writable refs (via grant tables?) will + * still have a non-zero tcount. That's OK. + * + * gcounts will generally be 1 for PGC_allocated. + * GDTs and LDTs will have additional gcounts. + * Any grant-table based refs will still be in the gcount. + * + * We attempt to grab writable refs to each page (thus setting its type). + * Immediately put back those type refs. + * + * Assert that no pages are left with L1/L2/L3/L4 type. + */ + audit_adjust_pgtables(d, -1, 1); + + d->arch.shadow_mode = mode; + + if ( shadow_mode_refcounts(d) ) + { + struct list_head *list_ent = d->page_list.next; + while ( list_ent != &d->page_list ) + { + struct pfn_info *page = list_entry(list_ent, struct pfn_info, list); + if ( !get_page_type(page, PGT_writable_page) ) + BUG(); + put_page_type(page); + + list_ent = page->list.next; + } + } + + audit_adjust_pgtables(d, 1, 1); + + printk("audit3\n"); + _audit_domain(d, AUDIT_SHADOW_ALREADY_LOCKED | AUDIT_ERRORS_OK); + printk("audit3 done\n"); + + return 0; + + nomem: + if ( (new_modes & SHM_enable) ) + { + xfree(d->arch.shadow_ht); + d->arch.shadow_ht = NULL; + } + if ( (new_modes & SHM_log_dirty) ) + { + xfree(d->arch.shadow_dirty_bitmap); + d->arch.shadow_dirty_bitmap = NULL; + } + if ( (new_modes & SHM_translate) && !(new_modes & SHM_external) && + pagetable_get_paddr(d->arch.phys_table) ) + { + free_p2m_table(d); + } + return -ENOMEM; +} + + +int shadow_mode_enable(struct domain *d, unsigned int mode) +{ + int rc; + shadow_lock(d); + rc = __shadow_mode_enable(d, mode); + shadow_unlock(d); + return rc; +} + +static int shadow_mode_table_op( + struct domain *d, dom0_shadow_control_t *sc) +{ + unsigned int op = sc->op; + int i, rc = 0; + struct vcpu *v; + + ASSERT(shadow_lock_is_acquired(d)); + + SH_VLOG("shadow mode table op %lx %lx count %d", + (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.guest_table), /* XXX SMP */ + (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.shadow_table), /* XXX SMP */ + d->arch.shadow_page_count); + + shadow_audit(d, 1); + + switch ( op ) + { + case DOM0_SHADOW_CONTROL_OP_FLUSH: + free_shadow_pages(d); + + d->arch.shadow_fault_count = 0; + d->arch.shadow_dirty_count = 0; + d->arch.shadow_dirty_net_count = 0; + d->arch.shadow_dirty_block_count = 0; + + break; + + case DOM0_SHADOW_CONTROL_OP_CLEAN: + free_shadow_pages(d); + + sc->stats.fault_count = d->arch.shadow_fault_count; + sc->stats.dirty_count = d->arch.shadow_dirty_count; + sc->stats.dirty_net_count = d->arch.shadow_dirty_net_count; + sc->stats.dirty_block_count = d->arch.shadow_dirty_block_count; + + d->arch.shadow_fault_count = 0; + d->arch.shadow_dirty_count = 0; + d->arch.shadow_dirty_net_count = 0; + d->arch.shadow_dirty_block_count = 0; + + if ( (d->max_pages > sc->pages) || + (sc->dirty_bitmap == NULL) || + (d->arch.shadow_dirty_bitmap == NULL) ) + { + rc = -EINVAL; + break; + } + + sc->pages = d->max_pages; + +#define chunk (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */ + for ( i = 0; i < d->max_pages; i += chunk ) + { + int bytes = ((((d->max_pages - i) > chunk) ? + chunk : (d->max_pages - i)) + 7) / 8; + + if (copy_to_user( + sc->dirty_bitmap + (i/(8*sizeof(unsigned long))), + d->arch.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))), + bytes)) + { + // copy_to_user can fail when copying to guest app memory. 
+ // app should zero buffer after mallocing, and pin it + rc = -EINVAL; + memset( + d->arch.shadow_dirty_bitmap + + (i/(8*sizeof(unsigned long))), + 0, (d->max_pages/8) - (i/(8*sizeof(unsigned long)))); + break; + } + memset( + d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))), + 0, bytes); + } + + break; + + case DOM0_SHADOW_CONTROL_OP_PEEK: + sc->stats.fault_count = d->arch.shadow_fault_count; + sc->stats.dirty_count = d->arch.shadow_dirty_count; + sc->stats.dirty_net_count = d->arch.shadow_dirty_net_count; + sc->stats.dirty_block_count = d->arch.shadow_dirty_block_count; + + if ( (d->max_pages > sc->pages) || + (sc->dirty_bitmap == NULL) || + (d->arch.shadow_dirty_bitmap == NULL) ) + { + rc = -EINVAL; + break; + } + + sc->pages = d->max_pages; + if (copy_to_user( + sc->dirty_bitmap, d->arch.shadow_dirty_bitmap, (d->max_pages+7)/8)) + { + rc = -EINVAL; + break; + } + + break; + + default: + rc = -EINVAL; + break; + } + + SH_VLOG("shadow mode table op : page count %d", d->arch.shadow_page_count); + shadow_audit(d, 1); + + for_each_vcpu(d,v) + __update_pagetables(v); + + return rc; +} + +int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc) +{ + unsigned int op = sc->op; + int rc = 0; + struct vcpu *v; + + if ( unlikely(d == current->domain) ) + { + DPRINTK("Don't try to do a shadow op on yourself!\n"); + return -EINVAL; + } + + domain_pause(d); + + shadow_lock(d); + + switch ( op ) + { + case DOM0_SHADOW_CONTROL_OP_OFF: + __shadow_sync_all(d); + __shadow_mode_disable(d); + break; + + case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST: + free_shadow_pages(d); + rc = __shadow_mode_enable(d, SHM_enable); + break; + + case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY: + free_shadow_pages(d); + rc = __shadow_mode_enable( + d, d->arch.shadow_mode|SHM_enable|SHM_log_dirty); + break; + + case DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE: + free_shadow_pages(d); + rc = __shadow_mode_enable( + d, d->arch.shadow_mode|SHM_enable|SHM_refcounts|SHM_translate); + break; + + default: + rc = shadow_mode_enabled(d) ? 
shadow_mode_table_op(d, sc) : -EINVAL; + break; + } + + shadow_unlock(d); + + for_each_vcpu(d,v) + update_pagetables(v); + + domain_unpause(d); + + return rc; +} + +void shadow_mode_init(void) +{ +} + +int _shadow_mode_refcounts(struct domain *d) +{ + return shadow_mode_refcounts(d); +} + +int +set_p2m_entry(struct domain *d, unsigned long pfn, unsigned long mfn, + struct domain_mmap_cache *l2cache, + struct domain_mmap_cache *l1cache) +{ + unsigned long tabpfn = pagetable_get_pfn(d->arch.phys_table); + l2_pgentry_t *l2, l2e; + l1_pgentry_t *l1; + struct pfn_info *l1page; + unsigned long va = pfn << PAGE_SHIFT; + + ASSERT(tabpfn != 0); + + l2 = map_domain_page_with_cache(tabpfn, l2cache); + l2e = l2[l2_table_offset(va)]; + if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) + { + l1page = alloc_domheap_page(NULL); + if ( !l1page ) + { + unmap_domain_page_with_cache(l2, l2cache); + return 0; + } + + l1 = map_domain_page_with_cache(page_to_pfn(l1page), l1cache); + memset(l1, 0, PAGE_SIZE); + unmap_domain_page_with_cache(l1, l1cache); + + l2e = l2e_from_page(l1page, __PAGE_HYPERVISOR); + l2[l2_table_offset(va)] = l2e; + } + unmap_domain_page_with_cache(l2, l2cache); + + l1 = map_domain_page_with_cache(l2e_get_pfn(l2e), l1cache); + l1[l1_table_offset(va)] = l1e_from_pfn(mfn, __PAGE_HYPERVISOR); + unmap_domain_page_with_cache(l1, l1cache); + + return 1; +} + +int +alloc_p2m_table(struct domain *d) +{ + struct list_head *list_ent; + struct pfn_info *page, *l2page; + l2_pgentry_t *l2; + unsigned long mfn, pfn; + struct domain_mmap_cache l1cache, l2cache; + + l2page = alloc_domheap_page(NULL); + if ( l2page == NULL ) + return 0; + + domain_mmap_cache_init(&l1cache); + domain_mmap_cache_init(&l2cache); + + d->arch.phys_table = mk_pagetable(page_to_phys(l2page)); + l2 = map_domain_page_with_cache(page_to_pfn(l2page), &l2cache); + memset(l2, 0, PAGE_SIZE); + unmap_domain_page_with_cache(l2, &l2cache); + + list_ent = d->page_list.next; + while ( list_ent != &d->page_list ) + { + page = list_entry(list_ent, struct pfn_info, list); + mfn = page_to_pfn(page); + pfn = machine_to_phys_mapping[mfn]; + ASSERT(pfn != INVALID_M2P_ENTRY); + ASSERT(pfn < (1u<<20)); + + set_p2m_entry(d, pfn, mfn, &l2cache, &l1cache); + + list_ent = page->list.next; + } + + list_ent = d->xenpage_list.next; + while ( list_ent != &d->xenpage_list ) + { + page = list_entry(list_ent, struct pfn_info, list); + mfn = page_to_pfn(page); + pfn = machine_to_phys_mapping[mfn]; + if ( (pfn != INVALID_M2P_ENTRY) && + (pfn < (1u<<20)) ) + { + set_p2m_entry(d, pfn, mfn, &l2cache, &l1cache); + } + + list_ent = page->list.next; + } + + domain_mmap_cache_destroy(&l2cache); + domain_mmap_cache_destroy(&l1cache); + + return 1; +} + +void shadow_l1_normal_pt_update( + struct domain *d, + unsigned long pa, l1_pgentry_t gpte, + struct domain_mmap_cache *cache) +{ + unsigned long sl1mfn; + l1_pgentry_t *spl1e, spte; + + shadow_lock(d); + + sl1mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l1_shadow); + if ( sl1mfn ) + { + SH_VVLOG("shadow_l1_normal_pt_update pa=%p, gpte=%" PRIpte, + (void *)pa, l1e_get_intpte(gpte)); + l1pte_propagate_from_guest(current->domain, gpte, &spte); + + spl1e = map_domain_page_with_cache(sl1mfn, cache); + spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)] = spte; + unmap_domain_page_with_cache(spl1e, cache); + } + + shadow_unlock(d); +} + +void shadow_l2_normal_pt_update( + struct domain *d, + unsigned long pa, l2_pgentry_t gpde, + struct domain_mmap_cache *cache) +{ + unsigned long sl2mfn; + l2_pgentry_t *spl2e; + + 
shadow_lock(d); + + sl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l2_shadow); + if ( sl2mfn ) + { + SH_VVLOG("shadow_l2_normal_pt_update pa=%p, gpde=%" PRIpte, + (void *)pa, l2e_get_intpte(gpde)); + spl2e = map_domain_page_with_cache(sl2mfn, cache); + validate_pde_change(d, gpde, + &spl2e[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t)]); + unmap_domain_page_with_cache(spl2e, cache); + } + + shadow_unlock(d); +} + +#if CONFIG_PAGING_LEVELS >= 3 +void shadow_l3_normal_pt_update( + struct domain *d, + unsigned long pa, l3_pgentry_t gpde, + struct domain_mmap_cache *cache) +{ + unsigned long sl3mfn; + pgentry_64_t *spl3e; + + shadow_lock(d); + + sl3mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l3_shadow); + if ( sl3mfn ) + { + SH_VVLOG("shadow_l3_normal_pt_update pa=%p, gpde=%" PRIpte, + (void *)pa, l3e_get_intpte(gpde)); + + spl3e = (pgentry_64_t *) map_domain_page_with_cache(sl3mfn, cache); + validate_entry_change(d, (pgentry_64_t *) &gpde, + &spl3e[(pa & ~PAGE_MASK) / sizeof(l3_pgentry_t)], + shadow_type_to_level(PGT_l3_shadow)); + unmap_domain_page_with_cache(spl3e, cache); + } + + shadow_unlock(d); +} +#endif + +#if CONFIG_PAGING_LEVELS >= 4 +void shadow_l4_normal_pt_update( + struct domain *d, + unsigned long pa, l4_pgentry_t gpde, + struct domain_mmap_cache *cache) +{ + unsigned long sl4mfn; + pgentry_64_t *spl4e; + + shadow_lock(d); + + sl4mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l4_shadow); + if ( sl4mfn ) + { + SH_VVLOG("shadow_l4_normal_pt_update pa=%p, gpde=%" PRIpte, + (void *)pa, l4e_get_intpte(gpde)); + + spl4e = (pgentry_64_t *)map_domain_page_with_cache(sl4mfn, cache); + validate_entry_change(d, (pgentry_64_t *)&gpde, + &spl4e[(pa & ~PAGE_MASK) / sizeof(l4_pgentry_t)], + shadow_type_to_level(PGT_l4_shadow)); + unmap_domain_page_with_cache(spl4e, cache); + } + + shadow_unlock(d); +} +#endif + +static void +translate_l1pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l1mfn) +{ + int i; + l1_pgentry_t *l1; + + l1 = map_domain_page(l1mfn); + for (i = 0; i < L1_PAGETABLE_ENTRIES; i++) + { + if ( is_guest_l1_slot(i) && + (l1e_get_flags(l1[i]) & _PAGE_PRESENT) ) + { + unsigned long mfn = l1e_get_pfn(l1[i]); + unsigned long gpfn = __mfn_to_gpfn(d, mfn); + ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn); + l1[i] = l1e_from_pfn(gpfn, l1e_get_flags(l1[i])); + } + } + unmap_domain_page(l1); +} + +// This is not general enough to handle arbitrary pagetables +// with shared L1 pages, etc., but it is sufficient for bringing +// up dom0. 
+// +void +translate_l2pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn, + unsigned int type) +{ + int i; + l2_pgentry_t *l2; + + ASSERT(shadow_mode_translate(d) && !shadow_mode_external(d)); + + l2 = map_domain_page(l2mfn); + for (i = 0; i < L2_PAGETABLE_ENTRIES; i++) + { + if ( is_guest_l2_slot(type, i) && + (l2e_get_flags(l2[i]) & _PAGE_PRESENT) ) + { + unsigned long mfn = l2e_get_pfn(l2[i]); + unsigned long gpfn = __mfn_to_gpfn(d, mfn); + ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn); + l2[i] = l2e_from_pfn(gpfn, l2e_get_flags(l2[i])); + translate_l1pgtable(d, p2m, mfn); + } + } + unmap_domain_page(l2); +} + +void +remove_shadow(struct domain *d, unsigned long gpfn, u32 stype) +{ + unsigned long smfn; + + //printk("%s(gpfn=%lx, type=%x)\n", __func__, gpfn, stype); + + shadow_lock(d); + + while ( stype >= PGT_l1_shadow ) + { + smfn = __shadow_status(d, gpfn, stype); + if ( smfn && MFN_PINNED(smfn) ) + shadow_unpin(smfn); + stype -= PGT_l1_shadow; + } + + shadow_unlock(d); +} + +unsigned long +gpfn_to_mfn_foreign(struct domain *d, unsigned long gpfn) +{ + ASSERT( shadow_mode_translate(d) ); + + perfc_incrc(gpfn_to_mfn_foreign); + + unsigned long va = gpfn << PAGE_SHIFT; + unsigned long tabpfn = pagetable_get_pfn(d->arch.phys_table); + l2_pgentry_t *l2 = map_domain_page(tabpfn); + l2_pgentry_t l2e = l2[l2_table_offset(va)]; + unmap_domain_page(l2); + if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) + { + printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%lx) => 0 l2e=%" PRIpte "\n", + d->domain_id, gpfn, l2e_get_intpte(l2e)); + return INVALID_MFN; + } + l1_pgentry_t *l1 = map_domain_page(l2e_get_pfn(l2e)); + l1_pgentry_t l1e = l1[l1_table_offset(va)]; + unmap_domain_page(l1); + +#if 0 + printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%lx) => %lx tabpfn=%lx l2e=%lx l1tab=%lx, l1e=%lx\n", + d->domain_id, gpfn, l1_pgentry_val(l1e) >> PAGE_SHIFT, tabpfn, l2e, l1tab, l1e); +#endif + + if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) ) + { + printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%lx) => 0 l1e=%" PRIpte "\n", + d->domain_id, gpfn, l1e_get_intpte(l1e)); + return INVALID_MFN; + } + + return l1e_get_pfn(l1e); +} + +static u32 remove_all_access_in_page( + struct domain *d, unsigned long l1mfn, unsigned long forbidden_gmfn) +{ + l1_pgentry_t *pl1e = map_domain_page(l1mfn); + l1_pgentry_t match; + unsigned long flags = _PAGE_PRESENT; + int i; + u32 count = 0; + int is_l1_shadow = + ((frame_table[l1mfn].u.inuse.type_info & PGT_type_mask) == + PGT_l1_shadow); + + match = l1e_from_pfn(forbidden_gmfn, flags); + + for (i = 0; i < L1_PAGETABLE_ENTRIES; i++) + { + if ( unlikely(!l1e_has_changed(pl1e[i], match, flags) == 0) ) + { + l1_pgentry_t ol2e = pl1e[i]; + pl1e[i] = l1e_empty(); + count++; + + if ( is_l1_shadow ) + shadow_put_page_from_l1e(ol2e, d); + else /* must be an hl2 page */ + put_page(&frame_table[forbidden_gmfn]); + } + } + + unmap_domain_page(pl1e); + + return count; +} + +static u32 __shadow_remove_all_access(struct domain *d, unsigned long forbidden_gmfn) +{ + int i; + struct shadow_status *a; + u32 count = 0; + + if ( unlikely(!shadow_mode_enabled(d)) ) + return 0; + + ASSERT(shadow_lock_is_acquired(d)); + perfc_incrc(remove_all_access); + + for (i = 0; i < shadow_ht_buckets; i++) + { + a = &d->arch.shadow_ht[i]; + while ( a && a->gpfn_and_flags ) + { + switch (a->gpfn_and_flags & PGT_type_mask) + { + case PGT_l1_shadow: + case PGT_l2_shadow: + case PGT_l3_shadow: + case PGT_l4_shadow: + case PGT_hl2_shadow: + count += remove_all_access_in_page(d, a->smfn, forbidden_gmfn); + break; + case 
PGT_snapshot: + case PGT_writable_pred: + // these can't hold refs to the forbidden page + break; + default: + BUG(); + } + + a = a->next; + } + } + + return count; +} + +void shadow_drop_references( + struct domain *d, struct pfn_info *page) +{ + if ( likely(!shadow_mode_refcounts(d)) || + ((page->u.inuse.type_info & PGT_count_mask) == 0) ) + return; + + /* XXX This needs more thought... */ + printk("%s: needing to call __shadow_remove_all_access for mfn=%lx\n", + __func__, page_to_pfn(page)); + printk("Before: mfn=%lx c=%08x t=%08x\n", page_to_pfn(page), + page->count_info, page->u.inuse.type_info); + + shadow_lock(d); + __shadow_remove_all_access(d, page_to_pfn(page)); + shadow_unlock(d); + + printk("After: mfn=%lx c=%08x t=%08x\n", page_to_pfn(page), + page->count_info, page->u.inuse.type_info); +} + +/* XXX Needs more thought. Neither pretty nor fast: a place holder. */ +void shadow_sync_and_drop_references( + struct domain *d, struct pfn_info *page) +{ + if ( likely(!shadow_mode_refcounts(d)) ) + return; + + shadow_lock(d); + + if ( page_out_of_sync(page) ) + __shadow_sync_mfn(d, page_to_pfn(page)); + + __shadow_remove_all_access(d, page_to_pfn(page)); + + shadow_unlock(d); +} diff -r d332d4df452e -r 0bcfd66a431e xen/arch/x86/shadow32.c --- /dev/null Mon Jul 11 09:22:15 2005 +++ b/xen/arch/x86/shadow32.c Mon Jul 11 09:57:38 2005 @@ -0,0 +1,3388 @@ +/****************************************************************************** + * arch/x86/shadow.c + * + * Copyright (c) 2005 Michael A Fetterman + * Based on an earlier implementation by Ian Pratt et al + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + +#include <xen/config.h> +#include <xen/types.h> +#include <xen/mm.h> +#include <xen/domain_page.h> +#include <asm/shadow.h> +#include <asm/page.h> +#include <xen/event.h> +#include <xen/sched.h> +#include <xen/trace.h> + +#define MFN_PINNED(_x) (frame_table[_x].u.inuse.type_info & PGT_pinned) + +static void shadow_free_snapshot(struct domain *d, + struct out_of_sync_entry *entry); +static void remove_out_of_sync_entries(struct domain *d, unsigned long smfn); +static void free_writable_pte_predictions(struct domain *d); + +#if SHADOW_DEBUG +static void mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn); +#endif + +/******** + +There's a per-domain shadow table spin lock which works fine for SMP +hosts. We don't have to worry about interrupts as no shadow operations +happen in an interrupt context. It's probably not quite ready for SMP +guest operation as we have to worry about synchonisation between gpte +and spte updates. Its possible that this might only happen in a +hypercall context, in which case we'll probably at have a per-domain +hypercall lock anyhow (at least initially). 
+ +********/ + +static inline int +shadow_promote(struct domain *d, unsigned long gpfn, unsigned long gmfn, + unsigned long new_type) +{ + struct pfn_info *page = pfn_to_page(gmfn); + int pinned = 0, okay = 1; + + if ( page_out_of_sync(page) ) + { + // Don't know how long ago this snapshot was taken. + // Can't trust it to be recent enough. + // + __shadow_sync_mfn(d, gmfn); + } + + if ( !shadow_mode_refcounts(d) ) + return 1; + + if ( unlikely(page_is_page_table(page)) ) + return 1; + + FSH_LOG("%s: gpfn=%lx gmfn=%lx nt=%08lx", __func__, gpfn, gmfn, new_type); + + if ( !shadow_remove_all_write_access(d, gpfn, gmfn) ) + { + FSH_LOG("%s: couldn't find/remove all write accesses, gpfn=%lx gmfn=%lx", + __func__, gpfn, gmfn); +#if 1 || defined(LIVE_DANGEROUSLY) + set_bit(_PGC_page_table, &page->count_info); + return 1; +#endif + return 0; + + } + + // To convert this page to use as a page table, the writable count + // should now be zero. Test this by grabbing the page as an page table, + // and then immediately releasing. This will also deal with any + // necessary TLB flushing issues for us. + // + // The cruft here about pinning doesn't really work right. This + // needs rethinking/rewriting... Need to gracefully deal with the + // TLB flushes required when promoting a writable page, and also deal + // with any outstanding (external) writable refs to this page (by + // refusing to promote it). The pinning headache complicates this + // code -- it would all get much simpler if we stop using + // shadow_lock() and move the shadow code to BIGLOCK(). + // + if ( unlikely(!get_page(page, d)) ) + BUG(); // XXX -- needs more thought for a graceful failure + if ( unlikely(test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info)) ) + { + pinned = 1; + put_page_and_type(page); + } + if ( get_page_type(page, PGT_base_page_table) ) + { + set_bit(_PGC_page_table, &page->count_info); + put_page_type(page); + } + else + { + printk("shadow_promote: get_page_type failed " + "dom%d gpfn=%lx gmfn=%lx t=%08lx\n", + d->domain_id, gpfn, gmfn, new_type); + okay = 0; + } + + // Now put the type back to writable... + if ( unlikely(!get_page_type(page, PGT_writable_page)) ) + BUG(); // XXX -- needs more thought for a graceful failure + if ( unlikely(pinned) ) + { + if ( unlikely(test_and_set_bit(_PGT_pinned, + &page->u.inuse.type_info)) ) + BUG(); // hmm... someone pinned this again? + } + else + put_page_and_type(page); + + return okay; +} + +static inline void +shadow_demote(struct domain *d, unsigned long gpfn, unsigned long gmfn) +{ + if ( !shadow_mode_refcounts(d) ) + return; + + ASSERT(frame_table[gmfn].count_info & PGC_page_table); + + if ( shadow_max_pgtable_type(d, gpfn, NULL) == PGT_none ) + { + clear_bit(_PGC_page_table, &frame_table[gmfn].count_info); + + if ( page_out_of_sync(pfn_to_page(gmfn)) ) + { + remove_out_of_sync_entries(d, gmfn); + } + } +} + +/* + * Things in shadow mode that collect get_page() refs to the domain's + * pages are: + * - PGC_allocated takes a gen count, just like normal. + * - A writable page can be pinned (paravirtualized guests may consider + * these pages to be L1s or L2s, and don't know the difference). + * Pinning a page takes a gen count (but, for domains in shadow mode, + * it *doesn't* take a type count) + * - CR3 grabs a ref to whatever it points at, just like normal. + * - Shadow mode grabs an initial gen count for itself, as a placehold + * for whatever references will exist. + * - Shadow PTEs that point to a page take a gen count, just like regular + * PTEs. 
However, they don't get a type count, as get_page_type() is + * hardwired to keep writable pages' counts at 1 for domains in shadow + * mode. + * - Whenever we shadow a page, the entry in the shadow hash grabs a + * general ref to the page. + * - Whenever a page goes out of sync, the out of sync entry grabs a + * general ref to the page. + */ +/* + * pfn_info fields for pages allocated as shadow pages: + * + * All 32 bits of count_info are a simple count of refs to this shadow + * from a) other shadow pages, b) current CR3's (aka ed->arch.shadow_table), + * c) if it's a pinned shadow root pgtable, d) outstanding out-of-sync + * references. + * + * u.inuse._domain is left NULL, to prevent accidently allow some random + * domain from gaining permissions to map this page. + * + * u.inuse.type_info & PGT_type_mask remembers what kind of page is being + * shadowed. + * u.inuse.type_info & PGT_mfn_mask holds the mfn of the page being shadowed. + * u.inuse.type_info & PGT_pinned says that an extra reference to this shadow + * is currently exists because this is a shadow of a root page, and we + * don't want to let those disappear just because no CR3 is currently pointing + * at it. + * + * tlbflush_timestamp holds a min & max index of valid page table entries + * within the shadow page. + */ + +static inline unsigned long +alloc_shadow_page(struct domain *d, + unsigned long gpfn, unsigned long gmfn, + u32 psh_type) +{ + struct pfn_info *page; + unsigned long smfn; + int pin = 0; + + // Currently, we only keep pre-zero'ed pages around for use as L1's... + // This will change. Soon. + // + if ( psh_type == PGT_l1_shadow ) + { + if ( !list_empty(&d->arch.free_shadow_frames) ) + { + struct list_head *entry = d->arch.free_shadow_frames.next; + page = list_entry(entry, struct pfn_info, list); + list_del(entry); + perfc_decr(free_l1_pages); + } + else + { + page = alloc_domheap_page(NULL); + void *l1 = map_domain_page(page_to_pfn(page)); + memset(l1, 0, PAGE_SIZE); + unmap_domain_page(l1); + } + } + else + page = alloc_domheap_page(NULL); + + if ( unlikely(page == NULL) ) + { + printk("Couldn't alloc shadow page! dom%d count=%d\n", + d->domain_id, d->arch.shadow_page_count); + printk("Shadow table counts: l1=%d l2=%d hl2=%d snapshot=%d\n", + perfc_value(shadow_l1_pages), + perfc_value(shadow_l2_pages), + perfc_value(hl2_table_pages), + perfc_value(snapshot_pages)); + BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */ + } + + smfn = page_to_pfn(page); + + ASSERT( (gmfn & ~PGT_mfn_mask) == 0 ); + page->u.inuse.type_info = psh_type | gmfn; + page->count_info = 0; + page->tlbflush_timestamp = 0; + + switch ( psh_type ) + { + case PGT_l1_shadow: + if ( !shadow_promote(d, gpfn, gmfn, psh_type) ) + goto fail; + perfc_incr(shadow_l1_pages); + d->arch.shadow_page_count++; + break; + + case PGT_l2_shadow: + if ( !shadow_promote(d, gpfn, gmfn, psh_type) ) + goto fail; + perfc_incr(shadow_l2_pages); + d->arch.shadow_page_count++; + if ( PGT_l2_page_table == PGT_root_page_table ) + pin = 1; + + break; + + case PGT_hl2_shadow: + // Treat an hl2 as an L1 for purposes of promotion. + // For external mode domains, treat them as an L2 for purposes of + // pinning. 
+ // + if ( !shadow_promote(d, gpfn, gmfn, PGT_l1_shadow) ) + goto fail; + perfc_incr(hl2_table_pages); + d->arch.hl2_page_count++; + if ( shadow_mode_external(d) && + (PGT_l2_page_table == PGT_root_page_table) ) + pin = 1; + + break; + + case PGT_snapshot: + perfc_incr(snapshot_pages); + d->arch.snapshot_page_count++; + break; + + default: + printk("Alloc shadow weird page type type=%08x\n", psh_type); + BUG(); + break; + } + + // Don't add a new shadow of something that already has a snapshot. + // + ASSERT( (psh_type == PGT_snapshot) || !mfn_out_of_sync(gmfn) ); + + set_shadow_status(d, gpfn, gmfn, smfn, psh_type); + + if ( pin ) + shadow_pin(smfn); + + return smfn; + + fail: + FSH_LOG("promotion of pfn=%lx mfn=%lx failed! external gnttab refs?", + gpfn, gmfn); + free_domheap_page(page); + return 0; +} + +static void inline +free_shadow_l1_table(struct domain *d, unsigned long smfn) +{ + l1_pgentry_t *pl1e = map_domain_page(smfn); + int i; + struct pfn_info *spage = pfn_to_page(smfn); + u32 min_max = spage->tlbflush_timestamp; + int min = SHADOW_MIN(min_max); + int max = SHADOW_MAX(min_max); + + for ( i = min; i <= max; i++ ) + { + shadow_put_page_from_l1e(pl1e[i], d); + pl1e[i] = l1e_empty(); + } + + unmap_domain_page(pl1e); +} + +static void inline +free_shadow_hl2_table(struct domain *d, unsigned long smfn) +{ + l1_pgentry_t *hl2 = map_domain_page(smfn); + int i, limit; + + SH_VVLOG("%s: smfn=%lx freed", __func__, smfn); + +#ifdef __i386__ + if ( shadow_mode_external(d) ) + limit = L2_PAGETABLE_ENTRIES; + else + limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE; +#else + limit = 0; /* XXX x86/64 XXX */ +#endif + + for ( i = 0; i < limit; i++ ) + { + if ( l1e_get_flags(hl2[i]) & _PAGE_PRESENT ) + put_page(pfn_to_page(l1e_get_pfn(hl2[i]))); + } + + unmap_domain_page(hl2); +} + +static void inline +free_shadow_l2_table(struct domain *d, unsigned long smfn, unsigned int type) +{ + l2_pgentry_t *pl2e = map_domain_page(smfn); + int i, external = shadow_mode_external(d); + + for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) + if ( external || is_guest_l2_slot(type, i) ) + if ( l2e_get_flags(pl2e[i]) & _PAGE_PRESENT ) + put_shadow_ref(l2e_get_pfn(pl2e[i])); + + if ( (PGT_base_page_table == PGT_l2_page_table) && + shadow_mode_translate(d) && !external ) + { + // free the ref to the hl2 + // + put_shadow_ref(l2e_get_pfn(pl2e[l2_table_offset(LINEAR_PT_VIRT_START)])); + } + + unmap_domain_page(pl2e); +} + +void free_shadow_page(unsigned long smfn) +{ + struct pfn_info *page = &frame_table[smfn]; + unsigned long gmfn = page->u.inuse.type_info & PGT_mfn_mask; + struct domain *d = page_get_owner(pfn_to_page(gmfn)); + unsigned long gpfn = __mfn_to_gpfn(d, gmfn); + unsigned long type = page->u.inuse.type_info & PGT_type_mask; + + SH_VVLOG("%s: free'ing smfn=%lx", __func__, smfn); + + ASSERT( ! 
IS_INVALID_M2P_ENTRY(gpfn) ); + + delete_shadow_status(d, gpfn, gmfn, type); + + switch ( type ) + { + case PGT_l1_shadow: + perfc_decr(shadow_l1_pages); + shadow_demote(d, gpfn, gmfn); + free_shadow_l1_table(d, smfn); + break; + + case PGT_l2_shadow: + perfc_decr(shadow_l2_pages); + shadow_demote(d, gpfn, gmfn); + free_shadow_l2_table(d, smfn, page->u.inuse.type_info); + break; + + case PGT_hl2_shadow: + perfc_decr(hl2_table_pages); + shadow_demote(d, gpfn, gmfn); + free_shadow_hl2_table(d, smfn); + break; + + case PGT_snapshot: + perfc_decr(snapshot_pages); + break; + + default: + printk("Free shadow weird page type mfn=%lx type=%08x\n", + page_to_pfn(page), page->u.inuse.type_info); + break; + } + + d->arch.shadow_page_count--; + + // No TLB flushes are needed the next time this page gets allocated. + // + page->tlbflush_timestamp = 0; + page->u.free.cpumask = CPU_MASK_NONE; + + if ( type == PGT_l1_shadow ) + { + list_add(&page->list, &d->arch.free_shadow_frames); + perfc_incr(free_l1_pages); + } + else + free_domheap_page(page); +} + +void +remove_shadow(struct domain *d, unsigned long gpfn, u32 stype) +{ + unsigned long smfn; + + //printk("%s(gpfn=%lx, type=%x)\n", __func__, gpfn, stype); + + shadow_lock(d); + + while ( stype >= PGT_l1_shadow ) + { + smfn = __shadow_status(d, gpfn, stype); + if ( smfn && MFN_PINNED(smfn) ) + shadow_unpin(smfn); + stype -= PGT_l1_shadow; + } + + shadow_unlock(d); +} + +static void inline +release_out_of_sync_entry(struct domain *d, struct out_of_sync_entry *entry) +{ + struct pfn_info *page; + + page = &frame_table[entry->gmfn]; + + // Decrement ref count of guest & shadow pages + // + put_page(page); + + // Only use entries that have low bits clear... + // + if ( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) ) + { + put_shadow_ref(entry->writable_pl1e >> PAGE_SHIFT); + entry->writable_pl1e = -2; + } + else + ASSERT( entry->writable_pl1e == -1 ); + + // Free the snapshot + // + shadow_free_snapshot(d, entry); +} + +static void remove_out_of_sync_entries(struct domain *d, unsigned long gmfn) +{ + struct out_of_sync_entry *entry = d->arch.out_of_sync; + struct out_of_sync_entry **prev = &d->arch.out_of_sync; + struct out_of_sync_entry *found = NULL; + + // NB: Be careful not to call something that manipulates this list + // while walking it. Collect the results into a separate list + // first, then walk that list. + // + while ( entry ) + { + if ( entry->gmfn == gmfn ) + { + // remove from out of sync list + *prev = entry->next; + + // add to found list + entry->next = found; + found = entry; + + entry = *prev; + continue; + } + prev = &entry->next; + entry = entry->next; + } + + prev = NULL; + entry = found; + while ( entry ) + { + release_out_of_sync_entry(d, entry); + + prev = &entry->next; + entry = entry->next; + } + + // Add found list to free list + if ( prev ) + { + *prev = d->arch.out_of_sync_free; + d->arch.out_of_sync_free = found; + } +} + +static void free_out_of_sync_state(struct domain *d) +{ + struct out_of_sync_entry *entry; + + // NB: Be careful not to call something that manipulates this list + // while walking it. Remove one item at a time, and always + // restart from start of list. + // + while ( (entry = d->arch.out_of_sync) ) + { + d->arch.out_of_sync = entry->next; + release_out_of_sync_entry(d, entry); + + entry->next = d->arch.out_of_sync_free; + d->arch.out_of_sync_free = entry; + } +} + +static void free_shadow_pages(struct domain *d) +{ + int i; + struct shadow_status *x; + struct vcpu *v; + + /* + * WARNING! 
The shadow page table must not currently be in use! + * e.g., You are expected to have paused the domain and synchronized CR3. + */ + + if( !d->arch.shadow_ht ) return; + + shadow_audit(d, 1); + + // first, remove any outstanding refs from out_of_sync entries... + // + free_out_of_sync_state(d); + + // second, remove any outstanding refs from v->arch.shadow_table + // and CR3. + // + for_each_vcpu(d, v) + { + if ( pagetable_get_paddr(v->arch.shadow_table) ) + { + put_shadow_ref(pagetable_get_pfn(v->arch.shadow_table)); + v->arch.shadow_table = mk_pagetable(0); + } + + if ( v->arch.monitor_shadow_ref ) + { + put_shadow_ref(v->arch.monitor_shadow_ref); + v->arch.monitor_shadow_ref = 0; + } + } + + // For external shadows, remove the monitor table's refs + // + if ( shadow_mode_external(d) ) + { + for_each_vcpu(d, v) + { + l2_pgentry_t *mpl2e = v->arch.monitor_vtable; + + if ( mpl2e ) + { + l2_pgentry_t hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)]; + l2_pgentry_t smfn = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)]; + + if ( l2e_get_flags(hl2e) & _PAGE_PRESENT ) + { + put_shadow_ref(l2e_get_pfn(hl2e)); + mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty(); + } + if ( l2e_get_flags(smfn) & _PAGE_PRESENT ) + { + put_shadow_ref(l2e_get_pfn(smfn)); + mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty(); + } + } + } + } + + // Now, the only refs to shadow pages that are left are from the shadow + // pages themselves. We just unpin the pinned pages, and the rest + // should automatically disappear. + // + // NB: Beware: each explicitly or implicit call to free_shadow_page + // can/will result in the hash bucket getting rewritten out from + // under us... First, collect the list of pinned pages, then + // free them. + // + for ( i = 0; i < shadow_ht_buckets; i++ ) + { + u32 count; + unsigned long *mfn_list; + + /* Skip empty buckets. 
*/ + if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 ) + continue; + + count = 0; + for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next ) + if ( MFN_PINNED(x->smfn) ) + count++; + if ( !count ) + continue; + + mfn_list = xmalloc_array(unsigned long, count); + count = 0; + for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next ) + if ( MFN_PINNED(x->smfn) ) + mfn_list[count++] = x->smfn; + + while ( count ) + { + shadow_unpin(mfn_list[--count]); + } + xfree(mfn_list); + } + + // Now free the pre-zero'ed pages from the domain + // + struct list_head *list_ent, *tmp; + list_for_each_safe(list_ent, tmp, &d->arch.free_shadow_frames) + { + list_del(list_ent); + perfc_decr(free_l1_pages); + + struct pfn_info *page = list_entry(list_ent, struct pfn_info, list); + free_domheap_page(page); + } + + shadow_audit(d, 0); + + SH_LOG("Free shadow table."); +} + +void shadow_mode_init(void) +{ +} + +int _shadow_mode_refcounts(struct domain *d) +{ + return shadow_mode_refcounts(d); +} + +void alloc_monitor_pagetable(struct vcpu *v) +{ + unsigned long mmfn; + l2_pgentry_t *mpl2e; + struct pfn_info *mmfn_info; + struct domain *d = v->domain; + + ASSERT(pagetable_get_paddr(v->arch.monitor_table) == 0); + + mmfn_info = alloc_domheap_page(NULL); + ASSERT(mmfn_info != NULL); + + mmfn = page_to_pfn(mmfn_info); + mpl2e = (l2_pgentry_t *)map_domain_page(mmfn); + memset(mpl2e, 0, PAGE_SIZE); + +#ifdef __i386__ /* XXX screws x86/64 build */ + memcpy(&mpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], + &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], + HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); +#endif + + mpl2e[l2_table_offset(PERDOMAIN_VIRT_START)] = + l2e_from_paddr(__pa(d->arch.mm_perdomain_pt), + __PAGE_HYPERVISOR); + + // map the phys_to_machine map into the Read-Only MPT space for this domain + mpl2e[l2_table_offset(RO_MPT_VIRT_START)] = + l2e_from_paddr(pagetable_get_paddr(d->arch.phys_table), + __PAGE_HYPERVISOR); + + // Don't (yet) have mappings for these... + // Don't want to accidentally see the idle_pg_table's linear mapping. + // + mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = l2e_empty(); + mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = l2e_empty(); + + v->arch.monitor_table = mk_pagetable(mmfn << PAGE_SHIFT); + v->arch.monitor_vtable = mpl2e; +} + +/* + * Free the pages for monitor_table and hl2_table + */ +void free_monitor_pagetable(struct vcpu *v) +{ + l2_pgentry_t *mpl2e, hl2e, sl2e; + unsigned long mfn; + + ASSERT( pagetable_get_paddr(v->arch.monitor_table) ); + + mpl2e = v->arch.monitor_vtable; + + /* + * First get the mfn for hl2_table by looking at monitor_table + */ + hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)]; + if ( l2e_get_flags(hl2e) & _PAGE_PRESENT ) + { + mfn = l2e_get_pfn(hl2e); + ASSERT(mfn); + put_shadow_ref(mfn); + } + + sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)]; + if ( l2e_get_flags(sl2e) & _PAGE_PRESENT ) + { + mfn = l2e_get_pfn(sl2e); + ASSERT(mfn); + put_shadow_ref(mfn); + } + + unmap_domain_page(mpl2e); + + /* + * Then free monitor_table. 
+ */ + mfn = pagetable_get_pfn(v->arch.monitor_table); + free_domheap_page(&frame_table[mfn]); + + v->arch.monitor_table = mk_pagetable(0); + v->arch.monitor_vtable = 0; +} + +int +set_p2m_entry(struct domain *d, unsigned long pfn, unsigned long mfn, + struct domain_mmap_cache *l2cache, + struct domain_mmap_cache *l1cache) +{ + unsigned long tabpfn = pagetable_get_pfn(d->arch.phys_table); + l2_pgentry_t *l2, l2e; + l1_pgentry_t *l1; + struct pfn_info *l1page; + unsigned long va = pfn << PAGE_SHIFT; + + ASSERT(tabpfn != 0); + + l2 = map_domain_page_with_cache(tabpfn, l2cache); + l2e = l2[l2_table_offset(va)]; + if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) + { + l1page = alloc_domheap_page(NULL); + if ( !l1page ) + { + unmap_domain_page_with_cache(l2, l2cache); + return 0; + } + + l1 = map_domain_page_with_cache(page_to_pfn(l1page), l1cache); + memset(l1, 0, PAGE_SIZE); + unmap_domain_page_with_cache(l1, l1cache); + + l2e = l2e_from_page(l1page, __PAGE_HYPERVISOR); + l2[l2_table_offset(va)] = l2e; + } + unmap_domain_page_with_cache(l2, l2cache); + + l1 = map_domain_page_with_cache(l2e_get_pfn(l2e), l1cache); + l1[l1_table_offset(va)] = l1e_from_pfn(mfn, __PAGE_HYPERVISOR); + unmap_domain_page_with_cache(l1, l1cache); + + return 1; +} + +static int +alloc_p2m_table(struct domain *d) +{ + struct list_head *list_ent; + struct pfn_info *page, *l2page; + l2_pgentry_t *l2; + unsigned long mfn, pfn; + struct domain_mmap_cache l1cache, l2cache; + + l2page = alloc_domheap_page(NULL); + if ( l2page == NULL ) + return 0; + + domain_mmap_cache_init(&l1cache); + domain_mmap_cache_init(&l2cache); + + d->arch.phys_table = mk_pagetable(page_to_phys(l2page)); + l2 = map_domain_page_with_cache(page_to_pfn(l2page), &l2cache); + memset(l2, 0, PAGE_SIZE); + unmap_domain_page_with_cache(l2, &l2cache); + + list_ent = d->page_list.next; + while ( list_ent != &d->page_list ) + { + page = list_entry(list_ent, struct pfn_info, list); + mfn = page_to_pfn(page); + pfn = machine_to_phys_mapping[mfn]; + ASSERT(pfn != INVALID_M2P_ENTRY); + ASSERT(pfn < (1u<<20)); + + set_p2m_entry(d, pfn, mfn, &l2cache, &l1cache); + + list_ent = page->list.next; + } + + list_ent = d->xenpage_list.next; + while ( list_ent != &d->xenpage_list ) + { + page = list_entry(list_ent, struct pfn_info, list); + mfn = page_to_pfn(page); + pfn = machine_to_phys_mapping[mfn]; + if ( (pfn != INVALID_M2P_ENTRY) && + (pfn < (1u<<20)) ) + { + set_p2m_entry(d, pfn, mfn, &l2cache, &l1cache); + } + + list_ent = page->list.next; + } + + domain_mmap_cache_destroy(&l2cache); + domain_mmap_cache_destroy(&l1cache); + + return 1; +} + +static void +free_p2m_table(struct domain *d) +{ + // uh, this needs some work... :) + BUG(); +} + +int __shadow_mode_enable(struct domain *d, unsigned int mode) +{ + struct vcpu *v; + int new_modes = (mode & ~d->arch.shadow_mode); + + // Gotta be adding something to call this function. + ASSERT(new_modes); + + // can't take anything away by calling this function. + ASSERT(!(d->arch.shadow_mode & ~mode)); + + for_each_vcpu(d, v) + { + invalidate_shadow_ldt(v); + + // We need to set these up for __update_pagetables(). + // See the comment there. 
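The per-vcpu setup that follows keys everything off the SHM_* mode bits, and it repeatedly uses the test (mode & (SHM_translate | SHM_external)) == SHM_translate, which reads as "translate is enabled but external is not": both bits are masked out and the result must be exactly the translate bit. A quick standalone illustration of that idiom (the bit values below are made up, not Xen's):

    #include <stdio.h>

    #define SHM_enable     (1u << 0)   /* illustrative bit assignments only */
    #define SHM_translate  (1u << 1)
    #define SHM_external   (1u << 2)

    /* True iff translate is set and external is clear. */
    static int translate_only(unsigned int mode)
    {
        return (mode & (SHM_translate | SHM_external)) == SHM_translate;
    }

    int main(void)
    {
        printf("%d\n", translate_only(SHM_enable | SHM_translate));                /* 1 */
        printf("%d\n", translate_only(SHM_enable | SHM_translate | SHM_external)); /* 0 */
        return 0;
    }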
+ + /* + * arch.guest_vtable + */ + if ( v->arch.guest_vtable && + (v->arch.guest_vtable != __linear_l2_table) ) + { + unmap_domain_page(v->arch.guest_vtable); + } + if ( (mode & (SHM_translate | SHM_external)) == SHM_translate ) + v->arch.guest_vtable = __linear_l2_table; + else + v->arch.guest_vtable = NULL; + + /* + * arch.shadow_vtable + */ + if ( v->arch.shadow_vtable && + (v->arch.shadow_vtable != __shadow_linear_l2_table) ) + { + unmap_domain_page(v->arch.shadow_vtable); + } + if ( !(mode & SHM_external) ) + v->arch.shadow_vtable = __shadow_linear_l2_table; + else + v->arch.shadow_vtable = NULL; + + /* + * arch.hl2_vtable + */ + if ( v->arch.hl2_vtable && + (v->arch.hl2_vtable != __linear_hl2_table) ) + { + unmap_domain_page(v->arch.hl2_vtable); + } + if ( (mode & (SHM_translate | SHM_external)) == SHM_translate ) + v->arch.hl2_vtable = __linear_hl2_table; + else + v->arch.hl2_vtable = NULL; + + /* + * arch.monitor_table & arch.monitor_vtable + */ + if ( v->arch.monitor_vtable ) + { + free_monitor_pagetable(v); + } + if ( mode & SHM_external ) + { + alloc_monitor_pagetable(v); + } + } + + if ( new_modes & SHM_enable ) + { + ASSERT( !d->arch.shadow_ht ); + d->arch.shadow_ht = xmalloc_array(struct shadow_status, shadow_ht_buckets); + if ( d->arch.shadow_ht == NULL ) + goto nomem; + + memset(d->arch.shadow_ht, 0, + shadow_ht_buckets * sizeof(struct shadow_status)); + } + + if ( new_modes & SHM_log_dirty ) + { + ASSERT( !d->arch.shadow_dirty_bitmap ); + d->arch.shadow_dirty_bitmap_size = (d->max_pages + 63) & ~63; + d->arch.shadow_dirty_bitmap = + xmalloc_array(unsigned long, d->arch.shadow_dirty_bitmap_size / + (8 * sizeof(unsigned long))); + if ( d->arch.shadow_dirty_bitmap == NULL ) + { + d->arch.shadow_dirty_bitmap_size = 0; + goto nomem; + } + memset(d->arch.shadow_dirty_bitmap, 0, + d->arch.shadow_dirty_bitmap_size/8); + } + + if ( new_modes & SHM_translate ) + { + if ( !(new_modes & SHM_external) ) + { + ASSERT( !pagetable_get_paddr(d->arch.phys_table) ); + if ( !alloc_p2m_table(d) ) + { + printk("alloc_p2m_table failed (out-of-memory?)\n"); + goto nomem; + } + } + else + { + // external guests provide their own memory for their P2M maps. + // + ASSERT( d == page_get_owner( + &frame_table[pagetable_get_pfn(d->arch.phys_table)]) ); + } + } + + printk("audit1\n"); + _audit_domain(d, AUDIT_SHADOW_ALREADY_LOCKED | AUDIT_ERRORS_OK); + printk("audit1 done\n"); + + // Get rid of any shadow pages from any previous shadow mode. + // + free_shadow_pages(d); + + printk("audit2\n"); + _audit_domain(d, AUDIT_SHADOW_ALREADY_LOCKED | AUDIT_ERRORS_OK); + printk("audit2 done\n"); + + /* + * Tear down it's counts by disassembling its page-table-based ref counts. + * Also remove CR3's gcount/tcount. + * That leaves things like GDTs and LDTs and external refs in tact. + * + * Most pages will be writable tcount=0. + * Some will still be L1 tcount=0 or L2 tcount=0. + * Maybe some pages will be type none tcount=0. + * Pages granted external writable refs (via grant tables?) will + * still have a non-zero tcount. That's OK. + * + * gcounts will generally be 1 for PGC_allocated. + * GDTs and LDTs will have additional gcounts. + * Any grant-table based refs will still be in the gcount. + * + * We attempt to grab writable refs to each page (thus setting its type). + * Immediately put back those type refs. + * + * Assert that no pages are left with L1/L2/L3/L4 type. 
+ */ + audit_adjust_pgtables(d, -1, 1); + + d->arch.shadow_mode = mode; + + if ( shadow_mode_refcounts(d) ) + { + struct list_head *list_ent = d->page_list.next; + while ( list_ent != &d->page_list ) + { + struct pfn_info *page = list_entry(list_ent, struct pfn_info, list); + if ( !get_page_type(page, PGT_writable_page) ) + BUG(); + put_page_type(page); + + list_ent = page->list.next; + } + } + + audit_adjust_pgtables(d, 1, 1); + + printk("audit3\n"); + _audit_domain(d, AUDIT_SHADOW_ALREADY_LOCKED | AUDIT_ERRORS_OK); + printk("audit3 done\n"); + + return 0; + + nomem: + if ( (new_modes & SHM_enable) ) + { + xfree(d->arch.shadow_ht); + d->arch.shadow_ht = NULL; + } + if ( (new_modes & SHM_log_dirty) ) + { + xfree(d->arch.shadow_dirty_bitmap); + d->arch.shadow_dirty_bitmap = NULL; + } + if ( (new_modes & SHM_translate) && !(new_modes & SHM_external) && + pagetable_get_paddr(d->arch.phys_table) ) + { + free_p2m_table(d); + } + return -ENOMEM; +} + +int shadow_mode_enable(struct domain *d, unsigned int mode) +{ + int rc; + shadow_lock(d); + rc = __shadow_mode_enable(d, mode); + shadow_unlock(d); + return rc; +} + +static void +translate_l1pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l1mfn) +{ + int i; + l1_pgentry_t *l1; + + l1 = map_domain_page(l1mfn); + for (i = 0; i < L1_PAGETABLE_ENTRIES; i++) + { + if ( is_guest_l1_slot(i) && + (l1e_get_flags(l1[i]) & _PAGE_PRESENT) ) + { + unsigned long mfn = l1e_get_pfn(l1[i]); + unsigned long gpfn = __mfn_to_gpfn(d, mfn); + ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn); + l1[i] = l1e_from_pfn(gpfn, l1e_get_flags(l1[i])); + } + } + unmap_domain_page(l1); +} + +// This is not general enough to handle arbitrary pagetables +// with shared L1 pages, etc., but it is sufficient for bringing +// up dom0. +// +void +translate_l2pgtable(struct domain *d, l1_pgentry_t *p2m, unsigned long l2mfn, + unsigned int type) +{ + int i; + l2_pgentry_t *l2; + + ASSERT(shadow_mode_translate(d) && !shadow_mode_external(d)); + + l2 = map_domain_page(l2mfn); + for (i = 0; i < L2_PAGETABLE_ENTRIES; i++) + { + if ( is_guest_l2_slot(type, i) && + (l2e_get_flags(l2[i]) & _PAGE_PRESENT) ) + { + unsigned long mfn = l2e_get_pfn(l2[i]); + unsigned long gpfn = __mfn_to_gpfn(d, mfn); + ASSERT(l1e_get_pfn(p2m[gpfn]) == mfn); + l2[i] = l2e_from_pfn(gpfn, l2e_get_flags(l2[i])); + translate_l1pgtable(d, p2m, mfn); + } + } + unmap_domain_page(l2); +} + +static void free_shadow_ht_entries(struct domain *d) +{ + struct shadow_status *x, *n; + + SH_VLOG("freed tables count=%d l1=%d l2=%d", + d->arch.shadow_page_count, perfc_value(shadow_l1_pages), + perfc_value(shadow_l2_pages)); + + n = d->arch.shadow_ht_extras; + while ( (x = n) != NULL ) + { + d->arch.shadow_extras_count--; + n = *((struct shadow_status **)(&x[shadow_ht_extra_size])); + xfree(x); + } + + d->arch.shadow_ht_extras = NULL; + d->arch.shadow_ht_free = NULL; + + ASSERT(d->arch.shadow_extras_count == 0); + SH_LOG("freed extras, now %d", d->arch.shadow_extras_count); + + if ( d->arch.shadow_dirty_bitmap != NULL ) + { + xfree(d->arch.shadow_dirty_bitmap); + d->arch.shadow_dirty_bitmap = 0; + d->arch.shadow_dirty_bitmap_size = 0; + } + + xfree(d->arch.shadow_ht); + d->arch.shadow_ht = NULL; +} + +static void free_out_of_sync_entries(struct domain *d) +{ + struct out_of_sync_entry *x, *n; + + n = d->arch.out_of_sync_extras; + while ( (x = n) != NULL ) + { + d->arch.out_of_sync_extras_count--; + n = *((struct out_of_sync_entry **)(&x[out_of_sync_extra_size])); + xfree(x); + } + + d->arch.out_of_sync_extras = NULL; + 
d->arch.out_of_sync_free = NULL; + d->arch.out_of_sync = NULL; + + ASSERT(d->arch.out_of_sync_extras_count == 0); + FSH_LOG("freed extra out_of_sync entries, now %d", + d->arch.out_of_sync_extras_count); +} + +void __shadow_mode_disable(struct domain *d) +{ + if ( unlikely(!shadow_mode_enabled(d)) ) + return; + + /* + * Currently this does not fix up page ref counts, so it is valid to call + * only when a domain is being destroyed. + */ + BUG_ON(!test_bit(_DOMF_dying, &d->domain_flags) && + shadow_mode_refcounts(d)); + d->arch.shadow_tainted_refcnts = shadow_mode_refcounts(d); + + free_shadow_pages(d); + free_writable_pte_predictions(d); + +#ifndef NDEBUG + int i; + for ( i = 0; i < shadow_ht_buckets; i++ ) + { + if ( d->arch.shadow_ht[i].gpfn_and_flags != 0 ) + { + printk("%s: d->arch.shadow_ht[%x].gpfn_and_flags=%lx\n", + __FILE__, i, d->arch.shadow_ht[i].gpfn_and_flags); + BUG(); + } + } +#endif + + d->arch.shadow_mode = 0; + + free_shadow_ht_entries(d); + free_out_of_sync_entries(d); + + struct vcpu *v; + for_each_vcpu(d, v) + { + update_pagetables(v); + } +} + +static int shadow_mode_table_op( + struct domain *d, dom0_shadow_control_t *sc) +{ + unsigned int op = sc->op; + int i, rc = 0; + struct vcpu *v; + + ASSERT(shadow_lock_is_acquired(d)); + + SH_VLOG("shadow mode table op %lx %lx count %d", + (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.guest_table), /* XXX SMP */ + (unsigned long)pagetable_get_pfn(d->vcpu[0]->arch.shadow_table), /* XXX SMP */ + d->arch.shadow_page_count); + + shadow_audit(d, 1); + + switch ( op ) + { + case DOM0_SHADOW_CONTROL_OP_FLUSH: + free_shadow_pages(d); + + d->arch.shadow_fault_count = 0; + d->arch.shadow_dirty_count = 0; + d->arch.shadow_dirty_net_count = 0; + d->arch.shadow_dirty_block_count = 0; + + break; + + case DOM0_SHADOW_CONTROL_OP_CLEAN: + free_shadow_pages(d); + + sc->stats.fault_count = d->arch.shadow_fault_count; + sc->stats.dirty_count = d->arch.shadow_dirty_count; + sc->stats.dirty_net_count = d->arch.shadow_dirty_net_count; + sc->stats.dirty_block_count = d->arch.shadow_dirty_block_count; + + d->arch.shadow_fault_count = 0; + d->arch.shadow_dirty_count = 0; + d->arch.shadow_dirty_net_count = 0; + d->arch.shadow_dirty_block_count = 0; + + if ( (d->max_pages > sc->pages) || + (sc->dirty_bitmap == NULL) || + (d->arch.shadow_dirty_bitmap == NULL) ) + { + rc = -EINVAL; + break; + } + + sc->pages = d->max_pages; + +#define chunk (8*1024) /* Transfer and clear in 1kB chunks for L1 cache. */ + for ( i = 0; i < d->max_pages; i += chunk ) + { + int bytes = ((((d->max_pages - i) > chunk) ? + chunk : (d->max_pages - i)) + 7) / 8; + + if (copy_to_user( + sc->dirty_bitmap + (i/(8*sizeof(unsigned long))), + d->arch.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))), + bytes)) + { + // copy_to_user can fail when copying to guest app memory. 
+ // app should zero buffer after mallocing, and pin it + rc = -EINVAL; + memset( + d->arch.shadow_dirty_bitmap + + (i/(8*sizeof(unsigned long))), + 0, (d->max_pages/8) - (i/(8*sizeof(unsigned long)))); + break; + } + + memset( + d->arch.shadow_dirty_bitmap + (i/(8*sizeof(unsigned long))), + 0, bytes); + } + + break; + + case DOM0_SHADOW_CONTROL_OP_PEEK: + sc->stats.fault_count = d->arch.shadow_fault_count; + sc->stats.dirty_count = d->arch.shadow_dirty_count; + sc->stats.dirty_net_count = d->arch.shadow_dirty_net_count; + sc->stats.dirty_block_count = d->arch.shadow_dirty_block_count; + + if ( (d->max_pages > sc->pages) || + (sc->dirty_bitmap == NULL) || + (d->arch.shadow_dirty_bitmap == NULL) ) + { + rc = -EINVAL; + break; + } + + sc->pages = d->max_pages; + if (copy_to_user( + sc->dirty_bitmap, d->arch.shadow_dirty_bitmap, (d->max_pages+7)/8)) + { + rc = -EINVAL; + break; + } + + break; + + default: + rc = -EINVAL; + break; + } + + SH_VLOG("shadow mode table op : page count %d", d->arch.shadow_page_count); + shadow_audit(d, 1); + + for_each_vcpu(d,v) + __update_pagetables(v); + + return rc; +} + +int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc) +{ + unsigned int op = sc->op; + int rc = 0; + struct vcpu *v; + + if ( unlikely(d == current->domain) ) + { + DPRINTK("Don't try to do a shadow op on yourself!\n"); + return -EINVAL; + } + + domain_pause(d); + + shadow_lock(d); + + switch ( op ) + { + case DOM0_SHADOW_CONTROL_OP_OFF: + __shadow_sync_all(d); + __shadow_mode_disable(d); + break; + + case DOM0_SHADOW_CONTROL_OP_ENABLE_TEST: + free_shadow_pages(d); + rc = __shadow_mode_enable(d, SHM_enable); + break; + + case DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY: + free_shadow_pages(d); + rc = __shadow_mode_enable( + d, d->arch.shadow_mode|SHM_enable|SHM_log_dirty); + break; + + case DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE: + free_shadow_pages(d); + rc = __shadow_mode_enable( + d, d->arch.shadow_mode|SHM_enable|SHM_refcounts|SHM_translate); + break; + + default: + rc = shadow_mode_enabled(d) ? shadow_mode_table_op(d, sc) : -EINVAL; + break; + } + + shadow_unlock(d); + + for_each_vcpu(d,v) + update_pagetables(v); + + domain_unpause(d); + + return rc; +} + +/* + * XXX KAF: Why is this VMX specific? 
+ */ +void vmx_shadow_clear_state(struct domain *d) +{ + SH_VVLOG("%s:", __func__); + shadow_lock(d); + free_shadow_pages(d); + shadow_unlock(d); + update_pagetables(d->vcpu[0]); +} + +unsigned long +gpfn_to_mfn_foreign(struct domain *d, unsigned long gpfn) +{ + ASSERT( shadow_mode_translate(d) ); + + perfc_incrc(gpfn_to_mfn_foreign); + + unsigned long va = gpfn << PAGE_SHIFT; + unsigned long tabpfn = pagetable_get_pfn(d->arch.phys_table); + l2_pgentry_t *l2 = map_domain_page(tabpfn); + l2_pgentry_t l2e = l2[l2_table_offset(va)]; + unmap_domain_page(l2); + if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) + { + printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%lx) => 0 l2e=%" PRIpte "\n", + d->domain_id, gpfn, l2e_get_intpte(l2e)); + return INVALID_MFN; + } + l1_pgentry_t *l1 = map_domain_page(l2e_get_pfn(l2e)); + l1_pgentry_t l1e = l1[l1_table_offset(va)]; + unmap_domain_page(l1); + +#if 0 + printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%lx) => %lx tabpfn=%lx l2e=%lx l1tab=%lx, l1e=%lx\n", + d->domain_id, gpfn, l1_pgentry_val(l1e) >> PAGE_SHIFT, tabpfn, l2e, l1tab, l1e); +#endif + + if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) ) + { + printk("gpfn_to_mfn_foreign(d->id=%d, gpfn=%lx) => 0 l1e=%" PRIpte "\n", + d->domain_id, gpfn, l1e_get_intpte(l1e)); + return INVALID_MFN; + } + + return l1e_get_pfn(l1e); +} + +static unsigned long +shadow_hl2_table(struct domain *d, unsigned long gpfn, unsigned long gmfn, + unsigned long smfn) +{ + unsigned long hl2mfn; + l1_pgentry_t *hl2; + int limit; + + ASSERT(PGT_base_page_table == PGT_l2_page_table); + + if ( unlikely(!(hl2mfn = alloc_shadow_page(d, gpfn, gmfn, PGT_hl2_shadow))) ) + { + printk("Couldn't alloc an HL2 shadow for pfn=%lx mfn=%lx\n", + gpfn, gmfn); + BUG(); /* XXX Deal gracefully with failure. */ + } + + SH_VVLOG("shadow_hl2_table(gpfn=%lx, gmfn=%lx, smfn=%lx) => %lx", + gpfn, gmfn, smfn, hl2mfn); + perfc_incrc(shadow_hl2_table_count); + + hl2 = map_domain_page(hl2mfn); + +#ifdef __i386__ + if ( shadow_mode_external(d) ) + limit = L2_PAGETABLE_ENTRIES; + else + limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE; +#else + limit = 0; /* XXX x86/64 XXX */ +#endif + + memset(hl2, 0, limit * sizeof(l1_pgentry_t)); + + if ( !shadow_mode_external(d) ) + { + memset(&hl2[DOMAIN_ENTRIES_PER_L2_PAGETABLE], 0, + HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); + + // Setup easy access to the GL2, SL2, and HL2 frames. + // + hl2[l2_table_offset(LINEAR_PT_VIRT_START)] = + l1e_from_pfn(gmfn, __PAGE_HYPERVISOR); + hl2[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = + l1e_from_pfn(smfn, __PAGE_HYPERVISOR); + hl2[l2_table_offset(PERDOMAIN_VIRT_START)] = + l1e_from_pfn(hl2mfn, __PAGE_HYPERVISOR); + } + + unmap_domain_page(hl2); + + return hl2mfn; +} + +/* + * This could take and use a snapshot, and validate the entire page at + * once, or it could continue to fault in entries one at a time... + * Might be worth investigating... + */ +static unsigned long shadow_l2_table( + struct domain *d, unsigned long gpfn, unsigned long gmfn) +{ + unsigned long smfn; + l2_pgentry_t *spl2e; + + SH_VVLOG("shadow_l2_table(gpfn=%lx, gmfn=%lx)", gpfn, gmfn); + + perfc_incrc(shadow_l2_table_count); + + if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_l2_shadow))) ) + { + printk("Couldn't alloc an L2 shadow for pfn=%lx mfn=%lx\n", + gpfn, gmfn); + BUG(); /* XXX Deal gracefully with failure. */ + } + + spl2e = (l2_pgentry_t *)map_domain_page(smfn); + + /* Install hypervisor and 2x linear p.t. mapings. 
*/ + if ( (PGT_base_page_table == PGT_l2_page_table) && + !shadow_mode_external(d) ) + { + /* + * We could proactively fill in PDEs for pages that are already + * shadowed *and* where the guest PDE has _PAGE_ACCESSED set + * (restriction required for coherence of the accessed bit). However, + * we tried it and it didn't help performance. This is simpler. + */ + memset(spl2e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE*sizeof(l2_pgentry_t)); + + /* Install hypervisor and 2x linear p.t. mapings. */ + memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], + &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE], + HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t)); + + spl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = + l2e_from_pfn(smfn, __PAGE_HYPERVISOR); + + spl2e[l2_table_offset(PERDOMAIN_VIRT_START)] = + l2e_from_paddr(__pa(page_get_owner(&frame_table[gmfn])->arch.mm_perdomain_pt), + __PAGE_HYPERVISOR); + + if ( shadow_mode_translate(d) ) // NB: not external + { + unsigned long hl2mfn; + + spl2e[l2_table_offset(RO_MPT_VIRT_START)] = + l2e_from_paddr(pagetable_get_paddr(d->arch.phys_table), + __PAGE_HYPERVISOR); + + if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) ) + hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn); + + // shadow_mode_translate (but not external) sl2 tables hold a + // ref to their hl2. + // + if ( !get_shadow_ref(hl2mfn) ) + BUG(); + + spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = + l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR); + } + else + spl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = + l2e_from_pfn(gmfn, __PAGE_HYPERVISOR); + } + else + { + memset(spl2e, 0, L2_PAGETABLE_ENTRIES*sizeof(l2_pgentry_t)); + } + + unmap_domain_page(spl2e); + + SH_VLOG("shadow_l2_table(%lx -> %lx)", gmfn, smfn); + return smfn; +} + +void shadow_map_l1_into_current_l2(unsigned long va) +{ + struct vcpu *v = current; + struct domain *d = v->domain; + l1_pgentry_t *gpl1e, *spl1e; + l2_pgentry_t gl2e, sl2e; + unsigned long gl1pfn, gl1mfn, sl1mfn; + int i, init_table = 0; + + __guest_get_l2e(v, va, &gl2e); + ASSERT(l2e_get_flags(gl2e) & _PAGE_PRESENT); + gl1pfn = l2e_get_pfn(gl2e); + + if ( !(sl1mfn = __shadow_status(d, gl1pfn, PGT_l1_shadow)) ) + { + /* This L1 is NOT already shadowed so we need to shadow it. */ + SH_VVLOG("4a: l1 not shadowed"); + + gl1mfn = __gpfn_to_mfn(d, gl1pfn); + if ( unlikely(!VALID_MFN(gl1mfn)) ) + { + // Attempt to use an invalid pfn as an L1 page. + // XXX this needs to be more graceful! + BUG(); + } + + if ( unlikely(!(sl1mfn = + alloc_shadow_page(d, gl1pfn, gl1mfn, PGT_l1_shadow))) ) + { + printk("Couldn't alloc an L1 shadow for pfn=%lx mfn=%lx\n", + gl1pfn, gl1mfn); + BUG(); /* XXX Need to deal gracefully with failure. */ + } + + perfc_incrc(shadow_l1_table_count); + init_table = 1; + } + else + { + /* This L1 is shadowed already, but the L2 entry is missing. 
*/ + SH_VVLOG("4b: was shadowed, l2 missing (%lx)", sl1mfn); + } + +#ifndef NDEBUG + l2_pgentry_t old_sl2e; + __shadow_get_l2e(v, va, &old_sl2e); + ASSERT( !(l2e_get_flags(old_sl2e) & _PAGE_PRESENT) ); +#endif + + if ( !get_shadow_ref(sl1mfn) ) + BUG(); + l2pde_general(d, &gl2e, &sl2e, sl1mfn); + __guest_set_l2e(v, va, gl2e); + __shadow_set_l2e(v, va, sl2e); + + if ( init_table ) + { + l1_pgentry_t sl1e; + int index = l1_table_offset(va); + int min = 1, max = 0; + + gpl1e = &(linear_pg_table[l1_linear_offset(va) & + ~(L1_PAGETABLE_ENTRIES-1)]); + + spl1e = &(shadow_linear_pg_table[l1_linear_offset(va) & + ~(L1_PAGETABLE_ENTRIES-1)]); + + for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) + { + l1pte_propagate_from_guest(d, gpl1e[i], &sl1e); + if ( (l1e_get_flags(sl1e) & _PAGE_PRESENT) && + unlikely(!shadow_get_page_from_l1e(sl1e, d)) ) + sl1e = l1e_empty(); + if ( l1e_get_flags(sl1e) == 0 ) + { + // First copy entries from 0 until first invalid. + // Then copy entries from index until first invalid. + // + if ( i < index ) { + i = index - 1; + continue; + } + break; + } + spl1e[i] = sl1e; + if ( unlikely(i < min) ) + min = i; + if ( likely(i > max) ) + max = i; + } + + frame_table[sl1mfn].tlbflush_timestamp = + SHADOW_ENCODE_MIN_MAX(min, max); + } +} + +void shadow_invlpg(struct vcpu *v, unsigned long va) +{ + struct domain *d = v->domain; + l1_pgentry_t gpte, spte; + + ASSERT(shadow_mode_enabled(d)); + + shadow_lock(d); + + __shadow_sync_va(v, va); + + // XXX mafetter: will need to think about 4MB pages... + + // It's not strictly necessary to update the shadow here, + // but it might save a fault later. + // + if (__copy_from_user(&gpte, &linear_pg_table[va >> PAGE_SHIFT], + sizeof(gpte))) { + perfc_incrc(shadow_invlpg_faults); + return; + } + l1pte_propagate_from_guest(d, gpte, &spte); + shadow_set_l1e(va, spte, 1); + + shadow_unlock(d); +} + +struct out_of_sync_entry * +shadow_alloc_oos_entry(struct domain *d) +{ + struct out_of_sync_entry *f, *extra; + unsigned size, i; + + if ( unlikely(d->arch.out_of_sync_free == NULL) ) + { + FSH_LOG("Allocate more fullshadow tuple blocks."); + + size = sizeof(void *) + (out_of_sync_extra_size * sizeof(*f)); + extra = xmalloc_bytes(size); + + /* XXX Should be more graceful here. */ + if ( extra == NULL ) + BUG(); + + memset(extra, 0, size); + + /* Record the allocation block so it can be correctly freed later. */ + d->arch.out_of_sync_extras_count++; + *((struct out_of_sync_entry **)&extra[out_of_sync_extra_size]) = + d->arch.out_of_sync_extras; + d->arch.out_of_sync_extras = &extra[0]; + + /* Thread a free chain through the newly-allocated nodes. */ + for ( i = 0; i < (out_of_sync_extra_size - 1); i++ ) + extra[i].next = &extra[i+1]; + extra[i].next = NULL; + + /* Add the new nodes to the free list. */ + d->arch.out_of_sync_free = &extra[0]; + } + + /* Allocate a new node from the quicklist. 
*/ + f = d->arch.out_of_sync_free; + d->arch.out_of_sync_free = f->next; + + return f; +} + +static inline unsigned long +shadow_make_snapshot( + struct domain *d, unsigned long gpfn, unsigned long gmfn) +{ + unsigned long smfn, sl1mfn = 0; + void *original, *snapshot; + u32 min_max = 0; + int min, max, length; + + if ( test_and_set_bit(_PGC_out_of_sync, &frame_table[gmfn].count_info) ) + { + ASSERT(__shadow_status(d, gpfn, PGT_snapshot)); + return SHADOW_SNAPSHOT_ELSEWHERE; + } + + perfc_incrc(shadow_make_snapshot); + + if ( unlikely(!(smfn = alloc_shadow_page(d, gpfn, gmfn, PGT_snapshot))) ) + { + printk("Couldn't alloc fullshadow snapshot for pfn=%lx mfn=%lx!\n" + "Dom%d snapshot_count_count=%d\n", + gpfn, gmfn, d->domain_id, d->arch.snapshot_page_count); + BUG(); /* XXX FIXME: try a shadow flush to free up some memory. */ + } + + if ( !get_shadow_ref(smfn) ) + BUG(); + + if ( shadow_mode_refcounts(d) && + (shadow_max_pgtable_type(d, gpfn, &sl1mfn) == PGT_l1_shadow) ) + min_max = pfn_to_page(sl1mfn)->tlbflush_timestamp; + pfn_to_page(smfn)->tlbflush_timestamp = min_max; + + min = SHADOW_MIN(min_max); + max = SHADOW_MAX(min_max); + length = max - min + 1; + perfc_incr_histo(snapshot_copies, length, PT_UPDATES); + + min *= sizeof(l1_pgentry_t); + length *= sizeof(l1_pgentry_t); + + original = map_domain_page(gmfn); + snapshot = map_domain_page(smfn); + memcpy(snapshot + min, original + min, length); + unmap_domain_page(original); + unmap_domain_page(snapshot); + + return smfn; +} + +static void +shadow_free_snapshot(struct domain *d, struct out_of_sync_entry *entry) +{ + void *snapshot; + + if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE ) + return; + + // Clear the out_of_sync bit. + // + clear_bit(_PGC_out_of_sync, &frame_table[entry->gmfn].count_info); + + // XXX Need to think about how to protect the domain's + // information less expensively. + // + snapshot = map_domain_page(entry->snapshot_mfn); + memset(snapshot, 0, PAGE_SIZE); + unmap_domain_page(snapshot); + + put_shadow_ref(entry->snapshot_mfn); +} + +struct out_of_sync_entry * +shadow_mark_mfn_out_of_sync(struct vcpu *v, unsigned long gpfn, + unsigned long mfn) +{ + struct domain *d = v->domain; + struct pfn_info *page = &frame_table[mfn]; + struct out_of_sync_entry *entry = shadow_alloc_oos_entry(d); + + ASSERT(shadow_lock_is_acquired(d)); + ASSERT(pfn_valid(mfn)); + +#ifndef NDEBUG + u32 type = page->u.inuse.type_info & PGT_type_mask; + if ( shadow_mode_refcounts(d) ) + { + ASSERT(type == PGT_writable_page); + } + else + { + ASSERT(type && (type < PGT_l4_page_table)); + } +#endif + + FSH_LOG("%s(gpfn=%lx, mfn=%lx) c=%08x t=%08x", __func__, + gpfn, mfn, page->count_info, page->u.inuse.type_info); + + // XXX this will require some more thought... Cross-domain sharing and + // modification of page tables? Hmm... + // + if ( d != page_get_owner(page) ) + BUG(); + + perfc_incrc(shadow_mark_mfn_out_of_sync_calls); + + entry->gpfn = gpfn; + entry->gmfn = mfn; + entry->snapshot_mfn = shadow_make_snapshot(d, gpfn, mfn); + entry->writable_pl1e = -1; + +#if SHADOW_DEBUG + mark_shadows_as_reflecting_snapshot(d, gpfn); +#endif + + // increment guest's ref count to represent the entry in the + // full shadow out-of-sync list. 
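An aside on the allocator used a few lines up: shadow_alloc_oos_entry() grows the out-of-sync pool a block at a time. Each xmalloc'd block holds out_of_sync_extra_size entries plus one trailing pointer that chains the blocks together (which is what free_out_of_sync_entries() walks at teardown), and the fresh entries are threaded onto a per-domain free list. A standalone sketch of the same layout, with illustrative names and sizes:

    #include <stdlib.h>
    #include <string.h>

    #define EXTRA_SIZE 32                 /* entries per block (illustrative) */

    struct entry {
        struct entry *next;               /* free-list / in-use-list link */
        /* ... payload ... */
    };

    static struct entry *free_list;       /* per-"domain" free list */
    static struct entry *blocks;          /* chain of blocks, via the trailing pointer */

    static struct entry *alloc_entry(void)
    {
        if (free_list == NULL) {
            /* One block = EXTRA_SIZE entries followed by a pointer to the previous block. */
            size_t size = EXTRA_SIZE * sizeof(struct entry) + sizeof(struct entry *);
            struct entry *extra = malloc(size);
            if (extra == NULL)
                return NULL;
            memset(extra, 0, size);

            /* Record the block so it can be freed later (the trailing pointer). */
            *(struct entry **)&extra[EXTRA_SIZE] = blocks;
            blocks = extra;

            /* Thread a free chain through the new entries. */
            for (int i = 0; i < EXTRA_SIZE - 1; i++)
                extra[i].next = &extra[i + 1];
            extra[EXTRA_SIZE - 1].next = NULL;
            free_list = &extra[0];
        }

        struct entry *e = free_list;
        free_list = e->next;
        return e;
    }

    int main(void)
    {
        struct entry *e1 = alloc_entry();
        struct entry *e2 = alloc_entry();
        return (e1 && e2) ? 0 : 1;
    }

The Xen version BUG()s on allocation failure rather than returning NULL; the sketch just propagates the failure.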
+ // + get_page(page, d); + + // Add to the out-of-sync list + // + entry->next = d->arch.out_of_sync; + d->arch.out_of_sync = entry; + + return entry; +} + +void shadow_mark_va_out_of_sync( + struct vcpu *v, unsigned long gpfn, unsigned long mfn, unsigned long va) +{ + struct out_of_sync_entry *entry = + shadow_mark_mfn_out_of_sync(v, gpfn, mfn); + l2_pgentry_t sl2e; + + // We need the address of shadow PTE that maps @va. + // It might not exist yet. Make sure it's there. + // + __shadow_get_l2e(v, va, &sl2e); + if ( !(l2e_get_flags(sl2e) & _PAGE_PRESENT) ) + { + // either this L1 isn't shadowed yet, or the shadow isn't linked into + // the current L2. + shadow_map_l1_into_current_l2(va); + __shadow_get_l2e(v, va, &sl2e); + } + ASSERT(l2e_get_flags(sl2e) & _PAGE_PRESENT); + + // NB: this is stored as a machine address. + entry->writable_pl1e = + l2e_get_paddr(sl2e) | (sizeof(l1_pgentry_t) * l1_table_offset(va)); + ASSERT( !(entry->writable_pl1e & (sizeof(l1_pgentry_t)-1)) ); + + // Increment shadow's page count to represent the reference + // inherent in entry->writable_pl1e + // + if ( !get_shadow_ref(l2e_get_pfn(sl2e)) ) + BUG(); + + FSH_LOG("mark_out_of_sync(va=%lx -> writable_pl1e=%lx)", + va, entry->writable_pl1e); +} + +/* + * Returns 1 if the snapshot for @gmfn exists and its @index'th entry matches. + * Returns 0 otherwise. + */ +static int snapshot_entry_matches( + struct domain *d, l1_pgentry_t *guest_pt, + unsigned long gpfn, unsigned index) +{ + unsigned long smfn = __shadow_status(d, gpfn, PGT_snapshot); + l1_pgentry_t *snapshot, gpte; // could be L1s or L2s or ... + int entries_match; + + perfc_incrc(snapshot_entry_matches_calls); + + if ( !smfn ) + return 0; + + snapshot = map_domain_page(smfn); + + if (__copy_from_user(&gpte, &guest_pt[index], + sizeof(gpte))) + return 0; + + // This could probably be smarter, but this is sufficent for + // our current needs. + // + entries_match = !l1e_has_changed(gpte, snapshot[index], + PAGE_FLAG_MASK); + + unmap_domain_page(snapshot); + +#ifdef PERF_COUNTERS + if ( entries_match ) + perfc_incrc(snapshot_entry_matches_true); +#endif + + return entries_match; +} + +/* + * Returns 1 if va's shadow mapping is out-of-sync. + * Returns 0 otherwise. + */ +int __shadow_out_of_sync(struct vcpu *v, unsigned long va) +{ + struct domain *d = v->domain; + unsigned long l2mfn = pagetable_get_pfn(v->arch.guest_table); + unsigned long l2pfn = __mfn_to_gpfn(d, l2mfn); + l2_pgentry_t l2e; + unsigned long l1pfn, l1mfn; + + ASSERT(shadow_lock_is_acquired(d)); + ASSERT(VALID_M2P(l2pfn)); + + perfc_incrc(shadow_out_of_sync_calls); + + if ( page_out_of_sync(&frame_table[l2mfn]) && + !snapshot_entry_matches(d, (l1_pgentry_t *)v->arch.guest_vtable, + l2pfn, l2_table_offset(va)) ) + return 1; + + __guest_get_l2e(v, va, &l2e); + if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ) + return 0; + + l1pfn = l2e_get_pfn(l2e); + l1mfn = __gpfn_to_mfn(d, l1pfn); + + // If the l1 pfn is invalid, it can't be out of sync... 
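Returning briefly to writable_pl1e, set just above in shadow_mark_va_out_of_sync(): it holds the machine address of the shadow PTE that currently grants write access. Because PTEs are sizeof(l1_pgentry_t)-aligned, a real address always has its low bits clear, so the sentinels -1 ("nothing recorded") and -2 ("already released", set in release_out_of_sync_entry()) can be told apart from genuine addresses by testing those low bits alone. A tiny standalone illustration of the alignment-tagging idiom (names are made up):

    #include <stdint.h>
    #include <stdio.h>

    #define ENTRY_ALIGN 8                 /* e.g. the size of one 64-bit PTE */
    #define NO_ADDR     ((uintptr_t)-1)   /* low bits set => cannot be a real slot */

    /* Real, aligned slot addresses have their low bits clear; sentinels do not. */
    static int is_real_address(uintptr_t v)
    {
        return (v & (ENTRY_ALIGN - 1)) == 0;
    }

    int main(void)
    {
        uint64_t table[4];                          /* 8-byte aligned slots */
        uintptr_t recorded = (uintptr_t)&table[2];  /* a genuine slot address */

        printf("%d %d\n", is_real_address(recorded), is_real_address(NO_ADDR)); /* 1 0 */
        return 0;
    }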
+ if ( !VALID_MFN(l1mfn) ) + return 0; + + if ( page_out_of_sync(&frame_table[l1mfn]) && + !snapshot_entry_matches( + d, &linear_pg_table[l1_linear_offset(va) & ~(L1_PAGETABLE_ENTRIES-1)], + l1pfn, l1_table_offset(va)) ) + return 1; + + return 0; +} + +#define GPFN_TO_GPTEPAGE(_gpfn) ((_gpfn) / (PAGE_SIZE / sizeof(l1_pgentry_t))) +static inline unsigned long +predict_writable_pte_page(struct domain *d, unsigned long gpfn) +{ + return __shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), PGT_writable_pred); +} + +static inline void +increase_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction) +{ + unsigned long score = prediction & PGT_score_mask; + int create = (score == 0); + + // saturating addition + score = (score + (1u << PGT_score_shift)) & PGT_score_mask; + score = score ? score : PGT_score_mask; + + prediction = (prediction & PGT_mfn_mask) | score; + + //printk("increase gpfn=%lx pred=%lx create=%d\n", gpfn, prediction, create); + set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred); + + if ( create ) + perfc_incr(writable_pte_predictions); +} + +static inline void +decrease_writable_pte_prediction(struct domain *d, unsigned long gpfn, unsigned long prediction) +{ + unsigned long score = prediction & PGT_score_mask; + ASSERT(score); + + // divide score by 2... We don't like bad predictions. + // + score = (score >> 1) & PGT_score_mask; + + prediction = (prediction & PGT_mfn_mask) | score; + + //printk("decrease gpfn=%lx pred=%lx score=%lx\n", gpfn, prediction, score); + + if ( score ) + set_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, prediction, PGT_writable_pred); + else + { + delete_shadow_status(d, GPFN_TO_GPTEPAGE(gpfn), 0, PGT_writable_pred); + perfc_decr(writable_pte_predictions); + } +} + +static void +free_writable_pte_predictions(struct domain *d) +{ + int i; + struct shadow_status *x; + + for ( i = 0; i < shadow_ht_buckets; i++ ) + { + u32 count; + unsigned long *gpfn_list; + + /* Skip empty buckets. */ + if ( d->arch.shadow_ht[i].gpfn_and_flags == 0 ) + continue; + + count = 0; + for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next ) + if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred ) + count++; + + gpfn_list = xmalloc_array(unsigned long, count); + count = 0; + for ( x = &d->arch.shadow_ht[i]; x != NULL; x = x->next ) + if ( (x->gpfn_and_flags & PGT_type_mask) == PGT_writable_pred ) + gpfn_list[count++] = x->gpfn_and_flags & PGT_mfn_mask; + + while ( count ) + { + count--; + delete_shadow_status(d, gpfn_list[count], 0, PGT_writable_pred); + } + + xfree(gpfn_list); + } +} + +static u32 remove_all_write_access_in_ptpage( + struct domain *d, unsigned long pt_pfn, unsigned long pt_mfn, + unsigned long readonly_gpfn, unsigned long readonly_gmfn, + u32 max_refs_to_find, unsigned long prediction) +{ + l1_pgentry_t *pt = map_domain_page(pt_mfn); + l1_pgentry_t match; + unsigned long flags = _PAGE_RW | _PAGE_PRESENT; + int i; + u32 found = 0; + int is_l1_shadow = + ((frame_table[pt_mfn].u.inuse.type_info & PGT_type_mask) == + PGT_l1_shadow); + + match = l1e_from_pfn(readonly_gmfn, flags); + + // returns true if all refs have been found and fixed. 
+ // + int fix_entry(int i) + { + l1_pgentry_t old = pt[i]; + l1_pgentry_t new = old; + + l1e_remove_flags(new,_PAGE_RW); + if ( is_l1_shadow && !shadow_get_page_from_l1e(new, d) ) + BUG(); + found++; + pt[i] = new; + if ( is_l1_shadow ) + shadow_put_page_from_l1e(old, d); + +#if 0 + printk("removed write access to pfn=%lx mfn=%lx in smfn=%lx entry %x " + "is_l1_shadow=%d\n", + readonly_gpfn, readonly_gmfn, pt_mfn, i, is_l1_shadow); +#endif + + return (found == max_refs_to_find); + } + + i = readonly_gpfn & (L1_PAGETABLE_ENTRIES - 1); + if ( !l1e_has_changed(pt[i], match, flags) && fix_entry(i) ) + { + perfc_incrc(remove_write_fast_exit); + increase_writable_pte_prediction(d, readonly_gpfn, prediction); + unmap_domain_page(pt); + return found; + } + + for (i = 0; i < L1_PAGETABLE_ENTRIES; i++) + { + if ( unlikely(!l1e_has_changed(pt[i], match, flags)) && fix_entry(i) ) + break; + } + + unmap_domain_page(pt); + + return found; +#undef MATCH_ENTRY +} + +int shadow_remove_all_write_access( + struct domain *d, unsigned long readonly_gpfn, unsigned long readonly_gmfn) +{ + int i; + struct shadow_status *a; + u32 found = 0, fixups, write_refs; + unsigned long prediction, predicted_gpfn, predicted_smfn; + + ASSERT(shadow_lock_is_acquired(d)); + ASSERT(VALID_MFN(readonly_gmfn)); + + perfc_incrc(remove_write_access); + + // If it's not a writable page, then no writable refs can be outstanding. + // + if ( (frame_table[readonly_gmfn].u.inuse.type_info & PGT_type_mask) != + PGT_writable_page ) + { + perfc_incrc(remove_write_not_writable); + return 1; + } + + // How many outstanding writable PTEs for this page are there? + // + write_refs = + (frame_table[readonly_gmfn].u.inuse.type_info & PGT_count_mask); + if ( write_refs && MFN_PINNED(readonly_gmfn) ) + { + write_refs--; + } + + if ( write_refs == 0 ) + { + perfc_incrc(remove_write_no_work); + return 1; + } + + // Before searching all the L1 page tables, check the typical culprit first + // + if ( (prediction = predict_writable_pte_page(d, readonly_gpfn)) ) + { + predicted_gpfn = prediction & PGT_mfn_mask; + if ( (predicted_smfn = __shadow_status(d, predicted_gpfn, PGT_l1_shadow)) && + (fixups = remove_all_write_access_in_ptpage(d, predicted_gpfn, predicted_smfn, readonly_gpfn, readonly_gmfn, write_refs, prediction)) ) + { + found += fixups; + if ( found == write_refs ) + { + perfc_incrc(remove_write_predicted); + return 1; + } + } + else + { + perfc_incrc(remove_write_bad_prediction); + decrease_writable_pte_prediction(d, readonly_gpfn, prediction); + } + } + + // Search all the shadow L1 page tables... 
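Before the exhaustive scan of every shadow L1 (the bucket loop that follows), the code consults the per-page "writable PTE" prediction maintained by increase_writable_pte_prediction() / decrease_writable_pte_prediction() above: a hit does a saturating add on a small score kept in the spare bits of the prediction word, a miss halves it, and the prediction is dropped once the score decays to zero. A standalone sketch of that score policy, with illustrative field widths:

    #include <stdio.h>

    #define SCORE_SHIFT 20
    #define SCORE_MASK  (0xfffu << SCORE_SHIFT)   /* score lives in the high bits */
    #define MFN_MASK    ((1u << SCORE_SHIFT) - 1) /* predicted frame in the low bits */

    /* Hit: saturating add of one unit of confidence. */
    static unsigned int score_hit(unsigned int pred)
    {
        unsigned int score = (pred + (1u << SCORE_SHIFT)) & SCORE_MASK;
        if (score == 0)                       /* wrapped => clamp to the maximum */
            score = SCORE_MASK;
        return (pred & MFN_MASK) | score;
    }

    /* Miss: halve the score; the caller drops the prediction once it reaches 0. */
    static unsigned int score_miss(unsigned int pred)
    {
        unsigned int score = ((pred & SCORE_MASK) >> 1) & SCORE_MASK;
        return (pred & MFN_MASK) | score;
    }

    int main(void)
    {
        unsigned int p = 42;                  /* new prediction for frame 42, score 0 */
        p = score_hit(p);
        p = score_hit(p);
        p = score_miss(p);
        printf("frame=%u score=%u\n", p & MFN_MASK, (p & SCORE_MASK) >> SCORE_SHIFT);
        return 0;
    }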
+ // + for (i = 0; i < shadow_ht_buckets; i++) + { + a = &d->arch.shadow_ht[i]; + while ( a && a->gpfn_and_flags ) + { + if ( (a->gpfn_and_flags & PGT_type_mask) == PGT_l1_shadow ) + { + found += remove_all_write_access_in_ptpage(d, a->gpfn_and_flags & PGT_mfn_mask, a->smfn, readonly_gpfn, readonly_gmfn, write_refs - found, a->gpfn_and_flags & PGT_mfn_mask); + if ( found == write_refs ) + return 1; + } + + a = a->next; + } + } + + FSH_LOG("%s: looking for %d refs, found %d refs", + __func__, write_refs, found); + + return 0; +} + +static u32 remove_all_access_in_page( + struct domain *d, unsigned long l1mfn, unsigned long forbidden_gmfn) +{ + l1_pgentry_t *pl1e = map_domain_page(l1mfn); + l1_pgentry_t match; + unsigned long flags = _PAGE_PRESENT; + int i; + u32 count = 0; + int is_l1_shadow = + ((frame_table[l1mfn].u.inuse.type_info & PGT_type_mask) == + PGT_l1_shadow); + + match = l1e_from_pfn(forbidden_gmfn, flags); + + for (i = 0; i < L1_PAGETABLE_ENTRIES; i++) + { + if ( unlikely(!l1e_has_changed(pl1e[i], match, flags) == 0) ) + { + l1_pgentry_t ol2e = pl1e[i]; + pl1e[i] = l1e_empty(); + count++; + + if ( is_l1_shadow ) + shadow_put_page_from_l1e(ol2e, d); + else /* must be an hl2 page */ + put_page(&frame_table[forbidden_gmfn]); + } + } + + unmap_domain_page(pl1e); + + return count; +} + +u32 shadow_remove_all_access(struct domain *d, unsigned long forbidden_gmfn) +{ + int i; + struct shadow_status *a; + u32 count = 0; + + if ( unlikely(!shadow_mode_enabled(d)) ) + return 0; + + ASSERT(shadow_lock_is_acquired(d)); + perfc_incrc(remove_all_access); + + for (i = 0; i < shadow_ht_buckets; i++) + { + a = &d->arch.shadow_ht[i]; + while ( a && a->gpfn_and_flags ) + { + switch (a->gpfn_and_flags & PGT_type_mask) + { + case PGT_l1_shadow: + case PGT_l2_shadow: + case PGT_l3_shadow: + case PGT_l4_shadow: + case PGT_hl2_shadow: + count += remove_all_access_in_page(d, a->smfn, forbidden_gmfn); + break; + case PGT_snapshot: + case PGT_writable_pred: + // these can't hold refs to the forbidden page + break; + default: + BUG(); + } + + a = a->next; + } + } + + return count; +} + +static int resync_all(struct domain *d, u32 stype) +{ + struct out_of_sync_entry *entry; + unsigned i; + unsigned long smfn; + void *guest, *shadow, *snapshot; + int need_flush = 0, external = shadow_mode_external(d); + int unshadow; + int changed; + + ASSERT(shadow_lock_is_acquired(d)); + + for ( entry = d->arch.out_of_sync; entry; entry = entry->next) + { + if ( entry->snapshot_mfn == SHADOW_SNAPSHOT_ELSEWHERE ) + continue; + + smfn = __shadow_status(d, entry->gpfn, stype); + + if ( !smfn ) + { + if ( shadow_mode_refcounts(d) ) + continue; + + // For light weight shadows, even when no shadow page exists, + // we need to resync the refcounts to the new contents of the + // guest page. + // This only applies when we have writable page tables. + // + if ( !shadow_mode_write_all(d) && + !((stype == PGT_l1_shadow) && + VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) ) + // Page is not writable -- no resync necessary + continue; + } + + FSH_LOG("resyncing t=%08x gpfn=%lx gmfn=%lx smfn=%lx snapshot_mfn=%lx", + stype, entry->gpfn, entry->gmfn, smfn, entry->snapshot_mfn); + + // Compare guest's new contents to its snapshot, validating + // and updating its shadow as appropriate. 
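One detail worth calling out before the per-type cases: for L1 shadows, the resync only compares the slots between the min and max indices recorded when the shadow (or snapshot) was populated, both packed into the page's otherwise-idle tlbflush_timestamp field via SHADOW_ENCODE_MIN_MAX() / SHADOW_MIN() / SHADOW_MAX(). Those macros are defined elsewhere in the tree; purely as an assumption about the shape of the encoding, a standalone version packing two 16-bit bounds into one 32-bit word could look like this:

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative packing only: low 16 bits = min, high 16 bits = max. */
    static uint32_t encode_min_max(unsigned int min, unsigned int max)
    {
        return (uint32_t)(min & 0xffff) | ((uint32_t)(max & 0xffff) << 16);
    }

    static unsigned int decode_min(uint32_t v) { return v & 0xffff; }
    static unsigned int decode_max(uint32_t v) { return v >> 16; }

    int main(void)
    {
        uint32_t window = encode_min_max(3, 200);
        unsigned int checked = 0;

        /* Only the slots inside the window need to be re-examined on resync. */
        for (unsigned int i = decode_min(window); i <= decode_max(window); i++)
            checked++;   /* stand-in for the l1e_has_changed()/validate step */

        printf("checked %u entries\n", checked);
        return 0;
    }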
+ // + guest = map_domain_page(entry->gmfn); + snapshot = map_domain_page(entry->snapshot_mfn); + + if ( smfn ) + shadow = map_domain_page(smfn); + else + shadow = NULL; + + unshadow = 0; + + switch ( stype ) { + case PGT_l1_shadow: + { + l1_pgentry_t *guest1 = guest; + l1_pgentry_t *shadow1 = shadow; + l1_pgentry_t *snapshot1 = snapshot; + + ASSERT(VM_ASSIST(d, VMASST_TYPE_writable_pagetables) || + shadow_mode_write_all(d)); + + if ( !shadow_mode_refcounts(d) ) + revalidate_l1(d, guest1, snapshot1); + + if ( !smfn ) + break; + + u32 min_max_shadow = pfn_to_page(smfn)->tlbflush_timestamp; + int min_shadow = SHADOW_MIN(min_max_shadow); + int max_shadow = SHADOW_MAX(min_max_shadow); + + u32 min_max_snapshot = + pfn_to_page(entry->snapshot_mfn)->tlbflush_timestamp; + int min_snapshot = SHADOW_MIN(min_max_snapshot); + int max_snapshot = SHADOW_MAX(min_max_snapshot); + + changed = 0; + + for ( i = min_shadow; i <= max_shadow; i++ ) + { + if ( (i < min_snapshot) || (i > max_snapshot) || + l1e_has_changed(guest1[i], snapshot1[i], PAGE_FLAG_MASK) ) + { + need_flush |= validate_pte_change(d, guest1[i], &shadow1[i]); + + // can't update snapshots of linear page tables -- they + // are used multiple times... + // + // snapshot[i] = new_pte; + + changed++; + } + } + perfc_incrc(resync_l1); + perfc_incr_histo(wpt_updates, changed, PT_UPDATES); + perfc_incr_histo(l1_entries_checked, max_shadow - min_shadow + 1, PT_UPDATES); + break; + } + case PGT_l2_shadow: + { + int max = -1; + + l2_pgentry_t *guest2 = guest; + l2_pgentry_t *shadow2 = shadow; + l2_pgentry_t *snapshot2 = snapshot; + + ASSERT(shadow_mode_write_all(d)); + BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented + + changed = 0; + for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) + { +#if CONFIG_X86_PAE + BUG(); /* FIXME: need type_info */ +#endif + if ( !is_guest_l2_slot(0,i) && !external ) + continue; + + l2_pgentry_t new_pde = guest2[i]; + if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK)) + { + need_flush |= validate_pde_change(d, new_pde, &shadow2[i]); + + // can't update snapshots of linear page tables -- they + // are used multiple times... + // + // snapshot[i] = new_pde; + + changed++; + } + if ( l2e_get_intpte(new_pde) != 0 ) /* FIXME: check flags? */ + max = i; + + // XXX - This hack works for linux guests. + // Need a better solution long term. + if ( !(l2e_get_flags(new_pde) & _PAGE_PRESENT) && + unlikely(l2e_get_intpte(new_pde) != 0) && + !unshadow && MFN_PINNED(smfn) ) + unshadow = 1; + } + if ( max == -1 ) + unshadow = 1; + perfc_incrc(resync_l2); + perfc_incr_histo(shm_l2_updates, changed, PT_UPDATES); + break; + } + case PGT_hl2_shadow: + { + l2_pgentry_t *guest2 = guest; + l2_pgentry_t *snapshot2 = snapshot; + l1_pgentry_t *shadow2 = shadow; + + ASSERT(shadow_mode_write_all(d)); + BUG_ON(!shadow_mode_refcounts(d)); // not yet implemented + + changed = 0; + for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) + { +#if CONFIG_X86_PAE + BUG(); /* FIXME: need type_info */ +#endif + if ( !is_guest_l2_slot(0, i) && !external ) + continue; + + l2_pgentry_t new_pde = guest2[i]; + if ( l2e_has_changed(new_pde, snapshot2[i], PAGE_FLAG_MASK) ) + { + need_flush |= validate_hl2e_change(d, new_pde, &shadow2[i]); + + // can't update snapshots of linear page tables -- they + // are used multiple times... 
+ // + // snapshot[i] = new_pde; + + changed++; + } + } + perfc_incrc(resync_hl2); + perfc_incr_histo(shm_hl2_updates, changed, PT_UPDATES); + break; + } + default: + BUG(); + } + + if ( smfn ) + unmap_domain_page(shadow); + unmap_domain_page(snapshot); + unmap_domain_page(guest); + + if ( unlikely(unshadow) ) + { + perfc_incrc(unshadow_l2_count); + shadow_unpin(smfn); + if ( unlikely(shadow_mode_external(d)) ) + { + unsigned long hl2mfn; + + if ( (hl2mfn = __shadow_status(d, entry->gpfn, PGT_hl2_shadow)) && + MFN_PINNED(hl2mfn) ) + shadow_unpin(hl2mfn); + } + } + } + + return need_flush; +} + +void __shadow_sync_all(struct domain *d) +{ + struct out_of_sync_entry *entry; + int need_flush = 0; + + perfc_incrc(shadow_sync_all); + + ASSERT(shadow_lock_is_acquired(d)); + + // First, remove all write permissions to the page tables + // + for ( entry = d->arch.out_of_sync; entry; entry = entry->next) + { + // Skip entries that have low bits set... Those aren't + // real PTEs. + // + if ( entry->writable_pl1e & (sizeof(l1_pgentry_t)-1) ) + continue; + + l1_pgentry_t *ppte = (l1_pgentry_t *)( + (char *)map_domain_page(entry->writable_pl1e >> PAGE_SHIFT) + + (entry->writable_pl1e & ~PAGE_MASK)); + l1_pgentry_t opte = *ppte; + l1_pgentry_t npte = opte; + l1e_remove_flags(npte, _PAGE_RW); + + if ( (l1e_get_flags(npte) & _PAGE_PRESENT) && + !shadow_get_page_from_l1e(npte, d) ) + BUG(); + *ppte = npte; + shadow_put_page_from_l1e(opte, d); + + unmap_domain_page(ppte); + } + + // XXX mafetter: SMP + // + // With the current algorithm, we've gotta flush all the TLBs + // before we can safely continue. I don't think we want to + // do it this way, so I think we should consider making + // entirely private copies of the shadow for each vcpu, and/or + // possibly having a mix of private and shared shadow state + // (any path from a PTE that grants write access to an out-of-sync + // page table page needs to be vcpu private). + // +#if 0 // this should be enabled for SMP guests... + flush_tlb_mask(cpu_online_map); +#endif + need_flush = 1; + + // Second, resync all L1 pages, then L2 pages, etc... + // + need_flush |= resync_all(d, PGT_l1_shadow); + if ( shadow_mode_translate(d) ) + need_flush |= resync_all(d, PGT_hl2_shadow); + need_flush |= resync_all(d, PGT_l2_shadow); + + if ( need_flush && !unlikely(shadow_mode_external(d)) ) + local_flush_tlb(); + + free_out_of_sync_state(d); +} + +int shadow_fault(unsigned long va, struct cpu_user_regs *regs) +{ + l1_pgentry_t gpte, spte, orig_gpte; + struct vcpu *v = current; + struct domain *d = v->domain; + l2_pgentry_t gpde; + + spte = l1e_empty(); + + SH_VVLOG("shadow_fault( va=%lx, code=%lu )", + va, (unsigned long)regs->error_code); + perfc_incrc(shadow_fault_calls); + + check_pagetable(v, "pre-sf"); + + /* + * Don't let someone else take the guest's table pages out-of-sync. + */ + shadow_lock(d); + + /* XXX - FIX THIS COMMENT!!! + * STEP 1. Check to see if this fault might have been caused by an + * out-of-sync table page entry, or if we should pass this + * fault onto the guest. + */ + __shadow_sync_va(v, va); + + /* + * STEP 2. Check the guest PTE. + */ + __guest_get_l2e(v, va, &gpde); + if ( unlikely(!(l2e_get_flags(gpde) & _PAGE_PRESENT)) ) + { + SH_VVLOG("shadow_fault - EXIT: L1 not present"); + perfc_incrc(shadow_fault_bail_pde_not_present); + goto fail; + } + + // This can't fault because we hold the shadow lock and we've ensured that + // the mapping is in-sync, so the check of the PDE's present bit, above, + // covers this access. 
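A little further down, shadow_fault() distinguishes read from write faults with regs->error_code & 2. That is just the architectural x86 page-fault error code: bit 0 says the faulting entry was present (a protection fault rather than a not-present fault), bit 1 says the access was a write, bit 2 says it came from user mode. A small reference decoder:

    #include <stdio.h>

    /* x86 page-fault error code bits (architectural). */
    #define PF_PRESENT  (1u << 0)   /* fault on a present page (protection fault) */
    #define PF_WRITE    (1u << 1)   /* write access (else read) */
    #define PF_USER     (1u << 2)   /* fault taken in user mode */

    static void describe_fault(unsigned long error_code)
    {
        printf("%s %s fault on a %s page\n",
               (error_code & PF_USER)    ? "user"    : "supervisor",
               (error_code & PF_WRITE)   ? "write"   : "read",
               (error_code & PF_PRESENT) ? "present" : "not-present");
    }

    int main(void)
    {
        describe_fault(0x2);   /* supervisor write to a not-present page */
        describe_fault(0x7);   /* user write protection fault */
        return 0;
    }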
+ // + orig_gpte = gpte = linear_pg_table[l1_linear_offset(va)]; + if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_PRESENT)) ) + { + SH_VVLOG("shadow_fault - EXIT: gpte not present (%" PRIpte ")", + l1e_get_intpte(gpte)); + perfc_incrc(shadow_fault_bail_pte_not_present); + goto fail; + } + + /* Write fault? */ + if ( regs->error_code & 2 ) + { + int allow_writes = 0; + + if ( unlikely(!(l1e_get_flags(gpte) & _PAGE_RW)) ) + { + if ( shadow_mode_page_writable(d, l1e_get_pfn(gpte)) ) + { + allow_writes = 1; + l1e_add_flags(gpte, _PAGE_RW); + } + else + { + /* Write fault on a read-only mapping. */ + SH_VVLOG("shadow_fault - EXIT: wr fault on RO page (%" PRIpte ")", + l1e_get_intpte(gpte)); + perfc_incrc(shadow_fault_bail_ro_mapping); + goto fail; + } + } + + if ( !l1pte_write_fault(v, &gpte, &spte, va) ) + { + SH_VVLOG("shadow_fault - EXIT: l1pte_write_fault failed"); + perfc_incrc(write_fault_bail); + shadow_unlock(d); + return 0; + } + + if ( allow_writes ) + l1e_remove_flags(gpte, _PAGE_RW); + } + else + { + if ( !l1pte_read_fault(d, &gpte, &spte) ) + { + SH_VVLOG("shadow_fault - EXIT: l1pte_read_fault failed"); + perfc_incrc(read_fault_bail); + shadow_unlock(d); + return 0; + } + } + + /* + * STEP 3. Write the modified shadow PTE and guest PTE back to the tables. + */ + if ( l1e_has_changed(orig_gpte, gpte, PAGE_FLAG_MASK) ) + { + /* XXX Watch out for read-only L2 entries! (not used in Linux). */ + if ( unlikely(__copy_to_user(&linear_pg_table[l1_linear_offset(va)], + &gpte, sizeof(gpte))) ) + { + printk("%s() failed, crashing domain %d " + "due to a read-only L2 page table (gpde=%" PRIpte "), va=%lx\n", + __func__,d->domain_id, l2e_get_intpte(gpde), va); + domain_crash_synchronous(); + } + + // if necessary, record the page table page as dirty + if ( unlikely(shadow_mode_log_dirty(d)) ) + __mark_dirty(d, __gpfn_to_mfn(d, l2e_get_pfn(gpde))); + } + + shadow_set_l1e(va, spte, 1); + + perfc_incrc(shadow_fault_fixed); + d->arch.shadow_fault_count++; + + shadow_unlock(d); + + check_pagetable(v, "post-sf"); + return EXCRET_fault_fixed; + + fail: + shadow_unlock(d); + return 0; +} + +void shadow_l1_normal_pt_update( + struct domain *d, + unsigned long pa, l1_pgentry_t gpte, + struct domain_mmap_cache *cache) +{ + unsigned long sl1mfn; + l1_pgentry_t *spl1e, spte; + + shadow_lock(d); + + sl1mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l1_shadow); + if ( sl1mfn ) + { + SH_VVLOG("shadow_l1_normal_pt_update pa=%p, gpte=%" PRIpte, + (void *)pa, l1e_get_intpte(gpte)); + l1pte_propagate_from_guest(current->domain, gpte, &spte); + + spl1e = map_domain_page_with_cache(sl1mfn, cache); + spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t)] = spte; + unmap_domain_page_with_cache(spl1e, cache); + } + + shadow_unlock(d); +} + +void shadow_l2_normal_pt_update( + struct domain *d, + unsigned long pa, l2_pgentry_t gpde, + struct domain_mmap_cache *cache) +{ + unsigned long sl2mfn; + l2_pgentry_t *spl2e; + + shadow_lock(d); + + sl2mfn = __shadow_status(current->domain, pa >> PAGE_SHIFT, PGT_l2_shadow); + if ( sl2mfn ) + { + SH_VVLOG("shadow_l2_normal_pt_update pa=%p, gpde=%" PRIpte, + (void *)pa, l2e_get_intpte(gpde)); + spl2e = map_domain_page_with_cache(sl2mfn, cache); + validate_pde_change(d, gpde, + &spl2e[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t)]); + unmap_domain_page_with_cache(spl2e, cache); + } + + shadow_unlock(d); +} + +#if CONFIG_PAGING_LEVELS >= 3 +void shadow_l3_normal_pt_update( + struct domain *d, + unsigned long pa, l3_pgentry_t gpde, + struct domain_mmap_cache *cache) +{ + BUG(); // not 
yet implemented +} +#endif + +#if CONFIG_PAGING_LEVELS >= 4 +void shadow_l4_normal_pt_update( + struct domain *d, + unsigned long pa, l4_pgentry_t gpde, + struct domain_mmap_cache *cache) +{ + BUG(); // not yet implemented +} +#endif + +int shadow_do_update_va_mapping(unsigned long va, + l1_pgentry_t val, + struct vcpu *v) +{ + struct domain *d = v->domain; + l1_pgentry_t spte; + int rc = 0; + + shadow_lock(d); + + //printk("%s(va=%p, val=%p)\n", __func__, (void *)va, (void *)l1e_get_intpte(val)); + + // This is actually overkill - we don't need to sync the L1 itself, + // just everything involved in getting to this L1 (i.e. we need + // linear_pg_table[l1_linear_offset(va)] to be in sync)... + // + __shadow_sync_va(v, va); + + l1pte_propagate_from_guest(d, val, &spte); + shadow_set_l1e(va, spte, 0); + + /* + * If we're in log-dirty mode then we need to note that we've updated + * the PTE in the PT-holding page. We need the machine frame number + * for this. + */ + if ( shadow_mode_log_dirty(d) ) + __mark_dirty(d, va_to_l1mfn(v, va)); + +// out: + shadow_unlock(d); + + return rc; +} + + +/* + * What lives where in the 32-bit address space in the various shadow modes, + * and what it uses to get/maintain that mapping. + * + * SHADOW MODE: none enable translate external + * + * 4KB things: + * guest_vtable lin_l2 mapped per gl2 lin_l2 via hl2 mapped per gl2 + * shadow_vtable n/a sh_lin_l2 sh_lin_l2 mapped per gl2 + * hl2_vtable n/a n/a lin_hl2 via hl2 mapped per gl2 + * monitor_vtable n/a n/a n/a mapped once + * + * 4MB things: + * guest_linear lin via gl2 lin via gl2 lin via hl2 lin via hl2 + * shadow_linear n/a sh_lin via sl2 sh_lin via sl2 sh_lin via sl2 + * monitor_linear n/a n/a n/a ??? + * perdomain perdomain perdomain perdomain perdomain + * R/O M2P R/O M2P R/O M2P n/a n/a + * R/W M2P R/W M2P R/W M2P R/W M2P R/W M2P + * P2M n/a n/a R/O M2P R/O M2P + * + * NB: + * update_pagetables(), __update_pagetables(), shadow_mode_enable(), + * shadow_l2_table(), shadow_hl2_table(), and alloc_monitor_pagetable() + * all play a part in maintaining these mappings. + */ +void __update_pagetables(struct vcpu *v) +{ + struct domain *d = v->domain; + unsigned long gmfn = pagetable_get_pfn(v->arch.guest_table); + unsigned long gpfn = __mfn_to_gpfn(d, gmfn); + unsigned long smfn, hl2mfn, old_smfn; + + int max_mode = ( shadow_mode_external(d) ? SHM_external + : shadow_mode_translate(d) ? SHM_translate + : shadow_mode_enabled(d) ? SHM_enable + : 0 ); + + ASSERT( ! IS_INVALID_M2P_ENTRY(gpfn) ); + ASSERT( max_mode ); + + /* + * arch.guest_vtable + */ + if ( max_mode & (SHM_enable | SHM_external) ) + { + if ( likely(v->arch.guest_vtable != NULL) ) + unmap_domain_page(v->arch.guest_vtable); + v->arch.guest_vtable = map_domain_page(gmfn); + } + + /* + * arch.shadow_table + */ + if ( unlikely(!(smfn = __shadow_status(d, gpfn, PGT_base_page_table))) ) + smfn = shadow_l2_table(d, gpfn, gmfn); + if ( !get_shadow_ref(smfn) ) + BUG(); + old_smfn = pagetable_get_pfn(v->arch.shadow_table); + v->arch.shadow_table = mk_pagetable(smfn << PAGE_SHIFT); + if ( old_smfn ) + put_shadow_ref(old_smfn); + + SH_VVLOG("__update_pagetables(gmfn=%lx, smfn=%lx)", gmfn, smfn); + + /* + * arch.shadow_vtable + */ + if ( max_mode == SHM_external ) + { + if ( v->arch.shadow_vtable ) + unmap_domain_page(v->arch.shadow_vtable); + v->arch.shadow_vtable = map_domain_page(smfn); + } + + /* + * arch.hl2_vtable + */ + + // if max_mode == SHM_translate, then the hl2 is already installed + // correctly in its smfn, and there's nothing to do. 
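The shadow_table swap above, and the monitor-table slot fixups just below, both follow the same refcounting discipline: take a reference on the new object first, publish it, and only then drop the reference on the old one, so the count can never transiently reach zero while the page is still reachable. A generic standalone sketch of that acquire-before-release swap (error handling omitted):

    #include <stdlib.h>

    struct obj {
        int refs;
        /* ... payload ... */
    };

    static void get_ref(struct obj *o) { o->refs++; }

    static void put_ref(struct obj *o)
    {
        if (--o->refs == 0)
            free(o);            /* stand-in for freeing the shadow page */
    }

    /* Retarget *slot from its current object to next: acquire before release. */
    static void retarget(struct obj **slot, struct obj *next)
    {
        struct obj *old = *slot;

        get_ref(next);          /* take the new reference first ...          */
        *slot = next;           /* ... then publish the new object ...       */
        if (old != NULL)
            put_ref(old);       /* ... and only then drop the old reference. */
    }

    int main(void)
    {
        struct obj *a = calloc(1, sizeof(*a));   /* refs == 0: no owner yet */
        struct obj *b = calloc(1, sizeof(*b));
        struct obj *slot = NULL;

        retarget(&slot, a);     /* slot takes the only ref on a                 */
        retarget(&slot, b);     /* ref b, publish, then drop a (freeing it)     */
        put_ref(slot);          /* tear down: drop the last ref on b            */
        return 0;
    }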
+ // + if ( max_mode == SHM_external ) + { + if ( unlikely(!(hl2mfn = __shadow_status(d, gpfn, PGT_hl2_shadow))) ) + hl2mfn = shadow_hl2_table(d, gpfn, gmfn, smfn); + if ( v->arch.hl2_vtable ) + unmap_domain_page(v->arch.hl2_vtable); + v->arch.hl2_vtable = map_domain_page(hl2mfn); + } + + /* + * fixup pointers in monitor table, as necessary + */ + if ( max_mode == SHM_external ) + { + l2_pgentry_t *mpl2e = v->arch.monitor_vtable; + l2_pgentry_t old_hl2e = mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)]; + l2_pgentry_t old_sl2e = mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)]; + + ASSERT( shadow_mode_translate(d) ); + + if ( !get_shadow_ref(hl2mfn) ) + BUG(); + mpl2e[l2_table_offset(LINEAR_PT_VIRT_START)] = + l2e_from_pfn(hl2mfn, __PAGE_HYPERVISOR); + if ( l2e_get_flags(old_hl2e) & _PAGE_PRESENT ) + put_shadow_ref(l2e_get_pfn(old_hl2e)); + + if ( !get_shadow_ref(smfn) ) + BUG(); + mpl2e[l2_table_offset(SH_LINEAR_PT_VIRT_START)] = + l2e_from_pfn(smfn, __PAGE_HYPERVISOR); + if ( l2e_get_flags(old_sl2e) & _PAGE_PRESENT ) + put_shadow_ref(l2e_get_pfn(old_sl2e)); + + // XXX - maybe this can be optimized somewhat?? + local_flush_tlb(); + } +} + + +/************************************************************************/ +/************************************************************************/ +/************************************************************************/ + +#if SHADOW_DEBUG + +// The following is entirely for _check_pagetable()'s benefit. +// _check_pagetable() wants to know whether a given entry in a +// shadow page table is supposed to be the shadow of the guest's +// current entry, or the shadow of the entry held in the snapshot +// taken above. +// +// Here, we mark all currently existing entries as reflecting +// the snapshot, above. All other places in xen that update +// the shadow will keep the shadow in sync with the guest's +// entries (via l1pte_propagate_from_guest and friends), which clear +// the SHADOW_REFLECTS_SNAPSHOT bit. +// +static void +mark_shadows_as_reflecting_snapshot(struct domain *d, unsigned long gpfn) +{ + unsigned long smfn; + l1_pgentry_t *l1e; + l2_pgentry_t *l2e; + unsigned i; + + if ( (smfn = __shadow_status(d, gpfn, PGT_l1_shadow)) ) + { + l1e = map_domain_page(smfn); + for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) + if ( is_guest_l1_slot(i) && + (l1e_get_flags(l1e[i]) & _PAGE_PRESENT) ) + l1e_add_flags(l1e[i], SHADOW_REFLECTS_SNAPSHOT); + unmap_domain_page(l1e); + } + + if ( (smfn = __shadow_status(d, gpfn, PGT_l2_shadow)) ) + { + l2e = map_domain_page(smfn); + for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) + if ( is_guest_l2_slot(0, i) && + (l2e_get_flags(l2e[i]) & _PAGE_PRESENT) ) + l2e_add_flags(l2e[i], SHADOW_REFLECTS_SNAPSHOT); + unmap_domain_page(l2e); + } +} + +// BUG: these are not SMP safe... +static int sh_l2_present; +static int sh_l1_present; +char * sh_check_name; +int shadow_status_noswap; + +#define v2m(_v, _adr) ({ \ + unsigned long _a = (unsigned long)(_adr); \ + l2_pgentry_t _pde = shadow_linear_l2_table(_v)[l2_table_offset(_a)]; \ + unsigned long _pa = -1; \ + if ( l2e_get_flags(_pde) & _PAGE_PRESENT ) \ + { \ + l1_pgentry_t _pte; \ + _pte = shadow_linear_pg_table[l1_linear_offset(_a)]; \ + if ( l1e_get_flags(_pte) & _PAGE_PRESENT ) \ + _pa = l1e_get_paddr(_pte); \ + } \ + _pa | (_a & ~PAGE_MASK); \ +}) + +#define FAIL(_f, _a...) 
\ + do { \ + printk("XXX %s-FAIL (%d,%d,%d) " _f " at %s(%d)\n", \ + sh_check_name, level, l2_idx, l1_idx, ## _a, \ + __FILE__, __LINE__); \ + printk("guest_pte=%" PRIpte " eff_guest_pte=%" PRIpte \ + " shadow_pte=%" PRIpte " snapshot_pte=%" PRIpte \ + " &guest=%p &shadow=%p &snap=%p v2m(&guest)=%p" \ + " v2m(&shadow)=%p v2m(&snap)=%p ea=%08x\n", \ + l1e_get_intpte(guest_pte), l1e_get_intpte(eff_guest_pte), \ + l1e_get_intpte(shadow_pte), l1e_get_intpte(snapshot_pte), \ + p_guest_pte, p_shadow_pte, p_snapshot_pte, \ + (void *)v2m(v, p_guest_pte), (void *)v2m(v, p_shadow_pte), \ + (void *)v2m(v, p_snapshot_pte), \ + (l2_idx << L2_PAGETABLE_SHIFT) | \ + (l1_idx << L1_PAGETABLE_SHIFT)); \ + errors++; \ + } while ( 0 ) + +static int check_pte( + struct vcpu *v, + l1_pgentry_t *p_guest_pte, + l1_pgentry_t *p_shadow_pte, + l1_pgentry_t *p_snapshot_pte, + int level, int l2_idx, int l1_idx) +{ + struct domain *d = v->domain; + l1_pgentry_t guest_pte = *p_guest_pte; + l1_pgentry_t shadow_pte = *p_shadow_pte; + l1_pgentry_t snapshot_pte = p_snapshot_pte ? *p_snapshot_pte : l1e_empty(); + l1_pgentry_t eff_guest_pte; + unsigned long mask, eff_guest_pfn, eff_guest_mfn, shadow_mfn; + int errors = 0, guest_writable; + int page_table_page; + + if ( (l1e_get_intpte(shadow_pte) == 0) || + (l1e_get_intpte(shadow_pte) == 0xdeadface) || + (l1e_get_intpte(shadow_pte) == 0x00000E00) ) + return errors; /* always safe */ + + if ( !(l1e_get_flags(shadow_pte) & _PAGE_PRESENT) ) + FAIL("Non zero not present shadow_pte"); + + if ( level == 2 ) sh_l2_present++; + if ( level == 1 ) sh_l1_present++; + + if ( (l1e_get_flags(shadow_pte) & SHADOW_REFLECTS_SNAPSHOT) && p_snapshot_pte ) + eff_guest_pte = snapshot_pte; + else + eff_guest_pte = guest_pte; + + if ( !(l1e_get_flags(eff_guest_pte) & _PAGE_PRESENT) ) + FAIL("Guest not present yet shadow is"); + + mask = ~(_PAGE_GLOBAL|_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|_PAGE_AVAIL|PAGE_MASK); + + if ( ((l1e_get_intpte(shadow_pte) & mask) != (l1e_get_intpte(eff_guest_pte) & mask)) ) + FAIL("Corrupt?"); + + if ( (level == 1) && + (l1e_get_flags(shadow_pte) & _PAGE_DIRTY) && + !(l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY) ) + FAIL("Dirty coherence"); + + if ( (l1e_get_flags(shadow_pte) & _PAGE_ACCESSED) && + !(l1e_get_flags(eff_guest_pte) & _PAGE_ACCESSED) ) + FAIL("Accessed coherence"); + + if ( l1e_get_flags(shadow_pte) & _PAGE_GLOBAL ) + FAIL("global bit set in shadow"); + + eff_guest_pfn = l1e_get_pfn(eff_guest_pte); + eff_guest_mfn = __gpfn_to_mfn(d, eff_guest_pfn); + shadow_mfn = l1e_get_pfn(shadow_pte); + + if ( !VALID_MFN(eff_guest_mfn) && !shadow_mode_refcounts(d) ) + FAIL("%s: invalid eff_guest_pfn=%lx eff_guest_pte=%" PRIpte "\n", + __func__, eff_guest_pfn, l1e_get_intpte(eff_guest_pte)); + + page_table_page = mfn_is_page_table(eff_guest_mfn); + + guest_writable = + (l1e_get_flags(eff_guest_pte) & _PAGE_RW) || + (VM_ASSIST(d, VMASST_TYPE_writable_pagetables) && (level == 1) && mfn_out_of_sync(eff_guest_mfn)); + + if ( (l1e_get_flags(shadow_pte) & _PAGE_RW ) && !guest_writable ) + { + printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=0x%08x page_table_page=%d\n", + eff_guest_pfn, eff_guest_mfn, shadow_mfn, + frame_table[eff_guest_mfn].u.inuse.type_info, + page_table_page); + FAIL("RW coherence"); + } + + if ( (level == 1) && + (l1e_get_flags(shadow_pte) & _PAGE_RW ) && + !(guest_writable && (l1e_get_flags(eff_guest_pte) & _PAGE_DIRTY)) ) + { + printk("eff_guest_pfn=%lx eff_guest_mfn=%lx shadow_mfn=%lx t=0x%08x page_table_page=%d\n", + eff_guest_pfn, 
eff_guest_mfn, shadow_mfn, + frame_table[eff_guest_mfn].u.inuse.type_info, + page_table_page); + FAIL("RW2 coherence"); + } + + if ( eff_guest_mfn == shadow_mfn ) + { + if ( level > 1 ) + FAIL("Linear map ???"); /* XXX this will fail on BSD */ + } + else + { + if ( level < 2 ) + FAIL("Shadow in L1 entry?"); + + if ( level == 2 ) + { + if ( __shadow_status(d, eff_guest_pfn, PGT_l1_shadow) != shadow_mfn ) + FAIL("shadow_mfn problem eff_guest_pfn=%lx shadow_mfn=%lx", eff_guest_pfn, + __shadow_status(d, eff_guest_pfn, PGT_l1_shadow)); + } + else + BUG(); // XXX -- not handled yet. + } + + return errors; +} +#undef FAIL +#undef v2m + +static int check_l1_table( + struct vcpu *v, unsigned long gpfn, + unsigned long gmfn, unsigned long smfn, unsigned l2_idx) +{ + struct domain *d = v->domain; + int i; + unsigned long snapshot_mfn; + l1_pgentry_t *p_guest, *p_shadow, *p_snapshot = NULL; + int errors = 0; + + if ( page_out_of_sync(pfn_to_page(gmfn)) ) + { + snapshot_mfn = __shadow_status(d, gpfn, PGT_snapshot); + ASSERT(snapshot_mfn); + p_snapshot = map_domain_page(snapshot_mfn); + } + + p_guest = map_domain_page(gmfn); + p_shadow = map_domain_page(smfn); + + for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) + errors += check_pte(v, p_guest+i, p_shadow+i, + p_snapshot ? p_snapshot+i : NULL, + 1, l2_idx, i); + + unmap_domain_page(p_shadow); + unmap_domain_page(p_guest); + if ( p_snapshot ) + unmap_domain_page(p_snapshot); + + return errors; +} + +#define FAILPT(_f, _a...) \ + do { \ + printk("XXX FAIL %s-PT " _f "\n", sh_check_name, ## _a ); \ + errors++; \ + } while ( 0 ) + +int check_l2_table( + struct vcpu *v, unsigned long gmfn, unsigned long smfn, int oos_pdes) +{ + struct domain *d = v->domain; + l2_pgentry_t *gpl2e = (l2_pgentry_t *)map_domain_page(gmfn); + l2_pgentry_t *spl2e = (l2_pgentry_t *)map_domain_page(smfn); + l2_pgentry_t match; + int i; + int errors = 0; + int limit; + + if ( !oos_pdes && (page_get_owner(pfn_to_page(gmfn)) != d) ) + FAILPT("domain doesn't own page"); + if ( oos_pdes && (page_get_owner(pfn_to_page(gmfn)) != NULL) ) + FAILPT("bogus owner for snapshot page"); + if ( page_get_owner(pfn_to_page(smfn)) != NULL ) + FAILPT("shadow page mfn=0x%lx is owned by someone, domid=%d", + smfn, page_get_owner(pfn_to_page(smfn))->domain_id); + +#if 0 + if ( memcmp(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], + &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE], + ((SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT) - + DOMAIN_ENTRIES_PER_L2_PAGETABLE) * sizeof(l2_pgentry_t)) ) + { + for ( i = DOMAIN_ENTRIES_PER_L2_PAGETABLE; + i < (SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT); + i++ ) + printk("+++ (%d) %lx %lx\n",i, + l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i])); + FAILPT("hypervisor entries inconsistent"); + } + + if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) != + l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) ) + FAILPT("hypervisor linear map inconsistent"); +#endif + + match = l2e_from_pfn(smfn, __PAGE_HYPERVISOR); + if ( !shadow_mode_external(d) && + l2e_has_changed(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT], + match, PAGE_FLAG_MASK)) + { + FAILPT("hypervisor shadow linear map inconsistent %" PRIpte " %" PRIpte, + l2e_get_intpte(spl2e[SH_LINEAR_PT_VIRT_START >> + L2_PAGETABLE_SHIFT]), + l2e_get_intpte(match)); + } + + match = l2e_from_paddr(__pa(d->arch.mm_perdomain_pt), __PAGE_HYPERVISOR); + if ( !shadow_mode_external(d) && + l2e_has_changed(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT], + match, PAGE_FLAG_MASK)) + { + 
FAILPT("hypervisor per-domain map inconsistent saw %" PRIpte ", expected (va=%p) %" PRIpte, + l2e_get_intpte(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]), + d->arch.mm_perdomain_pt, + l2e_get_intpte(match)); + } + +#ifdef __i386__ + if ( shadow_mode_external(d) ) + limit = L2_PAGETABLE_ENTRIES; + else + limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE; +#else + limit = 0; /* XXX x86/64 XXX */ +#endif + + /* Check the whole L2. */ + for ( i = 0; i < limit; i++ ) + errors += check_pte(v, + (l1_pgentry_t*)(&gpl2e[i]), /* Hmm, dirty ... */ + (l1_pgentry_t*)(&spl2e[i]), + NULL, + 2, i, 0); + + unmap_domain_page(spl2e); + unmap_domain_page(gpl2e); + +#if 1 + if ( errors ) + printk("check_l2_table returning %d errors\n", errors); +#endif + + return errors; +} +#undef FAILPT + +int _check_pagetable(struct vcpu *v, char *s) +{ + struct domain *d = v->domain; + pagetable_t pt = v->arch.guest_table; + unsigned long gptbase = pagetable_get_paddr(pt); + unsigned long ptbase_pfn, smfn; + unsigned long i; + l2_pgentry_t *gpl2e, *spl2e; + unsigned long ptbase_mfn = 0; + int errors = 0, limit, oos_pdes = 0; + + //_audit_domain(d, AUDIT_QUIET); + shadow_lock(d); + + sh_check_name = s; + //SH_VVLOG("%s-PT Audit", s); + sh_l2_present = sh_l1_present = 0; + perfc_incrc(check_pagetable); + + ptbase_mfn = gptbase >> PAGE_SHIFT; + ptbase_pfn = __mfn_to_gpfn(d, ptbase_mfn); + + if ( !(smfn = __shadow_status(d, ptbase_pfn, PGT_base_page_table)) ) + { + printk("%s-PT %lx not shadowed\n", s, gptbase); + goto out; + } + if ( page_out_of_sync(pfn_to_page(ptbase_mfn)) ) + { + ptbase_mfn = __shadow_status(d, ptbase_pfn, PGT_snapshot); + oos_pdes = 1; + ASSERT(ptbase_mfn); + } + + errors += check_l2_table(v, ptbase_mfn, smfn, oos_pdes); + + gpl2e = (l2_pgentry_t *) map_domain_page(ptbase_mfn); + spl2e = (l2_pgentry_t *) map_domain_page(smfn); + + /* Go back and recurse. */ +#ifdef __i386__ + if ( shadow_mode_external(d) ) + limit = L2_PAGETABLE_ENTRIES; + else + limit = DOMAIN_ENTRIES_PER_L2_PAGETABLE; +#else + limit = 0; /* XXX x86/64 XXX */ +#endif + + for ( i = 0; i < limit; i++ ) + { + unsigned long gl1pfn = l2e_get_pfn(gpl2e[i]); + unsigned long gl1mfn = __gpfn_to_mfn(d, gl1pfn); + unsigned long sl1mfn = l2e_get_pfn(spl2e[i]); + + if ( l2e_get_intpte(spl2e[i]) != 0 ) /* FIXME: check flags? */ + { + errors += check_l1_table(v, gl1pfn, gl1mfn, sl1mfn, i); + } + } + + unmap_domain_page(spl2e); + unmap_domain_page(gpl2e); + +#if 0 + SH_VVLOG("PT verified : l2_present = %d, l1_present = %d", + sh_l2_present, sh_l1_present); +#endif + + out: + if ( errors ) + BUG(); + + shadow_unlock(d); + + return errors; +} + +int _check_all_pagetables(struct vcpu *v, char *s) +{ + struct domain *d = v->domain; + int i; + struct shadow_status *a; + unsigned long gmfn; + int errors = 0; + + shadow_status_noswap = 1; + + sh_check_name = s; + SH_VVLOG("%s-PT Audit domid=%d", s, d->domain_id); + sh_l2_present = sh_l1_present = 0; + perfc_incrc(check_all_pagetables); + + for (i = 0; i < shadow_ht_buckets; i++) + { + a = &d->arch.shadow_ht[i]; + while ( a && a->gpfn_and_flags ) + { + gmfn = __gpfn_to_mfn(d, a->gpfn_and_flags & PGT_mfn_mask); + + switch ( a->gpfn_and_flags & PGT_type_mask ) + { + case PGT_l1_shadow: + errors += check_l1_table(v, a->gpfn_and_flags & PGT_mfn_mask, + gmfn, a->smfn, 0); + break; + case PGT_l2_shadow: + errors += check_l2_table(v, gmfn, a->smfn, + page_out_of_sync(pfn_to_page(gmfn))); + break; + case PGT_l3_shadow: + case PGT_l4_shadow: + case PGT_hl2_shadow: + BUG(); // XXX - ought to fix this... 
+ break; + case PGT_snapshot: + case PGT_writable_pred: + break; + default: + errors++; + printk("unexpected shadow type %lx, gpfn=%lx, " + "gmfn=%lx smfn=%lx\n", + a->gpfn_and_flags & PGT_type_mask, + a->gpfn_and_flags & PGT_mfn_mask, + gmfn, a->smfn); + BUG(); + } + a = a->next; + } + } + + shadow_status_noswap = 0; + + if ( errors ) + BUG(); + + return errors; +} + +#endif // SHADOW_DEBUG + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ _______________________________________________ Xen-changelog mailing list Xen-changelog@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-changelog
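
A note on the technique behind shadow_do_update_va_mapping() and the l1pte_propagate_from_guest() call it makes: a guest L1 entry is not copied verbatim into the shadow. Presence and write permission are granted lazily (so the first access/write faults and lets the hypervisor update accessed/dirty state), and in log-dirty mode write permission is initially withheld so the first write can be recorded. The standalone sketch below illustrates that propagation idea only; the flag values, helper names and exact policy are simplified stand-ins, not the Xen definitions or the code in this patch.

/* Hedged sketch of guest->shadow L1 PTE propagation for a demand-dirty
 * shadow.  Flag values and names are illustrative, not the Xen ones. */
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#define F_PRESENT   0x001ULL
#define F_RW        0x002ULL
#define F_ACCESSED  0x020ULL
#define F_DIRTY     0x040ULL

static uint64_t propagate_l1e(uint64_t guest_pte, bool log_dirty)
{
    uint64_t shadow;

    /* No shadow mapping until the guest entry is both present and
     * accessed: the first access faults and sets ACCESSED for us. */
    if ((guest_pte & (F_PRESENT | F_ACCESSED)) != (F_PRESENT | F_ACCESSED))
        return 0;

    shadow = guest_pte & ~F_RW;         /* start read-only */

    /* Grant write access only once the guest entry is dirty, and never
     * while log-dirty mode still wants to see (and log) the first write. */
    if ((guest_pte & (F_RW | F_DIRTY)) == (F_RW | F_DIRTY) && !log_dirty)
        shadow |= F_RW;

    return shadow;
}

int main(void)
{
    uint64_t g = F_PRESENT | F_ACCESSED | F_RW;          /* clean, writable */
    printf("shadow=%#llx\n", (unsigned long long)propagate_l1e(g, false));
    printf("shadow(log-dirty)=%#llx\n",
           (unsigned long long)propagate_l1e(g | F_DIRTY, true));
    return 0;
}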
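The audit path in check_pte() boils down to a handful of coherence invariants between a guest PTE and its shadow: a present shadow requires a present guest entry, the shadow's DIRTY and ACCESSED bits must be subsets of the guest's, the global bit never appears in a shadow, and a writable shadow requires a writable (and, at L1, dirty) guest entry. The standalone sketch below restates those invariants over a plain 64-bit PTE; the flag values and helper names are simplified stand-ins, and the snapshot, out-of-sync and writable-pagetable special cases handled by the real code are omitted.

/* Hedged sketch: the PTE-coherence invariants that check_pte() enforces,
 * restated over a plain 64-bit PTE.  Flag values and helper names are
 * simplified stand-ins, not the Xen definitions. */
#include <stdint.h>
#include <stdio.h>

#define F_PRESENT   0x001ULL
#define F_RW        0x002ULL
#define F_ACCESSED  0x020ULL
#define F_DIRTY     0x040ULL
#define F_GLOBAL    0x100ULL

/* Returns the number of violated invariants for one (guest, shadow) pair. */
static int check_pte_coherence(uint64_t guest, uint64_t shadow, int level)
{
    int errors = 0;

    if (!(shadow & F_PRESENT))
        return 0;                       /* an absent shadow is always safe */

    if (!(guest & F_PRESENT))
        errors++;                       /* "Guest not present yet shadow is" */
    if (level == 1 && (shadow & F_DIRTY) && !(guest & F_DIRTY))
        errors++;                       /* "Dirty coherence" */
    if ((shadow & F_ACCESSED) && !(guest & F_ACCESSED))
        errors++;                       /* "Accessed coherence" */
    if (shadow & F_GLOBAL)
        errors++;                       /* global bit never set in a shadow */
    if ((shadow & F_RW) && !(guest & F_RW))
        errors++;                       /* "RW coherence" */
    if (level == 1 && (shadow & F_RW) &&
        !((guest & F_RW) && (guest & F_DIRTY)))
        errors++;                       /* "RW2 coherence" */

    return errors;
}

int main(void)
{
    /* A writable shadow of a guest entry that is writable but not yet
     * dirty should trip exactly the RW2 invariant. */
    uint64_t guest  = F_PRESENT | F_ACCESSED | F_RW;
    uint64_t shadow = F_PRESENT | F_ACCESSED | F_RW;
    printf("violations: %d\n", check_pte_coherence(guest, shadow, 1));
    return 0;
}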