[Xen-changelog] [xen-unstable] [XEN] Make the spurious page-fault detection logic more robust
# HG changeset patch
# User kfraser@xxxxxxxxxxxxxxxxxxxxxxx
# Node ID e1ae7b3cb5b73f11bed3a51a7f4ded85c30cffd8
# Parent  05ab081f3c67cc4a4b3139090914ad9be5a0a100
[XEN] Make the spurious page-fault detection logic more robust. In
particular it must be able to handle spurious write faults on mappings
that have been changed from read-only to writable. If a CPU has a stale
read-only entry in its TLB, it is allowed to fault on the next write
access without re-walking the page table.

Signed-off-by: Keir Fraser <keir@xxxxxxxxxxxxx>
---
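The core of the new check, for readers skimming the patch below: from the #PF
error code we derive the PTE flags that every level of the page-table walk must
carry (required) and must not carry (disallowed). The fault is spurious only if
the current page tables already permit the access that faulted, which is exactly
what happens when a stale read-only TLB entry forces a fault on a mapping that
has since become writable. The following standalone C sketch restates that
derivation. The PGERR_* names mirror the constants this patch adds to
processor.h; the _PAGE_* values are the architectural low-order PTE bits, except
_PAGE_NX, which is a token bit here (the real NX bit sits in the top half of a
64-bit PTE); the per-level test is written as a full-mask comparison to make the
intent explicit. The main() harness is purely illustrative and not part of the
patch.

    #include <stdio.h>

    /* #PF error-code bits, as named by this patch in processor.h. */
    #define PGERR_page_present (1U<<0)
    #define PGERR_write_access (1U<<1)
    #define PGERR_user_mode    (1U<<2)
    #define PGERR_reserved_bit (1U<<3)
    #define PGERR_instr_fetch  (1U<<4)

    /* Low-order architectural PTE flag bits. */
    #define _PAGE_PRESENT 0x001U
    #define _PAGE_RW      0x002U
    #define _PAGE_USER    0x004U
    #define _PAGE_NX      0x800U /* token value for this sketch only */

    /*
     * Derive, from the error code, the flags a page-table entry must
     * carry (required) and must not carry (disallowed) for the faulting
     * access to have been legal, i.e. for the fault to be spurious.
     */
    static void fault_flags(unsigned int error_code,
                            unsigned int *required,
                            unsigned int *disallowed)
    {
        *required = _PAGE_PRESENT;
        if ( error_code & PGERR_write_access )
            *required |= _PAGE_RW;    /* write fault: need writable PTE */
        if ( error_code & PGERR_user_mode )
            *required |= _PAGE_USER;  /* user fault: need user PTE      */

        *disallowed = 0;
        if ( error_code & PGERR_instr_fetch )
            *disallowed |= _PAGE_NX;  /* ifetch: NX must be clear       */
    }

    /*
     * Per-level test, written here as a full-mask comparison to make
     * the intent explicit. (The patch writes the per-level walk test
     * as "!(flags & required_flags) || (flags & disallowed_flags)".)
     */
    static int level_ok(unsigned int pte_flags,
                        unsigned int required, unsigned int disallowed)
    {
        return ((pte_flags & required) == required) &&
               !(pte_flags & disallowed);
    }

    int main(void)
    {
        unsigned int req, dis;

        /* Kernel write fault on a present mapping: error code 0x3. */
        fault_flags(PGERR_page_present | PGERR_write_access, &req, &dis);

        /* Present and writable PTE: the write was legal, so the fault
         * can only come from a stale read-only TLB entry. Spurious;
         * prints 1. */
        printf("%d\n", level_ok(_PAGE_PRESENT | _PAGE_RW, req, dis));

        /* Present but read-only PTE: genuine protection fault;
         * prints 0. */
        printf("%d\n", level_ok(_PAGE_PRESENT, req, dis));

        return 0;
    }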
 xen/arch/x86/traps.c            |  210 ++++++++++++++++++++++++++++------------
 xen/arch/x86/x86_32/traps.c     |   34 ------
 xen/arch/x86/x86_64/traps.c     |   34 ------
 xen/include/asm-x86/processor.h |    8 +
 4 files changed, 155 insertions(+), 131 deletions(-)

diff -r 05ab081f3c67 -r e1ae7b3cb5b7 xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c      Fri Jun 16 18:08:27 2006 +0100
+++ b/xen/arch/x86/traps.c      Fri Jun 16 18:18:55 2006 +0100
@@ -511,9 +511,9 @@ void propagate_page_fault(unsigned long
     v->vcpu_info->arch.cr2 = addr;
 
     /* Re-set error_code.user flag appropriately for the guest. */
-    error_code &= ~4;
+    error_code &= ~PGERR_user_mode;
     if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
-        error_code |= 4;
+        error_code |= PGERR_user_mode;
 
     ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
     tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
@@ -578,54 +578,91 @@ static int handle_gdt_ldt_mapping_fault(
     (((va) >= HYPERVISOR_VIRT_START))
 #endif
 
-static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
+static int __spurious_page_fault(
+    unsigned long addr, struct cpu_user_regs *regs)
+{
+    unsigned long mfn = read_cr3() >> PAGE_SHIFT;
+#if CONFIG_PAGING_LEVELS >= 4
+    l4_pgentry_t l4e, *l4t;
+#endif
+#if CONFIG_PAGING_LEVELS >= 3
+    l3_pgentry_t l3e, *l3t;
+#endif
+    l2_pgentry_t l2e, *l2t;
+    l1_pgentry_t l1e, *l1t;
+    unsigned int required_flags, disallowed_flags;
+
+    required_flags  = _PAGE_PRESENT;
+    if ( regs->error_code & PGERR_write_access )
+        required_flags |= _PAGE_RW;
+    if ( regs->error_code & PGERR_user_mode )
+        required_flags |= _PAGE_USER;
+
+    disallowed_flags = 0;
+    if ( regs->error_code & PGERR_instr_fetch )
+        disallowed_flags |= _PAGE_NX;
+
+#if CONFIG_PAGING_LEVELS >= 4
+    l4t = map_domain_page(mfn);
+    l4e = l4t[l4_table_offset(addr)];
+    mfn = l4e_get_pfn(l4e);
+    unmap_domain_page(l4t);
+    if ( !(l4e_get_flags(l4e) & required_flags) ||
+         (l4e_get_flags(l4e) & disallowed_flags) )
+        return 0;
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 3
+    l3t = map_domain_page(mfn);
+    l3e = l3t[l3_table_offset(addr)];
+    mfn = l3e_get_pfn(l3e);
+    unmap_domain_page(l3t);
+#ifdef CONFIG_X86_PAE
+    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
+        return 0;
+#else
+    if ( !(l3e_get_flags(l3e) & required_flags) ||
+         (l3e_get_flags(l3e) & disallowed_flags) )
+        return 0;
+#endif
+#endif
+
+    l2t = map_domain_page(mfn);
+    l2e = l2t[l2_table_offset(addr)];
+    mfn = l2e_get_pfn(l2e);
+    unmap_domain_page(l2t);
+    if ( !(l2e_get_flags(l2e) & required_flags) ||
+         (l2e_get_flags(l2e) & disallowed_flags) )
+        return 0;
+    if ( l2e_get_flags(l2e) & _PAGE_PSE )
+        return 1;
+
+    l1t = map_domain_page(mfn);
+    l1e = l1t[l1_table_offset(addr)];
+    mfn = l1e_get_pfn(l1e);
+    unmap_domain_page(l1t);
+    if ( !(l1e_get_flags(l1e) & required_flags) ||
+         (l1e_get_flags(l1e) & disallowed_flags) )
+        return 0;
+    return 1;
+}
+
+static int spurious_page_fault(
+    unsigned long addr, struct cpu_user_regs *regs)
 {
     struct vcpu   *v = current;
     struct domain *d = v->domain;
-
-    if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
-    {
-        if ( shadow_mode_external(d) && guest_mode(regs) )
-            return shadow_fault(addr, regs);
-        if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
-            return handle_gdt_ldt_mapping_fault(
-                addr - GDT_LDT_VIRT_START, regs);
-    }
-    else if ( unlikely(shadow_mode_enabled(d)) )
-    {
-        return shadow_fault(addr, regs);
-    }
-    else if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
-    {
-        LOCK_BIGLOCK(d);
-        if ( unlikely(d->arch.ptwr[PTWR_PT_ACTIVE].l1va) &&
-             unlikely(l2_linear_offset(addr) ==
-                      d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx) )
-        {
-            ptwr_flush(d, PTWR_PT_ACTIVE);
-            UNLOCK_BIGLOCK(d);
-            return EXCRET_fault_fixed;
-        }
-
-        if ( guest_kernel_mode(v, regs) &&
-             /* Protection violation on write? No reserved-bit violation? */
-             ((regs->error_code & 0xb) == 0x3) &&
-             ptwr_do_page_fault(d, addr, regs) )
-        {
-            UNLOCK_BIGLOCK(d);
-            return EXCRET_fault_fixed;
-        }
-        UNLOCK_BIGLOCK(d);
-    }
-
-    return 0;
-}
-
-static int spurious_page_fault(unsigned long addr, struct cpu_user_regs *regs)
-{
-    struct vcpu   *v = current;
-    struct domain *d = v->domain;
-    int            rc;
+    int            is_spurious;
+
+    /* Reserved bit violations are never spurious faults. */
+    if ( regs->error_code & PGERR_reserved_bit )
+        return 0;
+
+    LOCK_BIGLOCK(d);
+
+    is_spurious = __spurious_page_fault(addr, regs);
+    if ( is_spurious )
+        goto out;
 
     /*
      * The only possible reason for a spurious page fault not to be picked
@@ -635,10 +672,8 @@ static int spurious_page_fault(unsigned
     if ( is_idle_domain(d) ||               /* no ptwr in idle domain       */
         IN_HYPERVISOR_RANGE(addr) ||        /* no ptwr on hypervisor addrs  */
         shadow_mode_enabled(d) ||           /* no ptwr logic in shadow mode */
-         ((regs->error_code & 0x1d) != 0) ) /* simple not-present fault?    */
-        return 0;
-
-    LOCK_BIGLOCK(d);
+         (regs->error_code & PGERR_page_present) ) /* not-present fault?    */
+        goto out;
 
     /*
      * The page directory could have been detached again while we weren't
@@ -649,16 +684,67 @@ static int spurious_page_fault(unsigned
              d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx) )
     {
         ptwr_flush(d, PTWR_PT_ACTIVE);
-        rc = 1;
-    }
-    else
-    {
-        /* Okay, walk the page tables. Only check for not-present faults. */
-        rc = __spurious_page_fault(addr);
-    }
-
+        is_spurious = 1;
+    }
+
+ out:
     UNLOCK_BIGLOCK(d);
-    return rc;
+    return is_spurious;
+}
+
+static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
+{
+    struct vcpu   *v = current;
+    struct domain *d = v->domain;
+    int            rc;
+
+    if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
+    {
+        if ( shadow_mode_external(d) && guest_mode(regs) )
+            return shadow_fault(addr, regs);
+        if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
+            return handle_gdt_ldt_mapping_fault(
+                addr - GDT_LDT_VIRT_START, regs);
+        /*
+         * Do not propagate spurious faults in the hypervisor area to the
+         * guest. It cannot fix them up.
+         */
+        LOCK_BIGLOCK(d);
+        rc = __spurious_page_fault(addr, regs);
+        UNLOCK_BIGLOCK(d);
+        return rc;
+    }
+
+    if ( unlikely(shadow_mode_enabled(d)) )
+        return shadow_fault(addr, regs);
+
+    if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
+    {
+        LOCK_BIGLOCK(d);
+        if ( unlikely(d->arch.ptwr[PTWR_PT_ACTIVE].l1va) &&
+             unlikely(l2_linear_offset(addr) ==
+                      d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx) )
+        {
+            ptwr_flush(d, PTWR_PT_ACTIVE);
+            UNLOCK_BIGLOCK(d);
+            return EXCRET_fault_fixed;
+        }
+
+        if ( guest_kernel_mode(v, regs) &&
+             /* Protection violation on write? No reserved-bit violation? */
+             ((regs->error_code & (PGERR_page_present |
+                                   PGERR_write_access |
+                                   PGERR_reserved_bit)) ==
+              (PGERR_page_present | PGERR_write_access)) &&
+             ptwr_do_page_fault(d, addr, regs) )
+        {
+            UNLOCK_BIGLOCK(d);
+            return EXCRET_fault_fixed;
+        }
+        UNLOCK_BIGLOCK(d);
+    }
+
+    return 0;
 }
 
 /*
@@ -784,8 +870,8 @@ static inline int admin_io_okay(
     (admin_io_okay(_p, 4, _d, _r) ? outl(_v, _p) : ((void)0))
 
 /* Propagate a fault back to the guest kernel. */
-#define USER_READ_FAULT  4 /* user mode, read fault */
-#define USER_WRITE_FAULT 6 /* user mode, write fault */
+#define USER_READ_FAULT  (PGERR_user_mode)
+#define USER_WRITE_FAULT (PGERR_user_mode | PGERR_write_access)
 #define PAGE_FAULT(_faultaddr, _errcode)        \
 ({  propagate_page_fault(_faultaddr, _errcode); \
     return EXCRET_fault_fixed;                  \
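For readers checking the conversion above against the old code: the magic
numbers that traps.c previously used decode as follows under the new PGERR_*
names (bit 0 = present, bit 1 = write, bit 2 = user, bit 3 = reserved bit,
bit 4 = instruction fetch):

    0x3  = PGERR_page_present | PGERR_write_access
    0xb  = PGERR_page_present | PGERR_write_access | PGERR_reserved_bit
    0x1d = PGERR_page_present | PGERR_user_mode | PGERR_reserved_bit |
           PGERR_instr_fetch
    4, 6 = USER_READ_FAULT, USER_WRITE_FAULT (user read; user + write)

In particular, the old ptwr guard "((regs->error_code & 0xb) == 0x3)" is
bit-for-bit the new "((regs->error_code & (PGERR_page_present |
PGERR_write_access | PGERR_reserved_bit)) == (PGERR_page_present |
PGERR_write_access))": a protection violation on write with the reserved
bit clear.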
diff -r 05ab081f3c67 -r e1ae7b3cb5b7 xen/arch/x86/x86_32/traps.c
--- a/xen/arch/x86/x86_32/traps.c       Fri Jun 16 18:08:27 2006 +0100
+++ b/xen/arch/x86/x86_32/traps.c       Fri Jun 16 18:18:55 2006 +0100
@@ -113,40 +113,6 @@ void show_page_walk(unsigned long addr)
     unmap_domain_page(l1t);
 }
 
-int __spurious_page_fault(unsigned long addr)
-{
-    unsigned long mfn = read_cr3() >> PAGE_SHIFT;
-#ifdef CONFIG_X86_PAE
-    l3_pgentry_t l3e, *l3t;
-#endif
-    l2_pgentry_t l2e, *l2t;
-    l1_pgentry_t l1e, *l1t;
-
-#ifdef CONFIG_X86_PAE
-    l3t = map_domain_page(mfn);
-    l3e = l3t[l3_table_offset(addr)];
-    mfn = l3e_get_pfn(l3e);
-    unmap_domain_page(l3t);
-    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
-        return 0;
-#endif
-
-    l2t = map_domain_page(mfn);
-    l2e = l2t[l2_table_offset(addr)];
-    mfn = l2e_get_pfn(l2e);
-    unmap_domain_page(l2t);
-    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
-        return 0;
-    if ( l2e_get_flags(l2e) & _PAGE_PSE )
-        return 1;
-
-    l1t = map_domain_page(mfn);
-    l1e = l1t[l1_table_offset(addr)];
-    mfn = l1e_get_pfn(l1e);
-    unmap_domain_page(l1t);
-    return !!(l1e_get_flags(l1e) & _PAGE_PRESENT);
-}
-
 #define DOUBLEFAULT_STACK_SIZE 1024
 static struct tss_struct doublefault_tss;
 static unsigned char doublefault_stack[DOUBLEFAULT_STACK_SIZE];

diff -r 05ab081f3c67 -r e1ae7b3cb5b7 xen/arch/x86/x86_64/traps.c
--- a/xen/arch/x86/x86_64/traps.c       Fri Jun 16 18:08:27 2006 +0100
+++ b/xen/arch/x86/x86_64/traps.c       Fri Jun 16 18:18:55 2006 +0100
@@ -115,40 +115,6 @@ void show_page_walk(unsigned long addr)
     printk(" L1 = %"PRIpte" %016lx\n", l1e_get_intpte(l1e), pfn);
 }
 
-int __spurious_page_fault(unsigned long addr)
-{
-    unsigned long mfn = read_cr3() >> PAGE_SHIFT;
-    l4_pgentry_t l4e, *l4t;
-    l3_pgentry_t l3e, *l3t;
-    l2_pgentry_t l2e, *l2t;
-    l1_pgentry_t l1e, *l1t;
-
-    l4t = mfn_to_virt(mfn);
-    l4e = l4t[l4_table_offset(addr)];
-    mfn = l4e_get_pfn(l4e);
-    if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
-        return 0;
-
-    l3t = mfn_to_virt(mfn);
-    l3e = l3t[l3_table_offset(addr)];
-    mfn = l3e_get_pfn(l3e);
-    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
-        return 0;
-
-    l2t = mfn_to_virt(mfn);
-    l2e = l2t[l2_table_offset(addr)];
-    mfn = l2e_get_pfn(l2e);
-    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
-        return 0;
-    if ( l2e_get_flags(l2e) & _PAGE_PSE )
-        return 1;
-
-    l1t = mfn_to_virt(mfn);
-    l1e = l1t[l1_table_offset(addr)];
-    mfn = l1e_get_pfn(l1e);
-    return !!(l1e_get_flags(l1e) & _PAGE_PRESENT);
-}
-
 asmlinkage void double_fault(void);
 asmlinkage void do_double_fault(struct cpu_user_regs *regs)
 {
diff -r 05ab081f3c67 -r e1ae7b3cb5b7 xen/include/asm-x86/processor.h
--- a/xen/include/asm-x86/processor.h   Fri Jun 16 18:08:27 2006 +0100
+++ b/xen/include/asm-x86/processor.h   Fri Jun 16 18:18:55 2006 +0100
@@ -128,6 +128,13 @@
 /* 'arch_vcpu' flags values */
 #define _TF_kernel_mode        0
 #define TF_kernel_mode         (1<<_TF_kernel_mode)
+
+/* #PF error code values. */
+#define PGERR_page_present  (1U<<0)
+#define PGERR_write_access  (1U<<1)
+#define PGERR_user_mode     (1U<<2)
+#define PGERR_reserved_bit  (1U<<3)
+#define PGERR_instr_fetch   (1U<<4)
 
 #ifndef __ASSEMBLY__
 
@@ -524,7 +531,6 @@ void show_stack(struct cpu_user_regs *re
 void show_stack(struct cpu_user_regs *regs);
 void show_registers(struct cpu_user_regs *regs);
 void show_page_walk(unsigned long addr);
-int __spurious_page_fault(unsigned long addr);
 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs);
 
 extern void mtrr_ap_init(void);

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog