
[Xen-changelog] [xen-unstable] [XEN] Make the spurious page-fault detection logic



# HG changeset patch
# User kfraser@xxxxxxxxxxxxxxxxxxxxxxx
# Node ID e1ae7b3cb5b73f11bed3a51a7f4ded85c30cffd8
# Parent  05ab081f3c67cc4a4b3139090914ad9be5a0a100
[XEN] Make the spurious page-fault detection logic
more robust. In particular, it must be able to handle
spurious write faults on mappings that have been
changed from read-only to writable. If a CPU has a stale
read-only entry in its TLB, it is allowed to fault on
the next write access without re-walking the page table.
Signed-off-by: Keir Fraser <keir@xxxxxxxxxxxxx>
---
 xen/arch/x86/traps.c            |  210 ++++++++++++++++++++++++++++------------
 xen/arch/x86/x86_32/traps.c     |   34 ------
 xen/arch/x86/x86_64/traps.c     |   34 ------
 xen/include/asm-x86/processor.h |    8 +
 4 files changed, 155 insertions(+), 131 deletions(-)
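
The scenario the new walker handles: on x86 a CPU holding a stale
read-only TLB entry may raise a write fault without re-walking the
page tables, so the only reliable test is to walk the live tables and
check whether they would in fact permit the faulting access. Below is
a minimal sketch (not part of the patch) of how the fault's error code
maps to the per-level masks, mirroring the required_flags/
disallowed_flags setup in the new __spurious_page_fault(); PGERR_* is
defined in the processor.h hunk at the end, and _PAGE_* are Xen's
existing PTE flag names.

/* Sketch only: derive the per-level check masks from a #PF error
 * code, as the new __spurious_page_fault() does. */
static void pf_check_masks(unsigned int error_code,
                           unsigned int *required,
                           unsigned int *disallowed)
{
    *required = _PAGE_PRESENT;            /* entry must be mapped       */
    if ( error_code & PGERR_write_access )
        *required |= _PAGE_RW;            /* a write needs a writable PTE */
    if ( error_code & PGERR_user_mode )
        *required |= _PAGE_USER;          /* user access needs _PAGE_USER */

    *disallowed = 0;
    if ( error_code & PGERR_instr_fetch )
        *disallowed |= _PAGE_NX;          /* a fetch forbids no-execute */
}

If every level of the live walk satisfies these masks, the access that
faulted would succeed if retried, so the fault can only have come from
a stale TLB entry and may safely be dismissed.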

diff -r 05ab081f3c67 -r e1ae7b3cb5b7 xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c      Fri Jun 16 18:08:27 2006 +0100
+++ b/xen/arch/x86/traps.c      Fri Jun 16 18:18:55 2006 +0100
@@ -511,9 +511,9 @@ void propagate_page_fault(unsigned long 
     v->vcpu_info->arch.cr2           = addr;
 
     /* Re-set error_code.user flag appropriately for the guest. */
-    error_code &= ~4;
+    error_code &= ~PGERR_user_mode;
     if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
-        error_code |= 4;
+        error_code |= PGERR_user_mode;
 
     ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
     tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
@@ -578,54 +578,91 @@ static int handle_gdt_ldt_mapping_fault(
     (((va) >= HYPERVISOR_VIRT_START))
 #endif
 
-static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
+static int __spurious_page_fault(
+    unsigned long addr, struct cpu_user_regs *regs)
+{
+    unsigned long mfn = read_cr3() >> PAGE_SHIFT;
+#if CONFIG_PAGING_LEVELS >= 4
+    l4_pgentry_t l4e, *l4t;
+#endif
+#if CONFIG_PAGING_LEVELS >= 3
+    l3_pgentry_t l3e, *l3t;
+#endif
+    l2_pgentry_t l2e, *l2t;
+    l1_pgentry_t l1e, *l1t;
+    unsigned int required_flags, disallowed_flags;
+
+    required_flags  = _PAGE_PRESENT;
+    if ( regs->error_code & PGERR_write_access )
+        required_flags |= _PAGE_RW;
+    if ( regs->error_code & PGERR_user_mode )
+        required_flags |= _PAGE_USER;
+
+    disallowed_flags = 0;
+    if ( regs->error_code & PGERR_instr_fetch )
+        disallowed_flags |= _PAGE_NX;
+
+#if CONFIG_PAGING_LEVELS >= 4
+    l4t = map_domain_page(mfn);
+    l4e = l4t[l4_table_offset(addr)];
+    mfn = l4e_get_pfn(l4e);
+    unmap_domain_page(l4t);
+    if ( !(l4e_get_flags(l4e) & required_flags) ||
+         (l4e_get_flags(l4e) & disallowed_flags) )
+        return 0;
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 3
+    l3t = map_domain_page(mfn);
+    l3e = l3t[l3_table_offset(addr)];
+    mfn = l3e_get_pfn(l3e);
+    unmap_domain_page(l3t);
+#ifdef CONFIG_X86_PAE
+    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
+        return 0;
+#else
+    if ( !(l3e_get_flags(l3e) & required_flags) ||
+         (l3e_get_flags(l3e) & disallowed_flags) )
+        return 0;
+#endif
+#endif
+
+    l2t = map_domain_page(mfn);
+    l2e = l2t[l2_table_offset(addr)];
+    mfn = l2e_get_pfn(l2e);
+    unmap_domain_page(l2t);
+    if ( !(l2e_get_flags(l2e) & required_flags) ||
+         (l2e_get_flags(l2e) & disallowed_flags) )
+        return 0;
+    if ( l2e_get_flags(l2e) & _PAGE_PSE )
+        return 1;
+
+    l1t = map_domain_page(mfn);
+    l1e = l1t[l1_table_offset(addr)];
+    mfn = l1e_get_pfn(l1e);
+    unmap_domain_page(l1t);
+    if ( !(l1e_get_flags(l1e) & required_flags) ||
+         (l1e_get_flags(l1e) & disallowed_flags) )
+        return 0;
+    return 1;
+}
+
+static int spurious_page_fault(
+    unsigned long addr, struct cpu_user_regs *regs)
 {
     struct vcpu   *v = current;
     struct domain *d = v->domain;
-
-    if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
-    {
-        if ( shadow_mode_external(d) && guest_mode(regs) )
-            return shadow_fault(addr, regs);
-        if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
-            return handle_gdt_ldt_mapping_fault(
-                addr - GDT_LDT_VIRT_START, regs);
-    }
-    else if ( unlikely(shadow_mode_enabled(d)) )
-    {
-        return shadow_fault(addr, regs);
-    }
-    else if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
-    {
-        LOCK_BIGLOCK(d);
-        if ( unlikely(d->arch.ptwr[PTWR_PT_ACTIVE].l1va) &&
-             unlikely(l2_linear_offset(addr) ==
-                      d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx) )
-        {
-            ptwr_flush(d, PTWR_PT_ACTIVE);
-            UNLOCK_BIGLOCK(d);
-            return EXCRET_fault_fixed;
-        }
-
-        if ( guest_kernel_mode(v, regs) &&
-             /* Protection violation on write? No reserved-bit violation? */
-             ((regs->error_code & 0xb) == 0x3) &&
-             ptwr_do_page_fault(d, addr, regs) )
-        {
-            UNLOCK_BIGLOCK(d);
-            return EXCRET_fault_fixed;
-        }
-        UNLOCK_BIGLOCK(d);
-    }
-
-    return 0;
-}
-
-static int spurious_page_fault(unsigned long addr, struct cpu_user_regs *regs)
-{
-    struct vcpu   *v = current;
-    struct domain *d = v->domain;
-    int            rc;
+    int            is_spurious;
+
+    /* Reserved bit violations are never spurious faults. */
+    if ( regs->error_code & PGERR_reserved_bit )
+        return 0;
+
+    LOCK_BIGLOCK(d);
+
+    is_spurious = __spurious_page_fault(addr, regs);
+    if ( is_spurious )
+        goto out;
 
     /*
      * The only possible reason for a spurious page fault not to be picked
@@ -635,10 +672,8 @@ static int spurious_page_fault(unsigned 
     if ( is_idle_domain(d) ||               /* no ptwr in idle domain       */
          IN_HYPERVISOR_RANGE(addr) ||       /* no ptwr on hypervisor addrs  */
          shadow_mode_enabled(d) ||          /* no ptwr logic in shadow mode */
-         ((regs->error_code & 0x1d) != 0) ) /* simple not-present fault?    */
-        return 0;
-
-    LOCK_BIGLOCK(d);
+         (regs->error_code & PGERR_page_present) ) /* not-present fault?    */
+        goto out;
 
     /*
      * The page directory could have been detached again while we weren't
@@ -649,16 +684,67 @@ static int spurious_page_fault(unsigned 
                   d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx) )
     {
         ptwr_flush(d, PTWR_PT_ACTIVE);
-        rc = 1;
-    }
-    else
-    {
-        /* Okay, walk the page tables. Only check for not-present faults.*/
-        rc = __spurious_page_fault(addr);
-    }
-
+        is_spurious = 1;
+    }
+
+ out:
     UNLOCK_BIGLOCK(d);
-    return rc;
+    return is_spurious;
+}
+
+static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
+{
+    struct vcpu   *v = current;
+    struct domain *d = v->domain;
+    int            rc;
+
+    if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
+    {
+        if ( shadow_mode_external(d) && guest_mode(regs) )
+            return shadow_fault(addr, regs);
+        if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
+            return handle_gdt_ldt_mapping_fault(
+                addr - GDT_LDT_VIRT_START, regs);
+        /*
+         * Do not propagate spurious faults in the hypervisor area to the
+         * guest. It cannot fix them up.
+         */
+        LOCK_BIGLOCK(d);
+        rc = __spurious_page_fault(addr, regs);
+        UNLOCK_BIGLOCK(d);
+        return rc;
+    }
+
+    if ( unlikely(shadow_mode_enabled(d)) )
+        return shadow_fault(addr, regs);
+
+    if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
+    {
+        LOCK_BIGLOCK(d);
+        if ( unlikely(d->arch.ptwr[PTWR_PT_ACTIVE].l1va) &&
+             unlikely(l2_linear_offset(addr) ==
+                      d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx) )
+        {
+            ptwr_flush(d, PTWR_PT_ACTIVE);
+            UNLOCK_BIGLOCK(d);
+            return EXCRET_fault_fixed;
+        }
+
+        if ( guest_kernel_mode(v, regs) &&
+             /* Protection violation on write? No reserved-bit violation? */
+             ((regs->error_code & (PGERR_page_present |
+                                   PGERR_write_access |
+                                   PGERR_reserved_bit)) ==
+              (PGERR_page_present | PGERR_write_access)) &&
+             ptwr_do_page_fault(d, addr, regs) )
+        {
+            UNLOCK_BIGLOCK(d);
+            return EXCRET_fault_fixed;
+        }
+        UNLOCK_BIGLOCK(d);
+    }
+
+    return 0;
 }
 
 /*
@@ -784,8 +870,8 @@ static inline int admin_io_okay(
     (admin_io_okay(_p, 4, _d, _r) ? outl(_v, _p) : ((void)0))
 
 /* Propagate a fault back to the guest kernel. */
-#define USER_READ_FAULT  4 /* user mode, read fault */
-#define USER_WRITE_FAULT 6 /* user mode, write fault */
+#define USER_READ_FAULT  (PGERR_user_mode)
+#define USER_WRITE_FAULT (PGERR_user_mode | PGERR_write_access)
 #define PAGE_FAULT(_faultaddr, _errcode)        \
 ({  propagate_page_fault(_faultaddr, _errcode); \
     return EXCRET_fault_fixed;                  \
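
The traps.c hunks above replace magic error-code constants (4, 0x3/0xb,
0x1d) with the named PGERR_* bits defined in the processor.h hunk
below. As an illustration, a hypothetical debugging helper, not part of
the patch, that spells out what those bits encode:

/* Hypothetical helper: decode a #PF error code using the PGERR_*
 * names this changeset introduces. */
static void show_pf_error_code(unsigned int ec)
{
    printk("#PF error code %03x: %s %s in %s mode%s%s\n", ec,
           (ec & PGERR_page_present) ? "protection violation"
                                     : "not-present fault",
           (ec & PGERR_write_access) ? "on write" : "on read",
           (ec & PGERR_user_mode)    ? "user" : "supervisor",
           (ec & PGERR_reserved_bit) ? ", reserved bit set" : "",
           (ec & PGERR_instr_fetch)  ? ", instruction fetch" : "");
}

Read this way, the old ptwr test ((regs->error_code & 0xb) == 0x3)
is "present, write, no reserved-bit violation", which is exactly the
expanded PGERR_* expression the patch substitutes for it.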
diff -r 05ab081f3c67 -r e1ae7b3cb5b7 xen/arch/x86/x86_32/traps.c
--- a/xen/arch/x86/x86_32/traps.c       Fri Jun 16 18:08:27 2006 +0100
+++ b/xen/arch/x86/x86_32/traps.c       Fri Jun 16 18:18:55 2006 +0100
@@ -113,40 +113,6 @@ void show_page_walk(unsigned long addr)
     unmap_domain_page(l1t);
 }
 
-int __spurious_page_fault(unsigned long addr)
-{
-    unsigned long mfn = read_cr3() >> PAGE_SHIFT;
-#ifdef CONFIG_X86_PAE
-    l3_pgentry_t l3e, *l3t;
-#endif
-    l2_pgentry_t l2e, *l2t;
-    l1_pgentry_t l1e, *l1t;
-
-#ifdef CONFIG_X86_PAE
-    l3t = map_domain_page(mfn);
-    l3e = l3t[l3_table_offset(addr)];
-    mfn = l3e_get_pfn(l3e);
-    unmap_domain_page(l3t);
-    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
-        return 0;
-#endif
-
-    l2t = map_domain_page(mfn);
-    l2e = l2t[l2_table_offset(addr)];
-    mfn = l2e_get_pfn(l2e);
-    unmap_domain_page(l2t);
-    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
-        return 0;
-    if ( l2e_get_flags(l2e) & _PAGE_PSE )
-        return 1;
-
-    l1t = map_domain_page(mfn);
-    l1e = l1t[l1_table_offset(addr)];
-    mfn = l1e_get_pfn(l1e);
-    unmap_domain_page(l1t);
-    return !!(l1e_get_flags(l1e) & _PAGE_PRESENT);
-}
-
 #define DOUBLEFAULT_STACK_SIZE 1024
 static struct tss_struct doublefault_tss;
 static unsigned char doublefault_stack[DOUBLEFAULT_STACK_SIZE];
diff -r 05ab081f3c67 -r e1ae7b3cb5b7 xen/arch/x86/x86_64/traps.c
--- a/xen/arch/x86/x86_64/traps.c       Fri Jun 16 18:08:27 2006 +0100
+++ b/xen/arch/x86/x86_64/traps.c       Fri Jun 16 18:18:55 2006 +0100
@@ -115,40 +115,6 @@ void show_page_walk(unsigned long addr)
     printk("    L1 = %"PRIpte" %016lx\n", l1e_get_intpte(l1e), pfn);
 }
 
-int __spurious_page_fault(unsigned long addr)
-{
-    unsigned long mfn = read_cr3() >> PAGE_SHIFT;
-    l4_pgentry_t l4e, *l4t;
-    l3_pgentry_t l3e, *l3t;
-    l2_pgentry_t l2e, *l2t;
-    l1_pgentry_t l1e, *l1t;
-
-    l4t = mfn_to_virt(mfn);
-    l4e = l4t[l4_table_offset(addr)];
-    mfn = l4e_get_pfn(l4e);
-    if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
-        return 0;
-
-    l3t = mfn_to_virt(mfn);
-    l3e = l3t[l3_table_offset(addr)];
-    mfn = l3e_get_pfn(l3e);
-    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
-        return 0;
-
-    l2t = mfn_to_virt(mfn);
-    l2e = l2t[l2_table_offset(addr)];
-    mfn = l2e_get_pfn(l2e);
-    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
-        return 0;
-    if ( l2e_get_flags(l2e) & _PAGE_PSE )
-        return 1;
-
-    l1t = mfn_to_virt(mfn);
-    l1e = l1t[l1_table_offset(addr)];
-    mfn = l1e_get_pfn(l1e);
-    return !!(l1e_get_flags(l1e) & _PAGE_PRESENT);
-}
-
 asmlinkage void double_fault(void);
 asmlinkage void do_double_fault(struct cpu_user_regs *regs)
 {
diff -r 05ab081f3c67 -r e1ae7b3cb5b7 xen/include/asm-x86/processor.h
--- a/xen/include/asm-x86/processor.h   Fri Jun 16 18:08:27 2006 +0100
+++ b/xen/include/asm-x86/processor.h   Fri Jun 16 18:18:55 2006 +0100
@@ -128,6 +128,13 @@
 /* 'arch_vcpu' flags values */
 #define _TF_kernel_mode        0
 #define TF_kernel_mode         (1<<_TF_kernel_mode)
+
+/* #PF error code values. */
+#define PGERR_page_present   (1U<<0)
+#define PGERR_write_access   (1U<<1)
+#define PGERR_user_mode      (1U<<2)
+#define PGERR_reserved_bit   (1U<<3)
+#define PGERR_instr_fetch    (1U<<4)
 
 #ifndef __ASSEMBLY__
 
@@ -524,7 +531,6 @@ void show_stack(struct cpu_user_regs *re
 void show_stack(struct cpu_user_regs *regs);
 void show_registers(struct cpu_user_regs *regs);
 void show_page_walk(unsigned long addr);
-int __spurious_page_fault(unsigned long addr);
 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs);
 
 extern void mtrr_ap_init(void);
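
Finally, a self-contained sketch of the stale-TLB scenario from the
commit message, compilable outside Xen. The constant values are
stand-ins matching the x86 PTE and error-code bit layout, and the
per-level test here uses a strict all-required-bits comparison:

#include <stdio.h>

/* Stand-in flag values; they match the x86 page-table bit layout. */
#define _PAGE_PRESENT 0x001
#define _PAGE_RW      0x002

#define PGERR_page_present (1U<<0)
#define PGERR_write_access (1U<<1)

/* One level of the walk: the fault is only spurious if the live entry
 * grants every right the faulting access needed, and none it must
 * not have. */
static int level_ok(unsigned int flags, unsigned int required,
                    unsigned int disallowed)
{
    return ((flags & required) == required) && !(flags & disallowed);
}

int main(void)
{
    /* A kernel-mode write faulted: present + write error code. */
    unsigned int ec = PGERR_page_present | PGERR_write_access;
    unsigned int required = _PAGE_PRESENT;
    unsigned int disallowed = 0;

    if ( ec & PGERR_write_access )
        required |= _PAGE_RW;

    /* The live L1 entry was meanwhile upgraded from read-only to
     * read-write; the faulting CPU just held a stale TLB entry. */
    unsigned int live_l1e = _PAGE_PRESENT | _PAGE_RW;

    printf("fault is %s\n",
           level_ok(live_l1e, required, disallowed)
               ? "spurious: dismiss and retry" : "genuine");
    return 0;
}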
