
[Xen-changelog] [xen-unstable] [LINUX] Add spurious page-fault detection, intended primarily



# HG changeset patch
# User kfraser@xxxxxxxxxxxxxxxxxxxxxxx
# Node ID 533bad7c0883189e26c2a7f43011801c417b01fe
# Parent  e1ae7b3cb5b73f11bed3a51a7f4ded85c30cffd8
[LINUX] Add spurious page-fault detection, intended primarily
for spurious write faults on mappings that have been
changed from read-only to writable. If a CPU has a stale
read-only entry in its TLB, it is allowed to fault on
the next write access without re-walking the page table.
Signed-off-by: Keir Fraser <keir@xxxxxxxxxxxxx>
---
 linux-2.6-xen-sparse/arch/i386/mm/fault-xen.c   |   51 ++++++++++++++++++++++
 linux-2.6-xen-sparse/arch/x86_64/mm/fault-xen.c |   54 ++++++++++++++++++++++--
 2 files changed, 101 insertions(+), 4 deletions(-)
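
For readers unfamiliar with the x86 page-fault error code, note that the i386
hunk below tests it with raw masks (0x0c, 0x02, 0x10) while the x86_64 hunk
uses the PF_* names. A minimal standalone sketch of the same screening test is
given here purely for illustration; the constant values follow the
architectural error-code bit layout, and may_be_spurious() is an invented
helper for this sketch, not code from the patch:

    /* Illustrative only: x86 page-fault error-code bits as used by the
     * spurious_fault() checks in the patch below.  The PF_* values follow
     * the architectural layout; may_be_spurious() is a made-up helper. */
    #include <stdio.h>

    #define PF_PROT   0x01   /* page was present (protection fault)  */
    #define PF_WRITE  0x02   /* access was a write                   */
    #define PF_USER   0x04   /* access came from user mode           */
    #define PF_RSVD   0x08   /* reserved bit set in a paging entry   */
    #define PF_INSTR  0x10   /* access was an instruction fetch      */

    /* Only a kernel-mode access with no reserved-bit violation can be
     * spurious; 0x0c in the i386 hunk is PF_RSVD|PF_USER. */
    static int may_be_spurious(unsigned long error_code)
    {
            return !(error_code & (PF_RSVD | PF_USER));
    }

    int main(void)
    {
            printf("%d\n", may_be_spurious(PF_PROT | PF_WRITE));  /* 1 */
            printf("%d\n", may_be_spurious(PF_USER | PF_WRITE));  /* 0 */
            return 0;
    }

Note the parentheses around (PF_RSVD | PF_USER): in C, & binds more tightly
than |, so without them the expression is always nonzero and no fault would
ever be treated as spurious.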

diff -r e1ae7b3cb5b7 -r 533bad7c0883 linux-2.6-xen-sparse/arch/i386/mm/fault-xen.c
--- a/linux-2.6-xen-sparse/arch/i386/mm/fault-xen.c     Fri Jun 16 18:18:55 2006 +0100
+++ b/linux-2.6-xen-sparse/arch/i386/mm/fault-xen.c     Fri Jun 16 18:19:40 2006 +0100
@@ -273,6 +273,49 @@ static void dump_fault_path(unsigned lon
 }
 #endif
 
+static int spurious_fault(struct pt_regs *regs,
+                         unsigned long address,
+                         unsigned long error_code)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+#ifdef CONFIG_XEN
+       /* Faults in hypervisor area are never spurious. */
+       if (address >= HYPERVISOR_VIRT_START)
+               return 0;
+#endif
+
+       /* Reserved-bit violation or user access to kernel space? */
+       if (error_code & 0x0c)
+               return 0;
+
+       pgd = init_mm.pgd + pgd_index(address);
+       if (!pgd_present(*pgd))
+               return 0;
+
+       pud = pud_offset(pgd, address);
+       if (!pud_present(*pud))
+               return 0;
+
+       pmd = pmd_offset(pud, address);
+       if (!pmd_present(*pmd))
+               return 0;
+
+       pte = pte_offset_kernel(pmd, address);
+       if (!pte_present(*pte))
+               return 0;
+       if ((error_code & 0x02) && !pte_write(*pte))
+               return 0;
+#ifdef CONFIG_X86_PAE
+       if ((error_code & 0x10) && (pte_val(*pte) & _PAGE_NX))
+               return 0;
+#endif
+
+       return 1;
+}
 
 /*
  * This routine handles page faults.  It determines the address,
@@ -327,8 +370,16 @@ fastcall void __kprobes do_page_fault(st
         * protection error (error_code & 1) == 0.
         */
        if (unlikely(address >= TASK_SIZE)) { 
+#ifdef CONFIG_XEN
+               /* Faults in hypervisor area can never be patched up. */
+               if (address >= HYPERVISOR_VIRT_START)
+                       goto bad_area_nosemaphore;
+#endif
                if (!(error_code & 5))
                        goto vmalloc_fault;
+               /* Can take a spurious fault if mapping changes R/O -> R/W. */
+               if (spurious_fault(regs, address, error_code))
+                       return;
                /* 
                 * Don't take the mm semaphore here. If we fixup a prefetch
                 * fault we could otherwise deadlock.
diff -r e1ae7b3cb5b7 -r 533bad7c0883 linux-2.6-xen-sparse/arch/x86_64/mm/fault-xen.c
--- a/linux-2.6-xen-sparse/arch/x86_64/mm/fault-xen.c   Fri Jun 16 18:18:55 2006 +0100
+++ b/linux-2.6-xen-sparse/arch/x86_64/mm/fault-xen.c   Fri Jun 16 18:19:40 2006 +0100
@@ -307,6 +307,49 @@ int exception_trace = 1;
 #define MEM_LOG(_f, _a...) ((void)0)
 #endif
 
+static int spurious_fault(struct pt_regs *regs,
+                         unsigned long address,
+                         unsigned long error_code)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+#ifdef CONFIG_XEN
+       /* Faults in hypervisor area are never spurious. */
+       if ((address >= HYPERVISOR_VIRT_START) &&
+           (address < HYPERVISOR_VIRT_END))
+               return 0;
+#endif
+
+       /* Reserved-bit violation or user access to kernel space? */
+       if (error_code & (PF_RSVD|PF_USER))
+               return 0;
+
+       pgd = init_mm.pgd + pgd_index(address);
+       if (!pgd_present(*pgd))
+               return 0;
+
+       pud = pud_offset(pgd, address);
+       if (!pud_present(*pud))
+               return 0;
+
+       pmd = pmd_offset(pud, address);
+       if (!pmd_present(*pmd))
+               return 0;
+
+       pte = pte_offset_kernel(pmd, address);
+       if (!pte_present(*pte))
+               return 0;
+       if ((error_code & PF_WRITE) && !pte_write(*pte))
+               return 0;
+       if ((error_code & PF_INSTR) && (pte_val(*pte) & _PAGE_NX))
+               return 0;
+
+       return 1;
+}
+
 /*
  * This routine handles page faults.  It determines the address,
  * and the problem, and then passes it off to one of the appropriate
@@ -361,16 +404,19 @@ asmlinkage void __kprobes do_page_fault(
         */
        if (unlikely(address >= TASK_SIZE64)) {
                /*
-                * Must check for the entire kernel range here: with writable
-                * page tables the hypervisor may temporarily clear PMD
-                * entries.
+                * Don't check for the module range here: its PML4
+                * is always initialized because it's shared with the main
+                * kernel text. Only vmalloc may need PML4 syncups.
                 */
                if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
-                   address >= PAGE_OFFSET) {
+                     ((address >= VMALLOC_START && address < VMALLOC_END))) {
                        if (vmalloc_fault(address) < 0)
                                goto bad_area_nosemaphore;
                        return;
                }
+               /* Can take a spurious fault if mapping changes R/O -> R/W. */
+               if (spurious_fault(regs, address, error_code))
+                       return;
                /*
                 * Don't take the mm semaphore here. If we fixup a prefetch
                 * fault we could otherwise deadlock.
