
[Xen-changelog] Fix pagetable pinning logic for xen/i386 kernels.



# HG changeset patch
# User kaf24@xxxxxxxxxxxxxxxxxxxx
# Node ID 25599e222c333565208c77c0d875bd56d6c719ef
# Parent  63aeaa2152d81f47fb76e1fd73359e52e89da331
Fix pagetable pinning logic for xen/i386 kernels. The pin
flag is now associated with the pgd rather than the mm -- this
avoids a race where a pgd is allocated from the pgd_cache but,
before it gets associated with an mm, the kernel suspends itself.
At this point the kernel mappings will not get rewritten when the
kernel is resumed, and the system will fail.

A further advantage is that the code is slightly simpler and less
invasive (no changes to mm_context for example).

Signed-off-by: Keir Fraser <keir@xxxxxxxxxxxxx>
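
In outline: "pinned" is now a property of the page backing the pgd,
not of the mm that may or may not own that pgd yet. PG_pinned aliases
the arch-private page flag PG_arch_1 (see the pgalloc.h hunk below),
and every test or update goes through virt_to_page(pgd). A minimal
sketch of the convention -- pgd_is_pinned() is a hypothetical helper;
the patch itself open-codes the test at each call site:

    #include <linux/mm.h>        /* struct page, virt_to_page() */
    #include <asm/pgalloc.h>     /* PG_pinned, aliasing PG_arch_1 */

    static inline int pgd_is_pinned(pgd_t *pgd)
    {
            /* The flag exists from the moment the pgd page is
             * allocated -- no mm association needed, so a pgd
             * still sitting in pgd_cache is covered as well. */
            return test_bit(PG_pinned, &virt_to_page(pgd)->flags);
    }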

diff -r 63aeaa2152d8 -r 25599e222c33 linux-2.6-xen-sparse/arch/xen/i386/kernel/ldt.c
--- a/linux-2.6-xen-sparse/arch/xen/i386/kernel/ldt.c   Mon Nov  7 15:37:58 2005
+++ b/linux-2.6-xen-sparse/arch/xen/i386/kernel/ldt.c   Mon Nov  7 17:14:45 2005
@@ -18,7 +18,6 @@
 #include <asm/system.h>
 #include <asm/ldt.h>
 #include <asm/desc.h>
-#include <asm/mmu_context.h>
 
 #ifdef CONFIG_SMP /* avoids "defined but not used" warnig */
 static void flush_ldt(void *null)
@@ -101,18 +100,13 @@
        struct mm_struct * old_mm;
        int retval = 0;
 
-       memset(&mm->context, 0, sizeof(mm->context));
        init_MUTEX(&mm->context.sem);
+       mm->context.size = 0;
        old_mm = current->mm;
        if (old_mm && old_mm->context.size > 0) {
                down(&old_mm->context.sem);
                retval = copy_ldt(&mm->context, &old_mm->context);
                up(&old_mm->context.sem);
-       }
-       if (retval == 0) {
-               spin_lock(&mm_unpinned_lock);
-               list_add(&mm->context.unpinned, &mm_unpinned);
-               spin_unlock(&mm_unpinned_lock);
        }
        return retval;
 }
@@ -133,11 +127,6 @@
                else
                        kfree(mm->context.ldt);
                mm->context.size = 0;
-       }
-       if (!mm->context.pinned) {
-               spin_lock(&mm_unpinned_lock);
-               list_del(&mm->context.unpinned);
-               spin_unlock(&mm_unpinned_lock);
        }
 }
 
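With the pin state gone from mm_context_t, init_new_context() has
only the LDT fields left to set up, so the blanket memset() gives way
to explicit initialisation of sem and size, and the teardown path
above loses its unpinned-list bookkeeping altogether.
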
diff -r 63aeaa2152d8 -r 25599e222c33 linux-2.6-xen-sparse/arch/xen/i386/mm/init.c
--- a/linux-2.6-xen-sparse/arch/xen/i386/mm/init.c      Mon Nov  7 15:37:58 2005
+++ b/linux-2.6-xen-sparse/arch/xen/i386/mm/init.c      Mon Nov  7 17:14:45 2005
@@ -376,7 +376,6 @@
                __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
        }
 
-       init_mm.context.pinned = 1;
        kernel_physical_mapping_init(pgd_base);
        remap_numa_kva();
 
@@ -689,6 +688,8 @@
 #ifndef CONFIG_SMP
        zap_low_mappings();
 #endif
+
+       set_bit(PG_pinned, &virt_to_page(init_mm.pgd)->flags);
 }
 
 kmem_cache_t *pgd_cache;
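
For the kernel's own pagetables the flag moves the other way: rather
than tagging init_mm.context up front in pagetable_init(), the page
behind init_mm.pgd (swapper_pg_dir) is flagged at the end of paging
init, by which point Xen is in any case already treating it as the
live, pinned pagetable root.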
diff -r 63aeaa2152d8 -r 25599e222c33 linux-2.6-xen-sparse/arch/xen/i386/mm/pgtable.c
--- a/linux-2.6-xen-sparse/arch/xen/i386/mm/pgtable.c   Mon Nov  7 15:37:58 2005
+++ b/linux-2.6-xen-sparse/arch/xen/i386/mm/pgtable.c   Mon Nov  7 17:14:45 2005
@@ -26,6 +26,9 @@
 
 #include <asm-xen/foreign_page.h>
 #include <asm/hypervisor.h>
+
+static void __pgd_pin(pgd_t *pgd);
+static void __pgd_unpin(pgd_t *pgd);
 
 void show_mem(void)
 {
@@ -299,6 +302,8 @@
 {
        unsigned long flags; /* can be called from interrupt context */
 
+       BUG_ON(test_bit(PG_pinned, &virt_to_page(pgd)->flags));
+
        if (HAVE_SHARED_KERNEL_PMD)
                return;
 
@@ -311,6 +316,8 @@
 {
        int i = 0;
        pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
+
+       BUG_ON(test_bit(PG_pinned, &virt_to_page(pgd)->flags));
 
        if (PTRS_PER_PMD == 1 || !pgd)
                return pgd;
@@ -351,15 +358,9 @@
 void pgd_free(pgd_t *pgd)
 {
        int i;
-       pte_t *ptep = virt_to_ptep(pgd);
-
-       if (!pte_write(*ptep)) {
-               xen_pgd_unpin(__pa(pgd));
-               BUG_ON(HYPERVISOR_update_va_mapping(
-                       (unsigned long)pgd,
-                       pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL),
-                       0));
-       }
+
+       if (test_bit(PG_pinned, &virt_to_page(pgd)->flags))
+               __pgd_unpin(pgd);
 
        /* in the PAE case user pgd entries are overwritten before usage */
        if (PTRS_PER_PMD > 1) {
@@ -441,10 +442,7 @@
 }
 #endif /* CONFIG_XEN_SHADOW_MODE */
 
-LIST_HEAD(mm_unpinned);
-DEFINE_SPINLOCK(mm_unpinned_lock);
-
-static inline void mm_walk_set_prot(void *pt, pgprot_t flags)
+static inline void pgd_walk_set_prot(void *pt, pgprot_t flags)
 {
        struct page *page = virt_to_page(pt);
        unsigned long pfn = page_to_pfn(page);
@@ -456,103 +454,111 @@
                pfn_pte(pfn, flags), 0));
 }
 
-static void mm_walk(struct mm_struct *mm, pgprot_t flags)
-{
-       pgd_t       *pgd;
-       pud_t       *pud;
-       pmd_t       *pmd;
-       pte_t       *pte;
-       int          g,u,m;
-
-       pgd = mm->pgd;
+static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
+{
+       pgd_t *pgd = pgd_base;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+       int    g, u, m;
+
        for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
                if (pgd_none(*pgd))
                        continue;
                pud = pud_offset(pgd, 0);
                if (PTRS_PER_PUD > 1) /* not folded */
-                       mm_walk_set_prot(pud,flags);
+                       pgd_walk_set_prot(pud,flags);
                for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
                        if (pud_none(*pud))
                                continue;
                        pmd = pmd_offset(pud, 0);
                        if (PTRS_PER_PMD > 1) /* not folded */
-                               mm_walk_set_prot(pmd,flags);
+                               pgd_walk_set_prot(pmd,flags);
                        for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
                                if (pmd_none(*pmd))
                                        continue;
                                pte = pte_offset_kernel(pmd,0);
-                               mm_walk_set_prot(pte,flags);
+                               pgd_walk_set_prot(pte,flags);
                        }
                }
        }
+
+       BUG_ON(HYPERVISOR_update_va_mapping(
+               (unsigned long)pgd_base,
+               pfn_pte(virt_to_phys(pgd_base)>>PAGE_SHIFT, flags),
+               UVMF_TLB_FLUSH));
+}
+
+static void __pgd_pin(pgd_t *pgd)
+{
+       pgd_walk(pgd, PAGE_KERNEL_RO);
+       xen_pgd_pin(__pa(pgd));
+       set_bit(PG_pinned, &virt_to_page(pgd)->flags);
+}
+
+static void __pgd_unpin(pgd_t *pgd)
+{
+       xen_pgd_unpin(__pa(pgd));
+       pgd_walk(pgd, PAGE_KERNEL);
+       clear_bit(PG_pinned, &virt_to_page(pgd)->flags);
 }
 
 void mm_pin(struct mm_struct *mm)
 {
-    spin_lock(&mm->page_table_lock);
-
-    mm_walk(mm, PAGE_KERNEL_RO);
-    BUG_ON(HYPERVISOR_update_va_mapping(
-        (unsigned long)mm->pgd,
-        pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO),
-        UVMF_TLB_FLUSH));
-    xen_pgd_pin(__pa(mm->pgd));
-    mm->context.pinned = 1;
-    spin_lock(&mm_unpinned_lock);
-    list_del(&mm->context.unpinned);
-    spin_unlock(&mm_unpinned_lock);
-
-    spin_unlock(&mm->page_table_lock);
+       spin_lock(&mm->page_table_lock);
+       __pgd_pin(mm->pgd);
+       spin_unlock(&mm->page_table_lock);
 }
 
 void mm_unpin(struct mm_struct *mm)
 {
-    spin_lock(&mm->page_table_lock);
-
-    xen_pgd_unpin(__pa(mm->pgd));
-    BUG_ON(HYPERVISOR_update_va_mapping(
-        (unsigned long)mm->pgd,
-        pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0));
-    mm_walk(mm, PAGE_KERNEL);
-    xen_tlb_flush();
-    mm->context.pinned = 0;
-    spin_lock(&mm_unpinned_lock);
-    list_add(&mm->context.unpinned, &mm_unpinned);
-    spin_unlock(&mm_unpinned_lock);
-
-    spin_unlock(&mm->page_table_lock);
+       spin_lock(&mm->page_table_lock);
+       __pgd_unpin(mm->pgd);
+       spin_unlock(&mm->page_table_lock);
 }
 
 void mm_pin_all(void)
 {
-    while (!list_empty(&mm_unpinned))  
-       mm_pin(list_entry(mm_unpinned.next, struct mm_struct,
-                         context.unpinned));
+       struct page *page;
+       for (page = pgd_list; page; page = (struct page *)page->index) {
+               if (!test_bit(PG_pinned, &page->flags))
+                       __pgd_pin((pgd_t *)page_address(page));
+       }
 }
 
 void _arch_exit_mmap(struct mm_struct *mm)
 {
-    struct task_struct *tsk = current;
-
-    task_lock(tsk);
-
-    /*
-     * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
-     * *much* faster this way, as no tlb flushes means bigger wrpt batches.
-     */
-    if ( tsk->active_mm == mm )
-    {
-        tsk->active_mm = &init_mm;
-        atomic_inc(&init_mm.mm_count);
-
-        switch_mm(mm, &init_mm, tsk);
-
-        atomic_dec(&mm->mm_count);
-        BUG_ON(atomic_read(&mm->mm_count) == 0);
-    }
-
-    task_unlock(tsk);
-
-    if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) )
-        mm_unpin(mm);
-}
+       struct task_struct *tsk = current;
+
+       task_lock(tsk);
+
+       /*
+        * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
+        * *much* faster this way, as no tlb flushes means bigger wrpt batches.
+        */
+       if (tsk->active_mm == mm) {
+               tsk->active_mm = &init_mm;
+               atomic_inc(&init_mm.mm_count);
+
+               switch_mm(mm, &init_mm, tsk);
+
+               atomic_dec(&mm->mm_count);
+               BUG_ON(atomic_read(&mm->mm_count) == 0);
+       }
+
+       task_unlock(tsk);
+
+       if (test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags) &&
+           (atomic_read(&mm->mm_count) == 1))
+               mm_unpin(mm);
+}
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
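
Two things in this file are worth calling out. First, the ordering
inside the new helpers is load-bearing: pinning must make every
pagetable page read-only (pgd_walk() plus the final remap of the root
itself) before xen_pgd_pin() hands it to the hypervisor, and
unpinning must drop Xen's reference before any mapping goes writable
again. Condensed from the hunks above, with comments added for
exposition:

    static void __pgd_pin(pgd_t *pgd)
    {
            pgd_walk(pgd, PAGE_KERNEL_RO);  /* all levels RO, root last */
            xen_pgd_pin(__pa(pgd));         /* Xen validates + pins root */
            set_bit(PG_pinned, &virt_to_page(pgd)->flags);
    }

    static void __pgd_unpin(pgd_t *pgd)
    {
            xen_pgd_unpin(__pa(pgd));       /* release Xen's reference... */
            pgd_walk(pgd, PAGE_KERNEL);     /* ...then RW is legal again  */
            clear_bit(PG_pinned, &virt_to_page(pgd)->flags);
    }

Second, mm_pin_all() no longer needs a private mm_unpinned list: it
walks the architecture's existing pgd_list (chained through
page->index), which covers every constructed pgd -- including one
allocated from pgd_cache that no mm has adopted yet, which is
precisely the pgd the old scheme lost track of across a suspend.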
diff -r 63aeaa2152d8 -r 25599e222c33 linux-2.6-xen-sparse/arch/xen/kernel/reboot.c
--- a/linux-2.6-xen-sparse/arch/xen/kernel/reboot.c     Mon Nov  7 15:37:58 2005
+++ b/linux-2.6-xen-sparse/arch/xen/kernel/reboot.c     Mon Nov  7 17:14:45 2005
@@ -129,8 +129,8 @@
        preempt_disable();
 
 #ifdef __i386__
+       kmem_cache_shrink(pgd_cache);
        mm_pin_all();
-       kmem_cache_shrink(pgd_cache);
 #endif
 
        __cli();
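
The swap with kmem_cache_shrink() follows from the new scheme:
shrinking first lets the allocator discard cached, unused pgds while
they are still unpinned and writable; whatever survives the shrink
remains reachable through pgd_list and is pinned by mm_pin_all().
Done in the old order, the shrink would presumably have had to free
pages that had just been made read-only.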
diff -r 63aeaa2152d8 -r 25599e222c33 linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu.h
--- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu.h       Mon Nov  7 15:37:58 2005
+++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu.h       Mon Nov  7 17:14:45 2005
@@ -12,12 +12,7 @@
        int size;
        struct semaphore sem;
        void *ldt;
-       unsigned pinned:1;
-       struct list_head unpinned;
 } mm_context_t;
-
-extern struct list_head mm_unpinned;
-extern spinlock_t mm_unpinned_lock;
 
 /* mm/memory.c:exit_mmap hook */
 extern void _arch_exit_mmap(struct mm_struct *mm);
diff -r 63aeaa2152d8 -r 25599e222c33 linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu_context.h
--- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu_context.h       Mon Nov  7 15:37:58 2005
+++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/mmu_context.h       Mon Nov  7 17:14:45 2005
@@ -53,7 +53,7 @@
        struct mmuext_op _op[2], *op = _op;
 
        if (likely(prev != next)) {
-               if (!next->context.pinned)
+               if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags))
                        mm_pin(next);
 
                /* stop flush ipis for the previous mm */
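
This is the lazy-pin path: a pgd leaves pgd_alloc() unpinned and
writable, and the first switch_mm() into its mm is what pins it. A
sketch of the resulting contract between allocator, switcher and
freer (names as in the hunks in this patch):

    /* pgd_alloc():  BUG_ON(pinned)  -- fresh pgds start unpinned    */
    /* switch_mm():  if (!pinned) mm_pin(next)  -- pin on first use  */
    /* pgd_free():   if (pinned) __pgd_unpin(pgd)  -- undo if needed */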
diff -r 63aeaa2152d8 -r 25599e222c33 linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgalloc.h
--- a/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgalloc.h   Mon Nov  7 15:37:58 2005
+++ b/linux-2.6-xen-sparse/include/asm-xen/asm-i386/pgalloc.h   Mon Nov  7 17:14:45 2005
@@ -7,12 +7,15 @@
 #include <linux/mm.h>          /* for struct page */
 #include <asm/io.h>            /* for phys_to_virt and page_to_pseudophys */
 
+/* Is this pagetable pinned? */
+#define PG_pinned      PG_arch_1
+
 #define pmd_populate_kernel(mm, pmd, pte) \
                set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte)))
 
 #define pmd_populate(mm, pmd, pte)                                     \
 do {                                                                   \
-       if (unlikely((mm)->context.pinned)) {                           \
+       if (test_bit(PG_pinned, &virt_to_page((mm)->pgd)->flags)) {     \
                if (!PageHighMem(pte))                                  \
                        BUG_ON(HYPERVISOR_update_va_mapping(            \
                          (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT),\

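The pmd_populate() hunk closes the same hole at pte-page granularity:
because the pinned test now keys off the pgd's page flag, a pte page
wired into an already-pinned pagetable is first remapped read-only
via the HYPERVISOR_update_va_mapping() call (truncated above), even
if the owning mm's context was never specially initialised.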