[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] [xen-3.0-testing] [LINUX][PAE] Improve allocation strategy when PAE pgdirs must be below 4GB.



# HG changeset patch
# User kaf24@xxxxxxxxxxxxxxxxxxxx
# Node ID 1fa7c3a50ab030c23bdb3da205fbed0f9fda82f0
# Parent  02159733ec97aaeed6edaebecdc0e3b95d2e2dea
[LINUX][PAE] Improve allocation strategy when PAE pgdirs must be below 4GB.
Moving the re-allocation to low memory into pgd_alloc() has several
advantages:
 1. Avoids race with save/restore where pgdir may end up above 4GB after
    save/restore.
 2. If pgdir cannot be re-allocated we can return failure to the caller
    rather than BUG().
 3. Slightly reduces diff against native Linux code.
Signed-off-by: Keir Fraser <keir@xxxxxxxxxxxxx>
xen-unstable changeset:   10347:8070050cc30f3d969835d7b1d6eda57959d56842
xen-unstable date:        Wed Jun 14 12:36:06 2006 +0100

[LINUX][PAE] More fixes to pgd allocation. Since allocating pmds
can sleep, we could race save/restore and end up with stale
machine addresses stored in pgd entries. Avoid this by
remembering virtual addresses and translating to machine
addresses all at the end and protected by the pgd_lock.
Signed-off-by: Keir Fraser <keir@xxxxxxxxxxxxx>
xen-unstable changeset:   10351:ee482dc60eab7ba59c39901a5e6d9e597acc2f52
xen-unstable date:        Wed Jun 14 17:06:28 2006 +0100
---
 linux-2.6-xen-sparse/arch/i386/mm/init-xen.c    |    2 
 linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c |  136 ++++++++++++++++--------
 2 files changed, 92 insertions(+), 46 deletions(-)

diff -r 02159733ec97 -r 1fa7c3a50ab0 
linux-2.6-xen-sparse/arch/i386/mm/init-xen.c
--- a/linux-2.6-xen-sparse/arch/i386/mm/init-xen.c      Thu Jun 15 11:36:57 
2006 +0100
+++ b/linux-2.6-xen-sparse/arch/i386/mm/init-xen.c      Thu Jun 15 13:14:20 
2006 +0100
@@ -764,7 +764,7 @@ void __init pgtable_cache_init(void)
 #endif
                                0,
                                pgd_ctor,
-                               pgd_dtor);
+                               PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
        if (!pgd_cache)
                panic("pgtable_cache_init(): Cannot create pgd cache");
 }
diff -r 02159733ec97 -r 1fa7c3a50ab0 
linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c
--- a/linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c   Thu Jun 15 11:36:57 
2006 +0100
+++ b/linux-2.6-xen-sparse/arch/i386/mm/pgtable-xen.c   Thu Jun 15 13:14:20 
2006 +0100
@@ -300,11 +300,6 @@ void pgd_ctor(void *pgd, kmem_cache_t *c
        unsigned long flags;
 
        if (PTRS_PER_PMD > 1) {
-               if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) {
-                       int rc = xen_create_contiguous_region(
-                               (unsigned long)pgd, 0, 32);
-                       BUG_ON(rc);
-               }
                if (HAVE_SHARED_KERNEL_PMD)
                        clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
                                        swapper_pg_dir + USER_PTRS_PER_PGD,
@@ -320,69 +315,105 @@ void pgd_ctor(void *pgd, kmem_cache_t *c
        }
 }
 
+/* never called when PTRS_PER_PMD > 1 */
 void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
 {
        unsigned long flags; /* can be called from interrupt context */
 
-       if (PTRS_PER_PMD > 1) {
-               if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
-                       xen_destroy_contiguous_region((unsigned long)pgd, 0);
-       } else {
-               spin_lock_irqsave(&pgd_lock, flags);
-               pgd_list_del(pgd);
-               spin_unlock_irqrestore(&pgd_lock, flags);
-
-               pgd_test_and_unpin(pgd);
-       }
+       spin_lock_irqsave(&pgd_lock, flags);
+       pgd_list_del(pgd);
+       spin_unlock_irqrestore(&pgd_lock, flags);
+
+       pgd_test_and_unpin(pgd);
 }
 
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
        int i;
        pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
+       pmd_t **pmd;
+       unsigned long flags;
 
        pgd_test_and_unpin(pgd);
 
        if (PTRS_PER_PMD == 1 || !pgd)
                return pgd;
 
-       for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
-               pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
-               if (!pmd)
-                       goto out_oom;
-               set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
-       }
-
-       if (!HAVE_SHARED_KERNEL_PMD) {
-               unsigned long flags;
-
-               for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
+       if (HAVE_SHARED_KERNEL_PMD) {
+               for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
                        pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
                        if (!pmd)
                                goto out_oom;
                        set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
                }
-
-               spin_lock_irqsave(&pgd_lock, flags);
-               for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
-                       unsigned long v = (unsigned long)i << PGDIR_SHIFT;
-                       pgd_t *kpgd = pgd_offset_k(v);
-                       pud_t *kpud = pud_offset(kpgd, v);
-                       pmd_t *kpmd = pmd_offset(kpud, v);
-                       pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
-                       memcpy(pmd, kpmd, PAGE_SIZE);
-                       make_lowmem_page_readonly(
-                               pmd, XENFEAT_writable_page_tables);
+               return pgd;
+       }
+
+       /*
+        * We can race save/restore (if we sleep during a GFP_KERNEL memory
+        * allocation). We therefore store virtual addresses of pmds as they
+        * do not change across save/restore, and poke the machine addresses
+        * into the pgdir under the pgd_lock.
+        */
+       pmd = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL);
+       if (!pmd) {
+               kmem_cache_free(pgd_cache, pgd);
+               return NULL;
+       }
+
+       /* Allocate pmds, remember virtual addresses. */
+       for (i = 0; i < PTRS_PER_PGD; ++i) {
+               pmd[i] = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
+               if (!pmd[i])
+                       goto out_oom;
+       }
+
+       spin_lock_irqsave(&pgd_lock, flags);
+
+       /* Protect against save/restore: move below 4GB under pgd_lock. */
+       if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) {
+               int rc = xen_create_contiguous_region(
+                       (unsigned long)pgd, 0, 32);
+               if (rc) {
+                       spin_unlock_irqrestore(&pgd_lock, flags);
+                       goto out_oom;
                }
-               pgd_list_add(pgd);
-               spin_unlock_irqrestore(&pgd_lock, flags);
-       }
+       }
+
+       /* Copy kernel pmd contents and write-protect the new pmds. */
+       for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
+               unsigned long v = (unsigned long)i << PGDIR_SHIFT;
+               pgd_t *kpgd = pgd_offset_k(v);
+               pud_t *kpud = pud_offset(kpgd, v);
+               pmd_t *kpmd = pmd_offset(kpud, v);
+               memcpy(pmd[i], kpmd, PAGE_SIZE);
+               make_lowmem_page_readonly(
+                       pmd[i], XENFEAT_writable_page_tables);
+       }
+
+       /* It is safe to poke machine addresses of pmds under the pgd_lock. */
+       for (i = 0; i < PTRS_PER_PGD; i++)
+               set_pgd(&pgd[i], __pgd(1 + __pa(pmd[i])));
+
+       /* Ensure this pgd gets picked up and pinned on save/restore. */
+       pgd_list_add(pgd);
+
+       spin_unlock_irqrestore(&pgd_lock, flags);
+
+       kfree(pmd);
 
        return pgd;
 
 out_oom:
-       for (i--; i >= 0; i--)
-               kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
+       if (HAVE_SHARED_KERNEL_PMD) {
+               for (i--; i >= 0; i--)
+                       kmem_cache_free(pmd_cache,
+                                       (void *)__va(pgd_val(pgd[i])-1));
+       } else {
+               for (i--; i >= 0; i--)
+                       kmem_cache_free(pmd_cache, pmd[i]);
+               kfree(pmd);
+       }
        kmem_cache_free(pgd_cache, pgd);
        return NULL;
 }
@@ -391,6 +422,14 @@ void pgd_free(pgd_t *pgd)
 {
        int i;
 
+       /*
+        * After this the pgd should not be pinned for the duration of this
+        * function's execution. We should never sleep and thus never race:
+        *  1. User pmds will not become write-protected under our feet due
+        *     to a concurrent mm_pin_all().
+        *  2. The machine addresses in PGD entries will not become invalid
+        *     due to a concurrent save/restore.
+        */
        pgd_test_and_unpin(pgd);
 
        /* in the PAE case user pgd entries are overwritten before usage */
@@ -399,11 +438,13 @@ void pgd_free(pgd_t *pgd)
                        pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
                        kmem_cache_free(pmd_cache, pmd);
                }
+
                if (!HAVE_SHARED_KERNEL_PMD) {
                        unsigned long flags;
                        spin_lock_irqsave(&pgd_lock, flags);
                        pgd_list_del(pgd);
                        spin_unlock_irqrestore(&pgd_lock, flags);
+
                        for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) {
                                pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
                                make_lowmem_page_writable(
@@ -411,8 +452,13 @@ void pgd_free(pgd_t *pgd)
                                memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
                                kmem_cache_free(pmd_cache, pmd);
                        }
+
+                       if (!xen_feature(XENFEAT_pae_pgdir_above_4gb))
+                               xen_destroy_contiguous_region(
+                                       (unsigned long)pgd, 0);
                }
        }
+
        /* in the non-PAE case, free_pgtables() clears user pgd entries */
        kmem_cache_free(pgd_cache, pgd);
 }
@@ -588,7 +634,7 @@ void mm_pin(struct mm_struct *mm)
 void mm_pin(struct mm_struct *mm)
 {
        if (xen_feature(XENFEAT_writable_page_tables))
-           return;
+               return;
        spin_lock(&mm->page_table_lock);
        __pgd_pin(mm->pgd);
        spin_unlock(&mm->page_table_lock);
@@ -597,7 +643,7 @@ void mm_unpin(struct mm_struct *mm)
 void mm_unpin(struct mm_struct *mm)
 {
        if (xen_feature(XENFEAT_writable_page_tables))
-           return;
+               return;
        spin_lock(&mm->page_table_lock);
        __pgd_unpin(mm->pgd);
        spin_unlock(&mm->page_table_lock);
@@ -607,7 +653,7 @@ void mm_pin_all(void)
 {
        struct page *page;
        if (xen_feature(XENFEAT_writable_page_tables))
-           return;
+               return;
        for (page = pgd_list; page; page = (struct page *)page->index) {
                if (!test_bit(PG_pinned, &page->flags))
                        __pgd_pin((pgd_t *)page_address(page));

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.