
[Xen-devel] [PATCH v6 8/9] x86/mm: move PV code to pv/mm.c



The following code is moved:

1. PV MMU hypercall handlers
2. PV memory management code such as:
   2.1 {get,put}_page_from_l{2,3,4}e
   2.2 pv_{alloc,free}_page_type
3. All helper functions for the above

The l1e functions can't be moved because they are needed by shadow
code as well.

Pure code movement.

Signed-off-by: Wei Liu <wei.liu2@xxxxxxxxxx>
---
 xen/arch/x86/mm.c    | 2620 ++------------------------------------------------
 xen/arch/x86/pv/mm.c | 2452 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 2540 insertions(+), 2532 deletions(-)
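
One pattern that recurs throughout the moved hypercall code is worth
flagging for reviewers: preemptible page-table operations report
-ERESTART, and the handlers convert that into a hypercall continuation.
A minimal sketch, adapted from the do_mmuext_op() hunk further down
(fragment only, not new code):

    /* Preempted work left in old_guest_table is finished first; if that
     * itself gets preempted, re-enter the hypercall where we left off. */
    rc = put_old_guest_table(curr);
    if ( rc == -ERESTART )
        rc = hypercall_create_continuation(
                 __HYPERVISOR_mmuext_op, "hihi", uops, count, pdone,
                 foreigndom);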

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index e004350e83..0b5fd199a4 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -539,218 +539,6 @@ const char __section(".bss.page_aligned.const") __aligned(PAGE_SIZE)
     zero_page[PAGE_SIZE];
 
 
-static int alloc_segdesc_page(struct page_info *page)
-{
-    const struct domain *owner = page_get_owner(page);
-    struct desc_struct *descs = __map_domain_page(page);
-    unsigned i;
-
-    for ( i = 0; i < 512; i++ )
-        if ( unlikely(!check_descriptor(owner, &descs[i])) )
-            break;
-
-    unmap_domain_page(descs);
-
-    return i == 512 ? 0 : -EINVAL;
-}
-
-static int get_page_and_type_from_mfn(
-    mfn_t mfn, unsigned long type, struct domain *d,
-    int partial, int preemptible)
-{
-    struct page_info *page = mfn_to_page(mfn);
-    int rc;
-
-    if ( likely(partial >= 0) &&
-         unlikely(!get_page_from_mfn(mfn, d)) )
-        return -EINVAL;
-
-    rc = (preemptible ?
-          get_page_type_preemptible(page, type) :
-          (get_page_type(page, type) ? 0 : -EINVAL));
-
-    if ( unlikely(rc) && partial >= 0 &&
-         (!preemptible || page != current->arch.old_guest_table) )
-        put_page(page);
-
-    return rc;
-}
-
-static void put_data_page(
-    struct page_info *page, int writeable)
-{
-    if ( writeable )
-        put_page_and_type(page);
-    else
-        put_page(page);
-}
-
-#ifdef CONFIG_PV_LINEAR_PT
-
-static bool inc_linear_entries(struct page_info *pg)
-{
-    typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc;
-
-    do {
-        /*
-         * The check below checks for the "linear use" count being non-zero
-         * as well as overflow.  Signed integer overflow is undefined behavior
-         * according to the C spec.  However, as long as linear_pt_count is
-         * smaller in size than 'int', the arithmetic operation of the
-         * increment below won't overflow; rather the result will be truncated
-         * when stored.  Ensure that this is always true.
-         */
-        BUILD_BUG_ON(sizeof(nc) >= sizeof(int));
-        oc = nc++;
-        if ( nc <= 0 )
-            return false;
-        nc = cmpxchg(&pg->linear_pt_count, oc, nc);
-    } while ( oc != nc );
-
-    return true;
-}
-
-static void dec_linear_entries(struct page_info *pg)
-{
-    typeof(pg->linear_pt_count) oc;
-
-    oc = arch_fetch_and_add(&pg->linear_pt_count, -1);
-    ASSERT(oc > 0);
-}
-
-static bool inc_linear_uses(struct page_info *pg)
-{
-    typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc;
-
-    do {
-        /* See the respective comment in inc_linear_entries(). */
-        BUILD_BUG_ON(sizeof(nc) >= sizeof(int));
-        oc = nc--;
-        if ( nc >= 0 )
-            return false;
-        nc = cmpxchg(&pg->linear_pt_count, oc, nc);
-    } while ( oc != nc );
-
-    return true;
-}
-
-static void dec_linear_uses(struct page_info *pg)
-{
-    typeof(pg->linear_pt_count) oc;
-
-    oc = arch_fetch_and_add(&pg->linear_pt_count, 1);
-    ASSERT(oc < 0);
-}
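-
-/*
- * Aside for reviewers (illustration, not part of the move): the four
- * helpers above use the sign of linear_pt_count to keep the two roles
- * mutually exclusive - a positive count means the page table contains
- * linear entries, a negative one means it is itself in use as a linear
- * mapping, and each inc_* helper fails when it sees the opposite sign.
- * A minimal user-space sketch of the overflow-safe increment, with a
- * plain 'short' and GCC/Clang __atomic builtins standing in for
- * linear_pt_count and Xen's read_atomic()/cmpxchg():
- *
- *  #include <stdbool.h>
- *  #include <stdio.h>
- *
- *  static short counter;              // stands in for linear_pt_count
- *
- *  _Static_assert(sizeof(counter) < sizeof(int),
- *                 "increment must promote to a wider type");
- *
- *  static bool inc_entries(void)
- *  {
- *      short nc = __atomic_load_n(&counter, __ATOMIC_RELAXED), oc;
- *
- *      for ( ; ; )
- *      {
- *          oc = nc++;         // arithmetic done in 'int': no signed UB
- *          if ( nc <= 0 )     // in use as linear pt, or count wrapped
- *              return false;
- *          if ( __atomic_compare_exchange_n(&counter, &oc, nc, false,
- *                                           __ATOMIC_SEQ_CST,
- *                                           __ATOMIC_SEQ_CST) )
- *              return true;
- *          nc = oc;           // lost a race: oc holds current value
- *      }
- *  }
- *
- *  int main(void)
- *  {
- *      printf("%d -> %d\n", inc_entries(), counter);   // 1 -> 1
- *      return 0;
- *  }
- */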
-
-/*
- * We allow root tables to map each other (a.k.a. linear page tables). It
- * needs some special care with reference counts and access permissions:
- *  1. The mapping entry must be read-only, or the guest may get write access
- *     to its own PTEs.
- *  2. We must only bump the reference counts for an *already validated*
- *     L2 table, or we can end up in a deadlock in get_page_type() by waiting
- *     on a validation that is required to complete that validation.
- *  3. We only need to increment the reference counts for the mapped page
- *     frame if it is mapped by a different root table. This is sufficient and
- *     also necessary to allow validation of a root table mapping itself.
- */
-static bool __read_mostly opt_pv_linear_pt = true;
-boolean_param("pv-linear-pt", opt_pv_linear_pt);
-
-#define define_get_linear_pagetable(level)                                  \
-static int                                                                  \
-get_##level##_linear_pagetable(                                             \
-    level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d)         \
-{                                                                           \
-    unsigned long x, y;                                                     \
-    unsigned long pfn;                                                      \
-                                                                            \
-    if ( !opt_pv_linear_pt )                                                \
-    {                                                                       \
-        gdprintk(XENLOG_WARNING,                                            \
-                 "Attempt to create linear p.t. (feature disabled)\n");     \
-        return 0;                                                           \
-    }                                                                       \
-                                                                            \
-    if ( (level##e_get_flags(pde) & _PAGE_RW) )                             \
-    {                                                                       \
-        gdprintk(XENLOG_WARNING,                                            \
-                 "Attempt to create linear p.t. with write perms\n");       \
-        return 0;                                                           \
-    }                                                                       \
-                                                                            \
-    if ( (pfn = level##e_get_pfn(pde)) != pde_pfn )                         \
-    {                                                                       \
-        struct page_info *page, *ptpg = mfn_to_page(_mfn(pde_pfn));         \
-                                                                            \
-        /* Make sure the page table belongs to the correct domain. */       \
-        if ( unlikely(page_get_owner(ptpg) != d) )                          \
-            return 0;                                                       \
-                                                                            \
-        /* Make sure the mapped frame belongs to the correct domain. */     \
-        page = get_page_from_mfn(_mfn(pfn), d);                             \
-        if ( unlikely(!page) )                                              \
-            return 0;                                                       \
-                                                                            \
-        /*                                                                  \
-         * Ensure that the mapped frame is an already-validated page table  \
-         * and does not itself have linear entries, as well as that the     \
-         * containing page table is not itself in use as a linear page      \
-         * table elsewhere.                                                 \
-         * If so, atomically increment the count (checking for overflow).   \
-         */                                                                 \
-        if ( !inc_linear_entries(ptpg) )                                    \
-        {                                                                   \
-            put_page(page);                                                 \
-            return 0;                                                       \
-        }                                                                   \
-        if ( !inc_linear_uses(page) )                                       \
-        {                                                                   \
-            dec_linear_entries(ptpg);                                       \
-            put_page(page);                                                 \
-            return 0;                                                       \
-        }                                                                   \
-        y = page->u.inuse.type_info;                                        \
-        do {                                                                \
-            x = y;                                                          \
-            if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||        \
-                 unlikely((x & (PGT_type_mask|PGT_validated)) !=            \
-                          (PGT_##level##_page_table|PGT_validated)) )       \
-            {                                                               \
-                dec_linear_uses(page);                                      \
-                dec_linear_entries(ptpg);                                   \
-                put_page(page);                                             \
-                return 0;                                                   \
-            }                                                               \
-        }                                                                   \
-        while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );   \
-    }                                                                       \
-                                                                            \
-    return 1;                                                               \
-}
-
-#else /* CONFIG_PV_LINEAR_PT */
-
-#define define_get_linear_pagetable(level)                              \
-static int                                                              \
-get_##level##_linear_pagetable(                                         \
-        level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
-{                                                                       \
-        return 0;                                                       \
-}
-
-static void dec_linear_uses(struct page_info *pg)
-{
-    ASSERT(pg->linear_pt_count == 0);
-}
-
-static void dec_linear_entries(struct page_info *pg)
-{
-    ASSERT(pg->linear_pt_count == 0);
-}
-
-#endif /* CONFIG_PV_LINEAR_PT */
-
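
(Aside: each define_get_linear_pagetable(lN) invocation in the moved
code pastes tokens into one concrete validator; for l2 the generated
signature is the following, shown for illustration only:)

    static int
    get_l2_linear_pagetable(
        l2_pgentry_t pde, unsigned long pde_pfn, struct domain *d);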
 bool is_iomem_page(mfn_t mfn)
 {
     struct page_info *page;
@@ -1039,104 +827,6 @@ get_page_from_l1e(
     return -EBUSY;
 }
 
-
-/* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
-/*
- * get_page_from_l2e returns:
- *   1 => page not present
- *   0 => success
- *  <0 => error code
- */
-define_get_linear_pagetable(l2);
-static int
-get_page_from_l2e(
-    l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
-{
-    unsigned long mfn = l2e_get_pfn(l2e);
-    int rc;
-
-    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
-        return 1;
-
-    if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
-    {
-        gdprintk(XENLOG_WARNING, "Bad L2 flags %x\n",
-                 l2e_get_flags(l2e) & L2_DISALLOW_MASK);
-        return -EINVAL;
-    }
-
-    rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d, 0, 0);
-    if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
-        rc = 0;
-
-    return rc;
-}
-
-
-/*
- * get_page_from_l3e returns:
- *   1 => page not present
- *   0 => success
- *  <0 => error code
- */
-define_get_linear_pagetable(l3);
-static int
-get_page_from_l3e(
-    l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial)
-{
-    int rc;
-
-    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
-        return 1;
-
-    if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
-    {
-        gdprintk(XENLOG_WARNING, "Bad L3 flags %x\n",
-                 l3e_get_flags(l3e) & l3_disallow_mask(d));
-        return -EINVAL;
-    }
-
-    rc = get_page_and_type_from_mfn(
-        l3e_get_mfn(l3e), PGT_l2_page_table, d, partial, 1);
-    if ( unlikely(rc == -EINVAL) &&
-         !is_pv_32bit_domain(d) &&
-         get_l3_linear_pagetable(l3e, pfn, d) )
-        rc = 0;
-
-    return rc;
-}
-
-/*
- * get_page_from_l4e returns:
- *   1 => page not present
- *   0 => success
- *  <0 => error code
- */
-define_get_linear_pagetable(l4);
-static int
-get_page_from_l4e(
-    l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial)
-{
-    int rc;
-
-    if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
-        return 1;
-
-    if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
-    {
-        gdprintk(XENLOG_WARNING, "Bad L4 flags %x\n",
-                 l4e_get_flags(l4e) & L4_DISALLOW_MASK);
-        return -EINVAL;
-    }
-
-    rc = get_page_and_type_from_mfn(
-        l4e_get_mfn(l4e), PGT_l3_page_table, d, partial, 1);
-    if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
-        rc = 0;
-
-    return rc;
-}
-
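
(Aside: the 1 / 0 / <0 convention documented above is what lets the
allocation loops skip non-present slots cheaply. A hedged sketch in the
style of alloc_l2_table() below - validate_all() is a placeholder name,
not code from this patch:)

    static int validate_all(l2_pgentry_t *pl2e, unsigned long pfn,
                            struct domain *d)
    {
        unsigned int i;
        int rc;

        for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
        {
            rc = get_page_from_l2e(pl2e[i], pfn, d);
            if ( rc > 0 )      /* not present: no reference taken */
                continue;
            if ( rc < 0 )      /* error: caller unwinds slots 0..i-1 */
                return rc;
            /* rc == 0: reference(s) acquired, slot validated */
        }

        return 0;
    }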
 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner)
 {
     unsigned long     pfn = l1e_get_pfn(l1e);
@@ -1199,306 +889,6 @@ void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner)
     }
 }
 
-
-/*
- * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
- * Note also that this automatically deals correctly with linear p.t.'s.
- */
-static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
-{
-    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_pfn(l2e) == pfn) )
-        return 1;
-
-    if ( l2e_get_flags(l2e) & _PAGE_PSE )
-    {
-        struct page_info *page = l2e_get_page(l2e);
-        unsigned int i;
-
-        for ( i = 0; i < (1u << PAGETABLE_ORDER); i++, page++ )
-            put_page_and_type(page);
-    }
-    else
-    {
-        struct page_info *pg = l2e_get_page(l2e);
-        int rc = put_page_type_ptpg(pg, mfn_to_page(_mfn(pfn)));
-
-        ASSERT(!rc);
-        put_page(pg);
-    }
-
-    return 0;
-}
-
-static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
-                             int partial, bool defer)
-{
-    struct page_info *pg;
-    int rc;
-
-    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) )
-        return 1;
-
-    if ( unlikely(l3e_get_flags(l3e) & _PAGE_PSE) )
-    {
-        unsigned long mfn = l3e_get_pfn(l3e);
-        int writeable = l3e_get_flags(l3e) & _PAGE_RW;
-
-        ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)));
-        do {
-            put_data_page(mfn_to_page(_mfn(mfn)), writeable);
-        } while ( ++mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1) );
-
-        return 0;
-    }
-
-    pg = l3e_get_page(l3e);
-
-    if ( unlikely(partial > 0) )
-    {
-        ASSERT(!defer);
-        return put_page_type_ptpg_preemptible(pg, mfn_to_page(_mfn(pfn)));
-    }
-
-    if ( defer )
-    {
-        current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
-        current->arch.old_guest_table = pg;
-        return 0;
-    }
-
-    rc = put_page_type_ptpg_preemptible(pg, mfn_to_page(_mfn(pfn)));
-    if ( likely(!rc) )
-        put_page(pg);
-
-    return rc;
-}
-
-static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
-                             int partial, bool defer)
-{
-    int rc = 1;
-
-    if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
-         (l4e_get_pfn(l4e) != pfn) )
-    {
-        struct page_info *pg = l4e_get_page(l4e);
-
-        if ( unlikely(partial > 0) )
-        {
-            ASSERT(!defer);
-            return put_page_type_ptpg_preemptible(pg, mfn_to_page(_mfn(pfn)));
-        }
-
-        if ( defer )
-        {
-            current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
-            current->arch.old_guest_table = pg;
-            return 0;
-        }
-
-        rc = put_page_type_ptpg_preemptible(pg, mfn_to_page(_mfn(pfn)));
-        if ( likely(!rc) )
-            put_page(pg);
-    }
-
-    return rc;
-}
-
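
(Aside: note that the defer paths above drop nothing immediately. A
sketch of the intended flow, assuming the put_old_guest_table() usage
visible in the do_mmuext_op() hunk below:)

    /*
     * put_page_from_l3e(e, pfn, 0, true)
     *     -> current->arch.old_guest_ptpg  = mfn_to_page(_mfn(pfn));
     *        current->arch.old_guest_table = pg;   (no refs dropped yet)
     *
     * ... later, e.g. on (re-)entry to do_mmuext_op():
     * put_old_guest_table(curr)
     *     -> performs the preemptible type-count drop, and may itself
     *        return -ERESTART to continue the hypercall.
     */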
-static int alloc_l1_table(struct page_info *page)
-{
-    struct domain *d = page_get_owner(page);
-    l1_pgentry_t  *pl1e;
-    unsigned int   i;
-    int            ret = 0;
-
-    pl1e = __map_domain_page(page);
-
-    for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
-    {
-        switch ( ret = get_page_from_l1e(pl1e[i], d, d, l1_disallow_mask(d)) )
-        {
-        default:
-            goto fail;
-        case 0:
-            break;
-        case _PAGE_RW ... _PAGE_RW | PAGE_CACHE_ATTRS:
-            ASSERT(!(ret & ~(_PAGE_RW | PAGE_CACHE_ATTRS)));
-            l1e_flip_flags(pl1e[i], ret);
-            break;
-        }
-
-        pl1e[i] = adjust_guest_l1e(pl1e[i], d);
-    }
-
-    unmap_domain_page(pl1e);
-    return 0;
-
- fail:
-    gdprintk(XENLOG_WARNING, "Failure in alloc_l1_table: slot %#x\n", i);
-    while ( i-- > 0 )
-        put_page_from_l1e(pl1e[i], d);
-
-    unmap_domain_page(pl1e);
-    return ret;
-}
-
-static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
-{
-    struct page_info *page;
-    l3_pgentry_t     l3e3;
-
-    if ( !is_pv_32bit_domain(d) )
-        return 1;
-
-    pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
-
-    /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
-    l3e3 = pl3e[3];
-    if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
-    {
-        gdprintk(XENLOG_WARNING, "PAE L3 3rd slot is empty\n");
-        return 0;
-    }
-
-    /*
-     * The Xen-private mappings include linear mappings. The L2 thus cannot
-     * be shared by multiple L3 tables. The test here is adequate because:
-     *  1. Cannot appear in slots != 3 because get_page_type() checks the
-     *     PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
-     *  2. Cannot appear in another page table's L3:
-     *     a. alloc_l3_table() calls this function and this check will fail
-     *     b. mod_l3_entry() disallows updates to slot 3 in an existing table
-     */
-    page = l3e_get_page(l3e3);
-    BUG_ON(page->u.inuse.type_info & PGT_pinned);
-    BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
-    BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
-    if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
-    {
-        gdprintk(XENLOG_WARNING, "PAE L3 3rd slot is shared\n");
-        return 0;
-    }
-
-    return 1;
-}
-
-static int alloc_l2_table(struct page_info *page, unsigned long type,
-                          int preemptible)
-{
-    struct domain *d = page_get_owner(page);
-    unsigned long  pfn = mfn_x(page_to_mfn(page));
-    l2_pgentry_t  *pl2e;
-    unsigned int   i;
-    int            rc = 0;
-
-    pl2e = map_domain_page(_mfn(pfn));
-
-    for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
-    {
-        if ( preemptible && i > page->nr_validated_ptes
-             && hypercall_preempt_check() )
-        {
-            page->nr_validated_ptes = i;
-            rc = -ERESTART;
-            break;
-        }
-
-        if ( !is_guest_l2_slot(d, type, i) ||
-             (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
-            continue;
-
-        if ( rc < 0 )
-        {
-            gdprintk(XENLOG_WARNING, "Failure in alloc_l2_table: slot %#x\n", 
i);
-            while ( i-- > 0 )
-                if ( is_guest_l2_slot(d, type, i) )
-                    put_page_from_l2e(pl2e[i], pfn);
-            break;
-        }
-
-        pl2e[i] = adjust_guest_l2e(pl2e[i], d);
-    }
-
-    if ( rc >= 0 && (type & PGT_pae_xen_l2) )
-        init_xen_pae_l2_slots(pl2e, d);
-
-    unmap_domain_page(pl2e);
-    return rc > 0 ? 0 : rc;
-}
-
-static int alloc_l3_table(struct page_info *page)
-{
-    struct domain *d = page_get_owner(page);
-    unsigned long  pfn = mfn_x(page_to_mfn(page));
-    l3_pgentry_t  *pl3e;
-    unsigned int   i;
-    int            rc = 0, partial = page->partial_pte;
-
-    pl3e = map_domain_page(_mfn(pfn));
-
-    /*
-     * PAE guests allocate full pages, but aren't required to initialize
-     * more than the first four entries; when running in compatibility
-     * mode, however, the full page is visible to the MMU, and hence all
-     * 512 entries must be valid/verified, which is most easily achieved
-     * by clearing them out.
-     */
-    if ( is_pv_32bit_domain(d) )
-        memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
-
-    for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
-          i++, partial = 0 )
-    {
-        if ( is_pv_32bit_domain(d) && (i == 3) )
-        {
-            if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
-                 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) )
-                rc = -EINVAL;
-            else
-                rc = get_page_and_type_from_mfn(
-                    l3e_get_mfn(pl3e[i]),
-                    PGT_l2_page_table | PGT_pae_xen_l2, d, partial, 1);
-        }
-        else if ( (rc = get_page_from_l3e(pl3e[i], pfn, d, partial)) > 0 )
-            continue;
-
-        if ( rc == -ERESTART )
-        {
-            page->nr_validated_ptes = i;
-            page->partial_pte = partial ?: 1;
-        }
-        else if ( rc == -EINTR && i )
-        {
-            page->nr_validated_ptes = i;
-            page->partial_pte = 0;
-            rc = -ERESTART;
-        }
-        if ( rc < 0 )
-            break;
-
-        pl3e[i] = adjust_guest_l3e(pl3e[i], d);
-    }
-
-    if ( rc >= 0 && !create_pae_xen_mappings(d, pl3e) )
-        rc = -EINVAL;
-    if ( rc < 0 && rc != -ERESTART && rc != -EINTR )
-    {
-        gdprintk(XENLOG_WARNING, "Failure in alloc_l3_table: slot %#x\n", i);
-        if ( i )
-        {
-            page->nr_validated_ptes = i;
-            page->partial_pte = 0;
-            current->arch.old_guest_ptpg = NULL;
-            current->arch.old_guest_table = page;
-        }
-        while ( i-- > 0 )
-            pl3e[i] = unadjust_guest_l3e(pl3e[i], d);
-    }
-
-    unmap_domain_page(pl3e);
-    return rc > 0 ? 0 : rc;
-}
-
 void init_xen_pae_l2_slots(l2_pgentry_t *l2t, const struct domain *d)
 {
     memcpy(&l2t[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
@@ -1613,186 +1003,6 @@ void zap_ro_mpt(mfn_t mfn)
     unmap_domain_page(l4tab);
 }
 
-static int alloc_l4_table(struct page_info *page)
-{
-    struct domain *d = page_get_owner(page);
-    unsigned long  pfn = mfn_x(page_to_mfn(page));
-    l4_pgentry_t  *pl4e = map_domain_page(_mfn(pfn));
-    unsigned int   i;
-    int            rc = 0, partial = page->partial_pte;
-
-    for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
-          i++, partial = 0 )
-    {
-        if ( !is_guest_l4_slot(d, i) ||
-             (rc = get_page_from_l4e(pl4e[i], pfn, d, partial)) > 0 )
-            continue;
-
-        if ( rc == -ERESTART )
-        {
-            page->nr_validated_ptes = i;
-            page->partial_pte = partial ?: 1;
-        }
-        else if ( rc < 0 )
-        {
-            if ( rc != -EINTR )
-                gdprintk(XENLOG_WARNING,
-                         "Failure in alloc_l4_table: slot %#x\n", i);
-            if ( i )
-            {
-                page->nr_validated_ptes = i;
-                page->partial_pte = 0;
-                if ( rc == -EINTR )
-                    rc = -ERESTART;
-                else
-                {
-                    if ( current->arch.old_guest_table )
-                        page->nr_validated_ptes++;
-                    current->arch.old_guest_ptpg = NULL;
-                    current->arch.old_guest_table = page;
-                }
-            }
-        }
-        if ( rc < 0 )
-        {
-            unmap_domain_page(pl4e);
-            return rc;
-        }
-
-        pl4e[i] = adjust_guest_l4e(pl4e[i], d);
-    }
-
-    if ( rc >= 0 )
-    {
-        init_xen_l4_slots(pl4e, _mfn(pfn),
-                          d, INVALID_MFN, VM_ASSIST(d, m2p_strict));
-        atomic_inc(&d->arch.pv_domain.nr_l4_pages);
-        rc = 0;
-    }
-    unmap_domain_page(pl4e);
-
-    return rc;
-}
-
-static void free_l1_table(struct page_info *page)
-{
-    struct domain *d = page_get_owner(page);
-    l1_pgentry_t *pl1e;
-    unsigned int  i;
-
-    pl1e = __map_domain_page(page);
-
-    for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
-        put_page_from_l1e(pl1e[i], d);
-
-    unmap_domain_page(pl1e);
-}
-
-
-static int free_l2_table(struct page_info *page, int preemptible)
-{
-    struct domain *d = page_get_owner(page);
-    unsigned long pfn = mfn_x(page_to_mfn(page));
-    l2_pgentry_t *pl2e;
-    unsigned int  i = page->nr_validated_ptes - 1;
-    int err = 0;
-
-    pl2e = map_domain_page(_mfn(pfn));
-
-    ASSERT(page->nr_validated_ptes);
-    do {
-        if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
-             put_page_from_l2e(pl2e[i], pfn) == 0 &&
-             preemptible && i && hypercall_preempt_check() )
-        {
-            page->nr_validated_ptes = i;
-            err = -ERESTART;
-        }
-    } while ( !err && i-- );
-
-    unmap_domain_page(pl2e);
-
-    if ( !err )
-        page->u.inuse.type_info &= ~PGT_pae_xen_l2;
-
-    return err;
-}
-
-static int free_l3_table(struct page_info *page)
-{
-    struct domain *d = page_get_owner(page);
-    unsigned long pfn = mfn_x(page_to_mfn(page));
-    l3_pgentry_t *pl3e;
-    int rc = 0, partial = page->partial_pte;
-    unsigned int  i = page->nr_validated_ptes - !partial;
-
-    pl3e = map_domain_page(_mfn(pfn));
-
-    do {
-        rc = put_page_from_l3e(pl3e[i], pfn, partial, 0);
-        if ( rc < 0 )
-            break;
-        partial = 0;
-        if ( rc > 0 )
-            continue;
-        pl3e[i] = unadjust_guest_l3e(pl3e[i], d);
-    } while ( i-- );
-
-    unmap_domain_page(pl3e);
-
-    if ( rc == -ERESTART )
-    {
-        page->nr_validated_ptes = i;
-        page->partial_pte = partial ?: -1;
-    }
-    else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
-    {
-        page->nr_validated_ptes = i + 1;
-        page->partial_pte = 0;
-        rc = -ERESTART;
-    }
-    return rc > 0 ? 0 : rc;
-}
-
-static int free_l4_table(struct page_info *page)
-{
-    struct domain *d = page_get_owner(page);
-    unsigned long pfn = mfn_x(page_to_mfn(page));
-    l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn));
-    int rc = 0, partial = page->partial_pte;
-    unsigned int  i = page->nr_validated_ptes - !partial;
-
-    do {
-        if ( is_guest_l4_slot(d, i) )
-            rc = put_page_from_l4e(pl4e[i], pfn, partial, 0);
-        if ( rc < 0 )
-            break;
-        partial = 0;
-    } while ( i-- );
-
-    if ( rc == -ERESTART )
-    {
-        page->nr_validated_ptes = i;
-        page->partial_pte = partial ?: -1;
-    }
-    else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
-    {
-        page->nr_validated_ptes = i + 1;
-        page->partial_pte = 0;
-        rc = -ERESTART;
-    }
-
-    unmap_domain_page(pl4e);
-
-    if ( rc >= 0 )
-    {
-        atomic_dec(&d->arch.pv_domain.nr_l4_pages);
-        rc = 0;
-    }
-
-    return rc;
-}
-
 #ifndef NDEBUG
 /*
  * We must never call _put_page_type() while holding a page_lock() for
@@ -1876,309 +1086,6 @@ void page_unlock(struct page_info *page)
     current_locked_page_set(NULL);
 }
 
-/*
- * PTE flags that a guest may change without re-validating the PTE.
- * All other bits affect translation, caching, or Xen's safety.
- */
-#define FASTPATH_FLAG_WHITELIST                                     \
-    (_PAGE_NX_BIT | _PAGE_AVAIL_HIGH | _PAGE_AVAIL | _PAGE_GLOBAL | \
-     _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER)
-
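-
-/*
- * Aside: the whitelist is consumed via its complement - the fast paths
- * in mod_lN_entry() below test !lNe_has_changed(old, new,
- * ~FASTPATH_FLAG_WHITELIST), i.e. "did any bit outside the whitelist,
- * including the frame address, change?".  A rough equivalent, for
- * illustration only:
- *
- *  static bool outside_whitelist_changed(intpte_t op, intpte_t np)
- *  {
- *      return ((op ^ np) & ~(intpte_t)FASTPATH_FLAG_WHITELIST) != 0;
- *  }
- */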
-/* Update the L1 entry at pl1e to new value nl1e. */
-static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
-                        unsigned long gl1mfn, int preserve_ad,
-                        struct vcpu *pt_vcpu, struct domain *pg_dom)
-{
-    l1_pgentry_t ol1e;
-    struct domain *pt_dom = pt_vcpu->domain;
-    int rc = 0;
-
-    if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
-        return -EFAULT;
-
-    ASSERT(!paging_mode_refcounts(pt_dom));
-
-    if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
-    {
-        struct page_info *page = NULL;
-
-        if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(pt_dom)) )
-        {
-            gdprintk(XENLOG_WARNING, "Bad L1 flags %x\n",
-                    l1e_get_flags(nl1e) & l1_disallow_mask(pt_dom));
-            return -EINVAL;
-        }
-
-        /* Translate foreign guest address. */
-        if ( paging_mode_translate(pg_dom) )
-        {
-            p2m_type_t p2mt;
-            p2m_query_t q = l1e_get_flags(nl1e) & _PAGE_RW ?
-                            P2M_ALLOC | P2M_UNSHARE : P2M_ALLOC;
-
-            page = get_page_from_gfn(pg_dom, l1e_get_pfn(nl1e), &p2mt, q);
-
-            if ( p2m_is_paged(p2mt) )
-            {
-                if ( page )
-                    put_page(page);
-                p2m_mem_paging_populate(pg_dom, l1e_get_pfn(nl1e));
-                return -ENOENT;
-            }
-
-            if ( p2mt == p2m_ram_paging_in && !page )
-                return -ENOENT;
-
-            /* Did our attempt to unshare fail? */
-            if ( (q & P2M_UNSHARE) && p2m_is_shared(p2mt) )
-            {
-                /* We could not have obtained a page ref. */
-                ASSERT(!page);
-                /* And mem_sharing_notify has already been called. */
-                return -ENOMEM;
-            }
-
-            if ( !page )
-                return -EINVAL;
-            nl1e = l1e_from_page(page, l1e_get_flags(nl1e));
-        }
-
-        /* Fast path for sufficiently-similar mappings. */
-        if ( !l1e_has_changed(ol1e, nl1e, ~FASTPATH_FLAG_WHITELIST) )
-        {
-            nl1e = adjust_guest_l1e(nl1e, pt_dom);
-            rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
-                              preserve_ad);
-            if ( page )
-                put_page(page);
-            return rc ? 0 : -EBUSY;
-        }
-
-        switch ( rc = get_page_from_l1e(nl1e, pt_dom, pg_dom,
-                                        l1_disallow_mask(pt_dom)) )
-        {
-        default:
-            if ( page )
-                put_page(page);
-            return rc;
-        case 0:
-            break;
-        case _PAGE_RW ... _PAGE_RW | PAGE_CACHE_ATTRS:
-            ASSERT(!(rc & ~(_PAGE_RW | PAGE_CACHE_ATTRS)));
-            l1e_flip_flags(nl1e, rc);
-            rc = 0;
-            break;
-        }
-        if ( page )
-            put_page(page);
-
-        nl1e = adjust_guest_l1e(nl1e, pt_dom);
-        if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
-                                    preserve_ad)) )
-        {
-            ol1e = nl1e;
-            rc = -EBUSY;
-        }
-    }
-    else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
-                                     preserve_ad)) )
-    {
-        return -EBUSY;
-    }
-
-    put_page_from_l1e(ol1e, pt_dom);
-    return rc;
-}
-
-
-/* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
-static int mod_l2_entry(l2_pgentry_t *pl2e,
-                        l2_pgentry_t nl2e,
-                        unsigned long pfn,
-                        int preserve_ad,
-                        struct vcpu *vcpu)
-{
-    l2_pgentry_t ol2e;
-    struct domain *d = vcpu->domain;
-    struct page_info *l2pg = mfn_to_page(_mfn(pfn));
-    unsigned long type = l2pg->u.inuse.type_info;
-    int rc = 0;
-
-    if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
-    {
-        gdprintk(XENLOG_WARNING, "L2 update in Xen-private area, slot %#lx\n",
-                 pgentry_ptr_to_slot(pl2e));
-        return -EPERM;
-    }
-
-    if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
-        return -EFAULT;
-
-    if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
-    {
-        if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
-        {
-            gdprintk(XENLOG_WARNING, "Bad L2 flags %x\n",
-                    l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
-            return -EINVAL;
-        }
-
-        /* Fast path for sufficiently-similar mappings. */
-        if ( !l2e_has_changed(ol2e, nl2e, ~FASTPATH_FLAG_WHITELIST) )
-        {
-            nl2e = adjust_guest_l2e(nl2e, d);
-            if ( UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu, preserve_ad) )
-                return 0;
-            return -EBUSY;
-        }
-
-        if ( unlikely((rc = get_page_from_l2e(nl2e, pfn, d)) < 0) )
-            return rc;
-
-        nl2e = adjust_guest_l2e(nl2e, d);
-        if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
-                                    preserve_ad)) )
-        {
-            ol2e = nl2e;
-            rc = -EBUSY;
-        }
-    }
-    else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
-                                     preserve_ad)) )
-    {
-        return -EBUSY;
-    }
-
-    put_page_from_l2e(ol2e, pfn);
-    return rc;
-}
-
-/* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
-static int mod_l3_entry(l3_pgentry_t *pl3e,
-                        l3_pgentry_t nl3e,
-                        unsigned long pfn,
-                        int preserve_ad,
-                        struct vcpu *vcpu)
-{
-    l3_pgentry_t ol3e;
-    struct domain *d = vcpu->domain;
-    int rc = 0;
-
-    /*
-     * Disallow updates to final L3 slot. It contains Xen mappings, and it
-     * would be a pain to ensure they remain continuously valid throughout.
-     */
-    if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
-        return -EINVAL;
-
-    if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
-        return -EFAULT;
-
-    if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
-    {
-        if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
-        {
-            gdprintk(XENLOG_WARNING, "Bad L3 flags %x\n",
-                    l3e_get_flags(nl3e) & l3_disallow_mask(d));
-            return -EINVAL;
-        }
-
-        /* Fast path for sufficiently-similar mappings. */
-        if ( !l3e_has_changed(ol3e, nl3e, ~FASTPATH_FLAG_WHITELIST) )
-        {
-            nl3e = adjust_guest_l3e(nl3e, d);
-            rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu, preserve_ad);
-            return rc ? 0 : -EFAULT;
-        }
-
-        rc = get_page_from_l3e(nl3e, pfn, d, 0);
-        if ( unlikely(rc < 0) )
-            return rc;
-        rc = 0;
-
-        nl3e = adjust_guest_l3e(nl3e, d);
-        if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
-                                    preserve_ad)) )
-        {
-            ol3e = nl3e;
-            rc = -EFAULT;
-        }
-    }
-    else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
-                                     preserve_ad)) )
-    {
-        return -EFAULT;
-    }
-
-    if ( likely(rc == 0) )
-        if ( !create_pae_xen_mappings(d, pl3e) )
-            BUG();
-
-    put_page_from_l3e(ol3e, pfn, 0, 1);
-    return rc;
-}
-
-/* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
-static int mod_l4_entry(l4_pgentry_t *pl4e,
-                        l4_pgentry_t nl4e,
-                        unsigned long pfn,
-                        int preserve_ad,
-                        struct vcpu *vcpu)
-{
-    struct domain *d = vcpu->domain;
-    l4_pgentry_t ol4e;
-    int rc = 0;
-
-    if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
-    {
-        gdprintk(XENLOG_WARNING, "L4 update in Xen-private area, slot %#lx\n",
-                 pgentry_ptr_to_slot(pl4e));
-        return -EINVAL;
-    }
-
-    if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
-        return -EFAULT;
-
-    if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
-    {
-        if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
-        {
-            gdprintk(XENLOG_WARNING, "Bad L4 flags %x\n",
-                    l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
-            return -EINVAL;
-        }
-
-        /* Fast path for sufficiently-similar mappings. */
-        if ( !l4e_has_changed(ol4e, nl4e, ~FASTPATH_FLAG_WHITELIST) )
-        {
-            nl4e = adjust_guest_l4e(nl4e, d);
-            rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu, preserve_ad);
-            return rc ? 0 : -EFAULT;
-        }
-
-        rc = get_page_from_l4e(nl4e, pfn, d, 0);
-        if ( unlikely(rc < 0) )
-            return rc;
-        rc = 0;
-
-        nl4e = adjust_guest_l4e(nl4e, d);
-        if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
-                                    preserve_ad)) )
-        {
-            ol4e = nl4e;
-            rc = -EFAULT;
-        }
-    }
-    else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
-                                     preserve_ad)) )
-    {
-        return -EFAULT;
-    }
-
-    put_page_from_l4e(ol4e, pfn, 0, 1);
-    return rc;
-}
-
 static int cleanup_page_cacheattr(struct page_info *page)
 {
     unsigned int cacheattr =
@@ -2260,222 +1167,27 @@ int get_page(struct page_info *page, struct domain *domain)
     return 0;
 }
 
-/*
- * Special version of get_page() to be used exclusively when
- * - a page is known to already have a non-zero reference count
- * - the page does not need its owner to be checked
- * - it will not be called more than once without dropping the thus
- *   acquired reference again.
- * Due to get_page() reserving one reference, this call cannot fail.
- */
-static void get_page_light(struct page_info *page)
-{
-    unsigned long x, nx, y = page->count_info;
-
-    do {
-        x  = y;
-        nx = x + 1;
-        BUG_ON(!(x & PGC_count_mask)); /* Not allocated? */
-        BUG_ON(!(nx & PGC_count_mask)); /* Overflow? */
-        y = cmpxchg(&page->count_info, x, nx);
-    }
-    while ( unlikely(y != x) );
-}
-
-int pv_alloc_page_type(struct page_info *page, unsigned long type,
-                       bool preemptible)
+static int _put_page_type(struct page_info *page, bool preemptible,
+                          struct page_info *ptpg)
 {
-    struct domain *owner = page_get_owner(page);
-    int rc;
+    unsigned long nx, x, y = page->u.inuse.type_info;
 
-    /* A page table is dirtied when its type count becomes non-zero. */
-    if ( likely(owner != NULL) )
-        paging_mark_dirty(owner, page_to_mfn(page));
+    ASSERT(current_locked_page_ne_check(page));
 
-    switch ( type & PGT_type_mask )
+    for ( ; ; )
     {
-    case PGT_l1_page_table:
-        rc = alloc_l1_table(page);
-        break;
-    case PGT_l2_page_table:
-        rc = alloc_l2_table(page, type, preemptible);
-        break;
-    case PGT_l3_page_table:
-        ASSERT(preemptible);
-        rc = alloc_l3_table(page);
-        break;
-    case PGT_l4_page_table:
-        ASSERT(preemptible);
-        rc = alloc_l4_table(page);
-        break;
-    case PGT_seg_desc_page:
-        rc = alloc_segdesc_page(page);
-        break;
-    default:
-        printk("Bad type in %s %lx t=%" PRtype_info " c=%lx\n", __func__,
-               type, page->u.inuse.type_info,
-               page->count_info);
-        rc = -EINVAL;
-        BUG();
-    }
+        x  = y;
+        nx = x - 1;
 
-    /* No need for atomic update of type_info here: no one else updates it. */
-    smp_wmb();
-    switch ( rc )
-    {
-    case 0:
-        page->u.inuse.type_info |= PGT_validated;
-        break;
-    case -EINTR:
-        ASSERT((page->u.inuse.type_info &
-                (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
-        page->u.inuse.type_info &= ~PGT_count_mask;
-        break;
-    default:
-        ASSERT(rc < 0);
-        gdprintk(XENLOG_WARNING, "Error while validating mfn %" PRI_mfn
-                 " (pfn %" PRI_pfn ") for type %" PRtype_info
-                 ": caf=%08lx taf=%" PRtype_info "\n",
-                 mfn_x(page_to_mfn(page)),
-                 get_gpfn_from_mfn(mfn_x(page_to_mfn(page))),
-                 type, page->count_info, page->u.inuse.type_info);
-        if ( page != current->arch.old_guest_table )
-            page->u.inuse.type_info = 0;
-        else
+        ASSERT((x & PGT_count_mask) != 0);
+
+        switch ( nx & (PGT_locked | PGT_count_mask) )
         {
-            ASSERT((page->u.inuse.type_info &
-                    (PGT_count_mask | PGT_validated)) == 1);
-    case -ERESTART:
-            get_page_light(page);
-            page->u.inuse.type_info |= PGT_partial;
-        }
-        break;
-    }
-
-    return rc;
-}
-
-
-int pv_free_page_type(struct page_info *page, unsigned long type,
-                      bool preemptible)
-{
-    struct domain *owner = page_get_owner(page);
-    unsigned long gmfn;
-    int rc;
-
-    if ( likely(owner != NULL) && unlikely(paging_mode_enabled(owner)) )
-    {
-        /* A page table is dirtied when its type count becomes zero. */
-        paging_mark_dirty(owner, page_to_mfn(page));
-
-        ASSERT(!shadow_mode_refcounts(owner));
-
-        gmfn = mfn_to_gmfn(owner, mfn_x(page_to_mfn(page)));
-        ASSERT(VALID_M2P(gmfn));
-        /* Page sharing not supported for shadowed domains */
-        if ( !SHARED_M2P(gmfn) )
-            shadow_remove_all_shadows(owner, _mfn(gmfn));
-    }
-
-    if ( !(type & PGT_partial) )
-    {
-        page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
-        page->partial_pte = 0;
-    }
-
-    switch ( type & PGT_type_mask )
-    {
-    case PGT_l1_page_table:
-        free_l1_table(page);
-        rc = 0;
-        break;
-    case PGT_l2_page_table:
-        rc = free_l2_table(page, preemptible);
-        break;
-    case PGT_l3_page_table:
-        ASSERT(preemptible);
-        rc = free_l3_table(page);
-        break;
-    case PGT_l4_page_table:
-        ASSERT(preemptible);
-        rc = free_l4_table(page);
-        break;
-    default:
-        gdprintk(XENLOG_WARNING, "type %" PRtype_info " mfn %" PRI_mfn "\n",
-                 type, mfn_x(page_to_mfn(page)));
-        rc = -EINVAL;
-        BUG();
-    }
-
-    return rc;
-}
-
-void pv_dec_linear_pt(struct page_info *ptpg, struct page_info *page,
-                      unsigned long type)
-{
-    if ( ptpg && PGT_type_equal(type, ptpg->u.inuse.type_info) )
-    {
-        ASSERT(is_pv_domain(page_get_owner(page)));
-        ASSERT(is_pv_domain(page_get_owner(ptpg)));
-
-        dec_linear_uses(page);
-        dec_linear_entries(ptpg);
-    }
-}
-
-int pv_put_final_page_type(struct page_info *page, unsigned long type,
-                           bool preemptible, struct page_info *ptpg)
-{
-    int rc = pv_free_page_type(page, type, preemptible);
-
-    /* No need for atomic update of type_info here: no one else updates it. */
-    if ( rc == 0 )
-    {
-        pv_dec_linear_pt(ptpg, page, type);
-        ASSERT(!page->linear_pt_count || page_get_owner(page)->is_dying);
-        set_tlbflush_timestamp(page);
-        smp_wmb();
-        page->u.inuse.type_info--;
-    }
-    else if ( rc == -EINTR )
-    {
-        ASSERT((page->u.inuse.type_info &
-                (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
-        smp_wmb();
-        page->u.inuse.type_info |= PGT_validated;
-    }
-    else
-    {
-        BUG_ON(rc != -ERESTART);
-        smp_wmb();
-        get_page_light(page);
-        page->u.inuse.type_info |= PGT_partial;
-    }
-
-    return rc;
-}
-
-static int _put_page_type(struct page_info *page, bool preemptible,
-                          struct page_info *ptpg)
-{
-    unsigned long nx, x, y = page->u.inuse.type_info;
-
-    ASSERT(current_locked_page_ne_check(page));
-
-    for ( ; ; )
-    {
-        x  = y;
-        nx = x - 1;
-
-        ASSERT((x & PGT_count_mask) != 0);
-
-        switch ( nx & (PGT_locked | PGT_count_mask) )
-        {
-        case 0:
-            if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
-                 likely(nx & (PGT_validated|PGT_partial)) )
-            {
-                int rc;
+        case 0:
+            if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
+                 likely(nx & (PGT_validated|PGT_partial)) )
+            {
+                int rc;
 
                 /*
                  * Only PV guests can enter this branch. HAP guests
@@ -2817,1141 +1529,101 @@ int vcpu_destroy_pagetables(struct vcpu *v)
     return rc != -EINTR ? rc : -ERESTART;
 }
 
-int new_guest_cr3(mfn_t mfn)
+int donate_page(
+    struct domain *d, struct page_info *page, unsigned int memflags)
 {
-    struct vcpu *curr = current;
-    struct domain *d = curr->domain;
-    int rc;
-    mfn_t old_base_mfn;
-
-    if ( is_pv_32bit_domain(d) )
-    {
-        mfn_t gt_mfn = pagetable_get_mfn(curr->arch.guest_table);
-        l4_pgentry_t *pl4e = map_domain_page(gt_mfn);
-
-        rc = mod_l4_entry(pl4e,
-                          l4e_from_mfn(mfn,
-                                       (_PAGE_PRESENT | _PAGE_RW |
-                                        _PAGE_USER | _PAGE_ACCESSED)),
-                          mfn_x(gt_mfn), 0, curr);
-        unmap_domain_page(pl4e);
-        switch ( rc )
-        {
-        case 0:
-            break;
-        case -EINTR:
-        case -ERESTART:
-            return -ERESTART;
-        default:
-            gdprintk(XENLOG_WARNING,
-                     "Error while installing new compat baseptr %" PRI_mfn 
"\n",
-                     mfn_x(mfn));
-            return rc;
-        }
-
-        pv_destroy_ldt(curr); /* Unconditional TLB flush later. */
-        write_ptbase(curr);
-
-        return 0;
-    }
-
-    rc = put_old_guest_table(curr);
-    if ( unlikely(rc) )
-        return rc;
-
-    old_base_mfn = pagetable_get_mfn(curr->arch.guest_table);
-    /*
-     * This is particularly important when getting restarted after the
-     * previous attempt got preempted in the put-old-MFN phase.
-     */
-    if ( mfn_eq(old_base_mfn, mfn) )
-    {
-        write_ptbase(curr);
-        return 0;
-    }
-
-    rc = get_page_and_type_from_mfn(mfn, PGT_root_page_table, d, 0, 1);
-    switch ( rc )
-    {
-    case 0:
-        break;
-    case -EINTR:
-    case -ERESTART:
-        return -ERESTART;
-    default:
-        gdprintk(XENLOG_WARNING,
-                 "Error while installing new baseptr %" PRI_mfn "\n",
-                 mfn_x(mfn));
-        return rc;
-    }
-
-    pv_destroy_ldt(curr); /* Unconditional TLB flush later. */
-
-    if ( !VM_ASSIST(d, m2p_strict) && !paging_mode_refcounts(d) )
-        fill_ro_mpt(mfn);
-    curr->arch.guest_table = pagetable_from_mfn(mfn);
-    update_cr3(curr);
-
-    write_ptbase(curr);
-
-    if ( likely(mfn_x(old_base_mfn) != 0) )
-    {
-        struct page_info *page = mfn_to_page(old_base_mfn);
-
-        if ( paging_mode_refcounts(d) )
-            put_page(page);
-        else
-            switch ( rc = put_page_and_type_preemptible(page) )
-            {
-            case -EINTR:
-                rc = -ERESTART;
-                /* fallthrough */
-            case -ERESTART:
-                curr->arch.old_guest_ptpg = NULL;
-                curr->arch.old_guest_table = page;
-                break;
-            default:
-                BUG_ON(rc);
-                break;
-            }
-    }
+    const struct domain *owner = dom_xen;
 
-    return rc;
-}
+    spin_lock(&d->page_alloc_lock);
 
-static struct domain *get_pg_owner(domid_t domid)
-{
-    struct domain *pg_owner = NULL, *curr = current->domain;
+    if ( is_xen_heap_page(page) || ((owner = page_get_owner(page)) != NULL) )
+        goto fail;
 
-    if ( likely(domid == DOMID_SELF) )
-    {
-        pg_owner = rcu_lock_current_domain();
-        goto out;
-    }
+    if ( d->is_dying )
+        goto fail;
 
-    if ( unlikely(domid == curr->domain_id) )
-    {
-        gdprintk(XENLOG_WARNING, "Cannot specify itself as foreign domain\n");
-        goto out;
-    }
+    if ( page->count_info & ~(PGC_allocated | 1) )
+        goto fail;
 
-    switch ( domid )
+    if ( !(memflags & MEMF_no_refcount) )
     {
-    case DOMID_IO:
-        pg_owner = rcu_lock_domain(dom_io);
-        break;
-    case DOMID_XEN:
-        pg_owner = rcu_lock_domain(dom_xen);
-        break;
-    default:
-        if ( (pg_owner = rcu_lock_domain_by_id(domid)) == NULL )
-        {
-            gdprintk(XENLOG_WARNING, "Unknown domain d%d\n", domid);
-            break;
-        }
-        break;
+        if ( d->tot_pages >= d->max_pages )
+            goto fail;
+        domain_adjust_tot_pages(d, 1);
     }
 
- out:
-    return pg_owner;
-}
-
-static void put_pg_owner(struct domain *pg_owner)
-{
-    rcu_unlock_domain(pg_owner);
-}
-
-static inline int vcpumask_to_pcpumask(
-    struct domain *d, XEN_GUEST_HANDLE_PARAM(const_void) bmap, cpumask_t *pmask)
-{
-    unsigned int vcpu_id, vcpu_bias, offs;
-    unsigned long vmask;
-    struct vcpu *v;
-    bool is_native = !is_pv_32bit_domain(d);
-
-    cpumask_clear(pmask);
-    for ( vmask = 0, offs = 0; ; ++offs )
-    {
-        vcpu_bias = offs * (is_native ? BITS_PER_LONG : 32);
-        if ( vcpu_bias >= d->max_vcpus )
-            return 0;
+    page->count_info = PGC_allocated | 1;
+    page_set_owner(page, d);
+    page_list_add_tail(page, &d->page_list);
 
-        if ( unlikely(is_native ?
-                      copy_from_guest_offset(&vmask, bmap, offs, 1) :
-                      copy_from_guest_offset((unsigned int *)&vmask, bmap,
-                                             offs, 1)) )
-        {
-            cpumask_clear(pmask);
-            return -EFAULT;
-        }
+    spin_unlock(&d->page_alloc_lock);
+    return 0;
 
-        while ( vmask )
-        {
-            vcpu_id = find_first_set_bit(vmask);
-            vmask &= ~(1UL << vcpu_id);
-            vcpu_id += vcpu_bias;
-            if ( (vcpu_id >= d->max_vcpus) )
-                return 0;
-            if ( ((v = d->vcpu[vcpu_id]) != NULL) && vcpu_cpu_dirty(v) )
-                __cpumask_set_cpu(v->dirty_cpu, pmask);
-        }
-    }
+ fail:
+    spin_unlock(&d->page_alloc_lock);
+    gdprintk(XENLOG_WARNING, "Bad donate mfn %" PRI_mfn
+             " to d%d (owner d%d) caf=%08lx taf=%" PRtype_info "\n",
+             mfn_x(page_to_mfn(page)), d->domain_id,
+             owner ? owner->domain_id : DOMID_INVALID,
+             page->count_info, page->u.inuse.type_info);
+    return -EINVAL;
 }
 
-long do_mmuext_op(
-    XEN_GUEST_HANDLE_PARAM(mmuext_op_t) uops,
-    unsigned int count,
-    XEN_GUEST_HANDLE_PARAM(uint) pdone,
-    unsigned int foreigndom)
+int steal_page(
+    struct domain *d, struct page_info *page, unsigned int memflags)
 {
-    struct mmuext_op op;
-    unsigned long type;
-    unsigned int i, done = 0;
-    struct vcpu *curr = current;
-    struct domain *currd = curr->domain;
-    struct domain *pg_owner;
-    int rc = put_old_guest_table(curr);
-
-    if ( unlikely(rc) )
-    {
-        if ( likely(rc == -ERESTART) )
-            rc = hypercall_create_continuation(
-                     __HYPERVISOR_mmuext_op, "hihi", uops, count, pdone,
-                     foreigndom);
-        return rc;
-    }
-
-    if ( unlikely(count == MMU_UPDATE_PREEMPTED) &&
-         likely(guest_handle_is_null(uops)) )
-    {
-        /*
-         * See the curr->arch.old_guest_table related
-         * hypercall_create_continuation() below.
-         */
-        return (int)foreigndom;
-    }
-
-    if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
-    {
-        count &= ~MMU_UPDATE_PREEMPTED;
-        if ( unlikely(!guest_handle_is_null(pdone)) )
-            (void)copy_from_guest(&done, pdone, 1);
-    }
-    else
-        perfc_incr(calls_to_mmuext_op);
-
-    if ( unlikely(!guest_handle_okay(uops, count)) )
-        return -EFAULT;
-
-    if ( (pg_owner = get_pg_owner(foreigndom)) == NULL )
-        return -ESRCH;
-
-    if ( !is_pv_domain(pg_owner) )
-    {
-        put_pg_owner(pg_owner);
-        return -EINVAL;
-    }
-
-    rc = xsm_mmuext_op(XSM_TARGET, currd, pg_owner);
-    if ( rc )
-    {
-        put_pg_owner(pg_owner);
-        return rc;
-    }
-
-    for ( i = 0; i < count; i++ )
-    {
-        if ( curr->arch.old_guest_table || (i && hypercall_preempt_check()) )
-        {
-            rc = -ERESTART;
-            break;
-        }
+    unsigned long x, y;
+    bool drop_dom_ref = false;
+    const struct domain *owner = dom_xen;
 
-        if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
-        {
-            rc = -EFAULT;
-            break;
-        }
+    if ( paging_mode_external(d) )
+        return -EOPNOTSUPP;
 
-        if ( is_hvm_domain(currd) )
-        {
-            switch ( op.cmd )
-            {
-            case MMUEXT_PIN_L1_TABLE:
-            case MMUEXT_PIN_L2_TABLE:
-            case MMUEXT_PIN_L3_TABLE:
-            case MMUEXT_PIN_L4_TABLE:
-            case MMUEXT_UNPIN_TABLE:
-                break;
-            default:
-                rc = -EOPNOTSUPP;
-                goto done;
-            }
-        }
+    spin_lock(&d->page_alloc_lock);
 
-        rc = 0;
+    if ( is_xen_heap_page(page) || ((owner = page_get_owner(page)) != d) )
+        goto fail;
 
-        switch ( op.cmd )
-        {
-            struct page_info *page;
-            p2m_type_t p2mt;
+    /*
+     * We require there is just one reference (PGC_allocated). We temporarily
+     * drop this reference now so that we can safely swizzle the owner.
+     */
+    y = page->count_info;
+    do {
+        x = y;
+        if ( (x & (PGC_count_mask|PGC_allocated)) != (1 | PGC_allocated) )
+            goto fail;
+        y = cmpxchg(&page->count_info, x, x & ~PGC_count_mask);
+    } while ( y != x );
 
-        case MMUEXT_PIN_L1_TABLE:
-            type = PGT_l1_page_table;
-            goto pin_page;
+    /*
+     * With the sole reference dropped temporarily, no-one can update type
+     * information. Type count also needs to be zero in this case, but e.g.
+     * PGT_seg_desc_page may still have PGT_validated set, which we need to
+     * clear before transferring ownership (as validation criteria vary
+     * depending on domain type).
+     */
+    BUG_ON(page->u.inuse.type_info & (PGT_count_mask | PGT_locked |
+                                      PGT_pinned));
+    page->u.inuse.type_info = 0;
 
-        case MMUEXT_PIN_L2_TABLE:
-            type = PGT_l2_page_table;
-            goto pin_page;
+    /* Swizzle the owner then reinstate the PGC_allocated reference. */
+    page_set_owner(page, NULL);
+    y = page->count_info;
+    do {
+        x = y;
+        BUG_ON((x & (PGC_count_mask|PGC_allocated)) != PGC_allocated);
+    } while ( (y = cmpxchg(&page->count_info, x, x | 1)) != x );
 
-        case MMUEXT_PIN_L3_TABLE:
-            type = PGT_l3_page_table;
-            goto pin_page;
+    /* Unlink from original owner. */
+    if ( !(memflags & MEMF_no_refcount) && !domain_adjust_tot_pages(d, -1) )
+        drop_dom_ref = true;
+    page_list_del(page, &d->page_list);
 
-        case MMUEXT_PIN_L4_TABLE:
-            if ( is_pv_32bit_domain(pg_owner) )
-                break;
-            type = PGT_l4_page_table;
-
-        pin_page:
-            /* Ignore pinning of invalid paging levels. */
-            if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
-                break;
-
-            if ( paging_mode_refcounts(pg_owner) )
-                break;
-
-            page = get_page_from_gfn(pg_owner, op.arg1.mfn, NULL, P2M_ALLOC);
-            if ( unlikely(!page) )
-            {
-                rc = -EINVAL;
-                break;
-            }
-
-            rc = get_page_type_preemptible(page, type);
-            if ( unlikely(rc) )
-            {
-                if ( rc == -EINTR )
-                    rc = -ERESTART;
-                else if ( rc != -ERESTART )
-                    gdprintk(XENLOG_WARNING,
-                             "Error %d while pinning mfn %" PRI_mfn "\n",
-                             rc, mfn_x(page_to_mfn(page)));
-                if ( page != curr->arch.old_guest_table )
-                    put_page(page);
-                break;
-            }
-
-            rc = xsm_memory_pin_page(XSM_HOOK, currd, pg_owner, page);
-            if ( !rc && unlikely(test_and_set_bit(_PGT_pinned,
-                                                  &page->u.inuse.type_info)) )
-            {
-                gdprintk(XENLOG_WARNING,
-                         "mfn %" PRI_mfn " already pinned\n",
-                         mfn_x(page_to_mfn(page)));
-                rc = -EINVAL;
-            }
-
-            if ( unlikely(rc) )
-                goto pin_drop;
-
-            /* A page is dirtied when its pin status is set. */
-            paging_mark_dirty(pg_owner, page_to_mfn(page));
-
-            /* We can race domain destruction (domain_relinquish_resources). */
-            if ( unlikely(pg_owner != currd) )
-            {
-                bool drop_ref;
-
-                spin_lock(&pg_owner->page_alloc_lock);
-                drop_ref = (pg_owner->is_dying &&
-                            test_and_clear_bit(_PGT_pinned,
-                                               &page->u.inuse.type_info));
-                spin_unlock(&pg_owner->page_alloc_lock);
-                if ( drop_ref )
-                {
-        pin_drop:
-                    if ( type == PGT_l1_page_table )
-                        put_page_and_type(page);
-                    else
-                    {
-                        curr->arch.old_guest_ptpg = NULL;
-                        curr->arch.old_guest_table = page;
-                    }
-                }
-            }
-            break;
-
-        case MMUEXT_UNPIN_TABLE:
-            if ( paging_mode_refcounts(pg_owner) )
-                break;
-
-            page = get_page_from_gfn(pg_owner, op.arg1.mfn, NULL, P2M_ALLOC);
-            if ( unlikely(!page) )
-            {
-                gdprintk(XENLOG_WARNING,
-                         "mfn %" PRI_mfn " bad, or bad owner d%d\n",
-                         op.arg1.mfn, pg_owner->domain_id);
-                rc = -EINVAL;
-                break;
-            }
-
-            if ( !test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
-            {
-                put_page(page);
-                gdprintk(XENLOG_WARNING,
-                         "mfn %" PRI_mfn " not pinned\n", op.arg1.mfn);
-                rc = -EINVAL;
-                break;
-            }
-
-            switch ( rc = put_page_and_type_preemptible(page) )
-            {
-            case -EINTR:
-            case -ERESTART:
-                curr->arch.old_guest_ptpg = NULL;
-                curr->arch.old_guest_table = page;
-                rc = 0;
-                break;
-            default:
-                BUG_ON(rc);
-                break;
-            }
-            put_page(page);
-
-            /* A page is dirtied when its pin status is cleared. */
-            paging_mark_dirty(pg_owner, page_to_mfn(page));
-            break;
-
-        case MMUEXT_NEW_BASEPTR:
-            if ( unlikely(currd != pg_owner) )
-                rc = -EPERM;
-            else if ( unlikely(paging_mode_translate(currd)) )
-                rc = -EINVAL;
-            else
-                rc = new_guest_cr3(_mfn(op.arg1.mfn));
-            break;
-
-        case MMUEXT_NEW_USER_BASEPTR: {
-            unsigned long old_mfn;
-
-            if ( unlikely(currd != pg_owner) )
-                rc = -EPERM;
-            else if ( unlikely(paging_mode_translate(currd)) )
-                rc = -EINVAL;
-            if ( unlikely(rc) )
-                break;
-
-            old_mfn = pagetable_get_pfn(curr->arch.guest_table_user);
-            /*
-             * This is particularly important when getting restarted after the
-             * previous attempt got preempted in the put-old-MFN phase.
-             */
-            if ( old_mfn == op.arg1.mfn )
-                break;
-
-            if ( op.arg1.mfn != 0 )
-            {
-                rc = get_page_and_type_from_mfn(
-                    _mfn(op.arg1.mfn), PGT_root_page_table, currd, 0, 1);
-
-                if ( unlikely(rc) )
-                {
-                    if ( rc == -EINTR )
-                        rc = -ERESTART;
-                    else if ( rc != -ERESTART )
-                        gdprintk(XENLOG_WARNING,
-                                 "Error %d installing new mfn %" PRI_mfn "\n",
-                                 rc, op.arg1.mfn);
-                    break;
-                }
-
-                if ( VM_ASSIST(currd, m2p_strict) )
-                    zap_ro_mpt(_mfn(op.arg1.mfn));
-            }
-
-            curr->arch.guest_table_user = pagetable_from_pfn(op.arg1.mfn);
-
-            if ( old_mfn != 0 )
-            {
-                page = mfn_to_page(_mfn(old_mfn));
-
-                switch ( rc = put_page_and_type_preemptible(page) )
-                {
-                case -EINTR:
-                    rc = -ERESTART;
-                    /* fallthrough */
-                case -ERESTART:
-                    curr->arch.old_guest_ptpg = NULL;
-                    curr->arch.old_guest_table = page;
-                    break;
-                default:
-                    BUG_ON(rc);
-                    break;
-                }
-            }
-
-            break;
-        }
-
-        case MMUEXT_TLB_FLUSH_LOCAL:
-            if ( likely(currd == pg_owner) )
-                flush_tlb_local();
-            else
-                rc = -EPERM;
-            break;
-
-        case MMUEXT_INVLPG_LOCAL:
-            if ( unlikely(currd != pg_owner) )
-                rc = -EPERM;
-            else
-                paging_invlpg(curr, op.arg1.linear_addr);
-            break;
-
-        case MMUEXT_TLB_FLUSH_MULTI:
-        case MMUEXT_INVLPG_MULTI:
-        {
-            cpumask_t *mask = this_cpu(scratch_cpumask);
-
-            if ( unlikely(currd != pg_owner) )
-                rc = -EPERM;
-            else if ( unlikely(vcpumask_to_pcpumask(currd,
-                                   guest_handle_to_param(op.arg2.vcpumask,
-                                                         const_void),
-                                   mask)) )
-                rc = -EINVAL;
-            if ( unlikely(rc) )
-                break;
-
-            if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
-                flush_tlb_mask(mask);
-            else if ( __addr_ok(op.arg1.linear_addr) )
-                flush_tlb_one_mask(mask, op.arg1.linear_addr);
-            break;
-        }
-
-        case MMUEXT_TLB_FLUSH_ALL:
-            if ( likely(currd == pg_owner) )
-                flush_tlb_mask(currd->dirty_cpumask);
-            else
-                rc = -EPERM;
-            break;
-
-        case MMUEXT_INVLPG_ALL:
-            if ( unlikely(currd != pg_owner) )
-                rc = -EPERM;
-            else if ( __addr_ok(op.arg1.linear_addr) )
-                flush_tlb_one_mask(currd->dirty_cpumask, op.arg1.linear_addr);
-            break;
-
-        case MMUEXT_FLUSH_CACHE:
-            if ( unlikely(currd != pg_owner) )
-                rc = -EPERM;
-            else if ( unlikely(!cache_flush_permitted(currd)) )
-                rc = -EACCES;
-            else
-                wbinvd();
-            break;
-
-        case MMUEXT_FLUSH_CACHE_GLOBAL:
-            if ( unlikely(currd != pg_owner) )
-                rc = -EPERM;
-            else if ( likely(cache_flush_permitted(currd)) )
-            {
-                unsigned int cpu;
-                cpumask_t *mask = this_cpu(scratch_cpumask);
-
-                cpumask_clear(mask);
-                for_each_online_cpu(cpu)
-                    if ( !cpumask_intersects(mask,
-                                             per_cpu(cpu_sibling_mask, cpu)) )
-                        __cpumask_set_cpu(cpu, mask);
-                flush_mask(mask, FLUSH_CACHE);
-            }
-            else
-                rc = -EINVAL;
-            break;
-
-        case MMUEXT_SET_LDT:
-        {
-            unsigned int ents = op.arg2.nr_ents;
-            unsigned long ptr = ents ? op.arg1.linear_addr : 0;
-
-            if ( unlikely(currd != pg_owner) )
-                rc = -EPERM;
-            else if ( paging_mode_external(currd) )
-                rc = -EINVAL;
-            else if ( ((ptr & (PAGE_SIZE - 1)) != 0) || !__addr_ok(ptr) ||
-                      (ents > 8192) )
-            {
-                gdprintk(XENLOG_WARNING,
-                         "Bad args to SET_LDT: ptr=%lx, ents=%x\n", ptr, ents);
-                rc = -EINVAL;
-            }
-            else if ( (curr->arch.pv_vcpu.ldt_ents != ents) ||
-                      (curr->arch.pv_vcpu.ldt_base != ptr) )
-            {
-                if ( pv_destroy_ldt(curr) )
-                    flush_tlb_local();
-
-                curr->arch.pv_vcpu.ldt_base = ptr;
-                curr->arch.pv_vcpu.ldt_ents = ents;
-                load_LDT(curr);
-            }
-            break;
-        }
-
-        case MMUEXT_CLEAR_PAGE:
-            page = get_page_from_gfn(pg_owner, op.arg1.mfn, &p2mt, P2M_ALLOC);
-            if ( unlikely(p2mt != p2m_ram_rw) && page )
-            {
-                put_page(page);
-                page = NULL;
-            }
-            if ( !page || !get_page_type(page, PGT_writable_page) )
-            {
-                if ( page )
-                    put_page(page);
-                gdprintk(XENLOG_WARNING,
-                         "Error clearing mfn %" PRI_mfn "\n", op.arg1.mfn);
-                rc = -EINVAL;
-                break;
-            }
-
-            /* A page is dirtied when it's being cleared. */
-            paging_mark_dirty(pg_owner, page_to_mfn(page));
-
-            clear_domain_page(page_to_mfn(page));
-
-            put_page_and_type(page);
-            break;
-
-        case MMUEXT_COPY_PAGE:
-        {
-            struct page_info *src_page, *dst_page;
-
-            src_page = get_page_from_gfn(pg_owner, op.arg2.src_mfn, &p2mt,
-                                         P2M_ALLOC);
-            if ( unlikely(p2mt != p2m_ram_rw) && src_page )
-            {
-                put_page(src_page);
-                src_page = NULL;
-            }
-            if ( unlikely(!src_page) )
-            {
-                gdprintk(XENLOG_WARNING,
-                         "Error copying from mfn %" PRI_mfn "\n",
-                         op.arg2.src_mfn);
-                rc = -EINVAL;
-                break;
-            }
-
-            dst_page = get_page_from_gfn(pg_owner, op.arg1.mfn, &p2mt,
-                                         P2M_ALLOC);
-            if ( unlikely(p2mt != p2m_ram_rw) && dst_page )
-            {
-                put_page(dst_page);
-                dst_page = NULL;
-            }
-            rc = (dst_page &&
-                  get_page_type(dst_page, PGT_writable_page)) ? 0 : -EINVAL;
-            if ( unlikely(rc) )
-            {
-                put_page(src_page);
-                if ( dst_page )
-                    put_page(dst_page);
-                gdprintk(XENLOG_WARNING,
-                         "Error copying to mfn %" PRI_mfn "\n", op.arg1.mfn);
-                break;
-            }
-
-            /* A page is dirtied when it's being copied to. */
-            paging_mark_dirty(pg_owner, page_to_mfn(dst_page));
-
-            copy_domain_page(page_to_mfn(dst_page), page_to_mfn(src_page));
-
-            put_page_and_type(dst_page);
-            put_page(src_page);
-            break;
-        }
-
-        case MMUEXT_MARK_SUPER:
-        case MMUEXT_UNMARK_SUPER:
-            rc = -EOPNOTSUPP;
-            break;
-
-        default:
-            rc = -ENOSYS;
-            break;
-        }
-
- done:
-        if ( unlikely(rc) )
-            break;
-
-        guest_handle_add_offset(uops, 1);
-    }
-
-    if ( rc == -ERESTART )
-    {
-        ASSERT(i < count);
-        rc = hypercall_create_continuation(
-            __HYPERVISOR_mmuext_op, "hihi",
-            uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
-    }
-    else if ( curr->arch.old_guest_table )
-    {
-        XEN_GUEST_HANDLE_PARAM(void) null;
-
-        ASSERT(rc || i == count);
-        set_xen_guest_handle(null, NULL);
-        /*
-         * In order to have a way to communicate the final return value to
-         * our continuation, we pass this in place of "foreigndom", building
-         * on the fact that this argument isn't needed anymore.
-         */
-        rc = hypercall_create_continuation(
-                __HYPERVISOR_mmuext_op, "hihi", null,
-                MMU_UPDATE_PREEMPTED, null, rc);
-    }
-
-    put_pg_owner(pg_owner);
-
-    perfc_add(num_mmuext_ops, i);
-
-    /* Add incremental work we have done to the @done output parameter. */
-    if ( unlikely(!guest_handle_is_null(pdone)) )
-    {
-        done += i;
-        copy_to_guest(pdone, &done, 1);
-    }
-
-    return rc;
-}
-
-long do_mmu_update(
-    XEN_GUEST_HANDLE_PARAM(mmu_update_t) ureqs,
-    unsigned int count,
-    XEN_GUEST_HANDLE_PARAM(uint) pdone,
-    unsigned int foreigndom)
-{
-    struct mmu_update req;
-    void *va = NULL;
-    unsigned long gpfn, gmfn, mfn;
-    struct page_info *page;
-    unsigned int cmd, i = 0, done = 0, pt_dom;
-    struct vcpu *curr = current, *v = curr;
-    struct domain *d = v->domain, *pt_owner = d, *pg_owner;
-    mfn_t map_mfn = INVALID_MFN;
-    bool sync_guest = false;
-    uint32_t xsm_needed = 0;
-    uint32_t xsm_checked = 0;
-    int rc = put_old_guest_table(curr);
-
-    if ( unlikely(rc) )
-    {
-        if ( likely(rc == -ERESTART) )
-            rc = hypercall_create_continuation(
-                     __HYPERVISOR_mmu_update, "hihi", ureqs, count, pdone,
-                     foreigndom);
-        return rc;
-    }
-
-    if ( unlikely(count == MMU_UPDATE_PREEMPTED) &&
-         likely(guest_handle_is_null(ureqs)) )
-    {
-        /*
-         * See the curr->arch.old_guest_table related
-         * hypercall_create_continuation() below.
-         */
-        return (int)foreigndom;
-    }
-
-    if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
-    {
-        count &= ~MMU_UPDATE_PREEMPTED;
-        if ( unlikely(!guest_handle_is_null(pdone)) )
-            (void)copy_from_guest(&done, pdone, 1);
-    }
-    else
-        perfc_incr(calls_to_mmu_update);
-
-    if ( unlikely(!guest_handle_okay(ureqs, count)) )
-        return -EFAULT;
-
-    if ( (pt_dom = foreigndom >> 16) != 0 )
-    {
-        /* Pagetables belong to a foreign domain (PFD). */
-        if ( (pt_owner = rcu_lock_domain_by_id(pt_dom - 1)) == NULL )
-            return -ESRCH;
-
-        if ( pt_owner == d )
-            rcu_unlock_domain(pt_owner);
-        else if ( !pt_owner->vcpu || (v = pt_owner->vcpu[0]) == NULL )
-        {
-            rc = -EINVAL;
-            goto out;
-        }
-    }
-
-    if ( (pg_owner = get_pg_owner((uint16_t)foreigndom)) == NULL )
-    {
-        rc = -ESRCH;
-        goto out;
-    }
-
-    for ( i = 0; i < count; i++ )
-    {
-        if ( curr->arch.old_guest_table || (i && hypercall_preempt_check()) )
-        {
-            rc = -ERESTART;
-            break;
-        }
-
-        if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
-        {
-            rc = -EFAULT;
-            break;
-        }
-
-        cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
-
-        switch ( cmd )
-        {
-            /*
-             * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
-             * MMU_PT_UPDATE_PRESERVE_AD: As above but also preserve (OR)
-             * current A/D bits.
-             */
-        case MMU_NORMAL_PT_UPDATE:
-        case MMU_PT_UPDATE_PRESERVE_AD:
-        {
-            p2m_type_t p2mt;
-
-            rc = -EOPNOTSUPP;
-            if ( unlikely(paging_mode_refcounts(pt_owner)) )
-                break;
-
-            xsm_needed |= XSM_MMU_NORMAL_UPDATE;
-            if ( get_pte_flags(req.val) & _PAGE_PRESENT )
-            {
-                xsm_needed |= XSM_MMU_UPDATE_READ;
-                if ( get_pte_flags(req.val) & _PAGE_RW )
-                    xsm_needed |= XSM_MMU_UPDATE_WRITE;
-            }
-            if ( xsm_needed != xsm_checked )
-            {
-                rc = xsm_mmu_update(XSM_TARGET, d, pt_owner, pg_owner, xsm_needed);
-                if ( rc )
-                    break;
-                xsm_checked = xsm_needed;
-            }
-            rc = -EINVAL;
-
-            req.ptr -= cmd;
-            gmfn = req.ptr >> PAGE_SHIFT;
-            page = get_page_from_gfn(pt_owner, gmfn, &p2mt, P2M_ALLOC);
-
-            if ( unlikely(!page) || p2mt != p2m_ram_rw )
-            {
-                if ( page )
-                    put_page(page);
-                if ( p2m_is_paged(p2mt) )
-                {
-                    p2m_mem_paging_populate(pt_owner, gmfn);
-                    rc = -ENOENT;
-                }
-                else
-                    gdprintk(XENLOG_WARNING,
-                             "Could not get page for normal update\n");
-                break;
-            }
-
-            mfn = mfn_x(page_to_mfn(page));
-
-            if ( !mfn_eq(_mfn(mfn), map_mfn) )
-            {
-                if ( va )
-                    unmap_domain_page(va);
-                va = map_domain_page(_mfn(mfn));
-                map_mfn = _mfn(mfn);
-            }
-            va = _p(((unsigned long)va & PAGE_MASK) + (req.ptr & ~PAGE_MASK));
-
-            if ( page_lock(page) )
-            {
-                switch ( page->u.inuse.type_info & PGT_type_mask )
-                {
-                case PGT_l1_page_table:
-                    rc = mod_l1_entry(va, l1e_from_intpte(req.val), mfn,
-                                      cmd == MMU_PT_UPDATE_PRESERVE_AD, v,
-                                      pg_owner);
-                    break;
-
-                case PGT_l2_page_table:
-                    if ( unlikely(pg_owner != pt_owner) )
-                        break;
-                    rc = mod_l2_entry(va, l2e_from_intpte(req.val), mfn,
-                                      cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
-                    break;
-
-                case PGT_l3_page_table:
-                    if ( unlikely(pg_owner != pt_owner) )
-                        break;
-                    rc = mod_l3_entry(va, l3e_from_intpte(req.val), mfn,
-                                      cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
-                    break;
-
-                case PGT_l4_page_table:
-                    if ( unlikely(pg_owner != pt_owner) )
-                        break;
-                    rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn,
-                                      cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
-                    /*
-                     * No need to sync if all uses of the page can be accounted
-                     * to the page lock we hold, its pinned status, and uses on
-                     * this (v)CPU.
-                     */
-                    if ( !rc && this_cpu(root_pgt) &&
-                         ((page->u.inuse.type_info & PGT_count_mask) >
-                          (1 + !!(page->u.inuse.type_info & PGT_pinned) +
-                           (pagetable_get_pfn(curr->arch.guest_table) == mfn) +
-                           (pagetable_get_pfn(curr->arch.guest_table_user) ==
-                            mfn))) )
-                        sync_guest = true;
-                    break;
-
-                case PGT_writable_page:
-                    perfc_incr(writable_mmu_updates);
-                    if ( paging_write_guest_entry(v, va, req.val, _mfn(mfn)) )
-                        rc = 0;
-                    break;
-                }
-                page_unlock(page);
-                if ( rc == -EINTR )
-                    rc = -ERESTART;
-            }
-            else if ( get_page_type(page, PGT_writable_page) )
-            {
-                perfc_incr(writable_mmu_updates);
-                if ( paging_write_guest_entry(v, va, req.val, _mfn(mfn)) )
-                    rc = 0;
-                put_page_type(page);
-            }
-
-            put_page(page);
-        }
-        break;
-
-        case MMU_MACHPHYS_UPDATE:
-            if ( unlikely(d != pt_owner) )
-            {
-                rc = -EPERM;
-                break;
-            }
-
-            if ( unlikely(paging_mode_translate(pg_owner)) )
-            {
-                rc = -EINVAL;
-                break;
-            }
-
-            mfn = req.ptr >> PAGE_SHIFT;
-            gpfn = req.val;
-
-            xsm_needed |= XSM_MMU_MACHPHYS_UPDATE;
-            if ( xsm_needed != xsm_checked )
-            {
-                rc = xsm_mmu_update(XSM_TARGET, d, NULL, pg_owner, xsm_needed);
-                if ( rc )
-                    break;
-                xsm_checked = xsm_needed;
-            }
-
-            page = get_page_from_mfn(_mfn(mfn), pg_owner);
-            if ( unlikely(!page) )
-            {
-                gdprintk(XENLOG_WARNING,
-                         "Could not get page for mach->phys update\n");
-                rc = -EINVAL;
-                break;
-            }
-
-            set_gpfn_from_mfn(mfn, gpfn);
-
-            paging_mark_dirty(pg_owner, _mfn(mfn));
-
-            put_page(page);
-            break;
-
-        default:
-            rc = -ENOSYS;
-            break;
-        }
-
-        if ( unlikely(rc) )
-            break;
-
-        guest_handle_add_offset(ureqs, 1);
-    }
-
-    if ( rc == -ERESTART )
-    {
-        ASSERT(i < count);
-        rc = hypercall_create_continuation(
-            __HYPERVISOR_mmu_update, "hihi",
-            ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
-    }
-    else if ( curr->arch.old_guest_table )
-    {
-        XEN_GUEST_HANDLE_PARAM(void) null;
-
-        ASSERT(rc || i == count);
-        set_xen_guest_handle(null, NULL);
-        /*
-         * In order to have a way to communicate the final return value to
-         * our continuation, we pass this in place of "foreigndom", building
-         * on the fact that this argument isn't needed anymore.
-         */
-        rc = hypercall_create_continuation(
-                __HYPERVISOR_mmu_update, "hihi", null,
-                MMU_UPDATE_PREEMPTED, null, rc);
-    }
-
-    put_pg_owner(pg_owner);
-
-    if ( va )
-        unmap_domain_page(va);
-
-    if ( sync_guest )
-    {
-        /*
-         * Force other vCPU-s of the affected guest to pick up L4 entry
-         * changes (if any). Issue a flush IPI with empty operation mask to
-         * facilitate this (including ourselves waiting for the IPI to
-         * actually have arrived). Utilize the fact that FLUSH_VA_VALID is
-         * meaningless without FLUSH_CACHE, but still allows the IPI to pass
-         * the no-op check in flush_area_mask().
-         */
-        unsigned int cpu = smp_processor_id();
-        cpumask_t *mask = per_cpu(scratch_cpumask, cpu);
-
-        cpumask_andnot(mask, pt_owner->dirty_cpumask, cpumask_of(cpu));
-        if ( !cpumask_empty(mask) )
-            flush_area_mask(mask, ZERO_BLOCK_PTR, FLUSH_VA_VALID);
-    }
-
-    perfc_add(num_page_updates, i);
-
- out:
-    if ( pt_owner != d )
-        rcu_unlock_domain(pt_owner);
-
-    /* Add incremental work we have done to the @done output parameter. */
-    if ( unlikely(!guest_handle_is_null(pdone)) )
-    {
-        done += i;
-        copy_to_guest(pdone, &done, 1);
-    }
-
-    return rc;
-}
-
-int donate_page(
-    struct domain *d, struct page_info *page, unsigned int memflags)
-{
-    const struct domain *owner = dom_xen;
-
-    spin_lock(&d->page_alloc_lock);
-
-    if ( is_xen_heap_page(page) || ((owner = page_get_owner(page)) != NULL) )
-        goto fail;
-
-    if ( d->is_dying )
-        goto fail;
-
-    if ( page->count_info & ~(PGC_allocated | 1) )
-        goto fail;
-
-    if ( !(memflags & MEMF_no_refcount) )
-    {
-        if ( d->tot_pages >= d->max_pages )
-            goto fail;
-        domain_adjust_tot_pages(d, 1);
-    }
-
-    page->count_info = PGC_allocated | 1;
-    page_set_owner(page, d);
-    page_list_add_tail(page, &d->page_list);
-
-    spin_unlock(&d->page_alloc_lock);
-    return 0;
-
- fail:
-    spin_unlock(&d->page_alloc_lock);
-    gdprintk(XENLOG_WARNING, "Bad donate mfn %" PRI_mfn
-             " to d%d (owner d%d) caf=%08lx taf=%" PRtype_info "\n",
-             mfn_x(page_to_mfn(page)), d->domain_id,
-             owner ? owner->domain_id : DOMID_INVALID,
-             page->count_info, page->u.inuse.type_info);
-    return -EINVAL;
-}
-
-int steal_page(
-    struct domain *d, struct page_info *page, unsigned int memflags)
-{
-    unsigned long x, y;
-    bool drop_dom_ref = false;
-    const struct domain *owner = dom_xen;
-
-    if ( paging_mode_external(d) )
-        return -EOPNOTSUPP;
-
-    spin_lock(&d->page_alloc_lock);
-
-    if ( is_xen_heap_page(page) || ((owner = page_get_owner(page)) != d) )
-        goto fail;
-
-    /*
-     * We require there is just one reference (PGC_allocated). We temporarily
-     * drop this reference now so that we can safely swizzle the owner.
-     */
-    y = page->count_info;
-    do {
-        x = y;
-        if ( (x & (PGC_count_mask|PGC_allocated)) != (1 | PGC_allocated) )
-            goto fail;
-        y = cmpxchg(&page->count_info, x, x & ~PGC_count_mask);
-    } while ( y != x );
-
-    /*
-     * With the sole reference dropped temporarily, no-one can update type
-     * information. Type count also needs to be zero in this case, but e.g.
-     * PGT_seg_desc_page may still have PGT_validated set, which we need to
-     * clear before transferring ownership (as validation criteria vary
-     * depending on domain type).
-     */
-    BUG_ON(page->u.inuse.type_info & (PGT_count_mask | PGT_locked |
-                                      PGT_pinned));
-    page->u.inuse.type_info = 0;
-
-    /* Swizzle the owner then reinstate the PGC_allocated reference. */
-    page_set_owner(page, NULL);
-    y = page->count_info;
-    do {
-        x = y;
-        BUG_ON((x & (PGC_count_mask|PGC_allocated)) != PGC_allocated);
-    } while ( (y = cmpxchg(&page->count_info, x, x | 1)) != x );
-
-    /* Unlink from original owner. */
-    if ( !(memflags & MEMF_no_refcount) && !domain_adjust_tot_pages(d, -1) )
-        drop_dom_ref = true;
-    page_list_del(page, &d->page_list);
-
-    spin_unlock(&d->page_alloc_lock);
-    if ( unlikely(drop_dom_ref) )
-        put_domain(d);
-    return 0;
+    spin_unlock(&d->page_alloc_lock);
+    if ( unlikely(drop_dom_ref) )
+        put_domain(d);
+    return 0;
 
  fail:
     spin_unlock(&d->page_alloc_lock);
@@ -3963,122 +1635,6 @@ int steal_page(
     return -EINVAL;
 }
 
-static int __do_update_va_mapping(
-    unsigned long va, u64 val64, unsigned long flags, struct domain *pg_owner)
-{
-    l1_pgentry_t   val = l1e_from_intpte(val64);
-    struct vcpu   *v   = current;
-    struct domain *d   = v->domain;
-    struct page_info *gl1pg;
-    l1_pgentry_t  *pl1e;
-    unsigned long  bmap_ptr;
-    mfn_t          gl1mfn;
-    cpumask_t     *mask = NULL;
-    int            rc;
-
-    perfc_incr(calls_to_update_va);
-
-    rc = xsm_update_va_mapping(XSM_TARGET, d, pg_owner, val);
-    if ( rc )
-        return rc;
-
-    rc = -EINVAL;
-    pl1e = map_guest_l1e(va, &gl1mfn);
-    gl1pg = pl1e ? get_page_from_mfn(gl1mfn, d) : NULL;
-    if ( unlikely(!gl1pg) )
-        goto out;
-
-    if ( !page_lock(gl1pg) )
-    {
-        put_page(gl1pg);
-        goto out;
-    }
-
-    if ( (gl1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
-    {
-        page_unlock(gl1pg);
-        put_page(gl1pg);
-        goto out;
-    }
-
-    rc = mod_l1_entry(pl1e, val, mfn_x(gl1mfn), 0, v, pg_owner);
-
-    page_unlock(gl1pg);
-    put_page(gl1pg);
-
- out:
-    if ( pl1e )
-        unmap_domain_page(pl1e);
-
-    switch ( flags & UVMF_FLUSHTYPE_MASK )
-    {
-    case UVMF_TLB_FLUSH:
-        switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
-        {
-        case UVMF_LOCAL:
-            flush_tlb_local();
-            break;
-        case UVMF_ALL:
-            mask = d->dirty_cpumask;
-            break;
-        default:
-            mask = this_cpu(scratch_cpumask);
-            rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
-                                                                     void),
-                                      mask);
-            break;
-        }
-        if ( mask )
-            flush_tlb_mask(mask);
-        break;
-
-    case UVMF_INVLPG:
-        switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
-        {
-        case UVMF_LOCAL:
-            paging_invlpg(v, va);
-            break;
-        case UVMF_ALL:
-            mask = d->dirty_cpumask;
-            break;
-        default:
-            mask = this_cpu(scratch_cpumask);
-            rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
-                                                                     void),
-                                      mask);
-            break;
-        }
-        if ( mask )
-            flush_tlb_one_mask(mask, va);
-        break;
-    }
-
-    return rc;
-}
-
-long do_update_va_mapping(unsigned long va, u64 val64,
-                          unsigned long flags)
-{
-    return __do_update_va_mapping(va, val64, flags, current->domain);
-}
-
-long do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
-                                      unsigned long flags,
-                                      domid_t domid)
-{
-    struct domain *pg_owner;
-    int rc;
-
-    if ( (pg_owner = get_pg_owner(domid)) == NULL )
-        return -ESRCH;
-
-    rc = __do_update_va_mapping(va, val64, flags, pg_owner);
-
-    put_pg_owner(pg_owner);
-
-    return rc;
-}
-
 typedef struct e820entry e820entry_t;
 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
 
diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c
index 8d7a4fd85f..6504422ad1 100644
--- a/xen/arch/x86/pv/mm.c
+++ b/xen/arch/x86/pv/mm.c
@@ -20,9 +20,18 @@
  */
 
 #include <xen/guest_access.h>
+#include <xen/hypercall.h>
+#include <xen/lib.h>
+#include <xen/mm.h>
+#include <xsm/xsm.h>
 
 #include <asm/current.h>
+#include <asm/event.h>
+#include <asm/iocap.h>
+#include <asm/ldt.h>
 #include <asm/p2m.h>
+#include <asm/pv/mm.h>
+#include <asm/shadow.h>
 
 #include "mm.h"
 
@@ -133,6 +142,2449 @@ bool pv_map_ldt_shadow_page(unsigned int offset)
     return true;
 }
 
+/*
+ * PTE flags that a guest may change without re-validating the PTE.
+ * All other bits affect translation, caching, or Xen's safety.
+ */
+#define FASTPATH_FLAG_WHITELIST                                     \
+    (_PAGE_NX_BIT | _PAGE_AVAIL_HIGH | _PAGE_AVAIL | _PAGE_GLOBAL | \
+     _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER)
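+
+/*
+ * For example: toggling _PAGE_ACCESSED/_PAGE_DIRTY or setting the NX bit
+ * on a present mapping stays on the "fast path" checks in the
+ * mod_l[1-4]_entry handlers below, whereas newly granting _PAGE_RW (or
+ * changing the referenced frame) falls outside the whitelist and forces a
+ * full revalidation of the entry.
+ */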
+
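+/*
+ * Note on the arguments: "preemptible" selects the preemptible variant of
+ * get_page_type(), while "partial" carries continuation state for
+ * preempted operations; a negative value indicates that a general
+ * reference is already held from an earlier, preempted attempt, so none
+ * is acquired here (nor dropped again on failure).
+ */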
+static int get_page_and_type_from_mfn(
+    mfn_t mfn, unsigned long type, struct domain *d,
+    int partial, int preemptible)
+{
+    struct page_info *page = mfn_to_page(mfn);
+    int rc;
+
+    if ( likely(partial >= 0) &&
+         unlikely(!get_page_from_mfn(mfn, d)) )
+        return -EINVAL;
+
+    rc = (preemptible ?
+          get_page_type_preemptible(page, type) :
+          (get_page_type(page, type) ? 0 : -EINVAL));
+
+    if ( unlikely(rc) && partial >= 0 &&
+         (!preemptible || page != current->arch.old_guest_table) )
+        put_page(page);
+
+    return rc;
+}
+
+static void put_data_page(
+    struct page_info *page, int writeable)
+{
+    if ( writeable )
+        put_page_and_type(page);
+    else
+        put_page(page);
+}
+
+static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
+{
+    struct page_info *page;
+    l3_pgentry_t     l3e3;
+
+    if ( !is_pv_32bit_domain(d) )
+        return 1;
+
+    pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
+
+    /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
+    l3e3 = pl3e[3];
+    if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
+    {
+        gdprintk(XENLOG_WARNING, "PAE L3 3rd slot is empty\n");
+        return 0;
+    }
+
+    /*
+     * The Xen-private mappings include linear mappings. The L2 thus cannot
+     * be shared by multiple L3 tables. The test here is adequate because:
+     *  1. Cannot appear in slots != 3 because get_page_type() checks the
+     *     PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
+     *  2. Cannot appear in another page table's L3:
+     *     a. alloc_l3_table() calls this function and this check will fail
+     *     b. mod_l3_entry() disallows updates to slot 3 in an existing table
+     */
+    page = l3e_get_page(l3e3);
+    BUG_ON(page->u.inuse.type_info & PGT_pinned);
+    BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
+    BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
+    if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
+    {
+        gdprintk(XENLOG_WARNING, "PAE L3 3rd slot is shared\n");
+        return 0;
+    }
+
+    return 1;
+}
+
+#ifdef CONFIG_PV_LINEAR_PT
+
+static bool inc_linear_entries(struct page_info *pg)
+{
+    typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc;
+
+    do {
+        /*
+         * The check below checks for the "linear use" count being non-zero
+         * as well as overflow.  Signed integer overflow is undefined behavior
+         * according to the C spec.  However, as long as linear_pt_count is
+         * smaller in size than 'int', the arithmetic operation of the
+         * increment below won't overflow; rather the result will be truncated
+         * when stored.  Ensure that this is always true.
+         */
+        BUILD_BUG_ON(sizeof(nc) >= sizeof(int));
+        oc = nc++;
+        if ( nc <= 0 )
+            return false;
+        nc = cmpxchg(&pg->linear_pt_count, oc, nc);
+    } while ( oc != nc );
+
+    return true;
+}
+
+static void dec_linear_entries(struct page_info *pg)
+{
+    typeof(pg->linear_pt_count) oc;
+
+    oc = arch_fetch_and_add(&pg->linear_pt_count, -1);
+    ASSERT(oc > 0);
+}
+
+static bool inc_linear_uses(struct page_info *pg)
+{
+    typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc;
+
+    do {
+        /* See the respective comment in inc_linear_entries(). */
+        BUILD_BUG_ON(sizeof(nc) >= sizeof(int));
+        oc = nc--;
+        if ( nc >= 0 )
+            return false;
+        nc = cmpxchg(&pg->linear_pt_count, oc, nc);
+    } while ( oc != nc );
+
+    return true;
+}
+
+static void dec_linear_uses(struct page_info *pg)
+{
+    typeof(pg->linear_pt_count) oc;
+
+    oc = arch_fetch_and_add(&pg->linear_pt_count, 1);
+    ASSERT(oc < 0);
+}
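+
+/*
+ * To summarise the four helpers above: a positive linear_pt_count counts
+ * the linear entries a page table contains, while a negative count tracks
+ * how many times the page is itself the target of a linear mapping. Each
+ * inc_* helper refuses to move the count across zero, keeping the two
+ * roles mutually exclusive.
+ */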
+
+/*
+ * We allow root tables to map each other (a.k.a. linear page tables). It
+ * needs some special care with reference counts and access permissions:
+ *  1. The mapping entry must be read-only, or the guest may get write access
+ *     to its own PTEs.
+ *  2. We must only bump the reference counts for an *already validated*
+ *     L2 table, or we can end up in a deadlock in get_page_type() by waiting
+ *     on a validation that is required to complete that validation.
+ *  3. We only need to increment the reference counts for the mapped page
+ *     frame if it is mapped by a different root table. This is sufficient and
+ *     also necessary to allow validation of a root table mapping itself.
+ */
+static bool __read_mostly opt_pv_linear_pt = true;
+boolean_param("pv-linear-pt", opt_pv_linear_pt);
+
+#define define_get_linear_pagetable(level)                                  \
+static int                                                                  \
+get_##level##_linear_pagetable(                                             \
+    level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d)         \
+{                                                                           \
+    unsigned long x, y;                                                     \
+    unsigned long pfn;                                                      \
+                                                                            \
+    if ( !opt_pv_linear_pt )                                                \
+    {                                                                       \
+        gdprintk(XENLOG_WARNING,                                            \
+                 "Attempt to create linear p.t. (feature disabled)\n");     \
+        return 0;                                                           \
+    }                                                                       \
+                                                                            \
+    if ( (level##e_get_flags(pde) & _PAGE_RW) )                             \
+    {                                                                       \
+        gdprintk(XENLOG_WARNING,                                            \
+                 "Attempt to create linear p.t. with write perms\n");       \
+        return 0;                                                           \
+    }                                                                       \
+                                                                            \
+    if ( (pfn = level##e_get_pfn(pde)) != pde_pfn )                         \
+    {                                                                       \
+        struct page_info *page, *ptpg = mfn_to_page(_mfn(pde_pfn));         \
+                                                                            \
+        /* Make sure the page table belongs to the correct domain. */       \
+        if ( unlikely(page_get_owner(ptpg) != d) )                          \
+            return 0;                                                       \
+                                                                            \
+        /* Make sure the mapped frame belongs to the correct domain. */     \
+        page = get_page_from_mfn(_mfn(pfn), d);                             \
+        if ( unlikely(!page) )                                              \
+            return 0;                                                       \
+                                                                            \
+        /*                                                                  \
+         * Ensure that the mapped frame is an already-validated page table  \
+         * and is not itself having linear entries, as well as that the     \
+         * containing page table is not itself in use as a linear page      \
+         * table elsewhere.                                                  \
+         * If so, atomically increment the count (checking for overflow).   \
+         */                                                                 \
+        if ( !inc_linear_entries(ptpg) )                                    \
+        {                                                                   \
+            put_page(page);                                                 \
+            return 0;                                                       \
+        }                                                                   \
+        if ( !inc_linear_uses(page) )                                       \
+        {                                                                   \
+            dec_linear_entries(ptpg);                                       \
+            put_page(page);                                                 \
+            return 0;                                                       \
+        }                                                                   \
+        y = page->u.inuse.type_info;                                        \
+        do {                                                                \
+            x = y;                                                          \
+            if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||        \
+                 unlikely((x & (PGT_type_mask|PGT_validated)) !=            \
+                          (PGT_##level##_page_table|PGT_validated)) )       \
+            {                                                               \
+                dec_linear_uses(page);                                      \
+                dec_linear_entries(ptpg);                                   \
+                put_page(page);                                             \
+                return 0;                                                   \
+            }                                                               \
+        }                                                                   \
+        while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );   \
+    }                                                                       \
+                                                                            \
+    return 1;                                                               \
+}
+
+#else /* CONFIG_PV_LINEAR_PT */
+
+#define define_get_linear_pagetable(level)                              \
+static int                                                              \
+get_##level##_linear_pagetable(                                         \
+        level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
+{                                                                       \
+        return 0;                                                       \
+}
+
+static void dec_linear_uses(struct page_info *pg)
+{
+    ASSERT(pg->linear_pt_count == 0);
+}
+
+static void dec_linear_entries(struct page_info *pg)
+{
+    ASSERT(pg->linear_pt_count == 0);
+}
+
+#endif /* CONFIG_PV_LINEAR_PT */
+
+/* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
+/*
+ * get_page_from_l2e returns:
+ *   1 => page not present
+ *   0 => success
+ *  <0 => error code
+ */
+define_get_linear_pagetable(l2);
+static int
+get_page_from_l2e(
+    l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
+{
+    unsigned long mfn = l2e_get_pfn(l2e);
+    int rc;
+
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
+        return 1;
+
+    if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
+    {
+        gdprintk(XENLOG_WARNING, "Bad L2 flags %x\n",
+                 l2e_get_flags(l2e) & L2_DISALLOW_MASK);
+        return -EINVAL;
+    }
+
+    rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d, 0, 0);
+    if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
+        rc = 0;
+
+    return rc;
+}
+
+
+/*
+ * get_page_from_l3e returns:
+ *   1 => page not present
+ *   0 => success
+ *  <0 => error code
+ */
+define_get_linear_pagetable(l3);
+static int
+get_page_from_l3e(
+    l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial)
+{
+    int rc;
+
+    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
+        return 1;
+
+    if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
+    {
+        gdprintk(XENLOG_WARNING, "Bad L3 flags %x\n",
+                 l3e_get_flags(l3e) & l3_disallow_mask(d));
+        return -EINVAL;
+    }
+
+    rc = get_page_and_type_from_mfn(
+        l3e_get_mfn(l3e), PGT_l2_page_table, d, partial, 1);
+    if ( unlikely(rc == -EINVAL) &&
+         !is_pv_32bit_domain(d) &&
+         get_l3_linear_pagetable(l3e, pfn, d) )
+        rc = 0;
+
+    return rc;
+}
+
+/*
+ * get_page_from_l4e returns:
+ *   1 => page not present
+ *   0 => success
+ *  <0 => error code
+ */
+define_get_linear_pagetable(l4);
+static int
+get_page_from_l4e(
+    l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial)
+{
+    int rc;
+
+    if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
+        return 1;
+
+    if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
+    {
+        gdprintk(XENLOG_WARNING, "Bad L4 flags %x\n",
+                 l4e_get_flags(l4e) & L4_DISALLOW_MASK);
+        return -EINVAL;
+    }
+
+    rc = get_page_and_type_from_mfn(
+        l4e_get_mfn(l4e), PGT_l3_page_table, d, partial, 1);
+    if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
+        rc = 0;
+
+    return rc;
+}
+
+/*
+ * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
+ * Note also that this automatically deals correctly with linear p.t.'s.
+ */
+static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
+{
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_pfn(l2e) == pfn) )
+        return 1;
+
+    if ( l2e_get_flags(l2e) & _PAGE_PSE )
+    {
+        struct page_info *page = l2e_get_page(l2e);
+        unsigned int i;
+
+        for ( i = 0; i < (1u << PAGETABLE_ORDER); i++, page++ )
+            put_page_and_type(page);
+    }
+    else
+    {
+        struct page_info *pg = l2e_get_page(l2e);
+        int rc = put_page_type_ptpg(pg, mfn_to_page(_mfn(pfn)));
+
+        ASSERT(!rc);
+        put_page(pg);
+    }
+
+    return 0;
+}
+
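+/*
+ * In put_page_from_l{3,4}e, "defer" queues the page on the current
+ * vCPU's old_guest_ptpg/old_guest_table pair instead of dropping the
+ * type reference inline, so that the potentially long-running teardown
+ * can be completed preemptibly later; it cannot be combined with a
+ * partially (de)validated entry, hence the ASSERT(!defer) on that path.
+ */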
+static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
+                             int partial, bool defer)
+{
+    struct page_info *pg;
+    int rc;
+
+    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) )
+        return 1;
+
+    if ( unlikely(l3e_get_flags(l3e) & _PAGE_PSE) )
+    {
+        unsigned long mfn = l3e_get_pfn(l3e);
+        int writeable = l3e_get_flags(l3e) & _PAGE_RW;
+
+        ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)));
+        do {
+            put_data_page(mfn_to_page(_mfn(mfn)), writeable);
+        } while ( ++mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1) );
+
+        return 0;
+    }
+
+    pg = l3e_get_page(l3e);
+
+    if ( unlikely(partial > 0) )
+    {
+        ASSERT(!defer);
+        return put_page_type_ptpg_preemptible(pg, mfn_to_page(_mfn(pfn)));
+    }
+
+    if ( defer )
+    {
+        current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
+        current->arch.old_guest_table = pg;
+        return 0;
+    }
+
+    rc = put_page_type_ptpg_preemptible(pg, mfn_to_page(_mfn(pfn)));
+    if ( likely(!rc) )
+        put_page(pg);
+
+    return rc;
+}
+
+static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
+                             int partial, bool defer)
+{
+    int rc = 1;
+
+    if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
+         (l4e_get_pfn(l4e) != pfn) )
+    {
+        struct page_info *pg = l4e_get_page(l4e);
+
+        if ( unlikely(partial > 0) )
+        {
+            ASSERT(!defer);
+            return put_page_type_ptpg_preemptible(pg, mfn_to_page(_mfn(pfn)));
+        }
+
+        if ( defer )
+        {
+            current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
+            current->arch.old_guest_table = pg;
+            return 0;
+        }
+
+        rc = put_page_type_ptpg_preemptible(pg, mfn_to_page(_mfn(pfn)));
+        if ( likely(!rc) )
+            put_page(pg);
+    }
+
+    return rc;
+}
+
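+/*
+ * The UPDATE_ENTRY() invocations below perform a cmpxchg-style write
+ * against the old value read earlier: they fail (surfacing as -EBUSY or
+ * -EFAULT) if the entry changed under our feet, and with preserve_ad set
+ * they carry the accessed/dirty bits of the live entry over into the new
+ * one.
+ */
+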
+/* Update the L1 entry at pl1e to new value nl1e. */
+static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
+                        unsigned long gl1mfn, int preserve_ad,
+                        struct vcpu *pt_vcpu, struct domain *pg_dom)
+{
+    l1_pgentry_t ol1e;
+    struct domain *pt_dom = pt_vcpu->domain;
+    int rc = 0;
+
+    if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
+        return -EFAULT;
+
+    ASSERT(!paging_mode_refcounts(pt_dom));
+
+    if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
+    {
+        struct page_info *page = NULL;
+
+        if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(pt_dom)) )
+        {
+            gdprintk(XENLOG_WARNING, "Bad L1 flags %x\n",
+                    l1e_get_flags(nl1e) & l1_disallow_mask(pt_dom));
+            return -EINVAL;
+        }
+
+        /* Translate foreign guest address. */
+        if ( paging_mode_translate(pg_dom) )
+        {
+            p2m_type_t p2mt;
+            p2m_query_t q = l1e_get_flags(nl1e) & _PAGE_RW ?
+                            P2M_ALLOC | P2M_UNSHARE : P2M_ALLOC;
+
+            page = get_page_from_gfn(pg_dom, l1e_get_pfn(nl1e), &p2mt, q);
+
+            if ( p2m_is_paged(p2mt) )
+            {
+                if ( page )
+                    put_page(page);
+                p2m_mem_paging_populate(pg_dom, l1e_get_pfn(nl1e));
+                return -ENOENT;
+            }
+
+            if ( p2mt == p2m_ram_paging_in && !page )
+                return -ENOENT;
+
+            /* Did our attempt to unshare fail? */
+            if ( (q & P2M_UNSHARE) && p2m_is_shared(p2mt) )
+            {
+                /* We could not have obtained a page ref. */
+                ASSERT(!page);
+                /* And mem_sharing_notify has already been called. */
+                return -ENOMEM;
+            }
+
+            if ( !page )
+                return -EINVAL;
+            nl1e = l1e_from_page(page, l1e_get_flags(nl1e));
+        }
+
+        /* Fast path for sufficiently-similar mappings. */
+        if ( !l1e_has_changed(ol1e, nl1e, ~FASTPATH_FLAG_WHITELIST) )
+        {
+            nl1e = adjust_guest_l1e(nl1e, pt_dom);
+            rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
+                              preserve_ad);
+            if ( page )
+                put_page(page);
+            return rc ? 0 : -EBUSY;
+        }
+
+        switch ( rc = get_page_from_l1e(nl1e, pt_dom, pg_dom,
+                                        l1_disallow_mask(pt_dom)) )
+        {
+        default:
+            if ( page )
+                put_page(page);
+            return rc;
+        case 0:
+            break;
+        case _PAGE_RW ... _PAGE_RW | PAGE_CACHE_ATTRS:
+            ASSERT(!(rc & ~(_PAGE_RW | PAGE_CACHE_ATTRS)));
+            l1e_flip_flags(nl1e, rc);
+            rc = 0;
+            break;
+        }
+        if ( page )
+            put_page(page);
+
+        nl1e = adjust_guest_l1e(nl1e, pt_dom);
+        if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
+                                    preserve_ad)) )
+        {
+            ol1e = nl1e;
+            rc = -EBUSY;
+        }
+    }
+    else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
+                                     preserve_ad)) )
+    {
+        return -EBUSY;
+    }
+
+    put_page_from_l1e(ol1e, pt_dom);
+    return rc;
+}
+
+
+/* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
+static int mod_l2_entry(l2_pgentry_t *pl2e,
+                        l2_pgentry_t nl2e,
+                        unsigned long pfn,
+                        int preserve_ad,
+                        struct vcpu *vcpu)
+{
+    l2_pgentry_t ol2e;
+    struct domain *d = vcpu->domain;
+    struct page_info *l2pg = mfn_to_page(_mfn(pfn));
+    unsigned long type = l2pg->u.inuse.type_info;
+    int rc = 0;
+
+    if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
+    {
+        gdprintk(XENLOG_WARNING, "L2 update in Xen-private area, slot %#lx\n",
+                 pgentry_ptr_to_slot(pl2e));
+        return -EPERM;
+    }
+
+    if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
+        return -EFAULT;
+
+    if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
+    {
+        if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
+        {
+            gdprintk(XENLOG_WARNING, "Bad L2 flags %x\n",
+                    l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
+            return -EINVAL;
+        }
+
+        /* Fast path for sufficiently-similar mappings. */
+        if ( !l2e_has_changed(ol2e, nl2e, ~FASTPATH_FLAG_WHITELIST) )
+        {
+            nl2e = adjust_guest_l2e(nl2e, d);
+            if ( UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu, preserve_ad) )
+                return 0;
+            return -EBUSY;
+        }
+
+        if ( unlikely((rc = get_page_from_l2e(nl2e, pfn, d)) < 0) )
+            return rc;
+
+        nl2e = adjust_guest_l2e(nl2e, d);
+        if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
+                                    preserve_ad)) )
+        {
+            ol2e = nl2e;
+            rc = -EBUSY;
+        }
+    }
+    else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
+                                     preserve_ad)) )
+    {
+        return -EBUSY;
+    }
+
+    put_page_from_l2e(ol2e, pfn);
+    return rc;
+}
+
+/* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
+static int mod_l3_entry(l3_pgentry_t *pl3e,
+                        l3_pgentry_t nl3e,
+                        unsigned long pfn,
+                        int preserve_ad,
+                        struct vcpu *vcpu)
+{
+    l3_pgentry_t ol3e;
+    struct domain *d = vcpu->domain;
+    int rc = 0;
+
+    /*
+     * Disallow updates to final L3 slot. It contains Xen mappings, and it
+     * would be a pain to ensure they remain continuously valid throughout.
+     */
+    if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
+        return -EINVAL;
+
+    if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
+        return -EFAULT;
+
+    if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
+    {
+        if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
+        {
+            gdprintk(XENLOG_WARNING, "Bad L3 flags %x\n",
+                    l3e_get_flags(nl3e) & l3_disallow_mask(d));
+            return -EINVAL;
+        }
+
+        /* Fast path for sufficiently-similar mappings. */
+        if ( !l3e_has_changed(ol3e, nl3e, ~FASTPATH_FLAG_WHITELIST) )
+        {
+            nl3e = adjust_guest_l3e(nl3e, d);
+            rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu, preserve_ad);
+            return rc ? 0 : -EFAULT;
+        }
+
+        rc = get_page_from_l3e(nl3e, pfn, d, 0);
+        if ( unlikely(rc < 0) )
+            return rc;
+        rc = 0;
+
+        nl3e = adjust_guest_l3e(nl3e, d);
+        if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
+                                    preserve_ad)) )
+        {
+            ol3e = nl3e;
+            rc = -EFAULT;
+        }
+    }
+    else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
+                                     preserve_ad)) )
+    {
+        return -EFAULT;
+    }
+
+    if ( likely(rc == 0) )
+        if ( !create_pae_xen_mappings(d, pl3e) )
+            BUG();
+
+    put_page_from_l3e(ol3e, pfn, 0, 1);
+    return rc;
+}
+
+/* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
+static int mod_l4_entry(l4_pgentry_t *pl4e,
+                        l4_pgentry_t nl4e,
+                        unsigned long pfn,
+                        int preserve_ad,
+                        struct vcpu *vcpu)
+{
+    struct domain *d = vcpu->domain;
+    l4_pgentry_t ol4e;
+    int rc = 0;
+
+    if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
+    {
+        gdprintk(XENLOG_WARNING, "L4 update in Xen-private area, slot %#lx\n",
+                 pgentry_ptr_to_slot(pl4e));
+        return -EINVAL;
+    }
+
+    if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
+        return -EFAULT;
+
+    if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
+    {
+        if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
+        {
+            gdprintk(XENLOG_WARNING, "Bad L4 flags %x\n",
+                    l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
+            return -EINVAL;
+        }
+
+        /* Fast path for sufficiently-similar mappings. */
+        if ( !l4e_has_changed(ol4e, nl4e, ~FASTPATH_FLAG_WHITELIST) )
+        {
+            nl4e = adjust_guest_l4e(nl4e, d);
+            rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu, preserve_ad);
+            return rc ? 0 : -EFAULT;
+        }
+
+        rc = get_page_from_l4e(nl4e, pfn, d, 0);
+        if ( unlikely(rc < 0) )
+            return rc;
+        rc = 0;
+
+        nl4e = adjust_guest_l4e(nl4e, d);
+        if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
+                                    preserve_ad)) )
+        {
+            ol4e = nl4e;
+            rc = -EFAULT;
+        }
+    }
+    else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
+                                     preserve_ad)) )
+    {
+        return -EFAULT;
+    }
+
+    put_page_from_l4e(ol4e, pfn, 0, 1);
+    return rc;
+}
+
+static int alloc_l1_table(struct page_info *page)
+{
+    struct domain *d = page_get_owner(page);
+    l1_pgentry_t  *pl1e;
+    unsigned int   i;
+    int            ret = 0;
+
+    pl1e = __map_domain_page(page);
+
+    for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+    {
+        switch ( ret = get_page_from_l1e(pl1e[i], d, d, l1_disallow_mask(d)) )
+        {
+        default:
+            goto fail;
+        case 0:
+            break;
+        case _PAGE_RW ... _PAGE_RW | PAGE_CACHE_ATTRS:
+            ASSERT(!(ret & ~(_PAGE_RW | PAGE_CACHE_ATTRS)));
+            l1e_flip_flags(pl1e[i], ret);
+            break;
+        }
+
+        pl1e[i] = adjust_guest_l1e(pl1e[i], d);
+    }
+
+    unmap_domain_page(pl1e);
+    return 0;
+
+ fail:
+    gdprintk(XENLOG_WARNING, "Failure in alloc_l1_table: slot %#x\n", i);
+    while ( i-- > 0 )
+        put_page_from_l1e(pl1e[i], d);
+
+    unmap_domain_page(pl1e);
+    return ret;
+}
+
+static int alloc_l2_table(struct page_info *page, unsigned long type,
+                          int preemptible)
+{
+    struct domain *d = page_get_owner(page);
+    unsigned long  pfn = mfn_x(page_to_mfn(page));
+    l2_pgentry_t  *pl2e;
+    unsigned int   i;
+    int            rc = 0;
+
+    pl2e = map_domain_page(_mfn(pfn));
+
+    for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
+    {
+        if ( preemptible && i > page->nr_validated_ptes
+             && hypercall_preempt_check() )
+        {
+            page->nr_validated_ptes = i;
+            rc = -ERESTART;
+            break;
+        }
+
+        if ( !is_guest_l2_slot(d, type, i) ||
+             (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
+            continue;
+
+        if ( rc < 0 )
+        {
+            gdprintk(XENLOG_WARNING, "Failure in alloc_l2_table: slot %#x\n", i);
+            while ( i-- > 0 )
+                if ( is_guest_l2_slot(d, type, i) )
+                    put_page_from_l2e(pl2e[i], pfn);
+            break;
+        }
+
+        pl2e[i] = adjust_guest_l2e(pl2e[i], d);
+    }
+
+    if ( rc >= 0 && (type & PGT_pae_xen_l2) )
+        init_xen_pae_l2_slots(pl2e, d);
+
+    unmap_domain_page(pl2e);
+    return rc > 0 ? 0 : rc;
+}
+
+static int alloc_l3_table(struct page_info *page)
+{
+    struct domain *d = page_get_owner(page);
+    unsigned long  pfn = mfn_x(page_to_mfn(page));
+    l3_pgentry_t  *pl3e;
+    unsigned int   i;
+    int            rc = 0, partial = page->partial_pte;
+
+    pl3e = map_domain_page(_mfn(pfn));
+
+    /*
+     * PAE guests allocate full pages, but aren't required to initialize
+     * more than the first four entries; when running in compatibility
+     * mode, however, the full page is visible to the MMU, and hence all
+     * 512 entries must be valid/verified, which is most easily achieved
+     * by clearing them out.
+     */
+    if ( is_pv_32bit_domain(d) )
+        memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
+
+    for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
+          i++, partial = 0 )
+    {
+        if ( is_pv_32bit_domain(d) && (i == 3) )
+        {
+            if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
+                 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) )
+                rc = -EINVAL;
+            else
+                rc = get_page_and_type_from_mfn(
+                    l3e_get_mfn(pl3e[i]),
+                    PGT_l2_page_table | PGT_pae_xen_l2, d, partial, 1);
+        }
+        else if ( (rc = get_page_from_l3e(pl3e[i], pfn, d, partial)) > 0 )
+            continue;
+
+        if ( rc == -ERESTART )
+        {
+            page->nr_validated_ptes = i;
+            page->partial_pte = partial ?: 1;
+        }
+        else if ( rc == -EINTR && i )
+        {
+            page->nr_validated_ptes = i;
+            page->partial_pte = 0;
+            rc = -ERESTART;
+        }
+        if ( rc < 0 )
+            break;
+
+        pl3e[i] = adjust_guest_l3e(pl3e[i], d);
+    }
+
+    if ( rc >= 0 && !create_pae_xen_mappings(d, pl3e) )
+        rc = -EINVAL;
+    if ( rc < 0 && rc != -ERESTART && rc != -EINTR )
+    {
+        gdprintk(XENLOG_WARNING, "Failure in alloc_l3_table: slot %#x\n", i);
+        if ( i )
+        {
+            page->nr_validated_ptes = i;
+            page->partial_pte = 0;
+            current->arch.old_guest_ptpg = NULL;
+            current->arch.old_guest_table = page;
+        }
+        while ( i-- > 0 )
+            pl3e[i] = unadjust_guest_l3e(pl3e[i], d);
+    }
+
+    unmap_domain_page(pl3e);
+    return rc > 0 ? 0 : rc;
+}
+
+static int alloc_l4_table(struct page_info *page)
+{
+    struct domain *d = page_get_owner(page);
+    unsigned long  pfn = mfn_x(page_to_mfn(page));
+    l4_pgentry_t  *pl4e = map_domain_page(_mfn(pfn));
+    unsigned int   i;
+    int            rc = 0, partial = page->partial_pte;
+
+    for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
+          i++, partial = 0 )
+    {
+        if ( !is_guest_l4_slot(d, i) ||
+             (rc = get_page_from_l4e(pl4e[i], pfn, d, partial)) > 0 )
+            continue;
+
+        if ( rc == -ERESTART )
+        {
+            page->nr_validated_ptes = i;
+            page->partial_pte = partial ?: 1;
+        }
+        else if ( rc < 0 )
+        {
+            if ( rc != -EINTR )
+                gdprintk(XENLOG_WARNING,
+                         "Failure in alloc_l4_table: slot %#x\n", i);
+            if ( i )
+            {
+                page->nr_validated_ptes = i;
+                page->partial_pte = 0;
+                if ( rc == -EINTR )
+                    rc = -ERESTART;
+                else
+                {
+                    if ( current->arch.old_guest_table )
+                        page->nr_validated_ptes++;
+                    current->arch.old_guest_ptpg = NULL;
+                    current->arch.old_guest_table = page;
+                }
+            }
+        }
+        if ( rc < 0 )
+        {
+            unmap_domain_page(pl4e);
+            return rc;
+        }
+
+        pl4e[i] = adjust_guest_l4e(pl4e[i], d);
+    }
+
+    if ( rc >= 0 )
+    {
+        init_xen_l4_slots(pl4e, _mfn(pfn),
+                          d, INVALID_MFN, VM_ASSIST(d, m2p_strict));
+        atomic_inc(&d->arch.pv_domain.nr_l4_pages);
+        rc = 0;
+    }
+    unmap_domain_page(pl4e);
+
+    return rc;
+}
+
+static void free_l1_table(struct page_info *page)
+{
+    struct domain *d = page_get_owner(page);
+    l1_pgentry_t *pl1e;
+    unsigned int  i;
+
+    pl1e = __map_domain_page(page);
+
+    for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+        put_page_from_l1e(pl1e[i], d);
+
+    unmap_domain_page(pl1e);
+}
+
+static int free_l2_table(struct page_info *page, int preemptible)
+{
+    struct domain *d = page_get_owner(page);
+    unsigned long pfn = mfn_x(page_to_mfn(page));
+    l2_pgentry_t *pl2e;
+    unsigned int  i = page->nr_validated_ptes - 1;
+    int err = 0;
+
+    pl2e = map_domain_page(_mfn(pfn));
+
+    ASSERT(page->nr_validated_ptes);
+    do {
+        if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
+             put_page_from_l2e(pl2e[i], pfn) == 0 &&
+             preemptible && i && hypercall_preempt_check() )
+        {
+            page->nr_validated_ptes = i;
+            err = -ERESTART;
+        }
+    } while ( !err && i-- );
+
+    unmap_domain_page(pl2e);
+
+    if ( !err )
+        page->u.inuse.type_info &= ~PGT_pae_xen_l2;
+
+    return err;
+}
+
+static int free_l3_table(struct page_info *page)
+{
+    struct domain *d = page_get_owner(page);
+    unsigned long pfn = mfn_x(page_to_mfn(page));
+    l3_pgentry_t *pl3e;
+    int rc = 0, partial = page->partial_pte;
+    unsigned int  i = page->nr_validated_ptes - !partial;
+
+    pl3e = map_domain_page(_mfn(pfn));
+
+    do {
+        rc = put_page_from_l3e(pl3e[i], pfn, partial, 0);
+        if ( rc < 0 )
+            break;
+        partial = 0;
+        if ( rc > 0 )
+            continue;
+        pl3e[i] = unadjust_guest_l3e(pl3e[i], d);
+    } while ( i-- );
+
+    unmap_domain_page(pl3e);
+
+    if ( rc == -ERESTART )
+    {
+        page->nr_validated_ptes = i;
+        page->partial_pte = partial ?: -1;
+    }
+    else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
+    {
+        page->nr_validated_ptes = i + 1;
+        page->partial_pte = 0;
+        rc = -ERESTART;
+    }
+    return rc > 0 ? 0 : rc;
+}
+
+static int free_l4_table(struct page_info *page)
+{
+    struct domain *d = page_get_owner(page);
+    unsigned long pfn = mfn_x(page_to_mfn(page));
+    l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn));
+    int rc = 0, partial = page->partial_pte;
+    unsigned int  i = page->nr_validated_ptes - !partial;
+
+    do {
+        if ( is_guest_l4_slot(d, i) )
+            rc = put_page_from_l4e(pl4e[i], pfn, partial, 0);
+        if ( rc < 0 )
+            break;
+        partial = 0;
+    } while ( i-- );
+
+    if ( rc == -ERESTART )
+    {
+        page->nr_validated_ptes = i;
+        page->partial_pte = partial ?: -1;
+    }
+    else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
+    {
+        page->nr_validated_ptes = i + 1;
+        page->partial_pte = 0;
+        rc = -ERESTART;
+    }
+
+    unmap_domain_page(pl4e);
+
+    if ( rc >= 0 )
+    {
+        atomic_dec(&d->arch.pv_domain.nr_l4_pages);
+        rc = 0;
+    }
+
+    return rc;
+}
+
+void pv_dec_linear_pt(struct page_info *ptpg, struct page_info *page,
+                      unsigned long type)
+{
+    if ( ptpg && PGT_type_equal(type, ptpg->u.inuse.type_info) )
+    {
+        ASSERT(is_pv_domain(page_get_owner(page)));
+        ASSERT(is_pv_domain(page_get_owner(ptpg)));
+
+        dec_linear_uses(page);
+        dec_linear_entries(ptpg);
+    }
+}
+
+/*
+ * Special version of get_page() to be used exclusively when
+ * - a page is known to already have a non-zero reference count
+ * - the page does not need its owner to be checked
+ * - it will not be called more than once without dropping the thus
+ *   acquired reference again.
+ * Due to get_page() reserving one reference, this call cannot fail.
+ */
+static void get_page_light(struct page_info *page)
+{
+    unsigned long x, nx, y = page->count_info;
+
+    do {
+        x  = y;
+        nx = x + 1;
+        BUG_ON(!(x & PGC_count_mask)); /* Not allocated? */
+        BUG_ON(!(nx & PGC_count_mask)); /* Overflow? */
+        y = cmpxchg(&page->count_info, x, nx);
+    }
+    while ( unlikely(y != x) );
+}
+
+int pv_put_final_page_type(struct page_info *page, unsigned long type,
+                           bool preemptible, struct page_info *ptpg)
+{
+    int rc = pv_free_page_type(page, type, preemptible);
+
+    /* No need for atomic update of type_info here: no one else updates it. */
+    if ( rc == 0 )
+    {
+        pv_dec_linear_pt(ptpg, page, type);
+        ASSERT(!page->linear_pt_count || page_get_owner(page)->is_dying);
+        set_tlbflush_timestamp(page);
+        smp_wmb();
+        page->u.inuse.type_info--;
+    }
+    else if ( rc == -EINTR )
+    {
+        ASSERT((page->u.inuse.type_info &
+                (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
+        smp_wmb();
+        page->u.inuse.type_info |= PGT_validated;
+    }
+    else
+    {
+        BUG_ON(rc != -ERESTART);
+        smp_wmb();
+        get_page_light(page);
+        page->u.inuse.type_info |= PGT_partial;
+    }
+
+    return rc;
+}
+
+static int alloc_segdesc_page(struct page_info *page)
+{
+    const struct domain *owner = page_get_owner(page);
+    struct desc_struct *descs = __map_domain_page(page);
+    unsigned i;
+
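+    /* A 4k page holds 512 eight-byte segment descriptors. */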
+    for ( i = 0; i < 512; i++ )
+        if ( unlikely(!check_descriptor(owner, &descs[i])) )
+            break;
+
+    unmap_domain_page(descs);
+
+    return i == 512 ? 0 : -EINVAL;
+}
+
+int pv_alloc_page_type(struct page_info *page, unsigned long type,
+                       bool preemptible)
+{
+    struct domain *owner = page_get_owner(page);
+    int rc;
+
+    /* A page table is dirtied when its type count becomes non-zero. */
+    if ( likely(owner != NULL) )
+        paging_mark_dirty(owner, page_to_mfn(page));
+
+    switch ( type & PGT_type_mask )
+    {
+    case PGT_l1_page_table:
+        rc = alloc_l1_table(page);
+        break;
+    case PGT_l2_page_table:
+        rc = alloc_l2_table(page, type, preemptible);
+        break;
+    case PGT_l3_page_table:
+        ASSERT(preemptible);
+        rc = alloc_l3_table(page);
+        break;
+    case PGT_l4_page_table:
+        ASSERT(preemptible);
+        rc = alloc_l4_table(page);
+        break;
+    case PGT_seg_desc_page:
+        rc = alloc_segdesc_page(page);
+        break;
+    default:
+        printk("Bad type in %s %lx t=%" PRtype_info " c=%lx\n", __func__,
+               type, page->u.inuse.type_info,
+               page->count_info);
+        rc = -EINVAL;
+        BUG();
+    }
+
+    /* No need for atomic update of type_info here: no one else updates it. */
+    smp_wmb();
+    switch ( rc )
+    {
+    case 0:
+        page->u.inuse.type_info |= PGT_validated;
+        break;
+    case -EINTR:
+        ASSERT((page->u.inuse.type_info &
+                (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
+        page->u.inuse.type_info &= ~PGT_count_mask;
+        break;
+    default:
+        ASSERT(rc < 0);
+        gdprintk(XENLOG_WARNING, "Error while validating mfn %" PRI_mfn
+                 " (pfn %" PRI_pfn ") for type %" PRtype_info
+                 ": caf=%08lx taf=%" PRtype_info "\n",
+                 mfn_x(page_to_mfn(page)),
+                 get_gpfn_from_mfn(mfn_x(page_to_mfn(page))),
+                 type, page->count_info, page->u.inuse.type_info);
+        if ( page != current->arch.old_guest_table )
+            page->u.inuse.type_info = 0;
+        else
+        {
+            ASSERT((page->u.inuse.type_info &
+                    (PGT_count_mask | PGT_validated)) == 1);
+    case -ERESTART:
+            get_page_light(page);
+            page->u.inuse.type_info |= PGT_partial;
+        }
+        break;
+    }
+
+    return rc;
+}
+
+int pv_free_page_type(struct page_info *page, unsigned long type,
+                      bool preemptible)
+{
+    struct domain *owner = page_get_owner(page);
+    unsigned long gmfn;
+    int rc;
+
+    if ( likely(owner != NULL) && unlikely(paging_mode_enabled(owner)) )
+    {
+        /* A page table is dirtied when its type count becomes zero. */
+        paging_mark_dirty(owner, page_to_mfn(page));
+
+        ASSERT(!shadow_mode_refcounts(owner));
+
+        gmfn = mfn_to_gmfn(owner, mfn_x(page_to_mfn(page)));
+        ASSERT(VALID_M2P(gmfn));
+        /* Page sharing not supported for shadowed domains */
+        if ( !SHARED_M2P(gmfn) )
+            shadow_remove_all_shadows(owner, _mfn(gmfn));
+    }
+
+    if ( !(type & PGT_partial) )
+    {
+        page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
+        page->partial_pte = 0;
+    }
+
+    switch ( type & PGT_type_mask )
+    {
+    case PGT_l1_page_table:
+        free_l1_table(page);
+        rc = 0;
+        break;
+    case PGT_l2_page_table:
+        rc = free_l2_table(page, preemptible);
+        break;
+    case PGT_l3_page_table:
+        ASSERT(preemptible);
+        rc = free_l3_table(page);
+        break;
+    case PGT_l4_page_table:
+        ASSERT(preemptible);
+        rc = free_l4_table(page);
+        break;
+    default:
+        gdprintk(XENLOG_WARNING, "type %" PRtype_info " mfn %" PRI_mfn "\n",
+                 type, mfn_x(page_to_mfn(page)));
+        rc = -EINVAL;
+        BUG();
+    }
+
+    return rc;
+}
+
+int new_guest_cr3(mfn_t mfn)
+{
+    struct vcpu *curr = current;
+    struct domain *d = curr->domain;
+    int rc;
+    mfn_t old_base_mfn;
+
+    if ( is_pv_32bit_domain(d) )
+    {
+        mfn_t gt_mfn = pagetable_get_mfn(curr->arch.guest_table);
+        l4_pgentry_t *pl4e = map_domain_page(gt_mfn);
+
+        rc = mod_l4_entry(pl4e,
+                          l4e_from_mfn(mfn,
+                                       (_PAGE_PRESENT | _PAGE_RW |
+                                        _PAGE_USER | _PAGE_ACCESSED)),
+                          mfn_x(gt_mfn), 0, curr);
+        unmap_domain_page(pl4e);
+        switch ( rc )
+        {
+        case 0:
+            break;
+        case -EINTR:
+        case -ERESTART:
+            return -ERESTART;
+        default:
+            gdprintk(XENLOG_WARNING,
+                     "Error while installing new compat baseptr %" PRI_mfn 
"\n",
+                     mfn_x(mfn));
+            return rc;
+        }
+
+        pv_destroy_ldt(curr); /* Unconditional TLB flush later. */
+        write_ptbase(curr);
+
+        return 0;
+    }
+
+    rc = put_old_guest_table(curr);
+    if ( unlikely(rc) )
+        return rc;
+
+    old_base_mfn = pagetable_get_mfn(curr->arch.guest_table);
+    /*
+     * This is particularly important when getting restarted after the
+     * previous attempt got preempted in the put-old-MFN phase.
+     */
+    if ( mfn_eq(old_base_mfn, mfn) )
+    {
+        write_ptbase(curr);
+        return 0;
+    }
+
+    rc = get_page_and_type_from_mfn(mfn, PGT_root_page_table, d, 0, 1);
+    switch ( rc )
+    {
+    case 0:
+        break;
+    case -EINTR:
+    case -ERESTART:
+        return -ERESTART;
+    default:
+        gdprintk(XENLOG_WARNING,
+                 "Error while installing new baseptr %" PRI_mfn "\n",
+                 mfn_x(mfn));
+        return rc;
+    }
+
+    pv_destroy_ldt(curr); /* Unconditional TLB flush later. */
+
+    if ( !VM_ASSIST(d, m2p_strict) && !paging_mode_refcounts(d) )
+        fill_ro_mpt(mfn);
+    curr->arch.guest_table = pagetable_from_mfn(mfn);
+    update_cr3(curr);
+
+    write_ptbase(curr);
+
+    if ( likely(mfn_x(old_base_mfn) != 0) )
+    {
+        struct page_info *page = mfn_to_page(old_base_mfn);
+
+        if ( paging_mode_refcounts(d) )
+            put_page(page);
+        else
+            switch ( rc = put_page_and_type_preemptible(page) )
+            {
+            case -EINTR:
+                rc = -ERESTART;
+                /* fallthrough */
+            case -ERESTART:
+                curr->arch.old_guest_ptpg = NULL;
+                curr->arch.old_guest_table = page;
+                break;
+            default:
+                BUG_ON(rc);
+                break;
+            }
+    }
+
+    return rc;
+}
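
For context, new_guest_cr3() is invoked from the MMUEXT_NEW_BASEPTR case
in do_mmuext_op() further down.  A minimal guest-side sketch of that path,
assuming a Linux-style HYPERVISOR_mmuext_op() wrapper and the definitions
from the public headers:

    /* Sketch: ask Xen to switch this vCPU's base page table. */
    static int switch_baseptr(unsigned long new_base_mfn)
    {
        struct mmuext_op op = {
            .cmd = MMUEXT_NEW_BASEPTR,
            .arg1.mfn = new_base_mfn,
        };

        return HYPERVISOR_mmuext_op(&op, 1 /* count */, NULL /* pdone */,
                                    DOMID_SELF);
    }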
+
+static struct domain *get_pg_owner(domid_t domid)
+{
+    struct domain *pg_owner = NULL, *curr = current->domain;
+
+    if ( likely(domid == DOMID_SELF) )
+    {
+        pg_owner = rcu_lock_current_domain();
+        goto out;
+    }
+
+    if ( unlikely(domid == curr->domain_id) )
+    {
+        gdprintk(XENLOG_WARNING, "Cannot specify itself as foreign domain\n");
+        goto out;
+    }
+
+    switch ( domid )
+    {
+    case DOMID_IO:
+        pg_owner = rcu_lock_domain(dom_io);
+        break;
+    case DOMID_XEN:
+        pg_owner = rcu_lock_domain(dom_xen);
+        break;
+    default:
+        if ( (pg_owner = rcu_lock_domain_by_id(domid)) == NULL )
+        {
+            gdprintk(XENLOG_WARNING, "Unknown domain d%d\n", domid);
+            break;
+        }
+        break;
+    }
+
+ out:
+    return pg_owner;
+}
+
+static void put_pg_owner(struct domain *pg_owner)
+{
+    rcu_unlock_domain(pg_owner);
+}
+
+static inline int vcpumask_to_pcpumask(
+    struct domain *d, XEN_GUEST_HANDLE_PARAM(const_void) bmap, cpumask_t *pmask)
+{
+    unsigned int vcpu_id, vcpu_bias, offs;
+    unsigned long vmask;
+    struct vcpu *v;
+    bool is_native = !is_pv_32bit_domain(d);
+
+    cpumask_clear(pmask);
+    for ( vmask = 0, offs = 0; ; ++offs )
+    {
+        vcpu_bias = offs * (is_native ? BITS_PER_LONG : 32);
+        if ( vcpu_bias >= d->max_vcpus )
+            return 0;
+
+        if ( unlikely(is_native ?
+                      copy_from_guest_offset(&vmask, bmap, offs, 1) :
+                      copy_from_guest_offset((unsigned int *)&vmask, bmap,
+                                             offs, 1)) )
+        {
+            cpumask_clear(pmask);
+            return -EFAULT;
+        }
+
+        while ( vmask )
+        {
+            vcpu_id = find_first_set_bit(vmask);
+            vmask &= ~(1UL << vcpu_id);
+            vcpu_id += vcpu_bias;
+            if ( vcpu_id >= d->max_vcpus )
+                return 0;
+            if ( ((v = d->vcpu[vcpu_id]) != NULL) && vcpu_cpu_dirty(v) )
+                __cpumask_set_cpu(v->dirty_cpu, pmask);
+        }
+    }
+}
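
The bitmap decoded above is supplied by the guest in vCPU-index order, in
unsigned-long-sized words for 64-bit guests and 32-bit words for compat
guests.  A guest-side sketch (hypothetical helper, assuming the public
mmuext_op definitions and the same Linux-style wrapper as before) flushing
one address on vCPUs 0 and 2 only:

    /* Sketch: INVLPG on a subset of vCPUs, expressed as a vcpumask. */
    static int invlpg_on_vcpus_0_and_2(unsigned long linear_addr)
    {
        static unsigned long vcpumask = (1UL << 0) | (1UL << 2);
        struct mmuext_op op = {
            .cmd = MMUEXT_INVLPG_MULTI,
            .arg1.linear_addr = linear_addr,
        };

        set_xen_guest_handle(op.arg2.vcpumask, &vcpumask);

        return HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);
    }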
+
+long do_mmuext_op(
+    XEN_GUEST_HANDLE_PARAM(mmuext_op_t) uops,
+    unsigned int count,
+    XEN_GUEST_HANDLE_PARAM(uint) pdone,
+    unsigned int foreigndom)
+{
+    struct mmuext_op op;
+    unsigned long type;
+    unsigned int i, done = 0;
+    struct vcpu *curr = current;
+    struct domain *currd = curr->domain;
+    struct domain *pg_owner;
+    int rc = put_old_guest_table(curr);
+
+    if ( unlikely(rc) )
+    {
+        if ( likely(rc == -ERESTART) )
+            rc = hypercall_create_continuation(
+                     __HYPERVISOR_mmuext_op, "hihi", uops, count, pdone,
+                     foreigndom);
+        return rc;
+    }
+
+    if ( unlikely(count == MMU_UPDATE_PREEMPTED) &&
+         likely(guest_handle_is_null(uops)) )
+    {
+        /*
+         * See the curr->arch.old_guest_table related
+         * hypercall_create_continuation() below.
+         */
+        return (int)foreigndom;
+    }
+
+    if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
+    {
+        count &= ~MMU_UPDATE_PREEMPTED;
+        if ( unlikely(!guest_handle_is_null(pdone)) )
+            (void)copy_from_guest(&done, pdone, 1);
+    }
+    else
+        perfc_incr(calls_to_mmuext_op);
+
+    if ( unlikely(!guest_handle_okay(uops, count)) )
+        return -EFAULT;
+
+    if ( (pg_owner = get_pg_owner(foreigndom)) == NULL )
+        return -ESRCH;
+
+    if ( !is_pv_domain(pg_owner) )
+    {
+        put_pg_owner(pg_owner);
+        return -EINVAL;
+    }
+
+    rc = xsm_mmuext_op(XSM_TARGET, currd, pg_owner);
+    if ( rc )
+    {
+        put_pg_owner(pg_owner);
+        return rc;
+    }
+
+    for ( i = 0; i < count; i++ )
+    {
+        if ( curr->arch.old_guest_table || (i && hypercall_preempt_check()) )
+        {
+            rc = -ERESTART;
+            break;
+        }
+
+        if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
+        {
+            rc = -EFAULT;
+            break;
+        }
+
+        if ( is_hvm_domain(currd) )
+        {
+            switch ( op.cmd )
+            {
+            case MMUEXT_PIN_L1_TABLE:
+            case MMUEXT_PIN_L2_TABLE:
+            case MMUEXT_PIN_L3_TABLE:
+            case MMUEXT_PIN_L4_TABLE:
+            case MMUEXT_UNPIN_TABLE:
+                break;
+            default:
+                rc = -EOPNOTSUPP;
+                goto done;
+            }
+        }
+
+        rc = 0;
+
+        switch ( op.cmd )
+        {
+            struct page_info *page;
+            p2m_type_t p2mt;
+
+        case MMUEXT_PIN_L1_TABLE:
+            type = PGT_l1_page_table;
+            goto pin_page;
+
+        case MMUEXT_PIN_L2_TABLE:
+            type = PGT_l2_page_table;
+            goto pin_page;
+
+        case MMUEXT_PIN_L3_TABLE:
+            type = PGT_l3_page_table;
+            goto pin_page;
+
+        case MMUEXT_PIN_L4_TABLE:
+            if ( is_pv_32bit_domain(pg_owner) )
+                break;
+            type = PGT_l4_page_table;
+
+        pin_page:
+            /* Ignore pinning of invalid paging levels. */
+            if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
+                break;
+
+            if ( paging_mode_refcounts(pg_owner) )
+                break;
+
+            page = get_page_from_gfn(pg_owner, op.arg1.mfn, NULL, P2M_ALLOC);
+            if ( unlikely(!page) )
+            {
+                rc = -EINVAL;
+                break;
+            }
+
+            rc = get_page_type_preemptible(page, type);
+            if ( unlikely(rc) )
+            {
+                if ( rc == -EINTR )
+                    rc = -ERESTART;
+                else if ( rc != -ERESTART )
+                    gdprintk(XENLOG_WARNING,
+                             "Error %d while pinning mfn %" PRI_mfn "\n",
+                             rc, mfn_x(page_to_mfn(page)));
+                if ( page != curr->arch.old_guest_table )
+                    put_page(page);
+                break;
+            }
+
+            rc = xsm_memory_pin_page(XSM_HOOK, currd, pg_owner, page);
+            if ( !rc && unlikely(test_and_set_bit(_PGT_pinned,
+                                                  &page->u.inuse.type_info)) )
+            {
+                gdprintk(XENLOG_WARNING,
+                         "mfn %" PRI_mfn " already pinned\n",
+                         mfn_x(page_to_mfn(page)));
+                rc = -EINVAL;
+            }
+
+            if ( unlikely(rc) )
+                goto pin_drop;
+
+            /* A page is dirtied when its pin status is set. */
+            paging_mark_dirty(pg_owner, page_to_mfn(page));
+
+            /* We can race domain destruction (domain_relinquish_resources). */
+            if ( unlikely(pg_owner != currd) )
+            {
+                bool drop_ref;
+
+                spin_lock(&pg_owner->page_alloc_lock);
+                drop_ref = (pg_owner->is_dying &&
+                            test_and_clear_bit(_PGT_pinned,
+                                               &page->u.inuse.type_info));
+                spin_unlock(&pg_owner->page_alloc_lock);
+                if ( drop_ref )
+                {
+        pin_drop:
+                    if ( type == PGT_l1_page_table )
+                        put_page_and_type(page);
+                    else
+                    {
+                        curr->arch.old_guest_ptpg = NULL;
+                        curr->arch.old_guest_table = page;
+                    }
+                }
+            }
+            break;
+
+        case MMUEXT_UNPIN_TABLE:
+            if ( paging_mode_refcounts(pg_owner) )
+                break;
+
+            page = get_page_from_gfn(pg_owner, op.arg1.mfn, NULL, P2M_ALLOC);
+            if ( unlikely(!page) )
+            {
+                gdprintk(XENLOG_WARNING,
+                         "mfn %" PRI_mfn " bad, or bad owner d%d\n",
+                         op.arg1.mfn, pg_owner->domain_id);
+                rc = -EINVAL;
+                break;
+            }
+
+            if ( !test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
+            {
+                put_page(page);
+                gdprintk(XENLOG_WARNING,
+                         "mfn %" PRI_mfn " not pinned\n", op.arg1.mfn);
+                rc = -EINVAL;
+                break;
+            }
+
+            switch ( rc = put_page_and_type_preemptible(page) )
+            {
+            case -EINTR:
+            case -ERESTART:
+                curr->arch.old_guest_ptpg = NULL;
+                curr->arch.old_guest_table = page;
+                rc = 0;
+                break;
+            default:
+                BUG_ON(rc);
+                break;
+            }
+            put_page(page);
+
+            /* A page is dirtied when its pin status is cleared. */
+            paging_mark_dirty(pg_owner, page_to_mfn(page));
+            break;
+
+        case MMUEXT_NEW_BASEPTR:
+            if ( unlikely(currd != pg_owner) )
+                rc = -EPERM;
+            else if ( unlikely(paging_mode_translate(currd)) )
+                rc = -EINVAL;
+            else
+                rc = new_guest_cr3(_mfn(op.arg1.mfn));
+            break;
+
+        case MMUEXT_NEW_USER_BASEPTR: {
+            unsigned long old_mfn;
+
+            if ( unlikely(currd != pg_owner) )
+                rc = -EPERM;
+            else if ( unlikely(paging_mode_translate(currd)) )
+                rc = -EINVAL;
+            if ( unlikely(rc) )
+                break;
+
+            old_mfn = pagetable_get_pfn(curr->arch.guest_table_user);
+            /*
+             * This is particularly important when getting restarted after the
+             * previous attempt got preempted in the put-old-MFN phase.
+             */
+            if ( old_mfn == op.arg1.mfn )
+                break;
+
+            if ( op.arg1.mfn != 0 )
+            {
+                rc = get_page_and_type_from_mfn(
+                    _mfn(op.arg1.mfn), PGT_root_page_table, currd, 0, 1);
+
+                if ( unlikely(rc) )
+                {
+                    if ( rc == -EINTR )
+                        rc = -ERESTART;
+                    else if ( rc != -ERESTART )
+                        gdprintk(XENLOG_WARNING,
+                                 "Error %d installing new mfn %" PRI_mfn "\n",
+                                 rc, op.arg1.mfn);
+                    break;
+                }
+
+                if ( VM_ASSIST(currd, m2p_strict) )
+                    zap_ro_mpt(_mfn(op.arg1.mfn));
+            }
+
+            curr->arch.guest_table_user = pagetable_from_pfn(op.arg1.mfn);
+
+            if ( old_mfn != 0 )
+            {
+                page = mfn_to_page(_mfn(old_mfn));
+
+                switch ( rc = put_page_and_type_preemptible(page) )
+                {
+                case -EINTR:
+                    rc = -ERESTART;
+                    /* fallthrough */
+                case -ERESTART:
+                    curr->arch.old_guest_ptpg = NULL;
+                    curr->arch.old_guest_table = page;
+                    break;
+                default:
+                    BUG_ON(rc);
+                    break;
+                }
+            }
+
+            break;
+        }
+
+        case MMUEXT_TLB_FLUSH_LOCAL:
+            if ( likely(currd == pg_owner) )
+                flush_tlb_local();
+            else
+                rc = -EPERM;
+            break;
+
+        case MMUEXT_INVLPG_LOCAL:
+            if ( unlikely(currd != pg_owner) )
+                rc = -EPERM;
+            else
+                paging_invlpg(curr, op.arg1.linear_addr);
+            break;
+
+        case MMUEXT_TLB_FLUSH_MULTI:
+        case MMUEXT_INVLPG_MULTI:
+        {
+            cpumask_t *mask = this_cpu(scratch_cpumask);
+
+            if ( unlikely(currd != pg_owner) )
+                rc = -EPERM;
+            else if ( unlikely(vcpumask_to_pcpumask(currd,
+                                   guest_handle_to_param(op.arg2.vcpumask,
+                                                         const_void),
+                                   mask)) )
+                rc = -EINVAL;
+            if ( unlikely(rc) )
+                break;
+
+            if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
+                flush_tlb_mask(mask);
+            else if ( __addr_ok(op.arg1.linear_addr) )
+                flush_tlb_one_mask(mask, op.arg1.linear_addr);
+            break;
+        }
+
+        case MMUEXT_TLB_FLUSH_ALL:
+            if ( likely(currd == pg_owner) )
+                flush_tlb_mask(currd->dirty_cpumask);
+            else
+                rc = -EPERM;
+            break;
+
+        case MMUEXT_INVLPG_ALL:
+            if ( unlikely(currd != pg_owner) )
+                rc = -EPERM;
+            else if ( __addr_ok(op.arg1.linear_addr) )
+                flush_tlb_one_mask(currd->dirty_cpumask, op.arg1.linear_addr);
+            break;
+
+        case MMUEXT_FLUSH_CACHE:
+            if ( unlikely(currd != pg_owner) )
+                rc = -EPERM;
+            else if ( unlikely(!cache_flush_permitted(currd)) )
+                rc = -EACCES;
+            else
+                wbinvd();
+            break;
+
+        case MMUEXT_FLUSH_CACHE_GLOBAL:
+            if ( unlikely(currd != pg_owner) )
+                rc = -EPERM;
+            else if ( likely(cache_flush_permitted(currd)) )
+            {
+                unsigned int cpu;
+                cpumask_t *mask = this_cpu(scratch_cpumask);
+
+                cpumask_clear(mask);
+                for_each_online_cpu(cpu)
+                    if ( !cpumask_intersects(mask,
+                                             per_cpu(cpu_sibling_mask, cpu)) )
+                        __cpumask_set_cpu(cpu, mask);
+                flush_mask(mask, FLUSH_CACHE);
+            }
+            else
+                rc = -EINVAL;
+            break;
+
+        case MMUEXT_SET_LDT:
+        {
+            unsigned int ents = op.arg2.nr_ents;
+            unsigned long ptr = ents ? op.arg1.linear_addr : 0;
+
+            if ( unlikely(currd != pg_owner) )
+                rc = -EPERM;
+            else if ( paging_mode_external(currd) )
+                rc = -EINVAL;
+            else if ( ((ptr & (PAGE_SIZE - 1)) != 0) || !__addr_ok(ptr) ||
+                      (ents > 8192) )
+            {
+                gdprintk(XENLOG_WARNING,
+                         "Bad args to SET_LDT: ptr=%lx, ents=%x\n", ptr, ents);
+                rc = -EINVAL;
+            }
+            else if ( (curr->arch.pv_vcpu.ldt_ents != ents) ||
+                      (curr->arch.pv_vcpu.ldt_base != ptr) )
+            {
+                if ( pv_destroy_ldt(curr) )
+                    flush_tlb_local();
+
+                curr->arch.pv_vcpu.ldt_base = ptr;
+                curr->arch.pv_vcpu.ldt_ents = ents;
+                load_LDT(curr);
+            }
+            break;
+        }
+
+        case MMUEXT_CLEAR_PAGE:
+            page = get_page_from_gfn(pg_owner, op.arg1.mfn, &p2mt, P2M_ALLOC);
+            if ( unlikely(p2mt != p2m_ram_rw) && page )
+            {
+                put_page(page);
+                page = NULL;
+            }
+            if ( !page || !get_page_type(page, PGT_writable_page) )
+            {
+                if ( page )
+                    put_page(page);
+                gdprintk(XENLOG_WARNING,
+                         "Error clearing mfn %" PRI_mfn "\n", op.arg1.mfn);
+                rc = -EINVAL;
+                break;
+            }
+
+            /* A page is dirtied when it's being cleared. */
+            paging_mark_dirty(pg_owner, page_to_mfn(page));
+
+            clear_domain_page(page_to_mfn(page));
+
+            put_page_and_type(page);
+            break;
+
+        case MMUEXT_COPY_PAGE:
+        {
+            struct page_info *src_page, *dst_page;
+
+            src_page = get_page_from_gfn(pg_owner, op.arg2.src_mfn, &p2mt,
+                                         P2M_ALLOC);
+            if ( unlikely(p2mt != p2m_ram_rw) && src_page )
+            {
+                put_page(src_page);
+                src_page = NULL;
+            }
+            if ( unlikely(!src_page) )
+            {
+                gdprintk(XENLOG_WARNING,
+                         "Error copying from mfn %" PRI_mfn "\n",
+                         op.arg2.src_mfn);
+                rc = -EINVAL;
+                break;
+            }
+
+            dst_page = get_page_from_gfn(pg_owner, op.arg1.mfn, &p2mt,
+                                         P2M_ALLOC);
+            if ( unlikely(p2mt != p2m_ram_rw) && dst_page )
+            {
+                put_page(dst_page);
+                dst_page = NULL;
+            }
+            rc = (dst_page &&
+                  get_page_type(dst_page, PGT_writable_page)) ? 0 : -EINVAL;
+            if ( unlikely(rc) )
+            {
+                put_page(src_page);
+                if ( dst_page )
+                    put_page(dst_page);
+                gdprintk(XENLOG_WARNING,
+                         "Error copying to mfn %" PRI_mfn "\n", op.arg1.mfn);
+                break;
+            }
+
+            /* A page is dirtied when it's being copied to. */
+            paging_mark_dirty(pg_owner, page_to_mfn(dst_page));
+
+            copy_domain_page(page_to_mfn(dst_page), page_to_mfn(src_page));
+
+            put_page_and_type(dst_page);
+            put_page(src_page);
+            break;
+        }
+
+        case MMUEXT_MARK_SUPER:
+        case MMUEXT_UNMARK_SUPER:
+            rc = -EOPNOTSUPP;
+            break;
+
+        default:
+            rc = -ENOSYS;
+            break;
+        }
+
+ done:
+        if ( unlikely(rc) )
+            break;
+
+        guest_handle_add_offset(uops, 1);
+    }
+
+    if ( rc == -ERESTART )
+    {
+        ASSERT(i < count);
+        rc = hypercall_create_continuation(
+            __HYPERVISOR_mmuext_op, "hihi",
+            uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+    }
+    else if ( curr->arch.old_guest_table )
+    {
+        XEN_GUEST_HANDLE_PARAM(void) null;
+
+        ASSERT(rc || i == count);
+        set_xen_guest_handle(null, NULL);
+        /*
+         * In order to have a way to communicate the final return value to
+         * our continuation, we pass this in place of "foreigndom", building
+         * on the fact that this argument isn't needed anymore.
+         */
+        rc = hypercall_create_continuation(
+                __HYPERVISOR_mmuext_op, "hihi", null,
+                MMU_UPDATE_PREEMPTED, null, rc);
+    }
+
+    put_pg_owner(pg_owner);
+
+    perfc_add(num_mmuext_ops, i);
+
+    /* Add incremental work we have done to the @done output parameter. */
+    if ( unlikely(!guest_handle_is_null(pdone)) )
+    {
+        done += i;
+        copy_to_guest(pdone, &done, 1);
+    }
+
+    return rc;
+}
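
Of the operations handled above, table pinning is the one a PV kernel
exercises most often.  A hedged sketch of the guest side (same assumed
wrapper as before):

    /* Sketch: pin an L1 table so Xen holds its type reference, or drop
     * that reference again before the page is freed. */
    static int pin_l1_table(unsigned long mfn, int pin)
    {
        struct mmuext_op op = {
            .cmd = pin ? MMUEXT_PIN_L1_TABLE : MMUEXT_UNPIN_TABLE,
            .arg1.mfn = mfn,
        };

        return HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);
    }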
+
+long do_mmu_update(
+    XEN_GUEST_HANDLE_PARAM(mmu_update_t) ureqs,
+    unsigned int count,
+    XEN_GUEST_HANDLE_PARAM(uint) pdone,
+    unsigned int foreigndom)
+{
+    struct mmu_update req;
+    void *va = NULL;
+    unsigned long gpfn, gmfn, mfn;
+    struct page_info *page;
+    unsigned int cmd, i = 0, done = 0, pt_dom;
+    struct vcpu *curr = current, *v = curr;
+    struct domain *d = v->domain, *pt_owner = d, *pg_owner;
+    mfn_t map_mfn = INVALID_MFN;
+    bool sync_guest = false;
+    uint32_t xsm_needed = 0;
+    uint32_t xsm_checked = 0;
+    int rc = put_old_guest_table(curr);
+
+    if ( unlikely(rc) )
+    {
+        if ( likely(rc == -ERESTART) )
+            rc = hypercall_create_continuation(
+                     __HYPERVISOR_mmu_update, "hihi", ureqs, count, pdone,
+                     foreigndom);
+        return rc;
+    }
+
+    if ( unlikely(count == MMU_UPDATE_PREEMPTED) &&
+         likely(guest_handle_is_null(ureqs)) )
+    {
+        /*
+         * See the curr->arch.old_guest_table related
+         * hypercall_create_continuation() below.
+         */
+        return (int)foreigndom;
+    }
+
+    if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
+    {
+        count &= ~MMU_UPDATE_PREEMPTED;
+        if ( unlikely(!guest_handle_is_null(pdone)) )
+            (void)copy_from_guest(&done, pdone, 1);
+    }
+    else
+        perfc_incr(calls_to_mmu_update);
+
+    if ( unlikely(!guest_handle_okay(ureqs, count)) )
+        return -EFAULT;
+
+    if ( (pt_dom = foreigndom >> 16) != 0 )
+    {
+        /* Pagetables belong to a foreign domain (PFD). */
+        if ( (pt_owner = rcu_lock_domain_by_id(pt_dom - 1)) == NULL )
+            return -ESRCH;
+
+        if ( pt_owner == d )
+            rcu_unlock_domain(pt_owner);
+        else if ( !pt_owner->vcpu || (v = pt_owner->vcpu[0]) == NULL )
+        {
+            rc = -EINVAL;
+            goto out;
+        }
+    }
+
+    if ( (pg_owner = get_pg_owner((uint16_t)foreigndom)) == NULL )
+    {
+        rc = -ESRCH;
+        goto out;
+    }
+
+    for ( i = 0; i < count; i++ )
+    {
+        if ( curr->arch.old_guest_table || (i && hypercall_preempt_check()) )
+        {
+            rc = -ERESTART;
+            break;
+        }
+
+        if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
+        {
+            rc = -EFAULT;
+            break;
+        }
+
+        cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
+
+        switch ( cmd )
+        {
+            /*
+             * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
+             * MMU_PT_UPDATE_PRESERVE_AD: As above but also preserve (OR)
+             * current A/D bits.
+             */
+        case MMU_NORMAL_PT_UPDATE:
+        case MMU_PT_UPDATE_PRESERVE_AD:
+        {
+            p2m_type_t p2mt;
+
+            rc = -EOPNOTSUPP;
+            if ( unlikely(paging_mode_refcounts(pt_owner)) )
+                break;
+
+            xsm_needed |= XSM_MMU_NORMAL_UPDATE;
+            if ( get_pte_flags(req.val) & _PAGE_PRESENT )
+            {
+                xsm_needed |= XSM_MMU_UPDATE_READ;
+                if ( get_pte_flags(req.val) & _PAGE_RW )
+                    xsm_needed |= XSM_MMU_UPDATE_WRITE;
+            }
+            if ( xsm_needed != xsm_checked )
+            {
+                rc = xsm_mmu_update(XSM_TARGET, d, pt_owner, pg_owner, xsm_needed);
+                if ( rc )
+                    break;
+                xsm_checked = xsm_needed;
+            }
+            rc = -EINVAL;
+
+            req.ptr -= cmd;
+            gmfn = req.ptr >> PAGE_SHIFT;
+            page = get_page_from_gfn(pt_owner, gmfn, &p2mt, P2M_ALLOC);
+
+            if ( unlikely(!page) || p2mt != p2m_ram_rw )
+            {
+                if ( page )
+                    put_page(page);
+                if ( p2m_is_paged(p2mt) )
+                {
+                    p2m_mem_paging_populate(pt_owner, gmfn);
+                    rc = -ENOENT;
+                }
+                else
+                    gdprintk(XENLOG_WARNING,
+                             "Could not get page for normal update\n");
+                break;
+            }
+
+            mfn = mfn_x(page_to_mfn(page));
+
+            if ( !mfn_eq(_mfn(mfn), map_mfn) )
+            {
+                if ( va )
+                    unmap_domain_page(va);
+                va = map_domain_page(_mfn(mfn));
+                map_mfn = _mfn(mfn);
+            }
+            va = _p(((unsigned long)va & PAGE_MASK) + (req.ptr & ~PAGE_MASK));
+
+            if ( page_lock(page) )
+            {
+                switch ( page->u.inuse.type_info & PGT_type_mask )
+                {
+                case PGT_l1_page_table:
+                    rc = mod_l1_entry(va, l1e_from_intpte(req.val), mfn,
+                                      cmd == MMU_PT_UPDATE_PRESERVE_AD, v,
+                                      pg_owner);
+                    break;
+
+                case PGT_l2_page_table:
+                    if ( unlikely(pg_owner != pt_owner) )
+                        break;
+                    rc = mod_l2_entry(va, l2e_from_intpte(req.val), mfn,
+                                      cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
+                    break;
+
+                case PGT_l3_page_table:
+                    if ( unlikely(pg_owner != pt_owner) )
+                        break;
+                    rc = mod_l3_entry(va, l3e_from_intpte(req.val), mfn,
+                                      cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
+                    break;
+
+                case PGT_l4_page_table:
+                    if ( unlikely(pg_owner != pt_owner) )
+                        break;
+                    rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn,
+                                      cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
+                    /*
+                     * No need to sync if all uses of the page can be accounted
+                     * to the page lock we hold, its pinned status, and uses on
+                     * this (v)CPU.
+                     */
+                    if ( !rc && this_cpu(root_pgt) &&
+                         ((page->u.inuse.type_info & PGT_count_mask) >
+                          (1 + !!(page->u.inuse.type_info & PGT_pinned) +
+                           (pagetable_get_pfn(curr->arch.guest_table) == mfn) +
+                           (pagetable_get_pfn(curr->arch.guest_table_user) ==
+                            mfn))) )
+                        sync_guest = true;
+                    break;
+
+                case PGT_writable_page:
+                    perfc_incr(writable_mmu_updates);
+                    if ( paging_write_guest_entry(v, va, req.val, _mfn(mfn)) )
+                        rc = 0;
+                    break;
+                }
+                page_unlock(page);
+                if ( rc == -EINTR )
+                    rc = -ERESTART;
+            }
+            else if ( get_page_type(page, PGT_writable_page) )
+            {
+                perfc_incr(writable_mmu_updates);
+                if ( paging_write_guest_entry(v, va, req.val, _mfn(mfn)) )
+                    rc = 0;
+                put_page_type(page);
+            }
+
+            put_page(page);
+        }
+        break;
+
+        case MMU_MACHPHYS_UPDATE:
+            if ( unlikely(d != pt_owner) )
+            {
+                rc = -EPERM;
+                break;
+            }
+
+            if ( unlikely(paging_mode_translate(pg_owner)) )
+            {
+                rc = -EINVAL;
+                break;
+            }
+
+            mfn = req.ptr >> PAGE_SHIFT;
+            gpfn = req.val;
+
+            xsm_needed |= XSM_MMU_MACHPHYS_UPDATE;
+            if ( xsm_needed != xsm_checked )
+            {
+                rc = xsm_mmu_update(XSM_TARGET, d, NULL, pg_owner, xsm_needed);
+                if ( rc )
+                    break;
+                xsm_checked = xsm_needed;
+            }
+
+            page = get_page_from_mfn(_mfn(mfn), pg_owner);
+            if ( unlikely(!page) )
+            {
+                gdprintk(XENLOG_WARNING,
+                         "Could not get page for mach->phys update\n");
+                rc = -EINVAL;
+                break;
+            }
+
+            set_gpfn_from_mfn(mfn, gpfn);
+
+            paging_mark_dirty(pg_owner, _mfn(mfn));
+
+            put_page(page);
+            break;
+
+        default:
+            rc = -ENOSYS;
+            break;
+        }
+
+        if ( unlikely(rc) )
+            break;
+
+        guest_handle_add_offset(ureqs, 1);
+    }
+
+    if ( rc == -ERESTART )
+    {
+        ASSERT(i < count);
+        rc = hypercall_create_continuation(
+            __HYPERVISOR_mmu_update, "hihi",
+            ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
+    }
+    else if ( curr->arch.old_guest_table )
+    {
+        XEN_GUEST_HANDLE_PARAM(void) null;
+
+        ASSERT(rc || i == count);
+        set_xen_guest_handle(null, NULL);
+        /*
+         * In order to have a way to communicate the final return value to
+         * our continuation, we pass this in place of "foreigndom", building
+         * on the fact that this argument isn't needed anymore.
+         */
+        rc = hypercall_create_continuation(
+                __HYPERVISOR_mmu_update, "hihi", null,
+                MMU_UPDATE_PREEMPTED, null, rc);
+    }
+
+    put_pg_owner(pg_owner);
+
+    if ( va )
+        unmap_domain_page(va);
+
+    if ( sync_guest )
+    {
+        /*
+         * Force other vCPUs of the affected guest to pick up L4 entry
+         * changes (if any). Issue a flush IPI with an empty operation mask
+         * to facilitate this (including ourselves waiting for the IPI to
+         * actually have arrived). Utilize the fact that FLUSH_VA_VALID is
+         * meaningless without FLUSH_CACHE, but still allows the call to
+         * pass the no-op check in flush_area_mask().
+         */
+        unsigned int cpu = smp_processor_id();
+        cpumask_t *mask = per_cpu(scratch_cpumask, cpu);
+
+        cpumask_andnot(mask, pt_owner->dirty_cpumask, cpumask_of(cpu));
+        if ( !cpumask_empty(mask) )
+            flush_area_mask(mask, ZERO_BLOCK_PTR, FLUSH_VA_VALID);
+    }
+
+    perfc_add(num_page_updates, i);
+
+ out:
+    if ( pt_owner != d )
+        rcu_unlock_domain(pt_owner);
+
+    /* Add incremental work we have done to the @done output parameter. */
+    if ( unlikely(!guest_handle_is_null(pdone)) )
+    {
+        done += i;
+        copy_to_guest(pdone, &done, 1);
+    }
+
+    return rc;
+}
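
For the normal-update path above, mmu_update.ptr carries the machine
address of the entry to modify, with the command encoded in its low bits
(MMU_NORMAL_PT_UPDATE is 0).  A guest-side sketch, with set_pte_entry()
being a hypothetical helper:

    /* Sketch: rewrite one page-table entry via MMU_NORMAL_PT_UPDATE. */
    static int set_pte_entry(uint64_t pte_maddr, uint64_t new_pte)
    {
        struct mmu_update u = {
            .ptr = pte_maddr | MMU_NORMAL_PT_UPDATE,
            .val = new_pte,
        };

        return HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
    }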
+
+static int __do_update_va_mapping(
+    unsigned long va, u64 val64, unsigned long flags, struct domain *pg_owner)
+{
+    l1_pgentry_t   val = l1e_from_intpte(val64);
+    struct vcpu   *v   = current;
+    struct domain *d   = v->domain;
+    struct page_info *gl1pg;
+    l1_pgentry_t  *pl1e;
+    unsigned long  bmap_ptr;
+    mfn_t          gl1mfn;
+    cpumask_t     *mask = NULL;
+    int            rc;
+
+    perfc_incr(calls_to_update_va);
+
+    rc = xsm_update_va_mapping(XSM_TARGET, d, pg_owner, val);
+    if ( rc )
+        return rc;
+
+    rc = -EINVAL;
+    pl1e = map_guest_l1e(va, &gl1mfn);
+    gl1pg = pl1e ? get_page_from_mfn(gl1mfn, d) : NULL;
+    if ( unlikely(!gl1pg) )
+        goto out;
+
+    if ( !page_lock(gl1pg) )
+    {
+        put_page(gl1pg);
+        goto out;
+    }
+
+    if ( (gl1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
+    {
+        page_unlock(gl1pg);
+        put_page(gl1pg);
+        goto out;
+    }
+
+    rc = mod_l1_entry(pl1e, val, mfn_x(gl1mfn), 0, v, pg_owner);
+
+    page_unlock(gl1pg);
+    put_page(gl1pg);
+
+ out:
+    if ( pl1e )
+        unmap_domain_page(pl1e);
+
+    switch ( flags & UVMF_FLUSHTYPE_MASK )
+    {
+    case UVMF_TLB_FLUSH:
+        switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
+        {
+        case UVMF_LOCAL:
+            flush_tlb_local();
+            break;
+        case UVMF_ALL:
+            mask = d->dirty_cpumask;
+            break;
+        default:
+            mask = this_cpu(scratch_cpumask);
+            rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
+                                                                     void),
+                                      mask);
+            break;
+        }
+        if ( mask )
+            flush_tlb_mask(mask);
+        break;
+
+    case UVMF_INVLPG:
+        switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
+        {
+        case UVMF_LOCAL:
+            paging_invlpg(v, va);
+            break;
+        case UVMF_ALL:
+            mask = d->dirty_cpumask;
+            break;
+        default:
+            mask = this_cpu(scratch_cpumask);
+            rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
+                                                                     void),
+                                      mask);
+            break;
+        }
+        if ( mask )
+            flush_tlb_one_mask(mask, va);
+        break;
+    }
+
+    return rc;
+}
+
+long do_update_va_mapping(unsigned long va, u64 val64,
+                          unsigned long flags)
+{
+    return __do_update_va_mapping(va, val64, flags, current->domain);
+}
+
+long do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
+                                      unsigned long flags,
+                                      domid_t domid)
+{
+    struct domain *pg_owner;
+    int rc;
+
+    if ( (pg_owner = get_pg_owner(domid)) == NULL )
+        return -ESRCH;
+
+    rc = __do_update_va_mapping(va, val64, flags, pg_owner);
+
+    put_pg_owner(pg_owner);
+
+    return rc;
+}
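
The matching guest-side call for this single-entry fast path is the
update_va_mapping hypercall itself; a sketch (assuming the usual
Linux-style __pte() helper and hypercall wrapper):

    /* Sketch: remap @va and flush only the local TLB entry for it. */
    rc = HYPERVISOR_update_va_mapping(va, __pte(new_pte_val),
                                      UVMF_INVLPG | UVMF_LOCAL);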
+
 /*
  * Local variables:
  * mode: C
-- 
2.11.0

