[PATCH v2 37/40] xen/mpu: move MMU specific P2M code to p2m_mmu.c



The current P2M implementation is designed for MMU systems. Only a few
parts can be shared with MPU systems, like the P2M pool, IPA handling,
etc.
We move the MMU-specific code into p2m_mmu.c, and place stub functions
in p2m_mpu.c that will be implemented when they are first needed. We
keep the generic code in p2m.c.

We also move MMU-specific definitions, like P2M_ROOT_LEVEL and the
function p2m_tlb_flush_sync, to p2m_mmu.h.
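
As an illustration only (a sketch, not code taken from this patch), a
stub in p2m_mpu.c could look roughly like:

mfn_t p2m_lookup(struct domain *d, gfn_t gfn, p2m_type_t *t)
{
    /* Not implemented for MPU systems yet; fail loudly if reached. */
    ASSERT_UNREACHABLE();
    return INVALID_MFN;
}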

Signed-off-by: Penny Zheng <penny.zheng@xxxxxxx>
Signed-off-by: Wei Chen <wei.chen@xxxxxxx>
---
 xen/arch/arm/Makefile              |    5 +
 xen/arch/arm/include/asm/p2m.h     |   17 +-
 xen/arch/arm/include/asm/p2m_mmu.h |   28 +
 xen/arch/arm/p2m.c                 | 2276 +--------------------------
 xen/arch/arm/p2m_mmu.c             | 2295 ++++++++++++++++++++++++++++
 xen/arch/arm/p2m_mpu.c             |  191 +++
 6 files changed, 2528 insertions(+), 2284 deletions(-)
 create mode 100644 xen/arch/arm/include/asm/p2m_mmu.h
 create mode 100644 xen/arch/arm/p2m_mmu.c
 create mode 100644 xen/arch/arm/p2m_mpu.c

diff --git a/xen/arch/arm/Makefile b/xen/arch/arm/Makefile
index c949661590..ea650db52b 100644
--- a/xen/arch/arm/Makefile
+++ b/xen/arch/arm/Makefile
@@ -44,6 +44,11 @@ obj-y += mm_mpu.o
 endif
 obj-y += monitor.o
 obj-y += p2m.o
+ifneq ($(CONFIG_HAS_MPU), y)
+obj-y += p2m_mmu.o
+else
+obj-y += p2m_mpu.o
+endif
 obj-y += percpu.o
 obj-y += platform.o
 obj-y += platform_hypercall.o
diff --git a/xen/arch/arm/include/asm/p2m.h b/xen/arch/arm/include/asm/p2m.h
index 91df922e1c..a430aca232 100644
--- a/xen/arch/arm/include/asm/p2m.h
+++ b/xen/arch/arm/include/asm/p2m.h
@@ -14,17 +14,6 @@
 /* Holds the bit size of IPAs in p2m tables.  */
 extern unsigned int p2m_ipa_bits;
 
-#ifdef CONFIG_ARM_64
-extern unsigned int p2m_root_order;
-extern unsigned int p2m_root_level;
-#define P2M_ROOT_ORDER    p2m_root_order
-#define P2M_ROOT_LEVEL p2m_root_level
-#else
-/* First level P2M is always 2 consecutive pages */
-#define P2M_ROOT_ORDER    1
-#define P2M_ROOT_LEVEL 1
-#endif
-
 struct domain;
 
 extern void memory_type_changed(struct domain *);
@@ -162,6 +151,10 @@ typedef enum {
 #endif
 #include <xen/p2m-common.h>
 
+#ifndef CONFIG_HAS_MPU
+#include <asm/p2m_mmu.h>
+#endif
+
 static inline bool arch_acquire_resource_check(struct domain *d)
 {
     /*
@@ -252,8 +245,6 @@ static inline int p2m_is_write_locked(struct p2m_domain *p2m)
     return rw_is_write_locked(&p2m->lock);
 }
 
-void p2m_tlb_flush_sync(struct p2m_domain *p2m);
-
 /* Look up the MFN corresponding to a domain's GFN. */
 mfn_t p2m_lookup(struct domain *d, gfn_t gfn, p2m_type_t *t);
 
diff --git a/xen/arch/arm/include/asm/p2m_mmu.h b/xen/arch/arm/include/asm/p2m_mmu.h
new file mode 100644
index 0000000000..a0f2440336
--- /dev/null
+++ b/xen/arch/arm/include/asm/p2m_mmu.h
@@ -0,0 +1,28 @@
+#ifndef _XEN_P2M_MMU_H
+#define _XEN_P2M_MMU_H
+
+#ifdef CONFIG_ARM_64
+extern unsigned int p2m_root_order;
+extern unsigned int p2m_root_level;
+#define P2M_ROOT_ORDER    p2m_root_order
+#define P2M_ROOT_LEVEL p2m_root_level
+#else
+/* First level P2M is always 2 consecutive pages */
+#define P2M_ROOT_ORDER    1
+#define P2M_ROOT_LEVEL 1
+#endif
+
+struct p2m_domain;
+
+void p2m_tlb_flush_sync(struct p2m_domain *p2m);
+
+#endif /* _XEN_P2M_MMU_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index 948f199d84..42f51051e0 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -1,36 +1,9 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#include <xen/cpu.h>
-#include <xen/domain_page.h>
-#include <xen/iocap.h>
-#include <xen/ioreq.h>
 #include <xen/lib.h>
 #include <xen/sched.h>
-#include <xen/softirq.h>
 
-#include <asm/alternative.h>
 #include <asm/event.h>
-#include <asm/flushtlb.h>
-#include <asm/guest_walk.h>
 #include <asm/page.h>
-#include <asm/traps.h>
-
-#define MAX_VMID_8_BIT  (1UL << 8)
-#define MAX_VMID_16_BIT (1UL << 16)
-
-#define INVALID_VMID 0 /* VMID 0 is reserved */
-
-#ifdef CONFIG_ARM_64
-unsigned int __read_mostly p2m_root_order;
-unsigned int __read_mostly p2m_root_level;
-static unsigned int __read_mostly max_vmid = MAX_VMID_8_BIT;
-/* VMID is by default 8 bit width on AArch64 */
-#define MAX_VMID       max_vmid
-#else
-/* VMID is always 8 bit width on AArch32 */
-#define MAX_VMID        MAX_VMID_8_BIT
-#endif
-
-#define P2M_ROOT_PAGES    (1<<P2M_ROOT_ORDER)
 
 /*
  * Set to the maximum configured support for IPA bits, so the number of IPA bits can be
@@ -38,50 +11,6 @@ static unsigned int __read_mostly max_vmid = MAX_VMID_8_BIT;
  */
 unsigned int __read_mostly p2m_ipa_bits = PADDR_BITS;
 
-static mfn_t __read_mostly empty_root_mfn;
-
-static uint64_t generate_vttbr(uint16_t vmid, mfn_t root_mfn)
-{
-    return (mfn_to_maddr(root_mfn) | ((uint64_t)vmid << 48));
-}
-
-static struct page_info *p2m_alloc_page(struct domain *d)
-{
-    struct page_info *pg;
-
-    /*
-     * For hardware domain, there should be no limit in the number of pages that
-     * can be allocated, so that the kernel may take advantage of the extended
-     * regions. Hence, allocate p2m pages for hardware domains from heap.
-     */
-    if ( is_hardware_domain(d) )
-    {
-        pg = alloc_domheap_page(NULL, 0);
-        if ( pg == NULL )
-            printk(XENLOG_G_ERR "Failed to allocate P2M pages for hwdom.\n");
-    }
-    else
-    {
-        spin_lock(&d->arch.paging.lock);
-        pg = page_list_remove_head(&d->arch.paging.p2m_freelist);
-        spin_unlock(&d->arch.paging.lock);
-    }
-
-    return pg;
-}
-
-static void p2m_free_page(struct domain *d, struct page_info *pg)
-{
-    if ( is_hardware_domain(d) )
-        free_domheap_page(pg);
-    else
-    {
-        spin_lock(&d->arch.paging.lock);
-        page_list_add_tail(pg, &d->arch.paging.p2m_freelist);
-        spin_unlock(&d->arch.paging.lock);
-    }
-}
-
 /* Return the size of the pool, in bytes. */
 int arch_get_paging_mempool_size(struct domain *d, uint64_t *size)
 {
@@ -186,441 +115,10 @@ int p2m_teardown_allocation(struct domain *d)
     return ret;
 }
 
-/* Unlock the flush and do a P2M TLB flush if necessary */
-void p2m_write_unlock(struct p2m_domain *p2m)
-{
-    /*
-     * The final flush is done with the P2M write lock taken to avoid
-     * someone else modifying the P2M wbefore the TLB invalidation has
-     * completed.
-     */
-    p2m_tlb_flush_sync(p2m);
-
-    write_unlock(&p2m->lock);
-}
-
-void p2m_dump_info(struct domain *d)
-{
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-
-    p2m_read_lock(p2m);
-    printk("p2m mappings for domain %d (vmid %d):\n",
-           d->domain_id, p2m->vmid);
-    BUG_ON(p2m->stats.mappings[0] || p2m->stats.shattered[0]);
-    printk("  1G mappings: %ld (shattered %ld)\n",
-           p2m->stats.mappings[1], p2m->stats.shattered[1]);
-    printk("  2M mappings: %ld (shattered %ld)\n",
-           p2m->stats.mappings[2], p2m->stats.shattered[2]);
-    printk("  4K mappings: %ld\n", p2m->stats.mappings[3]);
-    p2m_read_unlock(p2m);
-}
-
 void memory_type_changed(struct domain *d)
 {
 }
 
-void dump_p2m_lookup(struct domain *d, paddr_t addr)
-{
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-
-    printk("dom%d IPA 0x%"PRIpaddr"\n", d->domain_id, addr);
-
-    printk("P2M @ %p mfn:%#"PRI_mfn"\n",
-           p2m->root, mfn_x(page_to_mfn(p2m->root)));
-
-    dump_pt_walk(page_to_maddr(p2m->root), addr,
-                 P2M_ROOT_LEVEL, P2M_ROOT_PAGES);
-}
-
-/*
- * p2m_save_state and p2m_restore_state work in pair to workaround
- * ARM64_WORKAROUND_AT_SPECULATE. p2m_save_state will set-up VTTBR to
- * point to the empty page-tables to stop allocating TLB entries.
- */
-void p2m_save_state(struct vcpu *p)
-{
-    p->arch.sctlr = READ_SYSREG(SCTLR_EL1);
-
-    if ( cpus_have_const_cap(ARM64_WORKAROUND_AT_SPECULATE) )
-    {
-        WRITE_SYSREG64(generate_vttbr(INVALID_VMID, empty_root_mfn), VTTBR_EL2);
-        /*
-         * Ensure VTTBR_EL2 is correctly synchronized so we can restore
-         * the next vCPU context without worrying about AT instruction
-         * speculation.
-         */
-        isb();
-    }
-}
-
-void p2m_restore_state(struct vcpu *n)
-{
-    struct p2m_domain *p2m = p2m_get_hostp2m(n->domain);
-    uint8_t *last_vcpu_ran;
-
-    if ( is_idle_vcpu(n) )
-        return;
-
-    WRITE_SYSREG(n->arch.sctlr, SCTLR_EL1);
-    WRITE_SYSREG(n->arch.hcr_el2, HCR_EL2);
-
-    /*
-     * ARM64_WORKAROUND_AT_SPECULATE: VTTBR_EL2 should be restored after all
-     * registers associated to EL1/EL0 translations regime have been
-     * synchronized.
-     */
-    asm volatile(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_AT_SPECULATE));
-    WRITE_SYSREG64(p2m->vttbr, VTTBR_EL2);
-
-    last_vcpu_ran = &p2m->last_vcpu_ran[smp_processor_id()];
-
-    /*
-     * While we are restoring an out-of-context translation regime
-     * we still need to ensure:
-     *  - VTTBR_EL2 is synchronized before flushing the TLBs
-     *  - All registers for EL1 are synchronized before executing an AT
-     *  instructions targeting S1/S2.
-     */
-    isb();
-
-    /*
-     * Flush local TLB for the domain to prevent wrong TLB translation
-     * when running multiple vCPU of the same domain on a single pCPU.
-     */
-    if ( *last_vcpu_ran != INVALID_VCPU_ID && *last_vcpu_ran != n->vcpu_id )
-        flush_guest_tlb_local();
-
-    *last_vcpu_ran = n->vcpu_id;
-}
-
-/*
- * Force a synchronous P2M TLB flush.
- *
- * Must be called with the p2m lock held.
- */
-static void p2m_force_tlb_flush_sync(struct p2m_domain *p2m)
-{
-    unsigned long flags = 0;
-    uint64_t ovttbr;
-
-    ASSERT(p2m_is_write_locked(p2m));
-
-    /*
-     * ARM only provides an instruction to flush TLBs for the current
-     * VMID. So switch to the VTTBR of a given P2M if different.
-     */
-    ovttbr = READ_SYSREG64(VTTBR_EL2);
-    if ( ovttbr != p2m->vttbr )
-    {
-        uint64_t vttbr;
-
-        local_irq_save(flags);
-
-        /*
-         * ARM64_WORKAROUND_AT_SPECULATE: We need to stop AT to allocate
-         * TLBs entries because the context is partially modified. We
-         * only need the VMID for flushing the TLBs, so we can generate
-         * a new VTTBR with the VMID to flush and the empty root table.
-         */
-        if ( !cpus_have_const_cap(ARM64_WORKAROUND_AT_SPECULATE) )
-            vttbr = p2m->vttbr;
-        else
-            vttbr = generate_vttbr(p2m->vmid, empty_root_mfn);
-
-        WRITE_SYSREG64(vttbr, VTTBR_EL2);
-
-        /* Ensure VTTBR_EL2 is synchronized before flushing the TLBs */
-        isb();
-    }
-
-    flush_guest_tlb();
-
-    if ( ovttbr != READ_SYSREG64(VTTBR_EL2) )
-    {
-        WRITE_SYSREG64(ovttbr, VTTBR_EL2);
-        /* Ensure VTTBR_EL2 is back in place before continuing. */
-        isb();
-        local_irq_restore(flags);
-    }
-
-    p2m->need_flush = false;
-}
-
-void p2m_tlb_flush_sync(struct p2m_domain *p2m)
-{
-    if ( p2m->need_flush )
-        p2m_force_tlb_flush_sync(p2m);
-}
-
-/*
- * Find and map the root page table. The caller is responsible for
- * unmapping the table.
- *
- * The function will return NULL if the offset of the root table is
- * invalid.
- */
-static lpae_t *p2m_get_root_pointer(struct p2m_domain *p2m,
-                                    gfn_t gfn)
-{
-    unsigned long root_table;
-
-    /*
-     * While the root table index is the offset from the previous level,
-     * we can't use (P2M_ROOT_LEVEL - 1) because the root level might be
-     * 0. Yet we still want to check if all the unused bits are zeroed.
-     */
-    root_table = gfn_x(gfn) >> (XEN_PT_LEVEL_ORDER(P2M_ROOT_LEVEL) +
-                                XEN_PT_LPAE_SHIFT);
-    if ( root_table >= P2M_ROOT_PAGES )
-        return NULL;
-
-    return __map_domain_page(p2m->root + root_table);
-}
-
-/*
- * Lookup the MFN corresponding to a domain's GFN.
- * Lookup mem access in the ratrix tree.
- * The entries associated to the GFN is considered valid.
- */
-static p2m_access_t p2m_mem_access_radix_get(struct p2m_domain *p2m, gfn_t gfn)
-{
-    void *ptr;
-
-    if ( !p2m->mem_access_enabled )
-        return p2m->default_access;
-
-    ptr = radix_tree_lookup(&p2m->mem_access_settings, gfn_x(gfn));
-    if ( !ptr )
-        return p2m_access_rwx;
-    else
-        return radix_tree_ptr_to_int(ptr);
-}
-
-/*
- * In the case of the P2M, the valid bit is used for other purpose. Use
- * the type to check whether an entry is valid.
- */
-static inline bool p2m_is_valid(lpae_t pte)
-{
-    return pte.p2m.type != p2m_invalid;
-}
-
-/*
- * lpae_is_* helpers don't check whether the valid bit is set in the
- * PTE. Provide our own overlay to check the valid bit.
- */
-static inline bool p2m_is_mapping(lpae_t pte, unsigned int level)
-{
-    return p2m_is_valid(pte) && lpae_is_mapping(pte, level);
-}
-
-static inline bool p2m_is_superpage(lpae_t pte, unsigned int level)
-{
-    return p2m_is_valid(pte) && lpae_is_superpage(pte, level);
-}
-
-#define GUEST_TABLE_MAP_FAILED 0
-#define GUEST_TABLE_SUPER_PAGE 1
-#define GUEST_TABLE_NORMAL_PAGE 2
-
-static int p2m_create_table(struct p2m_domain *p2m, lpae_t *entry);
-
-/*
- * Take the currently mapped table, find the corresponding GFN entry,
- * and map the next table, if available. The previous table will be
- * unmapped if the next level was mapped (e.g GUEST_TABLE_NORMAL_PAGE
- * returned).
- *
- * The read_only parameters indicates whether intermediate tables should
- * be allocated when not present.
- *
- * Return values:
- *  GUEST_TABLE_MAP_FAILED: Either read_only was set and the entry
- *  was empty, or allocating a new page failed.
- *  GUEST_TABLE_NORMAL_PAGE: next level mapped normally
- *  GUEST_TABLE_SUPER_PAGE: The next entry points to a superpage.
- */
-static int p2m_next_level(struct p2m_domain *p2m, bool read_only,
-                          unsigned int level, lpae_t **table,
-                          unsigned int offset)
-{
-    lpae_t *entry;
-    int ret;
-    mfn_t mfn;
-
-    entry = *table + offset;
-
-    if ( !p2m_is_valid(*entry) )
-    {
-        if ( read_only )
-            return GUEST_TABLE_MAP_FAILED;
-
-        ret = p2m_create_table(p2m, entry);
-        if ( ret )
-            return GUEST_TABLE_MAP_FAILED;
-    }
-
-    /* The function p2m_next_level is never called at the 3rd level */
-    ASSERT(level < 3);
-    if ( p2m_is_mapping(*entry, level) )
-        return GUEST_TABLE_SUPER_PAGE;
-
-    mfn = lpae_get_mfn(*entry);
-
-    unmap_domain_page(*table);
-    *table = map_domain_page(mfn);
-
-    return GUEST_TABLE_NORMAL_PAGE;
-}
-
-/*
- * Get the details of a given gfn.
- *
- * If the entry is present, the associated MFN will be returned and the
- * access and type filled up. The page_order will correspond to the
- * order of the mapping in the page table (i.e it could be a superpage).
- *
- * If the entry is not present, INVALID_MFN will be returned and the
- * page_order will be set according to the order of the invalid range.
- *
- * valid will contain the value of bit[0] (e.g valid bit) of the
- * entry.
- */
-mfn_t p2m_get_entry(struct p2m_domain *p2m, gfn_t gfn,
-                    p2m_type_t *t, p2m_access_t *a,
-                    unsigned int *page_order,
-                    bool *valid)
-{
-    paddr_t addr = gfn_to_gaddr(gfn);
-    unsigned int level = 0;
-    lpae_t entry, *table;
-    int rc;
-    mfn_t mfn = INVALID_MFN;
-    p2m_type_t _t;
-    DECLARE_OFFSETS(offsets, addr);
-
-    ASSERT(p2m_is_locked(p2m));
-    BUILD_BUG_ON(THIRD_MASK != PAGE_MASK);
-
-    /* Allow t to be NULL */
-    t = t ?: &_t;
-
-    *t = p2m_invalid;
-
-    if ( valid )
-        *valid = false;
-
-    /* XXX: Check if the mapping is lower than the mapped gfn */
-
-    /* This gfn is higher than the highest the p2m map currently holds */
-    if ( gfn_x(gfn) > gfn_x(p2m->max_mapped_gfn) )
-    {
-        for ( level = P2M_ROOT_LEVEL; level < 3; level++ )
-            if ( (gfn_x(gfn) & (XEN_PT_LEVEL_MASK(level) >> PAGE_SHIFT)) >
-                 gfn_x(p2m->max_mapped_gfn) )
-                break;
-
-        goto out;
-    }
-
-    table = p2m_get_root_pointer(p2m, gfn);
-
-    /*
-     * the table should always be non-NULL because the gfn is below
-     * p2m->max_mapped_gfn and the root table pages are always present.
-     */
-    if ( !table )
-    {
-        ASSERT_UNREACHABLE();
-        level = P2M_ROOT_LEVEL;
-        goto out;
-    }
-
-    for ( level = P2M_ROOT_LEVEL; level < 3; level++ )
-    {
-        rc = p2m_next_level(p2m, true, level, &table, offsets[level]);
-        if ( rc == GUEST_TABLE_MAP_FAILED )
-            goto out_unmap;
-        else if ( rc != GUEST_TABLE_NORMAL_PAGE )
-            break;
-    }
-
-    entry = table[offsets[level]];
-
-    if ( p2m_is_valid(entry) )
-    {
-        *t = entry.p2m.type;
-
-        if ( a )
-            *a = p2m_mem_access_radix_get(p2m, gfn);
-
-        mfn = lpae_get_mfn(entry);
-        /*
-         * The entry may point to a superpage. Find the MFN associated
-         * to the GFN.
-         */
-        mfn = mfn_add(mfn,
-                      gfn_x(gfn) & ((1UL << XEN_PT_LEVEL_ORDER(level)) - 1));
-
-        if ( valid )
-            *valid = lpae_is_valid(entry);
-    }
-
-out_unmap:
-    unmap_domain_page(table);
-
-out:
-    if ( page_order )
-        *page_order = XEN_PT_LEVEL_ORDER(level);
-
-    return mfn;
-}
-
-mfn_t p2m_lookup(struct domain *d, gfn_t gfn, p2m_type_t *t)
-{
-    mfn_t mfn;
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-
-    p2m_read_lock(p2m);
-    mfn = p2m_get_entry(p2m, gfn, t, NULL, NULL, NULL);
-    p2m_read_unlock(p2m);
-
-    return mfn;
-}
-
-struct page_info *p2m_get_page_from_gfn(struct domain *d, gfn_t gfn,
-                                        p2m_type_t *t)
-{
-    struct page_info *page;
-    p2m_type_t p2mt;
-    mfn_t mfn = p2m_lookup(d, gfn, &p2mt);
-
-    if ( t )
-        *t = p2mt;
-
-    if ( !p2m_is_any_ram(p2mt) )
-        return NULL;
-
-    if ( !mfn_valid(mfn) )
-        return NULL;
-
-    page = mfn_to_page(mfn);
-
-    /*
-     * get_page won't work on foreign mapping because the page doesn't
-     * belong to the current domain.
-     */
-    if ( p2m_is_foreign(p2mt) )
-    {
-        struct domain *fdom = page_get_owner_and_reference(page);
-        ASSERT(fdom != NULL);
-        ASSERT(fdom != d);
-        return page;
-    }
-
-    return get_page(page, d) ? page : NULL;
-}
-
 int guest_physmap_mark_populate_on_demand(struct domain *d,
                                           unsigned long gfn,
                                           unsigned int order)
@@ -634,1780 +132,16 @@ unsigned long p2m_pod_decrease_reservation(struct domain *d, gfn_t gfn,
     return 0;
 }
 
-static void p2m_set_permission(lpae_t *e, p2m_type_t t, p2m_access_t a)
-{
-    /* First apply type permissions */
-    switch ( t )
-    {
-    case p2m_ram_rw:
-        e->p2m.xn = 0;
-        e->p2m.write = 1;
-        break;
-
-    case p2m_ram_ro:
-        e->p2m.xn = 0;
-        e->p2m.write = 0;
-        break;
-
-    case p2m_iommu_map_rw:
-    case p2m_map_foreign_rw:
-    case p2m_grant_map_rw:
-    case p2m_mmio_direct_dev:
-    case p2m_mmio_direct_nc:
-    case p2m_mmio_direct_c:
-        e->p2m.xn = 1;
-        e->p2m.write = 1;
-        break;
-
-    case p2m_iommu_map_ro:
-    case p2m_map_foreign_ro:
-    case p2m_grant_map_ro:
-    case p2m_invalid:
-        e->p2m.xn = 1;
-        e->p2m.write = 0;
-        break;
-
-    case p2m_max_real_type:
-        BUG();
-        break;
-    }
-
-    /* Then restrict with access permissions */
-    switch ( a )
-    {
-    case p2m_access_rwx:
-        break;
-    case p2m_access_wx:
-        e->p2m.read = 0;
-        break;
-    case p2m_access_rw:
-        e->p2m.xn = 1;
-        break;
-    case p2m_access_w:
-        e->p2m.read = 0;
-        e->p2m.xn = 1;
-        break;
-    case p2m_access_rx:
-    case p2m_access_rx2rw:
-        e->p2m.write = 0;
-        break;
-    case p2m_access_x:
-        e->p2m.write = 0;
-        e->p2m.read = 0;
-        break;
-    case p2m_access_r:
-        e->p2m.write = 0;
-        e->p2m.xn = 1;
-        break;
-    case p2m_access_n:
-    case p2m_access_n2rwx:
-        e->p2m.read = e->p2m.write = 0;
-        e->p2m.xn = 1;
-        break;
-    }
-}
-
-static lpae_t mfn_to_p2m_entry(mfn_t mfn, p2m_type_t t, p2m_access_t a)
-{
-    /*
-     * sh, xn and write bit will be defined in the following switches
-     * based on mattr and t.
-     */
-    lpae_t e = (lpae_t) {
-        .p2m.af = 1,
-        .p2m.read = 1,
-        .p2m.table = 1,
-        .p2m.valid = 1,
-        .p2m.type = t,
-    };
-
-    BUILD_BUG_ON(p2m_max_real_type > (1 << 4));
-
-    switch ( t )
-    {
-    case p2m_mmio_direct_dev:
-        e.p2m.mattr = MATTR_DEV;
-        e.p2m.sh = LPAE_SH_OUTER;
-        break;
-
-    case p2m_mmio_direct_c:
-        e.p2m.mattr = MATTR_MEM;
-        e.p2m.sh = LPAE_SH_OUTER;
-        break;
-
-    /*
-     * ARM ARM: Overlaying the shareability attribute (DDI
-     * 0406C.b B3-1376 to 1377)
-     *
-     * A memory region with a resultant memory type attribute of Normal,
-     * and a resultant cacheability attribute of Inner Non-cacheable,
-     * Outer Non-cacheable, must have a resultant shareability attribute
-     * of Outer Shareable, otherwise shareability is UNPREDICTABLE.
-     *
-     * On ARMv8 shareability is ignored and explicitly treated as Outer
-     * Shareable for Normal Inner Non_cacheable, Outer Non-cacheable.
-     * See the note for table D4-40, in page 1788 of the ARM DDI 0487A.j.
-     */
-    case p2m_mmio_direct_nc:
-        e.p2m.mattr = MATTR_MEM_NC;
-        e.p2m.sh = LPAE_SH_OUTER;
-        break;
-
-    default:
-        e.p2m.mattr = MATTR_MEM;
-        e.p2m.sh = LPAE_SH_INNER;
-    }
-
-    p2m_set_permission(&e, t, a);
-
-    ASSERT(!(mfn_to_maddr(mfn) & ~PADDR_MASK));
-
-    lpae_set_mfn(e, mfn);
-
-    return e;
-}
-
-/* Generate table entry with correct attributes. */
-static lpae_t page_to_p2m_table(struct page_info *page)
+void __init p2m_restrict_ipa_bits(unsigned int ipa_bits)
 {
     /*
-     * The access value does not matter because the hardware will ignore
-     * the permission fields for table entry.
-     *
-     * We use p2m_ram_rw so the entry has a valid type. This is important
-     * for p2m_is_valid() to return valid on table entries.
+     * Calculate the minimum of the maximum IPA bits that any external entity
+     * can support.
      */
-    return mfn_to_p2m_entry(page_to_mfn(page), p2m_ram_rw, p2m_access_rwx);
-}
-
-static inline void p2m_write_pte(lpae_t *p, lpae_t pte, bool clean_pte)
-{
-    write_pte(p, pte);
-    if ( clean_pte )
-        clean_dcache(*p);
-}
-
-static inline void p2m_remove_pte(lpae_t *p, bool clean_pte)
-{
-    lpae_t pte;
-
-    memset(&pte, 0x00, sizeof(pte));
-    p2m_write_pte(p, pte, clean_pte);
-}
-
-/* Allocate a new page table page and hook it in via the given entry. */
-static int p2m_create_table(struct p2m_domain *p2m, lpae_t *entry)
-{
-    struct page_info *page;
-    lpae_t *p;
-
-    ASSERT(!p2m_is_valid(*entry));
-
-    page = p2m_alloc_page(p2m->domain);
-    if ( page == NULL )
-        return -ENOMEM;
-
-    page_list_add(page, &p2m->pages);
-
-    p = __map_domain_page(page);
-    clear_page(p);
-
-    if ( p2m->clean_pte )
-        clean_dcache_va_range(p, PAGE_SIZE);
-
-    unmap_domain_page(p);
-
-    p2m_write_pte(entry, page_to_p2m_table(page), p2m->clean_pte);
-
-    return 0;
-}
-
-static int p2m_mem_access_radix_set(struct p2m_domain *p2m, gfn_t gfn,
-                                    p2m_access_t a)
-{
-    int rc;
-
-    if ( !p2m->mem_access_enabled )
-        return 0;
-
-    if ( p2m_access_rwx == a )
-    {
-        radix_tree_delete(&p2m->mem_access_settings, gfn_x(gfn));
-        return 0;
-    }
-
-    rc = radix_tree_insert(&p2m->mem_access_settings, gfn_x(gfn),
-                           radix_tree_int_to_ptr(a));
-    if ( rc == -EEXIST )
-    {
-        /* If a setting already exists, change it to the new one */
-        radix_tree_replace_slot(
-            radix_tree_lookup_slot(
-                &p2m->mem_access_settings, gfn_x(gfn)),
-            radix_tree_int_to_ptr(a));
-        rc = 0;
-    }
-
-    return rc;
+    if ( ipa_bits < p2m_ipa_bits )
+        p2m_ipa_bits = ipa_bits;
 }
 
-/*
- * Put any references on the single 4K page referenced by pte.
- * TODO: Handle superpages, for now we only take special references for leaf
- * pages (specifically foreign ones, which can't be super mapped today).
- */
-static void p2m_put_l3_page(const lpae_t pte)
-{
-    mfn_t mfn = lpae_get_mfn(pte);
-
-    ASSERT(p2m_is_valid(pte));
-
-    /*
-     * TODO: Handle other p2m types
-     *
-     * It's safe to do the put_page here because page_alloc will
-     * flush the TLBs if the page is reallocated before the end of
-     * this loop.
-     */
-    if ( p2m_is_foreign(pte.p2m.type) )
-    {
-        ASSERT(mfn_valid(mfn));
-        put_page(mfn_to_page(mfn));
-    }
-    /* Detect the xenheap page and mark the stored GFN as invalid. */
-    else if ( p2m_is_ram(pte.p2m.type) && is_xen_heap_mfn(mfn) )
-        page_set_xenheap_gfn(mfn_to_page(mfn), INVALID_GFN);
-}
-
-/* Free lpae sub-tree behind an entry */
-static void p2m_free_entry(struct p2m_domain *p2m,
-                           lpae_t entry, unsigned int level)
-{
-    unsigned int i;
-    lpae_t *table;
-    mfn_t mfn;
-    struct page_info *pg;
-
-    /* Nothing to do if the entry is invalid. */
-    if ( !p2m_is_valid(entry) )
-        return;
-
-    if ( p2m_is_superpage(entry, level) || (level == 3) )
-    {
-#ifdef CONFIG_IOREQ_SERVER
-        /*
-         * If this gets called then either the entry was replaced by an entry
-         * with a different base (valid case) or the shattering of a superpage
-         * has failed (error case).
-         * So, at worst, the spurious mapcache invalidation might be sent.
-         */
-        if ( p2m_is_ram(entry.p2m.type) &&
-             domain_has_ioreq_server(p2m->domain) )
-            ioreq_request_mapcache_invalidate(p2m->domain);
-#endif
-
-        p2m->stats.mappings[level]--;
-        /* Nothing to do if the entry is a super-page. */
-        if ( level == 3 )
-            p2m_put_l3_page(entry);
-        return;
-    }
-
-    table = map_domain_page(lpae_get_mfn(entry));
-    for ( i = 0; i < XEN_PT_LPAE_ENTRIES; i++ )
-        p2m_free_entry(p2m, *(table + i), level + 1);
-
-    unmap_domain_page(table);
-
-    /*
-     * Make sure all the references in the TLB have been removed before
-     * freing the intermediate page table.
-     * XXX: Should we defer the free of the page table to avoid the
-     * flush?
-     */
-    p2m_tlb_flush_sync(p2m);
-
-    mfn = lpae_get_mfn(entry);
-    ASSERT(mfn_valid(mfn));
-
-    pg = mfn_to_page(mfn);
-
-    page_list_del(pg, &p2m->pages);
-    p2m_free_page(p2m->domain, pg);
-}
-
-static bool p2m_split_superpage(struct p2m_domain *p2m, lpae_t *entry,
-                                unsigned int level, unsigned int target,
-                                const unsigned int *offsets)
-{
-    struct page_info *page;
-    unsigned int i;
-    lpae_t pte, *table;
-    bool rv = true;
-
-    /* Convenience aliases */
-    mfn_t mfn = lpae_get_mfn(*entry);
-    unsigned int next_level = level + 1;
-    unsigned int level_order = XEN_PT_LEVEL_ORDER(next_level);
-
-    /*
-     * This should only be called with target != level and the entry is
-     * a superpage.
-     */
-    ASSERT(level < target);
-    ASSERT(p2m_is_superpage(*entry, level));
-
-    page = p2m_alloc_page(p2m->domain);
-    if ( !page )
-        return false;
-
-    page_list_add(page, &p2m->pages);
-    table = __map_domain_page(page);
-
-    /*
-     * We are either splitting a first level 1G page into 512 second level
-     * 2M pages, or a second level 2M page into 512 third level 4K pages.
-     */
-    for ( i = 0; i < XEN_PT_LPAE_ENTRIES; i++ )
-    {
-        lpae_t *new_entry = table + i;
-
-        /*
-         * Use the content of the superpage entry and override
-         * the necessary fields. So the correct permission are kept.
-         */
-        pte = *entry;
-        lpae_set_mfn(pte, mfn_add(mfn, i << level_order));
-
-        /*
-         * First and second level pages set p2m.table = 0, but third
-         * level entries set p2m.table = 1.
-         */
-        pte.p2m.table = (next_level == 3);
-
-        write_pte(new_entry, pte);
-    }
-
-    /* Update stats */
-    p2m->stats.shattered[level]++;
-    p2m->stats.mappings[level]--;
-    p2m->stats.mappings[next_level] += XEN_PT_LPAE_ENTRIES;
-
-    /*
-     * Shatter superpage in the page to the level we want to make the
-     * changes.
-     * This is done outside the loop to avoid checking the offset to
-     * know whether the entry should be shattered for every entry.
-     */
-    if ( next_level != target )
-        rv = p2m_split_superpage(p2m, table + offsets[next_level],
-                                 level + 1, target, offsets);
-
-    if ( p2m->clean_pte )
-        clean_dcache_va_range(table, PAGE_SIZE);
-
-    unmap_domain_page(table);
-
-    /*
-     * Even if we failed, we should install the newly allocated LPAE
-     * entry. The caller will be in charge to free the sub-tree.
-     */
-    p2m_write_pte(entry, page_to_p2m_table(page), p2m->clean_pte);
-
-    return rv;
-}
-
-/*
- * Insert an entry in the p2m. This should be called with a mapping
- * equal to a page/superpage (4K, 2M, 1G).
- */
-static int __p2m_set_entry(struct p2m_domain *p2m,
-                           gfn_t sgfn,
-                           unsigned int page_order,
-                           mfn_t smfn,
-                           p2m_type_t t,
-                           p2m_access_t a)
-{
-    unsigned int level = 0;
-    unsigned int target = 3 - (page_order / XEN_PT_LPAE_SHIFT);
-    lpae_t *entry, *table, orig_pte;
-    int rc;
-    /* A mapping is removed if the MFN is invalid. */
-    bool removing_mapping = mfn_eq(smfn, INVALID_MFN);
-    DECLARE_OFFSETS(offsets, gfn_to_gaddr(sgfn));
-
-    ASSERT(p2m_is_write_locked(p2m));
-
-    /*
-     * Check if the level target is valid: we only support
-     * 4K - 2M - 1G mapping.
-     */
-    ASSERT(target > 0 && target <= 3);
-
-    table = p2m_get_root_pointer(p2m, sgfn);
-    if ( !table )
-        return -EINVAL;
-
-    for ( level = P2M_ROOT_LEVEL; level < target; level++ )
-    {
-        /*
-         * Don't try to allocate intermediate page table if the mapping
-         * is about to be removed.
-         */
-        rc = p2m_next_level(p2m, removing_mapping,
-                            level, &table, offsets[level]);
-        if ( rc == GUEST_TABLE_MAP_FAILED )
-        {
-            /*
-             * We are here because p2m_next_level has failed to map
-             * the intermediate page table (e.g the table does not exist
-             * and they p2m tree is read-only). It is a valid case
-             * when removing a mapping as it may not exist in the
-             * page table. In this case, just ignore it.
-             */
-            rc = removing_mapping ?  0 : -ENOENT;
-            goto out;
-        }
-        else if ( rc != GUEST_TABLE_NORMAL_PAGE )
-            break;
-    }
-
-    entry = table + offsets[level];
-
-    /*
-     * If we are here with level < target, we must be at a leaf node,
-     * and we need to break up the superpage.
-     */
-    if ( level < target )
-    {
-        /* We need to split the original page. */
-        lpae_t split_pte = *entry;
-
-        ASSERT(p2m_is_superpage(*entry, level));
-
-        if ( !p2m_split_superpage(p2m, &split_pte, level, target, offsets) )
-        {
-            /*
-             * The current super-page is still in-place, so re-increment
-             * the stats.
-             */
-            p2m->stats.mappings[level]++;
-
-            /* Free the allocated sub-tree */
-            p2m_free_entry(p2m, split_pte, level);
-
-            rc = -ENOMEM;
-            goto out;
-        }
-
-        /*
-         * Follow the break-before-sequence to update the entry.
-         * For more details see (D4.7.1 in ARM DDI 0487A.j).
-         */
-        p2m_remove_pte(entry, p2m->clean_pte);
-        p2m_force_tlb_flush_sync(p2m);
-
-        p2m_write_pte(entry, split_pte, p2m->clean_pte);
-
-        /* then move to the level we want to make real changes */
-        for ( ; level < target; level++ )
-        {
-            rc = p2m_next_level(p2m, true, level, &table, offsets[level]);
-
-            /*
-             * The entry should be found and either be a table
-             * or a superpage if level 3 is not targeted
-             */
-            ASSERT(rc == GUEST_TABLE_NORMAL_PAGE ||
-                   (rc == GUEST_TABLE_SUPER_PAGE && target < 3));
-        }
-
-        entry = table + offsets[level];
-    }
-
-    /*
-     * We should always be there with the correct level because
-     * all the intermediate tables have been installed if necessary.
-     */
-    ASSERT(level == target);
-
-    orig_pte = *entry;
-
-    /*
-     * The radix-tree can only work on 4KB. This is only used when
-     * memaccess is enabled and during shutdown.
-     */
-    ASSERT(!p2m->mem_access_enabled || page_order == 0 ||
-           p2m->domain->is_dying);
-    /*
-     * The access type should always be p2m_access_rwx when the mapping
-     * is removed.
-     */
-    ASSERT(!mfn_eq(INVALID_MFN, smfn) || (a == p2m_access_rwx));
-    /*
-     * Update the mem access permission before update the P2M. So we
-     * don't have to revert the mapping if it has failed.
-     */
-    rc = p2m_mem_access_radix_set(p2m, sgfn, a);
-    if ( rc )
-        goto out;
-
-    /*
-     * Always remove the entry in order to follow the break-before-make
-     * sequence when updating the translation table (D4.7.1 in ARM DDI
-     * 0487A.j).
-     */
-    if ( lpae_is_valid(orig_pte) || removing_mapping )
-        p2m_remove_pte(entry, p2m->clean_pte);
-
-    if ( removing_mapping )
-        /* Flush can be deferred if the entry is removed */
-        p2m->need_flush |= !!lpae_is_valid(orig_pte);
-    else
-    {
-        lpae_t pte = mfn_to_p2m_entry(smfn, t, a);
-
-        if ( level < 3 )
-            pte.p2m.table = 0; /* Superpage entry */
-
-        /*
-         * It is necessary to flush the TLB before writing the new entry
-         * to keep coherency when the previous entry was valid.
-         *
-         * Although, it could be defered when only the permissions are
-         * changed (e.g in case of memaccess).
-         */
-        if ( lpae_is_valid(orig_pte) )
-        {
-            if ( likely(!p2m->mem_access_enabled) ||
-                 P2M_CLEAR_PERM(pte) != P2M_CLEAR_PERM(orig_pte) )
-                p2m_force_tlb_flush_sync(p2m);
-            else
-                p2m->need_flush = true;
-        }
-        else if ( !p2m_is_valid(orig_pte) ) /* new mapping */
-            p2m->stats.mappings[level]++;
-
-        p2m_write_pte(entry, pte, p2m->clean_pte);
-
-        p2m->max_mapped_gfn = gfn_max(p2m->max_mapped_gfn,
-                                      gfn_add(sgfn, (1UL << page_order) - 1));
-        p2m->lowest_mapped_gfn = gfn_min(p2m->lowest_mapped_gfn, sgfn);
-    }
-
-    if ( is_iommu_enabled(p2m->domain) &&
-         (lpae_is_valid(orig_pte) || lpae_is_valid(*entry)) )
-    {
-        unsigned int flush_flags = 0;
-
-        if ( lpae_is_valid(orig_pte) )
-            flush_flags |= IOMMU_FLUSHF_modified;
-        if ( lpae_is_valid(*entry) )
-            flush_flags |= IOMMU_FLUSHF_added;
-
-        rc = iommu_iotlb_flush(p2m->domain, _dfn(gfn_x(sgfn)),
-                               1UL << page_order, flush_flags);
-    }
-    else
-        rc = 0;
-
-    /*
-     * Free the entry only if the original pte was valid and the base
-     * is different (to avoid freeing when permission is changed).
-     */
-    if ( p2m_is_valid(orig_pte) &&
-         !mfn_eq(lpae_get_mfn(*entry), lpae_get_mfn(orig_pte)) )
-        p2m_free_entry(p2m, orig_pte, level);
-
-out:
-    unmap_domain_page(table);
-
-    return rc;
-}
-
-int p2m_set_entry(struct p2m_domain *p2m,
-                  gfn_t sgfn,
-                  unsigned long nr,
-                  mfn_t smfn,
-                  p2m_type_t t,
-                  p2m_access_t a)
-{
-    int rc = 0;
-
-    /*
-     * Any reference taken by the P2M mappings (e.g. foreign mapping) will
-     * be dropped in relinquish_p2m_mapping(). As the P2M will still
-     * be accessible after, we need to prevent mapping to be added when the
-     * domain is dying.
-     */
-    if ( unlikely(p2m->domain->is_dying) )
-        return -ENOMEM;
-
-    while ( nr )
-    {
-        unsigned long mask;
-        unsigned long order;
-
-        /*
-         * Don't take into account the MFN when removing mapping (i.e
-         * MFN_INVALID) to calculate the correct target order.
-         *
-         * XXX: Support superpage mappings if nr is not aligned to a
-         * superpage size.
-         */
-        mask = !mfn_eq(smfn, INVALID_MFN) ? mfn_x(smfn) : 0;
-        mask |= gfn_x(sgfn) | nr;
-
-        /* Always map 4k by 4k when memaccess is enabled */
-        if ( unlikely(p2m->mem_access_enabled) )
-            order = THIRD_ORDER;
-        else if ( !(mask & ((1UL << FIRST_ORDER) - 1)) )
-            order = FIRST_ORDER;
-        else if ( !(mask & ((1UL << SECOND_ORDER) - 1)) )
-            order = SECOND_ORDER;
-        else
-            order = THIRD_ORDER;
-
-        rc = __p2m_set_entry(p2m, sgfn, order, smfn, t, a);
-        if ( rc )
-            break;
-
-        sgfn = gfn_add(sgfn, (1 << order));
-        if ( !mfn_eq(smfn, INVALID_MFN) )
-           smfn = mfn_add(smfn, (1 << order));
-
-        nr -= (1 << order);
-    }
-
-    return rc;
-}
-
-/* Invalidate all entries in the table. The p2m should be write locked. */
-static void p2m_invalidate_table(struct p2m_domain *p2m, mfn_t mfn)
-{
-    lpae_t *table;
-    unsigned int i;
-
-    ASSERT(p2m_is_write_locked(p2m));
-
-    table = map_domain_page(mfn);
-
-    for ( i = 0; i < XEN_PT_LPAE_ENTRIES; i++ )
-    {
-        lpae_t pte = table[i];
-
-        /*
-         * Writing an entry can be expensive because it may involve
-         * cleaning the cache. So avoid updating the entry if the valid
-         * bit is already cleared.
-         */
-        if ( !pte.p2m.valid )
-            continue;
-
-        pte.p2m.valid = 0;
-
-        p2m_write_pte(&table[i], pte, p2m->clean_pte);
-    }
-
-    unmap_domain_page(table);
-
-    p2m->need_flush = true;
-}
-
-/*
- * Invalidate all entries in the root page-tables. This is
- * useful to get fault on entry and do an action.
- *
- * p2m_invalid_root() should not be called when the P2M is shared with
- * the IOMMU because it will cause IOMMU fault.
- */
-void p2m_invalidate_root(struct p2m_domain *p2m)
-{
-    unsigned int i;
-
-    ASSERT(!iommu_use_hap_pt(p2m->domain));
-
-    p2m_write_lock(p2m);
-
-    for ( i = 0; i < P2M_ROOT_LEVEL; i++ )
-        p2m_invalidate_table(p2m, page_to_mfn(p2m->root + i));
-
-    p2m_write_unlock(p2m);
-}
-
-/*
- * Resolve any translation fault due to change in the p2m. This
- * includes break-before-make and valid bit cleared.
- */
-bool p2m_resolve_translation_fault(struct domain *d, gfn_t gfn)
-{
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-    unsigned int level = 0;
-    bool resolved = false;
-    lpae_t entry, *table;
-
-    /* Convenience aliases */
-    DECLARE_OFFSETS(offsets, gfn_to_gaddr(gfn));
-
-    p2m_write_lock(p2m);
-
-    /* This gfn is higher than the highest the p2m map currently holds */
-    if ( gfn_x(gfn) > gfn_x(p2m->max_mapped_gfn) )
-        goto out;
-
-    table = p2m_get_root_pointer(p2m, gfn);
-    /*
-     * The table should always be non-NULL because the gfn is below
-     * p2m->max_mapped_gfn and the root table pages are always present.
-     */
-    if ( !table )
-    {
-        ASSERT_UNREACHABLE();
-        goto out;
-    }
-
-    /*
-     * Go down the page-tables until an entry has the valid bit unset or
-     * a block/page entry has been hit.
-     */
-    for ( level = P2M_ROOT_LEVEL; level <= 3; level++ )
-    {
-        int rc;
-
-        entry = table[offsets[level]];
-
-        if ( level == 3 )
-            break;
-
-        /* Stop as soon as we hit an entry with the valid bit unset. */
-        if ( !lpae_is_valid(entry) )
-            break;
-
-        rc = p2m_next_level(p2m, true, level, &table, offsets[level]);
-        if ( rc == GUEST_TABLE_MAP_FAILED )
-            goto out_unmap;
-        else if ( rc != GUEST_TABLE_NORMAL_PAGE )
-            break;
-    }
-
-    /*
-     * If the valid bit of the entry is set, it means someone was playing with
-     * the Stage-2 page table. Nothing to do and mark the fault as resolved.
-     */
-    if ( lpae_is_valid(entry) )
-    {
-        resolved = true;
-        goto out_unmap;
-    }
-
-    /*
-     * The valid bit is unset. If the entry is still not valid then the fault
-     * cannot be resolved, exit and report it.
-     */
-    if ( !p2m_is_valid(entry) )
-        goto out_unmap;
-
-    /*
-     * Now we have an entry with valid bit unset, but still valid from
-     * the P2M point of view.
-     *
-     * If an entry is pointing to a table, each entry of the table will
-     * have there valid bit cleared. This allows a function to clear the
-     * full p2m with just a couple of write. The valid bit will then be
-     * propagated on the fault.
-     * If an entry is pointing to a block/page, no work to do for now.
-     */
-    if ( lpae_is_table(entry, level) )
-        p2m_invalidate_table(p2m, lpae_get_mfn(entry));
-
-    /*
-     * Now that the work on the entry is done, set the valid bit to prevent
-     * another fault on that entry.
-     */
-    resolved = true;
-    entry.p2m.valid = 1;
-
-    p2m_write_pte(table + offsets[level], entry, p2m->clean_pte);
-
-    /*
-     * No need to flush the TLBs as the modified entry had the valid bit
-     * unset.
-     */
-
-out_unmap:
-    unmap_domain_page(table);
-
-out:
-    p2m_write_unlock(p2m);
-
-    return resolved;
-}
-
-int p2m_insert_mapping(struct domain *d, gfn_t start_gfn, unsigned long nr,
-                       mfn_t mfn, p2m_type_t t)
-{
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-    int rc;
-
-    p2m_write_lock(p2m);
-    rc = p2m_set_entry(p2m, start_gfn, nr, mfn, t, p2m->default_access);
-    p2m_write_unlock(p2m);
-
-    return rc;
-}
-
-static inline int p2m_remove_mapping(struct domain *d,
-                                     gfn_t start_gfn,
-                                     unsigned long nr,
-                                     mfn_t mfn)
-{
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-    unsigned long i;
-    int rc;
-
-    p2m_write_lock(p2m);
-    /*
-     * Before removing the GFN - MFN mapping for any RAM pages make sure
-     * that there is no difference between what is already mapped and what
-     * is requested to be unmapped.
-     * If they don't match bail out early. For instance, this could happen
-     * if two CPUs are requesting to unmap the same P2M entry concurrently.
-     */
-    for ( i = 0; i < nr; )
-    {
-        unsigned int cur_order;
-        p2m_type_t t;
-        mfn_t mfn_return = p2m_get_entry(p2m, gfn_add(start_gfn, i), &t, NULL,
-                                         &cur_order, NULL);
-
-        if ( p2m_is_any_ram(t) &&
-             (!mfn_valid(mfn) || !mfn_eq(mfn_add(mfn, i), mfn_return)) )
-        {
-            rc = -EILSEQ;
-            goto out;
-        }
-
-        i += (1UL << cur_order) -
-             ((gfn_x(start_gfn) + i) & ((1UL << cur_order) - 1));
-    }
-
-    rc = p2m_set_entry(p2m, start_gfn, nr, INVALID_MFN,
-                       p2m_invalid, p2m_access_rwx);
-
-out:
-    p2m_write_unlock(p2m);
-
-    return rc;
-}
-
-int map_regions_p2mt(struct domain *d,
-                     gfn_t gfn,
-                     unsigned long nr,
-                     mfn_t mfn,
-                     p2m_type_t p2mt)
-{
-    return p2m_insert_mapping(d, gfn, nr, mfn, p2mt);
-}
-
-int unmap_regions_p2mt(struct domain *d,
-                       gfn_t gfn,
-                       unsigned long nr,
-                       mfn_t mfn)
-{
-    return p2m_remove_mapping(d, gfn, nr, mfn);
-}
-
-int map_mmio_regions(struct domain *d,
-                     gfn_t start_gfn,
-                     unsigned long nr,
-                     mfn_t mfn)
-{
-    return p2m_insert_mapping(d, start_gfn, nr, mfn, p2m_mmio_direct_dev);
-}
-
-int unmap_mmio_regions(struct domain *d,
-                       gfn_t start_gfn,
-                       unsigned long nr,
-                       mfn_t mfn)
-{
-    return p2m_remove_mapping(d, start_gfn, nr, mfn);
-}
-
-int map_dev_mmio_page(struct domain *d, gfn_t gfn, mfn_t mfn)
-{
-    int res;
-
-    if ( !iomem_access_permitted(d, mfn_x(mfn), mfn_x(mfn)) )
-        return 0;
-
-    res = p2m_insert_mapping(d, gfn, 1, mfn, p2m_mmio_direct_c);
-    if ( res < 0 )
-    {
-        printk(XENLOG_G_ERR "Unable to map MFN %#"PRI_mfn" in %pd\n",
-               mfn_x(mfn), d);
-        return res;
-    }
-
-    return 0;
-}
-
-int guest_physmap_add_entry(struct domain *d,
-                            gfn_t gfn,
-                            mfn_t mfn,
-                            unsigned long page_order,
-                            p2m_type_t t)
-{
-    return p2m_insert_mapping(d, gfn, (1 << page_order), mfn, t);
-}
-
-int guest_physmap_remove_page(struct domain *d, gfn_t gfn, mfn_t mfn,
-                              unsigned int page_order)
-{
-    return p2m_remove_mapping(d, gfn, (1 << page_order), mfn);
-}
-
-int set_foreign_p2m_entry(struct domain *d, const struct domain *fd,
-                          unsigned long gfn, mfn_t mfn)
-{
-    struct page_info *page = mfn_to_page(mfn);
-    int rc;
-
-    ASSERT(arch_acquire_resource_check(d));
-
-    if ( !get_page(page, fd) )
-        return -EINVAL;
-
-    /*
-     * It is valid to always use p2m_map_foreign_rw here as if this gets
-     * called then d != fd. A case when d == fd would be rejected by
-     * rcu_lock_remote_domain_by_id() earlier. Put a respective ASSERT()
-     * to catch incorrect usage in future.
-     */
-    ASSERT(d != fd);
-
-    rc = guest_physmap_add_entry(d, _gfn(gfn), mfn, 0, p2m_map_foreign_rw);
-    if ( rc )
-        put_page(page);
-
-    return rc;
-}
-
-static struct page_info *p2m_allocate_root(void)
-{
-    struct page_info *page;
-    unsigned int i;
-
-    page = alloc_domheap_pages(NULL, P2M_ROOT_ORDER, 0);
-    if ( page == NULL )
-        return NULL;
-
-    /* Clear both first level pages */
-    for ( i = 0; i < P2M_ROOT_PAGES; i++ )
-        clear_and_clean_page(page + i);
-
-    return page;
-}
-
-static int p2m_alloc_table(struct domain *d)
-{
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-
-    p2m->root = p2m_allocate_root();
-    if ( !p2m->root )
-        return -ENOMEM;
-
-    p2m->vttbr = generate_vttbr(p2m->vmid, page_to_mfn(p2m->root));
-
-    /*
-     * Make sure that all TLBs corresponding to the new VMID are flushed
-     * before using it
-     */
-    p2m_write_lock(p2m);
-    p2m_force_tlb_flush_sync(p2m);
-    p2m_write_unlock(p2m);
-
-    return 0;
-}
-
-
-static spinlock_t vmid_alloc_lock = SPIN_LOCK_UNLOCKED;
-
-/*
- * VTTBR_EL2 VMID field is 8 or 16 bits. AArch64 may support 16-bit VMID.
- * Using a bitmap here limits us to 256 or 65536 (for AArch64) concurrent
- * domains. The bitmap space will be allocated dynamically based on
- * whether 8 or 16 bit VMIDs are supported.
- */
-static unsigned long *vmid_mask;
-
-static void p2m_vmid_allocator_init(void)
-{
-    /*
-     * allocate space for vmid_mask based on MAX_VMID
-     */
-    vmid_mask = xzalloc_array(unsigned long, BITS_TO_LONGS(MAX_VMID));
-
-    if ( !vmid_mask )
-        panic("Could not allocate VMID bitmap space\n");
-
-    set_bit(INVALID_VMID, vmid_mask);
-}
-
-static int p2m_alloc_vmid(struct domain *d)
-{
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-
-    int rc, nr;
-
-    spin_lock(&vmid_alloc_lock);
-
-    nr = find_first_zero_bit(vmid_mask, MAX_VMID);
-
-    ASSERT(nr != INVALID_VMID);
-
-    if ( nr == MAX_VMID )
-    {
-        rc = -EBUSY;
-        printk(XENLOG_ERR "p2m.c: dom%d: VMID pool exhausted\n", d->domain_id);
-        goto out;
-    }
-
-    set_bit(nr, vmid_mask);
-
-    p2m->vmid = nr;
-
-    rc = 0;
-
-out:
-    spin_unlock(&vmid_alloc_lock);
-    return rc;
-}
-
-static void p2m_free_vmid(struct domain *d)
-{
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-    spin_lock(&vmid_alloc_lock);
-    if ( p2m->vmid != INVALID_VMID )
-        clear_bit(p2m->vmid, vmid_mask);
-
-    spin_unlock(&vmid_alloc_lock);
-}
-
-int p2m_teardown(struct domain *d, bool allow_preemption)
-{
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-    unsigned long count = 0;
-    struct page_info *pg;
-    unsigned int i;
-    int rc = 0;
-
-    if ( page_list_empty(&p2m->pages) )
-        return 0;
-
-    p2m_write_lock(p2m);
-
-    /*
-     * We are about to free the intermediate page-tables, so clear the
-     * root to prevent any walk to use them.
-     */
-    for ( i = 0; i < P2M_ROOT_PAGES; i++ )
-        clear_and_clean_page(p2m->root + i);
-
-    /*
-     * The domain will not be scheduled anymore, so in theory we should
-     * not need to flush the TLBs. Do it for safety purpose.
-     *
-     * Note that all the devices have already been de-assigned. So we don't
-     * need to flush the IOMMU TLB here.
-     */
-    p2m_force_tlb_flush_sync(p2m);
-
-    while ( (pg = page_list_remove_head(&p2m->pages)) )
-    {
-        p2m_free_page(p2m->domain, pg);
-        count++;
-        /* Arbitrarily preempt every 512 iterations */
-        if ( allow_preemption && !(count % 512) && hypercall_preempt_check() )
-        {
-            rc = -ERESTART;
-            break;
-        }
-    }
-
-    p2m_write_unlock(p2m);
-
-    return rc;
-}
-
-void p2m_final_teardown(struct domain *d)
-{
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-
-    /* p2m not actually initialized */
-    if ( !p2m->domain )
-        return;
-
-    /*
-     * No need to call relinquish_p2m_mapping() here because
-     * p2m_final_teardown() is called either after domain_relinquish_resources()
-     * where relinquish_p2m_mapping() has been called, or from failure path of
-     * domain_create()/arch_domain_create() where mappings that require
-     * p2m_put_l3_page() should never be created. For the latter case, also see
-     * comment on top of the p2m_set_entry() for more info.
-     */
-
-    BUG_ON(p2m_teardown(d, false));
-    ASSERT(page_list_empty(&p2m->pages));
-
-    while ( p2m_teardown_allocation(d) == -ERESTART )
-        continue; /* No preemption support here */
-    ASSERT(page_list_empty(&d->arch.paging.p2m_freelist));
-
-    if ( p2m->root )
-        free_domheap_pages(p2m->root, P2M_ROOT_ORDER);
-
-    p2m->root = NULL;
-
-    p2m_free_vmid(d);
-
-    radix_tree_destroy(&p2m->mem_access_settings, NULL);
-
-    p2m->domain = NULL;
-}
-
-int p2m_init(struct domain *d)
-{
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-    int rc;
-    unsigned int cpu;
-
-    rwlock_init(&p2m->lock);
-    spin_lock_init(&d->arch.paging.lock);
-    INIT_PAGE_LIST_HEAD(&p2m->pages);
-    INIT_PAGE_LIST_HEAD(&d->arch.paging.p2m_freelist);
-
-    p2m->vmid = INVALID_VMID;
-    p2m->max_mapped_gfn = _gfn(0);
-    p2m->lowest_mapped_gfn = _gfn(ULONG_MAX);
-
-    p2m->default_access = p2m_access_rwx;
-    p2m->mem_access_enabled = false;
-    radix_tree_init(&p2m->mem_access_settings);
-
-    /*
-     * Some IOMMUs don't support coherent PT walk. When the p2m is
-     * shared with the CPU, Xen has to make sure that the PT changes have
-     * reached the memory
-     */
-    p2m->clean_pte = is_iommu_enabled(d) &&
-        !iommu_has_feature(d, IOMMU_FEAT_COHERENT_WALK);
-
-    /*
-     * Make sure that the type chosen to is able to store the an vCPU ID
-     * between 0 and the maximum of virtual CPUS supported as long as
-     * the INVALID_VCPU_ID.
-     */
-    BUILD_BUG_ON((1 << (sizeof(p2m->last_vcpu_ran[0]) * 8)) < MAX_VIRT_CPUS);
-    BUILD_BUG_ON((1 << (sizeof(p2m->last_vcpu_ran[0])* 8)) < INVALID_VCPU_ID);
-
-    for_each_possible_cpu(cpu)
-       p2m->last_vcpu_ran[cpu] = INVALID_VCPU_ID;
-
-    /*
-     * "Trivial" initialisation is now complete.  Set the backpointer so
-     * p2m_teardown() and friends know to do something.
-     */
-    p2m->domain = d;
-
-    rc = p2m_alloc_vmid(d);
-    if ( rc )
-        return rc;
-
-    rc = p2m_alloc_table(d);
-    if ( rc )
-        return rc;
-
-    /*
-     * Hardware using GICv2 needs to create a P2M mapping of 8KB GICv2 area
-     * when the domain is created. Considering the worst case for page
-     * tables and keep a buffer, populate 16 pages to the P2M pages pool here.
-     * For GICv3, the above-mentioned P2M mapping is not necessary, but since
-     * the allocated 16 pages here would not be lost, hence populate these
-     * pages unconditionally.
-     */
-    spin_lock(&d->arch.paging.lock);
-    rc = p2m_set_allocation(d, 16, NULL);
-    spin_unlock(&d->arch.paging.lock);
-    if ( rc )
-        return rc;
-
-    return 0;
-}
-
-/*
- * The function will go through the p2m and remove page reference when it
- * is required. The mapping will be removed from the p2m.
- *
- * XXX: See whether the mapping can be left intact in the p2m.
- */
-int relinquish_p2m_mapping(struct domain *d)
-{
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-    unsigned long count = 0;
-    p2m_type_t t;
-    int rc = 0;
-    unsigned int order;
-    gfn_t start, end;
-
-    BUG_ON(!d->is_dying);
-    /* No mappings can be added in the P2M after the P2M lock is released. */
-    p2m_write_lock(p2m);
-
-    start = p2m->lowest_mapped_gfn;
-    end = gfn_add(p2m->max_mapped_gfn, 1);
-
-    for ( ; gfn_x(start) < gfn_x(end);
-          start = gfn_next_boundary(start, order) )
-    {
-        mfn_t mfn = p2m_get_entry(p2m, start, &t, NULL, &order, NULL);
-
-        count++;
-        /*
-         * Arbitrarily preempt every 512 iterations.
-         */
-        if ( !(count % 512) && hypercall_preempt_check() )
-        {
-            rc = -ERESTART;
-            break;
-        }
-
-        /*
-         * p2m_set_entry will take care of removing reference on page
-         * when it is necessary and removing the mapping in the p2m.
-         */
-        if ( !mfn_eq(mfn, INVALID_MFN) )
-        {
-            /*
-             * For valid mapping, the start will always be aligned as
-             * entry will be removed whilst relinquishing.
-             */
-            rc = __p2m_set_entry(p2m, start, order, INVALID_MFN,
-                                 p2m_invalid, p2m_access_rwx);
-            if ( unlikely(rc) )
-            {
-            printk(XENLOG_G_ERR "Unable to remove mapping gfn=%#"PRI_gfn" order=%u from the p2m of domain %d\n", gfn_x(start), order, d->domain_id);
-                break;
-            }
-        }
-    }
-
-    /*
-     * Update lowest_mapped_gfn so on the next call we still start where
-     * we stopped.
-     */
-    p2m->lowest_mapped_gfn = start;
-
-    p2m_write_unlock(p2m);
-
-    return rc;
-}
-
-int p2m_cache_flush_range(struct domain *d, gfn_t *pstart, gfn_t end)
-{
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-    gfn_t next_block_gfn;
-    gfn_t start = *pstart;
-    mfn_t mfn = INVALID_MFN;
-    p2m_type_t t;
-    unsigned int order;
-    int rc = 0;
-    /* Counter for preemption */
-    unsigned short count = 0;
-
-    /*
-     * The operation cache flush will invalidate the RAM assigned to the
-     * guest in a given range. It will not modify the page table and
-     * flushing the cache whilst the page is used by another CPU is
-     * fine. So using read-lock is fine here.
-     */
-    p2m_read_lock(p2m);
-
-    start = gfn_max(start, p2m->lowest_mapped_gfn);
-    end = gfn_min(end, gfn_add(p2m->max_mapped_gfn, 1));
-
-    next_block_gfn = start;
-
-    while ( gfn_x(start) < gfn_x(end) )
-    {
-       /*
-         * Cleaning the cache for the P2M may take a long time. So we
-         * need to be able to preempt. We will arbitrarily preempt every
-         * time count reach 512 or above.
-         *
-         * The count will be incremented by:
-         *  - 1 on region skipped
-         *  - 10 for each page requiring a flush
-         */
-        if ( count >= 512 )
-        {
-            if ( softirq_pending(smp_processor_id()) )
-            {
-                rc = -ERESTART;
-                break;
-            }
-            count = 0;
-        }
-
-        /*
-         * We want to flush page by page as:
-         *  - it may not be possible to map the full block (can be up to 1GB)
-         *    in Xen memory
-         *  - we may want to do fine grain preemption as flushing multiple
-         *    page in one go may take a long time
-         *
-         * As p2m_get_entry is able to return the size of the mapping
-         * in the p2m, it is pointless to execute it for each page.
-         *
-         * We can optimize it by tracking the gfn of the next
-         * block. So we will only call p2m_get_entry for each block (can
-         * be up to 1GB).
-         */
-        if ( gfn_eq(start, next_block_gfn) )
-        {
-            bool valid;
-
-            mfn = p2m_get_entry(p2m, start, &t, NULL, &order, &valid);
-            next_block_gfn = gfn_next_boundary(start, order);
-
-            if ( mfn_eq(mfn, INVALID_MFN) || !p2m_is_any_ram(t) || !valid )
-            {
-                count++;
-                start = next_block_gfn;
-                continue;
-            }
-        }
-
-        count += 10;
-
-        flush_page_to_ram(mfn_x(mfn), false);
-
-        start = gfn_add(start, 1);
-        mfn = mfn_add(mfn, 1);
-    }
-
-    if ( rc != -ERESTART )
-        invalidate_icache();
-
-    p2m_read_unlock(p2m);
-
-    *pstart = start;
-
-    return rc;
-}
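A rough feel for the preemption budget used in p2m_cache_flush_range() above, following the counting rules in its comment (1 per skipped region, 10 per flushed page, softirq check once count reaches 512); the figures below are only the consequence of that heuristic:

    /* Flushing only: ~51 pages (~204 KiB of guest RAM) between softirq checks. */
    /* Skipping only: 512 empty blocks between softirq checks.                  */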
-
-/*
- * Clean & invalidate RAM associated to the guest vCPU.
- *
- * The function can only work with the current vCPU and should be called
- * with IRQ enabled as the vCPU could get preempted.
- */
-void p2m_flush_vm(struct vcpu *v)
-{
-    struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
-    int rc;
-    gfn_t start = _gfn(0);
-
-    ASSERT(v == current);
-    ASSERT(local_irq_is_enabled());
-    ASSERT(v->arch.need_flush_to_ram);
-
-    do
-    {
-        rc = p2m_cache_flush_range(v->domain, &start, _gfn(ULONG_MAX));
-        if ( rc == -ERESTART )
-            do_softirq();
-    } while ( rc == -ERESTART );
-
-    if ( rc != 0 )
-        gprintk(XENLOG_WARNING,
-                "P2M has not been correctly cleaned (rc = %d)\n",
-                rc);
-
-    /*
-     * Invalidate the p2m to track which page was modified by the guest
-     * between call of p2m_flush_vm().
-     */
-    p2m_invalidate_root(p2m);
-
-    v->arch.need_flush_to_ram = false;
-}
-
-/*
- * See note at ARMv7 ARM B1.14.4 (DDI 0406C.c) (TL;DR: S/W ops are not
- * easily virtualized).
- *
- * Main problems:
- *  - S/W ops are local to a CPU (not broadcast)
- *  - We have line migration behind our back (speculation)
- *  - System caches don't support S/W at all (damn!)
- *
- * In the face of the above, the best we can do is to try and convert
- * S/W ops to VA ops. Because the guest is not allowed to infer the S/W
- * to PA mapping, it can only use S/W to nuke the whole cache, which is
- * rather a good thing for us.
- *
- * Also, it is only used when turning caches on/off ("The expected
- * usage of the cache maintenance instructions that operate by set/way
- * is associated with the powerdown and powerup of caches, if this is
- * required by the implementation.").
- *
- * We use the following policy:
- *  - If we trap a S/W operation, we enabled VM trapping to detect
- *  caches being turned on/off, and do a full clean.
- *
- *  - We flush the caches on both caches being turned on and off.
- *
- *  - Once the caches are enabled, we stop trapping VM ops.
- */
-void p2m_set_way_flush(struct vcpu *v, struct cpu_user_regs *regs,
-                       const union hsr hsr)
-{
-    /* This function can only work with the current vCPU. */
-    ASSERT(v == current);
-
-    if ( iommu_use_hap_pt(current->domain) )
-    {
-        gprintk(XENLOG_ERR,
-                "The cache should be flushed by VA rather than by set/way.\n");
-        inject_undef_exception(regs, hsr);
-        return;
-    }
-
-    if ( !(v->arch.hcr_el2 & HCR_TVM) )
-    {
-        v->arch.need_flush_to_ram = true;
-        vcpu_hcr_set_flags(v, HCR_TVM);
-    }
-}
-
-void p2m_toggle_cache(struct vcpu *v, bool was_enabled)
-{
-    bool now_enabled = vcpu_has_cache_enabled(v);
-
-    /* This function can only work with the current vCPU. */
-    ASSERT(v == current);
-
-    /*
-     * If switching the MMU+caches on, need to invalidate the caches.
-     * If switching it off, need to clean the caches.
-     * Clean + invalidate does the trick always.
-     */
-    if ( was_enabled != now_enabled )
-        v->arch.need_flush_to_ram = true;
-
-    /* Caches are now on, stop trapping VM ops (until a S/W op) */
-    if ( now_enabled )
-        vcpu_hcr_clear_flags(v, HCR_TVM);
-}
-
-mfn_t gfn_to_mfn(struct domain *d, gfn_t gfn)
-{
-    return p2m_lookup(d, gfn, NULL);
-}
-
-struct page_info *get_page_from_gva(struct vcpu *v, vaddr_t va,
-                                    unsigned long flags)
-{
-    struct domain *d = v->domain;
-    struct p2m_domain *p2m = p2m_get_hostp2m(d);
-    struct page_info *page = NULL;
-    paddr_t maddr = 0;
-    uint64_t par;
-    mfn_t mfn;
-    p2m_type_t t;
-
-    /*
-     * XXX: To support a different vCPU, we would need to load the
-     * VTTBR_EL2, TTBR0_EL1, TTBR1_EL1 and SCTLR_EL1
-     */
-    if ( v != current )
-        return NULL;
-
-    /*
-     * The lock is here to protect us against the break-before-make
-     * sequence used when updating the entry.
-     */
-    p2m_read_lock(p2m);
-    par = gvirt_to_maddr(va, &maddr, flags);
-    p2m_read_unlock(p2m);
-
-    /*
-     * gvirt_to_maddr may fail if the entry does not have the valid bit
-     * set. Fallback to the second method:
-     *  1) Translate the VA to IPA using software lookup -> Stage-1 page-table
-     *  may not be accessible because the stage-2 entries may have valid
-     *  bit unset.
-     *  2) Software lookup of the MFN
-     *
-     * Note that when memaccess is enabled, we instead call directly
-     * p2m_mem_access_check_and_get_page(...). Because the function is a
-     * a variant of the methods described above, it will be able to
-     * handle entries with valid bit unset.
-     *
-     * TODO: Integrate more nicely memaccess with the rest of the
-     * function.
-     * TODO: Use the fault error in PAR_EL1 to avoid pointless
-     *  translation.
-     */
-    if ( par )
-    {
-        paddr_t ipa;
-        unsigned int s1_perms;
-
-        /*
-         * When memaccess is enabled, the translation GVA to MADDR may
-         * have failed because of a permission fault.
-         */
-        if ( p2m->mem_access_enabled )
-            return p2m_mem_access_check_and_get_page(va, flags, v);
-
-        /*
-         * The software stage-1 table walk can still fail, e.g, if the
-         * GVA is not mapped.
-         */
-        if ( !guest_walk_tables(v, va, &ipa, &s1_perms) )
-        {
-            dprintk(XENLOG_G_DEBUG,
-                    "%pv: Failed to walk page-table va %#"PRIvaddr"\n", v, va);
-            return NULL;
-        }
-
-        mfn = p2m_lookup(d, gaddr_to_gfn(ipa), &t);
-        if ( mfn_eq(INVALID_MFN, mfn) || !p2m_is_ram(t) )
-            return NULL;
-
-        /*
-         * Check permission that are assumed by the caller. For instance
-         * in case of guestcopy, the caller assumes that the translated
-         * page can be accessed with the requested permissions. If this
-         * is not the case, we should fail.
-         *
-         * Please note that we do not check for the GV2M_EXEC
-         * permission. This is fine because the hardware-based translation
-         * instruction does not test for execute permissions.
-         */
-        if ( (flags & GV2M_WRITE) && !(s1_perms & GV2M_WRITE) )
-            return NULL;
-
-        if ( (flags & GV2M_WRITE) && t != p2m_ram_rw )
-            return NULL;
-    }
-    else
-        mfn = maddr_to_mfn(maddr);
-
-    if ( !mfn_valid(mfn) )
-    {
-        dprintk(XENLOG_G_DEBUG, "%pv: Invalid MFN %#"PRI_mfn"\n",
-                v, mfn_x(mfn));
-        return NULL;
-    }
-
-    page = mfn_to_page(mfn);
-    ASSERT(page);
-
-    if ( unlikely(!get_page(page, d)) )
-    {
-        dprintk(XENLOG_G_DEBUG, "%pv: Failing to acquire the MFN 
%#"PRI_mfn"\n",
-                v, mfn_x(maddr_to_mfn(maddr)));
-        return NULL;
-    }
-
-    return page;
-}
-
-void __init p2m_restrict_ipa_bits(unsigned int ipa_bits)
-{
-    /*
-     * Calculate the minimum of the maximum IPA bits that any external entity
-     * can support.
-     */
-    if ( ipa_bits < p2m_ipa_bits )
-        p2m_ipa_bits = ipa_bits;
-}
-
-/* VTCR value to be configured by all CPUs. Set only once by the boot CPU */
-static register_t __read_mostly vtcr;
-
-static void setup_virt_paging_one(void *data)
-{
-    WRITE_SYSREG(vtcr, VTCR_EL2);
-
-    /*
-     * ARM64_WORKAROUND_AT_SPECULATE: We want to keep the TLBs free from
-     * entries related to EL1/EL0 translation regime until a guest vCPU
-     * is running. For that, we need to set-up VTTBR to point to an empty
-     * page-table and turn on stage-2 translation. The TLB entries
-     * associated with EL1/EL0 translation regime will also be flushed in case
-     * an AT instruction was speculated before hand.
-     */
-    if ( cpus_have_cap(ARM64_WORKAROUND_AT_SPECULATE) )
-    {
-        WRITE_SYSREG64(generate_vttbr(INVALID_VMID, empty_root_mfn), VTTBR_EL2);
-        WRITE_SYSREG(READ_SYSREG(HCR_EL2) | HCR_VM, HCR_EL2);
-        isb();
-
-        flush_all_guests_tlb_local();
-    }
-}
-
-void __init setup_virt_paging(void)
-{
-    /* Setup Stage 2 address translation */
-    register_t val = VTCR_RES1|VTCR_SH0_IS|VTCR_ORGN0_WBWA|VTCR_IRGN0_WBWA;
-
-#ifdef CONFIG_ARM_32
-    if ( p2m_ipa_bits < 40 )
-        panic("P2M: Not able to support %u-bit IPA at the moment\n",
-              p2m_ipa_bits);
-
-    printk("P2M: 40-bit IPA\n");
-    p2m_ipa_bits = 40;
-    val |= VTCR_T0SZ(0x18); /* 40 bit IPA */
-    val |= VTCR_SL0(0x1); /* P2M starts at first level */
-#else /* CONFIG_ARM_64 */
-    static const struct {
-        unsigned int pabits; /* Physical Address Size */
-        unsigned int t0sz;   /* Desired T0SZ, minimum in comment */
-        unsigned int root_order; /* Page order of the root of the p2m */
-        unsigned int sl0;    /* Desired SL0, maximum in comment */
-    } pa_range_info[] __initconst = {
-        /* T0SZ minimum and SL0 maximum from ARM DDI 0487H.a Table D5-6 */
-        /*      PA size, t0sz(min), root-order, sl0(max) */
-        [0] = { 32,      32/*32*/,  0,          1 },
-        [1] = { 36,      28/*28*/,  0,          1 },
-        [2] = { 40,      24/*24*/,  1,          1 },
-        [3] = { 42,      22/*22*/,  3,          1 },
-        [4] = { 44,      20/*20*/,  0,          2 },
-        [5] = { 48,      16/*16*/,  0,          2 },
-        [6] = { 52,      12/*12*/,  4,          2 },
-        [7] = { 0 }  /* Invalid */
-    };
-
-    unsigned int i;
-    unsigned int pa_range = 0x10; /* Larger than any possible value */
-
-    /*
-     * Restrict "p2m_ipa_bits" if needed. As P2M table is always configured
-     * with IPA bits == PA bits, compare against "pabits".
-     */
-    if ( pa_range_info[system_cpuinfo.mm64.pa_range].pabits < p2m_ipa_bits )
-        p2m_ipa_bits = pa_range_info[system_cpuinfo.mm64.pa_range].pabits;
-
-    /*
-     * cpu info sanitization made sure we support 16bits VMID only if all
-     * cores are supporting it.
-     */
-    if ( system_cpuinfo.mm64.vmid_bits == MM64_VMID_16_BITS_SUPPORT )
-        max_vmid = MAX_VMID_16_BIT;
-
-    /* Choose suitable "pa_range" according to the resulted "p2m_ipa_bits". */
-    for ( i = 0; i < ARRAY_SIZE(pa_range_info); i++ )
-    {
-        if ( p2m_ipa_bits == pa_range_info[i].pabits )
-        {
-            pa_range = i;
-            break;
-        }
-    }
-
-    /* pa_range is 4 bits but we don't support all modes */
-    if ( pa_range >= ARRAY_SIZE(pa_range_info) || !pa_range_info[pa_range].pabits )
-        panic("Unknown encoding of ID_AA64MMFR0_EL1.PARange %x\n", pa_range);
-
-    val |= VTCR_PS(pa_range);
-    val |= VTCR_TG0_4K;
-
-    /* Set the VS bit only if 16 bit VMID is supported. */
-    if ( MAX_VMID == MAX_VMID_16_BIT )
-        val |= VTCR_VS;
-    val |= VTCR_SL0(pa_range_info[pa_range].sl0);
-    val |= VTCR_T0SZ(pa_range_info[pa_range].t0sz);
-
-    p2m_root_order = pa_range_info[pa_range].root_order;
-    p2m_root_level = 2 - pa_range_info[pa_range].sl0;
-    p2m_ipa_bits = 64 - pa_range_info[pa_range].t0sz;
-
-    printk("P2M: %d-bit IPA with %d-bit PA and %d-bit VMID\n",
-           p2m_ipa_bits,
-           pa_range_info[pa_range].pabits,
-           ( MAX_VMID == MAX_VMID_16_BIT ) ? 16 : 8);
-#endif
-    printk("P2M: %d levels with order-%d root, VTCR 0x%"PRIregister"\n",
-           4 - P2M_ROOT_LEVEL, P2M_ROOT_ORDER, val);
-
-    p2m_vmid_allocator_init();
-
-    /* It is not allowed to concatenate a level zero root */
-    BUG_ON( P2M_ROOT_LEVEL == 0 && P2M_ROOT_ORDER > 0 );
-    vtcr = val;
-
-    /*
-     * ARM64_WORKAROUND_AT_SPECULATE requires to allocate root table
-     * with all entries zeroed.
-     */
-    if ( cpus_have_cap(ARM64_WORKAROUND_AT_SPECULATE) )
-    {
-        struct page_info *root;
-
-        root = p2m_allocate_root();
-        if ( !root )
-            panic("Unable to allocate root table for 
ARM64_WORKAROUND_AT_SPECULATE\n");
-
-        empty_root_mfn = page_to_mfn(root);
-    }
-
-    setup_virt_paging_one(NULL);
-    smp_call_function(setup_virt_paging_one, NULL, 1);
-}
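To make the table-driven configuration above concrete, here is how a single row is consumed, taking the pa_range_info entry for a 40-bit PA together with the assignments just above:

    /* pa_range_info[2]: pabits = 40, t0sz = 24, root_order = 1, sl0 = 1  */
    /* p2m_ipa_bits   = 64 - 24 = 40-bit IPA space                        */
    /* p2m_root_level = 2 - 1   = 1 (the P2M starts at the first level)   */
    /* p2m_root_order = 1       -> root is 2 concatenated 4 KiB pages     */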
-
-static int cpu_virt_paging_callback(struct notifier_block *nfb,
-                                    unsigned long action,
-                                    void *hcpu)
-{
-    switch ( action )
-    {
-    case CPU_STARTING:
-        ASSERT(system_state != SYS_STATE_boot);
-        setup_virt_paging_one(NULL);
-        break;
-    default:
-        break;
-    }
-
-    return NOTIFY_DONE;
-}
-
-static struct notifier_block cpu_virt_paging_nfb = {
-    .notifier_call = cpu_virt_paging_callback,
-};
-
-static int __init cpu_virt_paging_init(void)
-{
-    register_cpu_notifier(&cpu_virt_paging_nfb);
-
-    return 0;
-}
-/*
- * Initialization of the notifier has to be done at init rather than presmp_init
- * phase because: the registered notifier is used to setup virtual paging for
- * non-boot CPUs after the initial virtual paging for all CPUs is already setup,
- * i.e. when a non-boot CPU is hotplugged after the system has booted. In other
- * words, the notifier should be registered after the virtual paging is
- * initially setup (setup_virt_paging() is called from start_xen()). This is
- * required because vtcr config value has to be set before a notifier can fire.
- */
-__initcall(cpu_virt_paging_init);
-
 /*
  * Local variables:
  * mode: C
diff --git a/xen/arch/arm/p2m_mmu.c b/xen/arch/arm/p2m_mmu.c
new file mode 100644
index 0000000000..88a9d8f392
--- /dev/null
+++ b/xen/arch/arm/p2m_mmu.c
@@ -0,0 +1,2295 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <xen/cpu.h>
+#include <xen/domain_page.h>
+#include <xen/iocap.h>
+#include <xen/ioreq.h>
+#include <xen/lib.h>
+#include <xen/sched.h>
+#include <xen/softirq.h>
+
+#include <asm/alternative.h>
+#include <asm/event.h>
+#include <asm/flushtlb.h>
+#include <asm/guest_walk.h>
+#include <asm/page.h>
+#include <asm/traps.h>
+
+#define MAX_VMID_8_BIT  (1UL << 8)
+#define MAX_VMID_16_BIT (1UL << 16)
+
+#define INVALID_VMID 0 /* VMID 0 is reserved */
+
+#ifdef CONFIG_ARM_64
+static unsigned int __read_mostly max_vmid = MAX_VMID_8_BIT;
+/* VMID is by default 8 bit width on AArch64 */
+#define MAX_VMID       max_vmid
+#else
+/* VMID is always 8 bit width on AArch32 */
+#define MAX_VMID        MAX_VMID_8_BIT
+#endif
+
+#ifdef CONFIG_ARM_64
+unsigned int __read_mostly p2m_root_order;
+unsigned int __read_mostly p2m_root_level;
+#endif
+
+#define P2M_ROOT_PAGES    (1<<P2M_ROOT_ORDER)
+
+static mfn_t __read_mostly empty_root_mfn;
+
+static uint64_t generate_vttbr(uint16_t vmid, mfn_t root_mfn)
+{
+    return (mfn_to_maddr(root_mfn) | ((uint64_t)vmid << 48));
+}
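A quick sanity check of the layout produced by the helper above (the VMID lands in bits [63:48], the root table address in the low bits); the values are made up for illustration:

    /* Example: vmid = 5, root table at physical address 0x80000000   */
    /* vttbr = 0x80000000 | (5ULL << 48) = 0x0005000080000000         */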
+
+static struct page_info *p2m_alloc_page(struct domain *d)
+{
+    struct page_info *pg;
+
+    spin_lock(&d->arch.paging.lock);
+    /*
+     * For hardware domain, there should be no limit in the number of pages that
+     * can be allocated, so that the kernel may take advantage of the extended
+     * regions. Hence, allocate p2m pages for hardware domains from heap.
+     */
+    if ( is_hardware_domain(d) )
+    {
+        pg = alloc_domheap_page(NULL, 0);
+        if ( pg == NULL )
+        {
+            printk(XENLOG_G_ERR "Failed to allocate P2M pages for hwdom.\n");
+            spin_unlock(&d->arch.paging.lock);
+            return NULL;
+        }
+    }
+    else
+    {
+        pg = page_list_remove_head(&d->arch.paging.p2m_freelist);
+        if ( unlikely(!pg) )
+        {
+            spin_unlock(&d->arch.paging.lock);
+            return NULL;
+        }
+        d->arch.paging.p2m_total_pages--;
+    }
+    spin_unlock(&d->arch.paging.lock);
+
+    return pg;
+}
+
+static void p2m_free_page(struct domain *d, struct page_info *pg)
+{
+    spin_lock(&d->arch.paging.lock);
+    if ( is_hardware_domain(d) )
+        free_domheap_page(pg);
+    else
+    {
+        d->arch.paging.p2m_total_pages++;
+        page_list_add_tail(pg, &d->arch.paging.p2m_freelist);
+    }
+    spin_unlock(&d->arch.paging.lock);
+}
+
+/* Unlock the flush and do a P2M TLB flush if necessary */
+void p2m_write_unlock(struct p2m_domain *p2m)
+{
+    /*
+     * The final flush is done with the P2M write lock taken to avoid
+     * someone else modifying the P2M before the TLB invalidation has
+     * completed.
+     */
+    p2m_tlb_flush_sync(p2m);
+
+    write_unlock(&p2m->lock);
+}
+
+void p2m_dump_info(struct domain *d)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+    p2m_read_lock(p2m);
+    printk("p2m mappings for domain %d (vmid %d):\n",
+           d->domain_id, p2m->vmid);
+    BUG_ON(p2m->stats.mappings[0] || p2m->stats.shattered[0]);
+    printk("  1G mappings: %ld (shattered %ld)\n",
+           p2m->stats.mappings[1], p2m->stats.shattered[1]);
+    printk("  2M mappings: %ld (shattered %ld)\n",
+           p2m->stats.mappings[2], p2m->stats.shattered[2]);
+    printk("  4K mappings: %ld\n", p2m->stats.mappings[3]);
+    p2m_read_unlock(p2m);
+}
+
+void dump_p2m_lookup(struct domain *d, paddr_t addr)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+    printk("dom%d IPA 0x%"PRIpaddr"\n", d->domain_id, addr);
+
+    printk("P2M @ %p mfn:%#"PRI_mfn"\n",
+           p2m->root, mfn_x(page_to_mfn(p2m->root)));
+
+    dump_pt_walk(page_to_maddr(p2m->root), addr,
+                 P2M_ROOT_LEVEL, P2M_ROOT_PAGES);
+}
+
+/*
+ * p2m_save_state and p2m_restore_state work as a pair to work around
+ * ARM64_WORKAROUND_AT_SPECULATE. p2m_save_state will set-up VTTBR to
+ * point to the empty page-tables to stop allocating TLB entries.
+ */
+void p2m_save_state(struct vcpu *p)
+{
+    p->arch.sctlr = READ_SYSREG(SCTLR_EL1);
+
+    if ( cpus_have_const_cap(ARM64_WORKAROUND_AT_SPECULATE) )
+    {
+        WRITE_SYSREG64(generate_vttbr(INVALID_VMID, empty_root_mfn), VTTBR_EL2);
+        /*
+         * Ensure VTTBR_EL2 is correctly synchronized so we can restore
+         * the next vCPU context without worrying about AT instruction
+         * speculation.
+         */
+        isb();
+    }
+}
+
+void p2m_restore_state(struct vcpu *n)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(n->domain);
+    uint8_t *last_vcpu_ran;
+
+    if ( is_idle_vcpu(n) )
+        return;
+
+    WRITE_SYSREG(n->arch.sctlr, SCTLR_EL1);
+    WRITE_SYSREG(n->arch.hcr_el2, HCR_EL2);
+
+    /*
+     * ARM64_WORKAROUND_AT_SPECULATE: VTTBR_EL2 should be restored after all
+     * registers associated to EL1/EL0 translations regime have been
+     * synchronized.
+     */
+    asm volatile(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_AT_SPECULATE));
+    WRITE_SYSREG64(p2m->vttbr, VTTBR_EL2);
+
+    last_vcpu_ran = &p2m->last_vcpu_ran[smp_processor_id()];
+
+    /*
+     * While we are restoring an out-of-context translation regime
+     * we still need to ensure:
+     *  - VTTBR_EL2 is synchronized before flushing the TLBs
+     *  - All registers for EL1 are synchronized before executing an AT
+     *  instruction targeting S1/S2.
+     */
+    isb();
+
+    /*
+     * Flush local TLB for the domain to prevent wrong TLB translation
+     * when running multiple vCPU of the same domain on a single pCPU.
+     */
+    if ( *last_vcpu_ran != INVALID_VCPU_ID && *last_vcpu_ran != n->vcpu_id )
+        flush_guest_tlb_local();
+
+    *last_vcpu_ran = n->vcpu_id;
+}
+
+/*
+ * Force a synchronous P2M TLB flush.
+ *
+ * Must be called with the p2m lock held.
+ */
+static void p2m_force_tlb_flush_sync(struct p2m_domain *p2m)
+{
+    unsigned long flags = 0;
+    uint64_t ovttbr;
+
+    ASSERT(p2m_is_write_locked(p2m));
+
+    /*
+     * ARM only provides an instruction to flush TLBs for the current
+     * VMID. So switch to the VTTBR of a given P2M if different.
+     */
+    ovttbr = READ_SYSREG64(VTTBR_EL2);
+    if ( ovttbr != p2m->vttbr )
+    {
+        uint64_t vttbr;
+
+        local_irq_save(flags);
+
+        /*
+         * ARM64_WORKAROUND_AT_SPECULATE: We need to stop AT from allocating
+         * TLB entries because the context is partially modified. We
+         * only need the VMID for flushing the TLBs, so we can generate
+         * a new VTTBR with the VMID to flush and the empty root table.
+         */
+        if ( !cpus_have_const_cap(ARM64_WORKAROUND_AT_SPECULATE) )
+            vttbr = p2m->vttbr;
+        else
+            vttbr = generate_vttbr(p2m->vmid, empty_root_mfn);
+
+        WRITE_SYSREG64(vttbr, VTTBR_EL2);
+
+        /* Ensure VTTBR_EL2 is synchronized before flushing the TLBs */
+        isb();
+    }
+
+    flush_guest_tlb();
+
+    if ( ovttbr != READ_SYSREG64(VTTBR_EL2) )
+    {
+        WRITE_SYSREG64(ovttbr, VTTBR_EL2);
+        /* Ensure VTTBR_EL2 is back in place before continuing. */
+        isb();
+        local_irq_restore(flags);
+    }
+
+    p2m->need_flush = false;
+}
+
+void p2m_tlb_flush_sync(struct p2m_domain *p2m)
+{
+    if ( p2m->need_flush )
+        p2m_force_tlb_flush_sync(p2m);
+}
+
+/*
+ * Find and map the root page table. The caller is responsible for
+ * unmapping the table.
+ *
+ * The function will return NULL if the offset of the root table is
+ * invalid.
+ */
+static lpae_t *p2m_get_root_pointer(struct p2m_domain *p2m,
+                                    gfn_t gfn)
+{
+    unsigned long root_table;
+
+    /*
+     * While the root table index is the offset from the previous level,
+     * we can't use (P2M_ROOT_LEVEL - 1) because the root level might be
+     * 0. Yet we still want to check if all the unused bits are zeroed.
+     */
+    root_table = gfn_x(gfn) >> (XEN_PT_LEVEL_ORDER(P2M_ROOT_LEVEL) +
+                                XEN_PT_LPAE_SHIFT);
+    if ( root_table >= P2M_ROOT_PAGES )
+        return NULL;
+
+    return __map_domain_page(p2m->root + root_table);
+}
+
+/*
+ * Lookup the MFN corresponding to a domain's GFN.
+ * Lookup mem access in the radix tree.
+ * The entry associated with the GFN is considered valid.
+ */
+static p2m_access_t p2m_mem_access_radix_get(struct p2m_domain *p2m, gfn_t gfn)
+{
+    void *ptr;
+
+    if ( !p2m->mem_access_enabled )
+        return p2m->default_access;
+
+    ptr = radix_tree_lookup(&p2m->mem_access_settings, gfn_x(gfn));
+    if ( !ptr )
+        return p2m_access_rwx;
+    else
+        return radix_tree_ptr_to_int(ptr);
+}
+
+/*
+ * In the case of the P2M, the valid bit is used for other purpose. Use
+ * the type to check whether an entry is valid.
+ */
+static inline bool p2m_is_valid(lpae_t pte)
+{
+    return pte.p2m.type != p2m_invalid;
+}
+
+/*
+ * lpae_is_* helpers don't check whether the valid bit is set in the
+ * PTE. Provide our own overlay to check the valid bit.
+ */
+static inline bool p2m_is_mapping(lpae_t pte, unsigned int level)
+{
+    return p2m_is_valid(pte) && lpae_is_mapping(pte, level);
+}
+
+static inline bool p2m_is_superpage(lpae_t pte, unsigned int level)
+{
+    return p2m_is_valid(pte) && lpae_is_superpage(pte, level);
+}
+
+#define GUEST_TABLE_MAP_FAILED 0
+#define GUEST_TABLE_SUPER_PAGE 1
+#define GUEST_TABLE_NORMAL_PAGE 2
+
+static int p2m_create_table(struct p2m_domain *p2m, lpae_t *entry);
+
+/*
+ * Take the currently mapped table, find the corresponding GFN entry,
+ * and map the next table, if available. The previous table will be
+ * unmapped if the next level was mapped (e.g GUEST_TABLE_NORMAL_PAGE
+ * returned).
+ *
+ * The read_only parameter indicates whether intermediate tables should
+ * be allocated when not present.
+ *
+ * Return values:
+ *  GUEST_TABLE_MAP_FAILED: Either read_only was set and the entry
+ *  was empty, or allocating a new page failed.
+ *  GUEST_TABLE_NORMAL_PAGE: next level mapped normally
+ *  GUEST_TABLE_SUPER_PAGE: The next entry points to a superpage.
+ */
+static int p2m_next_level(struct p2m_domain *p2m, bool read_only,
+                          unsigned int level, lpae_t **table,
+                          unsigned int offset)
+{
+    lpae_t *entry;
+    int ret;
+    mfn_t mfn;
+
+    entry = *table + offset;
+
+    if ( !p2m_is_valid(*entry) )
+    {
+        if ( read_only )
+            return GUEST_TABLE_MAP_FAILED;
+
+        ret = p2m_create_table(p2m, entry);
+        if ( ret )
+            return GUEST_TABLE_MAP_FAILED;
+    }
+
+    /* The function p2m_next_level is never called at the 3rd level */
+    ASSERT(level < 3);
+    if ( p2m_is_mapping(*entry, level) )
+        return GUEST_TABLE_SUPER_PAGE;
+
+    mfn = lpae_get_mfn(*entry);
+
+    unmap_domain_page(*table);
+    *table = map_domain_page(mfn);
+
+    return GUEST_TABLE_NORMAL_PAGE;
+}
+
+/*
+ * Get the details of a given gfn.
+ *
+ * If the entry is present, the associated MFN will be returned and the
+ * access and type filled up. The page_order will correspond to the
+ * order of the mapping in the page table (i.e it could be a superpage).
+ *
+ * If the entry is not present, INVALID_MFN will be returned and the
+ * page_order will be set according to the order of the invalid range.
+ *
+ * valid will contain the value of bit[0] (i.e. the valid bit) of the
+ * entry.
+ */
+mfn_t p2m_get_entry(struct p2m_domain *p2m, gfn_t gfn,
+                    p2m_type_t *t, p2m_access_t *a,
+                    unsigned int *page_order,
+                    bool *valid)
+{
+    paddr_t addr = gfn_to_gaddr(gfn);
+    unsigned int level = 0;
+    lpae_t entry, *table;
+    int rc;
+    mfn_t mfn = INVALID_MFN;
+    p2m_type_t _t;
+    DECLARE_OFFSETS(offsets, addr);
+
+    ASSERT(p2m_is_locked(p2m));
+    BUILD_BUG_ON(THIRD_MASK != PAGE_MASK);
+
+    /* Allow t to be NULL */
+    t = t ?: &_t;
+
+    *t = p2m_invalid;
+
+    if ( valid )
+        *valid = false;
+
+    /* XXX: Check if the mapping is lower than the mapped gfn */
+
+    /* This gfn is higher than the highest the p2m map currently holds */
+    if ( gfn_x(gfn) > gfn_x(p2m->max_mapped_gfn) )
+    {
+        for ( level = P2M_ROOT_LEVEL; level < 3; level++ )
+            if ( (gfn_x(gfn) & (XEN_PT_LEVEL_MASK(level) >> PAGE_SHIFT)) >
+                 gfn_x(p2m->max_mapped_gfn) )
+                break;
+
+        goto out;
+    }
+
+    table = p2m_get_root_pointer(p2m, gfn);
+
+    /*
+     * The table should always be non-NULL because the gfn is below
+     * p2m->max_mapped_gfn and the root table pages are always present.
+     */
+    if ( !table )
+    {
+        ASSERT_UNREACHABLE();
+        level = P2M_ROOT_LEVEL;
+        goto out;
+    }
+
+    for ( level = P2M_ROOT_LEVEL; level < 3; level++ )
+    {
+        rc = p2m_next_level(p2m, true, level, &table, offsets[level]);
+        if ( rc == GUEST_TABLE_MAP_FAILED )
+            goto out_unmap;
+        else if ( rc != GUEST_TABLE_NORMAL_PAGE )
+            break;
+    }
+
+    entry = table[offsets[level]];
+
+    if ( p2m_is_valid(entry) )
+    {
+        *t = entry.p2m.type;
+
+        if ( a )
+            *a = p2m_mem_access_radix_get(p2m, gfn);
+
+        mfn = lpae_get_mfn(entry);
+        /*
+         * The entry may point to a superpage. Find the MFN associated
+         * to the GFN.
+         */
+        mfn = mfn_add(mfn,
+                      gfn_x(gfn) & ((1UL << XEN_PT_LEVEL_ORDER(level)) - 1));
+
+        if ( valid )
+            *valid = lpae_is_valid(entry);
+    }
+
+out_unmap:
+    unmap_domain_page(table);
+
+out:
+    if ( page_order )
+        *page_order = XEN_PT_LEVEL_ORDER(level);
+
+    return mfn;
+}
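Worked example for the superpage adjustment in p2m_get_entry() above, with made-up values: a 2 MiB block found at level 2 has XEN_PT_LEVEL_ORDER(2) = 9, so the low 9 GFN bits are added back to the block's base MFN:

    /* Example level-2 block: gfns 0x400-0x5ff backed by mfns 0x1000-0x11ff */
    /* Lookup of gfn 0x4a3: mfn = 0x1000 + (0x4a3 & 0x1ff) = 0x10a3         */
    /* page_order is reported as 9 (2 MiB) in this case.                    */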
+
+mfn_t p2m_lookup(struct domain *d, gfn_t gfn, p2m_type_t *t)
+{
+    mfn_t mfn;
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+    p2m_read_lock(p2m);
+    mfn = p2m_get_entry(p2m, gfn, t, NULL, NULL, NULL);
+    p2m_read_unlock(p2m);
+
+    return mfn;
+}
+
+struct page_info *p2m_get_page_from_gfn(struct domain *d, gfn_t gfn,
+                                        p2m_type_t *t)
+{
+    struct page_info *page;
+    p2m_type_t p2mt;
+    mfn_t mfn = p2m_lookup(d, gfn, &p2mt);
+
+    if ( t )
+        *t = p2mt;
+
+    if ( !p2m_is_any_ram(p2mt) )
+        return NULL;
+
+    if ( !mfn_valid(mfn) )
+        return NULL;
+
+    page = mfn_to_page(mfn);
+
+    /*
+     * get_page won't work on foreign mapping because the page doesn't
+     * belong to the current domain.
+     */
+    if ( p2m_is_foreign(p2mt) )
+    {
+        struct domain *fdom = page_get_owner_and_reference(page);
+        ASSERT(fdom != NULL);
+        ASSERT(fdom != d);
+        return page;
+    }
+
+    return get_page(page, d) ? page : NULL;
+}
+
+static void p2m_set_permission(lpae_t *e, p2m_type_t t, p2m_access_t a)
+{
+    /* First apply type permissions */
+    switch ( t )
+    {
+    case p2m_ram_rw:
+        e->p2m.xn = 0;
+        e->p2m.write = 1;
+        break;
+
+    case p2m_ram_ro:
+        e->p2m.xn = 0;
+        e->p2m.write = 0;
+        break;
+
+    case p2m_iommu_map_rw:
+    case p2m_map_foreign_rw:
+    case p2m_grant_map_rw:
+    case p2m_mmio_direct_dev:
+    case p2m_mmio_direct_nc:
+    case p2m_mmio_direct_c:
+        e->p2m.xn = 1;
+        e->p2m.write = 1;
+        break;
+
+    case p2m_iommu_map_ro:
+    case p2m_map_foreign_ro:
+    case p2m_grant_map_ro:
+    case p2m_invalid:
+        e->p2m.xn = 1;
+        e->p2m.write = 0;
+        break;
+
+    case p2m_max_real_type:
+        BUG();
+        break;
+    }
+
+    /* Then restrict with access permissions */
+    switch ( a )
+    {
+    case p2m_access_rwx:
+        break;
+    case p2m_access_wx:
+        e->p2m.read = 0;
+        break;
+    case p2m_access_rw:
+        e->p2m.xn = 1;
+        break;
+    case p2m_access_w:
+        e->p2m.read = 0;
+        e->p2m.xn = 1;
+        break;
+    case p2m_access_rx:
+    case p2m_access_rx2rw:
+        e->p2m.write = 0;
+        break;
+    case p2m_access_x:
+        e->p2m.write = 0;
+        e->p2m.read = 0;
+        break;
+    case p2m_access_r:
+        e->p2m.write = 0;
+        e->p2m.xn = 1;
+        break;
+    case p2m_access_n:
+    case p2m_access_n2rwx:
+        e->p2m.read = e->p2m.write = 0;
+        e->p2m.xn = 1;
+        break;
+    }
+}
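To illustrate how the two switches above combine (the type sets the baseline, the access value can only restrict it further), starting from the default entry built by mfn_to_p2m_entry() below (read = 1); the pairs are arbitrary examples:

    /* p2m_ram_rw + p2m_access_rwx: read=1, write=1, xn=0 (full RAM access)    */
    /* p2m_ram_rw + p2m_access_r  : read=1, write=0, xn=1 (read-only, no exec) */
    /* p2m_ram_ro + p2m_access_rwx: read=1, write=0, xn=0 (type already RO)    */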
+
+static lpae_t mfn_to_p2m_entry(mfn_t mfn, p2m_type_t t, p2m_access_t a)
+{
+    /*
+     * sh, xn and write bit will be defined in the following switches
+     * based on mattr and t.
+     */
+    lpae_t e = (lpae_t) {
+        .p2m.af = 1,
+        .p2m.read = 1,
+        .p2m.table = 1,
+        .p2m.valid = 1,
+        .p2m.type = t,
+    };
+
+    BUILD_BUG_ON(p2m_max_real_type > (1 << 4));
+
+    switch ( t )
+    {
+    case p2m_mmio_direct_dev:
+        e.p2m.mattr = MATTR_DEV;
+        e.p2m.sh = LPAE_SH_OUTER;
+        break;
+
+    case p2m_mmio_direct_c:
+        e.p2m.mattr = MATTR_MEM;
+        e.p2m.sh = LPAE_SH_OUTER;
+        break;
+
+    /*
+     * ARM ARM: Overlaying the shareability attribute (DDI
+     * 0406C.b B3-1376 to 1377)
+     *
+     * A memory region with a resultant memory type attribute of Normal,
+     * and a resultant cacheability attribute of Inner Non-cacheable,
+     * Outer Non-cacheable, must have a resultant shareability attribute
+     * of Outer Shareable, otherwise shareability is UNPREDICTABLE.
+     *
+     * On ARMv8 shareability is ignored and explicitly treated as Outer
+     * Shareable for Normal Inner Non_cacheable, Outer Non-cacheable.
+     * See the note for table D4-40, in page 1788 of the ARM DDI 0487A.j.
+     */
+    case p2m_mmio_direct_nc:
+        e.p2m.mattr = MATTR_MEM_NC;
+        e.p2m.sh = LPAE_SH_OUTER;
+        break;
+
+    default:
+        e.p2m.mattr = MATTR_MEM;
+        e.p2m.sh = LPAE_SH_INNER;
+    }
+
+    p2m_set_permission(&e, t, a);
+
+    ASSERT(!(mfn_to_maddr(mfn) & ~PADDR_MASK));
+
+    lpae_set_mfn(e, mfn);
+
+    return e;
+}
+
+/* Generate table entry with correct attributes. */
+static lpae_t page_to_p2m_table(struct page_info *page)
+{
+    /*
+     * The access value does not matter because the hardware will ignore
+     * the permission fields for table entry.
+     *
+     * We use p2m_ram_rw so the entry has a valid type. This is important
+     * for p2m_is_valid() to return valid on table entries.
+     */
+    return mfn_to_p2m_entry(page_to_mfn(page), p2m_ram_rw, p2m_access_rwx);
+}
+
+static inline void p2m_write_pte(lpae_t *p, lpae_t pte, bool clean_pte)
+{
+    write_pte(p, pte);
+    if ( clean_pte )
+        clean_dcache(*p);
+}
+
+static inline void p2m_remove_pte(lpae_t *p, bool clean_pte)
+{
+    lpae_t pte;
+
+    memset(&pte, 0x00, sizeof(pte));
+    p2m_write_pte(p, pte, clean_pte);
+}
+
+/* Allocate a new page table page and hook it in via the given entry. */
+static int p2m_create_table(struct p2m_domain *p2m, lpae_t *entry)
+{
+    struct page_info *page;
+    lpae_t *p;
+
+    ASSERT(!p2m_is_valid(*entry));
+
+    page = p2m_alloc_page(p2m->domain);
+    if ( page == NULL )
+        return -ENOMEM;
+
+    page_list_add(page, &p2m->pages);
+
+    p = __map_domain_page(page);
+    clear_page(p);
+
+    if ( p2m->clean_pte )
+        clean_dcache_va_range(p, PAGE_SIZE);
+
+    unmap_domain_page(p);
+
+    p2m_write_pte(entry, page_to_p2m_table(page), p2m->clean_pte);
+
+    return 0;
+}
+
+static int p2m_mem_access_radix_set(struct p2m_domain *p2m, gfn_t gfn,
+                                    p2m_access_t a)
+{
+    int rc;
+
+    if ( !p2m->mem_access_enabled )
+        return 0;
+
+    if ( p2m_access_rwx == a )
+    {
+        radix_tree_delete(&p2m->mem_access_settings, gfn_x(gfn));
+        return 0;
+    }
+
+    rc = radix_tree_insert(&p2m->mem_access_settings, gfn_x(gfn),
+                           radix_tree_int_to_ptr(a));
+    if ( rc == -EEXIST )
+    {
+        /* If a setting already exists, change it to the new one */
+        radix_tree_replace_slot(
+            radix_tree_lookup_slot(
+                &p2m->mem_access_settings, gfn_x(gfn)),
+            radix_tree_int_to_ptr(a));
+        rc = 0;
+    }
+
+    return rc;
+}
+
+/*
+ * Put any references on the single 4K page referenced by pte.
+ * TODO: Handle superpages, for now we only take special references for leaf
+ * pages (specifically foreign ones, which can't be super mapped today).
+ */
+static void p2m_put_l3_page(const lpae_t pte)
+{
+    mfn_t mfn = lpae_get_mfn(pte);
+
+    ASSERT(p2m_is_valid(pte));
+
+    /*
+     * TODO: Handle other p2m types
+     *
+     * It's safe to do the put_page here because page_alloc will
+     * flush the TLBs if the page is reallocated before the end of
+     * this loop.
+     */
+    if ( p2m_is_foreign(pte.p2m.type) )
+    {
+        ASSERT(mfn_valid(mfn));
+        put_page(mfn_to_page(mfn));
+    }
+    /* Detect the xenheap page and mark the stored GFN as invalid. */
+    else if ( p2m_is_ram(pte.p2m.type) && is_xen_heap_mfn(mfn) )
+        page_set_xenheap_gfn(mfn_to_page(mfn), INVALID_GFN);
+}
+
+/* Free lpae sub-tree behind an entry */
+static void p2m_free_entry(struct p2m_domain *p2m,
+                           lpae_t entry, unsigned int level)
+{
+    unsigned int i;
+    lpae_t *table;
+    mfn_t mfn;
+    struct page_info *pg;
+
+    /* Nothing to do if the entry is invalid. */
+    if ( !p2m_is_valid(entry) )
+        return;
+
+    if ( p2m_is_superpage(entry, level) || (level == 3) )
+    {
+#ifdef CONFIG_IOREQ_SERVER
+        /*
+         * If this gets called then either the entry was replaced by an entry
+         * with a different base (valid case) or the shattering of a superpage
+         * has failed (error case).
+         * So, at worst, the spurious mapcache invalidation might be sent.
+         */
+        if ( p2m_is_ram(entry.p2m.type) &&
+             domain_has_ioreq_server(p2m->domain) )
+            ioreq_request_mapcache_invalidate(p2m->domain);
+#endif
+
+        p2m->stats.mappings[level]--;
+        /* Nothing to do if the entry is a super-page. */
+        if ( level == 3 )
+            p2m_put_l3_page(entry);
+        return;
+    }
+
+    table = map_domain_page(lpae_get_mfn(entry));
+    for ( i = 0; i < XEN_PT_LPAE_ENTRIES; i++ )
+        p2m_free_entry(p2m, *(table + i), level + 1);
+
+    unmap_domain_page(table);
+
+    /*
+     * Make sure all the references in the TLB have been removed before
+     * freeing the intermediate page table.
+     * XXX: Should we defer the free of the page table to avoid the
+     * flush?
+     */
+    p2m_tlb_flush_sync(p2m);
+
+    mfn = lpae_get_mfn(entry);
+    ASSERT(mfn_valid(mfn));
+
+    pg = mfn_to_page(mfn);
+
+    page_list_del(pg, &p2m->pages);
+    p2m_free_page(p2m->domain, pg);
+}
+
+static bool p2m_split_superpage(struct p2m_domain *p2m, lpae_t *entry,
+                                unsigned int level, unsigned int target,
+                                const unsigned int *offsets)
+{
+    struct page_info *page;
+    unsigned int i;
+    lpae_t pte, *table;
+    bool rv = true;
+
+    /* Convenience aliases */
+    mfn_t mfn = lpae_get_mfn(*entry);
+    unsigned int next_level = level + 1;
+    unsigned int level_order = XEN_PT_LEVEL_ORDER(next_level);
+
+    /*
+     * This should only be called with target != level and the entry is
+     * a superpage.
+     */
+    ASSERT(level < target);
+    ASSERT(p2m_is_superpage(*entry, level));
+
+    page = p2m_alloc_page(p2m->domain);
+    if ( !page )
+        return false;
+
+    page_list_add(page, &p2m->pages);
+    table = __map_domain_page(page);
+
+    /*
+     * We are either splitting a first level 1G page into 512 second level
+     * 2M pages, or a second level 2M page into 512 third level 4K pages.
+     */
+    for ( i = 0; i < XEN_PT_LPAE_ENTRIES; i++ )
+    {
+        lpae_t *new_entry = table + i;
+
+        /*
+         * Use the content of the superpage entry and override
+         * the necessary fields, so the correct permissions are kept.
+         */
+        pte = *entry;
+        lpae_set_mfn(pte, mfn_add(mfn, i << level_order));
+
+        /*
+         * First and second level pages set p2m.table = 0, but third
+         * level entries set p2m.table = 1.
+         */
+        pte.p2m.table = (next_level == 3);
+
+        write_pte(new_entry, pte);
+    }
+
+    /* Update stats */
+    p2m->stats.shattered[level]++;
+    p2m->stats.mappings[level]--;
+    p2m->stats.mappings[next_level] += XEN_PT_LPAE_ENTRIES;
+
+    /*
+     * Shatter the superpage down to the level at which we want to make
+     * the changes.
+     * This is done outside the loop to avoid checking the offset to
+     * know whether the entry should be shattered for every entry.
+     */
+    if ( next_level != target )
+        rv = p2m_split_superpage(p2m, table + offsets[next_level],
+                                 level + 1, target, offsets);
+
+    if ( p2m->clean_pte )
+        clean_dcache_va_range(table, PAGE_SIZE);
+
+    unmap_domain_page(table);
+
+    /*
+     * Even if we failed, we should install the newly allocated LPAE
+     * entry. The caller will be in charge of freeing the sub-tree.
+     */
+    p2m_write_pte(entry, page_to_p2m_table(page), p2m->clean_pte);
+
+    return rv;
+}
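Concretely, for the shattering above: splitting a level-1 (1 GiB) superpage with base MFN M produces a table of 512 level-2 entries, where entry i keeps the original attributes but maps M + (i << 9) (level_order for the next level being 9); p2m.table stays 0 because the new entries are still block mappings, and only a split down to level 3 sets it to 1. With a made-up M = 0x40000:

    /* new table[0]   maps mfn 0x40000 as a 2 MiB block   */
    /* new table[1]   maps mfn 0x40200                     */
    /* ...                                                 */
    /* new table[511] maps mfn 0x7fe00                     */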
+
+/*
+ * Insert an entry in the p2m. This should be called with a mapping
+ * equal to a page/superpage (4K, 2M, 1G).
+ */
+static int __p2m_set_entry(struct p2m_domain *p2m,
+                           gfn_t sgfn,
+                           unsigned int page_order,
+                           mfn_t smfn,
+                           p2m_type_t t,
+                           p2m_access_t a)
+{
+    unsigned int level = 0;
+    unsigned int target = 3 - (page_order / XEN_PT_LPAE_SHIFT);
+    lpae_t *entry, *table, orig_pte;
+    int rc;
+    /* A mapping is removed if the MFN is invalid. */
+    bool removing_mapping = mfn_eq(smfn, INVALID_MFN);
+    DECLARE_OFFSETS(offsets, gfn_to_gaddr(sgfn));
+
+    ASSERT(p2m_is_write_locked(p2m));
+
+    /*
+     * Check if the level target is valid: we only support
+     * 4K - 2M - 1G mapping.
+     */
+    ASSERT(target > 0 && target <= 3);
+
+    table = p2m_get_root_pointer(p2m, sgfn);
+    if ( !table )
+        return -EINVAL;
+
+    for ( level = P2M_ROOT_LEVEL; level < target; level++ )
+    {
+        /*
+         * Don't try to allocate intermediate page table if the mapping
+         * is about to be removed.
+         */
+        rc = p2m_next_level(p2m, removing_mapping,
+                            level, &table, offsets[level]);
+        if ( rc == GUEST_TABLE_MAP_FAILED )
+        {
+            /*
+             * We are here because p2m_next_level has failed to map
+             * the intermediate page table (e.g. the table does not exist
+             * and the p2m tree is read-only). It is a valid case
+             * when removing a mapping as it may not exist in the
+             * page table. In this case, just ignore it.
+             */
+            rc = removing_mapping ?  0 : -ENOENT;
+            goto out;
+        }
+        else if ( rc != GUEST_TABLE_NORMAL_PAGE )
+            break;
+    }
+
+    entry = table + offsets[level];
+
+    /*
+     * If we are here with level < target, we must be at a leaf node,
+     * and we need to break up the superpage.
+     */
+    if ( level < target )
+    {
+        /* We need to split the original page. */
+        lpae_t split_pte = *entry;
+
+        ASSERT(p2m_is_superpage(*entry, level));
+
+        if ( !p2m_split_superpage(p2m, &split_pte, level, target, offsets) )
+        {
+            /*
+             * The current super-page is still in-place, so re-increment
+             * the stats.
+             */
+            p2m->stats.mappings[level]++;
+
+            /* Free the allocated sub-tree */
+            p2m_free_entry(p2m, split_pte, level);
+
+            rc = -ENOMEM;
+            goto out;
+        }
+
+        /*
+         * Follow the break-before-make sequence to update the entry.
+         * For more details see (D4.7.1 in ARM DDI 0487A.j).
+         */
+        p2m_remove_pte(entry, p2m->clean_pte);
+        p2m_force_tlb_flush_sync(p2m);
+
+        p2m_write_pte(entry, split_pte, p2m->clean_pte);
+
+        /* then move to the level we want to make real changes */
+        for ( ; level < target; level++ )
+        {
+            rc = p2m_next_level(p2m, true, level, &table, offsets[level]);
+
+            /*
+             * The entry should be found and either be a table
+             * or a superpage if level 3 is not targeted
+             */
+            ASSERT(rc == GUEST_TABLE_NORMAL_PAGE ||
+                   (rc == GUEST_TABLE_SUPER_PAGE && target < 3));
+        }
+
+        entry = table + offsets[level];
+    }
+
+    /*
+     * We should always be there with the correct level because
+     * all the intermediate tables have been installed if necessary.
+     */
+    ASSERT(level == target);
+
+    orig_pte = *entry;
+
+    /*
+     * The radix-tree can only work on 4KB. This is only used when
+     * memaccess is enabled and during shutdown.
+     */
+    ASSERT(!p2m->mem_access_enabled || page_order == 0 ||
+           p2m->domain->is_dying);
+    /*
+     * The access type should always be p2m_access_rwx when the mapping
+     * is removed.
+     */
+    ASSERT(!mfn_eq(INVALID_MFN, smfn) || (a == p2m_access_rwx));
+    /*
+     * Update the mem access permission before updating the P2M, so we
+     * don't have to revert the mapping if it fails.
+     */
+    rc = p2m_mem_access_radix_set(p2m, sgfn, a);
+    if ( rc )
+        goto out;
+
+    /*
+     * Always remove the entry in order to follow the break-before-make
+     * sequence when updating the translation table (D4.7.1 in ARM DDI
+     * 0487A.j).
+     */
+    if ( lpae_is_valid(orig_pte) || removing_mapping )
+        p2m_remove_pte(entry, p2m->clean_pte);
+
+    if ( removing_mapping )
+        /* Flush can be deferred if the entry is removed */
+        p2m->need_flush |= !!lpae_is_valid(orig_pte);
+    else
+    {
+        lpae_t pte = mfn_to_p2m_entry(smfn, t, a);
+
+        if ( level < 3 )
+            pte.p2m.table = 0; /* Superpage entry */
+
+        /*
+         * It is necessary to flush the TLB before writing the new entry
+         * to keep coherency when the previous entry was valid.
+         *
+         * However, it could be deferred when only the permissions are
+         * changed (e.g. in case of memaccess).
+         */
+        if ( lpae_is_valid(orig_pte) )
+        {
+            if ( likely(!p2m->mem_access_enabled) ||
+                 P2M_CLEAR_PERM(pte) != P2M_CLEAR_PERM(orig_pte) )
+                p2m_force_tlb_flush_sync(p2m);
+            else
+                p2m->need_flush = true;
+        }
+        else if ( !p2m_is_valid(orig_pte) ) /* new mapping */
+            p2m->stats.mappings[level]++;
+
+        p2m_write_pte(entry, pte, p2m->clean_pte);
+
+        p2m->max_mapped_gfn = gfn_max(p2m->max_mapped_gfn,
+                                      gfn_add(sgfn, (1UL << page_order) - 1));
+        p2m->lowest_mapped_gfn = gfn_min(p2m->lowest_mapped_gfn, sgfn);
+    }
+
+    if ( is_iommu_enabled(p2m->domain) &&
+         (lpae_is_valid(orig_pte) || lpae_is_valid(*entry)) )
+    {
+        unsigned int flush_flags = 0;
+
+        if ( lpae_is_valid(orig_pte) )
+            flush_flags |= IOMMU_FLUSHF_modified;
+        if ( lpae_is_valid(*entry) )
+            flush_flags |= IOMMU_FLUSHF_added;
+
+        rc = iommu_iotlb_flush(p2m->domain, _dfn(gfn_x(sgfn)),
+                               1UL << page_order, flush_flags);
+    }
+    else
+        rc = 0;
+
+    /*
+     * Free the entry only if the original pte was valid and the base
+     * is different (to avoid freeing when permission is changed).
+     */
+    if ( p2m_is_valid(orig_pte) &&
+         !mfn_eq(lpae_get_mfn(*entry), lpae_get_mfn(orig_pte)) )
+        p2m_free_entry(p2m, orig_pte, level);
+
+out:
+    unmap_domain_page(table);
+
+    return rc;
+}
+
+int p2m_set_entry(struct p2m_domain *p2m,
+                  gfn_t sgfn,
+                  unsigned long nr,
+                  mfn_t smfn,
+                  p2m_type_t t,
+                  p2m_access_t a)
+{
+    int rc = 0;
+
+    /*
+     * Any reference taken by the P2M mappings (e.g. foreign mapping) will
+     * be dropped in relinquish_p2m_mapping(). As the P2M will still
+     * be accessible after, we need to prevent mapping to be added when the
+     * domain is dying.
+     */
+    if ( unlikely(p2m->domain->is_dying) )
+        return -ENOMEM;
+
+    while ( nr )
+    {
+        unsigned long mask;
+        unsigned long order;
+
+        /*
+         * Don't take the MFN into account when removing a mapping (i.e.
+         * INVALID_MFN) to calculate the correct target order.
+         *
+         * XXX: Support superpage mappings if nr is not aligned to a
+         * superpage size.
+         */
+        mask = !mfn_eq(smfn, INVALID_MFN) ? mfn_x(smfn) : 0;
+        mask |= gfn_x(sgfn) | nr;
+
+        /* Always map 4k by 4k when memaccess is enabled */
+        if ( unlikely(p2m->mem_access_enabled) )
+            order = THIRD_ORDER;
+        else if ( !(mask & ((1UL << FIRST_ORDER) - 1)) )
+            order = FIRST_ORDER;
+        else if ( !(mask & ((1UL << SECOND_ORDER) - 1)) )
+            order = SECOND_ORDER;
+        else
+            order = THIRD_ORDER;
+
+        rc = __p2m_set_entry(p2m, sgfn, order, smfn, t, a);
+        if ( rc )
+            break;
+
+        sgfn = gfn_add(sgfn, (1 << order));
+        if ( !mfn_eq(smfn, INVALID_MFN) )
+           smfn = mfn_add(smfn, (1 << order));
+
+        nr -= (1 << order);
+    }
+
+    return rc;
+}
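A short worked example of the order selection above, with made-up, suitably aligned values (FIRST_ORDER and SECOND_ORDER being the usual 18 and 9 for a 4 KiB granule):

    /* Example: sgfn = 0x80000, smfn = 0x40000, nr = 0x40000 (1 GiB of pages) */
    /*   mask = 0x80000 | 0x40000 | 0x40000 = 0xc0000, low 18 bits clear      */
    /*   -> a single __p2m_set_entry() call installing one 1 GiB mapping      */
    /* Same gfn/mfn but nr = 0x200 (2 MiB):                                   */
    /*   mask = 0xc0200, only the low 9 bits are clear -> one 2 MiB mapping   */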
+
+/* Invalidate all entries in the table. The p2m should be write locked. */
+static void p2m_invalidate_table(struct p2m_domain *p2m, mfn_t mfn)
+{
+    lpae_t *table;
+    unsigned int i;
+
+    ASSERT(p2m_is_write_locked(p2m));
+
+    table = map_domain_page(mfn);
+
+    for ( i = 0; i < XEN_PT_LPAE_ENTRIES; i++ )
+    {
+        lpae_t pte = table[i];
+
+        /*
+         * Writing an entry can be expensive because it may involve
+         * cleaning the cache. So avoid updating the entry if the valid
+         * bit is already cleared.
+         */
+        if ( !pte.p2m.valid )
+            continue;
+
+        pte.p2m.valid = 0;
+
+        p2m_write_pte(&table[i], pte, p2m->clean_pte);
+    }
+
+    unmap_domain_page(table);
+
+    p2m->need_flush = true;
+}
+
+/*
+ * Invalidate all entries in the root page-tables. This is
+ * useful to get fault on entry and do an action.
+ *
+ * p2m_invalidate_root() should not be called when the P2M is shared with
+ * the IOMMU because it will cause an IOMMU fault.
+ */
+void p2m_invalidate_root(struct p2m_domain *p2m)
+{
+    unsigned int i;
+
+    ASSERT(!iommu_use_hap_pt(p2m->domain));
+
+    p2m_write_lock(p2m);
+
+    for ( i = 0; i < P2M_ROOT_LEVEL; i++ )
+        p2m_invalidate_table(p2m, page_to_mfn(p2m->root + i));
+
+    p2m_write_unlock(p2m);
+}
+
+/*
+ * Resolve any translation fault due to change in the p2m. This
+ * includes break-before-make and valid bit cleared.
+ */
+bool p2m_resolve_translation_fault(struct domain *d, gfn_t gfn)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+    unsigned int level = 0;
+    bool resolved = false;
+    lpae_t entry, *table;
+
+    /* Convenience aliases */
+    DECLARE_OFFSETS(offsets, gfn_to_gaddr(gfn));
+
+    p2m_write_lock(p2m);
+
+    /* This gfn is higher than the highest the p2m map currently holds */
+    if ( gfn_x(gfn) > gfn_x(p2m->max_mapped_gfn) )
+        goto out;
+
+    table = p2m_get_root_pointer(p2m, gfn);
+    /*
+     * The table should always be non-NULL because the gfn is below
+     * p2m->max_mapped_gfn and the root table pages are always present.
+     */
+    if ( !table )
+    {
+        ASSERT_UNREACHABLE();
+        goto out;
+    }
+
+    /*
+     * Go down the page-tables until an entry has the valid bit unset or
+     * a block/page entry has been hit.
+     */
+    for ( level = P2M_ROOT_LEVEL; level <= 3; level++ )
+    {
+        int rc;
+
+        entry = table[offsets[level]];
+
+        if ( level == 3 )
+            break;
+
+        /* Stop as soon as we hit an entry with the valid bit unset. */
+        if ( !lpae_is_valid(entry) )
+            break;
+
+        rc = p2m_next_level(p2m, true, level, &table, offsets[level]);
+        if ( rc == GUEST_TABLE_MAP_FAILED )
+            goto out_unmap;
+        else if ( rc != GUEST_TABLE_NORMAL_PAGE )
+            break;
+    }
+
+    /*
+     * If the valid bit of the entry is set, it means someone was playing with
+     * the Stage-2 page table. Nothing to do and mark the fault as resolved.
+     */
+    if ( lpae_is_valid(entry) )
+    {
+        resolved = true;
+        goto out_unmap;
+    }
+
+    /*
+     * The valid bit is unset. If the entry is still not valid then the fault
+     * cannot be resolved, exit and report it.
+     */
+    if ( !p2m_is_valid(entry) )
+        goto out_unmap;
+
+    /*
+     * Now we have an entry with valid bit unset, but still valid from
+     * the P2M point of view.
+     *
+     * If an entry is pointing to a table, each entry of the table will
+     * have their valid bit cleared. This allows a function to clear the
+     * full p2m with just a couple of writes. The valid bit will then be
+     * propagated on the fault.
+     * If an entry is pointing to a block/page, no work to do for now.
+     */
+    if ( lpae_is_table(entry, level) )
+        p2m_invalidate_table(p2m, lpae_get_mfn(entry));
+
+    /*
+     * Now that the work on the entry is done, set the valid bit to prevent
+     * another fault on that entry.
+     */
+    resolved = true;
+    entry.p2m.valid = 1;
+
+    p2m_write_pte(table + offsets[level], entry, p2m->clean_pte);
+
+    /*
+     * No need to flush the TLBs as the modified entry had the valid bit
+     * unset.
+     */
+
+out_unmap:
+    unmap_domain_page(table);
+
+out:
+    p2m_write_unlock(p2m);
+
+    return resolved;
+}
+
+int p2m_insert_mapping(struct domain *d, gfn_t start_gfn, unsigned long nr,
+                       mfn_t mfn, p2m_type_t t)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+    int rc;
+
+    p2m_write_lock(p2m);
+    rc = p2m_set_entry(p2m, start_gfn, nr, mfn, t, p2m->default_access);
+    p2m_write_unlock(p2m);
+
+    return rc;
+}
+
+static inline int p2m_remove_mapping(struct domain *d,
+                                     gfn_t start_gfn,
+                                     unsigned long nr,
+                                     mfn_t mfn)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+    unsigned long i;
+    int rc;
+
+    p2m_write_lock(p2m);
+    /*
+     * Before removing the GFN - MFN mapping for any RAM pages make sure
+     * that there is no difference between what is already mapped and what
+     * is requested to be unmapped.
+     * If they don't match bail out early. For instance, this could happen
+     * if two CPUs are requesting to unmap the same P2M entry concurrently.
+     */
+    for ( i = 0; i < nr; )
+    {
+        unsigned int cur_order;
+        p2m_type_t t;
+        mfn_t mfn_return = p2m_get_entry(p2m, gfn_add(start_gfn, i), &t, NULL,
+                                         &cur_order, NULL);
+
+        if ( p2m_is_any_ram(t) &&
+             (!mfn_valid(mfn) || !mfn_eq(mfn_add(mfn, i), mfn_return)) )
+        {
+            rc = -EILSEQ;
+            goto out;
+        }
+
+        i += (1UL << cur_order) -
+             ((gfn_x(start_gfn) + i) & ((1UL << cur_order) - 1));
+    }
+
+    rc = p2m_set_entry(p2m, start_gfn, nr, INVALID_MFN,
+                       p2m_invalid, p2m_access_rwx);
+
+out:
+    p2m_write_unlock(p2m);
+
+    return rc;
+}
+
+int map_regions_p2mt(struct domain *d,
+                     gfn_t gfn,
+                     unsigned long nr,
+                     mfn_t mfn,
+                     p2m_type_t p2mt)
+{
+    return p2m_insert_mapping(d, gfn, nr, mfn, p2mt);
+}
+
+int unmap_regions_p2mt(struct domain *d,
+                       gfn_t gfn,
+                       unsigned long nr,
+                       mfn_t mfn)
+{
+    return p2m_remove_mapping(d, gfn, nr, mfn);
+}
+
+int map_mmio_regions(struct domain *d,
+                     gfn_t start_gfn,
+                     unsigned long nr,
+                     mfn_t mfn)
+{
+    return p2m_insert_mapping(d, start_gfn, nr, mfn, p2m_mmio_direct_dev);
+}
+
+int unmap_mmio_regions(struct domain *d,
+                       gfn_t start_gfn,
+                       unsigned long nr,
+                       mfn_t mfn)
+{
+    return p2m_remove_mapping(d, start_gfn, nr, mfn);
+}
+
+int map_dev_mmio_page(struct domain *d, gfn_t gfn, mfn_t mfn)
+{
+    int res;
+
+    if ( !iomem_access_permitted(d, mfn_x(mfn), mfn_x(mfn)) )
+        return 0;
+
+    res = p2m_insert_mapping(d, gfn, 1, mfn, p2m_mmio_direct_c);
+    if ( res < 0 )
+    {
+        printk(XENLOG_G_ERR "Unable to map MFN %#"PRI_mfn" in %pd\n",
+               mfn_x(mfn), d);
+        return res;
+    }
+
+    return 0;
+}
+
+int guest_physmap_add_entry(struct domain *d,
+                            gfn_t gfn,
+                            mfn_t mfn,
+                            unsigned long page_order,
+                            p2m_type_t t)
+{
+    return p2m_insert_mapping(d, gfn, (1 << page_order), mfn, t);
+}
+
+int guest_physmap_remove_page(struct domain *d, gfn_t gfn, mfn_t mfn,
+                              unsigned int page_order)
+{
+    return p2m_remove_mapping(d, gfn, (1 << page_order), mfn);
+}
+
+int set_foreign_p2m_entry(struct domain *d, const struct domain *fd,
+                          unsigned long gfn, mfn_t mfn)
+{
+    struct page_info *page = mfn_to_page(mfn);
+    int rc;
+
+    ASSERT(arch_acquire_resource_check(d));
+
+    if ( !get_page(page, fd) )
+        return -EINVAL;
+
+    /*
+     * It is valid to always use p2m_map_foreign_rw here as if this gets
+     * called then d != fd. A case when d == fd would be rejected by
+     * rcu_lock_remote_domain_by_id() earlier. Put a respective ASSERT()
+     * to catch incorrect usage in future.
+     */
+    ASSERT(d != fd);
+
+    rc = guest_physmap_add_entry(d, _gfn(gfn), mfn, 0, p2m_map_foreign_rw);
+    if ( rc )
+        put_page(page);
+
+    return rc;
+}
+
+static struct page_info *p2m_allocate_root(void)
+{
+    struct page_info *page;
+    unsigned int i;
+
+    page = alloc_domheap_pages(NULL, P2M_ROOT_ORDER, 0);
+    if ( page == NULL )
+        return NULL;
+
+    /* Clear both first level pages */
+    for ( i = 0; i < P2M_ROOT_PAGES; i++ )
+        clear_and_clean_page(page + i);
+
+    return page;
+}
+
+static int p2m_alloc_table(struct domain *d)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+    p2m->root = p2m_allocate_root();
+    if ( !p2m->root )
+        return -ENOMEM;
+
+    p2m->vttbr = generate_vttbr(p2m->vmid, page_to_mfn(p2m->root));
+
+    /*
+     * Make sure that all TLBs corresponding to the new VMID are flushed
+     * before using it
+     */
+    p2m_write_lock(p2m);
+    p2m_force_tlb_flush_sync(p2m);
+    p2m_write_unlock(p2m);
+
+    return 0;
+}
+
+
+static spinlock_t vmid_alloc_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * VTTBR_EL2 VMID field is 8 or 16 bits. AArch64 may support 16-bit VMID.
+ * Using a bitmap here limits us to 256 or 65536 (for AArch64) concurrent
+ * domains. The bitmap space will be allocated dynamically based on
+ * whether 8 or 16 bit VMIDs are supported.
+ */
+static unsigned long *vmid_mask;
+
+static void p2m_vmid_allocator_init(void)
+{
+    /*
+     * allocate space for vmid_mask based on MAX_VMID
+     */
+    vmid_mask = xzalloc_array(unsigned long, BITS_TO_LONGS(MAX_VMID));
+
+    if ( !vmid_mask )
+        panic("Could not allocate VMID bitmap space\n");
+
+    set_bit(INVALID_VMID, vmid_mask);
+}
+
+static int p2m_alloc_vmid(struct domain *d)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+    int rc, nr;
+
+    spin_lock(&vmid_alloc_lock);
+
+    nr = find_first_zero_bit(vmid_mask, MAX_VMID);
+
+    ASSERT(nr != INVALID_VMID);
+
+    if ( nr == MAX_VMID )
+    {
+        rc = -EBUSY;
+        printk(XENLOG_ERR "p2m.c: dom%d: VMID pool exhausted\n", d->domain_id);
+        goto out;
+    }
+
+    set_bit(nr, vmid_mask);
+
+    p2m->vmid = nr;
+
+    rc = 0;
+
+out:
+    spin_unlock(&vmid_alloc_lock);
+    return rc;
+}
+
+static void p2m_free_vmid(struct domain *d)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+    spin_lock(&vmid_alloc_lock);
+    if ( p2m->vmid != INVALID_VMID )
+        clear_bit(p2m->vmid, vmid_mask);
+
+    spin_unlock(&vmid_alloc_lock);
+}
+
+int p2m_teardown(struct domain *d, bool allow_preemption)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+    unsigned long count = 0;
+    struct page_info *pg;
+    unsigned int i;
+    int rc = 0;
+
+    if ( page_list_empty(&p2m->pages) )
+        return 0;
+
+    p2m_write_lock(p2m);
+
+    /*
+     * We are about to free the intermediate page-tables, so clear the
+     * root to prevent any walk from using them.
+     */
+    for ( i = 0; i < P2M_ROOT_PAGES; i++ )
+        clear_and_clean_page(p2m->root + i);
+
+    /*
+     * The domain will not be scheduled anymore, so in theory we should
+     * not need to flush the TLBs. Do it for safety purposes.
+     *
+     * Note that all the devices have already been de-assigned. So we don't
+     * need to flush the IOMMU TLB here.
+     */
+    p2m_force_tlb_flush_sync(p2m);
+
+    while ( (pg = page_list_remove_head(&p2m->pages)) )
+    {
+        p2m_free_page(p2m->domain, pg);
+        count++;
+        /* Arbitrarily preempt every 512 iterations */
+        if ( allow_preemption && !(count % 512) && hypercall_preempt_check() )
+        {
+            rc = -ERESTART;
+            break;
+        }
+    }
+
+    p2m_write_unlock(p2m);
+
+    return rc;
+}
+
+void p2m_final_teardown(struct domain *d)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+    /* p2m not actually initialized */
+    if ( !p2m->domain )
+        return;
+
+    /*
+     * No need to call relinquish_p2m_mapping() here because
+     * p2m_final_teardown() is called either after domain_relinquish_resources()
+     * where relinquish_p2m_mapping() has been called, or from failure path of
+     * domain_create()/arch_domain_create() where mappings that require
+     * p2m_put_l3_page() should never be created. For the latter case, also see
+     * comment on top of the p2m_set_entry() for more info.
+     */
+
+    BUG_ON(p2m_teardown(d, false));
+    ASSERT(page_list_empty(&p2m->pages));
+
+    while ( p2m_teardown_allocation(d) == -ERESTART )
+        continue; /* No preemption support here */
+    ASSERT(page_list_empty(&d->arch.paging.p2m_freelist));
+
+    if ( p2m->root )
+        free_domheap_pages(p2m->root, P2M_ROOT_ORDER);
+
+    p2m->root = NULL;
+
+    p2m_free_vmid(d);
+
+    radix_tree_destroy(&p2m->mem_access_settings, NULL);
+
+    p2m->domain = NULL;
+}
+
+int p2m_init(struct domain *d)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+    int rc;
+    unsigned int cpu;
+
+    rwlock_init(&p2m->lock);
+    spin_lock_init(&d->arch.paging.lock);
+    INIT_PAGE_LIST_HEAD(&p2m->pages);
+    INIT_PAGE_LIST_HEAD(&d->arch.paging.p2m_freelist);
+
+    p2m->vmid = INVALID_VMID;
+    p2m->max_mapped_gfn = _gfn(0);
+    p2m->lowest_mapped_gfn = _gfn(ULONG_MAX);
+
+    p2m->default_access = p2m_access_rwx;
+    p2m->mem_access_enabled = false;
+    radix_tree_init(&p2m->mem_access_settings);
+
+    /*
+     * Some IOMMUs don't support coherent PT walk. When the p2m is
+     * shared with the CPU, Xen has to make sure that the PT changes have
+     * reached the memory
+     */
+    p2m->clean_pte = is_iommu_enabled(d) &&
+        !iommu_has_feature(d, IOMMU_FEAT_COHERENT_WALK);
+
+    /*
+     * Make sure that the type chosen is able to store a vCPU ID between 0
+     * and the maximum number of virtual CPUs supported, as well as
+     * the INVALID_VCPU_ID.
+     */
+    BUILD_BUG_ON((1 << (sizeof(p2m->last_vcpu_ran[0]) * 8)) < MAX_VIRT_CPUS);
+    BUILD_BUG_ON((1 << (sizeof(p2m->last_vcpu_ran[0]) * 8)) < INVALID_VCPU_ID);
+
+    for_each_possible_cpu(cpu)
+       p2m->last_vcpu_ran[cpu] = INVALID_VCPU_ID;
+
+    /*
+     * "Trivial" initialisation is now complete.  Set the backpointer so
+     * p2m_teardown() and friends know to do something.
+     */
+    p2m->domain = d;
+
+    rc = p2m_alloc_vmid(d);
+    if ( rc )
+        return rc;
+
+    rc = p2m_alloc_table(d);
+    if ( rc )
+        return rc;
+
+    /*
+     * Hardware using GICv2 needs to create a P2M mapping of 8KB GICv2 area
+     * when the domain is created. Considering the worst case for page
+     * tables and keeping a buffer, populate 16 pages to the P2M pages pool here.
+     * For GICv3, the above-mentioned P2M mapping is not necessary, but the
+     * allocated 16 pages here would not be lost, hence populate these
+     * pages unconditionally.
+     */
+    spin_lock(&d->arch.paging.lock);
+    rc = p2m_set_allocation(d, 16, NULL);
+    spin_unlock(&d->arch.paging.lock);
+    if ( rc )
+        return rc;
+
+    return 0;
+}
+
+/*
+ * The function will go through the p2m and remove page reference when it
+ * is required. The mapping will be removed from the p2m.
+ *
+ * XXX: See whether the mapping can be left intact in the p2m.
+ */
+int relinquish_p2m_mapping(struct domain *d)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+    unsigned long count = 0;
+    p2m_type_t t;
+    int rc = 0;
+    unsigned int order;
+    gfn_t start, end;
+
+    BUG_ON(!d->is_dying);
+    /* No mappings can be added in the P2M after the P2M lock is released. */
+    p2m_write_lock(p2m);
+
+    start = p2m->lowest_mapped_gfn;
+    end = gfn_add(p2m->max_mapped_gfn, 1);
+
+    for ( ; gfn_x(start) < gfn_x(end);
+          start = gfn_next_boundary(start, order) )
+    {
+        mfn_t mfn = p2m_get_entry(p2m, start, &t, NULL, &order, NULL);
+
+        count++;
+        /*
+         * Arbitrarily preempt every 512 iterations.
+         */
+        if ( !(count % 512) && hypercall_preempt_check() )
+        {
+            rc = -ERESTART;
+            break;
+        }
+
+        /*
+         * p2m_set_entry will take care of removing reference on page
+         * when it is necessary and removing the mapping in the p2m.
+         */
+        if ( !mfn_eq(mfn, INVALID_MFN) )
+        {
+            /*
+             * For valid mapping, the start will always be aligned as
+             * entry will be removed whilst relinquishing.
+             */
+            rc = __p2m_set_entry(p2m, start, order, INVALID_MFN,
+                                 p2m_invalid, p2m_access_rwx);
+            if ( unlikely(rc) )
+            {
+                printk(XENLOG_G_ERR "Unable to remove mapping gfn=%#"PRI_gfn" 
order=%u from the p2m of domain %d\n", gfn_x(start), order, d->domain_id);
+                break;
+            }
+        }
+    }
+
+    /*
+     * Update lowest_mapped_gfn so on the next call we still start where
+     * we stopped.
+     */
+    p2m->lowest_mapped_gfn = start;
+
+    p2m_write_unlock(p2m);
+
+    return rc;
+}
+
+int p2m_cache_flush_range(struct domain *d, gfn_t *pstart, gfn_t end)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+    gfn_t next_block_gfn;
+    gfn_t start = *pstart;
+    mfn_t mfn = INVALID_MFN;
+    p2m_type_t t;
+    unsigned int order;
+    int rc = 0;
+    /* Counter for preemption */
+    unsigned short count = 0;
+
+    /*
+     * The operation cache flush will invalidate the RAM assigned to the
+     * guest in a given range. It will not modify the page table and
+     * flushing the cache whilst the page is used by another CPU is
+     * fine. So using read-lock is fine here.
+     */
+    p2m_read_lock(p2m);
+
+    start = gfn_max(start, p2m->lowest_mapped_gfn);
+    end = gfn_min(end, gfn_add(p2m->max_mapped_gfn, 1));
+
+    next_block_gfn = start;
+
+    while ( gfn_x(start) < gfn_x(end) )
+    {
+        /*
+         * Cleaning the cache for the P2M may take a long time. So we
+         * need to be able to preempt. We will arbitrarily preempt every
+         * time count reaches 512 or above.
+         *
+         * The count will be incremented by:
+         *  - 1 on region skipped
+         *  - 10 for each page requiring a flush
+         */
+        if ( count >= 512 )
+        {
+            if ( softirq_pending(smp_processor_id()) )
+            {
+                rc = -ERESTART;
+                break;
+            }
+            count = 0;
+        }
+
+        /*
+         * We want to flush page by page as:
+         *  - it may not be possible to map the full block (can be up to 1GB)
+         *    in Xen memory
+         *  - we may want to do fine-grained preemption as flushing multiple
+         *    pages in one go may take a long time
+         *
+         * As p2m_get_entry is able to return the size of the mapping
+         * in the p2m, it is pointless to execute it for each page.
+         *
+         * We can optimize it by tracking the gfn of the next
+         * block. So we will only call p2m_get_entry for each block (can
+         * be up to 1GB).
+         */
+        if ( gfn_eq(start, next_block_gfn) )
+        {
+            bool valid;
+
+            mfn = p2m_get_entry(p2m, start, &t, NULL, &order, &valid);
+            next_block_gfn = gfn_next_boundary(start, order);
+
+            if ( mfn_eq(mfn, INVALID_MFN) || !p2m_is_any_ram(t) || !valid )
+            {
+                count++;
+                start = next_block_gfn;
+                continue;
+            }
+        }
+
+        count += 10;
+
+        flush_page_to_ram(mfn_x(mfn), false);
+
+        start = gfn_add(start, 1);
+        mfn = mfn_add(mfn, 1);
+    }
+
+    if ( rc != -ERESTART )
+        invalidate_icache();
+
+    p2m_read_unlock(p2m);
+
+    *pstart = start;
+
+    return rc;
+}
+
+/*
+ * Clean & invalidate RAM associated to the guest vCPU.
+ *
+ * The function can only work with the current vCPU and should be called
+ * with IRQ enabled as the vCPU could get preempted.
+ */
+void p2m_flush_vm(struct vcpu *v)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
+    int rc;
+    gfn_t start = _gfn(0);
+
+    ASSERT(v == current);
+    ASSERT(local_irq_is_enabled());
+    ASSERT(v->arch.need_flush_to_ram);
+
+    do
+    {
+        rc = p2m_cache_flush_range(v->domain, &start, _gfn(ULONG_MAX));
+        if ( rc == -ERESTART )
+            do_softirq();
+    } while ( rc == -ERESTART );
+
+    if ( rc != 0 )
+        gprintk(XENLOG_WARNING,
+                "P2M has not been correctly cleaned (rc = %d)\n",
+                rc);
+
+    /*
+     * Invalidate the p2m to track which page was modified by the guest
+     * between calls of p2m_flush_vm().
+     */
+    p2m_invalidate_root(p2m);
+
+    v->arch.need_flush_to_ram = false;
+}
+
+/*
+ * See note at ARMv7 ARM B1.14.4 (DDI 0406C.c) (TL;DR: S/W ops are not
+ * easily virtualized).
+ *
+ * Main problems:
+ *  - S/W ops are local to a CPU (not broadcast)
+ *  - We have line migration behind our back (speculation)
+ *  - System caches don't support S/W at all (damn!)
+ *
+ * In the face of the above, the best we can do is to try and convert
+ * S/W ops to VA ops. Because the guest is not allowed to infer the S/W
+ * to PA mapping, it can only use S/W to nuke the whole cache, which is
+ * rather a good thing for us.
+ *
+ * Also, it is only used when turning caches on/off ("The expected
+ * usage of the cache maintenance instructions that operate by set/way
+ * is associated with the powerdown and powerup of caches, if this is
+ * required by the implementation.").
+ *
+ * We use the following policy:
+ *  - If we trap a S/W operation, we enable VM trapping to detect
+ *  caches being turned on/off, and do a full clean.
+ *
+ *  - We flush the caches on both caches being turned on and off.
+ *
+ *  - Once the caches are enabled, we stop trapping VM ops.
+ */
+void p2m_set_way_flush(struct vcpu *v, struct cpu_user_regs *regs,
+                       const union hsr hsr)
+{
+    /* This function can only work with the current vCPU. */
+    ASSERT(v == current);
+
+    if ( iommu_use_hap_pt(current->domain) )
+    {
+        gprintk(XENLOG_ERR,
+                "The cache should be flushed by VA rather than by set/way.\n");
+        inject_undef_exception(regs, hsr);
+        return;
+    }
+
+    if ( !(v->arch.hcr_el2 & HCR_TVM) )
+    {
+        v->arch.need_flush_to_ram = true;
+        vcpu_hcr_set_flags(v, HCR_TVM);
+    }
+}
+
+void p2m_toggle_cache(struct vcpu *v, bool was_enabled)
+{
+    bool now_enabled = vcpu_has_cache_enabled(v);
+
+    /* This function can only work with the current vCPU. */
+    ASSERT(v == current);
+
+    /*
+     * If switching the MMU+caches on, need to invalidate the caches.
+     * If switching it off, need to clean the caches.
+     * Clean + invalidate does the trick always.
+     */
+    if ( was_enabled != now_enabled )
+        v->arch.need_flush_to_ram = true;
+
+    /* Caches are now on, stop trapping VM ops (until a S/W op) */
+    if ( now_enabled )
+        vcpu_hcr_clear_flags(v, HCR_TVM);
+}
+
+mfn_t gfn_to_mfn(struct domain *d, gfn_t gfn)
+{
+    return p2m_lookup(d, gfn, NULL);
+}
+
+struct page_info *get_page_from_gva(struct vcpu *v, vaddr_t va,
+                                    unsigned long flags)
+{
+    struct domain *d = v->domain;
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+    struct page_info *page = NULL;
+    paddr_t maddr = 0;
+    uint64_t par;
+    mfn_t mfn;
+    p2m_type_t t;
+
+    /*
+     * XXX: To support a different vCPU, we would need to load the
+     * VTTBR_EL2, TTBR0_EL1, TTBR1_EL1 and SCTLR_EL1
+     */
+    if ( v != current )
+        return NULL;
+
+    /*
+     * The lock is here to protect us against the break-before-make
+     * sequence used when updating the entry.
+     */
+    p2m_read_lock(p2m);
+    par = gvirt_to_maddr(va, &maddr, flags);
+    p2m_read_unlock(p2m);
+
+    /*
+     * gvirt_to_maddr may fail if the entry does not have the valid bit
+     * set. Fallback to the second method:
+     *  1) Translate the VA to IPA using software lookup -> Stage-1 page-table
+     *  may not be accessible because the stage-2 entries may have valid
+     *  bit unset.
+     *  2) Software lookup of the MFN
+     *
+     * Note that when memaccess is enabled, we instead call directly
+     * p2m_mem_access_check_and_get_page(...). Because the function is a
+     * variant of the methods described above, it will be able to
+     * handle entries with valid bit unset.
+     *
+     * TODO: Integrate more nicely memaccess with the rest of the
+     * function.
+     * TODO: Use the fault error in PAR_EL1 to avoid pointless
+     *  translation.
+     */
+    if ( par )
+    {
+        paddr_t ipa;
+        unsigned int s1_perms;
+
+        /*
+         * When memaccess is enabled, the translation GVA to MADDR may
+         * have failed because of a permission fault.
+         */
+        if ( p2m->mem_access_enabled )
+            return p2m_mem_access_check_and_get_page(va, flags, v);
+
+        /*
+         * The software stage-1 table walk can still fail, e.g, if the
+         * GVA is not mapped.
+         */
+        if ( !guest_walk_tables(v, va, &ipa, &s1_perms) )
+        {
+            dprintk(XENLOG_G_DEBUG,
+                    "%pv: Failed to walk page-table va %#"PRIvaddr"\n", v, va);
+            return NULL;
+        }
+
+        mfn = p2m_lookup(d, gaddr_to_gfn(ipa), &t);
+        if ( mfn_eq(INVALID_MFN, mfn) || !p2m_is_ram(t) )
+            return NULL;
+
+        /*
+         * Check permissions that are assumed by the caller. For instance
+         * in case of guestcopy, the caller assumes that the translated
+         * page can be accessed with the requested permissions. If this
+         * is not the case, we should fail.
+         *
+         * Please note that we do not check for the GV2M_EXEC
+         * permission. This is fine because the hardware-based translation
+         * instruction does not test for execute permissions.
+         */
+        if ( (flags & GV2M_WRITE) && !(s1_perms & GV2M_WRITE) )
+            return NULL;
+
+        if ( (flags & GV2M_WRITE) && t != p2m_ram_rw )
+            return NULL;
+    }
+    else
+        mfn = maddr_to_mfn(maddr);
+
+    if ( !mfn_valid(mfn) )
+    {
+        dprintk(XENLOG_G_DEBUG, "%pv: Invalid MFN %#"PRI_mfn"\n",
+                v, mfn_x(mfn));
+        return NULL;
+    }
+
+    page = mfn_to_page(mfn);
+    ASSERT(page);
+
+    if ( unlikely(!get_page(page, d)) )
+    {
+        dprintk(XENLOG_G_DEBUG, "%pv: Failing to acquire the MFN 
%#"PRI_mfn"\n",
+                v, mfn_x(maddr_to_mfn(maddr)));
+        return NULL;
+    }
+
+    return page;
+}
+
+/* VTCR value to be configured by all CPUs. Set only once by the boot CPU */
+static register_t __read_mostly vtcr;
+
+static void setup_virt_paging_one(void *data)
+{
+    WRITE_SYSREG(vtcr, VTCR_EL2);
+
+    /*
+     * ARM64_WORKAROUND_AT_SPECULATE: We want to keep the TLBs free from
+     * entries related to EL1/EL0 translation regime until a guest vCPU
+     * is running. For that, we need to set-up VTTBR to point to an empty
+     * page-table and turn on stage-2 translation. The TLB entries
+     * associated with EL1/EL0 translation regime will also be flushed in case
+     * an AT instruction was speculated before hand.
+     */
+    if ( cpus_have_cap(ARM64_WORKAROUND_AT_SPECULATE) )
+    {
+        WRITE_SYSREG64(generate_vttbr(INVALID_VMID, empty_root_mfn), VTTBR_EL2);
+        WRITE_SYSREG(READ_SYSREG(HCR_EL2) | HCR_VM, HCR_EL2);
+        isb();
+
+        flush_all_guests_tlb_local();
+    }
+}
+
+void __init setup_virt_paging(void)
+{
+    /* Setup Stage 2 address translation */
+    register_t val = VTCR_RES1|VTCR_SH0_IS|VTCR_ORGN0_WBWA|VTCR_IRGN0_WBWA;
+
+#ifdef CONFIG_ARM_32
+    if ( p2m_ipa_bits < 40 )
+        panic("P2M: Not able to support %u-bit IPA at the moment\n",
+              p2m_ipa_bits);
+
+    printk("P2M: 40-bit IPA\n");
+    p2m_ipa_bits = 40;
+    val |= VTCR_T0SZ(0x18); /* 40 bit IPA */
+    val |= VTCR_SL0(0x1); /* P2M starts at first level */
+#else /* CONFIG_ARM_64 */
+    const struct {
+        unsigned int pabits; /* Physical Address Size */
+        unsigned int t0sz;   /* Desired T0SZ, minimum in comment */
+        unsigned int root_order; /* Page order of the root of the p2m */
+        unsigned int sl0;    /* Desired SL0, maximum in comment */
+    } pa_range_info[] = {
+        /* T0SZ minimum and SL0 maximum from ARM DDI 0487H.a Table D5-6 */
+        /*      PA size, t0sz(min), root-order, sl0(max) */
+        [0] = { 32,      32/*32*/,  0,          1 },
+        [1] = { 36,      28/*28*/,  0,          1 },
+        [2] = { 40,      24/*24*/,  1,          1 },
+        [3] = { 42,      22/*22*/,  3,          1 },
+        [4] = { 44,      20/*20*/,  0,          2 },
+        [5] = { 48,      16/*16*/,  0,          2 },
+        [6] = { 52,      12/*12*/,  4,          2 },
+        [7] = { 0 }  /* Invalid */
+    };
+
+    unsigned int i;
+    unsigned int pa_range = 0x10; /* Larger than any possible value */
+
+    /*
+     * Restrict "p2m_ipa_bits" if needed. As P2M table is always configured
+     * with IPA bits == PA bits, compare against "pabits".
+     */
+    if ( pa_range_info[system_cpuinfo.mm64.pa_range].pabits < p2m_ipa_bits )
+        p2m_ipa_bits = pa_range_info[system_cpuinfo.mm64.pa_range].pabits;
+
+    /*
+     * CPU info sanitization made sure we support 16-bit VMIDs only if all
+     * cores support it.
+     */
+    if ( system_cpuinfo.mm64.vmid_bits == MM64_VMID_16_BITS_SUPPORT )
+        max_vmid = MAX_VMID_16_BIT;
+
+    /* Choose suitable "pa_range" according to the resulted "p2m_ipa_bits". */
+    for ( i = 0; i < ARRAY_SIZE(pa_range_info); i++ )
+    {
+        if ( p2m_ipa_bits == pa_range_info[i].pabits )
+        {
+            pa_range = i;
+            break;
+        }
+    }
+
+    /* pa_range is 4 bits but we don't support all modes */
+    if ( pa_range >= ARRAY_SIZE(pa_range_info) || !pa_range_info[pa_range].pabits )
+        panic("Unknown encoding of ID_AA64MMFR0_EL1.PARange %x\n", pa_range);
+
+    val |= VTCR_PS(pa_range);
+    val |= VTCR_TG0_4K;
+
+    /* Set the VS bit only if 16 bit VMID is supported. */
+    if ( MAX_VMID == MAX_VMID_16_BIT )
+        val |= VTCR_VS;
+    val |= VTCR_SL0(pa_range_info[pa_range].sl0);
+    val |= VTCR_T0SZ(pa_range_info[pa_range].t0sz);
+
+    p2m_root_order = pa_range_info[pa_range].root_order;
+    p2m_root_level = 2 - pa_range_info[pa_range].sl0;
+    p2m_ipa_bits = 64 - pa_range_info[pa_range].t0sz;
+
+    printk("P2M: %d-bit IPA with %d-bit PA and %d-bit VMID\n",
+           p2m_ipa_bits,
+           pa_range_info[pa_range].pabits,
+           ( MAX_VMID == MAX_VMID_16_BIT ) ? 16 : 8);
+#endif
+    printk("P2M: %d levels with order-%d root, VTCR 0x%"PRIregister"\n",
+           4 - P2M_ROOT_LEVEL, P2M_ROOT_ORDER, val);
+
+    p2m_vmid_allocator_init();
+
+    /* It is not allowed to concatenate a level zero root */
+    BUG_ON( P2M_ROOT_LEVEL == 0 && P2M_ROOT_ORDER > 0 );
+    vtcr = val;
+
+    /*
+     * ARM64_WORKAROUND_AT_SPECULATE requires to allocate root table
+     * with all entries zeroed.
+     */
+    if ( cpus_have_cap(ARM64_WORKAROUND_AT_SPECULATE) )
+    {
+        struct page_info *root;
+
+        root = p2m_allocate_root();
+        if ( !root )
+            panic("Unable to allocate root table for 
ARM64_WORKAROUND_AT_SPECULATE\n");
+
+        empty_root_mfn = page_to_mfn(root);
+    }
+
+    setup_virt_paging_one(NULL);
+    smp_call_function(setup_virt_paging_one, NULL, 1);
+}
+
+static int cpu_virt_paging_callback(struct notifier_block *nfb,
+                                    unsigned long action,
+                                    void *hcpu)
+{
+    switch ( action )
+    {
+    case CPU_STARTING:
+        ASSERT(system_state != SYS_STATE_boot);
+        setup_virt_paging_one(NULL);
+        break;
+    default:
+        break;
+    }
+
+    return NOTIFY_DONE;
+}
+
+static struct notifier_block cpu_virt_paging_nfb = {
+    .notifier_call = cpu_virt_paging_callback,
+};
+
+static int __init cpu_virt_paging_init(void)
+{
+    register_cpu_notifier(&cpu_virt_paging_nfb);
+
+    return 0;
+}
+/*
+ * Initialization of the notifier has to be done at init rather than presmp_init
+ * phase because: the registered notifier is used to setup virtual paging for
+ * non-boot CPUs after the initial virtual paging for all CPUs is already setup,
+ * i.e. when a non-boot CPU is hotplugged after the system has booted. In other
+ * words, the notifier should be registered after the virtual paging is
+ * initially setup (setup_virt_paging() is called from start_xen()). This is
+ * required because vtcr config value has to be set before a notifier can fire.
+ */
+__initcall(cpu_virt_paging_init);
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/arch/arm/p2m_mpu.c b/xen/arch/arm/p2m_mpu.c
new file mode 100644
index 0000000000..0a95d58111
--- /dev/null
+++ b/xen/arch/arm/p2m_mpu.c
@@ -0,0 +1,191 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <xen/lib.h>
+#include <xen/mm-frame.h>
+#include <xen/sched.h>
+
+#include <asm/p2m.h>
+
+/* TODO: Implement on the first usage */
+void p2m_write_unlock(struct p2m_domain *p2m)
+{
+}
+
+void p2m_dump_info(struct domain *d)
+{
+}
+
+void dump_p2m_lookup(struct domain *d, paddr_t addr)
+{
+}
+
+void p2m_save_state(struct vcpu *p)
+{
+}
+
+void p2m_restore_state(struct vcpu *n)
+{
+}
+
+mfn_t p2m_get_entry(struct p2m_domain *p2m, gfn_t gfn,
+                    p2m_type_t *t, p2m_access_t *a,
+                    unsigned int *page_order,
+                    bool *valid)
+{
+    return INVALID_MFN;
+}
+
+mfn_t p2m_lookup(struct domain *d, gfn_t gfn, p2m_type_t *t)
+{
+    return INVALID_MFN;
+}
+
+struct page_info *p2m_get_page_from_gfn(struct domain *d, gfn_t gfn,
+                                        p2m_type_t *t)
+{
+    return NULL;
+}
+
+int p2m_set_entry(struct p2m_domain *p2m,
+                  gfn_t sgfn,
+                  unsigned long nr,
+                  mfn_t smfn,
+                  p2m_type_t t,
+                  p2m_access_t a)
+{
+    return -ENOSYS;
+}
+
+void p2m_invalidate_root(struct p2m_domain *p2m)
+{
+}
+
+bool p2m_resolve_translation_fault(struct domain *d, gfn_t gfn)
+{
+    return false;
+}
+
+int p2m_insert_mapping(struct domain *d, gfn_t start_gfn, unsigned long nr,
+                       mfn_t mfn, p2m_type_t t)
+{
+    return -ENOSYS;
+}
+
+int map_regions_p2mt(struct domain *d,
+                     gfn_t gfn,
+                     unsigned long nr,
+                     mfn_t mfn,
+                     p2m_type_t p2mt)
+{
+    return -ENOSYS;
+}
+
+int unmap_regions_p2mt(struct domain *d,
+                       gfn_t gfn,
+                       unsigned long nr,
+                       mfn_t mfn)
+{
+    return -ENOSYS;
+}
+
+int map_mmio_regions(struct domain *d,
+                     gfn_t start_gfn,
+                     unsigned long nr,
+                     mfn_t mfn)
+{
+    return -ENOSYS;
+}
+
+int unmap_mmio_regions(struct domain *d,
+                       gfn_t start_gfn,
+                       unsigned long nr,
+                       mfn_t mfn)
+{
+    return -ENOSYS;
+}
+
+int map_dev_mmio_page(struct domain *d, gfn_t gfn, mfn_t mfn)
+{
+    return -ENOSYS;
+}
+
+int guest_physmap_add_entry(struct domain *d,
+                            gfn_t gfn,
+                            mfn_t mfn,
+                            unsigned long page_order,
+                            p2m_type_t t)
+{
+    return -ENOSYS;
+}
+
+int guest_physmap_remove_page(struct domain *d, gfn_t gfn, mfn_t mfn,
+                              unsigned int page_order)
+{
+    return -ENOSYS;
+}
+
+int set_foreign_p2m_entry(struct domain *d, const struct domain *fd,
+                          unsigned long gfn, mfn_t mfn)
+{
+    return -ENOSYS;
+}
+
+int p2m_teardown(struct domain *d, bool allow_preemption)
+{
+    return -ENOSYS;
+}
+
+void p2m_final_teardown(struct domain *d)
+{
+}
+
+int p2m_init(struct domain *d)
+{
+    return -ENOSYS;
+}
+
+int relinquish_p2m_mapping(struct domain *d)
+{
+    return -ENOSYS;
+}
+
+int p2m_cache_flush_range(struct domain *d, gfn_t *pstart, gfn_t end)
+{
+    return -ENOSYS;
+}
+
+void p2m_flush_vm(struct vcpu *v)
+{
+}
+
+void p2m_set_way_flush(struct vcpu *v, struct cpu_user_regs *regs,
+                       const union hsr hsr)
+{
+}
+
+void p2m_toggle_cache(struct vcpu *v, bool was_enabled)
+{
+}
+
+mfn_t gfn_to_mfn(struct domain *d, gfn_t gfn)
+{
+    return INVALID_MFN;
+}
+
+struct page_info *get_page_from_gva(struct vcpu *v, vaddr_t va,
+                                    unsigned long flags)
+{
+    return NULL;
+}
+
+void __init setup_virt_paging(void)
+{
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
-- 
2.25.1