Add hypercall (shadow op: enable/disable and clean/peek dirty page bitmap).
It consists of two parts: dirty page detection and saving.
For detection, we set up the guest p2m's leaf PTEs as read-only; whenever the guest
tries to write to such a page, a permission fault is raised and traps into Xen.
The faulting GPA must be saved for the toolstack (when it wants to see which
pages are dirtied). For this purpose, we temporarily record the GPAs in a bitmap.
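For reference (not part of the patch itself), the GPA-to-bitmap mapping used by
mark_dirty_bitmap() below amounts to the following sketch, with one bit per
guest page frame and PAGE_SIZE * 8 bits per bitmap page:

  /* illustrative sketch only, mirroring mark_dirty_bitmap() in mm.c */
  bit_index   = PFN_DOWN(gpa - GUEST_RAM_BASE);       /* which guest page frame */
  page_index  = bit_index >> (PAGE_SHIFT + 3);        /* which bitmap page */
  bit_in_page = bit_index & ((1UL << (PAGE_SHIFT + 3)) - 1);
  set_bit(bit_in_page, d->arch.dirty.bitmap[page_index]);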
Changes from v4:
1. Use a bitmap rather than a linked list for temporarily recording dirty pages.
2. Set up the p2m's second-level page as read-write from the point of view of
Xen's memory accesses. This happens in the p2m_create_table function.
Signed-off-by: Jaeyong Yoo <jaeyong.yoo@xxxxxxxxxxx>
---
xen/arch/arm/domain.c | 14 +++
xen/arch/arm/domctl.c | 9 ++
xen/arch/arm/mm.c | 103 +++++++++++++++++++-
xen/arch/arm/p2m.c | 206 ++++++++++++++++++++++++++++++++++++++++
xen/arch/arm/traps.c | 9 ++
xen/include/asm-arm/domain.h | 7 ++
xen/include/asm-arm/mm.h | 7 ++
xen/include/asm-arm/p2m.h | 4 +
xen/include/asm-arm/processor.h | 2 +
9 files changed, 360 insertions(+), 1 deletion(-)
diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index c0b5dd8..0a32301 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -215,6 +215,12 @@ static void ctxt_switch_to(struct vcpu *n)
WRITE_SYSREG(hcr, HCR_EL2);
isb();
+ /* for dirty-page tracing
+ * XXX: how do we handle the SMP case?
+ */
+ if ( n->domain->arch.dirty.mode )
+ restore_vlpt(n->domain);
+
/* This is could trigger an hardware interrupt from the virtual
* timer. The interrupt needs to be injected into the guest. */
virt_timer_restore(n);
@@ -509,11 +515,19 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)
/* Default the virtual ID to match the physical */
d->arch.vpidr = boot_cpu_data.midr.bits;
+ /* init for dirty-page tracing */
+ d->arch.dirty.count = 0;
+ d->arch.dirty.mode = 0;
+ spin_lock_init(&d->arch.dirty.lock);
+
d->arch.dirty.second_lvl_start = 0;
d->arch.dirty.second_lvl_end = 0;
d->arch.dirty.second_lvl[0] = NULL;
d->arch.dirty.second_lvl[1] = NULL;
+ memset(d->arch.dirty.bitmap, 0, sizeof(d->arch.dirty.bitmap));
+ d->arch.dirty.bitmap_pages = 0;
+
clear_page(d->shared_info);
share_xen_page_with_guest(
virt_to_page(d->shared_info), d, XENSHARE_writable);
diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
index cb38e59..eb74225 100644
--- a/xen/arch/arm/domctl.c
+++ b/xen/arch/arm/domctl.c
@@ -93,6 +93,15 @@ long arch_do_domctl(struct xen_domctl *domctl, struct domain *d,
xfree(c.data);
}
break;
+ case XEN_DOMCTL_shadow_op:
+ {
+ domain_pause(d);
+ ret = dirty_mode_op(d, &domctl->u.shadow_op);
+ domain_unpause(d);
+
+ copyback = 1;
+ }
+ break;
default:
return -EINVAL;
diff --git a/xen/arch/arm/mm.c b/xen/arch/arm/mm.c
index bf13993..d5a0a11 100644
--- a/xen/arch/arm/mm.c
+++ b/xen/arch/arm/mm.c
@@ -845,7 +845,6 @@ void destroy_xen_mappings(unsigned long v, unsigned long e)
create_xen_entries(REMOVE, v, 0, (e - v) >> PAGE_SHIFT, 0);
}
-enum mg { mg_clear, mg_ro, mg_rw, mg_rx };
static void set_pte_flags_on_range(const char *p, unsigned long l, enum mg mg)
{
lpae_t pte;
@@ -1320,6 +1319,60 @@ int is_iomem_page(unsigned long mfn)
* xen: arm: 64-bit guest support and domU FDT autogeneration
* will be upstreamed.
*/
+
+static inline void mark_dirty_bitmap(struct domain *d, paddr_t addr)
+{
+ paddr_t ram_base = (paddr_t) GUEST_RAM_BASE;
+ int bit_index = PFN_DOWN(addr - ram_base);
+ int page_index = bit_index >> (PAGE_SHIFT + 3);
+ int bit_index_residual = bit_index & ((1ul << (PAGE_SHIFT + 3)) - 1);
+
+ set_bit(bit_index_residual, d->arch.dirty.bitmap[page_index]);
+}
+
+/* Routine for dirty-page tracing.
+ *
+ * On the first write, the guest page faults, the entry is changed
+ * to read-write, and on retry the write succeeds.
+ *
+ * To locate the p2m entry of the faulting address we use the
+ * virtual-linear page table.
+ * Returns zero if addr is not valid or dirty mode is not set. */
+int handle_page_fault(struct domain *d, paddr_t addr)
+{
+
+ lpae_t *vlp2m_pte = 0;
+ paddr_t gma_start = 0;
+ paddr_t gma_end = 0;
+
+ if ( !d->arch.dirty.mode ) return 0;
+ get_gma_start_end(d, &gma_start, &gma_end);
+
+ /* Ensure that addr is inside guest's RAM */
+ if ( addr < gma_start ||
+ addr > gma_end ) return 0;
+
+ vlp2m_pte = get_vlpt_3lvl_pte(addr);
+ if ( vlp2m_pte->p2m.valid && vlp2m_pte->p2m.write == 0 &&
+ vlp2m_pte->p2m.avail == 0 /* reuse avail bit as read-only */ )
+ {
+ lpae_t pte = *vlp2m_pte;
+ pte.p2m.write = 1;
+ write_pte(vlp2m_pte, pte);
+ flush_tlb_local();
+
+ /* The lock is only needed to order this against get-dirty-bitmap.
+ * If get-dirty-bitmap runs immediately before this lock, the
+ * corresponding page is simply reported as dirty in the next
+ * round of get-dirty-bitmap. */
+ spin_lock(&d->arch.dirty.lock);
+ mark_dirty_bitmap(d, addr);
+ spin_unlock(&d->arch.dirty.lock);
+ }
+
+ return 1;
+}
+
void get_gma_start_end(struct domain *d, paddr_t *start, paddr_t *end)
{
if ( start )
@@ -1440,6 +1493,54 @@ void cleanup_vlpt(struct domain *d)
unmap_domain_page_global(d->arch.dirty.second_lvl[0]);
unmap_domain_page_global(d->arch.dirty.second_lvl[1]);
}
+
+int prepare_bitmap(struct domain *d)
+{
+ paddr_t gma_start = 0;
+ paddr_t gma_end = 0;
+ int nr_bytes;
+ int nr_pages;
+ int i;
+
+ get_gma_start_end(d, &gma_start, &gma_end);
+
+ nr_bytes = (PFN_DOWN(gma_end - gma_start) + 7) / 8;
+ nr_pages = (nr_bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+
+ BUG_ON( nr_pages > MAX_DIRTY_BITMAP_PAGES );
+
+ for ( i = 0; i < nr_pages; ++i )
+ {
+ struct page_info *page;
+ page = alloc_domheap_page(NULL, 0);
+ if ( page == NULL )
+ goto cleanup_on_failure;
+
+ d->arch.dirty.bitmap[i] = map_domain_page_global(__page_to_mfn(page));
+ clear_page(d->arch.dirty.bitmap[i]);
+ }
+
+ d->arch.dirty.bitmap_pages = nr_pages;
+ return 0;
+
+cleanup_on_failure:
+ nr_pages = i;
+ for ( i = 0; i < nr_pages; ++i )
+ {
+ unmap_domain_page_global(d->arch.dirty.bitmap[i]);
+ }
+ return -ENOMEM;
+}
+
+void cleanup_bitmap(struct domain *d)
+{
+ int i;
+ for ( i = 0; i < d->arch.dirty.bitmap_pages; ++i )
+ {
+ unmap_domain_page_global(d->arch.dirty.bitmap[i]);
+ }
+}
+
/*
* Local variables:
* mode: C
diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index 2d09fef..b7dbf7d 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -6,6 +6,8 @@
#include <xen/bitops.h>
#include <asm/flushtlb.h>
#include <asm/gic.h>
+#include <xen/guest_access.h>
+#include <xen/pfn.h>
void dump_p2m_lookup(struct domain *d, paddr_t addr)
{
@@ -113,6 +115,10 @@ static int p2m_create_table(struct domain *d,
pte = mfn_to_p2m_entry(page_to_mfn(page), MATTR_MEM);
+ /* Clear the ro bit (the table entry's equivalent of the write bit)
+ * so that the entry stays writable when accessed through the vlpt */
+ pte.pt.ro = 0;
+
write_pte(entry, pte);
return 0;
@@ -408,6 +414,206 @@ unsigned long gmfn_to_mfn(struct domain *d, unsigned long gpfn)
return p >> PAGE_SHIFT;
}
+/* Change types across all p2m entries in a domain */
+void p2m_change_entry_type_global(struct domain *d, enum mg nt)
+{
+ struct p2m_domain *p2m = &d->arch.p2m;
+ paddr_t ram_base;
+ int i1, i2, i3;
+ int first_index, second_index, third_index;
+ lpae_t *first = __map_domain_page(p2m->first_level);
+ lpae_t pte, *second = NULL, *third = NULL;
+
+ get_gma_start_end(d, &ram_base, NULL);
+
+ first_index = first_table_offset((uint64_t)ram_base);
+ second_index = second_table_offset((uint64_t)ram_base);
+ third_index = third_table_offset((uint64_t)ram_base);
+
+ BUG_ON( !first && "Can't map first level p2m." );
+
+ spin_lock(&p2m->lock);
+
+ for ( i1 = first_index; i1 < LPAE_ENTRIES*2; ++i1 )
+ {
+ lpae_walk_t first_pte = first[i1].walk;
+ if ( !first_pte.valid || !first_pte.table )
+ goto out;
+
+ second = map_domain_page(first_pte.base);
+ BUG_ON( !second && "Can't map second level p2m.");
+ for ( i2 = second_index; i2 < LPAE_ENTRIES; ++i2 )
+ {
+ lpae_walk_t second_pte = second[i2].walk;
+
+ if ( !second_pte.valid || !second_pte.table )
+ goto out;
+
+ third = map_domain_page(second_pte.base);
+ BUG_ON( !third && "Can't map third level p2m.");
+
+ for ( i3 = third_index; i3 < LPAE_ENTRIES; ++i3 )
+ {
+ lpae_walk_t third_pte = third[i3].walk;
+ if ( !third_pte.valid )
+ goto out;
+
+ pte = third[i3];
+ if ( nt == mg_ro )
+ {
+ if ( pte.p2m.write == 1 )
+ {
+ pte.p2m.write = 0;
+ pte.p2m.avail = 0;
+ }
+ else
+ {
+ /* reuse avail bit as an indicator
+ * of 'actual' read-only */
+ pte.p2m.avail = 1;
+ }
+ }
+ else if ( nt == mg_rw )
+ {
+ if ( pte.p2m.write == 0 && pte.p2m.avail == 0 )
+ {
+ pte.p2m.write = 1;
+ }
+ }
+ write_pte(&third[i3], pte);
+ }
+ unmap_domain_page(third);
+
+ third = NULL;
+ third_index = 0;
+ }
+ unmap_domain_page(second);
+
+ second = NULL;
+ second_index = 0;
+ third_index = 0;
+ }
+
+out:
+ flush_tlb_all_local();
+ if ( third ) unmap_domain_page(third);
+ if ( second ) unmap_domain_page(second);
+ if ( first ) unmap_domain_page(first);
+
+ spin_unlock(&p2m->lock);
+}
+
+/* Read a domain's log-dirty bitmap and stats.
+ * If the operation is a CLEAN, clear the bitmap and stats. */
+int log_dirty_op(struct domain *d, xen_domctl_shadow_op_t *sc)
+{
+ int peek = 1;
+ int i;
+ int bitmap_size;
+ paddr_t gma_start, gma_end;
+
+ /* This hypercall is called from domain 0; we don't know which guest's
+ * vlpt is currently mapped in xen_second, so restore this domain's vlpt */
+ restore_vlpt(d);
+
+ get_gma_start_end(d, &gma_start, &gma_end);
+ bitmap_size = (PFN_DOWN(gma_end - gma_start) + 7) / 8;
+
+ if ( guest_handle_is_null(sc->dirty_bitmap) )
+ {
+ peek = 0;
+ }
+ else
+ {
+ spin_lock(&d->arch.dirty.lock);
+ for ( i = 0; i < d->arch.dirty.bitmap_pages; ++i )
+ {
+ int j = 0;
+ uint8_t *bitmap;
+ copy_to_guest_offset(sc->dirty_bitmap, i * PAGE_SIZE,
+ d->arch.dirty.bitmap[i],
+ bitmap_size < PAGE_SIZE ? bitmap_size :
+ PAGE_SIZE);
+ bitmap_size -= PAGE_SIZE;
+
+ /* set p2m page table read-only */
+ bitmap = d->arch.dirty.bitmap[i];
+ while ((j = find_next_bit((const long unsigned int *)bitmap,
+ PAGE_SIZE*8, j)) < PAGE_SIZE*8)
+ {
+ lpae_t *vlpt;
+ paddr_t addr = gma_start +
+ ((paddr_t)i << (2*PAGE_SHIFT+3)) +
+ (j << PAGE_SHIFT);
+ vlpt = get_vlpt_3lvl_pte(addr);
+ vlpt->p2m.write = 0;
+ j++;
+ }
+ }
+
+ if ( sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN )
+ {
+ for ( i = 0; i < d->arch.dirty.bitmap_pages; ++i )
+ {
+ clear_page(d->arch.dirty.bitmap[i]);
+ }
+ }
+
+ spin_unlock(&d->arch.dirty.lock);
+ flush_tlb_local();
+ }
+
+ sc->stats.dirty_count = d->arch.dirty.count;
+
+ return 0;
+}
+
+long dirty_mode_op(struct domain *d, xen_domctl_shadow_op_t *sc)
+{
+ long ret = 0;
+ switch (sc->op)
+ {
+ case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
+ case XEN_DOMCTL_SHADOW_OP_OFF:
+ {
+ enum mg nt = sc->op == XEN_DOMCTL_SHADOW_OP_OFF ? mg_rw : mg_ro;
+
+ d->arch.dirty.mode = sc->op == XEN_DOMCTL_SHADOW_OP_OFF ? 0 : 1;
+ p2m_change_entry_type_global(d, nt);
+
+ if ( sc->op == XEN_DOMCTL_SHADOW_OP_OFF )
+ {
+ cleanup_vlpt(d);
+ cleanup_bitmap(d);
+ }
+ else
+ {
+ if ( (ret = prepare_vlpt(d)) )
+ return ret;
+
+ if ( (ret = prepare_bitmap(d)) )
+ {
+ /* in case of failure, we have to cleanup vlpt */
+ cleanup_vlpt(d);
+ return ret;
+ }
+ }
+ }
+ break;
+
+ case XEN_DOMCTL_SHADOW_OP_CLEAN:
+ case XEN_DOMCTL_SHADOW_OP_PEEK:
+ {
+ ret = log_dirty_op(d, sc);
+ }
+ break;
+
+ default:
+ return -ENOSYS;
+ }
+ return ret;
+}
+
/*
* Local variables:
* mode: C
diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
index 287dd7b..1a7ed11 100644
--- a/xen/arch/arm/traps.c
+++ b/xen/arch/arm/traps.c
@@ -1321,6 +1321,8 @@ static void do_trap_data_abort_guest(struct cpu_user_regs *regs,
const char *msg;
int rc, level = -1;
mmio_info_t info;
+ int page_fault = ( (dabt.dfsc & FSC_MASK) ==
+ (FSC_FLT_PERM | FSC_3D_LEVEL) && dabt.write );
if ( !check_conditional_instr(regs, hsr) )
{
@@ -1342,6 +1344,13 @@ static void do_trap_data_abort_guest(struct cpu_user_regs *regs,
if ( rc == -EFAULT )
goto bad_data_abort;
+ /* domU page fault handling for guest live migration */
+ /* dabt.valid can be 0 here */
+ if ( page_fault && handle_page_fault(current->domain, info.gpa) )
+ {
+ /* Do not modify pc so that the guest retries the faulting memory access */
+ return;
+ }
/* XXX: Decode the instruction if ISS is not valid */
if ( !dabt.valid )
goto bad_data_abort;
diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
index 4f366f1..180d924 100644
--- a/xen/include/asm-arm/domain.h
+++ b/xen/include/asm-arm/domain.h
@@ -114,9 +114,16 @@ struct arch_domain
/* dirty-page tracing */
struct {
+#define MAX_DIRTY_BITMAP_PAGES 64 /* support up to 8GB of guest memory */
+ spinlock_t lock; /* protect the dirty bitmap */
+ volatile int mode; /* 1 if dirty-page tracing is enabled */
+ volatile unsigned int count; /* dirty page counter */
volatile int second_lvl_start; /* for context switch */
volatile int second_lvl_end;
lpae_t *second_lvl[2]; /* copy of guest p2m's first */
+ /* dirty bitmap */
+ uint8_t *bitmap[MAX_DIRTY_BITMAP_PAGES];
+ int bitmap_pages; /* number of dirty bitmap pages */
} dirty;
} __cacheline_aligned;
diff --git a/xen/include/asm-arm/mm.h b/xen/include/asm-arm/mm.h
index a74e135..1ce7a4b 100644
--- a/xen/include/asm-arm/mm.h
+++ b/xen/include/asm-arm/mm.h
@@ -341,11 +341,18 @@ static inline void put_page_and_type(struct page_info *page)
put_page(page);
}
+enum mg { mg_clear, mg_ro, mg_rw, mg_rx };
+
+/* routine for dirty-page tracing */
+int handle_page_fault(struct domain *d, paddr_t addr);
void get_gma_start_end(struct domain *d, paddr_t *start, paddr_t *end);
int prepare_vlpt(struct domain *d);
void cleanup_vlpt(struct domain *d);
void restore_vlpt(struct domain *d);
+int prepare_bitmap(struct domain *d);
+void cleanup_bitmap(struct domain *d);
+
/* calculate the xen's virtual address for accessing the leaf PTE of
* a given address (GPA) */
static inline lpae_t * get_vlpt_3lvl_pte(paddr_t addr)
diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h
index c660820..dba9a7b 100644
--- a/xen/include/asm-arm/p2m.h
+++ b/xen/include/asm-arm/p2m.h
@@ -2,6 +2,7 @@
#define _XEN_P2M_H
#include <xen/mm.h>
+#include <public/domctl.h>
struct domain;
@@ -110,6 +111,9 @@ static inline int get_page_and_type(struct page_info *page,
return rc;
}
+void p2m_change_entry_type_global(struct domain *d, enum mg nt);
+long dirty_mode_op(struct domain *d, xen_domctl_shadow_op_t *sc);
+
#endif /* _XEN_P2M_H */
/*
diff --git a/xen/include/asm-arm/processor.h b/xen/include/asm-arm/processor.h
index 5294421..fced6ad 100644
--- a/xen/include/asm-arm/processor.h
+++ b/xen/include/asm-arm/processor.h
@@ -399,6 +399,8 @@ union hsr {
#define FSC_CPR (0x3a) /* Coprocossor Abort */
#define FSC_LL_MASK (0x03<<0)
+#define FSC_MASK (0x3f) /* Fault status mask */
+#define FSC_3D_LEVEL (0x03) /* Third level fault */
/* Time counter hypervisor control register */
#define CNTHCTL_PA (1u<<0) /* Kernel/user access to physical counter */
--
1.8.1.2