
[Xen-devel] [PATCH v6 4/5] xen/arm: Implement hypercall for dirty page tracing



Add a hypercall (shadow op: enable/disable and clean/peek dirty page bitmap).
It consists of two parts: dirty page detection and saving.
For detection, we set up the guest p2m's leaf PTEs as read-only, so whenever
the guest tries to write something, a permission fault occurs and traps into
Xen. The faulting GPA has to be recorded for the toolstack (when it wants to
see which pages have been dirtied). For this purpose, we temporarily save the
GPAs into a bitmap.
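
For reference, the bitmap records one bit per guest RAM page, indexed by the
page-frame offset of the faulting GPA from GUEST_RAM_BASE. Below is a minimal,
illustrative sketch (not part of this patch) of how a consumer of the peeked
bitmap could turn set bits back into guest physical addresses; the helper name
and the 4K page-size assumption are hypothetical.

    /* Illustrative only: walk a dirty bitmap obtained via
     * XEN_DOMCTL_SHADOW_OP_PEEK/CLEAN and report the dirty GPAs.
     * Assumes one bit per 4K page, with bit 0 == GUEST_RAM_BASE. */
    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT 12

    static void report_dirty_gpas(const uint8_t *bitmap, uint32_t nr_bytes,
                                  uint64_t guest_ram_base)
    {
        for ( uint32_t byte = 0; byte < nr_bytes; byte++ )
        {
            if ( !bitmap[byte] )
                continue;

            for ( unsigned int bit = 0; bit < 8; bit++ )
            {
                if ( bitmap[byte] & (1u << bit) )
                {
                    uint64_t gpa = guest_ram_base +
                        (((uint64_t)byte * 8 + bit) << PAGE_SHIFT);
                    printf("dirty page at GPA 0x%" PRIx64 "\n", gpa);
                }
            }
        }
    }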

Signed-off-by: Jaeyong Yoo <jaeyong.yoo@xxxxxxxxxxx>
Signed-off-by: Junghyun Yoo <yjhyun.yoo@xxxxxxxxxxx>
---
 xen/arch/arm/domain.c        |   3 +
 xen/arch/arm/domctl.c        |   9 ++
 xen/arch/arm/mm.c            |  90 +++++++++++++++++++-
 xen/arch/arm/p2m.c           | 195 +++++++++++++++++++++++++++++++++++++++++++
 xen/arch/arm/traps.c         |  18 ++++
 xen/include/asm-arm/domain.h |   3 +
 xen/include/asm-arm/mm.h     |   6 ++
 xen/include/asm-arm/p2m.h    |   7 +-
 8 files changed, 329 insertions(+), 2 deletions(-)

diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index 4978765..6a0b36d 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -512,6 +512,9 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)
 
     /* init for dirty-page tracing */
     d->arch.dirty.mode = 0;
+    spin_lock_init(&d->arch.dirty.lock);
+    d->arch.dirty.bitmap = NULL;
+    d->arch.dirty.bitmap_nr_bytes = 0;
 
     d->arch.dirty.p2m_start_idx = 0;
     d->arch.dirty.p2m_end_idx = 0;
diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c
index 9f65442..054de16 100644
--- a/xen/arch/arm/domctl.c
+++ b/xen/arch/arm/domctl.c
@@ -107,6 +107,15 @@ long arch_do_domctl(struct xen_domctl *domctl, struct domain *d,
             xfree(c.data);
     }
     break;
+    case XEN_DOMCTL_shadow_op:
+    {
+        domain_pause(d);
+        ret = dirty_mode_op(d, &domctl->u.shadow_op);
+        domain_unpause(d);
+
+        copyback = 1;
+    }
+    break;
 
     default:
         return subarch_do_domctl(domctl, d, u_domctl);
diff --git a/xen/arch/arm/mm.c b/xen/arch/arm/mm.c
index 0fc9d9a..238a15e 100644
--- a/xen/arch/arm/mm.c
+++ b/xen/arch/arm/mm.c
@@ -865,7 +865,6 @@ void destroy_xen_mappings(unsigned long v, unsigned long e)
     create_xen_entries(REMOVE, v, 0, (e - v) >> PAGE_SHIFT, 0);
 }
 
-enum mg { mg_clear, mg_ro, mg_rw, mg_rx };
 static void set_pte_flags_on_range(const char *p, unsigned long l, enum mg mg)
 {
     lpae_t pte;
@@ -1328,6 +1327,95 @@ void cleanup_vlpt(struct domain *d)
     unmap_domain_page_global(d->arch.dirty.p2m_first[1]);
 }
 
+static inline void mark_dirty_bitmap(struct domain *d, paddr_t addr)
+{
+    paddr_t ram_base = (paddr_t) GUEST_RAM_BASE;
+    int bit_index = PFN_DOWN(addr - ram_base);
+
+    set_bit(bit_index, d->arch.dirty.bitmap);
+}
+
+/* Routine for dirty-page tracing.
+ *
+ * On the first write the guest page-faults, the entry is changed to
+ * read-write, and on retry the write succeeds.
+ *
+ * To locate the p2m entry of the faulting address we use the
+ * virtual-linear page table (VLPT).
+ * Returns zero if addr is not valid or dirty mode is not set.
+ */
+int handle_page_fault(struct domain *d, paddr_t addr)
+{
+    struct p2m_domain *p2m = &d->arch.p2m;
+    lpae_t *vlp2m_pte = 0;
+    paddr_t gma_start = GUEST_RAM_BASE;
+    paddr_t gma_end = 0;
+
+    if ( !d->arch.dirty.mode ) return 0;
+    gma_end = get_gma_end(d);
+
+    /* Ensure that addr is inside guest's RAM */
+    if ( addr < gma_start ||
+         addr > gma_end ) return 0;
+
+    spin_lock(&p2m->lock);
+    vlp2m_pte = get_vlpt_3lvl_pte(addr);
+    if ( vlp2m_pte->p2m.valid && vlp2m_pte->p2m.write == 0 &&
+         vlp2m_pte->p2m.type == p2m_ram_logdirty )
+    {
+        lpae_t pte = *vlp2m_pte;
+        pte.p2m.write = 1;
+        write_pte(vlp2m_pte, pte);
+        flush_tlb();
+        spin_unlock(&p2m->lock);
+
+        /* The lock is only needed to serialise against get-dirty bitmap.
+         * If a get-dirty bitmap operation happens immediately before this
+         * lock is taken, the corresponding dirty page will simply be
+         * reported in the next round of get-dirty bitmap. */
+        spin_lock(&d->arch.dirty.lock);
+        mark_dirty_bitmap(d, addr);
+        spin_unlock(&d->arch.dirty.lock);
+    }
+    else
+        spin_unlock(&p2m->lock);
+
+    return 1;
+}
+
+int prepare_bitmap(struct domain *d)
+{
+    paddr_t gma_start = GUEST_RAM_BASE;
+    paddr_t gma_end = 0;
+    uint32_t nr_bytes, nr_pages, order;
+
+    gma_end = get_gma_end(d);
+
+    nr_bytes = (PFN_DOWN(gma_end - gma_start) + 7) / 8;
+    nr_pages = (nr_bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+    order = get_order_from_pages(nr_pages);
+
+    d->arch.dirty.bitmap = alloc_xenheap_pages(order, 0);
+    if ( d->arch.dirty.bitmap == NULL )
+        return -ENOMEM;
+
+    memset(d->arch.dirty.bitmap, 0, nr_bytes);
+
+    d->arch.dirty.bitmap_nr_bytes = nr_bytes;
+    return 0;
+}
+
+void cleanup_bitmap(struct domain *d)
+{
+    uint32_t nr_pages, order;
+
+    nr_pages = (d->arch.dirty.bitmap_nr_bytes + PAGE_SIZE - 1) / PAGE_SIZE;
+    order = get_order_from_pages(nr_pages);
+
+    free_xenheap_pages(d->arch.dirty.bitmap, order);
+    d->arch.dirty.bitmap = NULL;
+    d->arch.dirty.bitmap_nr_bytes = 0;
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
index 96bc0ef..b111452 100644
--- a/xen/arch/arm/p2m.c
+++ b/xen/arch/arm/p2m.c
@@ -4,6 +4,8 @@
 #include <xen/errno.h>
 #include <xen/domain_page.h>
 #include <xen/bitops.h>
+#include <xen/guest_access.h>
+#include <xen/pfn.h>
 #include <asm/flushtlb.h>
 #include <asm/gic.h>
 #include <asm/event.h>
@@ -223,6 +225,7 @@ static lpae_t mfn_to_p2m_entry(unsigned long mfn, unsigned int mattr,
         break;
 
     case p2m_ram_ro:
+    case p2m_ram_logdirty:
         e.p2m.xn = 0;
         e.p2m.write = 0;
         break;
@@ -284,6 +287,10 @@ static int p2m_create_table(struct domain *d, lpae_t *entry, bool_t flush_cache)
 
     pte = mfn_to_p2m_entry(page_to_mfn(page), MATTR_MEM, p2m_invalid);
 
+    /* Clear the write-protection bit (for page tables this is the ro bit)
+     * so that the entry stays writable when accessed through the VLPT. */
+    pte.pt.ro = 0;
+
     p2m_write_pte(entry, pte, flush_cache);
 
     return 0;
@@ -715,6 +722,194 @@ unsigned long gmfn_to_mfn(struct domain *d, unsigned long gpfn)
     return p >> PAGE_SHIFT;
 }
 
+/* Change types across all p2m entries in a domain */
+int p2m_change_entry_type_global(struct domain *d, enum mg nt)
+{
+    struct p2m_domain *p2m = &d->arch.p2m;
+    paddr_t ram_base = GUEST_RAM_BASE;
+    paddr_t ram_end;
+    paddr_t paddr;
+    int nr_pages;
+    int rc = -EFAULT, i;
+    unsigned long cur_first_offset = ~0, cur_second_offset = ~0;
+    lpae_t *first = NULL, *second = NULL, *third = NULL;
+    lpae_t pte;
+
+    ram_end = get_gma_end(d);
+    paddr = ram_base;
+    nr_pages = (ram_end - ram_base) >> PAGE_SHIFT;
+
+    spin_lock(&p2m->lock);
+
+    first = __map_domain_page(p2m->first_level);
+    if ( !first ||
+         !first[first_table_offset(paddr)].p2m.valid ||
+         !first[first_table_offset(paddr)].p2m.table )
+        goto err;
+
+    for ( i = 0; i < nr_pages; ++i )
+    {
+        if ( cur_first_offset != first_table_offset(paddr) )
+        {
+            if ( second ) unmap_domain_page(second);
+            second = map_domain_page(first[first_table_offset(paddr)].p2m.base);
+            cur_first_offset = first_table_offset(paddr);
+        }
+        if ( !second ||
+             !second[second_table_offset(paddr)].p2m.valid ||
+             !second[second_table_offset(paddr)].p2m.table )
+            goto err;
+        if ( cur_second_offset != second_table_offset(paddr) )
+        {
+            if ( third ) unmap_domain_page(third);
+            third = map_domain_page(second[second_table_offset(paddr)].p2m.base);
+            cur_second_offset = second_table_offset(paddr);
+        }
+        if ( !third ||
+             !third[third_table_offset(paddr)].p2m.valid )
+            goto err;
+
+        pte = third[third_table_offset(paddr)];
+
+        if ( nt == mg_ro )
+        {
+            /* use the avail bits (p2m type) as a backup for the write bit */
+            if ( pte.p2m.write == 1 )
+            {
+                pte.p2m.write = 0;
+                pte.p2m.type = p2m_ram_logdirty;
+            }
+            else
+            {
+                pte.p2m.type = p2m_ram_rw;
+            }
+        }
+        else if ( nt == mg_rw )
+        {
+            /* restore the write bit */
+            if ( pte.p2m.write == 0 && pte.p2m.type == p2m_ram_logdirty )
+            {
+                pte.p2m.write = 1;
+            }
+        }
+
+        write_pte(&third[third_table_offset(paddr)], pte);
+        paddr += PAGE_SIZE;
+    }
+
+    rc = 0;
+err:
+    flush_tlb_all_local();
+    if ( third ) unmap_domain_page(third);
+    if ( second ) unmap_domain_page(second);
+    if ( first ) unmap_domain_page(first);
+    spin_unlock(&p2m->lock);
+    return rc;
+}
+
+/* Read a domain's log-dirty bitmap and stats.
+ * If the operation is a CLEAN, clear the bitmap and stats. */
+int log_dirty_op(struct domain *d, xen_domctl_shadow_op_t *sc)
+{
+    int bitmap_size;
+    paddr_t gma_start = GUEST_RAM_BASE, gma_end;
+
+    /* This hypercall is issued from domain 0 and we do not know which
+     * guest's VLPT is currently mapped in xen_second, so restore this
+     * domain's VLPT here to be sure. */
+    restore_vlpt(d);
+
+    gma_end = get_gma_end(d);
+    bitmap_size = (gma_end - gma_start) / 8;
+
+    if ( guest_handle_is_null(sc->dirty_bitmap) )
+    {
+        return -EINVAL;
+    }
+    else
+    {
+        uint32_t j = 0;
+        uint8_t *bitmap = d->arch.dirty.bitmap;
+        uint32_t nr_bytes = d->arch.dirty.bitmap_nr_bytes;
+        spin_lock(&d->arch.dirty.lock);
+
+        if ( copy_to_guest_offset(sc->dirty_bitmap, 0, bitmap, nr_bytes) )
+        {
+            spin_unlock(&d->arch.dirty.lock);
+            return -EINVAL;
+        }
+
+        dsb(sy);
+        while ((j = find_next_bit((const long unsigned int *)bitmap,
+                                  nr_bytes * 8, j)) < nr_bytes * 8)
+        {
+            lpae_t *vlpt, new_vlpt;
+            paddr_t addr = gma_start + (j << PAGE_SHIFT);
+            vlpt = get_vlpt_3lvl_pte(addr);
+            new_vlpt = *vlpt;
+            new_vlpt.p2m.write = 0;
+            __write_pte(vlpt, new_vlpt);
+            j++;
+        }
+        dsb(sy);
+
+        if ( sc->op == XEN_DOMCTL_SHADOW_OP_CLEAN )
+            memset(bitmap, 0, nr_bytes);
+
+        spin_unlock(&d->arch.dirty.lock);
+        flush_tlb_local();
+    }
+
+    sc->stats.dirty_count = 0;
+
+    return 0;
+}
+
+long dirty_mode_op(struct domain *d, xen_domctl_shadow_op_t *sc)
+{
+    long ret = 0;
+    switch (sc->op)
+    {
+        case XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY:
+        case XEN_DOMCTL_SHADOW_OP_OFF:
+        {
+            enum mg nt = sc->op == XEN_DOMCTL_SHADOW_OP_OFF ? mg_rw : mg_ro;
+
+            d->arch.dirty.mode = sc->op == XEN_DOMCTL_SHADOW_OP_OFF ? 0 : 1;
+            if ( (ret = p2m_change_entry_type_global(d, nt)) )
+                return ret;
+
+            if ( sc->op == XEN_DOMCTL_SHADOW_OP_OFF )
+            {
+                cleanup_vlpt(d);
+                cleanup_bitmap(d);
+            }
+            else
+            {
+                if ( (ret = prepare_vlpt(d)) )
+                   return ret;
+
+                if ( (ret = prepare_bitmap(d)) )
+                {
+                   /* in case of failure, we have to cleanup vlpt */
+                   cleanup_vlpt(d);
+                   return ret;
+                }
+            }
+        }
+        break;
+
+        case XEN_DOMCTL_SHADOW_OP_CLEAN:
+        case XEN_DOMCTL_SHADOW_OP_PEEK:
+        {
+            ret = log_dirty_op(d, sc);
+        }
+        break;
+
+        default:
+            return -ENOSYS;
+    }
+    return ret;
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
index 03a3da6..9b6d746 100644
--- a/xen/arch/arm/traps.c
+++ b/xen/arch/arm/traps.c
@@ -1603,6 +1603,13 @@ static void do_trap_instr_abort_guest(struct cpu_user_regs *regs,
     inject_iabt_exception(regs, addr, hsr.len);
 }
 
+static inline int dabt_is_page_fault(struct hsr_dabt dabt)
+{
+    /* dabt.valid can be 0 here */
+    return (dabt.dfsc & FSC_TYPE_MASK) == FSC_TYPE_FAULT &&
+           (dabt.dfsc & FSC_LL_MASK) == 0x3 /* third level */;
+}
+
 static void do_trap_data_abort_guest(struct cpu_user_regs *regs,
                                      union hsr hsr)
 {
@@ -1630,6 +1637,17 @@ static void do_trap_data_abort_guest(struct cpu_user_regs *regs,
     if ( rc == -EFAULT )
         goto bad_data_abort;
 
+    /* domU page fault handling for guest live migration */
+    if ( dabt_is_page_fault(dabt) )
+    {
+        /* Do not advance the PC here, so that the guest repeats the memory
+         * operation after the fault has been handled. */
+        if ( handle_page_fault(current->domain, info.gpa) ) return;
+
+        /* handle_page_fault returning 0 means either dirty-page tracing has
+         * not been started yet or a 'real' permission fault happened; in
+         * that case just fall through. */
+    }
+
     /* XXX: Decode the instruction if ISS is not valid */
     if ( !dabt.valid )
         goto bad_data_abort;
diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h
index 9674175..75f3a57 100644
--- a/xen/include/asm-arm/domain.h
+++ b/xen/include/asm-arm/domain.h
@@ -171,6 +171,9 @@ struct arch_domain
         lpae_t *p2m_first[2];            /* copy of guest p2m's first */
         int p2m_start_idx;               /* start index of p2m_first */
         int p2m_end_idx;                 /* end index of p2m_first */
+        uint8_t *bitmap;                 /* dirty bitmap */
+        uint32_t bitmap_nr_bytes;        /* number of bytes for dirty bitmap */
+        spinlock_t lock;                 /* protect the dirty bitmap */
     } dirty;
 
 }  __cacheline_aligned;
diff --git a/xen/include/asm-arm/mm.h b/xen/include/asm-arm/mm.h
index 7ceb568..90ece9a 100644
--- a/xen/include/asm-arm/mm.h
+++ b/xen/include/asm-arm/mm.h
@@ -344,6 +344,8 @@ static inline void put_page_and_type(struct page_info *page)
 
 void clear_and_clean_page(struct page_info *page);
 
+enum mg { mg_clear, mg_ro, mg_rw, mg_rx };
+
 /* routine for dirty-page tracing */
 #define VLPT_SIZE (1 << SECOND_SHIFT)
 #define VLPT_VA_TO_IDX(va) ((va - DOMHEAP_VIRT_START) >> SECOND_SHIFT)
@@ -356,6 +358,10 @@ int prepare_vlpt(struct domain *d);
 void cleanup_vlpt(struct domain *d);
 void restore_vlpt(struct domain *d);
 
+int handle_page_fault(struct domain *d, paddr_t addr);
+int prepare_bitmap(struct domain *d);
+void cleanup_bitmap(struct domain *d);
+
 /* calculate the xen's virtual address for accessing the leaf PTE of
  * a given address (GPA) */
 static inline lpae_t * get_vlpt_3lvl_pte(paddr_t addr)
diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h
index bd71abe..cc17ec3 100644
--- a/xen/include/asm-arm/p2m.h
+++ b/xen/include/asm-arm/p2m.h
@@ -2,6 +2,7 @@
 #define _XEN_P2M_H
 
 #include <xen/mm.h>
+#include <public/domctl.h>
 
 struct domain;
 
@@ -41,6 +42,7 @@ typedef enum {
     p2m_invalid = 0,    /* Nothing mapped here */
     p2m_ram_rw,         /* Normal read/write guest RAM */
     p2m_ram_ro,         /* Read-only; writes are silently dropped */
+    p2m_ram_logdirty,   /* Read-only; special mode for log dirty */
     p2m_mmio_direct,    /* Read/write mapping of genuine MMIO area */
     p2m_map_foreign,    /* Ram pages from foreign domain */
     p2m_grant_map_rw,   /* Read/write grant mapping */
@@ -49,7 +51,7 @@ typedef enum {
 } p2m_type_t;
 
 #define p2m_is_foreign(_t)  ((_t) == p2m_map_foreign)
-#define p2m_is_ram(_t)      ((_t) == p2m_ram_rw || (_t) == p2m_ram_ro)
+#define p2m_is_ram(_t)      ((_t) == p2m_ram_rw || (_t) == p2m_ram_ro || \
+                             (_t) == p2m_ram_logdirty)
 
 /* Initialise vmid allocator */
 void p2m_vmid_allocator_init(void);
@@ -178,6 +180,9 @@ static inline int get_page_and_type(struct page_info *page,
     return rc;
 }
 
+int p2m_change_entry_type_global(struct domain *d, enum mg nt);
+long dirty_mode_op(struct domain *d, xen_domctl_shadow_op_t *sc);
+
 #endif /* _XEN_P2M_H */
 
 /*
-- 
1.8.1.2

